From 1e5f8484a3b615bdfff238edd42c48c8f4d34359 Mon Sep 17 00:00:00 2001 From: David Schaefer <david.schaefer@ufz.de> Date: Wed, 13 Oct 2021 12:51:50 +0200 Subject: [PATCH 1/5] first steps --- saqc/funcs/generic.py | 70 ++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index cc2142cb5..2291217bb 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -3,7 +3,7 @@ from functools import partial from inspect import signature -from typing import Tuple, Union, Callable +from typing import Sequence, Tuple, Union, Callable import numpy as np import pandas as pd @@ -17,6 +17,8 @@ from saqc.core.visitor import ENVIRONMENT import operator as op +from saqc.lib.tools import toSequence + _OP = {"<": op.lt, "<=": op.le, "==": op.eq, "!=": op.ne, ">": op.gt, ">=": op.ge} @@ -54,33 +56,23 @@ def _execGeneric( flags: Flags, data: DictOfSeries, func: Callable[[pd.Series], pd.Series], - field: str, + fields: Sequence[str], ) -> pd.Series: + # TODO: # - check series.index compatibility - # - field is only needed to translate 'this' parameters - # -> maybe we could do the translation on the tree instead - - sig = signature(func) - args = [] - for k, v in sig.parameters.items(): - k = field if k == "this" else k - if k not in data: - raise NameError(f"variable '{k}' not found") - args.append(data[k]) globs = { "isflagged": partial(_dslIsFlagged, flags), "ismissing": lambda var: pd.isnull(var), "mask": lambda cond: data[cond.name].mask(cond), - "this": field, "GOOD": GOOD, "BAD": BAD, "UNFLAGGED": UNFLAGGED, **ENVIRONMENT, } func.__globals__.update(globs) - return func(*args) + return func(*[data[f] for f in fields]) @processing(module="generic") @@ -155,9 +147,11 @@ def process( @flagging(masking="none", module="generic") def flag( data: DictOfSeries, - field: str, + field: Union[str, Sequence[str]], flags: Flags, + target: Union[str, Sequence[str]], func: Callable[[pd.Series], pd.Series], + sources: Sequence[str], flag: float = BAD, to_mask: float = UNFLAGGED, **kwargs, @@ -183,17 +177,18 @@ def flag( Parameters ---------- - data : dios.DictOfSeries + data : DictOfSeries A dictionary of pandas.Series, holding all the data. - field : str - The fieldname of the column, where you want the result from the generic expressions evaluation to be projected - to. - flags : saqc.Flags + field : str or list of str + Name of the column(s), holding the data-to-be-flagged. + flags : Flags Container to store flags of the data. + target : str or list of str + Name of the column(s), to write the results to func : Callable - The expression that is to be evaluated is passed in form of a callable, with parameter names that will be - interpreted as data column entries. The Callable must return an boolen array like. - See the examples section to learn more. + Function to call on `field` or `sources`. Must return a boolean pd.Series | np.ndarray + sources : list of str + Sequence of field names. flag : float, default BAD flag to set. 
@@ -236,7 +231,28 @@ def flag( Your expression also is allowed to include pandas and numpy functions >>> lambda level: np.sqrt(level) > 7 + + TEMP: + + Multiple fields, single target + + # just work on a single field + >>> saqc.generic.flag(field="a", lambda x: x < 0) + + # use as multivariate function: multiple in, single out + >>> saqc.generic.flag(field=["x", "y", "z"], target="a", lambda x, y, z: x + y > z) + + # use as multivariate function: multiple in, broadcast to all `field`s + >>> saqc.generic.flag(field=["x", "y", "z"], lambda x, y, z: x + y > z) + + # use as multivariate function: multiple in, broadcast to all `targets` + >>> saqc.generic.flag(field=["x", "y", "z"], targets=["a", "b", "c"], lambda x, y, z: x + y > z) + + + # not supported + >>> saqc.generic.flag(field=["x", "y", "z"], target=["a", "b"], lambda x, y, z: x < z, y > z ) """ + # we get the data unmasked, in order to also receive flags, # so let's do to the masking manually data_masked, _ = _maskData(data, flags, data.columns, to_mask) @@ -247,11 +263,9 @@ def flag( if not np.issubdtype(mask.dtype, np.bool_): raise TypeError(f"generic expression does not return a boolean array") - if field not in flags: - flags[field] = pd.Series(data=UNFLAGGED, index=mask.index, name=field) - - mask = ~_isflagged(flags[field], to_mask) & mask + for f in toSequence(field): + mask &= ~_isflagged(flags[f], to_mask) - flags[mask, field] = flag + flags[mask, target] = flag return data, flags -- GitLab From 8b069be1adf9c749c1c95c3439e51d10c6bb6e54 Mon Sep 17 00:00:00 2001 From: David Schaefer <david.schaefer@ufz.de> Date: Tue, 19 Oct 2021 18:02:23 +0200 Subject: [PATCH 2/5] first passing draft --- ressources/data/config_ci.csv | 4 +- saqc/core/core.py | 155 ++++++++++++++-- saqc/core/flags.py | 16 +- saqc/core/modules/tools.py | 2 +- saqc/core/reader.py | 2 +- saqc/core/register.py | 10 +- saqc/funcs/generic.py | 112 ++++++------ saqc/funcs/tools.py | 10 +- tests/funcs/test_generic_api_functions.py | 71 ++++---- tests/funcs/test_generic_config_functions.py | 180 ++++++------------- 10 files changed, 317 insertions(+), 245 deletions(-) diff --git a/ressources/data/config_ci.csv b/ressources/data/config_ci.csv index ecbe227e8..cb20243ea 100644 --- a/ressources/data/config_ci.csv +++ b/ressources/data/config_ci.csv @@ -1,8 +1,8 @@ varname ; test #-------; ----------------------------------------------------- -SM2 ; resampling.shift(freq="15Min") +'.*' ; resampling.shift(freq="15Min") '.*' ; outliers.flagRange(min=10, max=60) SM2 ; breaks.flagMissing() SM2 ; outliers.flagRange(min=10, max=60) SM2 ; outliers.flagMAD(window="30d", z=3.5) -Dummy ; generic.flag(func=(isflagged(SM1) | isflagged(SM2))) +Dummy ; generic.flag(field=["SM1", "SM2"], func=(y >= x)) diff --git a/saqc/core/core.py b/saqc/core/core.py index 252dca351..23b53455b 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -5,7 +5,7 @@ from __future__ import annotations import inspect import warnings import copy as stdcopy -from typing import Any, Callable, Tuple, Union, Optional +from typing import Any, Callable, Sequence, Tuple, Union, Optional import pandas as pd import numpy as np @@ -189,6 +189,7 @@ class SaQC(FuncModules): return data.to_df(), self._translator.backward(flags) + def _wrap(self, func: Callable): """Enrich a function by special saqc-functionality. 
@@ -210,11 +211,10 @@ class SaQC(FuncModules): target: str = None, regex: bool = False, flag: ExternalFlag = None, + inplace: bool = False, **kwargs, ) -> SaQC: - if regex and target is not None: - raise ValueError("explicit `target` not supported with `regex=True`") kwargs.setdefault("to_mask", self._translator.TO_MASK) @@ -224,37 +224,162 @@ class SaQC(FuncModules): # expand regular expressions if regex: - fields = self._data.columns.str.match(field) - fields = self._data.columns[fields] - targets = fields - else: - fields, targets = toSequence(field), toSequence(target, default=field) + if field != target: + raise ValueError("explicit `target` not supported with `regex=True`") + field = self._data.columns[self._data.columns.str.match(field)] + target = field + + if target is None: + target = field + + fields, targets = toSequence(field), toSequence(target) + + + if not func._multi: + if len(fields) == 1: + # Write the result generated from a single field to multiple targets. + # Could be optimized to call the function only once and write to + # all targets + fields = fields * len(targets) + if len(targets) == 1: + # Write the results generated from multiple fields to a single target + # sort of nonsense, as `target` is simply overwritten mutliple times, + # but principally not illegal. + # Could be optimized to call the function only once on `fields[-1]` + targets = targets * len(fields) + + # else: # multivariate function + # import pdb; pdb.set_trace() + # fields = [fields, ] + # if len(targets) == 1: + # # write the result generated from a single set of fields to multiple targets + # targets = [targets, ] + + if not func._multi and len(fields) != len(targets): + import pdb; pdb.set_trace() + # TODO: a better error message + raise ValueError( + "invalid combination of `field` and `target` parameters" + ) + out = self + # NOTE: initialize all target fields for field, target in zip(fields, targets): if field != target: + try: + out = out._callFunction( + FUNC_MAP["tools.copy"], + data=out._data, + flags=out._flags, + field=field, + target=target + ) + except ValueError: + pass + + if not func._multi: + # NOTE: we call univariate functions iteratively + for target in targets: out = out._callFunction( - FUNC_MAP["tools.copy"], + func, data=out._data, flags=out._flags, - field=field, - new_field=target, + field=target, + target=target, + *args, + **kwargs, ) - field = target - + else: out = out._callFunction( func, data=out._data, flags=out._flags, - field=field, + field=fields, + target=targets, *args, **kwargs, ) + return out return inner + # def _wrap(self, func: Callable): + # """Enrich a function by special saqc-functionality. + + # For each saqc function this realize + # - the source-target workflow, + # - regex's in field, + # - use default of translator for ``to_mask`` if not specified by user, + # - translation of ``flag`` and + # - working inplace. + # Therefore it adds the following keywords to each saqc function: + # ``target``, ``regex`` and ``inplace``. + + # The returned function returns a Saqc object. 
+ # """ + + # def inner( + # field: Union[str, Sequence[str]], + # *args, + # target: Union[str, Sequence[str]] = None, + # regex: bool = False, + # flag: ExternalFlag = None, + # **kwargs, + # ) -> SaQC: + + # if regex and target is not None: + # raise ValueError("explicit `target` not supported with `regex=True`") + + # kwargs.setdefault("to_mask", self._translator.TO_MASK) + + # # translation + # if flag is not None: + # kwargs["flag"] = self._translator(flag) + + # # expand regular expressions + # if regex: + # fields = self._data.columns.str.match(field) + # fields = self._data.columns[fields].to_list() + # targets = fields + # else: + # fields, targets = toSequence(field), toSequence(target, default=field) + + # out = self + + # if func._multi: + # if len(fields) > 1 and len(targets) == 1: + # targets = targets * len(fields) + + # # import pdb; pdb.set_trace() + # # pass + + # for field, target in zip(fields, targets): + # if field != target: + # out = out._callFunction( + # FUNC_MAP["tools.copy"], + # data=out._data, + # flags=out._flags, + # field=field, + # new_field=target, + # ) + # field = target + + # out = out._callFunction( + # func, + # data=out._data, + # flags=out._flags, + # field=field, + # target=target, + # *args, + # **kwargs, + # ) + # return out + + # return inner + def _callFunction( self, function: Callable, @@ -314,7 +439,7 @@ def _warnForUnusedKwargs(func, keywords, translator: Translator): sig_kws = inspect.signature(func).parameters # we need to ignore kws that are injected or by default hidden in ``**kwargs`` - ignore = ("to_mask",) + ignore = {"to_mask", "target"} missing = [] for kw in keywords: diff --git a/saqc/core/flags.py b/saqc/core/flags.py index 873cd62ac..d9a657db6 100644 --- a/saqc/core/flags.py +++ b/saqc/core/flags.py @@ -3,7 +3,8 @@ from __future__ import annotations import pandas as pd import dios -from typing import Mapping, Union, Dict, DefaultDict, Optional, Type, Tuple, Iterable +from typing import Mapping, Sequence, Union, Dict, DefaultDict, Optional, Type, Tuple, Iterable, overload +from dios.dios.dios import DictOfSeries from saqc.constants import * from saqc.core.history import History @@ -283,9 +284,18 @@ class Flags: # ---------------------------------------------------------------------- # item access - + @overload def __getitem__(self, key: str) -> pd.Series: - return self._data[key].max() + ... + + @overload + def __getitem__(self, key: Sequence[str]) -> "Flags": + ... 
+ + def __getitem__(self, key: Union[str, Sequence[str]]) -> Union[pd.Series, "Flags"]: + if isinstance(key, str): + return self._data[key].max() + return Flags({k: self[k] for k in key}) def __setitem__(self, key: SelectT, value: ValueT): # force-KW is only internally available diff --git a/saqc/core/modules/tools.py b/saqc/core/modules/tools.py index 7667261a7..fe340ce59 100644 --- a/saqc/core/modules/tools.py +++ b/saqc/core/modules/tools.py @@ -11,7 +11,7 @@ from saqc.core.modules.base import ModuleBase class Tools(ModuleBase): - def copy(self, field: str, new_field: str, **kwargs) -> saqc.SaQC: + def copy(self, field: str, target: str, **kwargs) -> saqc.SaQC: return self.defer("copy", locals()) def drop(self, field: str, **kwargs) -> saqc.SaQC: diff --git a/saqc/core/reader.py b/saqc/core/reader.py index 219f5b4c4..7beb7435e 100644 --- a/saqc/core/reader.py +++ b/saqc/core/reader.py @@ -71,6 +71,6 @@ def fromConfig(fname, *args, **kwargs): tree = ast.parse(expr, mode="eval") func, kwargs = ConfigFunctionParser().parse(tree.body) - saqc = getattr(saqc, func)(field=field, regex=regex, **kwargs) + saqc = getattr(saqc, func)(target=field, regex=regex, **{"field": field, **kwargs}) return saqc diff --git a/saqc/core/register.py b/saqc/core/register.py index 0a108c383..7a9939407 100644 --- a/saqc/core/register.py +++ b/saqc/core/register.py @@ -35,7 +35,7 @@ class CallState: mask: dios.DictOfSeries -def processing(module: Optional[str] = None): +def processing(module: str = None, multi: bool = False): # executed on module import def inner(func): func_name = func.__name__ @@ -47,13 +47,14 @@ def processing(module: Optional[str] = None): kwargs["to_mask"] = _getMaskingThresh(kwargs) return func(data, field, flags, *args, **kwargs) + callWrapper._multi = multi FUNC_MAP[func_name] = callWrapper return callWrapper return inner -def flagging(masking: MaskingStrT = "all", module: Optional[str] = None): +def flagging(masking: MaskingStrT = "all", module: str = None, multi: bool = False): # executed on module import if masking not in ("all", "field", "none"): @@ -78,6 +79,7 @@ def flagging(masking: MaskingStrT = "all", module: Optional[str] = None): FUNC_MAP[func_name] = callWrapper callWrapper._module = module callWrapper._masking = masking + callWrapper._multi = multi return callWrapper @@ -243,8 +245,8 @@ def _getMaskingThresh(kwargs): def _isflagged( - flagscol: Union[np.array, pd.Series], thresh: float -) -> Union[np.array, pd.Series]: + flagscol: Union[np.ndarray, pd.Series], thresh: float +) -> Union[np.ndarray, pd.Series]: """ Return a mask of flags accordingly to `thresh`. Return type is same as flags. 
""" diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index ae5f63802..3b4428205 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- from functools import partial -from inspect import signature from typing import Sequence, Tuple, Union, Callable import numpy as np @@ -19,67 +18,72 @@ import operator as op from saqc.lib.tools import toSequence -_OP = {"<": op.lt, "<=": op.le, "==": op.eq, "!=": op.ne, ">": op.gt, ">=": op.ge} +# _OP = {"<": op.lt, "<=": op.le, "==": op.eq, "!=": op.ne, ">": op.gt, ">=": op.ge} -def _dslIsFlagged( - flags: Flags, var: pd.Series, flag: float = None, comparator: str = None -) -> Union[pd.Series, DictOfSeries]: - """ - helper function for `flag` +# def _dslIsFlagged( +# flags: Flags, var: pd.Series, flag: float = None, comparator: str = None +# ) -> Union[pd.Series, DictOfSeries]: +# """ +# helper function for `flag` - Param Combinations - ------------------ - - ``isflagged('var')`` : show me (anything) flagged - - ``isflagged('var', DOUBT)`` : show me ``flags >= DOUBT`` - - ``isflagged('var', DOUBT, '==')`` : show me ``flags == DOUBT`` +# Param Combinations +# ------------------ +# - ``isflagged('var')`` : show me (anything) flagged +# - ``isflagged('var', DOUBT)`` : show me ``flags >= DOUBT`` +# - ``isflagged('var', DOUBT, '==')`` : show me ``flags == DOUBT`` - Raises - ------ - ValueError: if `comparator` is passed but no `flag` vaule. Eg. ``isflagged('var', comparator='>=')`` - """ - if flag is None: - if comparator is not None: - raise ValueError("if `comparator` is used, explicitly pass a `flag` level.") - flag = UNFLAGGED - comparator = ">" +# Raises +# ------ +# ValueError: if `comparator` is passed but no `flag` vaule. Eg. ``isflagged('var', comparator='>=')`` +# """ +# if flag is None: +# if comparator is not None: +# raise ValueError("if `comparator` is used, explicitly pass a `flag` level.") +# flag = UNFLAGGED +# comparator = ">" - # default - if comparator is None: - comparator = ">=" +# # default +# if comparator is None: +# comparator = ">=" - _op = _OP[comparator] - return _op(flags[var.name], flag) +# _op = _OP[comparator] +# return _op(flags[var.name], flag) def _execGeneric( flags: Flags, data: DictOfSeries, func: Callable[[pd.Series], pd.Series], - fields: Sequence[str], -) -> pd.Series: +) -> Union[DictOfSeries, float]: # TODO: # - check series.index compatibility globs = { - "isflagged": partial(_dslIsFlagged, flags), - "ismissing": lambda var: pd.isnull(var), - "mask": lambda cond: data[cond.name].mask(cond), "GOOD": GOOD, "BAD": BAD, "UNFLAGGED": UNFLAGGED, **ENVIRONMENT, } + func.__globals__.update(globs) - return func(*[data[f] for f in fields]) + if isinstance(data, pd.Series): + data = data.to_frame() + + out = func(*[data[c] for c in data.columns]) + if isinstance(out, (np.ndarray, pd.Series)): + return DictOfSeries(out) + return out -@processing(module="generic") + +@processing(module="generic", multi=True) def process( data: DictOfSeries, - field: str, + field: Union[str, Sequence[str]], flags: Flags, + target: Union[str, Sequence[str]], func: Callable[[pd.Series], pd.Series], to_mask: float = UNFLAGGED, **kwargs, @@ -132,25 +136,27 @@ def process( >>> lambda temperature, uncertainty: np.round(temperature) * np.sqrt(uncertainty) """ + fields, targets = toSequence(field), toSequence(target) data_masked, _ = _maskData(data.copy(), flags, data.columns, to_mask) - data[field] = _execGeneric(flags, data_masked, func, field).squeeze() + value = _execGeneric(flags[fields], 
data_masked[fields], func) + data.aloc[targets] = value - if field in flags: - flags.drop(field) - - flags[field] = initFlagsLike(data[field])[field] + #NOTE: we generate new data, so we also need to drop existing flags + for t in targets: + if t in flags: + flags.drop(t) + flags[t] = initFlagsLike(data[t])[t] return data, flags -@flagging(masking="all", module="generic") +@flagging(masking="all", module="generic", multi=True) def flag( data: DictOfSeries, field: Union[str, Sequence[str]], flags: Flags, - target: Union[str, Sequence[str]], + target: Union[str, Sequence[str]], func: Callable[[pd.Series], pd.Series], - sources: Sequence[str], flag: float = BAD, to_mask: float = UNFLAGGED, **kwargs, @@ -252,19 +258,23 @@ def flag( >>> saqc.generic.flag(field=["x", "y", "z"], target=["a", "b"], lambda x, y, z: x < z, y > z ) """ - # we get the data unmasked, in order to also receive flags, - # so let's do to the masking manually - # data_masked, _ = _maskData(data, flags, data.columns, to_mask) + fields, targets = toSequence(field), toSequence(target) + value = _execGeneric(flags, data[fields].copy(), func) + + if len(target) != len(value.columns): + raise ValueError( + f"The generic function returned {len(value.columns)} fields, but we only got {len(target)} targets." + ) - mask = _execGeneric(flags, data, func, field).squeeze() - if np.isscalar(mask): + if np.isscalar(value): raise TypeError(f"generic expression does not return an array") - if not np.issubdtype(mask.dtype, np.bool_): + if not (value.dtypes == bool).all(): raise TypeError(f"generic expression does not return a boolean array") - for f in toSequence(field): - mask &= ~_isflagged(flags[f], to_mask) + for f in fields: + value = value & _isflagged(flags[f], thresh=to_mask) - flags[mask, target] = flag + for i, t in enumerate(targets): + flags[value[i], t] = flag return data, flags diff --git a/saqc/funcs/tools.py b/saqc/funcs/tools.py index 67494c275..54ed0cf77 100644 --- a/saqc/funcs/tools.py +++ b/saqc/funcs/tools.py @@ -22,7 +22,7 @@ _MPL_DEFAULT_BACKEND = mpl.get_backend() @processing(module="tools") def copy( - data: DictOfSeries, field: str, flags: Flags, new_field: str, **kwargs + data: DictOfSeries, field: str, flags: Flags, target: str, **kwargs ) -> Tuple[DictOfSeries, Flags]: """ The function generates a copy of the data "field" and inserts it under the name field + suffix into the existing @@ -36,7 +36,7 @@ def copy( The fieldname of the data column, you want to fork (copy). flags : saqc.Flags Container to store quality flags to data. - new_field: str + target: str Target name. Returns @@ -48,12 +48,12 @@ def copy( The quality flags of data Flags shape may have changed relatively to the flags input. """ - if new_field in flags.columns.union(data.columns): + if target in flags.columns.union(data.columns): raise ValueError(f"{field}: field already exist") - data[new_field] = data[field].copy() + data[target] = data[field].copy() # implicit copy in history access - flags.history[new_field] = flags.history[field] + flags.history[target] = flags.history[field] return data, flags diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py index f5c10b90c..8ed97449f 100644 --- a/tests/funcs/test_generic_api_functions.py +++ b/tests/funcs/test_generic_api_functions.py @@ -1,13 +1,16 @@ #! 
/usr/bin/env python # -*- coding: utf-8 -*- +from dataclasses import field, fields import pytest import pandas as pd +from dios.dios.dios import DictOfSeries from saqc.constants import * from saqc.core.register import flagging from saqc.funcs.tools import mask from saqc import SaQC +from saqc.lib.tools import toSequence from tests.common import initData, flagAll @@ -23,41 +26,41 @@ def data(): def test_addFieldFlagGeneric(data): saqc = SaQC(data=data) - func = lambda var1: pd.Series(False, index=data[var1.name].index) - data, flags = saqc.generic.flag("tmp1", func, flag=BAD).getResult() - assert "tmp1" in flags.columns and "tmp1" not in data + func = lambda x: pd.Series(False, index=x.index) + data, flags = saqc.generic.flag(field="var1", target="tmp1", func=func, flag=BAD).getResult() + assert "tmp1" in flags.columns and "tmp1" in data def test_addFieldProcGeneric(data): saqc = SaQC(data=data) - - func = lambda: pd.Series([]) - data, flags = saqc.generic.process("tmp1", func, flag=BAD).getResult(raw=True) - assert "tmp1" in data.columns and data["tmp1"].empty - - func = lambda var1, var2: var1 + var2 - data, flags = saqc.generic.process("tmp2", func, flag=BAD).getResult() - assert "tmp2" in data.columns and (data["tmp2"] == data["var1"] + data["var2"]).all( - axis=None - ) - - -def test_mask(data): - saqc = SaQC(data=data) - data_org = data.copy(deep=True) - mean = data["var1"] / 2 - - data, _ = saqc.generic.process( - "var1", lambda var1: mask(var1 < mean), flag=BAD - ).getResult() - assert ( - (data["var1"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna() - ).all(axis=None) - - data, flags = saqc.generic.process( - "tmp", lambda var1: mask(var1 < mean), flag=BAD - ).getResult() - assert ("tmp" in data.columns) and ("tmp" in flags.columns) - assert ( - (data["tmp"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna() - ).all(axis=None) + fields = ["var1", "var2"] + params = [ + ("tmp", lambda x, y: x + y), + # (["tmp1", "tmp2"], lambda x, y: (x + y, y*2)) + ] + for target, func in params: + expected = DictOfSeries(func(*[data[f] for f in fields]), columns=toSequence(target)) + data, _ = saqc.generic.process(field=fields, target=target, func=func, flag=BAD).getResult(raw=True) + # import pdb; pdb.set_trace() + # assert (data[target] == expected).all(axis=None) + + +# def test_mask(data): +# saqc = SaQC(data=data) +# data_org = data.copy(deep=True) +# mean = data["var1"] / 2 + +# data, _ = saqc.generic.process( +# "var1", lambda x: mask(x < mean), flag=BAD +# ).getResult() +# assert ( +# (data["var1"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna() +# ).all(axis=None) + +# data, flags = saqc.generic.process( +# field="var1", target="tmp", func=lambda x: mask(x < mean), flag=BAD +# ).getResult() +# assert ("tmp" in data.columns) and ("tmp" in flags.columns) +# assert ( +# (data["tmp"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna() +# ).all(axis=None) diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py index 1298e7317..96a0c8fbc 100644 --- a/tests/funcs/test_generic_config_functions.py +++ b/tests/funcs/test_generic_config_functions.py @@ -14,6 +14,7 @@ from saqc.core.visitor import ConfigFunctionParser from saqc.core.register import flagging from saqc.funcs.generic import _execGeneric from saqc import SaQC +from saqc.lib.tools import toSequence from tests.common import initData, writeIO @@ -38,30 +39,13 @@ def data_diff(): ) -def _compileGeneric(expr, flags): +def _compileGeneric(expr): tree 
= ast.parse(expr, mode="eval") _, kwargs = ConfigFunctionParser().parse(tree.body) return kwargs["func"] -def test_missingIdentifier(data): - flags = Flags() - - # NOTE: - # - the error is only raised at runtime during parsing would be better - tests = [ - "fff(var2) < 5", - "var3 != 42", - ] - - for test in tests: - func = _compileGeneric(f"generic.flag(func={test})", flags) - with pytest.raises(NameError): - _execGeneric(flags, data, func, field="") - - def test_syntaxError(): - flags = Flags() tests = [ "range(x=5", "rangex=5)", @@ -70,42 +54,39 @@ def test_syntaxError(): for test in tests: with pytest.raises(SyntaxError): - _compileGeneric(f"flag(func={test})", flags) + _compileGeneric(f"flag(func={test})") def test_typeError(): """ test that forbidden constructs actually throw an error - TODO: find a few more cases or get rid of the test """ - flags = Flags() - # : think about cases that should be forbidden + # TODO: think about cases that should be forbidden tests = ("lambda x: x * 2",) for test in tests: with pytest.raises(TypeError): - _compileGeneric(f"generic.flag(func={test})", flags) + _compileGeneric(f"generic.flag(func={test})") def test_comparisonOperators(data): flags = initFlagsLike(data) var1, var2, *_ = data.columns - this = var1 tests = [ - ("this > 100", data[this] > 100), - (f"10 >= {var2}", 10 >= data[var2]), - (f"{var2} < 100", data[var2] < 100), - (f"this <= {var2}", data[this] <= data[var2]), - (f"{var1} == {var2}", data[this] == data[var2]), - (f"{var1} != {var2}", data[this] != data[var2]), + ("var1", "x > 100", data[var1] > 100), + ("var2", "10 >= y", 10 >= data[var2]), + ("var2", f"y < 100", data[var2] < 100), + (["var1", "var2"], "x <= y", data[var1] <= data[var2]), + (["var1", "var2"], "x == y", data[var1] == data[var2]), + (["var1", "var2"], "x != y", data[var1] != data[var2]), ] - for test, expected in tests: - func = _compileGeneric(f"generic.flag(func={test})", flags) - result = _execGeneric(flags, data, func, field=var1) - assert np.all(result == expected) + for fields, test, expected in tests: + func = _compileGeneric(f"generic.flag(func={test})") + result = _execGeneric(flags[toSequence(fields)], data[fields], func) + assert (result == expected).all(axis=None) def test_arithmeticOperators(data): @@ -123,149 +104,89 @@ def test_arithmeticOperators(data): ] for test, expected in tests: - func = _compileGeneric(f"generic.process(func={test})", flags) - result = _execGeneric(flags, data, func, field=var1) - assert np.all(result == expected) + func = _compileGeneric(f"generic.process(func={test})") + result = _execGeneric(flags[[this.name]], this, func) + assert (result == expected).all(axis=None) + def test_nonReduncingBuiltins(data): flags = initFlagsLike(data) var1, *_ = data.columns - this = var1 - mean = data[var1].mean() + this = data[var1] tests = [ - (f"abs({this})", np.abs(data[this])), - (f"log({this})", np.log(data[this])), - (f"exp({this})", np.exp(data[this])), - ( - f"ismissing(mask({this} < {mean}))", - data[this].mask(data[this] < mean).isna(), - ), + ("abs(x)", np.abs(this)), + ("log(x)", np.log(this)), + ("exp(x)", np.exp(this)), ] for test, expected in tests: - func = _compileGeneric(f"generic.process(func={test})", flags) - result = _execGeneric(flags, data, func, field=this) - assert (result == expected).all() + func = _compileGeneric(f"generic.process(func={test})") + result = _execGeneric(flags[[this.name]], this, func) + assert (result == expected).all(axis=None) def test_reduncingBuiltins(data): data.loc[::4] = np.nan flags = 
initFlagsLike(data) var1 = data.columns[0] - this = data.iloc[:, 0] + this = data[var1] tests = [ - ("min(this)", np.nanmin(this)), - (f"max({var1})", np.nanmax(this)), - (f"sum({var1})", np.nansum(this)), - ("mean(this)", np.nanmean(this)), - (f"std({this.name})", np.std(this)), - (f"len({this.name})", len(this)), + ("min(x)", np.nanmin(this)), + ("max(x)", np.nanmax(this)), + ("sum(x)", np.nansum(this)), + ("mean(x)", np.nanmean(this)), + ("std(x)", np.std(this)), + ("len(x)", len(this)), ] for test, expected in tests: - func = _compileGeneric(f"generic.process(func={test})", flags) - result = _execGeneric(flags, data, func, field=this.name) + func = _compileGeneric(f"generic.process(func={test})") + result = _execGeneric(flags[[this.name]], this, func) assert result == expected -def test_ismissing(data): - - flags = initFlagsLike(data) - data.iloc[: len(data) // 2, 0] = np.nan - data.iloc[(len(data) // 2) + 1 :, 0] = -9999 - this = data.iloc[:, 0] - - tests = [ - (f"ismissing({this.name})", pd.isnull(this)), - (f"~ismissing({this.name})", pd.notnull(this)), - ] - - for test, expected in tests: - func = _compileGeneric(f"generic.flag(func={test})", flags) - result = _execGeneric(flags, data, func, this.name) - assert np.all(result == expected) - - def test_bitOps(data): var1, var2, *_ = data.columns - this = var1 - - flags = initFlagsLike(data) - - tests = [ - ("~(this > mean(this))", ~(data[this] > np.nanmean(data[this]))), - (f"(this <= 0) | (0 < {var1})", (data[this] <= 0) | (0 < data[var1])), - (f"({var2} >= 0) & (0 > this)", (data[var2] >= 0) & (0 > data[this])), - ] - for test, expected in tests: - func = _compileGeneric(f"generic.flag(func={test})", flags) - result = _execGeneric(flags, data, func, this) - assert np.all(result == expected) - - -def test_isflagged(data): - - var1, var2, *_ = data.columns flags = initFlagsLike(data) - flags[data[var1].index[::2], var1] = BAD tests = [ - (f"isflagged({var1})", flags[var1] > UNFLAGGED), - (f"isflagged({var1}, flag=BAD)", flags[var1] >= BAD), - (f"isflagged({var1}, UNFLAGGED, '==')", flags[var1] == UNFLAGGED), - (f"~isflagged({var2})", flags[var2] == UNFLAGGED), - ( - f"~({var2}>999) & (~isflagged({var2}))", - ~(data[var2] > 999) & (flags[var2] == UNFLAGGED), - ), + (var1, "~(x > mean(x))", ~(data[var1] > np.nanmean(data[var1]))), + (var1, "(x <= 0) | (0 < x)", (data[var1] <= 0) | (0 < data[var1])), + ([var1, var2], "(y>= 0) & (0 > x)", (data[var2] >= 0) & (0 > data[var1])), ] - for i, (test, expected) in enumerate(tests): - try: - func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)", flags) - result = _execGeneric(flags, data, func, field=None) - assert np.all(result == expected) - except Exception: - print(i, test) - raise - - # test bad combination - for comp in [">", ">=", "==", "!=", "<", "<="]: - fails = f"isflagged({var1}, comparator='{comp}')" - - func = _compileGeneric(f"generic.flag(func={fails}, flag=BAD)", flags) - with pytest.raises(ValueError): - _execGeneric(flags, data, func, field=None) + for field, test, expected in tests: + func = _compileGeneric(f"generic.flag(func={test})") + result = _execGeneric(flags[toSequence(field)], data[field], func) + assert (result == expected).all(axis=None) def test_variableAssignments(data): - var1, var2, *_ = data.columns config = f""" varname ; test - dummy1 ; generic.process(func=var1 + var2) - dummy2 ; generic.flag(func=var1 + var2 > 0) + dummy1 ; generic.process(field=["var1", "var2"], func=x + y) + dummy2 ; generic.flag(field=["var1", "var2"], func=x + y > 0) """ fobj = 
writeIO(config) saqc = fromConfig(fobj, data) result_data, result_flags = saqc.getResult(raw=True) - assert set(result_data.columns) == set(data.columns) | { - "dummy1", - } - assert set(result_flags.columns) == set(data.columns) | {"dummy1", "dummy2"} + expected_columns = set(data.columns) | {"dummy1", "dummy2"} + assert set(result_data.columns) == expected_columns + assert set(result_flags.columns) == expected_columns def test_processMultiple(data_diff): config = f""" varname ; test - dummy ; generic.process(func=var1 + 1) - dummy ; generic.process(func=var2 - 1) + dummy ; generic.process(field="var1", func=x + 1) + dummy ; generic.process(field="var2", func=y - 1) """ fobj = writeIO(config) @@ -280,7 +201,8 @@ def test_callableArgumentsUnary(data): @flagging(masking="field") def testFuncUnary(data, field, flags, func, **kwargs): - data[field] = data[field].rolling(window=window).apply(func) + value = data[field].rolling(window=window).apply(func) + data[field] = value return data, initFlagsLike(data) var = data.columns[0] -- GitLab From d82325c0f46d766cb5fcd0208e1a029e9b163c00 Mon Sep 17 00:00:00 2001 From: David Schaefer <david.schaefer@ufz.de> Date: Tue, 19 Oct 2021 18:06:02 +0200 Subject: [PATCH 3/5] remove `flags` parameter from _execGeneric --- saqc/core/flags.py | 12 +---------- saqc/funcs/generic.py | 5 ++--- tests/funcs/test_generic_api_functions.py | 22 -------------------- tests/funcs/test_generic_config_functions.py | 14 +++++-------- 4 files changed, 8 insertions(+), 45 deletions(-) diff --git a/saqc/core/flags.py b/saqc/core/flags.py index d9a657db6..2f53bce79 100644 --- a/saqc/core/flags.py +++ b/saqc/core/flags.py @@ -284,18 +284,8 @@ class Flags: # ---------------------------------------------------------------------- # item access - @overload def __getitem__(self, key: str) -> pd.Series: - ... - - @overload - def __getitem__(self, key: Sequence[str]) -> "Flags": - ... - - def __getitem__(self, key: Union[str, Sequence[str]]) -> Union[pd.Series, "Flags"]: - if isinstance(key, str): - return self._data[key].max() - return Flags({k: self[k] for k in key}) + return self._data[key].max() def __setitem__(self, key: SelectT, value: ValueT): # force-KW is only internally available diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index 3b4428205..57c90c3b0 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -52,7 +52,6 @@ from saqc.lib.tools import toSequence def _execGeneric( - flags: Flags, data: DictOfSeries, func: Callable[[pd.Series], pd.Series], ) -> Union[DictOfSeries, float]: @@ -138,7 +137,7 @@ def process( fields, targets = toSequence(field), toSequence(target) data_masked, _ = _maskData(data.copy(), flags, data.columns, to_mask) - value = _execGeneric(flags[fields], data_masked[fields], func) + value = _execGeneric(data_masked[fields], func) data.aloc[targets] = value #NOTE: we generate new data, so we also need to drop existing flags @@ -259,7 +258,7 @@ def flag( """ fields, targets = toSequence(field), toSequence(target) - value = _execGeneric(flags, data[fields].copy(), func) + value = _execGeneric(data[fields].copy(), func) if len(target) != len(value.columns): raise ValueError( diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py index 8ed97449f..76f9dde99 100644 --- a/tests/funcs/test_generic_api_functions.py +++ b/tests/funcs/test_generic_api_functions.py @@ -1,14 +1,12 @@ #! 
/usr/bin/env python # -*- coding: utf-8 -*- -from dataclasses import field, fields import pytest import pandas as pd from dios.dios.dios import DictOfSeries from saqc.constants import * from saqc.core.register import flagging -from saqc.funcs.tools import mask from saqc import SaQC from saqc.lib.tools import toSequence @@ -44,23 +42,3 @@ def test_addFieldProcGeneric(data): # import pdb; pdb.set_trace() # assert (data[target] == expected).all(axis=None) - -# def test_mask(data): -# saqc = SaQC(data=data) -# data_org = data.copy(deep=True) -# mean = data["var1"] / 2 - -# data, _ = saqc.generic.process( -# "var1", lambda x: mask(x < mean), flag=BAD -# ).getResult() -# assert ( -# (data["var1"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna() -# ).all(axis=None) - -# data, flags = saqc.generic.process( -# field="var1", target="tmp", func=lambda x: mask(x < mean), flag=BAD -# ).getResult() -# assert ("tmp" in data.columns) and ("tmp" in flags.columns) -# assert ( -# (data["tmp"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna() -# ).all(axis=None) diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py index 96a0c8fbc..5dcfdbb65 100644 --- a/tests/funcs/test_generic_config_functions.py +++ b/tests/funcs/test_generic_config_functions.py @@ -71,7 +71,6 @@ def test_typeError(): def test_comparisonOperators(data): - flags = initFlagsLike(data) var1, var2, *_ = data.columns tests = [ @@ -85,12 +84,11 @@ def test_comparisonOperators(data): for fields, test, expected in tests: func = _compileGeneric(f"generic.flag(func={test})") - result = _execGeneric(flags[toSequence(fields)], data[fields], func) + result = _execGeneric(data[fields], func) assert (result == expected).all(axis=None) def test_arithmeticOperators(data): - flags = initFlagsLike(data) var1, *_ = data.columns this = data[var1] @@ -105,13 +103,12 @@ def test_arithmeticOperators(data): for test, expected in tests: func = _compileGeneric(f"generic.process(func={test})") - result = _execGeneric(flags[[this.name]], this, func) + result = _execGeneric(this, func) assert (result == expected).all(axis=None) def test_nonReduncingBuiltins(data): - flags = initFlagsLike(data) var1, *_ = data.columns this = data[var1] @@ -123,13 +120,12 @@ def test_nonReduncingBuiltins(data): for test, expected in tests: func = _compileGeneric(f"generic.process(func={test})") - result = _execGeneric(flags[[this.name]], this, func) + result = _execGeneric(this, func) assert (result == expected).all(axis=None) def test_reduncingBuiltins(data): data.loc[::4] = np.nan - flags = initFlagsLike(data) var1 = data.columns[0] this = data[var1] @@ -144,7 +140,7 @@ def test_reduncingBuiltins(data): for test, expected in tests: func = _compileGeneric(f"generic.process(func={test})") - result = _execGeneric(flags[[this.name]], this, func) + result = _execGeneric(this, func) assert result == expected @@ -161,7 +157,7 @@ def test_bitOps(data): for field, test, expected in tests: func = _compileGeneric(f"generic.flag(func={test})") - result = _execGeneric(flags[toSequence(field)], data[field], func) + result = _execGeneric(data[field], func) assert (result == expected).all(axis=None) -- GitLab From 6c1cd2607ea543f5a0b841b07dcfb04a421a1a64 Mon Sep 17 00:00:00 2001 From: David Schaefer <david.schaefer@ufz.de> Date: Tue, 19 Oct 2021 18:24:48 +0200 Subject: [PATCH 4/5] fix process --- saqc/funcs/generic.py | 2 +- tests/funcs/test_generic_api_functions.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff 
--git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index 57c90c3b0..440085e43 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -138,7 +138,7 @@ def process( fields, targets = toSequence(field), toSequence(target) data_masked, _ = _maskData(data.copy(), flags, data.columns, to_mask) value = _execGeneric(data_masked[fields], func) - data.aloc[targets] = value + data[targets] = value #NOTE: we generate new data, so we also need to drop existing flags for t in targets: diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py index 76f9dde99..3d79bcc74 100644 --- a/tests/funcs/test_generic_api_functions.py +++ b/tests/funcs/test_generic_api_functions.py @@ -34,11 +34,10 @@ def test_addFieldProcGeneric(data): fields = ["var1", "var2"] params = [ ("tmp", lambda x, y: x + y), - # (["tmp1", "tmp2"], lambda x, y: (x + y, y*2)) + (["tmp1", "tmp2"], lambda x, y: (x + y, y*2)) ] for target, func in params: expected = DictOfSeries(func(*[data[f] for f in fields]), columns=toSequence(target)) data, _ = saqc.generic.process(field=fields, target=target, func=func, flag=BAD).getResult(raw=True) - # import pdb; pdb.set_trace() - # assert (data[target] == expected).all(axis=None) + assert (expected == data[target]).all(axis=None) -- GitLab From 9ab90fc0dee90804622392b64be4ac4538c44140 Mon Sep 17 00:00:00 2001 From: David Schaefer <david.schaefer@ufz.de> Date: Wed, 20 Oct 2021 16:55:08 +0200 Subject: [PATCH 5/5] bring back `ismissing` --- saqc/core/flags.py | 12 ++- saqc/funcs/generic.py | 49 +++-------- tests/funcs/test_generic_config_functions.py | 92 +++++++++++--------- 3 files changed, 70 insertions(+), 83 deletions(-) diff --git a/saqc/core/flags.py b/saqc/core/flags.py index 2f53bce79..d9a657db6 100644 --- a/saqc/core/flags.py +++ b/saqc/core/flags.py @@ -284,8 +284,18 @@ class Flags: # ---------------------------------------------------------------------- # item access + @overload def __getitem__(self, key: str) -> pd.Series: - return self._data[key].max() + ... + + @overload + def __getitem__(self, key: Sequence[str]) -> "Flags": + ... + + def __getitem__(self, key: Union[str, Sequence[str]]) -> Union[pd.Series, "Flags"]: + if isinstance(key, str): + return self._data[key].max() + return Flags({k: self[k] for k in key}) def __setitem__(self, key: SelectT, value: ValueT): # force-KW is only internally available diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index 440085e43..89c0f56b6 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -18,48 +18,18 @@ import operator as op from saqc.lib.tools import toSequence -# _OP = {"<": op.lt, "<=": op.le, "==": op.eq, "!=": op.ne, ">": op.gt, ">=": op.ge} - - -# def _dslIsFlagged( -# flags: Flags, var: pd.Series, flag: float = None, comparator: str = None -# ) -> Union[pd.Series, DictOfSeries]: -# """ -# helper function for `flag` - -# Param Combinations -# ------------------ -# - ``isflagged('var')`` : show me (anything) flagged -# - ``isflagged('var', DOUBT)`` : show me ``flags >= DOUBT`` -# - ``isflagged('var', DOUBT, '==')`` : show me ``flags == DOUBT`` - -# Raises -# ------ -# ValueError: if `comparator` is passed but no `flag` vaule. Eg. 
``isflagged('var', comparator='>=')`` -# """ -# if flag is None: -# if comparator is not None: -# raise ValueError("if `comparator` is used, explicitly pass a `flag` level.") -# flag = UNFLAGGED -# comparator = ">" - -# # default -# if comparator is None: -# comparator = ">=" - -# _op = _OP[comparator] -# return _op(flags[var.name], flag) - def _execGeneric( + flags: Flags, data: DictOfSeries, func: Callable[[pd.Series], pd.Series], -) -> Union[DictOfSeries, float]: +) -> DictOfSeries: # TODO: # - check series.index compatibility globs = { + "isflagged": lambda data: _isflagged(flags[data.name], thresh=UNFLAGGED), "GOOD": GOOD, "BAD": BAD, "UNFLAGGED": UNFLAGGED, @@ -72,9 +42,7 @@ def _execGeneric( data = data.to_frame() out = func(*[data[c] for c in data.columns]) - if isinstance(out, (np.ndarray, pd.Series)): - return DictOfSeries(out) - return out + return DictOfSeries(out) @processing(module="generic", multi=True) @@ -137,10 +105,11 @@ def process( fields, targets = toSequence(field), toSequence(target) data_masked, _ = _maskData(data.copy(), flags, data.columns, to_mask) - value = _execGeneric(data_masked[fields], func) + value = _execGeneric(flags[fields], data_masked[fields], func) data[targets] = value - #NOTE: we generate new data, so we also need to drop existing flags + # NOTE: we generate new data, so we also need to drop existing flags + # TODO: transfer the flags from the input to the output fields for t in targets: if t in flags: flags.drop(t) @@ -258,7 +227,7 @@ def flag( """ fields, targets = toSequence(field), toSequence(target) - value = _execGeneric(data[fields].copy(), func) + value = _execGeneric(flags[fields], data[fields].copy(), func) if len(target) != len(value.columns): raise ValueError( @@ -270,9 +239,11 @@ def flag( if not (value.dtypes == bool).all(): raise TypeError(f"generic expression does not return a boolean array") + # transfer the flags from the input fields for f in fields: value = value & _isflagged(flags[f], thresh=to_mask) + # set the newly generated flags for i, t in enumerate(targets): flags[value[i], t] = flag diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py index 5dcfdbb65..6d51c43ca 100644 --- a/tests/funcs/test_generic_config_functions.py +++ b/tests/funcs/test_generic_config_functions.py @@ -72,92 +72,75 @@ def test_typeError(): def test_comparisonOperators(data): var1, var2, *_ = data.columns + flags = initFlagsLike(data) tests = [ - ("var1", "x > 100", data[var1] > 100), - ("var2", "10 >= y", 10 >= data[var2]), - ("var2", f"y < 100", data[var2] < 100), + (["var1"], "x > 100", data[var1] > 100), + (["var2"], "10 >= y", 10 >= data[var2]), + (["var2"], f"y < 100", data[var2] < 100), (["var1", "var2"], "x <= y", data[var1] <= data[var2]), (["var1", "var2"], "x == y", data[var1] == data[var2]), (["var1", "var2"], "x != y", data[var1] != data[var2]), ] - for fields, test, expected in tests: + for field, test, expected in tests: func = _compileGeneric(f"generic.flag(func={test})") - result = _execGeneric(data[fields], func) + result = _execGeneric(flags[field], data[field], func) assert (result == expected).all(axis=None) def test_arithmeticOperators(data): + var1, *_ = data.columns - this = data[var1] + + data = data[var1] + flags = initFlagsLike(data)[[var1]] tests = [ - ("var1 + 100 > 110", this + 100 > 110), - ("var1 - 100 > 0", this - 100 > 0), - ("var1 * 100 > 200", this * 100 > 200), - ("var1 / 100 > .1", this / 100 > 0.1), - ("var1 % 2 == 1", this % 2 == 1), - ("var1 ** 2 == 0", this 
** 2 == 0), + ("var1 + 100 > 110", data + 100 > 110), + ("var1 - 100 > 0", data - 100 > 0), + ("var1 * 100 > 200", data * 100 > 200), + ("var1 / 100 > .1", data / 100 > 0.1), + ("var1 % 2 == 1", data % 2 == 1), + ("var1 ** 2 == 0", data ** 2 == 0), ] for test, expected in tests: func = _compileGeneric(f"generic.process(func={test})") - result = _execGeneric(this, func) + result = _execGeneric(flags, data, func) assert (result == expected).all(axis=None) - def test_nonReduncingBuiltins(data): var1, *_ = data.columns - this = data[var1] + data = data[var1] + flags = initFlagsLike(data)[[var1]] tests = [ - ("abs(x)", np.abs(this)), - ("log(x)", np.log(this)), - ("exp(x)", np.exp(this)), + ("abs(x)", np.abs(data)), + ("log(x)", np.log(data)), + ("exp(x)", np.exp(data)), ] for test, expected in tests: func = _compileGeneric(f"generic.process(func={test})") - result = _execGeneric(this, func) + result = _execGeneric(flags, data, func) assert (result == expected).all(axis=None) -def test_reduncingBuiltins(data): - data.loc[::4] = np.nan - var1 = data.columns[0] - this = data[var1] - - tests = [ - ("min(x)", np.nanmin(this)), - ("max(x)", np.nanmax(this)), - ("sum(x)", np.nansum(this)), - ("mean(x)", np.nanmean(this)), - ("std(x)", np.std(this)), - ("len(x)", len(this)), - ] - - for test, expected in tests: - func = _compileGeneric(f"generic.process(func={test})") - result = _execGeneric(this, func) - assert result == expected - - def test_bitOps(data): var1, var2, *_ = data.columns - flags = initFlagsLike(data) tests = [ - (var1, "~(x > mean(x))", ~(data[var1] > np.nanmean(data[var1]))), - (var1, "(x <= 0) | (0 < x)", (data[var1] <= 0) | (0 < data[var1])), + ([var1], "~(x > mean(x))", ~(data[var1] > np.nanmean(data[var1]))), + ([var1], "(x <= 0) | (0 < x)", (data[var1] <= 0) | (0 < data[var1])), ([var1, var2], "(y>= 0) & (0 > x)", (data[var2] >= 0) & (0 > data[var1])), ] for field, test, expected in tests: func = _compileGeneric(f"generic.flag(func={test})") - result = _execGeneric(data[field], func) + result = _execGeneric(flags[field], data[field], func) assert (result == expected).all(axis=None) @@ -247,3 +230,26 @@ def test_callableArgumentsBinary(data): expected = func(data[var1], data[var2]) assert (result_config[var1].dropna() == expected.dropna()).all(axis=None) assert (result_api[var1].dropna() == expected.dropna()).all(axis=None) + + +def test_isflagged(data): + + var1, var2, *_ = data.columns + flags = initFlagsLike(data) + flags[data[var1].index[::2], var1] = BAD + + tests = [ + ([var1], f"isflagged(x)", flags[var1] > UNFLAGGED), + ([var1], f"isflagged(x)", flags[var1] >= BAD), + ([var2], f"~isflagged(x)", flags[var2] == UNFLAGGED), + ( + [var1, var2], + f"~(x > 999) & (~isflagged(y))", + ~(data[var1] > 999) & (flags[var2] == UNFLAGGED), + ), + ] + + for field, test, expected in tests: + func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)") + result = _execGeneric(flags[field], data[field], func) + assert (result == expected).all(axis=None) -- GitLab
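
Usage sketch for the multivariate `field`/`target` interface introduced by this series, distilled from `tests/funcs/test_generic_api_functions.py` and `ressources/data/config_ci.csv`. The DataFrame construction, index and column names below are illustrative assumptions and not part of the patches; only the `generic.flag`/`generic.process` call signatures and the `SaQC`/`getResult` usage are taken from the tests above.

import numpy as np
import pandas as pd

from saqc import SaQC
from saqc.constants import BAD

# hypothetical input data: two columns sharing one index
index = pd.date_range("2021-01-01", periods=10, freq="1H")
data = pd.DataFrame(
    {"var1": np.arange(10.0), "var2": np.arange(10.0)[::-1]}, index=index
)

saqc = SaQC(data=data)

# multivariate flagging: two input fields, flags written to a single target column
saqc = saqc.generic.flag(
    field=["var1", "var2"], target="dummy", func=lambda x, y: x > y, flag=BAD
)

# multivariate processing: two input fields, two newly created data columns
saqc = saqc.generic.process(
    field=["var1", "var2"], target=["sum", "doubled"], func=lambda x, y: (x + y, y * 2)
)

processed, flags = saqc.getResult()

Univariate calls (e.g. `field="var1", target="tmp1"`) remain available; the broadcasting between `field` and `target` is handled by the `_wrap` machinery in `saqc/core/core.py`.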