From 1e5f8484a3b615bdfff238edd42c48c8f4d34359 Mon Sep 17 00:00:00 2001
From: David Schaefer <david.schaefer@ufz.de>
Date: Wed, 13 Oct 2021 12:51:50 +0200
Subject: [PATCH 1/5] generic: first steps toward multivariate field/target support

---
 saqc/funcs/generic.py | 70 ++++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py
index cc2142cb5..2291217bb 100644
--- a/saqc/funcs/generic.py
+++ b/saqc/funcs/generic.py
@@ -3,7 +3,7 @@
 
 from functools import partial
 from inspect import signature
-from typing import Tuple, Union, Callable
+from typing import Sequence, Tuple, Union, Callable
 
 import numpy as np
 import pandas as pd
@@ -17,6 +17,8 @@ from saqc.core.visitor import ENVIRONMENT
 
 import operator as op
 
+from saqc.lib.tools import toSequence
+
 _OP = {"<": op.lt, "<=": op.le, "==": op.eq, "!=": op.ne, ">": op.gt, ">=": op.ge}
 
 
@@ -54,33 +56,23 @@ def _execGeneric(
     flags: Flags,
     data: DictOfSeries,
     func: Callable[[pd.Series], pd.Series],
-    field: str,
+    fields: Sequence[str],
 ) -> pd.Series:
+
     # TODO:
     # - check series.index compatibility
-    # - field is only needed to translate 'this' parameters
-    #    -> maybe we could do the translation on the tree instead
-
-    sig = signature(func)
-    args = []
-    for k, v in sig.parameters.items():
-        k = field if k == "this" else k
-        if k not in data:
-            raise NameError(f"variable '{k}' not found")
-        args.append(data[k])
 
     globs = {
         "isflagged": partial(_dslIsFlagged, flags),
         "ismissing": lambda var: pd.isnull(var),
         "mask": lambda cond: data[cond.name].mask(cond),
-        "this": field,
         "GOOD": GOOD,
         "BAD": BAD,
         "UNFLAGGED": UNFLAGGED,
         **ENVIRONMENT,
     }
     func.__globals__.update(globs)
-    return func(*args)
+    return func(*[data[f] for f in fields])
 
 
 @processing(module="generic")
@@ -155,9 +147,11 @@ def process(
 @flagging(masking="none", module="generic")
 def flag(
     data: DictOfSeries,
-    field: str,
+    field: Union[str, Sequence[str]],
     flags: Flags,
+    target:  Union[str, Sequence[str]],
     func: Callable[[pd.Series], pd.Series],
+    sources: Sequence[str],
     flag: float = BAD,
     to_mask: float = UNFLAGGED,
     **kwargs,
@@ -183,17 +177,18 @@ def flag(
 
     Parameters
     ----------
-    data : dios.DictOfSeries
+    data : DictOfSeries
         A dictionary of pandas.Series, holding all the data.
-    field : str
-        The fieldname of the column, where you want the result from the generic expressions evaluation to be projected
-        to.
-    flags : saqc.Flags
+    field : str or list of str
+        Name of the column(s), holding the data-to-be-flagged.
+    flags : Flags
         Container to store flags of the data.
+    target : str or list of str
+        Name of the column(s) to write the results to.
     func : Callable
-        The expression that is to be evaluated is passed in form of a callable, with parameter names that will be
-        interpreted as data column entries. The Callable must return an boolen array like.
-        See the examples section to learn more.
+        Function to call on `field` or `sources`. Must return a boolean pd.Series | np.ndarray
+    sources : list of str
+        Sequence of field names.
     flag : float, default BAD
         flag to set.
 
@@ -236,7 +231,28 @@ def flag(
     Your expression also is allowed to include pandas and numpy functions
 
     >>> lambda level: np.sqrt(level) > 7
+
+    TEMP:
+
+    Multiple fields, single target
+
+    # just work on a single field
+    >>> saqc.generic.flag(field="a", lambda x: x < 0)
+
+    # use as multivariate function: multiple in, single out
+    >>> saqc.generic.flag(field=["x", "y", "z"], target="a", lambda x, y, z: x + y > z)
+
+    # use as multivariate function: multiple in, broadcast to all `field`s
+    >>> saqc.generic.flag(field=["x", "y", "z"], lambda x, y, z: x + y > z)
+
+    # use as multivariate function: multiple in, broadcast to all `targets`
+    >>> saqc.generic.flag(field=["x", "y", "z"], targets=["a", "b", "c"], lambda x, y, z: x + y > z)
+
+
+    # not supported
+    >>> saqc.generic.flag(field=["x", "y", "z"], target=["a", "b"], lambda x, y, z: x < z, y > z )
     """
+
     # we get the data unmasked, in order to also receive flags,
     # so let's do to the masking manually
     data_masked, _ = _maskData(data, flags, data.columns, to_mask)
@@ -247,11 +263,9 @@ def flag(
     if not np.issubdtype(mask.dtype, np.bool_):
         raise TypeError(f"generic expression does not return a boolean array")
 
-    if field not in flags:
-        flags[field] = pd.Series(data=UNFLAGGED, index=mask.index, name=field)
-
-    mask = ~_isflagged(flags[field], to_mask) & mask
+    for f in toSequence(field):
+        mask &= ~_isflagged(flags[f], to_mask)
 
-    flags[mask, field] = flag
+    flags[mask, target] = flag
 
     return data, flags
-- 
GitLab


From 8b069be1adf9c749c1c95c3439e51d10c6bb6e54 Mon Sep 17 00:00:00 2001
From: David Schaefer <david.schaefer@ufz.de>
Date: Tue, 19 Oct 2021 18:02:23 +0200
Subject: [PATCH 2/5] generic: first passing draft of the multivariate field/target workflow

---
 ressources/data/config_ci.csv                |   4 +-
 saqc/core/core.py                            | 155 ++++++++++++++--
 saqc/core/flags.py                           |  16 +-
 saqc/core/modules/tools.py                   |   2 +-
 saqc/core/reader.py                          |   2 +-
 saqc/core/register.py                        |  10 +-
 saqc/funcs/generic.py                        | 112 ++++++------
 saqc/funcs/tools.py                          |  10 +-
 tests/funcs/test_generic_api_functions.py    |  71 ++++----
 tests/funcs/test_generic_config_functions.py | 180 ++++++-------------
 10 files changed, 317 insertions(+), 245 deletions(-)

diff --git a/ressources/data/config_ci.csv b/ressources/data/config_ci.csv
index ecbe227e8..cb20243ea 100644
--- a/ressources/data/config_ci.csv
+++ b/ressources/data/config_ci.csv
@@ -1,8 +1,8 @@
 varname ; test
 #-------; -----------------------------------------------------
-SM2     ; resampling.shift(freq="15Min")
+'.*'    ; resampling.shift(freq="15Min")
 '.*'    ; outliers.flagRange(min=10, max=60)
 SM2     ; breaks.flagMissing()
 SM2     ; outliers.flagRange(min=10, max=60)
 SM2     ; outliers.flagMAD(window="30d", z=3.5)
-Dummy   ; generic.flag(func=(isflagged(SM1) | isflagged(SM2)))
+Dummy   ; generic.flag(field=["SM1", "SM2"], func=(y >= x))
diff --git a/saqc/core/core.py b/saqc/core/core.py
index 252dca351..23b53455b 100644
--- a/saqc/core/core.py
+++ b/saqc/core/core.py
@@ -5,7 +5,7 @@ from __future__ import annotations
 import inspect
 import warnings
 import copy as stdcopy
-from typing import Any, Callable, Tuple, Union, Optional
+from typing import Any, Callable, Sequence, Tuple, Union, Optional
 
 import pandas as pd
 import numpy as np
@@ -189,6 +189,7 @@ class SaQC(FuncModules):
 
         return data.to_df(), self._translator.backward(flags)
 
+
     def _wrap(self, func: Callable):
         """Enrich a function by special saqc-functionality.
 
@@ -210,11 +211,10 @@ class SaQC(FuncModules):
             target: str = None,
             regex: bool = False,
             flag: ExternalFlag = None,
+            inplace: bool = False,
             **kwargs,
         ) -> SaQC:
 
-            if regex and target is not None:
-                raise ValueError("explicit `target` not supported with `regex=True`")
 
             kwargs.setdefault("to_mask", self._translator.TO_MASK)
 
@@ -224,37 +224,162 @@ class SaQC(FuncModules):
 
             # expand regular expressions
             if regex:
-                fields = self._data.columns.str.match(field)
-                fields = self._data.columns[fields]
-                targets = fields
-            else:
-                fields, targets = toSequence(field), toSequence(target, default=field)
+                if field != target:
+                    raise ValueError("explicit `target` not supported with `regex=True`")
+                field = self._data.columns[self._data.columns.str.match(field)]
+                target = field
+
+            if target is None:
+                target = field
+
+            fields, targets = toSequence(field), toSequence(target)
+
+
+            if not func._multi:
+                if len(fields) == 1:
+                    # Write the result generated from a single field to multiple targets.
+                    # Could be optimized to call the function only once and write to
+                    # all targets
+                    fields = fields * len(targets)
+                if len(targets) == 1:
+                    # Write the results generated from multiple fields to a single target
+                    # sort of nonsense, as `target` is simply overwritten multiple times,
+                    # but principally not illegal.
+                    # Could be optimized to call the function only once on `fields[-1]`
+                    targets = targets * len(fields)
+
+            # else: # multivariate function
+            #     import pdb; pdb.set_trace()
+            #     fields = [fields, ]
+            #     if len(targets) == 1:
+            #         # write the result generated from a single set of fields to multiple targets
+            #         targets = [targets, ]
+
+            if not func._multi and len(fields) != len(targets):
+                import pdb; pdb.set_trace()
+                # TODO: a better error message
+                raise ValueError(
+                    "invalid combination of `field` and `target` parameters"
+                )
+
 
             out = self
 
+            # NOTE: initialize all target fields
             for field, target in zip(fields, targets):
                 if field != target:
+                    try:
+                        out = out._callFunction(
+                            FUNC_MAP["tools.copy"],
+                            data=out._data,
+                            flags=out._flags,
+                            field=field,
+                            target=target
+                        )
+                    except ValueError:
+                        pass
+
+            if not func._multi:
+                # NOTE: we call univariate functions iteratively
+                for target in targets:
                     out = out._callFunction(
-                        FUNC_MAP["tools.copy"],
+                        func,
                         data=out._data,
                         flags=out._flags,
-                        field=field,
-                        new_field=target,
+                        field=target,
+                        target=target,
+                        *args,
+                        **kwargs,
                     )
-                    field = target
-
+            else:
                 out = out._callFunction(
                     func,
                     data=out._data,
                     flags=out._flags,
-                    field=field,
+                    field=fields,
+                    target=targets,
                     *args,
                     **kwargs,
                 )
+
             return out
 
         return inner
 
+    # def _wrap(self, func: Callable):
+    #     """Enrich a function by special saqc-functionality.
+
+    #     For each saqc function this realize
+    #         - the source-target workflow,
+    #         - regex's in field,
+    #         - use default of translator for ``to_mask`` if not specified by user,
+    #         - translation of ``flag`` and
+    #         - working inplace.
+    #     Therefore it adds the following keywords to each saqc function:
+    #     ``target``, ``regex`` and ``inplace``.
+
+    #     The returned function returns a Saqc object.
+    #     """
+
+    #     def inner(
+    #         field: Union[str, Sequence[str]],
+    #         *args,
+    #         target: Union[str, Sequence[str]] = None,
+    #         regex: bool = False,
+    #         flag: ExternalFlag = None,
+    #         **kwargs,
+    #     ) -> SaQC:
+
+    #         if regex and target is not None:
+    #             raise ValueError("explicit `target` not supported with `regex=True`")
+
+    #         kwargs.setdefault("to_mask", self._translator.TO_MASK)
+
+    #         # translation
+    #         if flag is not None:
+    #             kwargs["flag"] = self._translator(flag)
+
+    #         # expand regular expressions
+    #         if regex:
+    #             fields = self._data.columns.str.match(field)
+    #             fields = self._data.columns[fields].to_list()
+    #             targets = fields
+    #         else:
+    #             fields, targets = toSequence(field), toSequence(target, default=field)
+
+    #         out = self
+
+    #         if func._multi:
+    #             if len(fields) > 1 and len(targets) == 1:
+    #                 targets = targets * len(fields)
+
+    #             # import pdb; pdb.set_trace()
+    #             # pass
+
+    #         for field, target in zip(fields, targets):
+    #             if field != target:
+    #                 out = out._callFunction(
+    #                     FUNC_MAP["tools.copy"],
+    #                     data=out._data,
+    #                     flags=out._flags,
+    #                     field=field,
+    #                     new_field=target,
+    #                 )
+    #                 field = target
+
+    #             out = out._callFunction(
+    #                 func,
+    #                 data=out._data,
+    #                 flags=out._flags,
+    #                 field=field,
+    #                 target=target,
+    #                 *args,
+    #                 **kwargs,
+    #             )
+    #         return out
+
+    #     return inner
+
     def _callFunction(
         self,
         function: Callable,
@@ -314,7 +439,7 @@ def _warnForUnusedKwargs(func, keywords, translator: Translator):
     sig_kws = inspect.signature(func).parameters
 
     # we need to ignore kws that are injected or by default hidden in ``**kwargs``
-    ignore = ("to_mask",)
+    ignore = {"to_mask", "target"}
 
     missing = []
     for kw in keywords:
diff --git a/saqc/core/flags.py b/saqc/core/flags.py
index 873cd62ac..d9a657db6 100644
--- a/saqc/core/flags.py
+++ b/saqc/core/flags.py
@@ -3,7 +3,8 @@ from __future__ import annotations
 
 import pandas as pd
 import dios
-from typing import Mapping, Union, Dict, DefaultDict, Optional, Type, Tuple, Iterable
+from typing import Mapping, Sequence, Union, Dict, DefaultDict, Optional, Type, Tuple, Iterable, overload
+from dios.dios.dios import DictOfSeries
 
 from saqc.constants import *
 from saqc.core.history import History
@@ -283,9 +284,18 @@ class Flags:
 
     # ----------------------------------------------------------------------
     # item access
-
+    @overload
     def __getitem__(self, key: str) -> pd.Series:
-        return self._data[key].max()
+        ...
+
+    @overload
+    def __getitem__(self, key: Sequence[str]) -> "Flags":
+        ...
+
+    def __getitem__(self, key: Union[str, Sequence[str]]) -> Union[pd.Series, "Flags"]:
+        if isinstance(key, str):
+            return self._data[key].max()
+        return Flags({k: self[k] for k in key})
 
     def __setitem__(self, key: SelectT, value: ValueT):
         # force-KW is only internally available
diff --git a/saqc/core/modules/tools.py b/saqc/core/modules/tools.py
index 7667261a7..fe340ce59 100644
--- a/saqc/core/modules/tools.py
+++ b/saqc/core/modules/tools.py
@@ -11,7 +11,7 @@ from saqc.core.modules.base import ModuleBase
 
 
 class Tools(ModuleBase):
-    def copy(self, field: str, new_field: str, **kwargs) -> saqc.SaQC:
+    def copy(self, field: str, target: str, **kwargs) -> saqc.SaQC:
         return self.defer("copy", locals())
 
     def drop(self, field: str, **kwargs) -> saqc.SaQC:
diff --git a/saqc/core/reader.py b/saqc/core/reader.py
index 219f5b4c4..7beb7435e 100644
--- a/saqc/core/reader.py
+++ b/saqc/core/reader.py
@@ -71,6 +71,6 @@ def fromConfig(fname, *args, **kwargs):
         tree = ast.parse(expr, mode="eval")
         func, kwargs = ConfigFunctionParser().parse(tree.body)
 
-        saqc = getattr(saqc, func)(field=field, regex=regex, **kwargs)
+        saqc = getattr(saqc, func)(target=field, regex=regex, **{"field": field, **kwargs})
 
     return saqc
diff --git a/saqc/core/register.py b/saqc/core/register.py
index 0a108c383..7a9939407 100644
--- a/saqc/core/register.py
+++ b/saqc/core/register.py
@@ -35,7 +35,7 @@ class CallState:
     mask: dios.DictOfSeries
 
 
-def processing(module: Optional[str] = None):
+def processing(module: str = None, multi: bool = False):
     # executed on module import
     def inner(func):
         func_name = func.__name__
@@ -47,13 +47,14 @@ def processing(module: Optional[str] = None):
             kwargs["to_mask"] = _getMaskingThresh(kwargs)
             return func(data, field, flags, *args, **kwargs)
 
+        callWrapper._multi = multi
         FUNC_MAP[func_name] = callWrapper
         return callWrapper
 
     return inner
 
 
-def flagging(masking: MaskingStrT = "all", module: Optional[str] = None):
+def flagging(masking: MaskingStrT = "all", module: str = None, multi: bool = False):
 
     # executed on module import
     if masking not in ("all", "field", "none"):
@@ -78,6 +79,7 @@ def flagging(masking: MaskingStrT = "all", module: Optional[str] = None):
         FUNC_MAP[func_name] = callWrapper
         callWrapper._module = module
         callWrapper._masking = masking
+        callWrapper._multi = multi
 
         return callWrapper
 
@@ -243,8 +245,8 @@ def _getMaskingThresh(kwargs):
 
 
 def _isflagged(
-    flagscol: Union[np.array, pd.Series], thresh: float
-) -> Union[np.array, pd.Series]:
+    flagscol: Union[np.ndarray, pd.Series], thresh: float
+) -> Union[np.ndarray, pd.Series]:
     """
     Return a mask of flags accordingly to `thresh`. Return type is same as flags.
     """
diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py
index ae5f63802..3b4428205 100644
--- a/saqc/funcs/generic.py
+++ b/saqc/funcs/generic.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 
 from functools import partial
-from inspect import signature
 from typing import Sequence, Tuple, Union, Callable
 
 import numpy as np
@@ -19,67 +18,72 @@ import operator as op
 
 from saqc.lib.tools import toSequence
 
-_OP = {"<": op.lt, "<=": op.le, "==": op.eq, "!=": op.ne, ">": op.gt, ">=": op.ge}
+# _OP = {"<": op.lt, "<=": op.le, "==": op.eq, "!=": op.ne, ">": op.gt, ">=": op.ge}
 
 
-def _dslIsFlagged(
-    flags: Flags, var: pd.Series, flag: float = None, comparator: str = None
-) -> Union[pd.Series, DictOfSeries]:
-    """
-    helper function for `flag`
+# def _dslIsFlagged(
+#     flags: Flags, var: pd.Series, flag: float = None, comparator: str = None
+# ) -> Union[pd.Series, DictOfSeries]:
+#     """
+#     helper function for `flag`
 
-    Param Combinations
-    ------------------
-    - ``isflagged('var')``              : show me (anything) flagged
-    - ``isflagged('var', DOUBT)``       : show me ``flags >= DOUBT``
-    - ``isflagged('var', DOUBT, '==')`` : show me ``flags == DOUBT``
+#     Param Combinations
+#     ------------------
+#     - ``isflagged('var')``              : show me (anything) flagged
+#     - ``isflagged('var', DOUBT)``       : show me ``flags >= DOUBT``
+#     - ``isflagged('var', DOUBT, '==')`` : show me ``flags == DOUBT``
 
-    Raises
-    ------
-    ValueError: if `comparator` is passed but no `flag` vaule. Eg. ``isflagged('var', comparator='>=')``
-    """
-    if flag is None:
-        if comparator is not None:
-            raise ValueError("if `comparator` is used, explicitly pass a `flag` level.")
-        flag = UNFLAGGED
-        comparator = ">"
+#     Raises
+#     ------
+#     ValueError: if `comparator` is passed but no `flag` vaule. Eg. ``isflagged('var', comparator='>=')``
+#     """
+#     if flag is None:
+#         if comparator is not None:
+#             raise ValueError("if `comparator` is used, explicitly pass a `flag` level.")
+#         flag = UNFLAGGED
+#         comparator = ">"
 
-    # default
-    if comparator is None:
-        comparator = ">="
+#     # default
+#     if comparator is None:
+#         comparator = ">="
 
-    _op = _OP[comparator]
-    return _op(flags[var.name], flag)
+#     _op = _OP[comparator]
+#     return _op(flags[var.name], flag)
 
 
 def _execGeneric(
     flags: Flags,
     data: DictOfSeries,
     func: Callable[[pd.Series], pd.Series],
-    fields: Sequence[str],
-) -> pd.Series:
+) -> Union[DictOfSeries, float]:
 
     # TODO:
     # - check series.index compatibility
 
     globs = {
-        "isflagged": partial(_dslIsFlagged, flags),
-        "ismissing": lambda var: pd.isnull(var),
-        "mask": lambda cond: data[cond.name].mask(cond),
         "GOOD": GOOD,
         "BAD": BAD,
         "UNFLAGGED": UNFLAGGED,
         **ENVIRONMENT,
     }
+
     func.__globals__.update(globs)
-    return func(*[data[f] for f in fields])
 
+    if isinstance(data, pd.Series):
+        data = data.to_frame()
+
+    out = func(*[data[c] for c in data.columns])
+    if isinstance(out, (np.ndarray, pd.Series)):
+        return DictOfSeries(out)
+    return out
 
-@processing(module="generic")
+
+@processing(module="generic", multi=True)
 def process(
     data: DictOfSeries,
-    field: str,
+    field: Union[str, Sequence[str]],
     flags: Flags,
+    target: Union[str, Sequence[str]],
     func: Callable[[pd.Series], pd.Series],
     to_mask: float = UNFLAGGED,
     **kwargs,
@@ -132,25 +136,27 @@ def process(
     >>> lambda temperature, uncertainty: np.round(temperature) * np.sqrt(uncertainty)
     """
 
+    fields, targets = toSequence(field), toSequence(target)
     data_masked, _ = _maskData(data.copy(), flags, data.columns, to_mask)
-    data[field] = _execGeneric(flags, data_masked, func, field).squeeze()
+    value = _execGeneric(flags[fields], data_masked[fields], func)
+    data.aloc[targets] = value
 
-    if field in flags:
-        flags.drop(field)
-
-    flags[field] = initFlagsLike(data[field])[field]
+    #NOTE: we generate new data, so we also need to drop existing flags
+    for t in targets:
+        if t in flags:
+            flags.drop(t)
+            flags[t] = initFlagsLike(data[t])[t]
 
     return data, flags
 
 
-@flagging(masking="all", module="generic")
+@flagging(masking="all", module="generic", multi=True)
 def flag(
     data: DictOfSeries,
     field: Union[str, Sequence[str]],
     flags: Flags,
-    target:  Union[str, Sequence[str]],
+    target: Union[str, Sequence[str]],
     func: Callable[[pd.Series], pd.Series],
-    sources: Sequence[str],
     flag: float = BAD,
     to_mask: float = UNFLAGGED,
     **kwargs,
@@ -252,19 +258,23 @@ def flag(
     >>> saqc.generic.flag(field=["x", "y", "z"], target=["a", "b"], lambda x, y, z: x < z, y > z )
     """
 
-    # we get the data unmasked, in order to also receive flags,
-    # so let's do to the masking manually
-    # data_masked, _ = _maskData(data, flags, data.columns, to_mask)
+    fields, targets = toSequence(field), toSequence(target)
+    value = _execGeneric(flags, data[fields].copy(), func)
+
+    if len(target) != len(value.columns):
+        raise ValueError(
+            f"The generic function returned {len(value.columns)} fields, but we only got {len(target)} targets."
+        )
 
-    mask = _execGeneric(flags, data, func, field).squeeze()
-    if np.isscalar(mask):
+    if np.isscalar(value):
         raise TypeError(f"generic expression does not return an array")
-    if not np.issubdtype(mask.dtype, np.bool_):
+    if not (value.dtypes == bool).all():
         raise TypeError(f"generic expression does not return a boolean array")
 
-    for f in toSequence(field):
-        mask &= ~_isflagged(flags[f], to_mask)
+    for f in fields:
+        value = value & _isflagged(flags[f], thresh=to_mask)
 
-    flags[mask, target] = flag
+    for i, t in enumerate(targets):
+        flags[value[i], t] = flag
 
     return data, flags
diff --git a/saqc/funcs/tools.py b/saqc/funcs/tools.py
index 67494c275..54ed0cf77 100644
--- a/saqc/funcs/tools.py
+++ b/saqc/funcs/tools.py
@@ -22,7 +22,7 @@ _MPL_DEFAULT_BACKEND = mpl.get_backend()
 
 @processing(module="tools")
 def copy(
-    data: DictOfSeries, field: str, flags: Flags, new_field: str, **kwargs
+    data: DictOfSeries, field: str, flags: Flags, target: str, **kwargs
 ) -> Tuple[DictOfSeries, Flags]:
     """
     The function generates a copy of the data "field" and inserts it under the name field + suffix into the existing
@@ -36,7 +36,7 @@ def copy(
         The fieldname of the data column, you want to fork (copy).
     flags : saqc.Flags
         Container to store quality flags to data.
-    new_field: str
+    target: str
         Target name.
 
     Returns
@@ -48,12 +48,12 @@ def copy(
         The quality flags of data
         Flags shape may have changed relatively to the flags input.
     """
-    if new_field in flags.columns.union(data.columns):
+    if target in flags.columns.union(data.columns):
         raise ValueError(f"{field}: field already exist")
 
-    data[new_field] = data[field].copy()
+    data[target] = data[field].copy()
     # implicit copy in history access
-    flags.history[new_field] = flags.history[field]
+    flags.history[target] = flags.history[field]
     return data, flags
 
 
diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py
index f5c10b90c..8ed97449f 100644
--- a/tests/funcs/test_generic_api_functions.py
+++ b/tests/funcs/test_generic_api_functions.py
@@ -1,13 +1,16 @@
 #! /usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from dataclasses import field, fields
 import pytest
 import pandas as pd
+from dios.dios.dios import DictOfSeries
 
 from saqc.constants import *
 from saqc.core.register import flagging
 from saqc.funcs.tools import mask
 from saqc import SaQC
+from saqc.lib.tools import toSequence
 
 from tests.common import initData, flagAll
 
@@ -23,41 +26,41 @@ def data():
 def test_addFieldFlagGeneric(data):
     saqc = SaQC(data=data)
 
-    func = lambda var1: pd.Series(False, index=data[var1.name].index)
-    data, flags = saqc.generic.flag("tmp1", func, flag=BAD).getResult()
-    assert "tmp1" in flags.columns and "tmp1" not in data
+    func = lambda x: pd.Series(False, index=x.index)
+    data, flags = saqc.generic.flag(field="var1", target="tmp1", func=func, flag=BAD).getResult()
+    assert "tmp1" in flags.columns and "tmp1" in data
 
 
 def test_addFieldProcGeneric(data):
     saqc = SaQC(data=data)
-
-    func = lambda: pd.Series([])
-    data, flags = saqc.generic.process("tmp1", func, flag=BAD).getResult(raw=True)
-    assert "tmp1" in data.columns and data["tmp1"].empty
-
-    func = lambda var1, var2: var1 + var2
-    data, flags = saqc.generic.process("tmp2", func, flag=BAD).getResult()
-    assert "tmp2" in data.columns and (data["tmp2"] == data["var1"] + data["var2"]).all(
-        axis=None
-    )
-
-
-def test_mask(data):
-    saqc = SaQC(data=data)
-    data_org = data.copy(deep=True)
-    mean = data["var1"] / 2
-
-    data, _ = saqc.generic.process(
-        "var1", lambda var1: mask(var1 < mean), flag=BAD
-    ).getResult()
-    assert (
-        (data["var1"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna()
-    ).all(axis=None)
-
-    data, flags = saqc.generic.process(
-        "tmp", lambda var1: mask(var1 < mean), flag=BAD
-    ).getResult()
-    assert ("tmp" in data.columns) and ("tmp" in flags.columns)
-    assert (
-        (data["tmp"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna()
-    ).all(axis=None)
+    fields = ["var1", "var2"]
+    params = [
+        ("tmp", lambda x, y: x + y),
+        # (["tmp1", "tmp2"], lambda x, y: (x + y, y*2))
+    ]
+    for target, func in params:
+        expected = DictOfSeries(func(*[data[f] for f in fields]), columns=toSequence(target))
+        data, _ = saqc.generic.process(field=fields, target=target, func=func, flag=BAD).getResult(raw=True)
+        # import pdb; pdb.set_trace()
+        # assert (data[target] == expected).all(axis=None)
+
+
+# def test_mask(data):
+#     saqc = SaQC(data=data)
+#     data_org = data.copy(deep=True)
+#     mean = data["var1"] / 2
+
+#     data, _ = saqc.generic.process(
+#         "var1", lambda x: mask(x < mean), flag=BAD
+#     ).getResult()
+#     assert (
+#         (data["var1"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna()
+#     ).all(axis=None)
+
+#     data, flags = saqc.generic.process(
+#         field="var1", target="tmp", func=lambda x: mask(x < mean), flag=BAD
+#     ).getResult()
+#     assert ("tmp" in data.columns) and ("tmp" in flags.columns)
+#     assert (
+#         (data["tmp"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna()
+#     ).all(axis=None)
diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py
index 1298e7317..96a0c8fbc 100644
--- a/tests/funcs/test_generic_config_functions.py
+++ b/tests/funcs/test_generic_config_functions.py
@@ -14,6 +14,7 @@ from saqc.core.visitor import ConfigFunctionParser
 from saqc.core.register import flagging
 from saqc.funcs.generic import _execGeneric
 from saqc import SaQC
+from saqc.lib.tools import toSequence
 
 from tests.common import initData, writeIO
 
@@ -38,30 +39,13 @@ def data_diff():
     )
 
 
-def _compileGeneric(expr, flags):
+def _compileGeneric(expr):
     tree = ast.parse(expr, mode="eval")
     _, kwargs = ConfigFunctionParser().parse(tree.body)
     return kwargs["func"]
 
 
-def test_missingIdentifier(data):
-    flags = Flags()
-
-    # NOTE:
-    # - the error is only raised at runtime during parsing would be better
-    tests = [
-        "fff(var2) < 5",
-        "var3 != 42",
-    ]
-
-    for test in tests:
-        func = _compileGeneric(f"generic.flag(func={test})", flags)
-        with pytest.raises(NameError):
-            _execGeneric(flags, data, func, field="")
-
-
 def test_syntaxError():
-    flags = Flags()
     tests = [
         "range(x=5",
         "rangex=5)",
@@ -70,42 +54,39 @@ def test_syntaxError():
 
     for test in tests:
         with pytest.raises(SyntaxError):
-            _compileGeneric(f"flag(func={test})", flags)
+            _compileGeneric(f"flag(func={test})")
 
 
 def test_typeError():
     """
     test that forbidden constructs actually throw an error
-    TODO: find a few more cases or get rid of the test
     """
-    flags = Flags()
 
-    # : think about cases that should be forbidden
+    # TODO: think about cases that should be forbidden
     tests = ("lambda x: x * 2",)
 
     for test in tests:
         with pytest.raises(TypeError):
-            _compileGeneric(f"generic.flag(func={test})", flags)
+            _compileGeneric(f"generic.flag(func={test})")
 
 
 def test_comparisonOperators(data):
     flags = initFlagsLike(data)
     var1, var2, *_ = data.columns
-    this = var1
 
     tests = [
-        ("this > 100", data[this] > 100),
-        (f"10 >= {var2}", 10 >= data[var2]),
-        (f"{var2} < 100", data[var2] < 100),
-        (f"this <= {var2}", data[this] <= data[var2]),
-        (f"{var1} == {var2}", data[this] == data[var2]),
-        (f"{var1} != {var2}", data[this] != data[var2]),
+        ("var1", "x > 100", data[var1] > 100),
+        ("var2", "10 >= y", 10 >= data[var2]),
+        ("var2", f"y < 100", data[var2] < 100),
+        (["var1", "var2"], "x <= y", data[var1] <= data[var2]),
+        (["var1", "var2"], "x == y", data[var1] == data[var2]),
+        (["var1", "var2"], "x != y", data[var1] != data[var2]),
     ]
 
-    for test, expected in tests:
-        func = _compileGeneric(f"generic.flag(func={test})", flags)
-        result = _execGeneric(flags, data, func, field=var1)
-        assert np.all(result == expected)
+    for fields, test, expected in tests:
+        func = _compileGeneric(f"generic.flag(func={test})")
+        result = _execGeneric(flags[toSequence(fields)], data[fields], func)
+        assert (result == expected).all(axis=None)
 
 
 def test_arithmeticOperators(data):
@@ -123,149 +104,89 @@ def test_arithmeticOperators(data):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"generic.process(func={test})", flags)
-        result = _execGeneric(flags, data, func, field=var1)
-        assert np.all(result == expected)
+        func = _compileGeneric(f"generic.process(func={test})")
+        result = _execGeneric(flags[[this.name]], this, func)
+        assert (result == expected).all(axis=None)
+
 
 
 def test_nonReduncingBuiltins(data):
     flags = initFlagsLike(data)
     var1, *_ = data.columns
-    this = var1
-    mean = data[var1].mean()
+    this = data[var1]
 
     tests = [
-        (f"abs({this})", np.abs(data[this])),
-        (f"log({this})", np.log(data[this])),
-        (f"exp({this})", np.exp(data[this])),
-        (
-            f"ismissing(mask({this} < {mean}))",
-            data[this].mask(data[this] < mean).isna(),
-        ),
+        ("abs(x)", np.abs(this)),
+        ("log(x)", np.log(this)),
+        ("exp(x)", np.exp(this)),
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"generic.process(func={test})", flags)
-        result = _execGeneric(flags, data, func, field=this)
-        assert (result == expected).all()
+        func = _compileGeneric(f"generic.process(func={test})")
+        result = _execGeneric(flags[[this.name]], this, func)
+        assert (result == expected).all(axis=None)
 
 
 def test_reduncingBuiltins(data):
     data.loc[::4] = np.nan
     flags = initFlagsLike(data)
     var1 = data.columns[0]
-    this = data.iloc[:, 0]
+    this = data[var1]
 
     tests = [
-        ("min(this)", np.nanmin(this)),
-        (f"max({var1})", np.nanmax(this)),
-        (f"sum({var1})", np.nansum(this)),
-        ("mean(this)", np.nanmean(this)),
-        (f"std({this.name})", np.std(this)),
-        (f"len({this.name})", len(this)),
+        ("min(x)", np.nanmin(this)),
+        ("max(x)", np.nanmax(this)),
+        ("sum(x)", np.nansum(this)),
+        ("mean(x)", np.nanmean(this)),
+        ("std(x)", np.std(this)),
+        ("len(x)", len(this)),
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"generic.process(func={test})", flags)
-        result = _execGeneric(flags, data, func, field=this.name)
+        func = _compileGeneric(f"generic.process(func={test})")
+        result = _execGeneric(flags[[this.name]], this, func)
         assert result == expected
 
 
-def test_ismissing(data):
-
-    flags = initFlagsLike(data)
-    data.iloc[: len(data) // 2, 0] = np.nan
-    data.iloc[(len(data) // 2) + 1 :, 0] = -9999
-    this = data.iloc[:, 0]
-
-    tests = [
-        (f"ismissing({this.name})", pd.isnull(this)),
-        (f"~ismissing({this.name})", pd.notnull(this)),
-    ]
-
-    for test, expected in tests:
-        func = _compileGeneric(f"generic.flag(func={test})", flags)
-        result = _execGeneric(flags, data, func, this.name)
-        assert np.all(result == expected)
-
-
 def test_bitOps(data):
     var1, var2, *_ = data.columns
-    this = var1
-
-    flags = initFlagsLike(data)
-
-    tests = [
-        ("~(this > mean(this))", ~(data[this] > np.nanmean(data[this]))),
-        (f"(this <= 0) | (0 < {var1})", (data[this] <= 0) | (0 < data[var1])),
-        (f"({var2} >= 0) & (0 > this)", (data[var2] >= 0) & (0 > data[this])),
-    ]
 
-    for test, expected in tests:
-        func = _compileGeneric(f"generic.flag(func={test})", flags)
-        result = _execGeneric(flags, data, func, this)
-        assert np.all(result == expected)
-
-
-def test_isflagged(data):
-
-    var1, var2, *_ = data.columns
     flags = initFlagsLike(data)
-    flags[data[var1].index[::2], var1] = BAD
 
     tests = [
-        (f"isflagged({var1})", flags[var1] > UNFLAGGED),
-        (f"isflagged({var1}, flag=BAD)", flags[var1] >= BAD),
-        (f"isflagged({var1}, UNFLAGGED, '==')", flags[var1] == UNFLAGGED),
-        (f"~isflagged({var2})", flags[var2] == UNFLAGGED),
-        (
-            f"~({var2}>999) & (~isflagged({var2}))",
-            ~(data[var2] > 999) & (flags[var2] == UNFLAGGED),
-        ),
+        (var1, "~(x > mean(x))", ~(data[var1] > np.nanmean(data[var1]))),
+        (var1, "(x <= 0) | (0 < x)", (data[var1] <= 0) | (0 < data[var1])),
+        ([var1, var2], "(y>= 0) & (0 > x)", (data[var2] >= 0) & (0 > data[var1])),
     ]
 
-    for i, (test, expected) in enumerate(tests):
-        try:
-            func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)", flags)
-            result = _execGeneric(flags, data, func, field=None)
-            assert np.all(result == expected)
-        except Exception:
-            print(i, test)
-            raise
-
-    # test bad combination
-    for comp in [">", ">=", "==", "!=", "<", "<="]:
-        fails = f"isflagged({var1}, comparator='{comp}')"
-
-        func = _compileGeneric(f"generic.flag(func={fails}, flag=BAD)", flags)
-        with pytest.raises(ValueError):
-            _execGeneric(flags, data, func, field=None)
+    for field, test, expected in tests:
+        func = _compileGeneric(f"generic.flag(func={test})")
+        result = _execGeneric(flags[toSequence(field)], data[field], func)
+        assert (result == expected).all(axis=None)
 
 
 def test_variableAssignments(data):
-    var1, var2, *_ = data.columns
 
     config = f"""
     varname ; test
-    dummy1  ; generic.process(func=var1 + var2)
-    dummy2  ; generic.flag(func=var1 + var2 > 0)
+    dummy1  ; generic.process(field=["var1", "var2"], func=x + y)
+    dummy2  ; generic.flag(field=["var1", "var2"], func=x + y > 0)
     """
 
     fobj = writeIO(config)
     saqc = fromConfig(fobj, data)
     result_data, result_flags = saqc.getResult(raw=True)
 
-    assert set(result_data.columns) == set(data.columns) | {
-        "dummy1",
-    }
-    assert set(result_flags.columns) == set(data.columns) | {"dummy1", "dummy2"}
+    expected_columns = set(data.columns) | {"dummy1", "dummy2"}
+    assert set(result_data.columns) == expected_columns
+    assert set(result_flags.columns) == expected_columns
 
 
 def test_processMultiple(data_diff):
     config = f"""
     varname ; test
-    dummy   ; generic.process(func=var1 + 1)
-    dummy   ; generic.process(func=var2 - 1)
+    dummy   ; generic.process(field="var1", func=x + 1)
+    dummy   ; generic.process(field="var2", func=y - 1)
     """
 
     fobj = writeIO(config)
@@ -280,7 +201,8 @@ def test_callableArgumentsUnary(data):
 
     @flagging(masking="field")
     def testFuncUnary(data, field, flags, func, **kwargs):
-        data[field] = data[field].rolling(window=window).apply(func)
+        value = data[field].rolling(window=window).apply(func)
+        data[field] = value
         return data, initFlagsLike(data)
 
     var = data.columns[0]
-- 
GitLab


From d82325c0f46d766cb5fcd0208e1a029e9b163c00 Mon Sep 17 00:00:00 2001
From: David Schaefer <david.schaefer@ufz.de>
Date: Tue, 19 Oct 2021 18:06:02 +0200
Subject: [PATCH 3/5] remove `flags` parameter from _execGeneric

---
 saqc/core/flags.py                           | 12 +----------
 saqc/funcs/generic.py                        |  5 ++---
 tests/funcs/test_generic_api_functions.py    | 22 --------------------
 tests/funcs/test_generic_config_functions.py | 14 +++++--------
 4 files changed, 8 insertions(+), 45 deletions(-)

diff --git a/saqc/core/flags.py b/saqc/core/flags.py
index d9a657db6..2f53bce79 100644
--- a/saqc/core/flags.py
+++ b/saqc/core/flags.py
@@ -284,18 +284,8 @@ class Flags:
 
     # ----------------------------------------------------------------------
     # item access
-    @overload
     def __getitem__(self, key: str) -> pd.Series:
-        ...
-
-    @overload
-    def __getitem__(self, key: Sequence[str]) -> "Flags":
-        ...
-
-    def __getitem__(self, key: Union[str, Sequence[str]]) -> Union[pd.Series, "Flags"]:
-        if isinstance(key, str):
-            return self._data[key].max()
-        return Flags({k: self[k] for k in key})
+        return self._data[key].max()
 
     def __setitem__(self, key: SelectT, value: ValueT):
         # force-KW is only internally available
diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py
index 3b4428205..57c90c3b0 100644
--- a/saqc/funcs/generic.py
+++ b/saqc/funcs/generic.py
@@ -52,7 +52,6 @@ from saqc.lib.tools import toSequence
 
 
 def _execGeneric(
-    flags: Flags,
     data: DictOfSeries,
     func: Callable[[pd.Series], pd.Series],
 ) -> Union[DictOfSeries, float]:
@@ -138,7 +137,7 @@ def process(
 
     fields, targets = toSequence(field), toSequence(target)
     data_masked, _ = _maskData(data.copy(), flags, data.columns, to_mask)
-    value = _execGeneric(flags[fields], data_masked[fields], func)
+    value = _execGeneric(data_masked[fields], func)
     data.aloc[targets] = value
 
     #NOTE: we generate new data, so we also need to drop existing flags
@@ -259,7 +258,7 @@ def flag(
     """
 
     fields, targets = toSequence(field), toSequence(target)
-    value = _execGeneric(flags, data[fields].copy(), func)
+    value = _execGeneric(data[fields].copy(), func)
 
     if len(target) != len(value.columns):
         raise ValueError(
diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py
index 8ed97449f..76f9dde99 100644
--- a/tests/funcs/test_generic_api_functions.py
+++ b/tests/funcs/test_generic_api_functions.py
@@ -1,14 +1,12 @@
 #! /usr/bin/env python
 # -*- coding: utf-8 -*-
 
-from dataclasses import field, fields
 import pytest
 import pandas as pd
 from dios.dios.dios import DictOfSeries
 
 from saqc.constants import *
 from saqc.core.register import flagging
-from saqc.funcs.tools import mask
 from saqc import SaQC
 from saqc.lib.tools import toSequence
 
@@ -44,23 +42,3 @@ def test_addFieldProcGeneric(data):
         # import pdb; pdb.set_trace()
         # assert (data[target] == expected).all(axis=None)
 
-
-# def test_mask(data):
-#     saqc = SaQC(data=data)
-#     data_org = data.copy(deep=True)
-#     mean = data["var1"] / 2
-
-#     data, _ = saqc.generic.process(
-#         "var1", lambda x: mask(x < mean), flag=BAD
-#     ).getResult()
-#     assert (
-#         (data["var1"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna()
-#     ).all(axis=None)
-
-#     data, flags = saqc.generic.process(
-#         field="var1", target="tmp", func=lambda x: mask(x < mean), flag=BAD
-#     ).getResult()
-#     assert ("tmp" in data.columns) and ("tmp" in flags.columns)
-#     assert (
-#         (data["tmp"].isna()) == (data_org["var1"] < 10) & data_org["var1"].isna()
-#     ).all(axis=None)
diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py
index 96a0c8fbc..5dcfdbb65 100644
--- a/tests/funcs/test_generic_config_functions.py
+++ b/tests/funcs/test_generic_config_functions.py
@@ -71,7 +71,6 @@ def test_typeError():
 
 
 def test_comparisonOperators(data):
-    flags = initFlagsLike(data)
     var1, var2, *_ = data.columns
 
     tests = [
@@ -85,12 +84,11 @@ def test_comparisonOperators(data):
 
     for fields, test, expected in tests:
         func = _compileGeneric(f"generic.flag(func={test})")
-        result = _execGeneric(flags[toSequence(fields)], data[fields], func)
+        result = _execGeneric(data[fields], func)
         assert (result == expected).all(axis=None)
 
 
 def test_arithmeticOperators(data):
-    flags = initFlagsLike(data)
     var1, *_ = data.columns
     this = data[var1]
 
@@ -105,13 +103,12 @@ def test_arithmeticOperators(data):
 
     for test, expected in tests:
         func = _compileGeneric(f"generic.process(func={test})")
-        result = _execGeneric(flags[[this.name]], this, func)
+        result = _execGeneric(this, func)
         assert (result == expected).all(axis=None)
 
 
 
 def test_nonReduncingBuiltins(data):
-    flags = initFlagsLike(data)
     var1, *_ = data.columns
     this = data[var1]
 
@@ -123,13 +120,12 @@ def test_nonReduncingBuiltins(data):
 
     for test, expected in tests:
         func = _compileGeneric(f"generic.process(func={test})")
-        result = _execGeneric(flags[[this.name]], this, func)
+        result = _execGeneric(this, func)
         assert (result == expected).all(axis=None)
 
 
 def test_reduncingBuiltins(data):
     data.loc[::4] = np.nan
-    flags = initFlagsLike(data)
     var1 = data.columns[0]
     this = data[var1]
 
@@ -144,7 +140,7 @@ def test_reduncingBuiltins(data):
 
     for test, expected in tests:
         func = _compileGeneric(f"generic.process(func={test})")
-        result = _execGeneric(flags[[this.name]], this, func)
+        result = _execGeneric(this, func)
         assert result == expected
 
 
@@ -161,7 +157,7 @@ def test_bitOps(data):
 
     for field, test, expected in tests:
         func = _compileGeneric(f"generic.flag(func={test})")
-        result = _execGeneric(flags[toSequence(field)], data[field], func)
+        result = _execGeneric(data[field], func)
         assert (result == expected).all(axis=None)
 
 
-- 
GitLab


From 6c1cd2607ea543f5a0b841b07dcfb04a421a1a64 Mon Sep 17 00:00:00 2001
From: David Schaefer <david.schaefer@ufz.de>
Date: Tue, 19 Oct 2021 18:24:48 +0200
Subject: [PATCH 4/5] fix process

---
 saqc/funcs/generic.py                     | 2 +-
 tests/funcs/test_generic_api_functions.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py
index 57c90c3b0..440085e43 100644
--- a/saqc/funcs/generic.py
+++ b/saqc/funcs/generic.py
@@ -138,7 +138,7 @@ def process(
     fields, targets = toSequence(field), toSequence(target)
     data_masked, _ = _maskData(data.copy(), flags, data.columns, to_mask)
     value = _execGeneric(data_masked[fields], func)
-    data.aloc[targets] = value
+    data[targets] = value
 
     #NOTE: we generate new data, so we also need to drop existing flags
     for t in targets:
diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py
index 76f9dde99..3d79bcc74 100644
--- a/tests/funcs/test_generic_api_functions.py
+++ b/tests/funcs/test_generic_api_functions.py
@@ -34,11 +34,10 @@ def test_addFieldProcGeneric(data):
     fields = ["var1", "var2"]
     params = [
         ("tmp", lambda x, y: x + y),
-        # (["tmp1", "tmp2"], lambda x, y: (x + y, y*2))
+        (["tmp1", "tmp2"], lambda x, y: (x + y, y*2))
     ]
     for target, func in params:
         expected = DictOfSeries(func(*[data[f] for f in fields]), columns=toSequence(target))
         data, _ = saqc.generic.process(field=fields, target=target, func=func, flag=BAD).getResult(raw=True)
-        # import pdb; pdb.set_trace()
-        # assert (data[target] == expected).all(axis=None)
+        assert (expected == data[target]).all(axis=None)
 
-- 
GitLab


From 9ab90fc0dee90804622392b64be4ac4538c44140 Mon Sep 17 00:00:00 2001
From: David Schaefer <david.schaefer@ufz.de>
Date: Wed, 20 Oct 2021 16:55:08 +0200
Subject: [PATCH 5/5] bring back `ismissing`

---
 saqc/core/flags.py                           | 12 ++-
 saqc/funcs/generic.py                        | 49 +++--------
 tests/funcs/test_generic_config_functions.py | 92 +++++++++++---------
 3 files changed, 70 insertions(+), 83 deletions(-)

diff --git a/saqc/core/flags.py b/saqc/core/flags.py
index 2f53bce79..d9a657db6 100644
--- a/saqc/core/flags.py
+++ b/saqc/core/flags.py
@@ -284,8 +284,18 @@ class Flags:
 
     # ----------------------------------------------------------------------
     # item access
+    @overload
     def __getitem__(self, key: str) -> pd.Series:
-        return self._data[key].max()
+        ...
+
+    @overload
+    def __getitem__(self, key: Sequence[str]) -> "Flags":
+        ...
+
+    def __getitem__(self, key: Union[str, Sequence[str]]) -> Union[pd.Series, "Flags"]:
+        if isinstance(key, str):
+            return self._data[key].max()
+        return Flags({k: self[k] for k in key})
 
     def __setitem__(self, key: SelectT, value: ValueT):
         # force-KW is only internally available
diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py
index 440085e43..89c0f56b6 100644
--- a/saqc/funcs/generic.py
+++ b/saqc/funcs/generic.py
@@ -18,48 +18,18 @@ import operator as op
 
 from saqc.lib.tools import toSequence
 
-# _OP = {"<": op.lt, "<=": op.le, "==": op.eq, "!=": op.ne, ">": op.gt, ">=": op.ge}
-
-
-# def _dslIsFlagged(
-#     flags: Flags, var: pd.Series, flag: float = None, comparator: str = None
-# ) -> Union[pd.Series, DictOfSeries]:
-#     """
-#     helper function for `flag`
-
-#     Param Combinations
-#     ------------------
-#     - ``isflagged('var')``              : show me (anything) flagged
-#     - ``isflagged('var', DOUBT)``       : show me ``flags >= DOUBT``
-#     - ``isflagged('var', DOUBT, '==')`` : show me ``flags == DOUBT``
-
-#     Raises
-#     ------
-#     ValueError: if `comparator` is passed but no `flag` vaule. Eg. ``isflagged('var', comparator='>=')``
-#     """
-#     if flag is None:
-#         if comparator is not None:
-#             raise ValueError("if `comparator` is used, explicitly pass a `flag` level.")
-#         flag = UNFLAGGED
-#         comparator = ">"
-
-#     # default
-#     if comparator is None:
-#         comparator = ">="
-
-#     _op = _OP[comparator]
-#     return _op(flags[var.name], flag)
-
 
 def _execGeneric(
+    flags: Flags,
     data: DictOfSeries,
     func: Callable[[pd.Series], pd.Series],
-) -> Union[DictOfSeries, float]:
+) -> DictOfSeries:
 
     # TODO:
     # - check series.index compatibility
 
     globs = {
+        "isflagged": lambda data: _isflagged(flags[data.name], thresh=UNFLAGGED),
         "GOOD": GOOD,
         "BAD": BAD,
         "UNFLAGGED": UNFLAGGED,
@@ -72,9 +42,7 @@ def _execGeneric(
         data = data.to_frame()
 
     out = func(*[data[c] for c in data.columns])
-    if isinstance(out, (np.ndarray, pd.Series)):
-        return DictOfSeries(out)
-    return out
+    return DictOfSeries(out)
 
 
 @processing(module="generic", multi=True)
@@ -137,10 +105,11 @@ def process(
 
     fields, targets = toSequence(field), toSequence(target)
     data_masked, _ = _maskData(data.copy(), flags, data.columns, to_mask)
-    value = _execGeneric(data_masked[fields], func)
+    value = _execGeneric(flags[fields], data_masked[fields], func)
     data[targets] = value
 
-    #NOTE: we generate new data, so we also need to drop existing flags
+    # NOTE: we generate new data, so we also need to drop existing flags
+    # TODO: transfer the flags from the input to the output fields
     for t in targets:
         if t in flags:
             flags.drop(t)
@@ -258,7 +227,7 @@ def flag(
     """
 
     fields, targets = toSequence(field), toSequence(target)
-    value = _execGeneric(data[fields].copy(), func)
+    value = _execGeneric(flags[fields], data[fields].copy(), func)
 
     if len(target) != len(value.columns):
         raise ValueError(
@@ -270,9 +239,11 @@ def flag(
     if not (value.dtypes == bool).all():
         raise TypeError(f"generic expression does not return a boolean array")
 
+    # transfer the flags from the input fields
     for f in fields:
         value = value & _isflagged(flags[f], thresh=to_mask)
 
+    # set the newly generated flags
     for i, t in enumerate(targets):
         flags[value[i], t] = flag
 
diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py
index 5dcfdbb65..6d51c43ca 100644
--- a/tests/funcs/test_generic_config_functions.py
+++ b/tests/funcs/test_generic_config_functions.py
@@ -72,92 +72,75 @@ def test_typeError():
 
 def test_comparisonOperators(data):
     var1, var2, *_ = data.columns
+    flags = initFlagsLike(data)
 
     tests = [
-        ("var1", "x > 100", data[var1] > 100),
-        ("var2", "10 >= y", 10 >= data[var2]),
-        ("var2", f"y < 100", data[var2] < 100),
+        (["var1"], "x > 100", data[var1] > 100),
+        (["var2"], "10 >= y", 10 >= data[var2]),
+        (["var2"], f"y < 100", data[var2] < 100),
         (["var1", "var2"], "x <= y", data[var1] <= data[var2]),
         (["var1", "var2"], "x == y", data[var1] == data[var2]),
         (["var1", "var2"], "x != y", data[var1] != data[var2]),
     ]
 
-    for fields, test, expected in tests:
+    for field, test, expected in tests:
         func = _compileGeneric(f"generic.flag(func={test})")
-        result = _execGeneric(data[fields], func)
+        result = _execGeneric(flags[field], data[field], func)
         assert (result == expected).all(axis=None)
 
 
 def test_arithmeticOperators(data):
+
     var1, *_ = data.columns
-    this = data[var1]
+
+    data = data[var1]
+    flags = initFlagsLike(data)[[var1]]
 
     tests = [
-        ("var1 + 100 > 110", this + 100 > 110),
-        ("var1 - 100 > 0", this - 100 > 0),
-        ("var1 * 100 > 200", this * 100 > 200),
-        ("var1 / 100 > .1", this / 100 > 0.1),
-        ("var1 % 2 == 1", this % 2 == 1),
-        ("var1 ** 2 == 0", this ** 2 == 0),
+        ("var1 + 100 > 110", data + 100 > 110),
+        ("var1 - 100 > 0", data - 100 > 0),
+        ("var1 * 100 > 200", data * 100 > 200),
+        ("var1 / 100 > .1", data / 100 > 0.1),
+        ("var1 % 2 == 1", data % 2 == 1),
+        ("var1 ** 2 == 0", data ** 2 == 0),
     ]
 
     for test, expected in tests:
         func = _compileGeneric(f"generic.process(func={test})")
-        result = _execGeneric(this, func)
+        result = _execGeneric(flags, data, func)
         assert (result == expected).all(axis=None)
 
 
-
 def test_nonReduncingBuiltins(data):
     var1, *_ = data.columns
-    this = data[var1]
+    data = data[var1]
+    flags = initFlagsLike(data)[[var1]]
 
     tests = [
-        ("abs(x)", np.abs(this)),
-        ("log(x)", np.log(this)),
-        ("exp(x)", np.exp(this)),
+        ("abs(x)", np.abs(data)),
+        ("log(x)", np.log(data)),
+        ("exp(x)", np.exp(data)),
     ]
 
     for test, expected in tests:
         func = _compileGeneric(f"generic.process(func={test})")
-        result = _execGeneric(this, func)
+        result = _execGeneric(flags, data, func)
         assert (result == expected).all(axis=None)
 
 
-def test_reduncingBuiltins(data):
-    data.loc[::4] = np.nan
-    var1 = data.columns[0]
-    this = data[var1]
-
-    tests = [
-        ("min(x)", np.nanmin(this)),
-        ("max(x)", np.nanmax(this)),
-        ("sum(x)", np.nansum(this)),
-        ("mean(x)", np.nanmean(this)),
-        ("std(x)", np.std(this)),
-        ("len(x)", len(this)),
-    ]
-
-    for test, expected in tests:
-        func = _compileGeneric(f"generic.process(func={test})")
-        result = _execGeneric(this, func)
-        assert result == expected
-
-
 def test_bitOps(data):
     var1, var2, *_ = data.columns
-
     flags = initFlagsLike(data)
 
     tests = [
-        (var1, "~(x > mean(x))", ~(data[var1] > np.nanmean(data[var1]))),
-        (var1, "(x <= 0) | (0 < x)", (data[var1] <= 0) | (0 < data[var1])),
+        ([var1], "~(x > mean(x))", ~(data[var1] > np.nanmean(data[var1]))),
+        ([var1], "(x <= 0) | (0 < x)", (data[var1] <= 0) | (0 < data[var1])),
         ([var1, var2], "(y>= 0) & (0 > x)", (data[var2] >= 0) & (0 > data[var1])),
     ]
 
     for field, test, expected in tests:
         func = _compileGeneric(f"generic.flag(func={test})")
-        result = _execGeneric(data[field], func)
+        result = _execGeneric(flags[field], data[field], func)
         assert (result == expected).all(axis=None)
 
 
@@ -247,3 +230,26 @@ def test_callableArgumentsBinary(data):
         expected = func(data[var1], data[var2])
         assert (result_config[var1].dropna() == expected.dropna()).all(axis=None)
         assert (result_api[var1].dropna() == expected.dropna()).all(axis=None)
+
+
+def test_isflagged(data):
+
+    var1, var2, *_ = data.columns
+    flags = initFlagsLike(data)
+    flags[data[var1].index[::2], var1] = BAD
+
+    tests = [
+        ([var1], f"isflagged(x)", flags[var1] > UNFLAGGED),
+        ([var1], f"isflagged(x)", flags[var1] >= BAD),
+        ([var2], f"~isflagged(x)", flags[var2] == UNFLAGGED),
+        (
+            [var1, var2],
+            f"~(x > 999) & (~isflagged(y))",
+            ~(data[var1] > 999) & (flags[var2] == UNFLAGGED),
+        ),
+    ]
+
+    for field, test, expected in tests:
+        func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)")
+        result = _execGeneric(flags[field], data[field], func)
+        assert (result == expected).all(axis=None)
-- 
GitLab