Compare revisions

WKDV Bot · David Schäfer · David Schäfer · WKDV Bot · David Schäfer · David Schäfer
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,8 +15,10 @@ SPDX-License-Identifier: GPL-3.0-or-later
 - `SaQC`: support for selection, slicing and setting of items by use of subscription on SaQC objects (e.g. `qc[key]` and `qc[key] = value`).
   Selection works with single keys, collections of keys and string slices (e.g. `qc["a":"f"]`).  Values can be SaQC objects, pd.Series, 
   Iterable of Series and dict-like with series values.
+- `transferFlags`: is now a multivariate function, i.e. it accepts multiple `field` and `target` values
 - `plot`: added `yscope` keyword
 - `setFlags`: function to replace `flagManual`
+- `flagUniLOF`: added a correction, applied by default, that mitigates overflagging at relatively steep data value slopes (parameter `slope_correct`).
 ### Changed
 ### Removed
 ### Fixed

--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -4,11 +4,11 @@

 recommonmark==0.7.1
 sphinx==7.2.6
-sphinx-automodapi==0.16.0
+sphinx-automodapi==0.17.0
 sphinxcontrib-fulltoc==1.2.0
 sphinx-markdown-tables==0.0.17
 jupyter-sphinx==0.5.3
-sphinx_autodoc_typehints==1.25.2
-sphinx-tabs==3.4.4
+sphinx_autodoc_typehints==2.0.0
+sphinx-tabs==3.4.5
 sphinx-design==0.5.0
-pydata-sphinx-theme==0.14.4
+pydata-sphinx-theme==0.15.2
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,12 +5,12 @@
 Click==8.1.7
 docstring_parser==0.15
 dtw==1.4.0
-matplotlib==3.8.2
-numpy==1.26.2
+matplotlib==3.8.3
+numpy==1.26.4
 outlier-utils==0.0.5
-pyarrow==14.0.2
+pyarrow==15.0.0
 pandas==2.1.4
-scikit-learn==1.3.2
-scipy==1.11.4
+scikit-learn==1.4.1.post1
+scipy==1.12.0
 typing_extensions==4.5.0
 fancy-collections==0.2.1
--- a/saqc/core/core.py
+++ b/saqc/core/core.py
@@ -12,7 +12,7 @@ import warnings
 from copy import copy as shallowcopy
 from copy import deepcopy
 from functools import partial
-from typing import Any, Hashable, Iterable, MutableMapping, overload
+from typing import Any, Hashable, Iterable, MutableMapping

 import numpy as np
 import pandas as pd
@@ -32,7 +32,7 @@ from saqc.funcs import FunctionsMixin

 # warnings
 pd.set_option("mode.chained_assignment", "warn")
-pd.options.mode.copy_on_write = False
+pd.set_option("mode.copy_on_write", True)
 np.seterr(invalid="ignore")



--- a/saqc/funcs/flagtools.py
+++ b/saqc/funcs/flagtools.py
@@ -17,9 +17,16 @@ from typing_extensions import Literal

 from saqc import BAD, FILTER_ALL, UNFLAGGED
 from saqc.core import DictOfSeries, flagging, register
+from saqc.core.flags import Flags
 from saqc.core.history import History
 from saqc.lib.checking import validateChoice, validateWindow
-from saqc.lib.tools import initializeTargets, isflagged, isunflagged, toSequence
+from saqc.lib.tools import (
+    initializeTargets,
+    isflagged,
+    isunflagged,
+    multivariateParameters,
+    toSequence,
+)

 if TYPE_CHECKING:
    from saqc import SaQC
@@ -108,7 +115,7 @@ class FlagtoolsMixin:
        data :
            Determines which timestamps to set flags at, depending on the passed type:

-            * 1-d `array` or `List` of timestamps: flag `field` with `flag` at every timestamp in `f_data`
+            * 1-d `array`, `List` or `pandas.Index` of timestamps: flag `field` with `flag` at every timestamp in `data`
            * 2-d `array` or List of tuples: for all elements `t[k]` out of f_data:
              flag `field` with `flag` at every timestamp in between `t[k][0]` and `t[k][1]`
            * pd.Series: flag `field` with `flag` in between any index and data value of the passed series
@@ -123,7 +130,7 @@ class FlagtoolsMixin:
        to_flag = pd.Series(False, index=self._data[field].index)

        # check if f_data is meant to denote timestamps:
-        if (isinstance(data, (list, np.ndarray))) and not isinstance(
+        if (isinstance(data, (list, np.ndarray, pd.Index))) and not isinstance(
            data[0], (tuple, np.ndarray)
        ):
            set_idx = pd.DatetimeIndex(data).intersection(to_flag.index)
@@ -356,6 +363,7 @@ class FlagtoolsMixin:
        demask=[],
        squeeze=[],
        handles_target=True,  # function defines a target parameter, so it needs to handle it
+        multivariate=True,
    )
    def transferFlags(
        self: "SaQC",
@@ -415,16 +423,8 @@ class FlagtoolsMixin:
           0   -inf   -inf   -inf
           1  255.0  255.0  255.0
        """
-        history = self._flags.history[field]
-
-        if target is None:
-            target = field
-
-        if overwrite is False:
-            mask = isflagged(self._flags[target], thresh=kwargs["dfilter"])
-            history._hist[mask] = np.nan

-        # append a dummy column
+        fields, targets, broadcasting = multivariateParameters(field, target)
        meta = {
            "func": f"transferFlags",
            "args": (),
@@ -437,15 +437,45 @@ class FlagtoolsMixin:
            },
        }

-        if squeeze:
-            flags = history.squeeze(raw=True)
-            # init an empty history to which we later append the squeezed flags
-            history = History(index=history.index)
-        else:
+        for field, target in zip(fields, targets):
+            # initialize non existing targets
+            if target not in self._data:
+                self._data[target] = pd.Series(np.nan, index=self._data[field].index)
+                self._flags._data[target] = History(self._data[target].index)
+            if not self._data[field].index.equals(self._data[target].index):
+                raise ValueError(
+                    f"All Field and Target indices must match!\n"
+                    f"Indices of {field} and {target} seem to be not congruent within the context of the given\n"
+                    f"- fields: {fields}\n "
+                    f"- and targets: {targets}"
+                )
+            history = self._flags.history[field].copy(deep=True)
+
+            if overwrite is False:
+                mask = isflagged(self._flags[target], thresh=kwargs["dfilter"])
+                history._hist[mask] = np.nan
+
+            if squeeze:
+                # add squeezed flags
+                flags = history.squeeze(raw=True)
+                history = History(index=history.index).append(flags, meta)
+            elif broadcasting is False:
+                # add an empty flags column
+                flags = pd.Series(np.nan, index=history.index, dtype=float)
+                history.append(flags, meta)
+            # else:
+            #    broadcasting -> multiple fields will be written to one target
+            #    only add the fields' histories and add an empty column later
+
+            self._flags.history[target].append(history)
+
+        if broadcasting and not squeeze:
+            # add one final history column
+            # all targets are identical, if we broadcast fields -> target
+            target = targets[0]
+            history = self._flags.history[target]
            flags = pd.Series(np.nan, index=history.index, dtype=float)
-
-        history.append(flags, meta)
-        self._flags.history[target].append(history)
+            self._flags.history[target].append(flags, meta)

        return self


--- a/saqc/funcs/outliers.py
+++ b/saqc/funcs/outliers.py
@@ -179,6 +179,8 @@ class OutliersMixin:
        p: int = 1,
        density: Literal["auto"] | float = "auto",
        fill_na: bool = True,
+        slope_correct: bool = True,
+        min_offset: float = None,
        flag: float = BAD,
        **kwargs,
    ) -> "SaQC":
@@ -247,6 +249,15 @@ class OutliersMixin:
        fill_na :
            If True, NaNs in the data are filled with a linear interpolation.

+        slope_correct :
+            If True, outlier clusters that appear to be nothing more than sections of a
+            steep slope are excluded from flagging.
+
+        min_offset :
+            If set, only those outlier clusters are flagged that are preceded and succeeded
+            by sufficiently large value "jumps". If None (default), the required jump size is
+            estimated from the median of the absolute step sizes between data points.
+
        See Also
        --------
        :ref:`introduction to outlier detection with
@@ -366,8 +377,47 @@ class OutliersMixin:
            s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
        else:
            s_mask = s < -abs(thresh)
-
        s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask
+
+        if slope_correct:
+            g_mask = s_mask.diff()
+            g_mask = g_mask.cumsum()
+            dat = self._data[field]
+            od_groups = dat.interpolate("linear").groupby(by=g_mask)
+            first_vals = od_groups.first()
+            last_vals = od_groups.last()
+            max_vals = od_groups.max()
+            min_vals = od_groups.min()
+            if min_offset is None:
+                if density == "auto":
+                    d_diff = dat.diff()
+                    eps = d_diff.abs().median()
+                    if eps == 0:
+                        eps = d_diff[d_diff != 0].abs().median()
+                else:
+                    eps = density
+                eps = 3 * eps
+            else:
+                eps = min_offset
+            up_slopes = (min_vals + eps >= last_vals.shift(1)) & (
+                max_vals - eps <= first_vals.shift(-1)
+            )
+            down_slopes = (max_vals - eps <= last_vals.shift(1)) & (
+                min_vals + eps >= first_vals.shift(-1)
+            )
+            slopes = up_slopes | down_slopes
+            odd_return_pred = (max_vals > last_vals.shift(1)) & (
+                min_vals < last_vals.shift(1)
+            )
+            odd_return_succ = (max_vals > first_vals.shift(-1)) & (
+                min_vals < first_vals.shift(-1)
+            )
+            returns = odd_return_succ | odd_return_pred
+            corrections = returns | slopes
+            for s_id in corrections[corrections].index:
+                correct_idx = od_groups.get_group(s_id).index
+                s_mask[correct_idx] = False
+
        qc._flags[s_mask, field] = flag
        qc = qc.dropField(tmp_field)
        return qc

--- a/saqc/funcs/scores.py
+++ b/saqc/funcs/scores.py
@@ -503,9 +503,10 @@ class ScoresMixin:
            filled = pd.Series(False, index=vals.index)

        if density == "auto":
-            density = vals.diff().abs().median()
+            v_diff = vals.diff()
+            density = v_diff.abs().median()
            if density == 0:
-                density = vals.diff().abs().mean()
+                density = v_diff[v_diff != 0].abs().median()
        elif isinstance(density, Callable):
            density = density(vals)
        if isinstance(density, pd.Series):

--- a/saqc/funcs/tools.py
+++ b/saqc/funcs/tools.py
@@ -503,8 +503,9 @@ class ToolsMixin:
            and not isinstance(yscope[0], (list, tuple))
        ):
            yscope = tuple(yscope)
+        if yscope is not None:

-        ax_kwargs.update({"ylim": yscope})
+            ax_kwargs.update({"ylim": yscope})

        if not path:
            mpl.use(_MPL_DEFAULT_BACKEND)

--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -554,7 +554,7 @@ def initializeTargets(
    index: pd.Index,
 ):
    """
-    Initialize all targets based on field.
+    Initialize all targets based on fields.

    Note
    ----
@@ -652,3 +652,21 @@ def joinExt(sep: str, iterable: Iterable[str], last_sep: str | None = None) -> s
    if len(iterable) < 2:
        return sep.join(iterable)
    return f"{sep.join(iterable[:-1])}{last_sep}{iterable[-1]}"
+
+
+def multivariateParameters(
+    field: str | list[str], target: str | list[str] | None = None
+) -> tuple[list[str], list[str], bool]:
+    fields = toSequence(field)
+    targets = fields if target is None else toSequence(target)
+    broadcasting = False
+
+    if len(targets) == 1:
+        targets = targets * len(fields)
+        broadcasting = True
+    if len(targets) != len(fields):
+        raise ValueError(
+            "expected a single 'target' or the same number of 'field' and 'target' values"
+        )
+
+    return fields, targets, broadcasting
--- a/tests/core/test_flags.py
+++ b/tests/core/test_flags.py
@@ -122,7 +122,7 @@ def test_copy(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]):

    # the underling series data is the same
    for c in shallow.columns:
-        assert shallow._data[c].index is flags._data[c].index
+        assert shallow._data[c].index.equals(flags._data[c].index)

    # the underling series data was copied
    for c in deep.columns:

--- a/tests/core/test_history.py
+++ b/tests/core/test_history.py
@@ -143,7 +143,7 @@ def test_copy(data):
    assert is_equal(deep, shallow)

    # underling pandas data was only copied with deep=True
-    assert shallow.hist.index is hist.hist.index
+    assert shallow.hist.index.equals(hist.hist.index)
    assert deep.hist.index is not hist.hist.index



--- a/tests/funcs/test_flagtools.py
+++ b/tests/funcs/test_flagtools.py
@@ -178,6 +178,49 @@ def test__groupOperation(field, target, expected, copy):
            assert (result._data[f] == result._data[t]).all(axis=None)


+def test_transferFlags():
+    qc = SaQC(
+        data=pd.DataFrame(
+            {"x": [0, 1, 2, 3], "y": [0, 11, 22, 33], "z": [0, 111, 222, 333]}
+        ),
+        flags=pd.DataFrame({"x": [B, U, U, B], "y": [B, B, U, U], "z": [B, B, U, B]}),
+    )
+
+    # no squeeze
+    qc1 = qc.transferFlags("x", target="a")
+    assert qc1._history["a"].hist.iloc[:, :-1].equals(qc1._history["x"].hist)
+    assert qc1._history["a"].hist.iloc[:, -1].isna().all()
+
+    qc2 = qc.transferFlags(["x", "y"], target=["a", "b"])
+    assert qc2._history["a"].hist.iloc[:, :-1].equals(qc2._history["x"].hist)
+    assert qc2._history["a"].hist.iloc[:, -1].isna().all()
+    assert qc2._history["b"].hist.iloc[:, :-1].equals(qc2._history["y"].hist)
+    assert qc2._history["b"].hist.iloc[:, -1].isna().all()
+
+    # we use the overwrite option here for easy checking against the origin
+    # flags, because otherwise we would need to respect the inserted nan
+    qc3 = qc.transferFlags(["x", "y", "z"], target="a", overwrite=True)
+    assert qc3._history["a"].hist.iloc[:, 0].equals(qc3._history["x"].hist.squeeze())
+    assert qc3._history["a"].hist.iloc[:, 1].equals(qc3._history["y"].hist.squeeze())
+    assert qc3._history["a"].hist.iloc[:, 2].equals(qc3._history["z"].hist.squeeze())
+    assert qc3._history["a"].hist.iloc[:, -1].isna().all()
+
+    # squeeze
+    qc1 = qc.transferFlags("x", target="a", squeeze=True)
+    assert qc1._history["a"].hist.equals(qc1._history["x"].hist)
+
+    qc2 = qc.transferFlags(["x", "y"], target=["a", "b"], squeeze=True)
+    assert qc2._history["a"].hist.equals(qc2._history["x"].hist)
+    assert qc2._history["b"].hist.equals(qc2._history["y"].hist)
+
+    # we use the overwrite option here for easy checking against the origin
+    # flags, because otherwise we would need to respect the inserted nan
+    qc3 = qc.transferFlags(["x", "y", "z"], target="a", overwrite=True, squeeze=True)
+    assert qc3._history["a"].hist.iloc[:, 0].equals(qc3._history["x"].hist.squeeze())
+    assert qc3._history["a"].hist.iloc[:, 1].equals(qc3._history["y"].hist.squeeze())
+    assert qc3._history["a"].hist.iloc[:, 2].equals(qc3._history["z"].hist.squeeze())
+
+
 @pytest.mark.parametrize(
    "f_data",
    [

--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -2,9 +2,9 @@
 #
 # SPDX-License-Identifier: GPL-3.0-or-later

-beautifulsoup4==4.12.2
-hypothesis==6.92.2
-Markdown==3.5.1
+beautifulsoup4==4.12.3
+hypothesis==6.98.15
+Markdown==3.5.2
 pytest==7.4.4
 pytest-lazy-fixture==0.6.3
 requests==2.31.0
No results found