diff --git a/CHANGELOG.md b/CHANGELOG.md index f56df1fb219efc7404dde953c55df47ff3fd21eb..0f09eccdf09c944bd13dc1bd003de1e65ecd6933 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,8 +15,10 @@ SPDX-License-Identifier: GPL-3.0-or-later - `SaQC`: support for selection, slicing and setting of items by use of subscription on SaQC objects (e.g. `qc[key]` and `qc[key] = value`). Selection works with single keys, collections of keys and string slices (e.g. `qc["a":"f"]`). Values can be SaQC objects, pd.Series, Iterable of Series and dict-like with series values. +- `transferFlags` is a multivariate function - `plot`: added `yscope` keyword - `setFlags`: function to replace `flagManual` +- `flagUniLOF`: added defaultly applied correction to mitigate phenomenon of overflagging at relatively steep data value slopes. (parameter `slope_correct`). ### Changed ### Removed ### Fixed diff --git a/docs/requirements.txt b/docs/requirements.txt index 2aeaac87678dc41f9dc1d8b1eb919ec92a0300a3..8c277e95aa5b807e6b859d04b6c5b40dd86dd4b1 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,11 +4,11 @@ recommonmark==0.7.1 sphinx==7.2.6 -sphinx-automodapi==0.16.0 +sphinx-automodapi==0.17.0 sphinxcontrib-fulltoc==1.2.0 sphinx-markdown-tables==0.0.17 jupyter-sphinx==0.5.3 -sphinx_autodoc_typehints==1.25.2 -sphinx-tabs==3.4.4 +sphinx_autodoc_typehints==2.0.0 +sphinx-tabs==3.4.5 sphinx-design==0.5.0 -pydata-sphinx-theme==0.14.4 +pydata-sphinx-theme==0.15.2 diff --git a/requirements.txt b/requirements.txt index 4d8b456b9988f3e05b7885dea4a0e5d052f261d0..c9553ab49916a6ba411ae6533e51b90a48285995 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,12 +5,12 @@ Click==8.1.7 docstring_parser==0.15 dtw==1.4.0 -matplotlib==3.8.2 -numpy==1.26.2 +matplotlib==3.8.3 +numpy==1.26.4 outlier-utils==0.0.5 -pyarrow==14.0.2 +pyarrow==15.0.0 pandas==2.1.4 -scikit-learn==1.3.2 -scipy==1.11.4 +scikit-learn==1.4.1.post1 +scipy==1.12.0 typing_extensions==4.5.0 fancy-collections==0.2.1 diff --git 
a/saqc/core/core.py b/saqc/core/core.py index 916bcf3e48bcb984fe6cba28b5270297130bef33..43448cd8ef05f2c8817f28c862750f2417715431 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -12,7 +12,7 @@ import warnings from copy import copy as shallowcopy from copy import deepcopy from functools import partial -from typing import Any, Hashable, Iterable, MutableMapping, overload +from typing import Any, Hashable, Iterable, MutableMapping import numpy as np import pandas as pd @@ -32,7 +32,7 @@ from saqc.funcs import FunctionsMixin # warnings pd.set_option("mode.chained_assignment", "warn") -pd.options.mode.copy_on_write = False +pd.set_option("mode.copy_on_write", True) np.seterr(invalid="ignore") diff --git a/saqc/funcs/flagtools.py b/saqc/funcs/flagtools.py index 783ad6c2c52f356b008d9f2b9261d00781bcd0d3..029665a3f624beea6614fb2f577d4e8d1046175a 100644 --- a/saqc/funcs/flagtools.py +++ b/saqc/funcs/flagtools.py @@ -17,9 +17,16 @@ from typing_extensions import Literal from saqc import BAD, FILTER_ALL, UNFLAGGED from saqc.core import DictOfSeries, flagging, register +from saqc.core.flags import Flags from saqc.core.history import History from saqc.lib.checking import validateChoice, validateWindow -from saqc.lib.tools import initializeTargets, isflagged, isunflagged, toSequence +from saqc.lib.tools import ( + initializeTargets, + isflagged, + isunflagged, + multivariateParameters, + toSequence, +) if TYPE_CHECKING: from saqc import SaQC @@ -108,7 +115,7 @@ class FlagtoolsMixin: data : Determines which timestamps to set flags at, depending on the passed type: - * 1-d `array` or `List` of timestamps: flag `field` with `flag` at every timestamp in `f_data` + * 1-d `array` or `List` of timestamps or `pandas.Index`: flag `field` with `flag` at every timestamp in `f_data` * 2-d `array` or List of tuples: for all elements `t[k]` out of f_data: flag `field` with `flag` at every timestamp in between `t[k][0]` and `t[k][1]` * pd.Series: flag `field` with `flag` in between any 
index and data value of the passed series @@ -123,7 +130,7 @@ class FlagtoolsMixin: to_flag = pd.Series(False, index=self._data[field].index) # check if f_data is meant to denote timestamps: - if (isinstance(data, (list, np.ndarray))) and not isinstance( + if (isinstance(data, (list, np.ndarray, pd.Index))) and not isinstance( data[0], (tuple, np.ndarray) ): set_idx = pd.DatetimeIndex(data).intersection(to_flag.index) @@ -356,6 +363,7 @@ class FlagtoolsMixin: demask=[], squeeze=[], handles_target=True, # function defines a target parameter, so it needs to handle it + multivariate=True, ) def transferFlags( self: "SaQC", @@ -415,16 +423,8 @@ class FlagtoolsMixin: 0 -inf -inf -inf 1 255.0 255.0 255.0 """ - history = self._flags.history[field] - - if target is None: - target = field - - if overwrite is False: - mask = isflagged(self._flags[target], thresh=kwargs["dfilter"]) - history._hist[mask] = np.nan - # append a dummy column + fields, targets, broadcasting = multivariateParameters(field, target) meta = { "func": f"transferFlags", "args": (), @@ -437,15 +437,45 @@ class FlagtoolsMixin: }, } - if squeeze: - flags = history.squeeze(raw=True) - # init an empty history to which we later append the squeezed flags - history = History(index=history.index) - else: + for field, target in zip(fields, targets): + # initialize non existing targets + if target not in self._data: + self._data[target] = pd.Series(np.nan, index=self._data[field].index) + self._flags._data[target] = History(self._data[target].index) + if not self._data[field].index.equals(self._data[target].index): + raise ValueError( + f"All Field and Target indices must match!\n" + f"Indices of {field} and {target} seem to be not congruent within the context of the given\n" + f"- fields: {fields}\n " + f"- and targets: {targets}" + ) + history = self._flags.history[field].copy(deep=True) + + if overwrite is False: + mask = isflagged(self._flags[target], thresh=kwargs["dfilter"]) + history._hist[mask] = np.nan + 
+ + if squeeze: + # add squeezed flags + flags = history.squeeze(raw=True) + history = History(index=history.index).append(flags, meta) + elif broadcasting is False: + # add an empty flags column + flags = pd.Series(np.nan, index=history.index, dtype=float) + history.append(flags, meta) + # else: + # broadcasting -> multiple fields will be written to one target + # only add the fields' histories and add an empty column later + + self._flags.history[target].append(history) + + if broadcasting and not squeeze: + # add one final history column + # all targets are identical, if we broadcast fields -> target + target = targets[0] + history = self._flags.history[target] flags = pd.Series(np.nan, index=history.index, dtype=float) - - history.append(flags, meta) - self._flags.history[target].append(history) + self._flags.history[target].append(flags, meta) return self diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index 7b5a4b93094c53b856963af5ee3c57e0910b08ae..dfb1a84ef8a51ddba8a2f7a43b31ec60e8549499 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -179,6 +179,8 @@ class OutliersMixin: p: int = 1, density: Literal["auto"] | float = "auto", fill_na: bool = True, + slope_correct: bool = True, + min_offset: float = None, flag: float = BAD, **kwargs, ) -> "SaQC": @@ -247,6 +249,15 @@ class OutliersMixin: fill_na : If True, NaNs in the data are filled with a linear interpolation. + slope_correct : + If True, a correction is applied that removes outlier clusters that actually + just seem to be steep slopes + + min_offset : + If set, only those outlier clusters will be flagged that are preceded and succeeded + by sufficiently large value "jumps". Defaults to estimating the sufficient value jumps from + the median over the absolute step sizes between data points. 
+ See Also -------- :ref:`introduction to outlier detection with @@ -366,8 +377,47 @@ class OutliersMixin: s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3 else: s_mask = s < -abs(thresh) - s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask + + if slope_correct: + g_mask = s_mask.diff() + g_mask = g_mask.cumsum() + dat = self._data[field] + od_groups = dat.interpolate("linear").groupby(by=g_mask) + first_vals = od_groups.first() + last_vals = od_groups.last() + max_vals = od_groups.max() + min_vals = od_groups.min() + if min_offset is None: + if density == "auto": + d_diff = dat.diff() + eps = d_diff.abs().median() + if eps == 0: + eps = d_diff[d_diff != 0].abs().median() + else: + eps = density + eps = 3 * eps + else: + eps = min_offset + up_slopes = (min_vals + eps >= last_vals.shift(1)) & ( + max_vals - eps <= first_vals.shift(-1) + ) + down_slopes = (max_vals - eps <= last_vals.shift(1)) & ( + min_vals + eps >= first_vals.shift(-1) + ) + slopes = up_slopes | down_slopes + odd_return_pred = (max_vals > last_vals.shift(1)) & ( + min_vals < last_vals.shift(1) + ) + odd_return_succ = (max_vals > first_vals.shift(-1)) & ( + min_vals < first_vals.shift(-1) + ) + returns = odd_return_succ | odd_return_pred + corrections = returns | slopes + for s_id in corrections[corrections].index: + correct_idx = od_groups.get_group(s_id).index + s_mask[correct_idx] = False + qc._flags[s_mask, field] = flag qc = qc.dropField(tmp_field) return qc diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py index 18f4c6f62abc8d6426d65c6cd99af3e212b728f8..9998c153477390add9805a11f1eb2c27c21ca09d 100644 --- a/saqc/funcs/scores.py +++ b/saqc/funcs/scores.py @@ -503,9 +503,10 @@ class ScoresMixin: filled = pd.Series(False, index=vals.index) if density == "auto": - density = vals.diff().abs().median() + v_diff = vals.diff() + density = v_diff.abs().median() if density == 0: - density = vals.diff().abs().mean() + density = v_diff[v_diff != 
0].abs().median() elif isinstance(density, Callable): density = density(vals) if isinstance(density, pd.Series): diff --git a/saqc/funcs/tools.py b/saqc/funcs/tools.py index db4b3700af72d4492134d3537fe384e7684ddf8e..e0f53fabbc32a8b6a89c2d0af1a68267f1dfc565 100644 --- a/saqc/funcs/tools.py +++ b/saqc/funcs/tools.py @@ -503,8 +503,9 @@ class ToolsMixin: and not isinstance(yscope[0], (list, tuple)) ): yscope = tuple(yscope) + if yscope is not None: - ax_kwargs.update({"ylim": yscope}) + ax_kwargs.update({"ylim": yscope}) if not path: mpl.use(_MPL_DEFAULT_BACKEND) diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index 003423570c14c9177bcd9d10ea2ff7d8b6c42152..4dbcf710813b7668b60a716609fb0183ea443c83 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -554,7 +554,7 @@ def initializeTargets( index: pd.Index, ): """ - Initialize all targets based on field. + Initialize all targets based on fields. Note ---- @@ -652,3 +652,21 @@ def joinExt(sep: str, iterable: Iterable[str], last_sep: str | None = None) -> s if len(iterable) < 2: return sep.join(iterable) return f"{sep.join(iterable[:-1])}{last_sep}{iterable[-1]}" + + +def multivariateParameters( + field: str | list[str], target: str | list[str] | None = None +) -> tuple[list[str], list[str], bool]: + fields = toSequence(field) + targets = fields if target is None else toSequence(target) + broadcasting = False + + if len(targets) == 1: + targets = targets * len(fields) + broadcasting = True + if len(targets) != len(fields): + raise ValueError( + "expected a single 'target' or the same number of 'field' and 'target' values" + ) + + return fields, targets, broadcasting diff --git a/tests/core/test_flags.py b/tests/core/test_flags.py index cc8949bf620cdabb4da6aa5ef72f9552a35d379c..c9628d49cd7f333e9fc7527dd05db48ddaf0cffc 100644 --- a/tests/core/test_flags.py +++ b/tests/core/test_flags.py @@ -122,7 +122,7 @@ def test_copy(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): # the underling series data is 
the same for c in shallow.columns: - assert shallow._data[c].index is flags._data[c].index + assert shallow._data[c].index.equals(flags._data[c].index) # the underling series data was copied for c in deep.columns: diff --git a/tests/core/test_history.py b/tests/core/test_history.py index e8279f46237482d0d90c7d29bc00480c664015e1..cb3412c370f90046bf14a062176932ed0bee9984 100644 --- a/tests/core/test_history.py +++ b/tests/core/test_history.py @@ -143,7 +143,7 @@ def test_copy(data): assert is_equal(deep, shallow) # underling pandas data was only copied with deep=True - assert shallow.hist.index is hist.hist.index + assert shallow.hist.index.equals(hist.hist.index) assert deep.hist.index is not hist.hist.index diff --git a/tests/funcs/test_flagtools.py b/tests/funcs/test_flagtools.py index 9c650ab65ca482e5948ff07dee20c5f0ab9a1ddd..6bda00301e2bd493dadff6fa2c2205b46467419b 100644 --- a/tests/funcs/test_flagtools.py +++ b/tests/funcs/test_flagtools.py @@ -178,6 +178,49 @@ def test__groupOperation(field, target, expected, copy): assert (result._data[f] == result._data[t]).all(axis=None) +def test_transferFlags(): + qc = SaQC( + data=pd.DataFrame( + {"x": [0, 1, 2, 3], "y": [0, 11, 22, 33], "z": [0, 111, 222, 333]} + ), + flags=pd.DataFrame({"x": [B, U, U, B], "y": [B, B, U, U], "z": [B, B, U, B]}), + ) + + # no squeeze + qc1 = qc.transferFlags("x", target="a") + assert qc1._history["a"].hist.iloc[:, :-1].equals(qc1._history["x"].hist) + assert qc1._history["a"].hist.iloc[:, -1].isna().all() + + qc2 = qc.transferFlags(["x", "y"], target=["a", "b"]) + assert qc2._history["a"].hist.iloc[:, :-1].equals(qc2._history["x"].hist) + assert qc2._history["a"].hist.iloc[:, -1].isna().all() + assert qc2._history["b"].hist.iloc[:, :-1].equals(qc2._history["y"].hist) + assert qc2._history["b"].hist.iloc[:, -1].isna().all() + + # we use the overwrite option here for easy checking against the origin + # flags, because otherwise we would need to respect the inserted nan + qc3 = 
qc.transferFlags(["x", "y", "z"], target="a", overwrite=True) + assert qc3._history["a"].hist.iloc[:, 0].equals(qc3._history["x"].hist.squeeze()) + assert qc3._history["a"].hist.iloc[:, 1].equals(qc3._history["y"].hist.squeeze()) + assert qc3._history["a"].hist.iloc[:, 2].equals(qc3._history["z"].hist.squeeze()) + assert qc3._history["a"].hist.iloc[:, -1].isna().all() + + # squeeze + qc1 = qc.transferFlags("x", target="a", squeeze=True) + assert qc1._history["a"].hist.equals(qc1._history["x"].hist) + + qc2 = qc.transferFlags(["x", "y"], target=["a", "b"], squeeze=True) + assert qc2._history["a"].hist.equals(qc2._history["x"].hist) + assert qc2._history["b"].hist.equals(qc2._history["y"].hist) + + # we use the overwrite option here for easy checking against the origin + # flags, because otherwise we would need to respect the inserted nan + qc3 = qc.transferFlags(["x", "y", "z"], target="a", overwrite=True, squeeze=True) + assert qc3._history["a"].hist.iloc[:, 0].equals(qc3._history["x"].hist.squeeze()) + assert qc3._history["a"].hist.iloc[:, 1].equals(qc3._history["y"].hist.squeeze()) + assert qc3._history["a"].hist.iloc[:, 2].equals(qc3._history["z"].hist.squeeze()) + + @pytest.mark.parametrize( "f_data", [