Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • berntm/saqc
  • rdm-software/saqc
  • schueler/saqc
3 results
Show changes
Commits on Source (34)
...@@ -15,8 +15,10 @@ SPDX-License-Identifier: GPL-3.0-or-later ...@@ -15,8 +15,10 @@ SPDX-License-Identifier: GPL-3.0-or-later
- `SaQC`: support for selection, slicing and setting of items by use of subscription on SaQC objects (e.g. `qc[key]` and `qc[key] = value`). - `SaQC`: support for selection, slicing and setting of items by use of subscription on SaQC objects (e.g. `qc[key]` and `qc[key] = value`).
Selection works with single keys, collections of keys and string slices (e.g. `qc["a":"f"]`). Values can be SaQC objects, pd.Series, Selection works with single keys, collections of keys and string slices (e.g. `qc["a":"f"]`). Values can be SaQC objects, pd.Series,
Iterable of Series and dict-like with series values. Iterable of Series and dict-like with series values.
- `transferFlags` is a multivariate function
- `plot`: added `yscope` keyword - `plot`: added `yscope` keyword
- `setFlags`: function to replace `flagManual` - `setFlags`: function to replace `flagManual`
- `flagUniLOF`: added a correction, applied by default, that mitigates overflagging at relatively steep data value slopes (parameter `slope_correct`).
### Changed ### Changed
### Removed ### Removed
### Fixed ### Fixed
......
...@@ -4,11 +4,11 @@ ...@@ -4,11 +4,11 @@
recommonmark==0.7.1 recommonmark==0.7.1
sphinx==7.2.6 sphinx==7.2.6
sphinx-automodapi==0.16.0 sphinx-automodapi==0.17.0
sphinxcontrib-fulltoc==1.2.0 sphinxcontrib-fulltoc==1.2.0
sphinx-markdown-tables==0.0.17 sphinx-markdown-tables==0.0.17
jupyter-sphinx==0.5.3 jupyter-sphinx==0.5.3
sphinx_autodoc_typehints==1.25.2 sphinx_autodoc_typehints==2.0.0
sphinx-tabs==3.4.4 sphinx-tabs==3.4.5
sphinx-design==0.5.0 sphinx-design==0.5.0
pydata-sphinx-theme==0.14.4 pydata-sphinx-theme==0.15.2
...@@ -5,12 +5,12 @@ ...@@ -5,12 +5,12 @@
Click==8.1.7 Click==8.1.7
docstring_parser==0.15 docstring_parser==0.15
dtw==1.4.0 dtw==1.4.0
matplotlib==3.8.2 matplotlib==3.8.3
numpy==1.26.2 numpy==1.26.4
outlier-utils==0.0.5 outlier-utils==0.0.5
pyarrow==14.0.2 pyarrow==15.0.0
pandas==2.1.4 pandas==2.1.4
scikit-learn==1.3.2 scikit-learn==1.4.1.post1
scipy==1.11.4 scipy==1.12.0
typing_extensions==4.5.0 typing_extensions==4.5.0
fancy-collections==0.2.1 fancy-collections==0.2.1
...@@ -12,7 +12,7 @@ import warnings ...@@ -12,7 +12,7 @@ import warnings
from copy import copy as shallowcopy from copy import copy as shallowcopy
from copy import deepcopy from copy import deepcopy
from functools import partial from functools import partial
from typing import Any, Hashable, Iterable, MutableMapping, overload from typing import Any, Hashable, Iterable, MutableMapping
import numpy as np import numpy as np
import pandas as pd import pandas as pd
...@@ -32,7 +32,7 @@ from saqc.funcs import FunctionsMixin ...@@ -32,7 +32,7 @@ from saqc.funcs import FunctionsMixin
# warnings # warnings
pd.set_option("mode.chained_assignment", "warn") pd.set_option("mode.chained_assignment", "warn")
pd.options.mode.copy_on_write = False pd.set_option("mode.copy_on_write", True)
np.seterr(invalid="ignore") np.seterr(invalid="ignore")
......
...@@ -17,9 +17,16 @@ from typing_extensions import Literal ...@@ -17,9 +17,16 @@ from typing_extensions import Literal
from saqc import BAD, FILTER_ALL, UNFLAGGED from saqc import BAD, FILTER_ALL, UNFLAGGED
from saqc.core import DictOfSeries, flagging, register from saqc.core import DictOfSeries, flagging, register
from saqc.core.flags import Flags
from saqc.core.history import History from saqc.core.history import History
from saqc.lib.checking import validateChoice, validateWindow from saqc.lib.checking import validateChoice, validateWindow
from saqc.lib.tools import initializeTargets, isflagged, isunflagged, toSequence from saqc.lib.tools import (
initializeTargets,
isflagged,
isunflagged,
multivariateParameters,
toSequence,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from saqc import SaQC from saqc import SaQC
...@@ -108,7 +115,7 @@ class FlagtoolsMixin: ...@@ -108,7 +115,7 @@ class FlagtoolsMixin:
data : data :
Determines which timestamps to set flags at, depending on the passed type: Determines which timestamps to set flags at, depending on the passed type:
* 1-d `array` or `List` of timestamps: flag `field` with `flag` at every timestamp in `f_data` * 1-d `array` or `List` of timestamps or `pandas.Index`: flag `field` with `flag` at every timestamp in `f_data`
* 2-d `array` or List of tuples: for all elements `t[k]` out of f_data: * 2-d `array` or List of tuples: for all elements `t[k]` out of f_data:
flag `field` with `flag` at every timestamp in between `t[k][0]` and `t[k][1]` flag `field` with `flag` at every timestamp in between `t[k][0]` and `t[k][1]`
* pd.Series: flag `field` with `flag` in between any index and data value of the passed series * pd.Series: flag `field` with `flag` in between any index and data value of the passed series
...@@ -123,7 +130,7 @@ class FlagtoolsMixin: ...@@ -123,7 +130,7 @@ class FlagtoolsMixin:
to_flag = pd.Series(False, index=self._data[field].index) to_flag = pd.Series(False, index=self._data[field].index)
# check if f_data is meant to denote timestamps: # check if f_data is meant to denote timestamps:
if (isinstance(data, (list, np.ndarray))) and not isinstance( if (isinstance(data, (list, np.ndarray, pd.Index))) and not isinstance(
data[0], (tuple, np.ndarray) data[0], (tuple, np.ndarray)
): ):
set_idx = pd.DatetimeIndex(data).intersection(to_flag.index) set_idx = pd.DatetimeIndex(data).intersection(to_flag.index)
...@@ -356,6 +363,7 @@ class FlagtoolsMixin: ...@@ -356,6 +363,7 @@ class FlagtoolsMixin:
demask=[], demask=[],
squeeze=[], squeeze=[],
handles_target=True, # function defines a target parameter, so it needs to handle it handles_target=True, # function defines a target parameter, so it needs to handle it
multivariate=True,
) )
def transferFlags( def transferFlags(
self: "SaQC", self: "SaQC",
...@@ -415,16 +423,8 @@ class FlagtoolsMixin: ...@@ -415,16 +423,8 @@ class FlagtoolsMixin:
0 -inf -inf -inf 0 -inf -inf -inf
1 255.0 255.0 255.0 1 255.0 255.0 255.0
""" """
history = self._flags.history[field]
if target is None:
target = field
if overwrite is False:
mask = isflagged(self._flags[target], thresh=kwargs["dfilter"])
history._hist[mask] = np.nan
# append a dummy column fields, targets, broadcasting = multivariateParameters(field, target)
meta = { meta = {
"func": f"transferFlags", "func": f"transferFlags",
"args": (), "args": (),
...@@ -437,15 +437,45 @@ class FlagtoolsMixin: ...@@ -437,15 +437,45 @@ class FlagtoolsMixin:
}, },
} }
if squeeze: for field, target in zip(fields, targets):
flags = history.squeeze(raw=True) # initialize non existing targets
# init an empty history to which we later append the squeezed flags if target not in self._data:
history = History(index=history.index) self._data[target] = pd.Series(np.nan, index=self._data[field].index)
else: self._flags._data[target] = History(self._data[target].index)
if not self._data[field].index.equals(self._data[target].index):
raise ValueError(
f"All Field and Target indices must match!\n"
f"Indices of {field} and {target} seem to be not congruent within the context of the given\n"
f"- fields: {fields}\n "
f"- and targets: {targets}"
)
history = self._flags.history[field].copy(deep=True)
if overwrite is False:
mask = isflagged(self._flags[target], thresh=kwargs["dfilter"])
history._hist[mask] = np.nan
if squeeze:
# add squeezed flags
flags = history.squeeze(raw=True)
history = History(index=history.index).append(flags, meta)
elif broadcasting is False:
# add an empty flags
flags = pd.Series(np.nan, index=history.index, dtype=float)
history.append(flags, meta)
# else:
# broadcasting -> multiple fields will be written to one target
# only add the fields' histories and add an empty column later
self._flags.history[target].append(history)
if broadcasting and not squeeze:
# add one final history column
# all targets are identical, if we broadcast fields -> target
target = targets[0]
history = self._flags.history[target]
flags = pd.Series(np.nan, index=history.index, dtype=float) flags = pd.Series(np.nan, index=history.index, dtype=float)
self._flags.history[target].append(flags, meta)
history.append(flags, meta)
self._flags.history[target].append(history)
return self return self
......
...@@ -179,6 +179,8 @@ class OutliersMixin: ...@@ -179,6 +179,8 @@ class OutliersMixin:
p: int = 1, p: int = 1,
density: Literal["auto"] | float = "auto", density: Literal["auto"] | float = "auto",
fill_na: bool = True, fill_na: bool = True,
slope_correct: bool = True,
min_offset: float = None,
flag: float = BAD, flag: float = BAD,
**kwargs, **kwargs,
) -> "SaQC": ) -> "SaQC":
...@@ -247,6 +249,15 @@ class OutliersMixin: ...@@ -247,6 +249,15 @@ class OutliersMixin:
fill_na : fill_na :
If True, NaNs in the data are filled with a linear interpolation. If True, NaNs in the data are filled with a linear interpolation.
slope_correct :
If True, a correction is applied that removes outlier clusters which actually
just seem to be steep slopes.
min_offset :
If set, only those outlier clusters will be flagged that are preceded and succeeded
by sufficiently large value "jumps". Defaults to estimating the sufficient value jumps from
the median over the absolute step sizes between data points.
See Also See Also
-------- --------
:ref:`introduction to outlier detection with :ref:`introduction to outlier detection with
...@@ -366,8 +377,47 @@ class OutliersMixin: ...@@ -366,8 +377,47 @@ class OutliersMixin:
s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3 s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
else: else:
s_mask = s < -abs(thresh) s_mask = s < -abs(thresh)
s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask
if slope_correct:
g_mask = s_mask.diff()
g_mask = g_mask.cumsum()
dat = self._data[field]
od_groups = dat.interpolate("linear").groupby(by=g_mask)
first_vals = od_groups.first()
last_vals = od_groups.last()
max_vals = od_groups.max()
min_vals = od_groups.min()
if min_offset is None:
if density == "auto":
d_diff = dat.diff()
eps = d_diff.abs().median()
if eps == 0:
eps = d_diff[d_diff != 0].abs().median()
else:
eps = density
eps = 3 * eps
else:
eps = min_offset
up_slopes = (min_vals + eps >= last_vals.shift(1)) & (
max_vals - eps <= first_vals.shift(-1)
)
down_slopes = (max_vals - eps <= last_vals.shift(1)) & (
min_vals + eps >= first_vals.shift(-1)
)
slopes = up_slopes | down_slopes
odd_return_pred = (max_vals > last_vals.shift(1)) & (
min_vals < last_vals.shift(1)
)
odd_return_succ = (max_vals > first_vals.shift(-1)) & (
min_vals < first_vals.shift(-1)
)
returns = odd_return_succ | odd_return_pred
corrections = returns | slopes
for s_id in corrections[corrections].index:
correct_idx = od_groups.get_group(s_id).index
s_mask[correct_idx] = False
qc._flags[s_mask, field] = flag qc._flags[s_mask, field] = flag
qc = qc.dropField(tmp_field) qc = qc.dropField(tmp_field)
return qc return qc
......
...@@ -503,9 +503,10 @@ class ScoresMixin: ...@@ -503,9 +503,10 @@ class ScoresMixin:
filled = pd.Series(False, index=vals.index) filled = pd.Series(False, index=vals.index)
if density == "auto": if density == "auto":
density = vals.diff().abs().median() v_diff = vals.diff()
density = v_diff.abs().median()
if density == 0: if density == 0:
density = vals.diff().abs().mean() density = v_diff[v_diff != 0].abs().median()
elif isinstance(density, Callable): elif isinstance(density, Callable):
density = density(vals) density = density(vals)
if isinstance(density, pd.Series): if isinstance(density, pd.Series):
......
...@@ -503,8 +503,9 @@ class ToolsMixin: ...@@ -503,8 +503,9 @@ class ToolsMixin:
and not isinstance(yscope[0], (list, tuple)) and not isinstance(yscope[0], (list, tuple))
): ):
yscope = tuple(yscope) yscope = tuple(yscope)
if yscope is not None:
ax_kwargs.update({"ylim": yscope}) ax_kwargs.update({"ylim": yscope})
if not path: if not path:
mpl.use(_MPL_DEFAULT_BACKEND) mpl.use(_MPL_DEFAULT_BACKEND)
......
...@@ -554,7 +554,7 @@ def initializeTargets( ...@@ -554,7 +554,7 @@ def initializeTargets(
index: pd.Index, index: pd.Index,
): ):
""" """
Initialize all targets based on field. Initialize all targets based on fields.
Note Note
---- ----
...@@ -652,3 +652,21 @@ def joinExt(sep: str, iterable: Iterable[str], last_sep: str | None = None) -> s ...@@ -652,3 +652,21 @@ def joinExt(sep: str, iterable: Iterable[str], last_sep: str | None = None) -> s
if len(iterable) < 2: if len(iterable) < 2:
return sep.join(iterable) return sep.join(iterable)
return f"{sep.join(iterable[:-1])}{last_sep}{iterable[-1]}" return f"{sep.join(iterable[:-1])}{last_sep}{iterable[-1]}"
def multivariateParameters(
    field: str | list[str], target: str | list[str] | None = None
) -> tuple[list[str], list[str], bool]:
    """
    Align ``field`` and ``target`` into two sequences of equal length.

    Both arguments are passed through ``toSequence``. If ``target`` is
    ``None``, the fields themselves are used as targets. A target sequence
    of length one is repeated once per field, and this case is reported
    through the returned ``broadcasting`` marker.

    Parameters
    ----------
    field :
        A single field name or a list of field names.

    target :
        A single target name, a list of target names or ``None``.

    Returns
    -------
    fields :
        The result of ``toSequence(field)``.

    targets :
        The targets, having the same length as ``fields``.

    broadcasting :
        ``True`` if the target sequence had exactly one element (this
        includes a single field given without a target), ``False``
        otherwise.

    Raises
    ------
    ValueError
        If, after repeating a single target, the numbers of fields and
        targets still differ.
    """
    fields = toSequence(field)
    if target is None:
        targets = fields
    else:
        targets = toSequence(target)

    # a lone target is shared by every field
    broadcasting = len(targets) == 1
    if broadcasting:
        targets = targets * len(fields)

    if len(fields) != len(targets):
        raise ValueError(
            "expected a single 'target' or the same number of 'field' and 'target' values"
        )

    return fields, targets, broadcasting
...@@ -122,7 +122,7 @@ def test_copy(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): ...@@ -122,7 +122,7 @@ def test_copy(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]):
# the underling series data is the same # the underling series data is the same
for c in shallow.columns: for c in shallow.columns:
assert shallow._data[c].index is flags._data[c].index assert shallow._data[c].index.equals(flags._data[c].index)
# the underling series data was copied # the underling series data was copied
for c in deep.columns: for c in deep.columns:
......
...@@ -143,7 +143,7 @@ def test_copy(data): ...@@ -143,7 +143,7 @@ def test_copy(data):
assert is_equal(deep, shallow) assert is_equal(deep, shallow)
# underling pandas data was only copied with deep=True # underling pandas data was only copied with deep=True
assert shallow.hist.index is hist.hist.index assert shallow.hist.index.equals(hist.hist.index)
assert deep.hist.index is not hist.hist.index assert deep.hist.index is not hist.hist.index
......
...@@ -178,6 +178,49 @@ def test__groupOperation(field, target, expected, copy): ...@@ -178,6 +178,49 @@ def test__groupOperation(field, target, expected, copy):
assert (result._data[f] == result._data[t]).all(axis=None) assert (result._data[f] == result._data[t]).all(axis=None)
def test_transferFlags():
    """
    Check the flag histories written by ``transferFlags``.

    Three call patterns are exercised, each without and with ``squeeze=True``:
    one field to one target, several fields to the same number of targets,
    and several fields to a single target.
    """
    values = {"x": [0, 1, 2, 3], "y": [0, 11, 22, 33], "z": [0, 111, 222, 333]}
    initial_flags = {"x": [B, U, U, B], "y": [B, B, U, U], "z": [B, B, U, B]}
    qc = SaQC(data=pd.DataFrame(values), flags=pd.DataFrame(initial_flags))

    # ---- without squeezing --------------------------------------------
    # the target history is the source history plus one trailing empty column
    single = qc.transferFlags("x", target="a")
    assert single._history["a"].hist.iloc[:, :-1].equals(single._history["x"].hist)
    assert single._history["a"].hist.iloc[:, -1].isna().all()

    paired = qc.transferFlags(["x", "y"], target=["a", "b"])
    for dst, src in (("a", "x"), ("b", "y")):
        assert paired._history[dst].hist.iloc[:, :-1].equals(paired._history[src].hist)
        assert paired._history[dst].hist.iloc[:, -1].isna().all()

    # `overwrite=True` keeps the transferred columns identical to the origin
    # flags; otherwise the NaNs inserted at already flagged positions would
    # have to be accounted for in the comparison
    merged = qc.transferFlags(["x", "y", "z"], target="a", overwrite=True)
    for pos, src in enumerate(("x", "y", "z")):
        assert (
            merged._history["a"].hist.iloc[:, pos].equals(merged._history[src].hist.squeeze())
        )
    assert merged._history["a"].hist.iloc[:, -1].isna().all()

    # ---- with squeezing -----------------------------------------------
    single = qc.transferFlags("x", target="a", squeeze=True)
    assert single._history["a"].hist.equals(single._history["x"].hist)

    paired = qc.transferFlags(["x", "y"], target=["a", "b"], squeeze=True)
    for dst, src in (("a", "x"), ("b", "y")):
        assert paired._history[dst].hist.equals(paired._history[src].hist)

    # same reasoning for `overwrite=True` as in the non-squeezed case above
    merged = qc.transferFlags(["x", "y", "z"], target="a", overwrite=True, squeeze=True)
    for pos, src in enumerate(("x", "y", "z")):
        assert (
            merged._history["a"].hist.iloc[:, pos].equals(merged._history[src].hist.squeeze())
        )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"f_data", "f_data",
[ [
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
# #
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
beautifulsoup4==4.12.2 beautifulsoup4==4.12.3
hypothesis==6.92.2 hypothesis==6.98.15
Markdown==3.5.1 Markdown==3.5.2
pytest==7.4.4 pytest==7.4.4
pytest-lazy-fixture==0.6.3 pytest-lazy-fixture==0.6.3
requests==2.31.0 requests==2.31.0