Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • berntm/saqc
  • rdm-software/saqc
  • schueler/saqc
3 results
Show changes
Commits on Source (34)
......@@ -15,8 +15,10 @@ SPDX-License-Identifier: GPL-3.0-or-later
- `SaQC`: support for selection, slicing and setting of items by use of subscription on SaQC objects (e.g. `qc[key]` and `qc[key] = value`).
Selection works with single keys, collections of keys and string slices (e.g. `qc["a":"f"]`). Values can be SaQC objects, pd.Series,
Iterable of Series and dict-like with series values.
- `transferFlags` is a multivariate function
- `plot`: added `yscope` keyword
- `setFlags`: function to replace `flagManual`
- `flagUniLOF`: added a correction, applied by default, that mitigates overflagging at relatively steep data value slopes (parameter `slope_correct`).
### Changed
### Removed
### Fixed
......
......@@ -4,11 +4,11 @@
recommonmark==0.7.1
sphinx==7.2.6
sphinx-automodapi==0.16.0
sphinx-automodapi==0.17.0
sphinxcontrib-fulltoc==1.2.0
sphinx-markdown-tables==0.0.17
jupyter-sphinx==0.5.3
sphinx_autodoc_typehints==1.25.2
sphinx-tabs==3.4.4
sphinx_autodoc_typehints==2.0.0
sphinx-tabs==3.4.5
sphinx-design==0.5.0
pydata-sphinx-theme==0.14.4
pydata-sphinx-theme==0.15.2
......@@ -5,12 +5,12 @@
Click==8.1.7
docstring_parser==0.15
dtw==1.4.0
matplotlib==3.8.2
numpy==1.26.2
matplotlib==3.8.3
numpy==1.26.4
outlier-utils==0.0.5
pyarrow==14.0.2
pyarrow==15.0.0
pandas==2.1.4
scikit-learn==1.3.2
scipy==1.11.4
scikit-learn==1.4.1.post1
scipy==1.12.0
typing_extensions==4.5.0
fancy-collections==0.2.1
......@@ -12,7 +12,7 @@ import warnings
from copy import copy as shallowcopy
from copy import deepcopy
from functools import partial
from typing import Any, Hashable, Iterable, MutableMapping, overload
from typing import Any, Hashable, Iterable, MutableMapping
import numpy as np
import pandas as pd
......@@ -32,7 +32,7 @@ from saqc.funcs import FunctionsMixin
# warnings
pd.set_option("mode.chained_assignment", "warn")
pd.options.mode.copy_on_write = False
pd.set_option("mode.copy_on_write", True)
np.seterr(invalid="ignore")
......
......@@ -17,9 +17,16 @@ from typing_extensions import Literal
from saqc import BAD, FILTER_ALL, UNFLAGGED
from saqc.core import DictOfSeries, flagging, register
from saqc.core.flags import Flags
from saqc.core.history import History
from saqc.lib.checking import validateChoice, validateWindow
from saqc.lib.tools import initializeTargets, isflagged, isunflagged, toSequence
from saqc.lib.tools import (
initializeTargets,
isflagged,
isunflagged,
multivariateParameters,
toSequence,
)
if TYPE_CHECKING:
from saqc import SaQC
......@@ -108,7 +115,7 @@ class FlagtoolsMixin:
data :
Determines which timestamps to set flags at, depending on the passed type:
* 1-d `array` or `List` of timestamps: flag `field` with `flag` at every timestamp in `f_data`
* 1-d `array` or `List` of timestamps or `pandas.Index`: flag `field` with `flag` at every timestamp in `data`
* 2-d `array` or List of tuples: for all elements `t[k]` out of `data`:
flag `field` with `flag` at every timestamp in between `t[k][0]` and `t[k][1]`
* pd.Series: flag `field` with `flag` in between any index and data value of the passed series
......@@ -123,7 +130,7 @@ class FlagtoolsMixin:
to_flag = pd.Series(False, index=self._data[field].index)
# check if f_data is meant to denote timestamps:
if (isinstance(data, (list, np.ndarray))) and not isinstance(
if (isinstance(data, (list, np.ndarray, pd.Index))) and not isinstance(
data[0], (tuple, np.ndarray)
):
set_idx = pd.DatetimeIndex(data).intersection(to_flag.index)
......@@ -356,6 +363,7 @@ class FlagtoolsMixin:
demask=[],
squeeze=[],
handles_target=True, # function defines a target parameter, so it needs to handle it
multivariate=True,
)
def transferFlags(
self: "SaQC",
......@@ -415,16 +423,8 @@ class FlagtoolsMixin:
0 -inf -inf -inf
1 255.0 255.0 255.0
"""
history = self._flags.history[field]
if target is None:
target = field
if overwrite is False:
mask = isflagged(self._flags[target], thresh=kwargs["dfilter"])
history._hist[mask] = np.nan
# append a dummy column
fields, targets, broadcasting = multivariateParameters(field, target)
meta = {
"func": f"transferFlags",
"args": (),
......@@ -437,15 +437,45 @@ class FlagtoolsMixin:
},
}
if squeeze:
flags = history.squeeze(raw=True)
# init an empty history to which we later append the squeezed flags
history = History(index=history.index)
else:
for field, target in zip(fields, targets):
# initialize non existing targets
if target not in self._data:
self._data[target] = pd.Series(np.nan, index=self._data[field].index)
self._flags._data[target] = History(self._data[target].index)
if not self._data[field].index.equals(self._data[target].index):
raise ValueError(
f"All Field and Target indices must match!\n"
f"Indices of {field} and {target} seem to be not congruent within the context of the given\n"
f"- fields: {fields}\n "
f"- and targets: {targets}"
)
history = self._flags.history[field].copy(deep=True)
if overwrite is False:
mask = isflagged(self._flags[target], thresh=kwargs["dfilter"])
history._hist[mask] = np.nan
if squeeze:
# add squeezed flags
flags = history.squeeze(raw=True)
history = History(index=history.index).append(flags, meta)
elif broadcasting is False:
# add an empty flags
flags = pd.Series(np.nan, index=history.index, dtype=float)
history.append(flags, meta)
# else:
# broadcasting -> multiple fields will be written to one target
# only add the fields' histories and add an empty column later
self._flags.history[target].append(history)
if broadcasting and not squeeze:
# add one final history column
# all targets are identical, if we broadcast fields -> target
target = targets[0]
history = self._flags.history[target]
flags = pd.Series(np.nan, index=history.index, dtype=float)
history.append(flags, meta)
self._flags.history[target].append(history)
self._flags.history[target].append(flags, meta)
return self
......
......@@ -179,6 +179,8 @@ class OutliersMixin:
p: int = 1,
density: Literal["auto"] | float = "auto",
fill_na: bool = True,
slope_correct: bool = True,
min_offset: float = None,
flag: float = BAD,
**kwargs,
) -> "SaQC":
......@@ -247,6 +249,15 @@ class OutliersMixin:
fill_na :
If True, NaNs in the data are filled with a linear interpolation.
slope_correct :
If True, a correction is applied that removes outlier clusters that actually
just seem to be steep slopes.
min_offset :
If set, only those outlier clusters will be flagged that are preceded and succeeded
by sufficiently large value "jumps". Defaults to estimating the sufficient value jumps from
the median over the absolute step sizes between data points.
See Also
--------
:ref:`introduction to outlier detection with
......@@ -366,8 +377,47 @@ class OutliersMixin:
s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
else:
s_mask = s < -abs(thresh)
s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask
if slope_correct:
g_mask = s_mask.diff()
g_mask = g_mask.cumsum()
dat = self._data[field]
od_groups = dat.interpolate("linear").groupby(by=g_mask)
first_vals = od_groups.first()
last_vals = od_groups.last()
max_vals = od_groups.max()
min_vals = od_groups.min()
if min_offset is None:
if density == "auto":
d_diff = dat.diff()
eps = d_diff.abs().median()
if eps == 0:
eps = d_diff[d_diff != 0].abs().median()
else:
eps = density
eps = 3 * eps
else:
eps = min_offset
up_slopes = (min_vals + eps >= last_vals.shift(1)) & (
max_vals - eps <= first_vals.shift(-1)
)
down_slopes = (max_vals - eps <= last_vals.shift(1)) & (
min_vals + eps >= first_vals.shift(-1)
)
slopes = up_slopes | down_slopes
odd_return_pred = (max_vals > last_vals.shift(1)) & (
min_vals < last_vals.shift(1)
)
odd_return_succ = (max_vals > first_vals.shift(-1)) & (
min_vals < first_vals.shift(-1)
)
returns = odd_return_succ | odd_return_pred
corrections = returns | slopes
for s_id in corrections[corrections].index:
correct_idx = od_groups.get_group(s_id).index
s_mask[correct_idx] = False
qc._flags[s_mask, field] = flag
qc = qc.dropField(tmp_field)
return qc
......
......@@ -503,9 +503,10 @@ class ScoresMixin:
filled = pd.Series(False, index=vals.index)
if density == "auto":
density = vals.diff().abs().median()
v_diff = vals.diff()
density = v_diff.abs().median()
if density == 0:
density = vals.diff().abs().mean()
density = v_diff[v_diff != 0].abs().median()
elif isinstance(density, Callable):
density = density(vals)
if isinstance(density, pd.Series):
......
......@@ -503,8 +503,9 @@ class ToolsMixin:
and not isinstance(yscope[0], (list, tuple))
):
yscope = tuple(yscope)
if yscope is not None:
ax_kwargs.update({"ylim": yscope})
ax_kwargs.update({"ylim": yscope})
if not path:
mpl.use(_MPL_DEFAULT_BACKEND)
......
......@@ -554,7 +554,7 @@ def initializeTargets(
index: pd.Index,
):
"""
Initialize all targets based on field.
Initialize all targets based on fields.
Note
----
......@@ -652,3 +652,21 @@ def joinExt(sep: str, iterable: Iterable[str], last_sep: str | None = None) -> s
if len(iterable) < 2:
return sep.join(iterable)
return f"{sep.join(iterable[:-1])}{last_sep}{iterable[-1]}"
def multivariateParameters(
    field: str | list[str], target: str | list[str] | None = None
) -> tuple[list[str], list[str], bool]:
    """
    Align the `field` and `target` arguments of a multivariate function.

    Both arguments are passed through `toSequence`. If `target` is ``None``,
    the fields are used as targets. A single target is repeated once
    per field; this case is reported as "broadcasting".

    Parameters
    ----------
    field :
        One field name or several field names.

    target :
        One target name, several target names or ``None``.

    Returns
    -------
    fields :
        The result of ``toSequence(field)``.

    targets :
        The targets, with the same length as ``fields``.

    broadcasting :
        ``True`` if exactly one target was given (or derived from ``field``)
        and it was repeated to match the number of fields, ``False`` otherwise.

    Raises
    ------
    ValueError
        If more than one target is given and the number of targets differs
        from the number of fields.
    """
    fields = toSequence(field)
    if target is None:
        targets = fields
    else:
        targets = toSequence(target)

    # a single target receives the results of all fields
    broadcasting = len(targets) == 1
    if broadcasting:
        targets = targets * len(fields)

    if len(fields) != len(targets):
        raise ValueError(
            "expected a single 'target' or the same number of 'field' and 'target' values"
        )

    return fields, targets, broadcasting
......@@ -122,7 +122,7 @@ def test_copy(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]):
# the underlying series data is the same
for c in shallow.columns:
assert shallow._data[c].index is flags._data[c].index
assert shallow._data[c].index.equals(flags._data[c].index)
# the underlying series data was copied
for c in deep.columns:
......
......@@ -143,7 +143,7 @@ def test_copy(data):
assert is_equal(deep, shallow)
# underlying pandas data was only copied with deep=True
assert shallow.hist.index is hist.hist.index
assert shallow.hist.index.equals(hist.hist.index)
assert deep.hist.index is not hist.hist.index
......
......@@ -178,6 +178,49 @@ def test__groupOperation(field, target, expected, copy):
assert (result._data[f] == result._data[t]).all(axis=None)
def test_transferFlags():
    """
    Exercise `transferFlags` with one field and one target, with pairwise
    fields and targets, and with several fields written to one target,
    each with and without `squeeze`.
    """
    data = pd.DataFrame(
        {"x": [0, 1, 2, 3], "y": [0, 11, 22, 33], "z": [0, 111, 222, 333]}
    )
    flags = pd.DataFrame({"x": [B, U, U, B], "y": [B, B, U, U], "z": [B, B, U, B]})
    qc = SaQC(data=data, flags=flags)

    pairs = (("x", "a"), ("y", "b"))
    sources = ("x", "y", "z")

    # no squeeze: the target holds the field's history plus one trailing
    # all-NaN column
    single = qc.transferFlags("x", target="a")
    assert single._history["a"].hist.iloc[:, :-1].equals(single._history["x"].hist)
    assert single._history["a"].hist.iloc[:, -1].isna().all()

    paired = qc.transferFlags(["x", "y"], target=["a", "b"])
    for source, dest in pairs:
        assert paired._history[dest].hist.iloc[:, :-1].equals(
            paired._history[source].hist
        )
        assert paired._history[dest].hist.iloc[:, -1].isna().all()

    # we use the overwrite option here for easy checking against the origin
    # flags, because otherwise we would need to respect the inserted nan
    merged = qc.transferFlags(["x", "y", "z"], target="a", overwrite=True)
    for pos, source in enumerate(sources):
        assert merged._history["a"].hist.iloc[:, pos].equals(
            merged._history[source].hist.squeeze()
        )
    assert merged._history["a"].hist.iloc[:, -1].isna().all()

    # squeeze: the target history is compared to the field history directly
    single = qc.transferFlags("x", target="a", squeeze=True)
    assert single._history["a"].hist.equals(single._history["x"].hist)

    paired = qc.transferFlags(["x", "y"], target=["a", "b"], squeeze=True)
    for source, dest in pairs:
        assert paired._history[dest].hist.equals(paired._history[source].hist)

    # we use the overwrite option here for easy checking against the origin
    # flags, because otherwise we would need to respect the inserted nan
    merged = qc.transferFlags(
        ["x", "y", "z"], target="a", overwrite=True, squeeze=True
    )
    for pos, source in enumerate(sources):
        assert merged._history["a"].hist.iloc[:, pos].equals(
            merged._history[source].hist.squeeze()
        )
@pytest.mark.parametrize(
"f_data",
[
......
......@@ -2,9 +2,9 @@
#
# SPDX-License-Identifier: GPL-3.0-or-later
beautifulsoup4==4.12.2
hypothesis==6.92.2
Markdown==3.5.1
beautifulsoup4==4.12.3
hypothesis==6.98.15
Markdown==3.5.2
pytest==7.4.4
pytest-lazy-fixture==0.6.3
requests==2.31.0