From 071aa617757541bd188dd892063779d9d4d8be84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Peter=20L=C3=BCnenschlo=C3=9F?= <peter.luenenschloss@ufz.de> Date: Mon, 4 Mar 2024 11:34:02 +0100 Subject: [PATCH] Adding plausibility checks to `flagUniLOF` --- CHANGELOG.md | 1 + saqc/funcs/outliers.py | 52 +++++++++++++++++++++++++++++++++++++++++- saqc/funcs/scores.py | 5 ++-- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f56df1fb2..ef5c8a0ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ SPDX-License-Identifier: GPL-3.0-or-later Iterable of Series and dict-like with series values. - `plot`: added `yscope` keyword - `setFlags`: function to replace `flagManual` +- `flagUniLOF`: added a correction, applied by default, to mitigate the phenomenon of overflagging at relatively steep data value slopes (parameter `slope_correct`). ### Changed ### Removed ### Fixed diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index 7b5a4b930..dfb1a84ef 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -179,6 +179,8 @@ class OutliersMixin: p: int = 1, density: Literal["auto"] | float = "auto", fill_na: bool = True, + slope_correct: bool = True, + min_offset: float = None, flag: float = BAD, **kwargs, ) -> "SaQC": @@ -247,6 +249,15 @@ class OutliersMixin: fill_na : If True, NaNs in the data are filled with a linear interpolation. + slope_correct : + If True, a correction is applied that removes outlier clusters that actually + just seem to be steep slopes. + + min_offset : + If set, only those outlier clusters will be flagged that are preceded and succeeded + by sufficiently large value "jumps". Defaults to estimating the sufficient value jumps from + the median over the absolute step sizes between data points. 
+ See Also -------- :ref:`introduction to outlier detection with @@ -366,8 +377,47 @@ class OutliersMixin: s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3 else: s_mask = s < -abs(thresh) - s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask + + if slope_correct: + g_mask = s_mask.diff() + g_mask = g_mask.cumsum() + dat = self._data[field] + od_groups = dat.interpolate("linear").groupby(by=g_mask) + first_vals = od_groups.first() + last_vals = od_groups.last() + max_vals = od_groups.max() + min_vals = od_groups.min() + if min_offset is None: + if density == "auto": + d_diff = dat.diff() + eps = d_diff.abs().median() + if eps == 0: + eps = d_diff[d_diff != 0].abs().median() + else: + eps = density + eps = 3 * eps + else: + eps = min_offset + up_slopes = (min_vals + eps >= last_vals.shift(1)) & ( + max_vals - eps <= first_vals.shift(-1) + ) + down_slopes = (max_vals - eps <= last_vals.shift(1)) & ( + min_vals + eps >= first_vals.shift(-1) + ) + slopes = up_slopes | down_slopes + odd_return_pred = (max_vals > last_vals.shift(1)) & ( + min_vals < last_vals.shift(1) + ) + odd_return_succ = (max_vals > first_vals.shift(-1)) & ( + min_vals < first_vals.shift(-1) + ) + returns = odd_return_succ | odd_return_pred + corrections = returns | slopes + for s_id in corrections[corrections].index: + correct_idx = od_groups.get_group(s_id).index + s_mask[correct_idx] = False + qc._flags[s_mask, field] = flag qc = qc.dropField(tmp_field) return qc diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py index 18f4c6f62..9998c1534 100644 --- a/saqc/funcs/scores.py +++ b/saqc/funcs/scores.py @@ -503,9 +503,10 @@ class ScoresMixin: filled = pd.Series(False, index=vals.index) if density == "auto": - density = vals.diff().abs().median() + v_diff = vals.diff() + density = v_diff.abs().median() if density == 0: - density = vals.diff().abs().mean() + density = v_diff[v_diff != 0].abs().median() elif isinstance(density, Callable): density = density(vals) 
if isinstance(density, pd.Series): -- GitLab