Skip to content
Snippets Groups Projects
Commit 071aa617 authored by Peter Lünenschloß's avatar Peter Lünenschloß
Browse files

Adding plausibility checks to `flagUniLOF`

parent de1b0872
No related branches found
No related tags found
1 merge request!821Adding plausibility checks to `flagUniLOF`
......@@ -17,6 +17,7 @@ SPDX-License-Identifier: GPL-3.0-or-later
Iterable of Series and dict-like with series values.
- `plot`: added `yscope` keyword
- `setFlags`: function to replace `flagManual`
- `flagUniLOF`: added defaultly applied correction to mitigate phenomenon of overflagging at relatively steep data value slopes. (parameter `slope_correct`).
### Changed
### Removed
### Fixed
......
......@@ -179,6 +179,8 @@ class OutliersMixin:
p: int = 1,
density: Literal["auto"] | float = "auto",
fill_na: bool = True,
slope_correct: bool = True,
min_offset: float = None,
flag: float = BAD,
**kwargs,
) -> "SaQC":
......@@ -247,6 +249,15 @@ class OutliersMixin:
fill_na :
If True, NaNs in the data are filled with a linear interpolation.
slope_correct :
if True, a correction is applied, that removes outlier cluster that actually
just seem to be steep slopes
min_offset :
If set, only those outlier cluster will be flagged, that are preceeded and succeeeded
by sufficiently large value "jumps". Defaults to estimating the sufficient value jumps from
the median over the absolute step sizes between data points.
See Also
--------
:ref:`introduction to outlier detection with
......@@ -366,8 +377,47 @@ class OutliersMixin:
s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
else:
s_mask = s < -abs(thresh)
s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask
if slope_correct:
g_mask = s_mask.diff()
g_mask = g_mask.cumsum()
dat = self._data[field]
od_groups = dat.interpolate("linear").groupby(by=g_mask)
first_vals = od_groups.first()
last_vals = od_groups.last()
max_vals = od_groups.max()
min_vals = od_groups.min()
if min_offset is None:
if density == "auto":
d_diff = dat.diff()
eps = d_diff.abs().median()
if eps == 0:
eps = d_diff[d_diff != 0].abs().median()
else:
eps = density
eps = 3 * eps
else:
eps = min_offset
up_slopes = (min_vals + eps >= last_vals.shift(1)) & (
max_vals - eps <= first_vals.shift(-1)
)
down_slopes = (max_vals - eps <= last_vals.shift(1)) & (
min_vals + eps >= first_vals.shift(-1)
)
slopes = up_slopes | down_slopes
odd_return_pred = (max_vals > last_vals.shift(1)) & (
min_vals < last_vals.shift(1)
)
odd_return_succ = (max_vals > first_vals.shift(-1)) & (
min_vals < first_vals.shift(-1)
)
returns = odd_return_succ | odd_return_pred
corrections = returns | slopes
for s_id in corrections[corrections].index:
correct_idx = od_groups.get_group(s_id).index
s_mask[correct_idx] = False
qc._flags[s_mask, field] = flag
qc = qc.dropField(tmp_field)
return qc
......
......@@ -503,9 +503,10 @@ class ScoresMixin:
filled = pd.Series(False, index=vals.index)
if density == "auto":
density = vals.diff().abs().median()
v_diff = vals.diff()
density = v_diff.abs().median()
if density == 0:
density = vals.diff().abs().mean()
density = v_diff[v_diff != 0].abs().median()
elif isinstance(density, Callable):
density = density(vals)
if isinstance(density, pd.Series):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment