From 3a2c47962530afa7392101ad0f911433b749b4fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20L=C3=BCnenschlo=C3=9F?= <peter.luenenschloss@ufz.de>
Date: Tue, 21 Jan 2025 15:52:32 +0100
Subject: [PATCH] Univariate Outlier Probabilities and thresh automation for
 flagUniLOF

---
 CHANGELOG.md                          |   2 +
 saqc/funcs/outliers.py                | 116 +++++++++++++++++++++-----
 saqc/funcs/scores.py                  |  69 +++++++++++++--
 tests/funcs/test_outlier_detection.py |   9 +-
 tests/funcs/test_pattern_rec.py       |   1 -
 5 files changed, 165 insertions(+), 32 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a749a41c5..2625096d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,8 @@ SPDX-License-Identifier: GPL-3.0-or-later
 ## Unreleased
 [List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.6.0...develop)
 ### Added
+- `flagUniLOF`: added dispatch to Local Outlier Probability (*LoOP*) variant
+- `flagUniLOF`: made `thresh` optional
 - `flagPlateaus`: added function to search and flag anomalous value plateaus of certain temporal extension
 ### Changed
 ### Removed
diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py
index 91e222128..e4b8fb30b 100644
--- a/saqc/funcs/outliers.py
+++ b/saqc/funcs/outliers.py
@@ -10,13 +10,16 @@ from __future__ import annotations

 import uuid
 import warnings
-from typing import TYPE_CHECKING, Callable, Sequence, Tuple
+from collections import Counter
+from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple

 import numpy as np
 import numpy.polynomial.polynomial as poly
 import pandas as pd
 from outliers import smirnov_grubbs  # noqa, on pypi as outlier-utils
 from scipy.stats import median_abs_deviation
+from sklearn.cluster import k_means
+from sklearn.metrics import silhouette_samples, silhouette_score
 from typing_extensions import Literal

 from saqc import BAD, UNFLAGGED
@@ -39,6 +42,55 @@ if TYPE_CHECKING:
     from saqc import SaQC


+def _estimateCutoffProbability(probabilities: pd.Series, thresh: float | int):
+    """
+    Estimate a cutoff level for the outlier probabilities passed to `probabilities` with an iterative k-means approach.
+    """
+
+    if isinstance(thresh, float) and (thresh < 1):
+        sup_num = max(4, int(abs(thresh * probabilities.shape[0])))
+
+    else:
+        sup_num = max(4, int(abs(thresh)))
+
+    vals = (-probabilities).nlargest(sup_num).values
+    vals = np.unique(vals).reshape(-1, 1)
+    km = k_means(vals, 2)
+    thresh = 1.0
+    for check in range(3, int(vals.shape[0]) + 1):
+
+        _km = k_means(vals, check)
+
+        if (_km[-1] - km[-1]) > -1:
+
+            cluster_hierarchy = km[0].argsort(axis=0).squeeze()
+            top_cluster = cluster_hierarchy[-1]
+            sec_cluster = cluster_hierarchy[-2]
+            top_select = km[1] == top_cluster
+            sec_select = km[1] == sec_cluster
+            topsec_select = top_select | sec_select
+            topsec_labels = km[1][topsec_select]
+            if topsec_select.sum() == 2:
+                continue
+            silhouette = silhouette_samples(vals[topsec_select], topsec_labels)
+            if top_select.sum() == 1:
+                silhouette[top_select] = 1.0
+            if sec_select.sum() == 1:
+                silhouette[sec_select] = 1.0
+            top_silhouette = silhouette[km[1][topsec_select] == top_cluster]
+            sil_mean = silhouette.mean()
+            min_select = top_silhouette >= sil_mean
+
+            if (min_select.sum() > 0) and (sil_mean > 0.6):
+                od_samples = vals[km[1] == top_cluster][min_select]
+                thresh = od_samples.min()
+                if od_samples.shape[0] > 10:
+                    thresh = max(thresh, km[0].max() - 3 * od_samples.std())
+                break
+        km = _km
+    return thresh
+
+
 def _stray(partition, min_periods, iter_start, alpha):
     sample_size = len(partition)
     flag_index = pd.Index([])
@@ -202,7 +254,9 @@ class OutliersMixin:
         self: "SaQC",
         field: str,
         n: int = 20,
-        thresh: Literal["auto"] | float = 1.5,
+        thresh: float | Literal["auto"] | None = None,
+        probability: Optional[float] = None,
+        corruption: float | int | None = None,
         algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
         p: int = 1,
         density: Literal["auto"] | float = "auto",
@@ -241,19 +295,23 @@ class OutliersMixin:
            Higher values greatly increase numerical costs.

        thresh :
-           The threshold for flagging the calculated LOF. A LOF of around
-           ``1`` is considered normal and most likely corresponds to
-           inlier points. This parameter is considered the main calibration
-           parameter of the algorithm.
-
-           * The threshing defaults to ``1.5``, wich is the default value
-             found to be suitable in the literature.
-           * ``'auto'`` enables flagging the scores with a modified 3-sigma
-             rule, resulting in a thresh around ``4``, which usually
-             greatly mitigates overflagging compared to the literature
-             recommendation, but often is too high.
-           * sensitive range for the parameter may be ``[1,15]``, assuming
-             default settings for the other parameters.
+           Outlier factor cutoff.
+
+           * If given, compute local outlier factors and cut them off at value
+             level `thresh`, resulting in all values with factors above the
+             cutoff being flagged.
+
+       probability :
+           Outlier probability cutoff.
+
+           * If given, compute local outlier probabilities and cut them off at value
+             level `probability`, resulting in all values with probabilities above the cutoff being flagged.
+
+       corruption :
+           Portion of the data that is assumed anomalous. Can be given:
+
+           * as a fraction of the data (`float` in `[0, 1]`)
+           * as the total number of corrupted samples (`int > 1`)

        algorithm :
            Algorithm used for calculating the :py:attr:`n`-nearest neighbors
@@ -385,8 +443,12 @@ class OutliersMixin:
         :ref:`introduction to outlier detection with saqc <cookbooks/OutlierDetection:Outlier Detection>`
         """
         self._validateLOF(algorithm, n, p, density)
-        if thresh != "auto" and not isFloatLike(thresh):
-            raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
+
+        para_check = (thresh is None) + (probability is None) + (corruption is None)
+        if para_check < 2:
+            raise ValueError(
+                f'At most one of "thresh", "probability" and "corruption" may be given, got:\n thresh: {thresh}\n probability: {probability}\n corruption: {corruption}'
+            )

         tmp_field = str(uuid.uuid4())
         qc = self.assignUniLOF(
@@ -397,13 +459,23 @@ class OutliersMixin:
             field=tmp_field,
             n=n,
             algorithm=algorithm,
             p=p,
             density=density,
             fill_na=fill_na,
+            statistical_extent=0.997 if thresh is None else 1,
         )
         s = qc.data[tmp_field]
-        if thresh == "auto":
-            _s = pd.concat([s, (-s - 2)])
-            s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
-        else:
-            s_mask = s < -abs(thresh)
+        if para_check == 3:
+            corruption = int(-(len(self.data[field]) / max(n, 4)))
+        if thresh is not None:
+            if isinstance(thresh, str) and (thresh == "auto"):
+                _s = pd.concat([s, (-s - 2)])
+                s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
+            else:
+                s_mask = s < -abs(thresh)
+        elif corruption is not None:
+            cutoff = _estimateCutoffProbability(probabilities=s, thresh=corruption)
+            s_mask = s <= -abs(cutoff)
+        elif probability is not None:
+            s_mask = s <= -abs(probability)
+
         s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask

         if slope_correct:
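A minimal usage sketch of the reworked, mutually exclusive cutoff modes (not part of the patch; `data` is an assumed pandas DataFrame with a column "x", and all parameter values are illustrative):

    from saqc import SaQC

    qc = SaQC(data)
    qc = qc.flagUniLOF("x")                    # no cutoff given: a corruption total is
                                               # derived and a probability cutoff estimated
    qc = qc.flagUniLOF("x", thresh=1.5)        # classic LOF factor cutoff
    qc = qc.flagUniLOF("x", thresh="auto")     # modified 3-sigma rule on the LOF scores
    qc = qc.flagUniLOF("x", probability=0.99)  # LoOP probability cutoff
    qc = qc.flagUniLOF("x", corruption=0.01)   # assume ~1 % of the samples are anomalous
    # passing two cutoffs at once raises a ValueError, e.g.:
    # qc.flagUniLOF("x", thresh=1.5, probability=0.99)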
diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py
index 5c533d4dc..bd1fd5103 100644
--- a/saqc/funcs/scores.py
+++ b/saqc/funcs/scores.py
@@ -11,6 +11,8 @@ from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple

 import numpy as np
 import pandas as pd
+from scipy.spatial.distance import cdist
+from scipy.special import erf, erfinv
 from sklearn.neighbors import LocalOutlierFactor
 from typing_extensions import Literal

@@ -49,6 +51,43 @@ def _LOFApply(vals, n_neighbors, ret="scores", **kwargs):
     return labels == -1


+def _LOPApply(
+    vals,
+    n_neighbors,
+    ret="scores",
+    statistical_extent=0.997,
+    use_centroids=False,
+    **kwargs,
+):
+    """
+    Apply the Local Outlier Probability (LoOP) algorithm, as presented in
+
+    Kriegel, H.-P.; Kröger, P.; Schubert, E.; Zimek, A. LoOP: local outlier probabilities. Proceedings
+    of the 18th ACM conference on Information and knowledge management 2009, 1649–1652.
+    """
+    d, members = kNN(vals, n_neighbors, **kwargs)
+    members = np.concatenate([np.array([range(vals.shape[0])]).T, members], axis=1)
+    d = np.concatenate([np.array([[0] * vals.shape[0]]).T, d], axis=1)
+    if use_centroids:
+        centroids = vals[members].sum(axis=1) / (n_neighbors + 1)
+
+        for k in range(vals.shape[0]):
+            d[k] = cdist(centroids[k : k + 1], vals[members[k]])
+
+    standard_dist = np.sqrt(np.sum(d**2, axis=1) / (n_neighbors + 1))
+    lambda_val = np.sqrt(2) * erfinv(statistical_extent)
+    p_dist = lambda_val * standard_dist
+    exp_p_dist = p_dist[members].sum(axis=1) / (n_neighbors + 1)
+    plof = (p_dist / exp_p_dist) - 1
+    nplof = np.sqrt(np.sum(plof**2) / len(plof)) * lambda_val * np.sqrt(2)
+    scores = erf(plof / nplof)
+    scores = np.where(scores > 0, scores, 0)
+    if ret == "scores":
+        return -scores
+    else:
+        return scores == 0
+
+
 def _groupedScoring(
     val_frame: pd.DataFrame,
     n: int = 20,
@@ -436,10 +475,11 @@
         p: int = 1,
         density: Literal["auto"] | float = "auto",
         fill_na: bool = True,
+        statistical_extent: float = 1,
         **kwargs,
     ) -> "SaQC":
         """
-        Assign "univariate" Local Outlier Factor (LOF).
+        Assign "univariate" Local Outlier Factor (LOF) or "univariate" Local Outlier Probability (LoOP) scores.

         The Function is a wrapper around a usual LOF implementation, aiming for an easy to use, parameter
         minimal outlier scoring function for singleton variables, that does not necessitate prior modelling of the variable.
@@ -482,11 +522,16 @@

         Notes
         -----
+        LoOP: Kriegel, H.-P.; Kröger, P.; Schubert, E.; Zimek, A. LoOP: local outlier probabilities. Proceedings
+        of the 18th ACM conference on Information and knowledge management 2009, 1649–1652.
+
         Algorithm steps for uniLOF flagging of variable `x`:

         1. The temporal density `dt(x)` is calculated according to the `density` parameter.
-        2. LOF scores `LOF(x)` are calculated for the concatenation [`x`, `dt(x)`]
-        3. `x` is flagged where `LOF(x)` exceeds the threshold determined by the parameter `thresh`.
+        2. LOF (or LoOP) scores `L(x)` are calculated for the concatenation [`x`, `dt(x)`].
+        3. `x` is flagged where `L(x)` exceeds the threshold determined by the parameter `thresh`.
+
+
         Examples
         --------

@@ -506,13 +551,13 @@
         filled = pd.Series(False, index=vals.index)

         if density == "auto":
-            v_diff = vals.diff()
+            v_diff = (vals**p).diff()
             density = v_diff.abs().median()
             if density == 0:
                 density = v_diff[v_diff != 0].abs().median()
         elif isinstance(density, Callable):
             density = density(vals)
-        if isinstance(density, pd.Series):
+        elif isinstance(density, pd.Series):
             density = density.values

         d_var = pd.Series(np.arange(len(vals)) * density, index=vals.index)
@@ -537,7 +582,19 @@
         )

         LOF_vars = np.array([vals_extended, d_extended]).T
-        scores = _LOFApply(LOF_vars, n, p=p, algorithm=algorithm, metric="minkowski")
+        if statistical_extent == 1:
+            scores = _LOFApply(
+                LOF_vars, n, p=p, algorithm=algorithm, metric="minkowski"
+            )
+        else:
+            scores = _LOPApply(
+                LOF_vars,
+                n,
+                p=p,
+                statistical_extent=statistical_extent,
+                algorithm=algorithm,
+                metric="minkowski",
+            )
         scores = scores[n:-n]
         score_ser = pd.Series(scores, index=na_bool_ser.index[~na_bool_ser.values])
         score_ser = score_ser.reindex(na_bool_ser.index)
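The LoOP math in `_LOPApply` is compact; the following standalone sketch reproduces it under stated assumptions: sklearn's `NearestNeighbors` stands in for saqc's internal `kNN` helper, the `use_centroids` branch is omitted, and `loop_scores` is an illustrative name that returns the positive LoOP values (the patch negates them to match saqc's score convention):

    import numpy as np
    from scipy.special import erf, erfinv
    from sklearn.neighbors import NearestNeighbors

    def loop_scores(vals, n_neighbors=20, extent=0.997):
        # neighbor distances; querying the training set returns each point
        # itself at distance 0, mirroring the self-prepending in _LOPApply
        d, members = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(vals).kneighbors(vals)
        sigma = np.sqrt((d**2).sum(axis=1) / (n_neighbors + 1))  # probabilistic set distance
        lam = np.sqrt(2) * erfinv(extent)
        p_dist = lam * sigma
        exp_p_dist = p_dist[members].mean(axis=1)  # expectation over each neighborhood
        plof = p_dist / exp_p_dist - 1
        nplof = np.sqrt((plof**2).mean()) * lam * np.sqrt(2)  # aggregate normalization
        return np.clip(erf(plof / nplof), 0, None)  # LoOP values in [0, 1]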
diff --git a/tests/funcs/test_outlier_detection.py b/tests/funcs/test_outlier_detection.py
index 61953bcd0..320d8585c 100644
--- a/tests/funcs/test_outlier_detection.py
+++ b/tests/funcs/test_outlier_detection.py
@@ -205,13 +205,16 @@ def test_flagZScoresMV():
     assert (qc.flags.to_pandas().iloc[[40, 80], 0] > 0).all()


+@pytest.mark.filterwarnings("ignore:Number of distinct clusters")
 @pytest.mark.parametrize("n", [1, 10])
 @pytest.mark.parametrize("p", [1, 2])
-@pytest.mark.parametrize("thresh", ["auto", 2])
-def test_flagUniLOF(spiky_data, n, p, thresh):
+@pytest.mark.parametrize(
+    "cutoff", [{}, {"probability": 0.99}, {"thresh": "auto"}, {"thresh": 2}]
+)
+def test_flagUniLOF(spiky_data, n, p, cutoff):
     data = spiky_data[0]
     field, *_ = data.columns
-    qc = SaQC(data).flagUniLOF(field, n=n, p=p, thresh=thresh)
+    qc = SaQC(data).flagUniLOF(field, n=n, p=p, **cutoff)
     flag_result = qc.flags[field]
     test_sum = (flag_result.iloc[spiky_data[1]] == BAD).sum()
     assert test_sum == len(spiky_data[1])
diff --git a/tests/funcs/test_pattern_rec.py b/tests/funcs/test_pattern_rec.py
index a9ceb667d..ec9866b7c 100644
--- a/tests/funcs/test_pattern_rec.py
+++ b/tests/funcs/test_pattern_rec.py
@@ -35,7 +35,6 @@ def test_flagPlateau():
     dat = pd.read_csv(path, parse_dates=[0], index_col=0)
     dat = dat.interpolate("linear")
     dat = dat.ffill().bfill()
-
     qc = SaQC(dat)
     qc = qc.flagPlateau(
         "base3", min_length="10min", max_length="7d", granularity="20min"
-- 
GitLab
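As a closing illustration of the cutoff idea behind `_estimateCutoffProbability`, a toy example on synthetic numbers, simplified to a single 2-means split (the actual function additionally iterates cluster counts and applies silhouette checks):

    import numpy as np
    from sklearn.cluster import k_means

    rng = np.random.default_rng(0)
    # 95 inlier probabilities plus 5 clearly separated outlier probabilities
    probs = np.r_[rng.uniform(0.0, 0.6, 95), rng.uniform(0.95, 1.0, 5)]
    vals = np.unique(np.sort(probs)[-20:]).reshape(-1, 1)  # largest candidates only
    centers, labels, _ = k_means(vals, 2, n_init=10)
    cutoff = vals[labels == centers.argmax()].min()        # floor of the top cluster
    print(cutoff)  # ~0.95: probabilities at or above this level would be flagged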