From 3a2c47962530afa7392101ad0f911433b749b4fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Peter=20L=C3=BCnenschlo=C3=9F?= <peter.luenenschloss@ufz.de>
Date: Tue, 21 Jan 2025 15:52:32 +0100
Subject: [PATCH] Univariate Outlier Probabilities and thresh automation for
 flagUniLOF

---
 CHANGELOG.md                          |   3 +
 saqc/funcs/outliers.py                | 116 +++++++++++++++++++++-----
 saqc/funcs/scores.py                  |  69 +++++++++++++--
 tests/funcs/test_outlier_detection.py |   9 +-
 tests/funcs/test_pattern_rec.py       |   1 -
 5 files changed, 166 insertions(+), 32 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a749a41c5..2625096d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@ SPDX-License-Identifier: GPL-3.0-or-later
 ## Unreleased
 [List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.6.0...develop)
 ### Added
+- `flagUniLOF`: added dispatch to Local Outlier Probability (*LoOP*) variant
+- `flagUniLOF`: added `probability` and `corruption` cutoff parameters
+- `flagUniLOF`: made `thresh` optional
 - `flagPlateaus`: added function to search and flag anomalous value plateaus of certain temporal extension
 ### Changed
 ### Removed
diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py
index 91e222128..e4b8fb30b 100644
--- a/saqc/funcs/outliers.py
+++ b/saqc/funcs/outliers.py
@@ -10,13 +10,16 @@ from __future__ import annotations
 
 import uuid
 import warnings
-from typing import TYPE_CHECKING, Callable, Sequence, Tuple
+from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple
 
 import numpy as np
 import numpy.polynomial.polynomial as poly
 import pandas as pd
 from outliers import smirnov_grubbs  # noqa, on pypi as outlier-utils
 from scipy.stats import median_abs_deviation
+from sklearn.cluster import k_means
+from sklearn.metrics import silhouette_samples
 from typing_extensions import Literal
 
 from saqc import BAD, UNFLAGGED
@@ -39,6 +42,55 @@ if TYPE_CHECKING:
     from saqc import SaQC
 
 
+def _estimateCutoffProbability(probabilities: pd.Series, thresh: float | int):
+    """
+    Estimate a cutoff level for the outlier probabilities in `probabilities`,
+    using an iterative k-means approach.
+    """
+
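+    # number of top probabilities taken as cutoff candidates: a float below 1
+    # is read as a fraction of the data, anything else as an absolute count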
+    if isinstance(thresh, float) and (thresh < 1):
+        sup_num = max(4, int(abs(thresh * probabilities.shape[0])))
+    else:
+        sup_num = max(4, int(abs(thresh)))
+
+    vals = (-probabilities).nlargest(sup_num).values
+    vals = np.unique(vals).reshape(-1, 1)
+    km = k_means(vals, 2)
+    thresh = 1.0
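+    # the fallback cutoff of 1.0 flags (almost) nothing, since probabilities
+    # lie in [0, 1]; it is only replaced once a well separated top cluster is
+    # found: grow the cluster count until the k-means inertia stops improving,
+    # then inspect the two top-valued clusters of the previous partition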
+    for check in range(3, int(vals.shape[0]) + 1):
+        _km = k_means(vals, check)
+        if (_km[-1] - km[-1]) > -1:
+            cluster_hierarchy = km[0].argsort(axis=0).squeeze()
+            top_cluster = cluster_hierarchy[-1]
+            sec_cluster = cluster_hierarchy[-2]
+            top_select = km[1] == top_cluster
+            sec_select = km[1] == sec_cluster
+            topsec_select = top_select | sec_select
+            topsec_labels = km[1][topsec_select]
+            if topsec_select.sum() == 2:
+                continue
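+            # silhouettes need more than one sample per cluster to be
+            # meaningful, hence the guard above; singleton clusters are
+            # assigned a perfect silhouette of 1 below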
+            silhouette = silhouette_samples(vals[topsec_select], topsec_labels)
+            if top_select.sum() == 1:
+                silhouette[topsec_labels == top_cluster] = 1.0
+            if sec_select.sum() == 1:
+                silhouette[topsec_labels == sec_cluster] = 1.0
+            top_silhouette = silhouette[topsec_labels == top_cluster]
+            sil_mean = silhouette.mean()
+            min_select = top_silhouette >= sil_mean
+
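+            # accept a cutoff only if the top cluster separates well from the
+            # rest (mean silhouette > 0.6); for larger outlier clusters, bound
+            # the cutoff from below by a 3-sigma rule around the cluster top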
+            if (min_select.sum() > 0) and (sil_mean > 0.6):
+                od_samples = vals[km[1] == top_cluster][min_select]
+                thresh = od_samples.min()
+                if od_samples.shape[0] > 10:
+                    thresh = max(thresh, km[0].max() - 3 * od_samples.std())
+                break
+        km = _km
+    return thresh
+
+
 def _stray(partition, min_periods, iter_start, alpha):
     sample_size = len(partition)
     flag_index = pd.Index([])
@@ -202,7 +254,9 @@ class OutliersMixin:
         self: "SaQC",
         field: str,
         n: int = 20,
-        thresh: Literal["auto"] | float = 1.5,
+        thresh: float | Literal["auto"] | None = None,
+        probability: Optional[float] = None,
+        corruption: float | int | None = None,
         algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
         p: int = 1,
         density: Literal["auto"] | float = "auto",
@@ -241,19 +295,23 @@ class OutliersMixin:
               Higher values greatly increase numerical costs.
 
         thresh :
-            The threshold for flagging the calculated LOF. A LOF of around
-            ``1`` is considered normal and most likely corresponds to
-            inlier points. This parameter is considered the main calibration
-            parameter of the algorithm.
-
-            * The threshing defaults to ``1.5``, wich is the default value
-              found to be suitable in the literature.
-            * ``'auto'`` enables flagging the scores with a modified 3-sigma
-              rule, resulting in a thresh around ``4``, which usually
-              greatly mitigates overflagging compared to the literature
-              recommendation, but often is too high.
-            * sensitive range for the parameter may be ``[1,15]``, assuming
-              default settings for the other parameters.
+            Outlier factor cutoff.
+
+            * If given, local outlier factors are computed and cut off at value
+              level `thresh`: all values with a factor above the cutoff get
+              flagged.
+            * ``'auto'`` flags the factors with a modified 3-sigma rule,
+              resulting in a cutoff around ``4``, which usually mitigates
+              overflagging, but often is too high.
+            * A sensitive range for the parameter is ``[1, 15]``, assuming
+              default settings for the other parameters.
+
+        probability :
+            Outlier probability cutoff.
+
+            * If given, local outlier probabilities are computed and cut off
+              at value level `probability`: all values with a probability
+              above the cutoff get flagged.
+
+        corruption :
+            Expected portion of anomalous data. Can be given:
+
+            * as a fraction of the data (``float`` in ``[0, 1]``)
+            * as the total number of corrupted samples (``int > 1``)
+
+            At most one of `thresh`, `probability` and `corruption` may be
+            given; if none is given, a cutoff probability is estimated from
+            the data.
 
         algorithm :
             Algorithm used for calculating the :py:attr:`n`-nearest neighbors
@@ -385,8 +443,12 @@ class OutliersMixin:
         :ref:`introduction to outlier detection with saqc <cookbooks/OutlierDetection:Outlier Detection>`
         """
         self._validateLOF(algorithm, n, p, density)
-        if thresh != "auto" and not isFloatLike(thresh):
-            raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
+
+        para_check = (thresh is None) + (probability is None) + (corruption is None)
+        if para_check < 2:
+            raise ValueError(
+                'At most one of "thresh", "probability" and "corruption" may be given, got:'
+                f"\n thresh: {thresh}\n probability: {probability}\n corruption: {corruption}"
+            )
 
         tmp_field = str(uuid.uuid4())
         qc = self.assignUniLOF(
@@ -397,13 +459,23 @@ class OutliersMixin:
             p=p,
             density=density,
             fill_na=fill_na,
+            statistical_extent=0.997 if thresh is None else 1,
         )
         s = qc.data[tmp_field]
-        if thresh == "auto":
-            _s = pd.concat([s, (-s - 2)])
-            s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
-        else:
-            s_mask = s < -abs(thresh)
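+        # if no cutoff parameter was given at all, guess the corruption
+        # heuristically: roughly one outlier candidate per n samples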
+        if para_check == 3:
+            corruption = int(len(self.data[field]) / max(n, 4))
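+        # scores carry a negative sign, so every cutoff is applied to the
+        # negated value level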
+        if thresh is not None:
+            if thresh == "auto":
+                _s = pd.concat([s, (-s - 2)])
+                s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
+            else:
+                s_mask = s < -abs(thresh)
+        elif corruption is not None:
+            cutoff = _estimateCutoffProbability(probabilities=s, thresh=corruption)
+            s_mask = s <= -abs(cutoff)
+        elif probability is not None:
+            s_mask = s <= -abs(probability)
+
         s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask
 
         if slope_correct:
diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py
index 5c533d4dc..bd1fd5103 100644
--- a/saqc/funcs/scores.py
+++ b/saqc/funcs/scores.py
@@ -11,6 +11,8 @@ from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple
 
 import numpy as np
 import pandas as pd
+from scipy.spatial.distance import cdist
+from scipy.special import erf, erfinv
 from sklearn.neighbors import LocalOutlierFactor
 from typing_extensions import Literal
 
@@ -49,6 +51,43 @@ def _LOFApply(vals, n_neighbors, ret="scores", **kwargs):
         return labels == -1
 
 
+def _LOPApply(
+    vals,
+    n_neighbors,
+    ret="scores",
+    statistical_extent=0.997,
+    use_centroids=False,
+    **kwargs,
+):
+    """
+    Apply Local Outlier Probability algorithm, as presented in
+
+    Kriegel, H.-P.; Kröger, P.; Schubert, E.; Zimek, A. LoOP: local outlier probabilities. Proceedings
+    of the 18th ACM conference on Information and knowledge management 2009, 1649–1652.
+    """
+    d, members = kNN(vals, n_neighbors, **kwargs)
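+    # prepend each point itself, at distance 0, to its own neighborhood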
+    members = np.concatenate([np.array([range(vals.shape[0])]).T, members], axis=1)
+    d = np.concatenate([np.array([[0] * vals.shape[0]]).T, d], axis=1)
+    if use_centroids:
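+        # optionally measure neighborhood distances from the neighborhood
+        # centroid instead of from the point itself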
+        centroids = vals[members].sum(axis=1) / (n_neighbors + 1)
+
+        for k in range(vals.shape[0]):
+            d[k] = cdist(centroids[k : k + 1], vals[members[k]])
+
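+    # probabilistic set distance (Kriegel et al. 2009): the lambda-scaled
+    # standard distance of each point to its neighborhood, with lambda derived
+    # from the assumed statistical extent (0.997 corresponds to three sigma)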
+    standard_dist = np.sqrt(np.sum(d**2, axis=1) / (n_neighbors + 1))
+    lambda_val = np.sqrt(2) * erfinv(statistical_extent)
+    p_dist = lambda_val * standard_dist
+    exp_p_dist = p_dist[members].sum(axis=1) / (n_neighbors + 1)
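+    # PLOF: deviation of a point's probabilistic distance from the expectation
+    # over its neighborhood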
+    plof = (p_dist / exp_p_dist) - 1
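+    # normalize PLOF by its lambda-scaled quadratic mean and map it through
+    # the Gaussian error function; negative scores are clipped to 0 below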
+    nplof = np.sqrt(np.sum(plof**2) / len(plof)) * lambda_val * np.sqrt(2)
+    scores = erf(plof / nplof)
+    scores = np.where(scores > 0, scores, 0)
+    if ret == "scores":
+        return -scores
+    else:
+        return scores == 0
+
+
 def _groupedScoring(
     val_frame: pd.DataFrame,
     n: int = 20,
@@ -436,10 +475,11 @@ class ScoresMixin:
         p: int = 1,
         density: Literal["auto"] | float = "auto",
         fill_na: bool = True,
+        statistical_extent: float = 1,
         **kwargs,
     ) -> "SaQC":
         """
-        Assign "univariate" Local Outlier Factor (LOF).
+        Assign "univariate" Local Outlier Factor (LOF) or "inivariate" Local Outlier Probability (LOP)
 
         The Function is a wrapper around a usual LOF implementation, aiming for an easy to use, parameter minimal
         outlier scoring function for singleton variables, that does not necessitate prior modelling of the variable.
@@ -482,11 +522,16 @@ class ScoresMixin:
 
         Notes
         -----
+        LoOP: Kriegel, H.-P.; Kröger, P.; Schubert, E.; Zimek, A. LoOP: local outlier probabilities.
+          Proceedings of the 18th ACM conference on Information and knowledge management 2009, 1649–1652.
+
         Algorithm steps for uniLOF flagging of variable `x`:
 
         1. The temporal density `dt(x)` is calculated according to the `density` parameter.
-        2. LOF scores `LOF(x)` are calculated for the concatenation [`x`, `dt(x)`]
-        3. `x` is flagged where `LOF(x)` exceeds the threshold determined by the parameter `thresh`.
+        2. LOF (or LoOP) scores `L(x)` are calculated for the concatenation [`x`, `dt(x)`]
+        3. `x` is flagged where `L(x)` exceeds the cutoff determined by the parameters `thresh`, `probability` or `corruption`.
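+
+        A minimal sketch of requesting LoOP scores instead of LOF scores for a
+        hypothetical variable ``"data"`` (any ``statistical_extent`` other than
+        ``1`` dispatches to the LoOP variant; ``0.997`` roughly corresponds to
+        a three-sigma extent)::
+
+            qc = qc.assignUniLOF(field="data", n=20, statistical_extent=0.997)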
 
         Examples
         --------
@@ -506,13 +551,13 @@ class ScoresMixin:
             filled = pd.Series(False, index=vals.index)
 
         if density == "auto":
-            v_diff = vals.diff()
+            v_diff = (vals**p).diff()
             density = v_diff.abs().median()
             if density == 0:
                 density = v_diff[v_diff != 0].abs().median()
         elif isinstance(density, Callable):
             density = density(vals)
-        if isinstance(density, pd.Series):
+        elif isinstance(density, pd.Series):
             density = density.values
 
         d_var = pd.Series(np.arange(len(vals)) * density, index=vals.index)
@@ -537,7 +582,19 @@ class ScoresMixin:
         )
 
         LOF_vars = np.array([vals_extended, d_extended]).T
-        scores = _LOFApply(LOF_vars, n, p=p, algorithm=algorithm, metric="minkowski")
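+        # statistical_extent == 1 keeps the plain LOF scores; any other value
+        # dispatches to the LoOP variant with the given extent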
+        if statistical_extent == 1:
+            scores = _LOFApply(
+                LOF_vars, n, p=p, algorithm=algorithm, metric="minkowski"
+            )
+        else:
+            scores = _LOPApply(
+                LOF_vars,
+                n,
+                p=p,
+                statistical_extent=statistical_extent,
+                algorithm=algorithm,
+                metric="minkowski",
+            )
         scores = scores[n:-n]
         score_ser = pd.Series(scores, index=na_bool_ser.index[~na_bool_ser.values])
         score_ser = score_ser.reindex(na_bool_ser.index)
diff --git a/tests/funcs/test_outlier_detection.py b/tests/funcs/test_outlier_detection.py
index 61953bcd0..320d8585c 100644
--- a/tests/funcs/test_outlier_detection.py
+++ b/tests/funcs/test_outlier_detection.py
@@ -205,13 +205,16 @@ def test_flagZScoresMV():
     assert (qc.flags.to_pandas().iloc[[40, 80], 0] > 0).all()
 
 
+@pytest.mark.filterwarnings("ignore:Number of distinct clusters")
 @pytest.mark.parametrize("n", [1, 10])
 @pytest.mark.parametrize("p", [1, 2])
-@pytest.mark.parametrize("thresh", ["auto", 2])
-def test_flagUniLOF(spiky_data, n, p, thresh):
+@pytest.mark.parametrize(
+    "cutoff", [{}, {"probability": 0.99}, {"thresh": "auto"}, {"thresh": 2}]
+)
+def test_flagUniLOF(spiky_data, n, p, cutoff):
     data = spiky_data[0]
     field, *_ = data.columns
-    qc = SaQC(data).flagUniLOF(field, n=n, p=p, thresh=thresh)
+    qc = SaQC(data).flagUniLOF(field, n=n, p=p, **cutoff)
     flag_result = qc.flags[field]
     test_sum = (flag_result.iloc[spiky_data[1]] == BAD).sum()
     assert test_sum == len(spiky_data[1])
diff --git a/tests/funcs/test_pattern_rec.py b/tests/funcs/test_pattern_rec.py
index a9ceb667d..ec9866b7c 100644
--- a/tests/funcs/test_pattern_rec.py
+++ b/tests/funcs/test_pattern_rec.py
@@ -35,7 +35,6 @@ def test_flagPlateau():
     dat = pd.read_csv(path, parse_dates=[0], index_col=0)
     dat = dat.interpolate("linear")
     dat = dat.ffill().bfill()
-
     qc = SaQC(dat)
     qc = qc.flagPlateau(
         "base3", min_length="10min", max_length="7d", granularity="20min"
-- 
GitLab