Commit 7bfe7237 authored by Peter Lünenschloß

Merge branch 'prob_lof' into 'develop'

Univariate Outlier Probabilities and thresh automation for flagUniLOF

See merge request !864
parents 2827a91b 3a2c4796
Pipeline #265877 passed with stages in 14 minutes and 48 seconds
@@ -8,6 +8,9 @@ SPDX-License-Identifier: GPL-3.0-or-later
## Unreleased
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.6.0...develop)
### Added
- `flagUniLOF`: added dispatch to Local Outlier Probability (*LoOP*) variant
- `flagUniLOF`: made `thresh` optional
- `flagPlateaus`: added function to search and flag anomalous value plateaus of a certain temporal extension
### Changed
### Removed
......
@@ -10,13 +10,16 @@ from __future__ import annotations
import uuid
import warnings
from collections import Counter
from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple
import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd
from outliers import smirnov_grubbs # noqa, on pypi as outlier-utils
from scipy.stats import median_abs_deviation
from sklearn.cluster import k_means
from sklearn.metrics import silhouette_samples, silhouette_score
from typing_extensions import Literal
from saqc import BAD, UNFLAGGED
@@ -39,6 +42,55 @@ if TYPE_CHECKING:
    from saqc import SaQC
def _estimateCutoffProbability(probabilities: np.ndarray, thresh: float):
    """
    Estimate a cutoff level for the outlier probabilities passed to `probabilities`
    with an iterative k-means approach.
    """
    # Interpret `thresh` either as a fraction of the sample (float < 1) or as an
    # absolute number of suspect samples; inspect at least 4 candidates.
    if isinstance(thresh, float) and (thresh < 1):
        sup_num = max(4, int(abs(thresh * probabilities.shape[0])))
    else:
        sup_num = max(4, int(abs(thresh)))
    vals = (-probabilities).nlargest(sup_num).values
    vals = np.unique(vals).reshape(-1, 1)
    km = k_means(vals, 2)
    thresh = 1.0
    # Increase the cluster count until the inertia stops improving, then split the
    # candidates into a "top" (outlier) cluster and a "second" cluster.
    for check in range(3, int(vals.shape[0]) + 1):
        _km = k_means(vals, check)
        if (_km[-1] - km[-1]) > -1:
            cluster_hierarchy = km[0].argsort(axis=0).squeeze()
            top_cluster = cluster_hierarchy[-1]
            sec_cluster = cluster_hierarchy[-2]
            top_select = km[1] == top_cluster
            sec_select = km[1] == sec_cluster
            topsec_select = top_select | sec_select
            topsec_labels = km[1][topsec_select]
            if topsec_select.sum() == 2:
                continue
            # Silhouette scores measure how crisply the two clusters separate.
            silhouette = silhouette_samples(vals[topsec_select], topsec_labels)
            if top_select.sum() == 1:
                silhouette[top_select] = 1.0
            if sec_select.sum() == 1:
                silhouette[sec_select] = 1.0
            top_silhouette = silhouette[km[1][topsec_select] == top_cluster]
            sil_mean = silhouette.mean()
            min_select = top_silhouette >= sil_mean
            # Accept the split only if the separation is good enough on average.
            if (min_select.sum() > 0) and (sil_mean > 0.6):
                od_samples = vals[km[1] == top_cluster][min_select]
                thresh = od_samples.min()
                if od_samples.shape[0] > 10:
                    thresh = max(thresh, km[0].max() - 3 * od_samples.std())
                break
        km = _km
    return thresh
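To illustrate what this helper does, here is a minimal, self-contained sketch. The synthetic scores, the corruption guess of 10, and the direct call to the private helper are illustrative assumptions, not part of the merge:

import numpy as np
import pandas as pd

# Synthetic outlier probabilities: many inliers near 0, a few anomalies near 1.
rng = np.random.default_rng(42)
scores = np.r_[rng.uniform(0.0, 0.2, 200), rng.uniform(0.85, 1.0, 6)]

# flagUniLOF keeps scores negated internally and the helper re-negates them,
# so we pass the negated series here as well.
s = pd.Series(-scores)
cutoff = _estimateCutoffProbability(probabilities=s, thresh=10)
print(f"estimated cutoff: {cutoff:.3f}")  # the smallest score in the top cluster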
def _stray(partition, min_periods, iter_start, alpha):
    sample_size = len(partition)
    flag_index = pd.Index([])
@@ -202,7 +254,9 @@ class OutliersMixin:
        self: "SaQC",
        field: str,
        n: int = 20,
        thresh: float | Literal["auto"] | None = None,
        probability: Optional[float] = None,
        corruption: float | int | None = None,
        algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
        p: int = 1,
        density: Literal["auto"] | float = "auto",
@@ -241,19 +295,23 @@
            Higher values greatly increase numerical costs.

        thresh :
            Outlier factor cutoff.

            * If given, compute local outlier factors and cut them off at value
              level `thresh`, so that all values with a factor above the cutoff
              get flagged.

        probability :
            Outlier probability cutoff.

            * If given, compute local outlier probabilities and cut them off at
              value level `probability`, so that all values with a probability
              above the cutoff get flagged.

        corruption :
            Portion of the data that is anomalous. Can be given:

            * as a portion of the data (`float` in `[0, 1]`)
            * as the total number of corrupted samples (`int > 1`)

        algorithm :
            Algorithm used for calculating the :py:attr:`n`-nearest neighbors
@@ -385,8 +443,12 @@
        :ref:`introduction to outlier detection with saqc <cookbooks/OutlierDetection:Outlier Detection>`
        """
        self._validateLOF(algorithm, n, p, density)
        # At most one of the three cutoff parameters may be given.
        para_check = (thresh is None) + (probability is None) + (corruption is None)
        if para_check < 2:
            raise ValueError(
                f'Only one (or none) of "thresh", "probability", "corruption" may be '
                f"given, got:\n thresh: {thresh}\n probability: {probability}\n "
                f"corruption: {corruption}"
            )
        tmp_field = str(uuid.uuid4())
        qc = self.assignUniLOF(
@@ -397,13 +459,23 @@
            p=p,
            density=density,
            fill_na=fill_na,
            # Probability scoring is needed whenever `thresh` is not used.
            statistical_extent=0.997 if thresh is None else 1,
        )
        s = qc.data[tmp_field]

        if para_check == 3:
            # No cutoff given at all: derive a default corruption guess from the
            # data size and the neighborhood size.
            corruption = int(-(len(self.data[field]) / max(n, 4)))
        if thresh is not None:
            if isinstance(thresh, str) and (thresh == "auto"):
                _s = pd.concat([s, (-s - 2)])
                s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
            else:
                s_mask = s < -abs(thresh)
        elif corruption is not None:
            cutoff = _estimateCutoffProbability(probabilities=s, thresh=corruption)
            s_mask = s <= -abs(cutoff)
        elif probability is not None:
            s_mask = s <= -abs(probability)

        s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask

        if slope_correct:
......
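For orientation, the three cutoff modes of the reworked `flagUniLOF` could be exercised like this. The synthetic data and the concrete parameter values are assumptions for illustration:

import numpy as np
import pandas as pd
from saqc import SaQC

# A smooth series with three injected spikes (illustrative data).
idx = pd.date_range("2024-01-01", periods=500, freq="10min")
x = pd.Series(np.sin(np.linspace(0, 20, 500)), index=idx, name="x")
x.iloc[[50, 250, 400]] += 10

qc = SaQC(pd.DataFrame(x))
qc = qc.flagUniLOF("x", thresh=1.5)        # classic LOF cutoff
qc = qc.flagUniLOF("x", probability=0.99)  # new: LoOP probability cutoff
qc = qc.flagUniLOF("x", corruption=3)      # new: cutoff estimated from a corruption guess
qc = qc.flagUniLOF("x")                    # new: fully automatic default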
@@ -11,6 +11,8 @@ from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from scipy.special import erf, erfinv
from sklearn.neighbors import LocalOutlierFactor
from typing_extensions import Literal
@@ -49,6 +51,43 @@ def _LOFApply(vals, n_neighbors, ret="scores", **kwargs):
    return labels == -1
def _LOPApply(
    vals,
    n_neighbors,
    ret="scores",
    statistical_extent=0.997,
    use_centroids=False,
    **kwargs,
):
    """
    Apply the Local Outlier Probability algorithm, as presented in:

    Kriegel, H.-P.; Kröger, P.; Schubert, E.; Zimek, A. LoOP: local outlier probabilities.
    Proceedings of the 18th ACM Conference on Information and Knowledge Management 2009, 1649-1652.
    """
    # Prepend every point to its own neighborhood (with distance 0).
    d, members = kNN(vals, n_neighbors, **kwargs)
    members = np.concatenate([np.array([range(vals.shape[0])]).T, members], axis=1)
    d = np.concatenate([np.array([[0] * vals.shape[0]]).T, d], axis=1)
    if use_centroids:
        centroids = vals[members].sum(axis=1) / (n_neighbors + 1)
        for k in range(vals.shape[0]):
            d[k] = cdist(centroids[k : k + 1], vals[members[k]])
    # Probabilistic set distance of each neighborhood.
    standard_dist = np.sqrt(np.sum(d**2, axis=1) / (n_neighbors + 1))
    lambda_val = np.sqrt(2) * erfinv(statistical_extent)
    p_dist = lambda_val * standard_dist
    # Probabilistic local outlier factor: deviation of a point's set distance
    # from the expected set distance of its neighborhood.
    exp_p_dist = p_dist[members].sum(axis=1) / (n_neighbors + 1)
    plof = (p_dist / exp_p_dist) - 1
    # Normalize and map to a probability via the Gaussian error function.
    nplof = np.sqrt(np.sum(plof**2) / len(plof)) * lambda_val * np.sqrt(2)
    scores = erf(plof / nplof)
    scores = np.where(scores > 0, scores, 0)
    if ret == "scores":
        return -scores
    else:
        return scores == 0
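The same arithmetic on a toy 1-D dataset, with sklearn's `NearestNeighbors` standing in for the internal `kNN` helper (an assumed substitution; the data and `k` are illustrative):

import numpy as np
from scipy.special import erf, erfinv
from sklearn.neighbors import NearestNeighbors

vals = np.array([[1.0], [1.1], [0.9], [1.05], [5.0]])  # one obvious outlier
k = 2
# n_neighbors=k+1 includes each point itself, mirroring the concatenation above.
d, members = NearestNeighbors(n_neighbors=k + 1).fit(vals).kneighbors(vals)

lam = np.sqrt(2) * erfinv(0.997)                      # statistical extent ~ 3 sigma
p_dist = lam * np.sqrt((d**2).sum(axis=1) / (k + 1))  # probabilistic set distance
plof = p_dist / (p_dist[members].sum(axis=1) / (k + 1)) - 1
nplof = np.sqrt((plof**2).mean()) * lam * np.sqrt(2)
loop = np.clip(erf(plof / nplof), 0, None)            # probabilities in [0, 1]
print(loop.round(2))  # the point at 5.0 gets by far the largest probability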
def _groupedScoring(
    val_frame: pd.DataFrame,
    n: int = 20,
@@ -436,10 +475,11 @@
        p: int = 1,
        density: Literal["auto"] | float = "auto",
        fill_na: bool = True,
        statistical_extent: float = 1,
        **kwargs,
    ) -> "SaQC":
        """
        Assign "univariate" Local Outlier Factor (LOF) or "univariate" Local
        Outlier Probability (LoOP) scores.

        The function wraps a usual LOF implementation, aiming for an easy-to-use,
        parameter-minimal outlier scoring function for single variables that does
        not necessitate prior modelling of the variable.
@@ -482,11 +522,16 @@

        Notes
        -----
        LoOP: Kriegel, H.-P.; Kröger, P.; Schubert, E.; Zimek, A. LoOP: local outlier probabilities.
        Proceedings of the 18th ACM Conference on Information and Knowledge Management 2009, 1649-1652.

        Algorithm steps for uniLOF flagging of variable `x`:

        1. The temporal density `dt(x)` is calculated according to the `density` parameter.
        2. LOF (or LoOP) scores `L(x)` are calculated for the concatenation [`x`, `dt(x)`].
        3. `x` is flagged where `L(x)` exceeds the threshold determined by the parameter `thresh`.
        Examples
        --------
@@ -506,13 +551,13 @@
        filled = pd.Series(False, index=vals.index)

        if density == "auto":
            v_diff = (vals**p).diff()
            density = v_diff.abs().median()
            if density == 0:
                density = v_diff[v_diff != 0].abs().median()
        elif isinstance(density, Callable):
            density = density(vals)
        elif isinstance(density, pd.Series):
            density = density.values

        d_var = pd.Series(np.arange(len(vals)) * density, index=vals.index)
@@ -537,7 +582,19 @@
        )

        LOF_vars = np.array([vals_extended, d_extended]).T
        # statistical_extent == 1 keeps the plain LOF scoring; any smaller extent
        # dispatches to the LoOP variant.
        if statistical_extent == 1:
            scores = _LOFApply(
                LOF_vars, n, p=p, algorithm=algorithm, metric="minkowski"
            )
        else:
            scores = _LOPApply(
                LOF_vars,
                n,
                p=p,
                statistical_extent=statistical_extent,
                algorithm=algorithm,
                metric="minkowski",
            )
        scores = scores[n:-n]
        score_ser = pd.Series(scores, index=na_bool_ser.index[~na_bool_ser.values])
        score_ser = score_ser.reindex(na_bool_ser.index)
......
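The `density="auto"` branch above builds a synthetic time axis from the median absolute increment; here is a compact sketch of the resulting [`x`, `dt(x)`] embedding that gets fed to the scorer (values are illustrative, `p=1` assumed):

import numpy as np
import pandas as pd

vals = pd.Series([0.0, 0.1, 0.2, 5.0, 0.3, 0.4])  # one spike
v_diff = (vals**1).diff()                         # p = 1
density = v_diff.abs().median()                   # median absolute increment
d_var = np.arange(len(vals)) * density            # equidistant pseudo-time axis
embedding = np.array([vals.values, d_var]).T      # rows = (value, pseudo-time)
# LOF/LoOP on this 2-D cloud scores the spike as isolated in the value direction.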
@@ -205,13 +205,16 @@ def test_flagZScoresMV():
    assert (qc.flags.to_pandas().iloc[[40, 80], 0] > 0).all()


@pytest.mark.filterwarnings("ignore:Number of distinct clusters")
@pytest.mark.parametrize("n", [1, 10])
@pytest.mark.parametrize("p", [1, 2])
@pytest.mark.parametrize(
    "cutoff", [{}, {"probability": 0.99}, {"thresh": "auto"}, {"thresh": 2}]
)
def test_flagUniLOF(spiky_data, n, p, cutoff):
    data = spiky_data[0]
    field, *_ = data.columns
    qc = SaQC(data).flagUniLOF(field, n=n, p=p, **cutoff)
    flag_result = qc.flags[field]
    test_sum = (flag_result.iloc[spiky_data[1]] == BAD).sum()
    assert test_sum == len(spiky_data[1])
......
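A companion test for the new `corruption` cutoff could follow the same pattern. This test is a hypothetical sketch and not part of the merge request:

@pytest.mark.parametrize("corruption", [0.05, 5])
def test_flagUniLOF_corruption(spiky_data, corruption):
    data = spiky_data[0]
    field, *_ = data.columns
    qc = SaQC(data).flagUniLOF(field, corruption=corruption)
    flagged = qc.flags[field].iloc[spiky_data[1]]
    assert (flagged == BAD).all()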
@@ -35,7 +35,6 @@ def test_flagPlateau():
    dat = pd.read_csv(path, parse_dates=[0], index_col=0)
    dat = dat.interpolate("linear")
    dat = dat.ffill().bfill()
    qc = SaQC(dat)
    qc = qc.flagPlateau(
        "base3", min_length="10min", max_length="7d", granularity="20min"
......