diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7d348a4b570cef2084860d4279e052509e3ca33b..1e5b8c7b2eea9ca9c2e38714d5f27c0f28cd63fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ SPDX-License-Identifier: GPL-3.0-or-later
 - add option to not overwrite existing flags to `concatFlags`
 - add option to pass existing axis object to `plot`
 - python 3.11 support
+- added Local Outlier Factor functionality
 ### Changed
 - Remove all flag value restrictions from the default flagging scheme `FloatTranslator`
 - Renamed `TranslationScheme.forward` to `TranslationScheme.toInternal`
diff --git a/docs/funcs/outliers.rst b/docs/funcs/outliers.rst
index cad33ccbc36fcd657c3ff0449be4dbad91407d26..43c9a4cff9d9acc728dbbeafa3d8c73cb44d8e81 100644
--- a/docs/funcs/outliers.rst
+++ b/docs/funcs/outliers.rst
@@ -19,4 +19,7 @@ outliers
    ~SaQC.flagByGrubbs
    ~SaQC.flagRange
    ~SaQC.flagCrossStatistics
+   ~SaQC.flagLOF
+   ~SaQC.flagUniLOF
    ~SaQC.flagZScore
+
diff --git a/docs/funcs/scores.rst b/docs/funcs/scores.rst
index 6439a25fbb88c5321d46477a41ebeeca021a5951..b7ce8bc297f03e6a5e86821801f7d0421b8c4098 100644
--- a/docs/funcs/scores.rst
+++ b/docs/funcs/scores.rst
@@ -12,4 +12,6 @@ scores
 .. autosummary::
 
    ~SaQC.assignKNNScore
+   ~SaQC.assignLOF
+   ~SaQC.assignUniLOF
    ~SaQC.assignZScore
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 8798e4f94e2f51abacc0b7ead5fd46c860c988ab..182c81416a851e5cca4085530b3ec59055bb153a 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -10,3 +10,4 @@ sphinx-markdown-tables==0.0.17
 jupyter-sphinx==0.3.2
 sphinx_autodoc_typehints==1.22
 sphinx-tabs==3.4.1
+fancy-collections==0.1.3
\ No newline at end of file
diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py
index 03d2ea82e3c347138ab92901336b93582cd10b57..713a631b52bf04d145c0dd931d3bf70d200d07ad 100644
--- a/saqc/funcs/outliers.py
+++ b/saqc/funcs/outliers.py
@@ -29,6 +29,428 @@ if TYPE_CHECKING:
 
 
 class OutliersMixin:
+    @register(
+        mask=["field"],
+        demask=["field"],
+        squeeze=["field"],
+        multivariate=True,
+        handles_target=False,
+    )
+    def flagLOF(
+        self: "SaQC",
+        field: Sequence[str],
+        n: int = 20,
+        thresh: Union[Literal["auto"], float] = 1.5,
+        algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
+        p: int = 1,
+        density: Union[Literal["auto"], float, Callable] = "auto",
+        flag: float = BAD,
+        **kwargs,
+    ) -> "SaQC":
+        """
+        Flag values where the Local Outlier Factor (LOF) exceeds the cutoff.
+
+        Parameters
+        ----------
+        field :
+            The field name of the column holding the data-to-be-flagged.
+        n :
+            Number of neighbors to be included into the LOF calculation. Defaults to `20`, a value
+            found to be suitable in the literature.
+
+            * `n` determines the "locality" of an observation (its `n` nearest neighbors) and sets
+              the upper limit for the size of detectable outlier clusters (i.e. groups of consecutive
+              outliers). Outlier clusters of size greater than `n/2` may not be detected reliably.
+            * The larger `n`, the lesser the algorithm's sensitivity to local outliers and small or
+              singleton outlier points. Higher values greatly increase numerical costs.
+        thresh :
+            The threshold for flagging the calculated LOF. A LOF of around `1` is considered normal
+            and most likely corresponds to inlier points.
+
+            * The threshold introduced with the publication of the algorithm defaults to `1.5`.
+            * Passing `'auto'` instead flags the scores with a modified 3-sigma rule, resulting in a
+              threshold greater than `1.5`, which usually mitigates overflagging compared to the
+              literature recommendation.
+        algorithm :
+            Algorithm used for calculating the `n`-nearest neighbors.
+        p :
+            Degree of the metric ("Minkowski") according to which the distance to neighbors is
+            determined. The most important values are:
+
+            * `1` - Manhattan metric
+            * `2` - Euclidean metric
+        density :
+            How to calculate the temporal distance/density for the variable-to-be-flagged.
+
+            * `'auto'` - introduces linear density with an increment equal to the median of the
+              absolute diff of the variable to be flagged
+            * float - introduces linear density with an increment equal to `density`
+            * Callable - calculates the density by applying the function passed onto the variable
+              to be flagged (passed as Series).
+        flag :
+            Flag to set.
+
+        Returns
+        -------
+        saqc.SaQC
+
+        Notes
+        -----
+        * :py:meth:`~saqc.SaQC.flagLOF` calculates the Local Outlier Factor (LOF) for every point in
+          the data. The *LOF* is a scalar value that roughly correlates to the *reachability*, or
+          "outlierishness", of the evaluated datapoint. If a point is as reachable as all its
+          :py:attr:`n`-nearest neighbors, the *LOF* score evaluates to around `1`. If it is only half
+          as reachable as all its `n`-nearest neighbors (so to say, twice as "outlierish"), the score
+          evaluates to roughly `2`. So, the Local Outlier *Factor* relates a point's *reachability*
+          to the *reachability* of its :py:attr:`n`-nearest neighbors in a multiplicative fashion
+          (as a "factor").
+        * The `reachability` of a point is thereby determined as an aggregation of the point's
+          distances to its :py:attr:`n`-nearest neighbors, measured with regard to the Minkowski
+          metric of degree :py:attr:`p` (usually Euclidean).
+        * To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut off
+          at a level determined by :py:attr:`thresh`.
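+
+        Examples
+        --------
+        A minimal usage sketch (the dataset and the variable names ``meantemp`` and ``press`` are
+        illustrative, not part of the shipped test data). Scores are calculated jointly over all
+        listed columns, and the resulting flags are set on each of them:
+
+        >>> qc = saqc.SaQC(data)  # doctest: +SKIP
+        >>> qc = qc.flagLOF(['meantemp', 'press'], n=20, thresh=1.5)  # doctest: +SKIP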
+        """
+        fields = toSequence(field)
+        field_ = str(uuid.uuid4())
+        self = self.assignLOF(
+            field=fields,
+            target=field_,
+            n=n,
+            algorithm=algorithm,
+            p=p,
+            density=density,
+        )
+        s = self.data[field_]
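+        # sklearn's negative_outlier_factor_ scores are negative, with inliers close
+        # to -1. For thresh='auto', the scores are mirrored around -1 (``-s - 2``),
+        # the mirrored sample is z-scored and cut at 3 sigma; the slice
+        # ``[: len(s) // 2]`` restricts the mask back to the original scores.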
+        if thresh == "auto":
+            s = pd.concat([s, (-s - 2)])
+            s_mask = ((s - s.mean()) / s.std())[: len(s) // 2].abs() > 3
+        else:
+            s_mask = s < -abs(thresh)
+
+        for f in fields:
+            self._flags[s_mask, f] = flag
+        self = self.dropField(field_)
+        return self
+
+    @flagging()
+    def flagUniLOF(
+        self: "SaQC",
+        field: str,
+        n: int = 20,
+        thresh: Union[Literal["auto"], float] = 1.5,
+        algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
+        p: int = 1,
+        density: Union[Literal["auto"], float, Callable] = "auto",
+        fill_na: str = "linear",
+        flag: float = BAD,
+        **kwargs,
+    ) -> "SaQC":
+        """
+        Flag "univariate" Local Outlier Factor (LOF) scores exceeding the cutoff.
+
+        The function is a wrapper around a usual LOF implementation, aiming for an easy-to-use,
+        parameter-minimal outlier detection function for single variables, that does not necessitate
+        prior modelling of the variable. LOF is applied onto a concatenation of the `field` variable
+        and a "temporal density", or "penalty", variable, that measures the temporal distance between
+        data points. See the Notes section for a more exhaustive explanation of the algorithm.
+
+        Parameters
+        ----------
+        field :
+            The field name of the column holding the data-to-be-flagged.
+        n :
+            Number of periods to be included into the LOF calculation. Defaults to `20`, a value
+            found to be suitable in the literature.
+
+            * `n` determines the "locality" of an observation (its `n` nearest neighbors) and sets
+              the upper limit for the size of detectable outlier clusters (i.e. groups of consecutive
+              outliers). Outlier clusters of size greater than `n/2` may not be detected reliably.
+            * The larger `n`, the lesser the algorithm's sensitivity to local outliers and small or
+              singleton outlier points. Higher values greatly increase numerical costs.
+        thresh :
+            The threshold for flagging the calculated LOF. A LOF of around `1` is considered normal
+            and most likely corresponds to inlier points. This parameter is considered the main
+            calibration parameter of the algorithm.
+
+            * The threshold defaults to `1.5`, the value found to be suitable in the literature.
+            * `'auto'` enables flagging the scores with a modified 3-sigma rule, resulting in a
+              threshold around `4`, which usually greatly mitigates overflagging compared to the
+              literature recommendation, but often is too high.
+            * The sensitive range for the parameter may be `[1, 15]`, assuming default settings for
+              the other parameters.
+        algorithm :
+            Algorithm used for calculating the `n`-nearest neighbors needed for the LOF calculation.
+        p :
+            Degree of the metric ("Minkowski") according to which the distance to neighbors is
+            determined. The most important values are:
+
+            * `1` - Manhattan metric
+            * `2` - Euclidean metric
+        density :
+            How to calculate the temporal distance/density for the variable-to-be-flagged.
+
+            * `'auto'` - introduces linear density with an increment equal to the median of the
+              absolute diff of the variable to be flagged
+            * float - introduces linear density with an increment equal to `density`
+            * Callable - calculates the density by applying the function passed onto the variable
+              to be flagged (passed as Series).
+        fill_na :
+            Interpolation method used to fill NaN gaps in the data before scoring (passed on to
+            :py:meth:`pandas.Series.interpolate`). Pass ``None`` to not fill gaps.
+        flag :
+            Flag to set.
+
+        Returns
+        -------
+        saqc.SaQC
+
+        Notes
+        -----
+        * :py:meth:`~saqc.SaQC.flagUniLOF` calculates a univariate Local Outlier Factor (UniLOF)
+          score for every point in the one-dimensional input data series. *UniLOF* is a scalar value
+          that roughly correlates to the *reachability*, or "outlierishness", of the evaluated
+          datapoint in the 2-dimensional space constituted by the data values and the time axis. So
+          the algorithm basically operates on the "graph", or the "plot", of the input timeseries.
+        * If a point in this "graph" is as reachable as all its :py:attr:`n`-nearest neighbors, the
+          *UniLOF* score evaluates to around `1`. If it is only half as reachable as all its
+          `n`-nearest neighbors (so to say, twice as "outlierish"), the score evaluates to roughly
+          `2`. So, the univariate Local Outlier *Factor* relates a point's *reachability* to the
+          *reachability* of its :py:attr:`n`-nearest neighbors in a multiplicative fashion (as a
+          "factor").
+        * The `reachability` of a point is thereby determined as an aggregation of the point's
+          distances to its :py:attr:`n`-nearest neighbors, measured with regard to the Minkowski
+          metric of degree :py:attr:`p` (usually Euclidean).
+        * The parameter :py:attr:`density` determines how the time dimension is rescaled into a
+          dimensionless, real-valued coordinate.
+        * To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut off
+          at a level determined by :py:attr:`thresh`.
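+        * A rough sketch of the 2-dimensional embedding the scores are calculated on (with ``x``
+          denoting the data series and assuming the default ``density='auto'``; see the
+          :py:meth:`~saqc.SaQC.assignUniLOF` implementation for the exact construction):
+
+          .. code-block:: python
+
+             import numpy as np
+
+             d = x.diff().abs().median()  # linear density increment
+             # time axis, rescaled to a dimensionless coordinate with increment d
+             embedding = np.array([x.values, np.arange(len(x)) * d]).T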
+
+        Examples
+        --------
+
+        .. plot::
+           :context: reset
+           :include-source: False
+
+           import matplotlib
+           import saqc
+           import pandas as pd
+           data = pd.read_csv('../resources/data/hydro_data.csv')
+           data = data.set_index('Timestamp')
+           data.index = pd.DatetimeIndex(data.index)
+           qc = saqc.SaQC(data)
+
+        Example usage on the `hydrologic data <https://git.ufz.de/rdm-software/saqc/-/blob/develop/docs/resources/data/hydro_data.csv>`_
+        from the repository.
+
+        Loading data via the pandas csv file parser, casting the index to DateTime type, generating a
+        :py:class:`~saqc.SaQC` instance from the data, and plotting the variable representing light
+        scattering at 254 nanometers wavelength:
+
+        .. doctest:: flagUniLOFExample
+
+           >>> import saqc
+           >>> data = pd.read_csv('./resources/data/hydro_data.csv')
+           >>> data = data.set_index('Timestamp')
+           >>> data.index = pd.DatetimeIndex(data.index)
+           >>> qc = saqc.SaQC(data)
+           >>> qc.plot('sac254_raw')  # doctest: +SKIP
+
+        .. plot::
+           :context:
+           :include-source: False
+           :class: center
+
+           qc.plot('sac254_raw')
+
+        We apply :py:meth:`~saqc.SaQC.flagUniLOF` with default parameter values, meaning that the
+        main calibration parameters :py:attr:`n` and :py:attr:`thresh` evaluate to `20` and `1.5`
+        respectively.
+
+        .. doctest:: flagUniLOFExample
+
+           >>> qc = qc.flagUniLOF('sac254_raw')
+           >>> qc.plot('sac254_raw')  # doctest: +SKIP
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc = qc.flagUniLOF('sac254_raw')
+           qc.plot('sac254_raw')
+
+        The flagging pattern does not look too bad: quite surely, there is no overflagging present.
+        Zooming in, however, one could get the impression that the function underflagged a little.
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc.plot('sac254_raw', xscope=slice('2016-09','2016-11'))
+
+        **Tuning the parameter thresh**
+
+        The best way to tune the parameter :py:attr:`thresh` is to find a good starting value that
+        slightly underflags the data, and from there on *reapply* the function with successively
+        decreased values of :py:attr:`thresh`.
+
+        .. doctest:: flagUniLOFExample
+
+           >>> qc = qc.flagUniLOF('sac254_raw', thresh=1.3, label='threshold = 1.3')
+           >>> qc.plot('sac254_raw')  # doctest: +SKIP
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc = qc.flagUniLOF('sac254_raw', thresh=1.3, label='threshold=1.3')
+           qc.plot('sac254_raw', xscope=slice('2016-09','2016-11'))
+
+        This catches some more of the outlierish, flickering values. Let's lower the threshold even
+        more:
+
+        .. doctest:: flagUniLOFExample
+
+           >>> qc = qc.flagUniLOF('sac254_raw', thresh=1.1, label='threshold = 1.1')
+           >>> qc.plot('sac254_raw')  # doctest: +SKIP
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc = qc.flagUniLOF('sac254_raw', thresh=1.1, label='thresh=1.1')
+           qc.plot('sac254_raw', xscope=slice('2016-09','2016-11'))
+
+        .. doctest:: flagUniLOFExample
+
+           >>> qc = qc.flagUniLOF('sac254_raw', thresh=1.05, label='threshold = 1.05')
+           >>> qc.plot('sac254_raw')  # doctest: +SKIP
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc = qc.flagUniLOF('sac254_raw', thresh=1.05, label='thresh=1.05')
+           qc.plot('sac254_raw', xscope=slice('2016-09','2016-11'))
+
+        The value `1` is the lower bound for meaningful :py:attr:`thresh` values. At `1`, the method
+        will flag all the data:
+
+        .. doctest:: flagUniLOFExample
+
+           >>> qc = qc.flagUniLOF('sac254_raw', thresh=1, label='threshold = 1')
+           >>> qc.plot('sac254_raw')  # doctest: +SKIP
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc = qc.flagUniLOF('sac254_raw', thresh=1, label='thresh=1')
+           qc.plot('sac254_raw', xscope=slice('2016-09','2016-11'))
+
+        Iterating down to `1.1` seems to get us the best overall flagging result:
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc = saqc.SaQC(data)
+           qc = qc.flagUniLOF('sac254_raw', thresh=1.5, label='thresh=1.5')
+           qc = qc.flagUniLOF('sac254_raw', thresh=1.3, label='thresh=1.3')
+           qc = qc.flagUniLOF('sac254_raw', thresh=1.1, label='thresh=1.1')
+           qc.plot('sac254_raw')
+
+        Some overflagging remains where the data jumps erratically. The next section shows how to
+        fine-tune the algorithm by shrinking the locality value :py:attr:`n`, making the process
+        more robust in the surroundings of anomalies.
+
+        First, note that even this outlier cluster from March 2016 got correctly flagged:
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc.plot('sac254_raw', xscope=slice('2016-03-15','2016-03-17'))
+
+        :py:meth:`~saqc.SaQC.flagUniLOF` will reliably catch groups of outliers that do not consist
+        of more than :py:attr:`n`/2 periods.
+
+        **Parameter n**
+
+        Shrinking the locality parameter :py:attr:`n` can lead to clearer results, since jumps, for
+        example, then no longer interfere with the scores of too many nearby points:
+
+        .. doctest:: flagUniLOFExample
+
+           >>> qc = saqc.SaQC(data)
+           >>> qc = qc.flagUniLOF('sac254_raw', thresh=1.5, n=8, label='thresh=1.5, n=8')
+           >>> qc.plot('sac254_raw', xscope=slice('2016-09','2016-11'))  # doctest: +SKIP
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc = saqc.SaQC(data)
+           qc = qc.flagUniLOF('sac254_raw', n=8)
+           qc.plot('sac254_raw', xscope=slice('2016-09','2016-11'))
+
+        But as mentioned above, since :py:attr:`n` correlates with the maximal size of detectable
+        outlier clusters, the group we caught with :py:attr:`n` = 20 goes unflagged this time:
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc.plot('sac254_raw', xscope=slice('2016-03-15','2016-03-17'))
+
+        Also note that, when changing :py:attr:`n`, you might have to recalibrate a good starting
+        point for the :py:attr:`thresh` parameter.
+
+        Increasingly higher values of :py:attr:`n` make :py:meth:`~saqc.SaQC.flagUniLOF` increasingly
+        invariant to local variance, turning it into more of a global flagging function. So, one
+        approach could be to start with a really high value of :py:attr:`n`, to first clear the data
+        from global outliers before proceeding:
+
+        .. doctest:: flagUniLOFExample
+
+           >>> qc = saqc.SaQC(data)
+           >>> qc = qc.flagUniLOF('sac254_raw', thresh=1.5, n=100, label='thresh=1.5, n=100')
+           >>> qc.plot('sac254_raw')  # doctest: +SKIP
+
+        .. plot::
+           :context: close-figs
+           :include-source: False
+           :class: center
+
+           qc = saqc.SaQC(data)
+           qc = qc.flagUniLOF('sac254_raw', thresh=1.5, n=100, label='thresh=1.5, n=100')
+           qc.plot('sac254_raw')
+        """
+        field_ = str(uuid.uuid4())
+        self = self.assignUniLOF(
+            field=field,
+            target=field_,
+            n=n,
+            algorithm=algorithm,
+            p=p,
+            density=density,
+            fill_na=fill_na,
+        )
+        s = self.data[field_]
+        if thresh == "auto":
+            _s = pd.concat([s, (-s - 2)])
+            s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
+        else:
+            s_mask = s < -abs(thresh)
+
+        self._flags[s_mask, field] = flag
+        self = self.dropField(field_)
+        return self
+
     @flagging()
     def flagRange(
         self: "SaQC",
@@ -346,7 +768,7 @@ class OutliersMixin:
             n=n,
             func=func,
             freq=partition,
-            method="ball_tree",
+            algorithm="ball_tree",
             min_periods=partition_min,
             **kwargs,
         )
@@ -580,6 +1002,7 @@ class OutliersMixin:
         """
         Flag outiers using the modified Z-score outlier detection method.
 
+        See references [1] for more details on the algorithm.
 
         Note
@@ -998,7 +1421,7 @@ class OutliersMixin:
            :math:`m_j = median(\\{data[f_1][t_i], data[f_2][t_i], ..., data[f_N][t_i]\\})` is calculated
         3. for every :math:`0 <= i <= K`, the set
            :math:`\\{data[f_1][t_i] - m_j, data[f_2][t_i] - m_j, ..., data[f_N][t_i] - m_j\\}`
            is tested for outliers with the
-           specified method (`cross_stat` parameter).
+           specified algorithm (`cross_stat` parameter).
 
         Parameters
         ----------
diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py
index 42927d050fe0df7d7d4517dac8e66e1cec8943f7..0d1698b20009d58a5fc5cf177edd2b3d7bfa840b 100644
--- a/saqc/funcs/scores.py
+++ b/saqc/funcs/scores.py
@@ -7,10 +7,11 @@
 # -*- coding: utf-8 -*-
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import pandas as pd
+from sklearn.neighbors import LocalOutlierFactor
 from typing_extensions import Literal
 
 from saqc import UNFLAGGED
@@ -22,6 +23,69 @@ if TYPE_CHECKING:
     from saqc import SaQC
 
 
+def _kNNApply(vals, n_neighbors, func=np.sum, **kwargs):
+    dist, *_ = kNN(vals, n_neighbors=n_neighbors, **kwargs)
+    try:
+        resids = getattr(dist, func.__name__)(axis=1)
+    except AttributeError:
+        resids = np.apply_along_axis(func, 1, dist)
+    return resids
+
+
+def _LOFApply(vals, n_neighbors, ret="scores", **kwargs):
+    clf = LocalOutlierFactor(n_neighbors=n_neighbors, **kwargs)
+    labels = clf.fit_predict(vals)
+    resids = clf.negative_outlier_factor_
+    if ret == "scores":
+        return resids
+    else:
+        return labels == -1
+
+
+def _groupedScoring(
+    val_frame: pd.DataFrame,
+    n: int = 20,
+    freq: float | str | None = np.inf,
+    min_periods: int = 2,
+    score_func: Callable = _LOFApply,
+    score_kwargs: Optional[dict] = None,
+) -> pd.Series:
+    score_kwargs = score_kwargs or {}
+    score_index = val_frame.index
+    score_ser = pd.Series(np.nan, index=score_index)
+
+    val_frame = val_frame.loc[score_index]
+    val_frame = val_frame.dropna()
+
+    if val_frame.empty:
+        return score_ser
+
+    # partitioning
+    if not freq:
+        freq = val_frame.shape[0]
+
+    if isinstance(freq, str):
+        grouper = pd.Grouper(freq=freq)
+    else:
+        grouper = pd.Series(
+            data=np.arange(0, val_frame.shape[0]), index=val_frame.index
+        )
+        grouper = grouper.transform(lambda x: int(np.floor(x / freq)))
+
+    partitions = val_frame.groupby(grouper)
+
+    for _, partition in partitions:
+        if partition.empty or (partition.shape[0] < min_periods):
+            continue
+
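+        # clip the neighbor count to the partition size: sklearn requires
+        # n_neighbors to be strictly smaller than the number of samples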
+        sample_size = partition.shape[0]
+        nn_neighbors = min(n, max(sample_size, 2) - 1)
+        resids = score_func(partition.values, nn_neighbors, **score_kwargs)
+        score_ser[partition.index] = resids
+
+    return score_ser
+
+
 def _univarScoring(
     data: pd.Series,
     window: Optional[str, int] = None,
@@ -92,7 +156,7 @@ class ScoresMixin:
         func: Callable[[pd.Series], float] = np.sum,
         freq: float | str | None = np.inf,
         min_periods: int = 2,
-        method: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
+        algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
         metric: str = "minkowski",
         p: int = 2,
         **kwargs,
@@ -148,7 +212,7 @@ class ScoresMixin:
             to be applied. If the number of periods present is below `min_periods`, the score for the
             datapoints in that partition will be np.nan.
 
-        method : {'ball_tree', 'kd_tree', 'brute', 'auto'}, default 'ball_tree'
+        algorithm : {'ball_tree', 'kd_tree', 'brute', 'auto'}, default 'ball_tree'
             The search algorithm to find each datapoints k nearest neighbors.
             The keyword just gets passed on to the underlying sklearn method.
             See reference [1] for more information on the algorithm.
@@ -173,51 +237,32 @@ class ScoresMixin:
         [1] https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html
         """
         if isinstance(target, list):
-            if (len(target) > 1) or (target[0] in self._data.columns):
+            if len(target) > 1:
                 raise ValueError(
-                    f"'target' must not exist and be of length 1. {target} was passed instead."
+                    f"'target' must be of length 1. {target} was passed instead."
                 )
             target = target[0]
+        if target in self._data.columns:
+            self = self.dropField(target)
 
         fields = toSequence(field)
-        val_frame = self._data[fields].copy().to_pandas(how="inner")
-        score_ser = pd.Series(np.nan, index=val_frame.index, name=target)
-        val_frame.dropna(inplace=True)
-
-        if val_frame.empty:
-            return self
-
-        # partitioning
-        if not freq:
-            freq = len(val_frame.index)
-
-        if isinstance(freq, str):
-            grouper = pd.Grouper(freq=freq)
-        else:
-            grouper = pd.Series(
-                data=np.arange(0, len(val_frame)), index=val_frame.index
-            )
-            grouper = grouper.transform(lambda x: int(np.floor(x / freq)))
-
-        partitions = val_frame.groupby(grouper)
-
-        for _, partition in partitions:
-            if partition.empty or (len(partition) < min_periods):
-                continue
-
-            sample_size = len(partition)
-            nn_neighbors = min(n, max(sample_size, 2) - 1)
-            dist, *_ = kNN(
-                partition.values, nn_neighbors, algorithm=method, metric=metric, p=p
-            )
-            try:
-                resids = getattr(dist, func.__name__)(axis=1)
-            except AttributeError:
-                resids = np.apply_along_axis(func, 1, dist)
-
-            score_ser[partition.index] = resids
-
-        self._flags[target] = pd.Series(UNFLAGGED, index=score_ser.index, dtype=float)
+        val_frame = self._data[fields].copy().to_pandas()
+        score_ser = _groupedScoring(
+            val_frame,
+            n=n,
+            freq=freq,
+            min_periods=min_periods,
+            score_func=_kNNApply,
+            score_kwargs={
+                "func": func,
+                "metric": metric,
+                "p": p,
+                "algorithm": algorithm,
+            },
+        )
+        score_ser.name = target
+
+        self._flags[target] = pd.Series(np.nan, index=score_ser.index, dtype=float)
         self._data[target] = score_ser
 
         return self
@@ -294,3 +339,206 @@ class ScoresMixin:
         )
         self._data[field] = score
         return self
+
+    @register(
+        mask=["field"],
+        demask=[],
+        squeeze=["target"],
+        multivariate=True,
+        handles_target=True,
+    )
+    def assignLOF(
+        self: "SaQC",
+        field: Sequence[str],
+        target: str,
+        n: int = 20,
+        freq: Union[float, str, None] = np.inf,
+        min_periods: int = 2,
+        algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
+        p: int = 2,
+        **kwargs,
+    ) -> "SaQC":
+        """
+        Assign Local Outlier Factor (LOF) scores.
+
+        Parameters
+        ----------
+        field :
+            The field name of the column holding the data-to-be-scored.
+        target :
+            A new column name, where the result is stored.
+        n :
+            Number of periods to be included into the LOF calculation. Defaults to `20`, a value
+            found to be suitable in the literature.
+
+            * `n` determines the "locality" of an observation (its `n` nearest neighbors) and sets
+              the upper limit for the size of detectable outlier clusters (i.e. groups of consecutive
+              outliers). Outlier clusters of size greater than `n/2` may not be detected reliably.
+            * The larger `n`, the lesser the algorithm's sensitivity to local outliers and small or
+              singleton outlier points. Higher values greatly increase numerical costs.
+        freq : {float, str, None}, default np.inf
+            Determines the segmentation of the data into partitions, onto which the kNN algorithm
+            is applied individually.
+        min_periods :
+            The minimum number of periods that have to be present in a partition for the LOF
+            calculation to be applied. Datapoints in smaller partitions score np.nan.
+        algorithm :
+            Algorithm used for calculating the `n`-nearest neighbors needed for the LOF calculation.
+        p :
+            Degree of the metric ("Minkowski") according to which the distance to neighbors is
+            determined. The most important values are:
+
+            * `1` - Manhattan metric
+            * `2` - Euclidean metric
+
+        Returns
+        -------
+        saqc.SaQC
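+
+        Examples
+        --------
+        A minimal usage sketch (variable names are illustrative). The scores are stored to the new
+        ``target`` column, while the input columns remain untouched:
+
+        >>> qc = saqc.SaQC(data)  # doctest: +SKIP
+        >>> qc = qc.assignLOF(['meantemp', 'press'], target='lof_scores')  # doctest: +SKIP
+        >>> scores = qc.data['lof_scores']  # doctest: +SKIP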
"ball_tree", + p: int = 2, + **kwargs, + ) -> "SaQC": + """ + Assign Local Outlier Factor (LOF). + + Parameters + ---------- + field : + The field name of the column, holding the data-to-be-flagged. + target : + A new Column name, where the result is stored. + n : + Number of periods to be included into the LOF calculation. Defaults to `20`, which is a value found to be + suitable in the literature. + + * `n` determines the "locality" of an observation (its `n` nearest neighbors) and sets the upper limit of + values of an outlier clusters (i.e. consecutive outliers). Outlier clusters of size greater than `n/2` + may not be detected reliably. + * The larger `n`, the lesser the algorithm's sensitivity to local outliers and small or singleton outliers + points. Higher values greatly increase numerical costs. + + freq : {float, str, None}, default np.inf + Determines the segmentation of the data into partitions, the kNN algorithm is + applied onto individually. + algorithm : + Algorithm used for calculating the `n`-nearest neighbors needed for LOF calculation. + p : + Degree of the metric ("Minkowski"), according to wich distance to neighbors is determined. + Most important values are: + * `1` - Manhatten Metric + * `2` - Euclidian Metric + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flags : saqc.Flags + The quality flags of data + """ + if isinstance(target, list): + if len(target) > 1: + raise ValueError( + f"'target' must be of length 1. {target} was passed instead." + ) + target = target[0] + if target in self._data.columns: + self = self.dropField(target) + + fields = toSequence(field) + val_frame = self._data[fields].copy().to_pandas() + + score_ser = _groupedScoring( + val_frame, + n=n, + freq=freq, + min_periods=min_periods, + score_func=_LOFApply, + score_kwargs={ + "metric": "minkowski", + "contamination": "auto", + "p": p, + "algorithm": algorithm, + }, + ) + score_ser.name = target + self._flags[target] = pd.Series(np.nan, index=score_ser.index, dtype=float) + self._data[target] = score_ser + + return self + + @register(mask=["field"], demask=[], squeeze=[]) + def assignUniLOF( + self: "SaQC", + field: str, + n: int = 20, + algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree", + p: int = 1, + density: Union[Literal["auto"], float, Callable] = "auto", + fill_na: str = "linear", + **kwargs, + ) -> "SaQC": + """ + Assign "univariate" Local Outlier Factor (LOF). + + The Function is a wrapper around a usual LOF implementation, aiming for an easy to use, parameter minimal + outlier scoring function for singleton variables, that does not necessitate prior modelling of the variable. + LOF is applied onto a concatenation of the `field` variable and a "temporal density", or "penalty" variable, + that measures temporal distance between data points. + + See the Notes section for more details on the algorithm. + + Parameters + ---------- + field : + The field name of the column, holding the data-to-be-flagged. + n : + Number of periods to be included into the LOF calculation. Defaults to `20`, which is a value found to be + suitable in the literature. + + * `n` determines the "locality" of an observation (its `n` nearest neighbors) and sets the upper limit of + values of an outlier clusters (i.e. consecutive outliers). Outlier clusters of size greater than `n/2` + may not be detected reliably. + * The larger `n`, the lesser the algorithm's sensitivity to local outliers and small or singleton outliers + points. 
+        algorithm :
+            Algorithm used for calculating the `n`-nearest neighbors needed for the LOF calculation.
+        p :
+            Degree of the metric ("Minkowski") according to which the distance to neighbors is
+            determined. The most important values are:
+
+            * `1` - Manhattan metric
+            * `2` - Euclidean metric
+        density :
+            How to calculate the temporal distance/density for the variable-to-be-scored.
+
+            * `'auto'` - introduces linear density with an increment equal to the median of the
+              absolute diff of the variable to be scored
+            * float - introduces linear density with an increment equal to `density`
+            * Callable - calculates the density by applying the function passed onto the variable
+              to be scored (passed as Series).
+        fill_na :
+            Interpolation method used to fill NaN gaps in the data before scoring (passed on to
+            :py:meth:`pandas.Series.interpolate`). Pass ``None`` to not fill gaps.
+
+        Returns
+        -------
+        saqc.SaQC
+
+        Notes
+        -----
+        Algorithm steps for uniLOF scoring of variable `x`:
+
+        1. The temporal density `dt(x)` is calculated according to the `density` parameter.
+        2. LOF scores `LOF(x)` are calculated for the concatenation [`x`, `dt(x)`].
+        3. The scores are written to `x`'s timestamps. Cutting them off at a threshold to obtain
+           binary flags is left to :py:meth:`~saqc.SaQC.flagUniLOF`.
+        """
+        vals = self._data[field]
+        if fill_na is not None:
+            vals = vals.interpolate(fill_na)
+
+        if density == "auto":
+            density = vals.diff().abs().median()
+            if density == 0:
+                density = vals.diff().abs().mean()
+        elif isinstance(density, Callable):
+            density = density(vals)
+            if isinstance(density, pd.Series):
+                density = density.values
+
+        d_var = pd.Series(np.arange(len(vals)) * density, index=vals.index)
+        na_bool_ser = vals.isna() | d_var.isna()
+        na_idx = na_bool_ser.index[na_bool_ser.values]
+        val_no = (~na_bool_ser).sum()
+        if 1 < val_no < n:
+            n = val_no
+        elif val_no <= 1:
+            return self
+
+        d_var = d_var.drop(na_idx, axis=0).values
+        vals = vals.drop(na_idx, axis=0).values
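+        # pad both ends with the mirrored first/last n values, so that edge points
+        # get a full n-sized neighborhood; the density variable is extended by
+        # shifted copies of its first/last n values, continuing the linear ramp.
+        # The padding is stripped from the scores again after the LOF pass.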
+        vals_extended = np.array(
+            list(vals[::-1][-n:]) + list(vals) + list(vals[::-1][:n])
+        )
+        d_extended = np.array(
+            list(d_var[:n] + (d_var[0] - d_var[n + 1]))
+            + list(d_var)
+            + list(d_var[-n:] + (d_var[-1] - d_var[-n - 1]))
+        )
+
+        LOF_vars = np.array([vals_extended, d_extended]).T
+        scores = _LOFApply(LOF_vars, n, p=p, algorithm=algorithm, metric="minkowski")
+        scores = scores[n:-n]
+        score_ser = pd.Series(scores, index=na_bool_ser.index[~na_bool_ser.values])
+        score_ser = score_ser.reindex(na_bool_ser.index)
+        self._data[field] = score_ser
+        return self
diff --git a/tests/funcs/test_outlier_detection.py b/tests/funcs/test_outlier_detection.py
index ebcaa9e8858d522105fe3721638b4df93370bcc8..cf436cf886629e08528587b4c98549cbc5db6366 100644
--- a/tests/funcs/test_outlier_detection.py
+++ b/tests/funcs/test_outlier_detection.py
@@ -173,3 +175,32 @@ def test_flagZScores():
     qc = qc.flagZScore("data", window=20)
 
     assert (qc.flags.to_pandas().iloc[[40, 80], 0] > 0).all()
+
+
+@pytest.mark.parametrize("n", [1, 10])
+@pytest.mark.parametrize("p", [1, 2])
+@pytest.mark.parametrize("thresh", ["auto", 2])
+def test_flagUniLOF(spiky_data, n, p, thresh):
+    data = spiky_data[0]
+    field, *_ = data.columns
+    qc = SaQC(data).flagUniLOF(field, n=n, p=p, thresh=thresh)
+    flag_result = qc.flags[field]
+    test_sum = (flag_result[spiky_data[1]] == BAD).sum()
+    assert test_sum == len(spiky_data[1])
+
+
+@pytest.mark.parametrize("vars", [1, 2, 3])
+@pytest.mark.parametrize("p", [1, 2])
+@pytest.mark.parametrize("thresh", ["auto", 2])
+def test_flagLOF(spiky_data, vars, p, thresh):
+    data = pd.DataFrame(
+        {f"data{v}": spiky_data[0].to_pandas().squeeze() for v in range(vars)}
+    )
+    field, *_ = data.columns
+    qc = SaQC(data).flagLOF(field, p=p, thresh=thresh)
+    flag_result = qc.flags[field]
+    test_sum = (flag_result[spiky_data[1]] == BAD).sum()
+    assert test_sum == len(spiky_data[1])