Peter Lünenschloß · ebe1d594 · 53b307ea · 64347bae · a8422882 · 52f7a9c6
--- a/saqc/funcs/outliers.py

+ 199

− 2
+++ b/saqc/funcs/outliers.py

+ 199

− 2
 @@ -29,6 +29,202 @@ if TYPE_CHECKING:


 class OutliersMixin:
+    @register(
+        mask=["field"],
+        demask=["field"],
+        squeeze=["field"],
+        multivariate=True,
+        handles_target=False,
+    )
+    def flagLOF(
+        self: "SaQC",
+        field: str,
+        n: int = 20,
+        thresh: Literal["auto"] | float = 1.5,
+        algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
+        metric: str = "minkowski",
+        p: int = 1,
+        density: Literal["auto"] | float | Callable = "auto",
+        flag: float = BAD,
+        **kwargs,
+    ) -> "SaQC":
+        """
+        Flag values whose Local Outlier Factor (LOF) exceeds a cutoff.
+
+        Parameters
+        ----------
+        field :
+            The field name of the column, holding the data-to-be-flagged.
+        n :
+            Number of periods to be included into the LOF calculation. Defaults to `20`, which is a value found to be
+            suitable in the literature.
+
+            * As `n` determines the "locality" a point's abnormality is compared to (its `n` nearest neighbors),
+              it correlates to the upper limit for the size of outlier clusters that can be detected. Outlier clusters
+              (consecutive outlierish values) of size greater than `n/2` may not be detected reliably.
+            * The bigger `n`, the lesser the algorithm's sensitivity to local outliers and small or singleton
+              outlierish points. Higher values also greatly increase numerical costs.
+
+        thresh :
+            The threshold for flagging the calculated LOF. A LOF of around `1` is considered normal and most likely
+            corresponds to inlier points. Defaults to `1.5`.
+
+            * `1.5` is the "automatic" threshold introduced with the publication of the algorithm.
+            * `"auto"` flags the scores with a modified 3-sigma rule instead, resulting in a threshold > 1.5, which
+              usually mitigates overflagging compared to the literature recommendation.
+
+        algorithm :
+            Algorithm used for calculating the `n`-nearest neighbors needed for LOF calculation.
+        metric :
+            Metric by which the distance between points is actually calculated.
+        p :
+            If metric is "minkowski", this parameter controls the metric's degree. Most important values are:
+            * `1` - Manhattan Metric
+            * `2` - Euclidean Metric
+        density :
+            Forwarded unchanged to `assignLOF`.
+        flag :
+            flag to set.
+
+        Returns
+        -------
+        SaQC
+            The SaQC object, with `flag` set at `field` where the LOF cutoff is exceeded.
+        """
+        field_ = str(uuid.uuid4())  # unique name for the temporary score field
+        self = self.assignLOF(
+            field=field,
+            target=field_,
+            n=n,
+            algorithm=algorithm,
+            metric=metric,
+            p=p,
+            density=density,
+        )
+        s = self.data[field_]
+        if thresh == "auto":
+            # Mirror the scores at -1, so the 3-sigma rule works on a symmetric sample centered at -1.
+            _s = pd.concat([s, (-s - 2)])
+            s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
+        else:
+            # NOTE(review): scores assumed to be negated LOF (normal ~ -1, cf. mirroring above) - confirm in assignLOF.
+            s_mask = s < -abs(thresh)
+
+        self._flags[s_mask, field] = flag
+        self = self.dropField(field_)
+        return self
+
+    @flagging()
+    def flagUniLOF(
+        self: "SaQC",
+        field: str,
+        n: int = 20,
+        thresh: Literal["auto"] | float = "auto",
+        algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
+        metric: str = "minkowski",
+        p: int = 1,
+        density: Literal["auto"] | float | Callable = "auto",
+        fill_na: str = "linear",
+        flag: float = BAD,
+        **kwargs,
+    ) -> "SaQC":
+        """
+        Flag "univariate" Local Outlier Factor (LOF) exceeding cutoff.
+
+        The function is a wrapper around a usual LOF implementation, aiming for an easy to use, parameter minimal
+        outlier detection function for singleton variables, that does not necessitate prior modelling of the variable.
+        LOF is applied onto a concatenation of the `field` variable and a "temporal density", or "penalty" variable,
+        that measures temporal distance between data points.
+
+        See the Notes section for more details on the algorithm.
+
+        Parameters
+        ----------
+        field :
+            The field name of the column, holding the data-to-be-flagged.
+        n :
+            Number of periods to be included into the LOF calculation. Defaults to `20`, which is a value found to be
+            suitable in the literature.
+
+            * As `n` determines the "locality" a point's abnormality is compared to (its `n` nearest neighbors),
+              it correlates to the upper limit for the size of outlier clusters that can be detected. Outlier clusters
+              (consecutive outlierish values) of size greater than `n/2` may not be detected reliably.
+            * The bigger `n`, the lesser the algorithm's sensitivity to local outliers and small or singleton
+              outlierish points. Higher values also greatly increase numerical costs.
+
+        thresh :
+            The threshold for flagging the calculated LOF. A LOF of around `1` is considered normal and most likely
+            corresponds to inlier points. This parameter is considered the main calibration parameter of the algorithm.
+
+            * The "automatic" threshold introduced with the publication of the algorithm is `1.5`.
+            * In this implementation, `thresh` defaults to `"auto"`, which flags the scores with a modified 3-sigma
+              rule, resulting in a threshold around `4`, which usually greatly mitigates overflagging compared to
+              the literature recommendation.
+            * Sensitive range for the parameter may be [1,15], assuming default settings for the other parameters.
+
+        algorithm :
+            Algorithm used for calculating the `n`-nearest neighbors needed for LOF calculation.
+        metric :
+            Metric by which the distance between points is actually calculated.
+        p :
+            If metric is "minkowski", this parameter controls the metric's degree. Most important values are:
+            * `1` - Manhattan Metric
+            * `2` - Euclidean Metric
+        density :
+            How to calculate the temporal distance/density for the variable-to-be-flagged.
+
+            * `auto` - introduces linear density with an increment equal to the median of the absolute diff of the
+              variable to be flagged
+            * float - introduces linear density with an increment equal to `density`
+            * Callable - calculates the density by applying the function passed onto the variable to be flagged
+              (passed as Series).
+
+        fill_na :
+            How to fill NaN values in the data; forwarded unchanged to `assignUniLOF`. The default, `"linear"`,
+            fills them with a linear interpolation.
+        flag :
+            flag to set.
+
+        Returns
+        -------
+        SaQC
+            The SaQC object, with `flag` set at `field` where the LOF cutoff is exceeded.
+
+        Notes
+        -----
+        Algorithm steps for uniLOF flagging of variable `x`:
+
+        1. The temporal density `dt(x)` is calculated according to the `density` parameter.
+        2. LOF scores `LOF(x)` are calculated for the concatenation [`x`, `dt(x)`]
+        3. `x` is flagged where `LOF(x)` exceeds the threshold determined by the parameter `thresh`.
+        """
+
+        field_ = str(uuid.uuid4())  # unique name for the temporary score field
+        self = self.assignUniLOF(
+            field=field,
+            target=field_,
+            n=n,
+            algorithm=algorithm,
+            metric=metric,
+            p=p,
+            density=density,
+            fill_na=fill_na,
+        )
+        s = self.data[field_]
+        if thresh == "auto":
+            # Mirror the scores at -1, so the 3-sigma rule works on a symmetric sample centered at -1.
+            _s = pd.concat([s, (-s - 2)])
+            # Only the first half of the mirrored sample corresponds to the original scores.
+            s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
+        else:
+            # NOTE(review): scores assumed to be negated LOF (normal ~ -1) - confirm in assignUniLOF.
+            s_mask = s < -abs(thresh)
+
+        self._flags[s_mask, field] = flag
+        # The score field was only needed to derive the mask, so remove it again.
+        self = self.dropField(field_)
+        return self
+
    @flagging()
    def flagRange(
        self: "SaQC",
 @@ -346,7 +542,7 @@ class OutliersMixin:
            n=n,
            func=func,
            freq=partition,
-            method="ball_tree",
+            algorithm="ball_tree",
            min_periods=partition_min,
            **kwargs,
        )
 @@ -580,6 +776,7 @@ class OutliersMixin:
        """
        Flag outiers using the modified Z-score outlier detection method.

+
        See references [1] for more details on the algorithm.

        Note
 @@ -997,7 +1194,7 @@ class OutliersMixin:
           :math:`m_j = median(\\{data[f_1][t_i], data[f_2][t_i], ..., data[f_N][t_i]\\})` is calculated
        3. for every :math:`0 <= i <= K`, the set
           :math:`\\{data[f_1][t_i] - m_j, data[f_2][t_i] - m_j, ..., data[f_N][t_i] - m_j\\}` is tested for outliers with the
-           specified method (`cross_stat` parameter).
+           specified algorithm (`cross_stat` parameter).

        Parameters
        ----------