Skip to content
Snippets Groups Projects

Check inputs

Merged Bert Palm requested to merge checkInputs into develop
5 files
+ 339
241
Compare changes
  • Side-by-side
  • Inline
Files
5
+ 367
260
@@ -10,7 +10,7 @@ from __future__ import annotations
import uuid
import warnings
from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple
from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Tuple
import numpy as np
import numpy.polynomial.polynomial as poly
@@ -21,7 +21,17 @@ from typing_extensions import Literal
from saqc import BAD, UNFLAGGED
from saqc.core import DictOfSeries, Flags, flagging, register
from saqc.funcs.scores import _univarScoring
from saqc.lib.checking import (
isCallable,
isFloatLike,
validateCallable,
validateChoice,
validateFraction,
validateFrequency,
validateMinPeriods,
validateValueBounds,
validateWindow,
)
from saqc.lib.docs import DOC_TEMPLATES
from saqc.lib.rolling import windowRoller
from saqc.lib.tools import getFreqDelta, isflagged, toSequence
@@ -31,6 +41,19 @@ if TYPE_CHECKING:
class OutliersMixin:
@staticmethod
def _validateLOF(algorithm, n, p, density):
    """Check the parameters shared by the LOF and UniLOF flagging methods."""
    # neighbor count `n` and Minkowski degree `p`: strictly positive integers
    for val, name in ((n, "n"), (p, "p")):
        validateValueBounds(val, name, left=0, strict_int=True)
    allowed = ["ball_tree", "kd_tree", "brute", "auto"]
    validateChoice(algorithm, "algorithm", allowed)
    # `density` must be the literal 'auto', a float-like value, or a callable
    if not (density == "auto" or isFloatLike(density) or isCallable(density)):
        raise ValueError(
            f"'density' must be 'auto' or a float or a function, not {density}"
        )
@register(
mask=["field"],
demask=["field"],
@@ -55,53 +78,79 @@ class OutliersMixin:
Parameters
----------
n :
Number of neighbors to be included into the LOF calculation. Defaults to ``20``, which is a
Number of neighbors to be included into the LOF calculation.
Defaults to ``20``, which is a
value found to be suitable in the literature.
* :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors)
and sets the upper limit to the number of values in outlier clusters (i.e. consecutive outliers). Outlier
clusters of size greater than :py:attr:`n`/2 may not be detected reliably.
* The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small
or singleton outliers points. Higher values greatly increase numerical costs.
* :py:attr:`n` determines the "locality" of an observation
(its :py:attr:`n` nearest neighbors) and sets the upper
limit to the number of values in outlier clusters (i.e.
consecutive outliers). Outlier clusters of size greater
than :py:attr:`n`/2 may not be detected reliably.
* The larger :py:attr:`n`, the lesser the algorithm's sensitivity
to local outliers and small or singleton outliers points.
Higher values greatly increase numerical costs.
thresh :
The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and
most likely corresponds to inlier points.
The threshold for flagging the calculated LOF. A LOF of around
``1`` is considered normal and most likely corresponds to
inlier points.
* The "automatic" threshing introduced with the publication of the algorithm defaults to ``1.5``.
* In this implementation, :py:attr:`thresh` defaults (``'auto'``) to flagging the scores with a
modified 3-sigma rule, resulting in a :py:attr:`thresh` `` > 1.5`` which usually mitigates
overflagging compared to the literature recommendation.
* The "automatic" threshing introduced with the publication
of the algorithm defaults to ``1.5``.
* In this implementation, :py:attr:`thresh` defaults (``'auto'``)
to flagging the scores with a modified 3-sigma rule, resulting
in a :py:attr:`thresh` `` > 1.5`` which usually mitigates
over-flagging compared to the literature recommendation.
algorithm :
Algorithm used for calculating the :py:attr:`n`-nearest neighbors.
p :
Degree of the metric ("Minkowski"), according to which the distance to neighbors is determined.
Most important values are:
Degree of the metric ("Minkowski"), according to which the
distance to neighbors is determined. Most important values are:
* ``1`` - Manhatten Metric
* ``1`` - Manhattan Metric
* ``2`` - Euclidean Metric
density :
How to calculate the temporal distance/density for the variable to flag.
* ``'auto'`` - introduces linear density with an increment
equal to the median of the absolute diff of the variable to flag.
* ``float`` - introduces linear density with an increment
equal to :py:attr:`density`
* Callable - calculates the density by applying the function
passed onto the variable to flag (passed as Series).
Notes
-----
* The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local Outlier Factor (LOF) for every point
in the input timeseries. The *LOF* is a scalar value, that roughly correlates to the *reachability*,
or "outlierishnes" of the evaluated datapoint. If a point is as reachable, as all its
:py:attr:`n`-nearest neighbors, the *LOF* score evaluates to around ``1``. If it is only as half as
reachable as all its ``n``-nearest neighbors are (so to say, as double as "outlierish"), the score
is about ``2``. So, the Local Outlier *Factor* relates a point's *reachability* to the *reachability*
of its :py:attr:`n`-nearest neighbors in a multiplicative fashion (as a "factor").
* The *reachability* of a point thereby is determined as an aggregation of the points distances to its
:py:attr:`n`-nearest neighbors, measured with regard to the minkowski metric of degree :py:attr:`p`
* The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local
Outlier Factor (LOF) for every point in the input timeseries.
The *LOF* is a scalar value, that roughly correlates to the
*reachability*, or "outlierishnes" of the evaluated datapoint.
If a point is as reachable, as all its :py:attr:`n`-nearest
neighbors, the *LOF* score evaluates to around ``1``. If it
is only as half as reachable as all its ``n``-nearest neighbors
are (so to say, as double as "outlierish"), the score is about
``2``. So, the Local Outlier *Factor* relates a point's *reachability*
to the *reachability* of its :py:attr:`n`-nearest neighbors
in a multiplicative fashion (as a "factor").
* The *reachability* of a point thereby is determined as an aggregation
of the points distances to its :py:attr:`n`-nearest neighbors,
measured with regard to the minkowski metric of degree :py:attr:`p`
(usually euclidean).
* To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut off at a level,
determined by :py:attr:`thresh`.
* To derive a binary label for every point (outlier: *yes*, or *no*),
the scores are cut off at a level, determined by :py:attr:`thresh`.
"""
self._validateLOF(algorithm, n, p, density)
if thresh != "auto" and not isFloatLike(thresh):
raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
fields = toSequence(field)
field_ = str(uuid.uuid4())
self = self.assignLOF(
qc = self.assignLOF(
field=fields,
target=field_,
n=n,
@@ -109,7 +158,7 @@ class OutliersMixin:
p=p,
density=density,
)
s = self.data[field_]
s = qc.data[field_]
if thresh == "auto":
s = pd.concat([s, (-s - 2)])
s_mask = (s - s.mean() / s.std())[: len(s) // 2].abs() > 3
@@ -117,10 +166,10 @@ class OutliersMixin:
s_mask = s < abs(thresh)
for f in fields:
mask = ~isflagged(self._flags[f], kwargs["dfilter"]) & s_mask
self._flags[mask, f] = flag
mask = ~isflagged(qc._flags[f], kwargs["dfilter"]) & s_mask
qc._flags[mask, f] = flag
return self.dropField(field_)
return qc.dropField(field_)
@flagging()
def flagUniLOF(
@@ -131,95 +180,120 @@ class OutliersMixin:
algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
p: int = 1,
density: Literal["auto"] | float | Callable = "auto",
fill_na: str = "linear",
fill_na: bool = True,
flag: float = BAD,
**kwargs,
) -> "SaQC":
"""
Flag "univariate" Local Outlier Factor (LOF) exceeding cutoff.
The function is a wrapper around a usual LOF implementation, aiming for an easy to use,
parameter minimal outlier detection function for single variables, that does not necessitate
prior modelling of the variable. LOF is applied onto a concatenation of the `field` variable
and a "temporal density", or "penalty" variable, that measures temporal distance between data
points. See notes Section for a more exhaustive explaination.
See the Notes section for more details on the algorithm.
The function is a wrapper around a usual LOF implementation, aiming
for an easy to use, parameter minimal outlier detection function
for single variables, that does not necessitate prior modelling
of the variable. LOF is applied onto a concatenation of the `field`
variable and a "temporal density", or "penalty" variable, that
measures temporal distance between data points. See notes Section
for a more exhaustive explanation. See the Notes section for
more details on the algorithm.
Parameters
----------
n :
Number of periods to be included into the LOF calculation. Defaults to `20`, which is a
value found to be suitable in the literature.
* :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors)
and sets the upper limit to the number of values in an outlier clusters (i.e. consecutive outliers). Outlier
clusters of size greater than :py:attr:`n`/2 may not be detected reliably.
* The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small
or singleton outlier points. Higher values greatly increase numerical costs.
Number of periods to be included into the LOF calculation.
Defaults to `20`, which is a value found to be suitable in
the literature.
* :py:attr:`n` determines the "locality" of an observation
(its :py:attr:`n` nearest neighbors) and sets the upper
limit to the number of values in an outlier clusters (i.e.
consecutive outliers). Outlier clusters of size greater
than :py:attr:`n`/2 may not be detected reliably.
* The larger :py:attr:`n`, the lesser the algorithm's sensitivity
to local outliers and small or singleton outlier points.
Higher values greatly increase numerical costs.
thresh :
The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and
most likely corresponds to inlier points. This parameter is considered the main calibration
The threshold for flagging the calculated LOF. A LOF of around
``1`` is considered normal and most likely corresponds to
inlier points. This parameter is considered the main calibration
parameter of the algorithm.
* The threshing defaults to ``1.5``, wich is the default value found to be suitable in the literature.
* ``'auto'`` enables flagging the scores with a modified 3-sigma rule,
resulting in a thresh around ``4``, which usually greatly mitigates overflagging compared to the
literature recommendation, but often is too high.
* sensitive range for the parameter may be ``[1,15]``, assuming default settings for the other parameters.
* The threshing defaults to ``1.5``, which is the default value
found to be suitable in the literature.
* ``'auto'`` enables flagging the scores with a modified 3-sigma
rule, resulting in a thresh around ``4``, which usually
greatly mitigates overflagging compared to the literature
recommendation, but often is too high.
* sensitive range for the parameter may be ``[1,15]``, assuming
default settings for the other parameters.
algorithm :
Algorithm used for calculating the :py:attr:`n`-nearest neighbors needed for LOF calculation.
Algorithm used for calculating the :py:attr:`n`-nearest neighbors
needed for LOF calculation.
p :
Degree of the metric ("Minkowski"), according to which distance to neighbors is determined.
Most important values are:
Degree of the metric ("Minkowski"), according to which distance
to neighbors is determined. Most important values are:
* ``1`` - Manhattan Metric
* ``2`` - Euclidean Metric
density :
How to calculate the temporal distance/density for the variable to flag.
How to calculate the temporal distance/density for the variable
to flag.
* ``'auto'`` - introduces linear density with an increment equal to the median of the absolute
diff of the variable to flag.
* ``float`` - introduces linear density with an increment equal to :py:attr:`density`
* Callable - calculates the density by applying the function passed onto the variable to flag
(passed as Series).
* ``'auto'`` - introduces linear density with an increment
equal to the median of the absolute diff of the variable to flag.
* ``float`` - introduces linear density with an increment
equal to :py:attr:`density`
* Callable - calculates the density by applying the function
passed onto the variable to flag (passed as Series).
fill_na :
Weather or not to fill NaN values in the data with a linear interpolation.
If True, NaNs in the data are filled with a linear interpolation.
See Also
--------
:ref:`introduction to outlier detection with saqc <cookbooks/OutlierDetection:Outlier Detection>`
:ref:`introduction to outlier detection with
saqc <cookbooks/OutlierDetection:Outlier Detection>`
Notes
-----
* The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an univariate
Local Outlier Factor (UniLOF) - score for every point in the one dimensional input
data series.
The *UniLOF* score of any data point is a scalar value, that roughly correlates to
its *reachability*, or "outlierishnes" in the 2-dimensional space constituted by the
data-values and the time axis. So the Algorithm basically operates on the "graph",
or the "plot" of the input timeseries.
* The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an
univariate Local Outlier Factor (UniLOF) - score for every
point in the one dimensional input data series. The *UniLOF*
score of any data point is a scalar value, that roughly correlates
to its *reachability*, or "outlierishnes" in the 2-dimensional
space constituted by the data-values and the time axis. So
the Algorithm basically operates on the "graph", or the "plot"
of the input timeseries.
* If a point in this "graph" is as reachable, as all its :py:attr:`n`-nearest
neighbors, its *UniLOF* score evaluates to around ``1``. If it is only as half as
reachable as all its :py:attr:`n` neighbors are
(so to say, as double as "outlierish"), its score evaluates to ``2`` roughly.
So, the Univariate Local Outlier *Factor* relates a points *reachability* to the
*reachability* of its :py:attr:`n`-nearest neighbors in a multiplicative fashion
neighbors, its *UniLOF* score evaluates to around ``1``. If
it is only as half as reachable as all its :py:attr:`n` neighbors
are (so to say, as double as "outlierish"), its score evaluates
to ``2`` roughly. So, the Univariate Local Outlier *Factor*
relates a points *reachability* to the *reachability* of its
:py:attr:`n`-nearest neighbors in a multiplicative fashion
(as a "factor").
* The *reachability* of a point thereby is derived as an aggregation of the points
distance to its :py:attr:`n`-nearest neighbors, measured with regard to the minkowski
metric of degree :py:attr:`p` (usually euclidean).
* The parameter :py:attr:`density` thereby determines how dimensionality of the time is
removed, to make it a dimensionless, real valued coordinate.
* To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut
off at a level, determined by :py:attr:`thresh`.
* The *reachability* of a point thereby is derived as an aggregation
of the points distance to its :py:attr:`n`-nearest neighbors,
measured with regard to the minkowski metric of degree :py:attr:`p`
(usually euclidean).
* The parameter :py:attr:`density` thereby determines how dimensionality
of the time is removed, to make it a dimensionless, real valued
coordinate.
* To derive a binary label for every point (outlier: *yes*, or
*no*), the scores are cut off at a level, determined by :py:attr:`thresh`.
Examples
--------
See the :ref:`outlier detection cookbook <cookbooks/OutlierDetection:Outlier Detection>` for a detailed
See the :ref:`outlier detection cookbook
<cookbooks/OutlierDetection:Outlier Detection>` for a detailed
introduction into the usage and tuning of the function.
.. plot::
@@ -236,8 +310,10 @@ class OutliersMixin:
Example usage with default parameter configuration:
Loading data via pandas csv file parser, casting index to DateTime, generating a :py:class:`~saqc.SaQC`
instance from the data and plotting the variable representing light scattering at 254 nanometers wavelength.
Loading data via pandas csv file parser, casting index to DateTime,
generating a :py:class:`~saqc.SaQC` instance from the data and
plotting the variable representing light scattering at 254 nanometers
wavelength.
.. doctest:: flagUniLOFExample
@@ -255,8 +331,9 @@ class OutliersMixin:
qc.plot('sac254_raw')
We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter values. Meaning, that the main
calibration paramters :py:attr:`n` and :py:attr:`thresh` evaluate to `20` and `1.5` respectively.
We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter
values. Meaning, that the main calibration paramters :py:attr:`n`
and :py:attr:`thresh` evaluate to `20` and `1.5` respectively.
.. doctest:: flagUniLOFExample
@@ -273,27 +350,31 @@ class OutliersMixin:
qc.plot('sac254_raw')
"""
field_ = str(uuid.uuid4())
self = self.assignUniLOF(
self._validateLOF(algorithm, n, p, density)
if thresh != "auto" and not isFloatLike(thresh):
raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
tmp_field = str(uuid.uuid4())
qc = self.assignUniLOF(
field=field,
target=field_,
target=tmp_field,
n=n,
algorithm=algorithm,
p=p,
density=density,
fill_na=fill_na,
)
s = self.data[field_]
s = qc.data[tmp_field]
if thresh == "auto":
_s = pd.concat([s, (-s - 2)])
s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
else:
s_mask = s < -abs(thresh)
s_mask = ~isflagged(self._flags[field], kwargs["dfilter"]) & s_mask
self._flags[s_mask, field] = flag
self = self.dropField(field_)
return self
s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask
qc._flags[s_mask, field] = flag
qc = qc.dropField(tmp_field)
return qc
@flagging()
def flagRange(
@@ -305,7 +386,8 @@ class OutliersMixin:
**kwargs,
) -> "SaQC":
"""
Function flags values exceeding the closed interval [:py:attr:`min`, :py:attr:`max`].
Function flags values exceeding the closed
interval [:py:attr:`min`, :py:attr:`max`].
Parameters
----------
@@ -314,7 +396,6 @@ class OutliersMixin:
max :
Upper bound for valid data.
"""
# using .values is much faster
datacol = self._data[field].to_numpy()
mask = (datacol < min) | (datacol > max)
@@ -341,28 +422,28 @@ class OutliersMixin:
----------
window :
Determines the segmentation of the data into partitions, the kNN algorithm is
applied onto individually.
Determines the segmentation of the data into partitions, the
kNN algorithm is applied onto individually.
* ``None``: Apply Scoring on whole data set at once
* ``int``: Apply scoring on successive data chunks of periods with the given length.
Must be greater than 0.
* Offset String : Apply scoring on successive partitions of temporal extension
matching the passed offset string
* ``int``: Apply scoring on successive data chunks of periods
with the given length. Must be greater than 0.
* offset String : Apply scoring on successive partitions of
temporal extension matching the passed offset string
min_periods :
Minimum number of periods per partition that have to be present for a valid
outlier detection to be made in this partition (only of effect, if :py:attr:`freq`
is an integer).
Minimum number of periods per partition that have to be present
for a valid outlier detection to be made in this partition
iter_start :
Float in ``[0, 1]`` that determines which percentage of data is considered
"normal". ``0.5`` results in the stray algorithm to search only the upper 50% of
the scores for the cut off point. (See reference section for more information)
Float in ``[0, 1]`` that determines which percentage of data
is considered "normal". ``0.5`` results in the stray algorithm
to search only the upper 50% of the scores for the cut off
point. (See reference section for more information)
alpha :
Level of significance by which it is tested, if a score might be drawn from
another distribution than the majority of the data.
Level of significance by which it is tested, if a score might
be drawn from another distribution than the majority of the data.
References
----------
@@ -373,36 +454,36 @@ class OutliersMixin:
"""
scores = self._data[field].dropna()
if window is None:
window = len(scores)
if not isinstance(window, int):
validateFrequency(window, "window")
validateMinPeriods(min_periods)
validateValueBounds(iter_start, "iter_start", left=0, right=1, closed="both")
if scores.empty:
return self
if not window:
window = len(scores)
if isinstance(window, str):
if isinstance(window, int):
s = pd.Series(data=np.arange(0, len(scores)), index=scores.index)
s = s.transform(lambda x: int(np.floor(x / window)))
partitions = scores.groupby(s)
else: # pd.Timedelta pd.DateOffset or str
partitions = scores.groupby(pd.Grouper(freq=window))
else:
grouper_series = pd.Series(
data=np.arange(0, len(scores)), index=scores.index
)
grouper_series = grouper_series.transform(
lambda x: int(np.floor(x / window))
)
partitions = scores.groupby(grouper_series)
# calculate flags for every window
for _, partition in partitions:
if partition.empty | (len(partition) < min_periods):
continue
sample_size = len(partition)
if partition.empty or sample_size < min_periods:
continue
sorted_i = partition.values.argsort()
resids = partition.values[sorted_i]
gaps = np.append(0, np.diff(resids))
tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
tail_size = int(max(min(np.floor(sample_size / 4), 50), 2))
tail_indices = np.arange(2, tail_size + 1)
i_start = int(max(np.floor(sample_size * iter_start), 1) + 1)
@@ -448,18 +529,19 @@ class OutliersMixin:
**kwargs,
) -> "SaQC":
"""
The algorithm implements a 3-step outlier detection procedure for simultaneously
flagging of higher dimensional data (dimensions > 3).
The algorithm implements a 3-step outlier detection procedure for
simultaneously flagging of higher dimensional data (dimensions > 3).
In [1], the procedure is introduced and exemplified with an application on hydrological
data. See the notes section for an overview over the algorithms basic steps.
In [1], the procedure is introduced and exemplified with an application on
hydrological data. See the notes section for an overview over the algorithms
basic steps.
Parameters
----------
trafo : default identity
Transformation to be applied onto every column before scoring. For more fine-grained
control, the data could also be transformed before :py:meth:`~saqc.SaQC.flagMVScores`
is called.
trafo :
Transformation to be applied onto every column before scoring. For more
fine-grained control, the data could also be transformed before
:py:meth:`~saqc.SaQC.flagMVScores` is called.
alpha :
Level of significance by which it is tested, if an observations score might
@@ -468,48 +550,52 @@ class OutliersMixin:
n :
Number of neighbors included in the scoring process for every datapoint.
func : default sum
Function that aggregates a value's k-smallest distances, returning a scalar score.
func :
Function that aggregates a value's k-smallest distances, returning a
scalar score.
iter_start :
Value in ``[0,1]`` that determines which percentage of data is considered
"normal". 0.5 results in the threshing algorithm to search only the upper 50%
of the scores for the cut off point. (See reference section for more
"normal". 0.5 results in the threshing algorithm to search only the upper
50% of the scores for the cut-off point. (See reference section for more
information)
window :
Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines the
size of the data partitions, the data is decomposed into. Each partition is checked
seperately for outliers.
Either given as an Offset String, denoting the windows temporal extension or
as an integer, denoting the windows number of periods. ``NaN`` also count as periods.
If ``None``, all data points share the same scoring window, which than equals the whole
data.
Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines
the size of the data partitions, the data is decomposed into. Each
partition is checked separately for outliers. Either given as an Offset
String, denoting the windows temporal extension or as an integer,
denoting the windows number of periods. ``NaN`` also count as periods. If
``None``, all data points share the same scoring window, which than
equals the whole data.
min_periods :
Only effective if :py:attr:`threshing` is set to ``'stray'`` and :py:attr:`partition` is an integer.
Minimum number of periods per :py:attr:`partition` that have to be present for a valid outlier
Only effective if :py:attr:`threshing` is set to ``'stray'`` and
:py:attr:`partition` is an integer. Minimum number of periods per
:py:attr:`partition` that have to be present for a valid outlier
detection to be made in this partition.
stray_range :
If not ``None``, it is tried to reduce the stray result onto single outlier components
of the input :py:attr:`field`. The offset string denotes the range of the
temporal surrounding to include into the MAD testing while trying to reduce
flags.
If not ``None``, it is tried to reduce the stray result onto single
outlier components of the input :py:attr:`field`. The offset string
denotes the range of the temporal surrounding to include into the MAD
testing while trying to reduce flags.
drop_flagged :
Only effective when :py:attr:`stray_range` is not ``None``. Whether or not to drop flagged
values from the temporal surroundings.
Only effective when :py:attr:`stray_range` is not ``None``. Whether or
not to drop flagged values from the temporal surroundings.
thresh :
Only effective when :py:attr:`stray_range` is not ``None``. The 'critical' value,
controlling wheather the MAD score is considered referring to an outlier or
not. Higher values result in less rigid flagging. The default value is widely
considered apropriate in the literature.
Only effective when :py:attr:`stray_range` is not ``None``. The
'critical' value, controlling whether the MAD score is considered
referring to an outlier or not. Higher values result in less rigid
flagging. The default value is widely considered appropriate in the
literature.
min_periods_r :
Only effective when :py:attr:`stray_range` is not ``None``. Minimum number of measurements
necessary in an interval to actually perform the reduction step.
Only effective when :py:attr:`stray_range` is not ``None``. Minimum
number of measurements necessary in an interval to actually perform the
reduction step.
Notes
-----
@@ -521,33 +607,33 @@ class OutliersMixin:
(a) make them comparable and
(b) make outliers more stand out.
This step is usually subject to a phase of research/try and error. See [1] for more
details.
This step is usually subject to a phase of research/try and error. See [1]
for more details.
Note, that the data transformation as an built-in step of the algorithm, will likely
get deprecated in the future. Its better to transform the data in a processing
step, preceeding the multivariate flagging process. Also, by doing so, one gets
mutch more control and variety in the transformation applied, since the `trafo`
parameter only allows for application of the same transformation to all of the
variables involved.
Note, that the data transformation as a built-in step of the algorithm,
will likely get deprecated in the future. It's better to transform the data in
a processing step, preceding the multivariate flagging process. Also,
by doing so, one gets much more control and variety in the transformation
applied, since the `trafo` parameter only allows for application of the same
transformation to all the variables involved.
2. scoring
Every observation gets assigned a score depending on its k nearest neighbors. See
the `scoring_method` parameter description for details on the different scoring
methods. Furthermore [1] may give some insight in the pro and cons of the
different methods.
Every observation gets assigned a score depending on its k nearest neighbors.
See the `scoring_method` parameter description for details on the different
scoring methods. Furthermore, [1] may give some insight in the pro and cons of
the different methods.
3. threshing
The gaps between the (greatest) scores are tested for beeing drawn from the same
distribution as the majority of the scores. If a gap is encountered, that,
with sufficient significance, can be said to not be drawn from the same
distribution as the one all the smaller gaps are drawn from, than the observation
belonging to this gap, and all the observations belonging to gaps larger then
this gap, get flagged outliers. See description of the `threshing` parameter for
more details. Although [1] gives a fully detailed overview over the `stray`
algorithm.
The gaps between the (greatest) scores are tested for being drawn from the
same distribution as the majority of the scores. If a gap is encountered,
that, with sufficient significance, can be said to not be drawn from the same
distribution as the one all the smaller gaps are drawn from, than the
observation belonging to this gap, and all the observations belonging to gaps
larger than this gap, get flagged outliers. See description of the
`threshing` parameter for more details. Although [1] gives a fully detailed
overview over the `stray` algorithm.
References
----------
@@ -555,7 +641,6 @@ class OutliersMixin:
Anomaly Detection in High-Dimensional Data,
Journal of Computational and Graphical Statistics, 30:2, 360-374,
DOI: 10.1080/10618600.2020.1807997
"""
# parameter deprecations
@@ -563,8 +648,8 @@ class OutliersMixin:
if "partition" in kwargs:
warnings.warn(
"""
The parameter `partition` is deprecated and will be removed in version 3.0 of saqc.
Please us the parameter `window` instead.'
The parameter `partition` is deprecated and will be removed in version
3.0 of saqc. Please use the parameter `window` instead.
""",
DeprecationWarning,
)
@@ -573,8 +658,8 @@ class OutliersMixin:
if "partition_min" in kwargs:
warnings.warn(
"""
The parameter `partition_min` is deprecated and will be removed in version 3.0 of saqc.
Please us the parameter `min_periods` instead.'
The parameter `partition_min` is deprecated and will be removed in
version 3.0 of saqc. Please use the parameter `min_periods` instead.
""",
DeprecationWarning,
)
@@ -583,27 +668,32 @@ class OutliersMixin:
if min_periods != 11:
warnings.warn(
"""
You were setting a customary value for the `min_periods` parameter: note that this parameter
does no longer refer to the reduction interval length, but now controls the number of periods
having to be present in an interval of size `window` (deprecated:`partition`) for the algorithm to be
performed in that interval.
To alter the size of the reduction window, use the parameter `min_periods_r`. Changes readily apply.
Warning will be removed in saqc version 3.0.
You were setting a customary value for the `min_periods` parameter:
note that this parameter does no longer refer to the reduction interval
length, but now controls the number of periods having to be present in
an interval of size `window` (deprecated:`partition`) for the algorithm
to be performed in that interval.
To alter the size of the reduction window, use the parameter
`min_periods_r`. Changes readily apply.
This warning will be removed in saqc version 3.0.
""",
DeprecationWarning,
)
# Hint: checking is delegated to the called functions
fields = toSequence(field)
qc = self
fields_ = []
for f in fields:
field_ = str(uuid.uuid4())
self = self.copyField(field=f, target=field_)
self = self.transform(field=field_, func=trafo, freq=window)
qc = qc.copyField(field=f, target=field_)
qc = qc.transform(field=field_, func=trafo, freq=window)
fields_.append(field_)
knn_field = str(uuid.uuid4())
self = self.assignKNNScore(
qc = qc.assignKNNScore(
field=fields_,
target=knn_field,
n=n,
@@ -614,9 +704,9 @@ class OutliersMixin:
**kwargs,
)
for field_ in fields_:
self = self.dropField(field_)
qc = qc.dropField(field_)
self = self.flagByStray(
qc = qc.flagByStray(
field=knn_field,
freq=window,
min_periods=min_periods,
@@ -626,11 +716,11 @@ class OutliersMixin:
**kwargs,
)
self._data, self._flags = _evalStrayLabels(
data=self._data,
qc._data, qc._flags = _evalStrayLabels(
data=qc._data,
field=knn_field,
target=fields,
flags=self._flags,
flags=qc._flags,
reduction_range=stray_range,
reduction_drop_flagged=drop_flagged,
reduction_thresh=thresh,
@@ -638,7 +728,7 @@ class OutliersMixin:
flag=flag,
**kwargs,
)
return self.dropField(knn_field)
return qc.dropField(knn_field)
@flagging()
def flagRaise(
@@ -655,17 +745,17 @@ class OutliersMixin:
**kwargs,
) -> "SaQC":
"""
The function flags raises and drops in value courses, that exceed a certain threshold
within a certain timespan.
The function flags raises and drops in value courses, that exceed a certain
threshold within a certain timespan.
The parameter variety of the function is owned to the intriguing case of values, that
"return" from outlierish or anomalious value levels and thus exceed the threshold,
while actually being usual values.
The parameter variety of the function is owed to the intriguing case of
values that "return" from outlierish or anomalous value levels and thus
exceed the threshold, while actually being usual values.
Notes
-----
The dataset is NOT supposed to be harmonized to a time series with an equidistant
requency grid.
The dataset is NOT supposed to be harmonized to a time series with an
equidistant frequency grid.
The value :math:`x_{k}` of a time series :math:`x` with associated
timestamps :math:`t_i`, is flagged a raise, if:
@@ -674,35 +764,38 @@ class OutliersMixin:
:py:attr:`raise_window` range, so that
:math:`M = |x_k - x_s | >` :py:attr:`thresh` :math:`> 0`
2. The weighted average :math:`\\mu^{*}` of the values, preceding :math:`x_{k}`
within :py:attr:`average_window` range indicates, that :math:`x_{k}` does not
return from an "outlierish" value course, meaning that
2. The weighted average :math:`\\mu^{*}` of the values, preceding
:math:`x_{k}` within :py:attr:`average_window` range indicates,
that :math:`x_{k}` does not return from an "outlierish" value
course, meaning that
:math:`x_k > \\mu^* + ( M` / :py:attr:`raise_factor` :math:`)`
3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}` is checked for being
sufficiently divergent from its very predecessor :math:`x_{k-1}`, meaning that, it
is additionally checked if:
3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}`
is checked for being sufficiently divergent from its very predecessor
:math:`x_{k-1}`, meaning that, it is additionally checked if:
* :math:`x_k - x_{k-1} >` :py:attr:`slope`
* :math:`t_k - t_{k-1} >` :py:attr:`weight` :math:`\\times` :py:attr:`freq`
Parameters
----------
thresh :
The threshold, for the total rise (:py:attr:`thresh` ``> 0``), or total drop
(:py:attr:`thresh` ``< 0``), value courses must not exceed within a timespan
of length :py:attr:`raise_window`.
The threshold, for the total rise (:py:attr:`thresh` ``> 0``),
or total drop (:py:attr:`thresh` ``< 0``), value courses must
not exceed within a timespan of length :py:attr:`raise_window`.
raise_window :
An offset string, determining the timespan, the rise/drop thresholding refers
to. Window is inclusively defined.
An offset string, determining the timespan, the rise/drop
thresholding refers to. Window is inclusively defined.
freq :
An offset string, determining the frequency, the timeseries to flag is supposed
to be sampled at. The window is inclusively defined.
An offset string, determining the frequency, the timeseries
to flag is supposed to be sampled at. The window is inclusively
defined.
average_window :
See condition (2) of the description given in the Notes. Window is
inclusively defined, defaults to 1.5 times the size of :py:attr:`raise_window`.
See condition (2) of the description given in the Notes. Window
is inclusively defined, defaults to 1.5 times the size of
:py:attr:`raise_window`.
raise_factor :
See condition (2).
@@ -713,6 +806,10 @@ class OutliersMixin:
weight :
See condition (3).
"""
validateWindow(raise_window, "raise_window", allow_int=False)
validateWindow(freq, "freq", allow_int=False)
validateWindow(average_window, "average_window", allow_int=False, optional=True)
# prepare input args
dataseries = self._data[field].dropna()
raise_window_td = pd.Timedelta(raise_window)
@@ -834,15 +931,15 @@ class OutliersMixin:
----------
[1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
"""
msg = """
The method `flagMAD` is deprecated and will be removed in verion 3.0 of saqc.
To achieve the same behavior use:
"""
call = f"qc.flagZScore(field={field}, window={window}, method='modified', thresh={z}, min_residuals={min_residuals}, min_periods={min_periods}, center={center})"
warnings.warn(f"{msg}`{call}`", DeprecationWarning)
self = self.flagZScore(
warnings.warn(
f"The method `flagMAD` is deprecated and will be removed in "
"version 3.0 of saqc. To achieve the same behavior use: "
f"`qc.flagZScore(field={field}, window={window}, method='modified', "
f"thresh={z}, min_residuals={min_residuals}, min_periods={min_periods}, "
f"center={center})`",
DeprecationWarning,
)
return self.flagZScore(
field,
window=window,
thresh=z,
@@ -856,8 +953,6 @@ class OutliersMixin:
flag=flag,
)
return self
@flagging()
def flagOffset(
self: "SaQC",
@@ -1004,10 +1099,12 @@ class OutliersMixin:
>>> qc = qc.flagOffset("data", thresh=2, thresh_relative=-.5, tolerance=1.5, window='6H')
>>> qc.plot('data') # doctest: +SKIP
"""
if (thresh is None) and (thresh_relative is None):
validateWindow(window)
if thresh is None and thresh_relative is None:
raise ValueError(
"At least one of parameters 'thresh' and 'thresh_relative' has to be given. Got 'thresh'=None, "
"'thresh_relative'=None instead."
"At least one of parameters 'thresh' and 'thresh_relative' "
"has to be given. Got 'thresh'=None, 'thresh_relative'=None "
"instead."
)
if thresh is None:
thresh = 0
@@ -1110,6 +1207,10 @@ class OutliersMixin:
[1] https://en.wikipedia.org/wiki/Grubbs%27s_test_for_outliers
"""
validateWindow(window)
validateFraction(alpha, "alpha")
validateMinPeriods(min_periods, optional=False)
datcol = self._data[field].copy()
rate = getFreqDelta(datcol.index)
@@ -1215,25 +1316,29 @@ class OutliersMixin:
----------
[1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
"""
msg = """
The method `flagCrossStatistics` is deprecated and will be removed in verion 3.0 of saqc.
To achieve the same behavior use:
"""
new_method_string = {
"modZscore": "modified",
"Zscore": "standard",
np.mean: "standard",
np.median: "modified",
}
call = f"qc.flagZScore(field={field}, window=1, method={new_method_string[method]}, thresh={thresh}, axis=1)"
warnings.warn(f"{msg}`{call}`", DeprecationWarning)
call = (
f"qc.flagZScore(field={field}, window=1, "
f"method={new_method_string[method]}, "
f"thresh={thresh}, axis=1)"
)
warnings.warn(
f"The method `flagCrossStatistics` is deprecated and will "
f"be removed in version 3.0 of saqc. To achieve the same behavior "
f"use: `{call}`",
DeprecationWarning,
)
return self.flagZScore(
field={field},
field=field,
window=1,
method={new_method_string[method]},
thresh={thresh},
method=new_method_string[method],
thresh=thresh,
axis=1,
flag=flag,
)
@@ -1347,21 +1452,22 @@ class OutliersMixin:
method = "modified"
else:
raise ValueError(
"Support for scoring with functions not similar to either Zscore or modified Zscore is "
"not supported anymore"
"Scoring with functions not similar to either Zscore or "
"modified Zscore is no longer supported"
)
dat = self._data[field].to_pandas(how="outer")
validateChoice(method, "method", ["standard", "modified"])
validateWindow(window, optional=True)
validateMinPeriods(min_periods)
if min_residuals is None:
min_residuals = 0
min_residuals = min_residuals or 0
min_periods = min_periods or 0
dat = self._data[field].to_pandas(how="outer")
if dat.empty:
return self
if min_periods is None:
min_periods = 0
if window is None:
if dat.notna().sum().sum() >= min_periods:
if method == "standard":
@@ -1382,6 +1488,7 @@ class OutliersMixin:
)
else:
return self
else: # window is not None
if axis == 0:
if method == "standard":
@@ -1437,7 +1544,7 @@ def _evalStrayLabels(
field: str,
flags: Flags,
target: Sequence[str],
reduction_range: Optional[str] = None,
reduction_range: str | None = None,
reduction_drop_flagged: bool = False, # TODO: still a case ?
reduction_thresh: float = 3.5,
reduction_min_periods: int = 1,
Loading