From f15169efdce7dac019ecf799ecac918bc77ca8c2 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Thu, 29 Jun 2023 17:44:13 +0200 Subject: [PATCH] more checking, docstring formatting --- CHANGELOG.md | 2 + saqc/funcs/noise.py | 2 +- saqc/funcs/outliers.py | 553 ++++++++++++++++++++++----------------- saqc/funcs/scores.py | 8 +- saqc/lib/checking.py | 4 + saqc/lib/tools.py | 15 +- saqc/lib/ts_operators.py | 4 +- 7 files changed, 344 insertions(+), 244 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bddd7fbd0..4bb4402a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ SPDX-License-Identifier: GPL-3.0-or-later - added checks and unified error message for common inputs. ### Changed - pin pandas to versions >= 2.0 +- parameter `fill_na` of `SaQC.flagUniLOF` and `SaQC.assignUniLOF` is now of type + `bool` instead of one of `[None, "linear"]` ### Removed - removed deprecated `DictOfSeries.to_df` ### Fixed diff --git a/saqc/funcs/noise.py b/saqc/funcs/noise.py index a7dc1e910..0137d4a27 100644 --- a/saqc/funcs/noise.py +++ b/saqc/funcs/noise.py @@ -15,7 +15,7 @@ import pandas as pd from saqc.constants import BAD from saqc.core.register import flagging -from saqc.lib.checking import validateCallable, validateWindow, validateMinPeriods +from saqc.lib.checking import validateCallable, validateMinPeriods, validateWindow from saqc.lib.tools import isunflagged, statPass if TYPE_CHECKING: diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index 0b45dacac..04b7ce3c2 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -10,7 +10,7 @@ from __future__ import annotations import uuid import warnings -from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple +from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Tuple import numpy as np import numpy.polynomial.polynomial as poly @@ -22,6 +22,16 @@ from typing_extensions import Literal from saqc import BAD, UNFLAGGED from saqc.core import 
DictOfSeries, Flags, flagging, register from saqc.funcs.scores import _univarScoring +from saqc.lib.checking import ( + isCallable, + isFloatLike, + validateCallable, + validateChoice, + validateFrequency, + validateMinPeriods, + validateValueBounds, + validateWindow, +) from saqc.lib.docs import DOC_TEMPLATES from saqc.lib.rolling import windowRoller from saqc.lib.tools import getFreqDelta, isflagged, toSequence @@ -31,6 +41,24 @@ if TYPE_CHECKING: class OutliersMixin: + @staticmethod + def _validateLOF(n, thresh, algorithm, p, density): + """validate parameter for LOF and UniLOF""" + validateValueBounds(n, "n", left=0, strict_int=True) + validateValueBounds(p, "p", left=0, strict_int=True) + + validateChoice( + algorithm, "algorithm", ["ball_tree", "kd_tree", "brute", "auto"] + ) + + if thresh != "auto" and not isFloatLike(thresh): + raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}") + + if density != "auto" and not isFloatLike(density) and not isCallable(density): + raise ValueError( + f"'density' must be 'auto' or a float or a function, not {density}" + ) + @register( mask=["field"], demask=["field"], @@ -55,58 +83,76 @@ class OutliersMixin: Parameters ---------- n : - Number of neighbors to be included into the LOF calculation. Defaults to ``20``, which is a + Number of neighbors to be included into the LOF calculation. + Defaults to ``20``, which is a value found to be suitable in the literature. - * :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors) - and sets the upper limit to the number of values in outlier clusters (i.e. consecutive outliers). Outlier - clusters of size greater than :py:attr:`n`/2 may not be detected reliably. - * The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small - or singleton outliers points. Higher values greatly increase numerical costs. 
+ * :py:attr:`n` determines the "locality" of an observation + (its :py:attr:`n` nearest neighbors) and sets the upper + limit to the number of values in outlier clusters (i.e. + consecutive outliers). Outlier clusters of size greater + than :py:attr:`n`/2 may not be detected reliably. + * The larger :py:attr:`n`, the lesser the algorithm's sensitivity + to local outliers and small or singleton outliers points. + Higher values greatly increase numerical costs. thresh : - The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and - most likely corresponds to inlier points. + The threshold for flagging the calculated LOF. A LOF of around + ``1`` is considered normal and most likely corresponds to + inlier points. - * The "automatic" threshing introduced with the publication of the algorithm defaults to ``1.5``. - * In this implementation, :py:attr:`thresh` defaults (``'auto'``) to flagging the scores with a - modified 3-sigma rule, resulting in a :py:attr:`thresh` `` > 1.5`` which usually mitigates - overflagging compared to the literature recommendation. + * The "automatic" threshing introduced with the publication + of the algorithm defaults to ``1.5``. + * In this implementation, :py:attr:`thresh` defaults (``'auto'``) + to flagging the scores with a modified 3-sigma rule, resulting + in a :py:attr:`thresh` `` > 1.5`` which usually mitigates + over-flagging compared to the literature recommendation. algorithm : Algorithm used for calculating the :py:attr:`n`-nearest neighbors. p : - Degree of the metric ("Minkowski"), according to which the distance to neighbors is determined. - Most important values are: + Degree of the metric ("Minkowski"), according to which the + distance to neighbors is determined. Most important values are: - * ``1`` - Manhatten Metric + * ``1`` - Manhattan Metric * ``2`` - Euclidian Metric + density : + How to calculate the temporal distance/density for the variable to flag. 
+ + * ``'auto'`` - introduces linear density with an increment + equal to the median of the absolute diff of the variable to flag. + * ``float`` - introduces linear density with an increment + equal to :py:attr:`density` + * Callable - calculates the density by applying the function + passed onto the variable to flag (passed as Series). + Notes ----- - * The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local Outlier Factor (LOF) for every point - in the input timeseries. The *LOF* is a scalar value, that roughly correlates to the *reachability*, - or "outlierishnes" of the evaluated datapoint. If a point is as reachable, as all its - :py:attr:`n`-nearest neighbors, the *LOF* score evaluates to around ``1``. If it is only as half as - reachable as all its ``n``-nearest neighbors are (so to say, as double as "outlierish"), the score - is about ``2``. So, the Local Outlier *Factor* relates a point's *reachability* to the *reachability* - of its :py:attr:`n`-nearest neighbors in a multiplicative fashion (as a "factor"). - * The *reachability* of a point thereby is determined as an aggregation of the points distances to its - :py:attr:`n`-nearest neighbors, measured with regard to the minkowski metric of degree :py:attr:`p` + * The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local + Outlier Factor (LOF) for every point in the input timeseries. + The *LOF* is a scalar value, that roughly correlates to the + *reachability*, or "outlierishnes" of the evaluated datapoint. + If a point is as reachable, as all its :py:attr:`n`-nearest + neighbors, the *LOF* score evaluates to around ``1``. If it + is only as half as reachable as all its ``n``-nearest neighbors + are (so to say, as double as "outlierish"), the score is about + ``2``. So, the Local Outlier *Factor* relates a point's *reachability* + to the *reachability* of its :py:attr:`n`-nearest neighbors + in a multiplicative fashion (as a "factor"). 
+ * The *reachability* of a point thereby is determined as an aggregation + of the points distances to its :py:attr:`n`-nearest neighbors, + measured with regard to the minkowski metric of degree :py:attr:`p` (usually euclidean). - * To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut off at a level, - determined by :py:attr:`thresh`. + * To derive a binary label for every point (outlier: *yes*, or *no*), + the scores are cut off at a level, determined by :py:attr:`thresh`. """ - if not (density == "auto" or isinstance(density, float) or callable(density)): - raise ValueError( - "'density' must be 'auto' or a float or a function, " f"not {density}" - ) - + self._validateLOF(n, thresh, algorithm, p, density) fields = toSequence(field) field_ = str(uuid.uuid4()) - self = self.assignLOF( + qc = self.assignLOF( field=fields, target=field_, n=n, @@ -114,7 +160,7 @@ class OutliersMixin: p=p, density=density, ) - s = self.data[field_] + s = qc.data[field_] if thresh == "auto": s = pd.concat([s, (-s - 2)]) s_mask = (s - s.mean() / s.std())[: len(s) // 2].abs() > 3 @@ -122,10 +168,10 @@ class OutliersMixin: s_mask = s < abs(thresh) for f in fields: - mask = ~isflagged(self._flags[f], kwargs["dfilter"]) & s_mask - self._flags[mask, f] = flag + mask = ~isflagged(qc._flags[f], kwargs["dfilter"]) & s_mask + qc._flags[mask, f] = flag - return self.dropField(field_) + return qc.dropField(field_) @flagging() def flagUniLOF( @@ -136,95 +182,120 @@ class OutliersMixin: algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree", p: int = 1, density: Literal["auto"] | float | Callable = "auto", - fill_na: str = "linear", + fill_na: bool = True, flag: float = BAD, **kwargs, ) -> "SaQC": """ Flag "univariate" Local Outlier Factor (LOF) exceeding cutoff. 
- The function is a wrapper around a usual LOF implementation, aiming for an easy to use, - parameter minimal outlier detection function for single variables, that does not necessitate - prior modelling of the variable. LOF is applied onto a concatenation of the `field` variable - and a "temporal density", or "penalty" variable, that measures temporal distance between data - points. See notes Section for a more exhaustive explaination. - - See the Notes section for more details on the algorithm. + The function is a wrapper around a usual LOF implementation, aiming + for an easy to use, parameter minimal outlier detection function + for single variables, that does not necessitate prior modelling + of the variable. LOF is applied onto a concatenation of the `field` + variable and a "temporal density", or "penalty" variable, that + measures temporal distance between data points. See notes Section + for a more exhaustive explaination. See the Notes section for + more details on the algorithm. Parameters ---------- n : - Number of periods to be included into the LOF calculation. Defaults to `20`, which is a - value found to be suitable in the literature. - - * :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors) - and sets the upper limit to the number of values in an outlier clusters (i.e. consecutive outliers). Outlier - clusters of size greater than :py:attr:`n`/2 may not be detected reliably. - * The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small - or singleton outlier points. Higher values greatly increase numerical costs. + Number of periods to be included into the LOF calculation. + Defaults to `20`, which is a value found to be suitable in + the literature. + + * :py:attr:`n` determines the "locality" of an observation + (its :py:attr:`n` nearest neighbors) and sets the upper + limit to the number of values in an outlier clusters (i.e. + consecutive outliers). 
Outlier clusters of size greater + than :py:attr:`n`/2 may not be detected reliably. + * The larger :py:attr:`n`, the lesser the algorithm's sensitivity + to local outliers and small or singleton outlier points. + Higher values greatly increase numerical costs. thresh : - The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and - most likely corresponds to inlier points. This parameter is considered the main calibration + The threshold for flagging the calculated LOF. A LOF of around + ``1`` is considered normal and most likely corresponds to + inlier points. This parameter is considered the main calibration parameter of the algorithm. - * The threshing defaults to ``1.5``, wich is the default value found to be suitable in the literature. - * ``'auto'`` enables flagging the scores with a modified 3-sigma rule, - resulting in a thresh around ``4``, which usually greatly mitigates overflagging compared to the - literature recommendation, but often is too high. - * sensitive range for the parameter may be ``[1,15]``, assuming default settings for the other parameters. + * The threshing defaults to ``1.5``, wich is the default value + found to be suitable in the literature. + * ``'auto'`` enables flagging the scores with a modified 3-sigma + rule, resulting in a thresh around ``4``, which usually + greatly mitigates overflagging compared to the literature + recommendation, but often is too high. + * sensitive range for the parameter may be ``[1,15]``, assuming + default settings for the other parameters. algorithm : - Algorithm used for calculating the :py:attr:`n`-nearest neighbors needed for LOF calculation. + Algorithm used for calculating the :py:attr:`n`-nearest neighbors + needed for LOF calculation. + p : - Degree of the metric ("Minkowski"), according to which distance to neighbors is determined. - Most important values are: + Degree of the metric ("Minkowski"), according to which distance + to neighbors is determined. 
Most important values are: * ``1`` - Manhatten Metric * ``2`` - Euclidian Metric + density : - How to calculate the temporal distance/density for the variable to flag. + How to calculate the temporal distance/density for the variable + to flag. - * ``'auto'`` - introduces linear density with an increment equal to the median of the absolute - diff of the variable to flag. - * ``float`` - introduces linear density with an increment equal to :py:attr:`density` - * Callable - calculates the density by applying the function passed onto the variable to flag - (passed as Series). + * ``'auto'`` - introduces linear density with an increment + equal to the median of the absolute diff of the variable to flag. + * ``float`` - introduces linear density with an increment + equal to :py:attr:`density` + * Callable - calculates the density by applying the function + passed onto the variable to flag (passed as Series). fill_na : - Weather or not to fill NaN values in the data with a linear interpolation. + If True, NaNs in the data are filled with a linear interpolation. See Also -------- - :ref:`introduction to outlier detection with saqc <cookbooks/OutlierDetection:Outlier Detection>` + :ref:`introduction to outlier detection with + saqc <cookbooks/OutlierDetection:Outlier Detection>` Notes ----- - * The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an univariate - Local Outlier Factor (UniLOF) - score for every point in the one dimensional input - data series. - The *UniLOF* score of any data point is a scalar value, that roughly correlates to - its *reachability*, or "outlierishnes" in the 2-dimensional space constituted by the - data-values and the time axis. So the Algorithm basically operates on the "graph", - or the "plot" of the input timeseries. + + * The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an + univariate Local Outlier Factor (UniLOF) - score for every + point in the one dimensional input data series. 
The *UniLOF* + score of any data point is a scalar value, that roughly correlates + to its *reachability*, or "outlierishnes" in the 2-dimensional + space constituted by the data-values and the time axis. So + the Algorithm basically operates on the "graph", or the "plot" + of the input timeseries. + * If a point in this "graph" is as reachable, as all its :py:attr:`n`-nearest - neighbors, its *UniLOF* score evaluates to around ``1``. If it is only as half as - reachable as all its :py:attr:`n` neighbors are - (so to say, as double as "outlierish"), its score evaluates to ``2`` roughly. - So, the Univariate Local Outlier *Factor* relates a points *reachability* to the - *reachability* of its :py:attr:`n`-nearest neighbors in a multiplicative fashion + neighbors, its *UniLOF* score evaluates to around ``1``. If + it is only as half as reachable as all its :py:attr:`n` neighbors + are (so to say, as double as "outlierish"), its score evaluates + to ``2`` roughly. So, the Univariate Local Outlier *Factor* + relates a points *reachability* to the *reachability* of its + :py:attr:`n`-nearest neighbors in a multiplicative fashion (as a "factor"). - * The *reachability* of a point thereby is derived as an aggregation of the points - distance to its :py:attr:`n`-nearest neighbors, measured with regard to the minkowski - metric of degree :py:attr:`p` (usually euclidean). - * The parameter :py:attr:`density` thereby determines how dimensionality of the time is - removed, to make it a dimensionless, real valued coordinate. - * To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut - off at a level, determined by :py:attr:`thresh`. + + * The *reachability* of a point thereby is derived as an aggregation + of the points distance to its :py:attr:`n`-nearest neighbors, + measured with regard to the minkowski metric of degree :py:attr:`p` + (usually euclidean). 
+ + * The parameter :py:attr:`density` thereby determines how dimensionality + of the time is removed, to make it a dimensionless, real valued + coordinate. + + * To derive a binary label for every point (outlier: *yes*, or + *no*), the scores are cut off at a level, determined by :py:attr:`thresh`. Examples -------- - See the :ref:`outlier detection cookbook <cookbooks/OutlierDetection:Outlier Detection>` for a detailed + See the :ref:`outlier detection cookbook + <cookbooks/OutlierDetection:Outlier Detection>` for a detailed introduction into the usage and tuning of the function. .. plot:: @@ -241,8 +312,10 @@ class OutliersMixin: Example usage with default parameter configuration: - Loading data via pandas csv file parser, casting index to DateTime, generating a :py:class:`~saqc.SaQC` - instance from the data and plotting the variable representing light scattering at 254 nanometers wavelength. + Loading data via pandas csv file parser, casting index to DateTime, + generating a :py:class:`~saqc.SaQC` instance from the data and + plotting the variable representing light scattering at 254 nanometers + wavelength. .. doctest:: flagUniLOFExample @@ -260,8 +333,9 @@ class OutliersMixin: qc.plot('sac254_raw') - We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter values. Meaning, that the main - calibration paramters :py:attr:`n` and :py:attr:`thresh` evaluate to `20` and `1.5` respectively. + We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter + values. Meaning, that the main calibration paramters :py:attr:`n` + and :py:attr:`thresh` evaluate to `20` and `1.5` respectively. .. 
doctest:: flagUniLOFExample @@ -278,27 +352,29 @@ class OutliersMixin: qc.plot('sac254_raw') """ - field_ = str(uuid.uuid4()) - self = self.assignUniLOF( + self._validateLOF(n, thresh, algorithm, p, density) + + tmp_field = str(uuid.uuid4()) + qc = self.assignUniLOF( field=field, - target=field_, + target=tmp_field, n=n, algorithm=algorithm, p=p, density=density, fill_na=fill_na, ) - s = self.data[field_] + s = qc.data[tmp_field] if thresh == "auto": _s = pd.concat([s, (-s - 2)]) s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3 else: s_mask = s < -abs(thresh) - s_mask = ~isflagged(self._flags[field], kwargs["dfilter"]) & s_mask - self._flags[s_mask, field] = flag - self = self.dropField(field_) - return self + s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask + qc._flags[s_mask, field] = flag + qc = qc.dropField(tmp_field) + return qc @flagging() def flagRange( @@ -310,7 +386,8 @@ class OutliersMixin: **kwargs, ) -> "SaQC": """ - Function flags values exceeding the closed interval [:py:attr:`min`, :py:attr:`max`]. + Function flags values exceeding the closed + interval [:py:attr:`min`, :py:attr:`max`]. Parameters ---------- @@ -319,7 +396,6 @@ class OutliersMixin: max : Upper bound for valid data. """ - # using .values is much faster datacol = self._data[field].to_numpy() mask = (datacol < min) | (datacol > max) @@ -346,28 +422,28 @@ class OutliersMixin: ---------- window : - Determines the segmentation of the data into partitions, the kNN algorithm is - applied onto individually. + Determines the segmentation of the data into partitions, the + kNN algorithm is applied onto individually. * ``None``: Apply Scoring on whole data set at once - * ``int``: Apply scoring on successive data chunks of periods with the given length. - Must be greater than 0. 
- * Offset String : Apply scoring on successive partitions of temporal extension - matching the passed offset string + * ``int``: Apply scoring on successive data chunks of periods + with the given length. Must be greater than 0. + * offset String : Apply scoring on successive partitions of + temporal extension matching the passed offset string min_periods : - Minimum number of periods per partition that have to be present for a valid - outlier detection to be made in this partition (only of effect, if :py:attr:`freq` - is an integer). + Minimum number of periods per partition that have to be present + for a valid outlier detection to be made in this partition iter_start : - Float in ``[0, 1]`` that determines which percentage of data is considered - "normal". ``0.5`` results in the stray algorithm to search only the upper 50% of - the scores for the cut off point. (See reference section for more information) + Float in ``[0, 1]`` that determines which percentage of data + is considered "normal". ``0.5`` results in the stray algorithm + to search only the upper 50% of the scores for the cut off + point. (See reference section for more information) alpha : - Level of significance by which it is tested, if a score might be drawn from - another distribution than the majority of the data. + Level of significance by which it is tested, if a score might + be drawn from another distribution than the majority of the data. 
References ---------- @@ -378,36 +454,36 @@ class OutliersMixin: """ scores = self._data[field].dropna() + if window is None: + window = len(scores) + if not isinstance(window, int): + validateFrequency(window, "window") + + validateMinPeriods(min_periods) + validateValueBounds(iter_start, "iter_start", left=0, right=1, closed="both") + if scores.empty: return self - if not window: - window = len(scores) - - if isinstance(window, str): + if isinstance(window, int): + s = pd.Series(data=np.arange(0, len(scores)), index=scores.index) + s = s.transform(lambda x: int(np.floor(x / window))) + partitions = scores.groupby(s) + else: # pd.Timedelta pd.DateOffset or str partitions = scores.groupby(pd.Grouper(freq=window)) - else: - grouper_series = pd.Series( - data=np.arange(0, len(scores)), index=scores.index - ) - grouper_series = grouper_series.transform( - lambda x: int(np.floor(x / window)) - ) - partitions = scores.groupby(grouper_series) - # calculate flags for every window for _, partition in partitions: - if partition.empty | (len(partition) < min_periods): - continue - sample_size = len(partition) + if partition.empty or sample_size < min_periods: + continue + sorted_i = partition.values.argsort() resids = partition.values[sorted_i] gaps = np.append(0, np.diff(resids)) - tail_size = int(max(min(50, np.floor(sample_size / 4)), 2)) + tail_size = int(max(min(np.floor(sample_size / 4), 50), 2)) tail_indices = np.arange(2, tail_size + 1) i_start = int(max(np.floor(sample_size * iter_start), 1) + 1) @@ -452,69 +528,75 @@ class OutliersMixin: flag: float = BAD, **kwargs, ) -> "SaQC": + validateCallable(trafo, "trafo") """ - The algorithm implements a 3-step outlier detection procedure for simultaneously - flagging of higher dimensional data (dimensions > 3). + The algorithm implements a 3-step outlier detection procedure for + simultaneously flagging of higher dimensional data (dimensions > 3). 
- In [1], the procedure is introduced and exemplified with an application on hydrological - data. See the notes section for an overview over the algorithms basic steps. + In [1], the procedure is introduced and exemplified with an application on + hydrological data. See the notes section for an overview over the algorithms + basic steps. Parameters ---------- - trafo : default identity - Transformation to be applied onto every column before scoring. For more fine-grained - control, the data could also be transformed before :py:meth:`~saqc.SaQC.flagMVScores` - is called. + trafo : + Transformation to be applied onto every column before scoring. For more + fine-grained control, the data could also be transformed before + :py:meth:`~saqc.SaQC.flagMVScores` is called. alpha : Level of significance by which it is tested, if an observations score might be drawn from another distribution than the majority of the data. - n : + n : Number of neighbors included in the scoring process for every datapoint. - func : default sum - Function that aggregates a value's k-smallest distances, returning a scalar score. + func : + Function that aggregates a value's k-smallest distances, returning a + scalar score. iter_start : - Value in ``[0,1]`` that determines which percentage of data is considered - "normal". 0.5 results in the threshing algorithm to search only the upper 50% - of the scores for the cut off point. (See reference section for more + Value in ``[0,1]`` that determines which percentage of data is considered + "normal". 0.5 results in the threshing algorithm to search only the upper + 50% of the scores for the cut-off point. (See reference section for more information) window : - Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines the - size of the data partitions, the data is decomposed into. Each partition is checked - seperately for outliers. 
- Either given as an Offset String, denoting the windows temporal extension or - as an integer, denoting the windows number of periods. ``NaN`` also count as periods. - If ``None``, all data points share the same scoring window, which than equals the whole - data. + Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines + the size of the data partitions, the data is decomposed into. Each + partition is checked seperately for outliers. Either given as an Offset + String, denoting the windows temporal extension or as an integer, + denoting the windows number of periods. ``NaN`` also count as periods. If + ``None``, all data points share the same scoring window, which than + equals the whole data. min_periods : - Only effective if :py:attr:`threshing` is set to ``'stray'`` and :py:attr:`partition` is an integer. - Minimum number of periods per :py:attr:`partition` that have to be present for a valid outlier + Only effective if :py:attr:`threshing` is set to ``'stray'`` and + :py:attr:`partition` is an integer. Minimum number of periods per + :py:attr:`partition` that have to be present for a valid outlier detection to be made in this partition. stray_range : - If not ``None``, it is tried to reduce the stray result onto single outlier components - of the input :py:attr:`field`. The offset string denotes the range of the - temporal surrounding to include into the MAD testing while trying to reduce - flags. + If not ``None``, it is tried to reduce the stray result onto single + outlier components of the input :py:attr:`field`. The offset string + denotes the range of the temporal surrounding to include into the MAD + testing while trying to reduce flags. drop_flagged : - Only effective when :py:attr:`stray_range` is not ``None``. Whether or not to drop flagged - values from the temporal surroundings. + Only effective when :py:attr:`stray_range` is not ``None``. Whether or + not to drop flagged values from the temporal surroundings. 
thresh : - Only effective when :py:attr:`stray_range` is not ``None``. The 'critical' value, - controlling wheather the MAD score is considered referring to an outlier or - not. Higher values result in less rigid flagging. The default value is widely - considered apropriate in the literature. + Only effective when :py:attr:`stray_range` is not ``None``. The + 'critical' value, controlling wheather the MAD score is considered + referring to an outlier or not. Higher values result in less rigid + flagging. The default value is widely considered apropriate in the + literature. min_periods_r : - Only effective when :py:attr:`stray_range` is not ``None``. Minimum number of measurements - necessary in an interval to actually perform the reduction step. + Only effective when :py:attr:`stray_range` is not ``None``. Minimum + number of measurements necessary in an interval to actually perform the + reduction step. Notes ----- @@ -526,33 +608,33 @@ class OutliersMixin: (a) make them comparable and (b) make outliers more stand out. - This step is usually subject to a phase of research/try and error. See [1] for more - details. + This step is usually subject to a phase of research/try and error. See [1] + for more details. - Note, that the data transformation as an built-in step of the algorithm, will likely - get deprecated in the future. Its better to transform the data in a processing - step, preceeding the multivariate flagging process. Also, by doing so, one gets - mutch more control and variety in the transformation applied, since the `trafo` - parameter only allows for application of the same transformation to all of the - variables involved. + Note, that the data transformation as a built-in step of the algorithm, + will likely get deprecated in the future. It's better to transform the data in + a processing step, preceeding the multivariate flagging process. 
Also, + by doing so, one gets mutch more control and variety in the transformation + applied, since the `trafo` parameter only allows for application of the same + transformation to all the variables involved. 2. scoring - Every observation gets assigned a score depending on its k nearest neighbors. See - the `scoring_method` parameter description for details on the different scoring - methods. Furthermore [1] may give some insight in the pro and cons of the - different methods. + Every observation gets assigned a score depending on its k nearest neighbors. + See the `scoring_method` parameter description for details on the different + scoring methods. Furthermore, [1] may give some insight in the pro and cons of + the different methods. 3. threshing - The gaps between the (greatest) scores are tested for beeing drawn from the same - distribution as the majority of the scores. If a gap is encountered, that, - with sufficient significance, can be said to not be drawn from the same - distribution as the one all the smaller gaps are drawn from, than the observation - belonging to this gap, and all the observations belonging to gaps larger then - this gap, get flagged outliers. See description of the `threshing` parameter for - more details. Although [1] gives a fully detailed overview over the `stray` - algorithm. + The gaps between the (greatest) scores are tested for beeing drawn from the + same distribution as the majority of the scores. If a gap is encountered, + that, with sufficient significance, can be said to not be drawn from the same + distribution as the one all the smaller gaps are drawn from, than the + observation belonging to this gap, and all the observations belonging to gaps + larger than this gap, get flagged outliers. See description of the + `threshing` parameter for more details. Although [1] gives a fully detailed + overview over the `stray` algorithm. 
References ---------- @@ -560,7 +642,6 @@ class OutliersMixin: Anomaly Detection in High-Dimensional Data, Journal of Computational and Graphical Statistics, 30:2, 360-374, DOI: 10.1080/10618600.2020.1807997 - """ # parameter deprecations @@ -568,8 +649,8 @@ class OutliersMixin: if "partition" in kwargs: warnings.warn( """ - The parameter `partition` is deprecated and will be removed in version 3.0 of saqc. - Please us the parameter `window` instead.' + The parameter `partition` is deprecated and will be removed in version + 3.0 of saqc. Please us the parameter `window` instead. """, DeprecationWarning, ) @@ -578,8 +659,8 @@ class OutliersMixin: if "partition_min" in kwargs: warnings.warn( """ - The parameter `partition_min` is deprecated and will be removed in version 3.0 of saqc. - Please us the parameter `min_periods` instead.' + The parameter `partition_min` is deprecated and will be removed in + version 3.0 of saqc. Please us the parameter `min_periods` instead. """, DeprecationWarning, ) @@ -588,27 +669,30 @@ class OutliersMixin: if min_periods != 11: warnings.warn( """ - You were setting a customary value for the `min_periods` parameter: note that this parameter - does no longer refer to the reduction interval length, but now controls the number of periods - having to be present in an interval of size `window` (deprecated:`partition`) for the algorithm to be - performed in that interval. - To alter the size of the reduction window, use the parameter `min_periods_r`. Changes readily apply. - Warning will be removed in saqc version 3.0. + You were setting a customary value for the `min_periods` parameter: + note that this parameter does no longer refer to the reduction interval + length, but now controls the number of periods having to be present in + an interval of size `window` (deprecated:`partition`) for the algorithm + to be performed in that interval. + To alter the size of the reduction window, use the parameter + `min_periods_r`. Changes readily apply. 
+ This warning will be removed in saqc version 3.0. """, DeprecationWarning, ) fields = toSequence(field) + qc = self fields_ = [] for f in fields: field_ = str(uuid.uuid4()) - self = self.copyField(field=f, target=field_) - self = self.transform(field=field_, func=trafo, freq=window) + qc = qc.copyField(field=f, target=field_) + qc = qc.transform(field=field_, func=trafo, freq=window) fields_.append(field_) knn_field = str(uuid.uuid4()) - self = self.assignKNNScore( + qc = qc.assignKNNScore( field=fields_, target=knn_field, n=n, @@ -619,9 +703,9 @@ class OutliersMixin: **kwargs, ) for field_ in fields_: - self = self.dropField(field_) + qc = qc.dropField(field_) - self = self.flagByStray( + qc = qc.flagByStray( field=knn_field, freq=window, min_periods=min_periods, @@ -631,11 +715,11 @@ class OutliersMixin: **kwargs, ) - self._data, self._flags = _evalStrayLabels( - data=self._data, + qc._data, qc._flags = _evalStrayLabels( + data=qc._data, field=knn_field, target=fields, - flags=self._flags, + flags=qc._flags, reduction_range=stray_range, reduction_drop_flagged=drop_flagged, reduction_thresh=thresh, @@ -643,7 +727,7 @@ class OutliersMixin: flag=flag, **kwargs, ) - return self.dropField(knn_field) + return qc.dropField(knn_field) @flagging() def flagRaise( @@ -660,17 +744,17 @@ class OutliersMixin: **kwargs, ) -> "SaQC": """ - The function flags raises and drops in value courses, that exceed a certain threshold - within a certain timespan. + The function flags raises and drops in value courses, that exceed a certain + threshold within a certain timespan. - The parameter variety of the function is owned to the intriguing case of values, that - "return" from outlierish or anomalious value levels and thus exceed the threshold, - while actually being usual values. 
+ The parameter variety of the function is owed to the intriguing case of + values, that "return" from outlierish or anomalous value levels and thus + exceed the threshold, while actually being usual values. Notes ----- - The dataset is NOT supposed to be harmonized to a time series with an equidistant - requency grid. + The dataset is NOT supposed to be harmonized to a time series with an + equidistant frequency grid. The value :math:`x_{k}` of a time series :math:`x` with associated timestamps :math:`t_i`, is flagged a raise, if: @@ -679,35 +763,38 @@ :py:attr:`raise_window` range, so that :math:`M = |x_k - x_s | >` :py:attr:`thresh` :math:`> 0` - 2. The weighted average :math:`\\mu^{*}` of the values, preceding :math:`x_{k}` - within :py:attr:`average_window` range indicates, that :math:`x_{k}` does not - return from an "outlierish" value course, meaning that + 2. The weighted average :math:`\\mu^{*}` of the values, preceding + :math:`x_{k}` within :py:attr:`average_window` range indicates, + that :math:`x_{k}` does not return from an "outlierish" value + course, meaning that :math:`x_k > \\mu^* + ( M` / :py:attr:`raise_factor` :math:`)` - 3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}` is checked for being - sufficiently divergent from its very predecessor :math:`x_{k-1}`, meaning that, it - is additionally checked if: + 3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}` + is checked for being sufficiently divergent from its very predecessor + :math:`x_{k-1}`, meaning that, it is additionally checked if: * :math:`x_k - x_{k-1} >` :py:attr:`slope` * :math:`t_k - t_{k-1} >` :py:attr:`weight` :math:`\\times` :py:attr:`freq` Parameters ---------- thresh : - The threshold, for the total rise (:py:attr:`thresh` ``> 0``), or total drop - (:py:attr:`thresh` ``< 0``), value courses must not exceed within a timespan - of length :py:attr:`raise_window`.
+ The threshold, for the total rise (:py:attr:`thresh` ``> 0``), + or total drop (:py:attr:`thresh` ``< 0``), value courses must + not exceed within a timespan of length :py:attr:`raise_window`. raise_window : - An offset string, determining the timespan, the rise/drop thresholding refers - to. Window is inclusively defined. + An offset string, determining the timespan, the rise/drop + thresholding refers to. Window is inclusively defined. freq : - An offset string, determining the frequency, the timeseries to flag is supposed - to be sampled at. The window is inclusively defined. + An offset string, determining the frequency, the timeseries + to flag is supposed to be sampled at. The window is inclusively + defined. average_window : - See condition (2) of the description given in the Notes. Window is - inclusively defined, defaults to 1.5 times the size of :py:attr:`raise_window`. + See condition (2) of the description given in the Notes. Window + is inclusively defined, defaults to 1.5 times the size of + :py:attr:`raise_window`. raise_factor : See condition (2). diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py index 5110d824f..d0aa2daa2 100644 --- a/saqc/funcs/scores.py +++ b/saqc/funcs/scores.py @@ -405,7 +405,7 @@ class ScoresMixin: algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree", p: int = 1, density: Literal["auto"] | float | Callable = "auto", - fill_na: str = "linear", + fill_na: bool = True, **kwargs, ) -> "SaQC": """ @@ -449,7 +449,7 @@ class ScoresMixin: (passed as Series). fill_na : - Weather or not to fill NaN values in the data with a linear interpolation. + If True, NaNs in the data are filled with a linear interpolation. 
Notes ----- @@ -465,8 +465,8 @@ class ScoresMixin: """ vals = self._data[field] - if fill_na is not None: - vals = vals.interpolate(fill_na) + if fill_na: + vals = vals.interpolate("linear") if density == "auto": density = vals.diff().abs().median() diff --git a/saqc/lib/checking.py b/saqc/lib/checking.py index abebc510a..be59b69f2 100644 --- a/saqc/lib/checking.py +++ b/saqc/lib/checking.py @@ -31,6 +31,10 @@ def isBoolLike(obj: Any, optional: bool = False) -> bool: ) +def isFloatLike(obj: Any) -> bool: + return isinstance(obj, (float, int)) + + def isIterable(obj: Any) -> bool: if isinstance(obj, Iterable) or pd.api.types.is_iterator(obj): return True diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index f22bace02..c06278096 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -26,6 +26,7 @@ from typing import ( Union, get_args, get_origin, + overload, ) import numpy as np @@ -57,12 +58,18 @@ def assertScalar(name, value, optional=False): return validateScalar(name=name, value=value, optional=optional) -def toSequence(value: T | Sequence[T]) -> List[T]: - if value is None: # special case - return [None] - if isinstance(value, (str, float, int)): +# fmt: off +@overload +def toSequence(value: T) -> List[T]: + ... +@overload +def toSequence(value: Sequence[T]) -> List[T]: + ... +def toSequence(value) -> List: + if value is None or isinstance(value, (str, float, int)): return [value] return list(value) +# fmt: on def squeezeSequence(value: Sequence[T]) -> Union[T, Sequence[T]]: diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py index 976754549..b0b7dba71 100644 --- a/saqc/lib/ts_operators.py +++ b/saqc/lib/ts_operators.py @@ -12,7 +12,7 @@ The module gathers all kinds of timeseries tranformations. 
import re import sys import warnings -from typing import Union, Literal +from typing import Literal, Union import numpy as np import numpy.polynomial.polynomial as poly @@ -485,7 +485,7 @@ def shift2Freq( shift timestamps backwards/forwards in order to align them with an equidistant frequency grid. Resulting Nan's are replaced with the fill-value. """ - validateWindow(freq, 'freq', allow_int=False) + validateWindow(freq, "freq", allow_int=False) validateChoice(method, "method", ["fshift", "bshift", "nshift"]) methods = { "fshift": lambda freq: ("ffill", pd.Timedelta(freq)), -- GitLab