From f15169efdce7dac019ecf799ecac918bc77ca8c2 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Thu, 29 Jun 2023 17:44:13 +0200 Subject: [PATCH] more checking, docstring formatting --- CHANGELOG.md | 2 + saqc/funcs/noise.py | 2 +- saqc/funcs/outliers.py | 553 ++++++++++++++++++++++----------------- saqc/funcs/scores.py | 8 +- saqc/lib/checking.py | 4 + saqc/lib/tools.py | 15 +- saqc/lib/ts_operators.py | 4 +- 7 files changed, 344 insertions(+), 244 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bddd7fbd0..4bb4402a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ SPDX-License-Identifier: GPL-3.0-or-later - added checks and unified error message for common inputs. ### Changed - pin pandas to versions >= 2.0 +- parameter `fill_na` of `SaQC.flagUniLOF` and `SaQC.assignUniLOF` is now of type + `bool` instead of one of `[None, "linear"]` ### Removed - removed deprecated `DictOfSeries.to_df` ### Fixed diff --git a/saqc/funcs/noise.py b/saqc/funcs/noise.py index a7dc1e910..0137d4a27 100644 --- a/saqc/funcs/noise.py +++ b/saqc/funcs/noise.py @@ -15,7 +15,7 @@ import pandas as pd from saqc.constants import BAD from saqc.core.register import flagging -from saqc.lib.checking import validateCallable, validateWindow, validateMinPeriods +from saqc.lib.checking import validateCallable, validateMinPeriods, validateWindow from saqc.lib.tools import isunflagged, statPass if TYPE_CHECKING: diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index 0b45dacac..04b7ce3c2 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -10,7 +10,7 @@ from __future__ import annotations import uuid import warnings -from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple +from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Tuple import numpy as np import numpy.polynomial.polynomial as poly @@ -22,6 +22,16 @@ from typing_extensions import Literal from saqc import BAD, UNFLAGGED from saqc.core import 
DictOfSeries, Flags, flagging, register from saqc.funcs.scores import _univarScoring +from saqc.lib.checking import ( + isCallable, + isFloatLike, + validateCallable, + validateChoice, + validateFrequency, + validateMinPeriods, + validateValueBounds, + validateWindow, +) from saqc.lib.docs import DOC_TEMPLATES from saqc.lib.rolling import windowRoller from saqc.lib.tools import getFreqDelta, isflagged, toSequence @@ -31,6 +41,24 @@ if TYPE_CHECKING: class OutliersMixin: + @staticmethod + def _validateLOF(n, thresh, algorithm, p, density): + """validate parameter for LOF and UniLOF""" + validateValueBounds(n, "n", left=0, strict_int=True) + validateValueBounds(p, "p", left=0, strict_int=True) + + validateChoice( + algorithm, "algorithm", ["ball_tree", "kd_tree", "brute", "auto"] + ) + + if thresh != "auto" and not isFloatLike(thresh): + raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}") + + if density != "auto" and not isFloatLike(density) and not isCallable(density): + raise ValueError( + f"'density' must be 'auto' or a float or a function, not {density}" + ) + @register( mask=["field"], demask=["field"], @@ -55,58 +83,76 @@ class OutliersMixin: Parameters ---------- n : - Number of neighbors to be included into the LOF calculation. Defaults to ``20``, which is a + Number of neighbors to be included into the LOF calculation. + Defaults to ``20``, which is a value found to be suitable in the literature. - * :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors) - and sets the upper limit to the number of values in outlier clusters (i.e. consecutive outliers). Outlier - clusters of size greater than :py:attr:`n`/2 may not be detected reliably. - * The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small - or singleton outliers points. Higher values greatly increase numerical costs. 
+ * :py:attr:`n` determines the "locality" of an observation + (its :py:attr:`n` nearest neighbors) and sets the upper + limit to the number of values in outlier clusters (i.e. + consecutive outliers). Outlier clusters of size greater + than :py:attr:`n`/2 may not be detected reliably. + * The larger :py:attr:`n`, the lesser the algorithm's sensitivity + to local outliers and small or singleton outliers points. + Higher values greatly increase numerical costs. thresh : - The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and - most likely corresponds to inlier points. + The threshold for flagging the calculated LOF. A LOF of around + ``1`` is considered normal and most likely corresponds to + inlier points. - * The "automatic" threshing introduced with the publication of the algorithm defaults to ``1.5``. - * In this implementation, :py:attr:`thresh` defaults (``'auto'``) to flagging the scores with a - modified 3-sigma rule, resulting in a :py:attr:`thresh` `` > 1.5`` which usually mitigates - overflagging compared to the literature recommendation. + * The "automatic" threshing introduced with the publication + of the algorithm defaults to ``1.5``. + * In this implementation, :py:attr:`thresh` defaults (``'auto'``) + to flagging the scores with a modified 3-sigma rule, resulting + in a :py:attr:`thresh` `` > 1.5`` which usually mitigates + over-flagging compared to the literature recommendation. algorithm : Algorithm used for calculating the :py:attr:`n`-nearest neighbors. p : - Degree of the metric ("Minkowski"), according to which the distance to neighbors is determined. - Most important values are: + Degree of the metric ("Minkowski"), according to which the + distance to neighbors is determined. Most important values are: - * ``1`` - Manhatten Metric + * ``1`` - Manhattan Metric * ``2`` - Euclidian Metric + density : + How to calculate the temporal distance/density for the variable to flag. 
+ + * ``'auto'`` - introduces linear density with an increment + equal to the median of the absolute diff of the variable to flag. + * ``float`` - introduces linear density with an increment + equal to :py:attr:`density` + * Callable - calculates the density by applying the function + passed onto the variable to flag (passed as Series). + Notes ----- - * The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local Outlier Factor (LOF) for every point - in the input timeseries. The *LOF* is a scalar value, that roughly correlates to the *reachability*, - or "outlierishnes" of the evaluated datapoint. If a point is as reachable, as all its - :py:attr:`n`-nearest neighbors, the *LOF* score evaluates to around ``1``. If it is only as half as - reachable as all its ``n``-nearest neighbors are (so to say, as double as "outlierish"), the score - is about ``2``. So, the Local Outlier *Factor* relates a point's *reachability* to the *reachability* - of its :py:attr:`n`-nearest neighbors in a multiplicative fashion (as a "factor"). - * The *reachability* of a point thereby is determined as an aggregation of the points distances to its - :py:attr:`n`-nearest neighbors, measured with regard to the minkowski metric of degree :py:attr:`p` + * The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local + Outlier Factor (LOF) for every point in the input timeseries. + The *LOF* is a scalar value, that roughly correlates to the + *reachability*, or "outlierishnes" of the evaluated datapoint. + If a point is as reachable, as all its :py:attr:`n`-nearest + neighbors, the *LOF* score evaluates to around ``1``. If it + is only as half as reachable as all its ``n``-nearest neighbors + are (so to say, as double as "outlierish"), the score is about + ``2``. So, the Local Outlier *Factor* relates a point's *reachability* + to the *reachability* of its :py:attr:`n`-nearest neighbors + in a multiplicative fashion (as a "factor"). 
+ * The *reachability* of a point thereby is determined as an aggregation + of the points distances to its :py:attr:`n`-nearest neighbors, + measured with regard to the minkowski metric of degree :py:attr:`p` (usually euclidean). - * To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut off at a level, - determined by :py:attr:`thresh`. + * To derive a binary label for every point (outlier: *yes*, or *no*), + the scores are cut off at a level, determined by :py:attr:`thresh`. """ - if not (density == "auto" or isinstance(density, float) or callable(density)): - raise ValueError( - "'density' must be 'auto' or a float or a function, " f"not {density}" - ) - + self._validateLOF(n, thresh, algorithm, p, density) fields = toSequence(field) field_ = str(uuid.uuid4()) - self = self.assignLOF( + qc = self.assignLOF( field=fields, target=field_, n=n, @@ -114,7 +160,7 @@ class OutliersMixin: p=p, density=density, ) - s = self.data[field_] + s = qc.data[field_] if thresh == "auto": s = pd.concat([s, (-s - 2)]) s_mask = (s - s.mean() / s.std())[: len(s) // 2].abs() > 3 @@ -122,10 +168,10 @@ class OutliersMixin: s_mask = s < abs(thresh) for f in fields: - mask = ~isflagged(self._flags[f], kwargs["dfilter"]) & s_mask - self._flags[mask, f] = flag + mask = ~isflagged(qc._flags[f], kwargs["dfilter"]) & s_mask + qc._flags[mask, f] = flag - return self.dropField(field_) + return qc.dropField(field_) @flagging() def flagUniLOF( @@ -136,95 +182,120 @@ class OutliersMixin: algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree", p: int = 1, density: Literal["auto"] | float | Callable = "auto", - fill_na: str = "linear", + fill_na: bool = True, flag: float = BAD, **kwargs, ) -> "SaQC": """ Flag "univariate" Local Outlier Factor (LOF) exceeding cutoff. 
- The function is a wrapper around a usual LOF implementation, aiming for an easy to use, - parameter minimal outlier detection function for single variables, that does not necessitate - prior modelling of the variable. LOF is applied onto a concatenation of the `field` variable - and a "temporal density", or "penalty" variable, that measures temporal distance between data - points. See notes Section for a more exhaustive explaination. - - See the Notes section for more details on the algorithm. + The function is a wrapper around a usual LOF implementation, aiming + for an easy to use, parameter minimal outlier detection function + for single variables, that does not necessitate prior modelling + of the variable. LOF is applied onto a concatenation of the `field` + variable and a "temporal density", or "penalty" variable, that + measures temporal distance between data points. See notes Section + for a more exhaustive explaination. See the Notes section for + more details on the algorithm. Parameters ---------- n : - Number of periods to be included into the LOF calculation. Defaults to `20`, which is a - value found to be suitable in the literature. - - * :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors) - and sets the upper limit to the number of values in an outlier clusters (i.e. consecutive outliers). Outlier - clusters of size greater than :py:attr:`n`/2 may not be detected reliably. - * The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small - or singleton outlier points. Higher values greatly increase numerical costs. + Number of periods to be included into the LOF calculation. + Defaults to `20`, which is a value found to be suitable in + the literature. + + * :py:attr:`n` determines the "locality" of an observation + (its :py:attr:`n` nearest neighbors) and sets the upper + limit to the number of values in an outlier clusters (i.e. + consecutive outliers). 
Outlier clusters of size greater + than :py:attr:`n`/2 may not be detected reliably. + * The larger :py:attr:`n`, the lesser the algorithm's sensitivity + to local outliers and small or singleton outlier points. + Higher values greatly increase numerical costs. thresh : - The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and - most likely corresponds to inlier points. This parameter is considered the main calibration + The threshold for flagging the calculated LOF. A LOF of around + ``1`` is considered normal and most likely corresponds to + inlier points. This parameter is considered the main calibration parameter of the algorithm. - * The threshing defaults to ``1.5``, wich is the default value found to be suitable in the literature. - * ``'auto'`` enables flagging the scores with a modified 3-sigma rule, - resulting in a thresh around ``4``, which usually greatly mitigates overflagging compared to the - literature recommendation, but often is too high. - * sensitive range for the parameter may be ``[1,15]``, assuming default settings for the other parameters. + * The threshing defaults to ``1.5``, wich is the default value + found to be suitable in the literature. + * ``'auto'`` enables flagging the scores with a modified 3-sigma + rule, resulting in a thresh around ``4``, which usually + greatly mitigates overflagging compared to the literature + recommendation, but often is too high. + * sensitive range for the parameter may be ``[1,15]``, assuming + default settings for the other parameters. algorithm : - Algorithm used for calculating the :py:attr:`n`-nearest neighbors needed for LOF calculation. + Algorithm used for calculating the :py:attr:`n`-nearest neighbors + needed for LOF calculation. + p : - Degree of the metric ("Minkowski"), according to which distance to neighbors is determined. - Most important values are: + Degree of the metric ("Minkowski"), according to which distance + to neighbors is determined. 
Most important values are: * ``1`` - Manhatten Metric * ``2`` - Euclidian Metric + density : - How to calculate the temporal distance/density for the variable to flag. + How to calculate the temporal distance/density for the variable + to flag. - * ``'auto'`` - introduces linear density with an increment equal to the median of the absolute - diff of the variable to flag. - * ``float`` - introduces linear density with an increment equal to :py:attr:`density` - * Callable - calculates the density by applying the function passed onto the variable to flag - (passed as Series). + * ``'auto'`` - introduces linear density with an increment + equal to the median of the absolute diff of the variable to flag. + * ``float`` - introduces linear density with an increment + equal to :py:attr:`density` + * Callable - calculates the density by applying the function + passed onto the variable to flag (passed as Series). fill_na : - Weather or not to fill NaN values in the data with a linear interpolation. + If True, NaNs in the data are filled with a linear interpolation. See Also -------- - :ref:`introduction to outlier detection with saqc <cookbooks/OutlierDetection:Outlier Detection>` + :ref:`introduction to outlier detection with + saqc <cookbooks/OutlierDetection:Outlier Detection>` Notes ----- - * The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an univariate - Local Outlier Factor (UniLOF) - score for every point in the one dimensional input - data series. - The *UniLOF* score of any data point is a scalar value, that roughly correlates to - its *reachability*, or "outlierishnes" in the 2-dimensional space constituted by the - data-values and the time axis. So the Algorithm basically operates on the "graph", - or the "plot" of the input timeseries. + + * The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an + univariate Local Outlier Factor (UniLOF) - score for every + point in the one dimensional input data series. 
The *UniLOF* + score of any data point is a scalar value, that roughly correlates + to its *reachability*, or "outlierishnes" in the 2-dimensional + space constituted by the data-values and the time axis. So + the Algorithm basically operates on the "graph", or the "plot" + of the input timeseries. + * If a point in this "graph" is as reachable, as all its :py:attr:`n`-nearest - neighbors, its *UniLOF* score evaluates to around ``1``. If it is only as half as - reachable as all its :py:attr:`n` neighbors are - (so to say, as double as "outlierish"), its score evaluates to ``2`` roughly. - So, the Univariate Local Outlier *Factor* relates a points *reachability* to the - *reachability* of its :py:attr:`n`-nearest neighbors in a multiplicative fashion + neighbors, its *UniLOF* score evaluates to around ``1``. If + it is only as half as reachable as all its :py:attr:`n` neighbors + are (so to say, as double as "outlierish"), its score evaluates + to ``2`` roughly. So, the Univariate Local Outlier *Factor* + relates a points *reachability* to the *reachability* of its + :py:attr:`n`-nearest neighbors in a multiplicative fashion (as a "factor"). - * The *reachability* of a point thereby is derived as an aggregation of the points - distance to its :py:attr:`n`-nearest neighbors, measured with regard to the minkowski - metric of degree :py:attr:`p` (usually euclidean). - * The parameter :py:attr:`density` thereby determines how dimensionality of the time is - removed, to make it a dimensionless, real valued coordinate. - * To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut - off at a level, determined by :py:attr:`thresh`. + + * The *reachability* of a point thereby is derived as an aggregation + of the points distance to its :py:attr:`n`-nearest neighbors, + measured with regard to the minkowski metric of degree :py:attr:`p` + (usually euclidean). 
+ + * The parameter :py:attr:`density` thereby determines how dimensionality + of the time is removed, to make it a dimensionless, real valued + coordinate. + + * To derive a binary label for every point (outlier: *yes*, or + *no*), the scores are cut off at a level, determined by :py:attr:`thresh`. Examples -------- - See the :ref:`outlier detection cookbook <cookbooks/OutlierDetection:Outlier Detection>` for a detailed + See the :ref:`outlier detection cookbook + <cookbooks/OutlierDetection:Outlier Detection>` for a detailed introduction into the usage and tuning of the function. .. plot:: @@ -241,8 +312,10 @@ class OutliersMixin: Example usage with default parameter configuration: - Loading data via pandas csv file parser, casting index to DateTime, generating a :py:class:`~saqc.SaQC` - instance from the data and plotting the variable representing light scattering at 254 nanometers wavelength. + Loading data via pandas csv file parser, casting index to DateTime, + generating a :py:class:`~saqc.SaQC` instance from the data and + plotting the variable representing light scattering at 254 nanometers + wavelength. .. doctest:: flagUniLOFExample @@ -260,8 +333,9 @@ class OutliersMixin: qc.plot('sac254_raw') - We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter values. Meaning, that the main - calibration paramters :py:attr:`n` and :py:attr:`thresh` evaluate to `20` and `1.5` respectively. + We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter + values. Meaning, that the main calibration paramters :py:attr:`n` + and :py:attr:`thresh` evaluate to `20` and `1.5` respectively. .. 
doctest:: flagUniLOFExample @@ -278,27 +352,29 @@ class OutliersMixin: qc.plot('sac254_raw') """ - field_ = str(uuid.uuid4()) - self = self.assignUniLOF( + self._validateLOF(n, thresh, algorithm, p, density) + + tmp_field = str(uuid.uuid4()) + qc = self.assignUniLOF( field=field, - target=field_, + target=tmp_field, n=n, algorithm=algorithm, p=p, density=density, fill_na=fill_na, ) - s = self.data[field_] + s = qc.data[tmp_field] if thresh == "auto": _s = pd.concat([s, (-s - 2)]) s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3 else: s_mask = s < -abs(thresh) - s_mask = ~isflagged(self._flags[field], kwargs["dfilter"]) & s_mask - self._flags[s_mask, field] = flag - self = self.dropField(field_) - return self + s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask + qc._flags[s_mask, field] = flag + qc = qc.dropField(tmp_field) + return qc @flagging() def flagRange( @@ -310,7 +386,8 @@ class OutliersMixin: **kwargs, ) -> "SaQC": """ - Function flags values exceeding the closed interval [:py:attr:`min`, :py:attr:`max`]. + Function flags values exceeding the closed + interval [:py:attr:`min`, :py:attr:`max`]. Parameters ---------- @@ -319,7 +396,6 @@ class OutliersMixin: max : Upper bound for valid data. """ - # using .values is much faster datacol = self._data[field].to_numpy() mask = (datacol < min) | (datacol > max) @@ -346,28 +422,28 @@ class OutliersMixin: ---------- window : - Determines the segmentation of the data into partitions, the kNN algorithm is - applied onto individually. + Determines the segmentation of the data into partitions, the + kNN algorithm is applied onto individually. * ``None``: Apply Scoring on whole data set at once - * ``int``: Apply scoring on successive data chunks of periods with the given length. - Must be greater than 0. 
- * Offset String : Apply scoring on successive partitions of temporal extension - matching the passed offset string + * ``int``: Apply scoring on successive data chunks of periods + with the given length. Must be greater than 0. + * offset String : Apply scoring on successive partitions of + temporal extension matching the passed offset string min_periods : - Minimum number of periods per partition that have to be present for a valid - outlier detection to be made in this partition (only of effect, if :py:attr:`freq` - is an integer). + Minimum number of periods per partition that have to be present + for a valid outlier detection to be made in this partition iter_start : - Float in ``[0, 1]`` that determines which percentage of data is considered - "normal". ``0.5`` results in the stray algorithm to search only the upper 50% of - the scores for the cut off point. (See reference section for more information) + Float in ``[0, 1]`` that determines which percentage of data + is considered "normal". ``0.5`` results in the stray algorithm + to search only the upper 50% of the scores for the cut off + point. (See reference section for more information) alpha : - Level of significance by which it is tested, if a score might be drawn from - another distribution than the majority of the data. + Level of significance by which it is tested, if a score might + be drawn from another distribution than the majority of the data. 
References ---------- @@ -378,36 +454,36 @@ class OutliersMixin: """ scores = self._data[field].dropna() + if window is None: + window = len(scores) + if not isinstance(window, int): + validateFrequency(window, "window") + + validateMinPeriods(min_periods) + validateValueBounds(iter_start, "iter_start", left=0, right=1, closed="both") + if scores.empty: return self - if not window: - window = len(scores) - - if isinstance(window, str): + if isinstance(window, int): + s = pd.Series(data=np.arange(0, len(scores)), index=scores.index) + s = s.transform(lambda x: int(np.floor(x / window))) + partitions = scores.groupby(s) + else: # pd.Timedelta pd.DateOffset or str partitions = scores.groupby(pd.Grouper(freq=window)) - else: - grouper_series = pd.Series( - data=np.arange(0, len(scores)), index=scores.index - ) - grouper_series = grouper_series.transform( - lambda x: int(np.floor(x / window)) - ) - partitions = scores.groupby(grouper_series) - # calculate flags for every window for _, partition in partitions: - if partition.empty | (len(partition) < min_periods): - continue - sample_size = len(partition) + if partition.empty or sample_size < min_periods: + continue + sorted_i = partition.values.argsort() resids = partition.values[sorted_i] gaps = np.append(0, np.diff(resids)) - tail_size = int(max(min(50, np.floor(sample_size / 4)), 2)) + tail_size = int(max(min(np.floor(sample_size / 4), 50), 2)) tail_indices = np.arange(2, tail_size + 1) i_start = int(max(np.floor(sample_size * iter_start), 1) + 1) @@ -452,69 +528,75 @@ class OutliersMixin: flag: float = BAD, **kwargs, ) -> "SaQC": + validateCallable(trafo, "trafo") """ - The algorithm implements a 3-step outlier detection procedure for simultaneously - flagging of higher dimensional data (dimensions > 3). + The algorithm implements a 3-step outlier detection procedure for + simultaneously flagging of higher dimensional data (dimensions > 3). 
- In [1], the procedure is introduced and exemplified with an application on hydrological - data. See the notes section for an overview over the algorithms basic steps. + In [1], the procedure is introduced and exemplified with an application on + hydrological data. See the notes section for an overview over the algorithms + basic steps. Parameters ---------- - trafo : default identity - Transformation to be applied onto every column before scoring. For more fine-grained - control, the data could also be transformed before :py:meth:`~saqc.SaQC.flagMVScores` - is called. + trafo : + Transformation to be applied onto every column before scoring. For more + fine-grained control, the data could also be transformed before + :py:meth:`~saqc.SaQC.flagMVScores` is called. alpha : Level of significance by which it is tested, if an observations score might be drawn from another distribution than the majority of the data. - n : + n : Number of neighbors included in the scoring process for every datapoint. - func : default sum - Function that aggregates a value's k-smallest distances, returning a scalar score. + func : + Function that aggregates a value's k-smallest distances, returning a + scalar score. iter_start : - Value in ``[0,1]`` that determines which percentage of data is considered - "normal". 0.5 results in the threshing algorithm to search only the upper 50% - of the scores for the cut off point. (See reference section for more + Value in ``[0,1]`` that determines which percentage of data is considered + "normal". 0.5 results in the threshing algorithm to search only the upper + 50% of the scores for the cut-off point. (See reference section for more information) window : - Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines the - size of the data partitions, the data is decomposed into. Each partition is checked - seperately for outliers. 
- Either given as an Offset String, denoting the windows temporal extension or - as an integer, denoting the windows number of periods. ``NaN`` also count as periods. - If ``None``, all data points share the same scoring window, which than equals the whole - data. + Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines + the size of the data partitions, the data is decomposed into. Each + partition is checked seperately for outliers. Either given as an Offset + String, denoting the windows temporal extension or as an integer, + denoting the windows number of periods. ``NaN`` also count as periods. If + ``None``, all data points share the same scoring window, which than + equals the whole data. min_periods : - Only effective if :py:attr:`threshing` is set to ``'stray'`` and :py:attr:`partition` is an integer. - Minimum number of periods per :py:attr:`partition` that have to be present for a valid outlier + Only effective if :py:attr:`threshing` is set to ``'stray'`` and + :py:attr:`partition` is an integer. Minimum number of periods per + :py:attr:`partition` that have to be present for a valid outlier detection to be made in this partition. stray_range : - If not ``None``, it is tried to reduce the stray result onto single outlier components - of the input :py:attr:`field`. The offset string denotes the range of the - temporal surrounding to include into the MAD testing while trying to reduce - flags. + If not ``None``, it is tried to reduce the stray result onto single + outlier components of the input :py:attr:`field`. The offset string + denotes the range of the temporal surrounding to include into the MAD + testing while trying to reduce flags. drop_flagged : - Only effective when :py:attr:`stray_range` is not ``None``. Whether or not to drop flagged - values from the temporal surroundings. + Only effective when :py:attr:`stray_range` is not ``None``. Whether or + not to drop flagged values from the temporal surroundings. 
thresh : - Only effective when :py:attr:`stray_range` is not ``None``. The 'critical' value, - controlling wheather the MAD score is considered referring to an outlier or - not. Higher values result in less rigid flagging. The default value is widely - considered apropriate in the literature. + Only effective when :py:attr:`stray_range` is not ``None``. The + 'critical' value, controlling wheather the MAD score is considered + referring to an outlier or not. Higher values result in less rigid + flagging. The default value is widely considered apropriate in the + literature. min_periods_r : - Only effective when :py:attr:`stray_range` is not ``None``. Minimum number of measurements - necessary in an interval to actually perform the reduction step. + Only effective when :py:attr:`stray_range` is not ``None``. Minimum + number of measurements necessary in an interval to actually perform the + reduction step. Notes ----- @@ -526,33 +608,33 @@ class OutliersMixin: (a) make them comparable and (b) make outliers more stand out. - This step is usually subject to a phase of research/try and error. See [1] for more - details. + This step is usually subject to a phase of research/try and error. See [1] + for more details. - Note, that the data transformation as an built-in step of the algorithm, will likely - get deprecated in the future. Its better to transform the data in a processing - step, preceeding the multivariate flagging process. Also, by doing so, one gets - mutch more control and variety in the transformation applied, since the `trafo` - parameter only allows for application of the same transformation to all of the - variables involved. + Note, that the data transformation as a built-in step of the algorithm, + will likely get deprecated in the future. It's better to transform the data in + a processing step, preceeding the multivariate flagging process. 
Also, + by doing so, one gets mutch more control and variety in the transformation + applied, since the `trafo` parameter only allows for application of the same + transformation to all the variables involved. 2. scoring - Every observation gets assigned a score depending on its k nearest neighbors. See - the `scoring_method` parameter description for details on the different scoring - methods. Furthermore [1] may give some insight in the pro and cons of the - different methods. + Every observation gets assigned a score depending on its k nearest neighbors. + See the `scoring_method` parameter description for details on the different + scoring methods. Furthermore, [1] may give some insight in the pro and cons of + the different methods. 3. threshing - The gaps between the (greatest) scores are tested for beeing drawn from the same - distribution as the majority of the scores. If a gap is encountered, that, - with sufficient significance, can be said to not be drawn from the same - distribution as the one all the smaller gaps are drawn from, than the observation - belonging to this gap, and all the observations belonging to gaps larger then - this gap, get flagged outliers. See description of the `threshing` parameter for - more details. Although [1] gives a fully detailed overview over the `stray` - algorithm. + The gaps between the (greatest) scores are tested for beeing drawn from the + same distribution as the majority of the scores. If a gap is encountered, + that, with sufficient significance, can be said to not be drawn from the same + distribution as the one all the smaller gaps are drawn from, than the + observation belonging to this gap, and all the observations belonging to gaps + larger than this gap, get flagged outliers. See description of the + `threshing` parameter for more details. Although [1] gives a fully detailed + overview over the `stray` algorithm. 
References ---------- @@ -560,7 +642,6 @@ class OutliersMixin: Anomaly Detection in High-Dimensional Data, Journal of Computational and Graphical Statistics, 30:2, 360-374, DOI: 10.1080/10618600.2020.1807997 - """ # parameter deprecations @@ -568,8 +649,8 @@ class OutliersMixin: if "partition" in kwargs: warnings.warn( """ - The parameter `partition` is deprecated and will be removed in version 3.0 of saqc. - Please us the parameter `window` instead.' + The parameter `partition` is deprecated and will be removed in version + 3.0 of saqc. Please us the parameter `window` instead. """, DeprecationWarning, ) @@ -578,8 +659,8 @@ class OutliersMixin: if "partition_min" in kwargs: warnings.warn( """ - The parameter `partition_min` is deprecated and will be removed in version 3.0 of saqc. - Please us the parameter `min_periods` instead.' + The parameter `partition_min` is deprecated and will be removed in + version 3.0 of saqc. Please us the parameter `min_periods` instead. """, DeprecationWarning, ) @@ -588,27 +669,30 @@ class OutliersMixin: if min_periods != 11: warnings.warn( """ - You were setting a customary value for the `min_periods` parameter: note that this parameter - does no longer refer to the reduction interval length, but now controls the number of periods - having to be present in an interval of size `window` (deprecated:`partition`) for the algorithm to be - performed in that interval. - To alter the size of the reduction window, use the parameter `min_periods_r`. Changes readily apply. - Warning will be removed in saqc version 3.0. + You were setting a customary value for the `min_periods` parameter: + note that this parameter does no longer refer to the reduction interval + length, but now controls the number of periods having to be present in + an interval of size `window` (deprecated:`partition`) for the algorithm + to be performed in that interval. + To alter the size of the reduction window, use the parameter + `min_periods_r`. Changes readily apply. 
+ This warning will be removed in saqc version 3.0. """, DeprecationWarning, ) fields = toSequence(field) + qc = self fields_ = [] for f in fields: field_ = str(uuid.uuid4()) - self = self.copyField(field=f, target=field_) - self = self.transform(field=field_, func=trafo, freq=window) + qc = qc.copyField(field=f, target=field_) + qc = qc.transform(field=field_, func=trafo, freq=window) fields_.append(field_) knn_field = str(uuid.uuid4()) - self = self.assignKNNScore( + qc = qc.assignKNNScore( field=fields_, target=knn_field, n=n, @@ -619,9 +703,9 @@ class OutliersMixin: **kwargs, ) for field_ in fields_: - self = self.dropField(field_) + qc = qc.dropField(field_) - self = self.flagByStray( + qc = qc.flagByStray( field=knn_field, freq=window, min_periods=min_periods, @@ -631,11 +715,11 @@ class OutliersMixin: **kwargs, ) - self._data, self._flags = _evalStrayLabels( - data=self._data, + qc._data, qc._flags = _evalStrayLabels( + data=qc._data, field=knn_field, target=fields, - flags=self._flags, + flags=qc._flags, reduction_range=stray_range, reduction_drop_flagged=drop_flagged, reduction_thresh=thresh, @@ -643,7 +727,7 @@ class OutliersMixin: flag=flag, **kwargs, ) - return self.dropField(knn_field) + return qc.dropField(knn_field) @flagging() def flagRaise( @@ -660,17 +744,17 @@ class OutliersMixin: **kwargs, ) -> "SaQC": """ - The function flags raises and drops in value courses, that exceed a certain threshold - within a certain timespan. + The function flags raises and drops in value courses, that exceed a certain + threshold within a certain timespan. - The parameter variety of the function is owned to the intriguing case of values, that - "return" from outlierish or anomalious value levels and thus exceed the threshold, - while actually being usual values. 
+ The parameter variety of the function is owed to the intriguing case of + values, that "return" from outlierish or anomalous value levels and thus + exceed the threshold, while actually being usual values. Notes ----- - The dataset is NOT supposed to be harmonized to a time series with an equidistant - requency grid. + The dataset is NOT supposed to be harmonized to a time series with an + equidistant frequency grid. The value :math:`x_{k}` of a time series :math:`x` with associated timestamps :math:`t_i`, is flagged a raise, if: @@ -679,35 +763,38 @@ :py:attr:`raise_window` range, so that :math:`M = |x_k - x_s | >` :py:attr:`thresh` :math:`> 0` - 2. The weighted average :math:`\\mu^{*}` of the values, preceding :math:`x_{k}` - within :py:attr:`average_window` range indicates, that :math:`x_{k}` does not - return from an "outlierish" value course, meaning that + 2. The weighted average :math:`\\mu^{*}` of the values, preceding + :math:`x_{k}` within :py:attr:`average_window` range indicates, + that :math:`x_{k}` does not return from an "outlierish" value + course, meaning that :math:`x_k > \\mu^* + ( M` / :py:attr:`raise_factor` :math:`)` - 3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}` is checked for being - sufficiently divergent from its very predecessor :math:`x_{k-1}`, meaning that, it - is additionally checked if: + 3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}` + is checked for being sufficiently divergent from its very predecessor + :math:`x_{k-1}`, meaning that, it is additionally checked if: * :math:`x_k - x_{k-1} >` :py:attr:`slope` * :math:`t_k - t_{k-1} >` :py:attr:`weight` :math:`\\times` :py:attr:`freq` Parameters ---------- thresh : - The threshold, for the total rise (:py:attr:`thresh` ``> 0``), or total drop - (:py:attr:`thresh` ``< 0``), value courses must not exceed within a timespan - of length :py:attr:`raise_window`.
+ The threshold, for the total rise (:py:attr:`thresh` ``> 0``), + or total drop (:py:attr:`thresh` ``< 0``), value courses must + not exceed within a timespan of length :py:attr:`raise_window`. raise_window : - An offset string, determining the timespan, the rise/drop thresholding refers - to. Window is inclusively defined. + An offset string, determining the timespan, the rise/drop + thresholding refers to. Window is inclusively defined. freq : - An offset string, determining the frequency, the timeseries to flag is supposed - to be sampled at. The window is inclusively defined. + An offset string, determining the frequency, the timeseries + to flag is supposed to be sampled at. The window is inclusively + defined. average_window : - See condition (2) of the description given in the Notes. Window is - inclusively defined, defaults to 1.5 times the size of :py:attr:`raise_window`. + See condition (2) of the description given in the Notes. Window + is inclusively defined, defaults to 1.5 times the size of + :py:attr:`raise_window`. raise_factor : See condition (2). diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py index 5110d824f..d0aa2daa2 100644 --- a/saqc/funcs/scores.py +++ b/saqc/funcs/scores.py @@ -405,7 +405,7 @@ class ScoresMixin: algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree", p: int = 1, density: Literal["auto"] | float | Callable = "auto", - fill_na: str = "linear", + fill_na: bool = True, **kwargs, ) -> "SaQC": """ @@ -449,7 +449,7 @@ class ScoresMixin: (passed as Series). fill_na : - Weather or not to fill NaN values in the data with a linear interpolation. + If True, NaNs in the data are filled with a linear interpolation. 
Notes ----- @@ -465,8 +465,8 @@ class ScoresMixin: """ vals = self._data[field] - if fill_na is not None: - vals = vals.interpolate(fill_na) + if fill_na: + vals = vals.interpolate("linear") if density == "auto": density = vals.diff().abs().median() diff --git a/saqc/lib/checking.py b/saqc/lib/checking.py index abebc510a..be59b69f2 100644 --- a/saqc/lib/checking.py +++ b/saqc/lib/checking.py @@ -31,6 +31,10 @@ def isBoolLike(obj: Any, optional: bool = False) -> bool: ) +def isFloatLike(obj: Any) -> bool: + return isinstance(obj, (float, int)) + + def isIterable(obj: Any) -> bool: if isinstance(obj, Iterable) or pd.api.types.is_iterator(obj): return True diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index f22bace02..c06278096 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -26,6 +26,7 @@ from typing import ( Union, get_args, get_origin, + overload, ) import numpy as np @@ -57,12 +58,18 @@ def assertScalar(name, value, optional=False): return validateScalar(name=name, value=value, optional=optional) -def toSequence(value: T | Sequence[T]) -> List[T]: - if value is None: # special case - return [None] - if isinstance(value, (str, float, int)): +# fmt: off +@overload +def toSequence(value: T) -> List[T]: + ... +@overload +def toSequence(value: Sequence[T]) -> List[T]: + ... +def toSequence(value) -> List: + if value is None or isinstance(value, (str, float, int)): return [value] return list(value) +# fmt: on def squeezeSequence(value: Sequence[T]) -> Union[T, Sequence[T]]: diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py index 976754549..b0b7dba71 100644 --- a/saqc/lib/ts_operators.py +++ b/saqc/lib/ts_operators.py @@ -12,7 +12,7 @@ The module gathers all kinds of timeseries tranformations. 
import re import sys import warnings -from typing import Union, Literal +from typing import Literal, Union import numpy as np import numpy.polynomial.polynomial as poly @@ -485,7 +485,7 @@ def shift2Freq( shift timestamps backwards/forwards in order to align them with an equidistant frequency grid. Resulting Nan's are replaced with the fill-value. """ - validateWindow(freq, 'freq', allow_int=False) + validateWindow(freq, "freq", allow_int=False) validateChoice(method, "method", ["fshift", "bshift", "nshift"]) methods = { "fshift": lambda freq: ("ffill", pd.Timedelta(freq)), -- GitLab