Skip to content
Snippets Groups Projects

Check inputs

Merged Bert Palm requested to merge checkInputs into develop
5 files
+ 339
241
Compare changes
  • Side-by-side
  • Inline
Files
5
+ 367
260
@@ -10,7 +10,7 @@ from __future__ import annotations
import uuid
import warnings
from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple
from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Tuple
import numpy as np
import numpy.polynomial.polynomial as poly
@@ -21,7 +21,17 @@ from typing_extensions import Literal
from saqc import BAD, UNFLAGGED
from saqc.core import DictOfSeries, Flags, flagging, register
from saqc.funcs.scores import _univarScoring
from saqc.lib.checking import (
isCallable,
isFloatLike,
validateCallable,
validateChoice,
validateFraction,
validateFrequency,
validateMinPeriods,
validateValueBounds,
validateWindow,
)
from saqc.lib.docs import DOC_TEMPLATES
from saqc.lib.rolling import windowRoller
from saqc.lib.tools import getFreqDelta, isflagged, toSequence
@@ -31,6 +41,19 @@ if TYPE_CHECKING:
class OutliersMixin:
@staticmethod
def _validateLOF(algorithm, n, p, density):
    """Check the parameters shared by the LOF and UniLOF flagging methods."""
    # neighbor count `n` and Minkowski degree `p`: strictly positive integers
    for val, name in ((n, "n"), (p, "p")):
        validateValueBounds(val, name, left=0, strict_int=True)
    allowed = ["ball_tree", "kd_tree", "brute", "auto"]
    validateChoice(algorithm, "algorithm", allowed)
    # `density` must be the literal 'auto', a float-like value, or a callable
    if not (density == "auto" or isFloatLike(density) or isCallable(density)):
        raise ValueError(
            f"'density' must be 'auto' or a float or a function, not {density}"
        )
@register(
mask=["field"],
demask=["field"],
@@ -55,53 +78,79 @@ class OutliersMixin:
Parameters
----------
n :
Number of neighbors to be included into the LOF calculation. Defaults to ``20``, which is a
Number of neighbors to be included into the LOF calculation.
Defaults to ``20``, which is a
value found to be suitable in the literature.
* :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors)
and sets the upper limit to the number of values in outlier clusters (i.e. consecutive outliers). Outlier
clusters of size greater than :py:attr:`n`/2 may not be detected reliably.
* The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small
or singleton outliers points. Higher values greatly increase numerical costs.
* :py:attr:`n` determines the "locality" of an observation
(its :py:attr:`n` nearest neighbors) and sets the upper
limit to the number of values in outlier clusters (i.e.
consecutive outliers). Outlier clusters of size greater
than :py:attr:`n`/2 may not be detected reliably.
* The larger :py:attr:`n`, the lesser the algorithm's sensitivity
to local outliers and small or singleton outliers points.
Higher values greatly increase numerical costs.
thresh :
The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and
most likely corresponds to inlier points.
The threshold for flagging the calculated LOF. A LOF of around
``1`` is considered normal and most likely corresponds to
inlier points.
* The "automatic" threshing introduced with the publication of the algorithm defaults to ``1.5``.
* In this implementation, :py:attr:`thresh` defaults (``'auto'``) to flagging the scores with a
modified 3-sigma rule, resulting in a :py:attr:`thresh` `` > 1.5`` which usually mitigates
overflagging compared to the literature recommendation.
* The "automatic" threshing introduced with the publication
of the algorithm defaults to ``1.5``.
* In this implementation, :py:attr:`thresh` defaults (``'auto'``)
to flagging the scores with a modified 3-sigma rule, resulting
in a :py:attr:`thresh` `` > 1.5`` which usually mitigates
over-flagging compared to the literature recommendation.
algorithm :
Algorithm used for calculating the :py:attr:`n`-nearest neighbors.
p :
Degree of the metric ("Minkowski"), according to which the distance to neighbors is determined.
Most important values are:
Degree of the metric ("Minkowski"), according to which the
distance to neighbors is determined. Most important values are:
* ``1`` - Manhatten Metric
* ``1`` - Manhattan Metric
* ``2`` - Euclidean Metric
density :
How to calculate the temporal distance/density for the variable to flag.
* ``'auto'`` - introduces linear density with an increment
equal to the median of the absolute diff of the variable to flag.
* ``float`` - introduces linear density with an increment
equal to :py:attr:`density`
* Callable - calculates the density by applying the function
passed onto the variable to flag (passed as Series).
Notes
-----
* The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local Outlier Factor (LOF) for every point
in the input timeseries. The *LOF* is a scalar value, that roughly correlates to the *reachability*,
or "outlierishnes" of the evaluated datapoint. If a point is as reachable, as all its
:py:attr:`n`-nearest neighbors, the *LOF* score evaluates to around ``1``. If it is only as half as
reachable as all its ``n``-nearest neighbors are (so to say, as double as "outlierish"), the score
is about ``2``. So, the Local Outlier *Factor* relates a point's *reachability* to the *reachability*
of its :py:attr:`n`-nearest neighbors in a multiplicative fashion (as a "factor").
* The *reachability* of a point thereby is determined as an aggregation of the points distances to its
:py:attr:`n`-nearest neighbors, measured with regard to the minkowski metric of degree :py:attr:`p`
* The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local
Outlier Factor (LOF) for every point in the input timeseries.
The *LOF* is a scalar value, that roughly correlates to the
*reachability*, or "outlierishnes" of the evaluated datapoint.
If a point is as reachable, as all its :py:attr:`n`-nearest
neighbors, the *LOF* score evaluates to around ``1``. If it
is only as half as reachable as all its ``n``-nearest neighbors
are (so to say, as double as "outlierish"), the score is about
``2``. So, the Local Outlier *Factor* relates a point's *reachability*
to the *reachability* of its :py:attr:`n`-nearest neighbors
in a multiplicative fashion (as a "factor").
* The *reachability* of a point thereby is determined as an aggregation
of the points distances to its :py:attr:`n`-nearest neighbors,
measured with regard to the minkowski metric of degree :py:attr:`p`
(usually euclidean).
* To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut off at a level,
determined by :py:attr:`thresh`.
* To derive a binary label for every point (outlier: *yes*, or *no*),
the scores are cut off at a level, determined by :py:attr:`thresh`.
"""
self._validateLOF(algorithm, n, p, density)
if thresh != "auto" and not isFloatLike(thresh):
raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
fields = toSequence(field)
field_ = str(uuid.uuid4())
self = self.assignLOF(
qc = self.assignLOF(
field=fields,
target=field_,
n=n,
@@ -109,7 +158,7 @@ class OutliersMixin:
p=p,
density=density,
)
s = self.data[field_]
s = qc.data[field_]
if thresh == "auto":
s = pd.concat([s, (-s - 2)])
s_mask = (s - s.mean() / s.std())[: len(s) // 2].abs() > 3
@@ -117,10 +166,10 @@ class OutliersMixin:
s_mask = s < abs(thresh)
for f in fields:
mask = ~isflagged(self._flags[f], kwargs["dfilter"]) & s_mask
self._flags[mask, f] = flag
mask = ~isflagged(qc._flags[f], kwargs["dfilter"]) & s_mask
qc._flags[mask, f] = flag
return self.dropField(field_)
return qc.dropField(field_)
@flagging()
def flagUniLOF(
@@ -131,95 +180,120 @@ class OutliersMixin:
algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree",
p: int = 1,
density: Literal["auto"] | float | Callable = "auto",
fill_na: str = "linear",
fill_na: bool = True,
flag: float = BAD,
**kwargs,
) -> "SaQC":
"""
Flag "univariate" Local Outlier Factor (LOF) exceeding cutoff.
The function is a wrapper around a usual LOF implementation, aiming for an easy to use,
parameter minimal outlier detection function for single variables, that does not necessitate
prior modelling of the variable. LOF is applied onto a concatenation of the `field` variable
and a "temporal density", or "penalty" variable, that measures temporal distance between data
points. See notes Section for a more exhaustive explaination.
See the Notes section for more details on the algorithm.
The function is a wrapper around a usual LOF implementation, aiming
for an easy to use, parameter minimal outlier detection function
for single variables, that does not necessitate prior modelling
of the variable. LOF is applied onto a concatenation of the `field`
variable and a "temporal density", or "penalty" variable, that
measures temporal distance between data points. See notes Section
for a more exhaustive explanation. See the Notes section for
more details on the algorithm.
Parameters
----------
n :
Number of periods to be included into the LOF calculation. Defaults to `20`, which is a
value found to be suitable in the literature.
* :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors)
and sets the upper limit to the number of values in an outlier clusters (i.e. consecutive outliers). Outlier
clusters of size greater than :py:attr:`n`/2 may not be detected reliably.
* The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small
or singleton outlier points. Higher values greatly increase numerical costs.
Number of periods to be included into the LOF calculation.
Defaults to `20`, which is a value found to be suitable in
the literature.
* :py:attr:`n` determines the "locality" of an observation
(its :py:attr:`n` nearest neighbors) and sets the upper
limit to the number of values in an outlier clusters (i.e.
consecutive outliers). Outlier clusters of size greater
than :py:attr:`n`/2 may not be detected reliably.
* The larger :py:attr:`n`, the lesser the algorithm's sensitivity
to local outliers and small or singleton outlier points.
Higher values greatly increase numerical costs.
thresh :
The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and
most likely corresponds to inlier points. This parameter is considered the main calibration
The threshold for flagging the calculated LOF. A LOF of around
``1`` is considered normal and most likely corresponds to
inlier points. This parameter is considered the main calibration
parameter of the algorithm.
* The threshing defaults to ``1.5``, wich is the default value found to be suitable in the literature.
* ``'auto'`` enables flagging the scores with a modified 3-sigma rule,
resulting in a thresh around ``4``, which usually greatly mitigates overflagging compared to the
literature recommendation, but often is too high.
* sensitive range for the parameter may be ``[1,15]``, assuming default settings for the other parameters.
* The threshing defaults to ``1.5``, which is the default value
found to be suitable in the literature.
* ``'auto'`` enables flagging the scores with a modified 3-sigma
rule, resulting in a thresh around ``4``, which usually
greatly mitigates overflagging compared to the literature
recommendation, but often is too high.
* sensitive range for the parameter may be ``[1,15]``, assuming
default settings for the other parameters.
algorithm :
Algorithm used for calculating the :py:attr:`n`-nearest neighbors needed for LOF calculation.
Algorithm used for calculating the :py:attr:`n`-nearest neighbors
needed for LOF calculation.
p :
Degree of the metric ("Minkowski"), according to which distance to neighbors is determined.
Most important values are:
Degree of the metric ("Minkowski"), according to which distance
to neighbors is determined. Most important values are:
* ``1`` - Manhattan Metric
* ``2`` - Euclidean Metric
density :
How to calculate the temporal distance/density for the variable to flag.
How to calculate the temporal distance/density for the variable
to flag.
* ``'auto'`` - introduces linear density with an increment equal to the median of the absolute
diff of the variable to flag.
* ``float`` - introduces linear density with an increment equal to :py:attr:`density`
* Callable - calculates the density by applying the function passed onto the variable to flag
(passed as Series).
* ``'auto'`` - introduces linear density with an increment
equal to the median of the absolute diff of the variable to flag.
* ``float`` - introduces linear density with an increment
equal to :py:attr:`density`
* Callable - calculates the density by applying the function
passed onto the variable to flag (passed as Series).
fill_na :
Weather or not to fill NaN values in the data with a linear interpolation.
If True, NaNs in the data are filled with a linear interpolation.
See Also
--------
:ref:`introduction to outlier detection with saqc <cookbooks/OutlierDetection:Outlier Detection>`
:ref:`introduction to outlier detection with
saqc <cookbooks/OutlierDetection:Outlier Detection>`
Notes
-----
* The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an univariate
Local Outlier Factor (UniLOF) - score for every point in the one dimensional input
data series.
The *UniLOF* score of any data point is a scalar value, that roughly correlates to
its *reachability*, or "outlierishnes" in the 2-dimensional space constituted by the
data-values and the time axis. So the Algorithm basically operates on the "graph",
or the "plot" of the input timeseries.
* The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an
univariate Local Outlier Factor (UniLOF) - score for every
point in the one dimensional input data series. The *UniLOF*
score of any data point is a scalar value, that roughly correlates
to its *reachability*, or "outlierishnes" in the 2-dimensional
space constituted by the data-values and the time axis. So
the Algorithm basically operates on the "graph", or the "plot"
of the input timeseries.
* If a point in this "graph" is as reachable, as all its :py:attr:`n`-nearest
neighbors, its *UniLOF* score evaluates to around ``1``. If it is only as half as
reachable as all its :py:attr:`n` neighbors are
(so to say, as double as "outlierish"), its score evaluates to ``2`` roughly.
So, the Univariate Local Outlier *Factor* relates a points *reachability* to the
*reachability* of its :py:attr:`n`-nearest neighbors in a multiplicative fashion
neighbors, its *UniLOF* score evaluates to around ``1``. If
it is only as half as reachable as all its :py:attr:`n` neighbors
are (so to say, as double as "outlierish"), its score evaluates
to ``2`` roughly. So, the Univariate Local Outlier *Factor*
relates a points *reachability* to the *reachability* of its
:py:attr:`n`-nearest neighbors in a multiplicative fashion
(as a "factor").
* The *reachability* of a point thereby is derived as an aggregation of the points
distance to its :py:attr:`n`-nearest neighbors, measured with regard to the minkowski
metric of degree :py:attr:`p` (usually euclidean).
* The parameter :py:attr:`density` thereby determines how dimensionality of the time is
removed, to make it a dimensionless, real valued coordinate.
* To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut
off at a level, determined by :py:attr:`thresh`.
* The *reachability* of a point thereby is derived as an aggregation
of the points distance to its :py:attr:`n`-nearest neighbors,
measured with regard to the minkowski metric of degree :py:attr:`p`
(usually euclidean).
* The parameter :py:attr:`density` thereby determines how dimensionality
of the time is removed, to make it a dimensionless, real valued
coordinate.
* To derive a binary label for every point (outlier: *yes*, or
*no*), the scores are cut off at a level, determined by :py:attr:`thresh`.
Examples
--------
See the :ref:`outlier detection cookbook <cookbooks/OutlierDetection:Outlier Detection>` for a detailed
See the :ref:`outlier detection cookbook
<cookbooks/OutlierDetection:Outlier Detection>` for a detailed
introduction into the usage and tuning of the function.
.. plot::
@@ -236,8 +310,10 @@ class OutliersMixin:
Example usage with default parameter configuration:
Loading data via pandas csv file parser, casting index to DateTime, generating a :py:class:`~saqc.SaQC`
instance from the data and plotting the variable representing light scattering at 254 nanometers wavelength.
Loading data via pandas csv file parser, casting index to DateTime,
generating a :py:class:`~saqc.SaQC` instance from the data and
plotting the variable representing light scattering at 254 nanometers
wavelength.
.. doctest:: flagUniLOFExample
@@ -255,8 +331,9 @@ class OutliersMixin:
qc.plot('sac254_raw')
We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter values. Meaning, that the main
calibration paramters :py:attr:`n` and :py:attr:`thresh` evaluate to `20` and `1.5` respectively.
We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter
values. Meaning, that the main calibration paramters :py:attr:`n`
and :py:attr:`thresh` evaluate to `20` and `1.5` respectively.
.. doctest:: flagUniLOFExample
@@ -273,27 +350,31 @@ class OutliersMixin:
qc.plot('sac254_raw')
"""
field_ = str(uuid.uuid4())
self = self.assignUniLOF(
self._validateLOF(algorithm, n, p, density)
if thresh != "auto" and not isFloatLike(thresh):
raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
tmp_field = str(uuid.uuid4())
qc = self.assignUniLOF(
field=field,
target=field_,
target=tmp_field,
n=n,
algorithm=algorithm,
p=p,
density=density,
fill_na=fill_na,
)
s = self.data[field_]
s = qc.data[tmp_field]
if thresh == "auto":
_s = pd.concat([s, (-s - 2)])
s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
else:
s_mask = s < -abs(thresh)
s_mask = ~isflagged(self._flags[field], kwargs["dfilter"]) & s_mask
self._flags[s_mask, field] = flag
self = self.dropField(field_)
return self
s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask
qc._flags[s_mask, field] = flag
qc = qc.dropField(tmp_field)
return qc
@flagging()
def flagRange(
@@ -305,7 +386,8 @@ class OutliersMixin:
**kwargs,
) -> "SaQC":
"""
Function flags values exceeding the closed interval [:py:attr:`min`, :py:attr:`max`].
Function flags values exceeding the closed
interval [:py:attr:`min`, :py:attr:`max`].
Parameters
----------
@@ -314,7 +396,6 @@ class OutliersMixin:
max :
Upper bound for valid data.
"""
# using .values is much faster
datacol = self._data[field].to_numpy()
mask = (datacol < min) | (datacol > max)
@@ -341,28 +422,28 @@ class OutliersMixin:
----------
window :
Determines the segmentation of the data into partitions, the kNN algorithm is
applied onto individually.
Determines the segmentation of the data into partitions, the
kNN algorithm is applied onto individually.
* ``None``: Apply Scoring on whole data set at once
* ``int``: Apply scoring on successive data chunks of periods with the given length.
Must be greater than 0.
* Offset String : Apply scoring on successive partitions of temporal extension
matching the passed offset string
* ``int``: Apply scoring on successive data chunks of periods
with the given length. Must be greater than 0.
* offset String : Apply scoring on successive partitions of
temporal extension matching the passed offset string
min_periods :
Minimum number of periods per partition that have to be present for a valid
outlier detection to be made in this partition (only of effect, if :py:attr:`freq`
is an integer).
Minimum number of periods per partition that have to be present
for a valid outlier detection to be made in this partition
iter_start :
Float in ``[0, 1]`` that determines which percentage of data is considered
"normal". ``0.5`` results in the stray algorithm to search only the upper 50% of
the scores for the cut off point. (See reference section for more information)
Float in ``[0, 1]`` that determines which percentage of data
is considered "normal". ``0.5`` results in the stray algorithm
to search only the upper 50% of the scores for the cut off
point. (See reference section for more information)
alpha :
Level of significance by which it is tested, if a score might be drawn from
another distribution than the majority of the data.
Level of significance by which it is tested, if a score might
be drawn from another distribution than the majority of the data.
References
----------
@@ -373,36 +454,36 @@ class OutliersMixin:
"""
scores = self._data[field].dropna()
if window is None:
window = len(scores)
if not isinstance(window, int):
validateFrequency(window, "window")
validateMinPeriods(min_periods)
validateValueBounds(iter_start, "iter_start", left=0, right=1, closed="both")
if scores.empty:
return self
if not window:
window = len(scores)
if isinstance(window, str):
if isinstance(window, int):
s = pd.Series(data=np.arange(0, len(scores)), index=scores.index)
s = s.transform(lambda x: int(np.floor(x / window)))
partitions = scores.groupby(s)
else: # pd.Timedelta pd.DateOffset or str
partitions = scores.groupby(pd.Grouper(freq=window))
else:
grouper_series = pd.Series(
data=np.arange(0, len(scores)), index=scores.index
)
grouper_series = grouper_series.transform(
lambda x: int(np.floor(x / window))
)
partitions = scores.groupby(grouper_series)
# calculate flags for every window
for _, partition in partitions:
if partition.empty | (len(partition) < min_periods):
continue
sample_size = len(partition)
if partition.empty or sample_size < min_periods:
continue
sorted_i = partition.values.argsort()
resids = partition.values[sorted_i]
gaps = np.append(0, np.diff(resids))
tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
tail_size = int(max(min(np.floor(sample_size / 4), 50), 2))
tail_indices = np.arange(2, tail_size + 1)
i_start = int(max(np.floor(sample_size * iter_start), 1) + 1)
@@ -448,18 +529,19 @@ class OutliersMixin:
**kwargs,
) -> "SaQC":
"""
The algorithm implements a 3-step outlier detection procedure for simultaneously
flagging of higher dimensional data (dimensions > 3).
The algorithm implements a 3-step outlier detection procedure for
simultaneously flagging of higher dimensional data (dimensions > 3).
In [1], the procedure is introduced and exemplified with an application on hydrological
data. See the notes section for an overview over the algorithms basic steps.
In [1], the procedure is introduced and exemplified with an application on
hydrological data. See the notes section for an overview over the algorithms
basic steps.
Parameters
----------
trafo : default identity
Transformation to be applied onto every column before scoring. For more fine-grained
control, the data could also be transformed before :py:meth:`~saqc.SaQC.flagMVScores`
is called.
trafo :
Transformation to be applied onto every column before scoring. For more
fine-grained control, the data could also be transformed before
:py:meth:`~saqc.SaQC.flagMVScores` is called.
alpha :
Level of significance by which it is tested, if an observations score might
@@ -468,48 +550,52 @@ class OutliersMixin:
n :
Number of neighbors included in the scoring process for every datapoint.
func : default sum
Function that aggregates a value's k-smallest distances, returning a scalar score.
func :
Function that aggregates a value's k-smallest distances, returning a
scalar score.
iter_start :
Value in ``[0,1]`` that determines which percentage of data is considered
"normal". 0.5 results in the threshing algorithm to search only the upper 50%
of the scores for the cut off point. (See reference section for more
"normal". 0.5 results in the threshing algorithm to search only the upper
50% of the scores for the cut-off point. (See reference section for more
information)
window :
Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines the
size of the data partitions, the data is decomposed into. Each partition is checked
seperately for outliers.
Either given as an Offset String, denoting the windows temporal extension or
as an integer, denoting the windows number of periods. ``NaN`` also count as periods.
If ``None``, all data points share the same scoring window, which than equals the whole
data.
Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines
the size of the data partitions, the data is decomposed into. Each
partition is checked separately for outliers. Either given as an Offset
String, denoting the windows temporal extension or as an integer,
denoting the windows number of periods. ``NaN`` also count as periods. If
``None``, all data points share the same scoring window, which than
equals the whole data.
min_periods :
Only effective if :py:attr:`threshing` is set to ``'stray'`` and :py:attr:`partition` is an integer.
Minimum number of periods per :py:attr:`partition` that have to be present for a valid outlier
Only effective if :py:attr:`threshing` is set to ``'stray'`` and
:py:attr:`partition` is an integer. Minimum number of periods per
:py:attr:`partition` that have to be present for a valid outlier
detection to be made in this partition.
stray_range :
If not ``None``, it is tried to reduce the stray result onto single outlier components
of the input :py:attr:`field`. The offset string denotes the range of the
temporal surrounding to include into the MAD testing while trying to reduce
flags.
If not ``None``, it is tried to reduce the stray result onto single
outlier components of the input :py:attr:`field`. The offset string
denotes the range of the temporal surrounding to include into the MAD
testing while trying to reduce flags.
drop_flagged :
Only effective when :py:attr:`stray_range` is not ``None``. Whether or not to drop flagged
values from the temporal surroundings.
Only effective when :py:attr:`stray_range` is not ``None``. Whether or
not to drop flagged values from the temporal surroundings.
thresh :
Only effective when :py:attr:`stray_range` is not ``None``. The 'critical' value,
controlling wheather the MAD score is considered referring to an outlier or
not. Higher values result in less rigid flagging. The default value is widely
considered apropriate in the literature.
Only effective when :py:attr:`stray_range` is not ``None``. The
'critical' value, controlling whether the MAD score is considered
referring to an outlier or not. Higher values result in less rigid
flagging. The default value is widely considered appropriate in the
literature.
min_periods_r :
Only effective when :py:attr:`stray_range` is not ``None``. Minimum number of measurements
necessary in an interval to actually perform the reduction step.
Only effective when :py:attr:`stray_range` is not ``None``. Minimum
number of measurements necessary in an interval to actually perform the
reduction step.
Notes
-----
@@ -521,33 +607,33 @@ class OutliersMixin:
(a) make them comparable and
(b) make outliers more stand out.
This step is usually subject to a phase of research/try and error. See [1] for more
details.
This step is usually subject to a phase of research/try and error. See [1]
for more details.
Note, that the data transformation as an built-in step of the algorithm, will likely
get deprecated in the future. Its better to transform the data in a processing
step, preceeding the multivariate flagging process. Also, by doing so, one gets
mutch more control and variety in the transformation applied, since the `trafo`
parameter only allows for application of the same transformation to all of the
variables involved.
Note, that the data transformation as a built-in step of the algorithm,
will likely get deprecated in the future. It's better to transform the data in
a processing step, preceding the multivariate flagging process. Also,
by doing so, one gets much more control and variety in the transformation
applied, since the `trafo` parameter only allows for application of the same
transformation to all the variables involved.
2. scoring
Every observation gets assigned a score depending on its k nearest neighbors. See
the `scoring_method` parameter description for details on the different scoring
methods. Furthermore [1] may give some insight in the pro and cons of the
different methods.
Every observation gets assigned a score depending on its k nearest neighbors.
See the `scoring_method` parameter description for details on the different
scoring methods. Furthermore, [1] may give some insight in the pro and cons of
the different methods.
3. threshing
The gaps between the (greatest) scores are tested for beeing drawn from the same
distribution as the majority of the scores. If a gap is encountered, that,
with sufficient significance, can be said to not be drawn from the same
distribution as the one all the smaller gaps are drawn from, than the observation
belonging to this gap, and all the observations belonging to gaps larger then
this gap, get flagged outliers. See description of the `threshing` parameter for
more details. Although [1] gives a fully detailed overview over the `stray`
algorithm.
The gaps between the (greatest) scores are tested for being drawn from the
same distribution as the majority of the scores. If a gap is encountered,
that, with sufficient significance, can be said to not be drawn from the same
distribution as the one all the smaller gaps are drawn from, than the
observation belonging to this gap, and all the observations belonging to gaps
larger than this gap, get flagged outliers. See description of the
`threshing` parameter for more details. Although [1] gives a fully detailed
overview over the `stray` algorithm.
References
----------
@@ -555,7 +641,6 @@ class OutliersMixin:
Anomaly Detection in High-Dimensional Data,
Journal of Computational and Graphical Statistics, 30:2, 360-374,
DOI: 10.1080/10618600.2020.1807997
"""
# parameter deprecations
@@ -563,8 +648,8 @@ class OutliersMixin:
if "partition" in kwargs:
warnings.warn(
"""
The parameter `partition` is deprecated and will be removed in version 3.0 of saqc.
Please us the parameter `window` instead.'
The parameter `partition` is deprecated and will be removed in version
3.0 of saqc. Please use the parameter `window` instead.
""",
DeprecationWarning,
)
@@ -573,8 +658,8 @@ class OutliersMixin:
if "partition_min" in kwargs:
warnings.warn(
"""
The parameter `partition_min` is deprecated and will be removed in version 3.0 of saqc.
Please us the parameter `min_periods` instead.'
The parameter `partition_min` is deprecated and will be removed in
version 3.0 of saqc. Please use the parameter `min_periods` instead.
""",
DeprecationWarning,
)
@@ -583,27 +668,32 @@ class OutliersMixin:
if min_periods != 11:
warnings.warn(
"""
You were setting a customary value for the `min_periods` parameter: note that this parameter
does no longer refer to the reduction interval length, but now controls the number of periods
having to be present in an interval of size `window` (deprecated:`partition`) for the algorithm to be
performed in that interval.
To alter the size of the reduction window, use the parameter `min_periods_r`. Changes readily apply.
Warning will be removed in saqc version 3.0.
You were setting a customary value for the `min_periods` parameter:
note that this parameter does no longer refer to the reduction interval
length, but now controls the number of periods having to be present in
an interval of size `window` (deprecated:`partition`) for the algorithm
to be performed in that interval.
To alter the size of the reduction window, use the parameter
`min_periods_r`. Changes readily apply.
This warning will be removed in saqc version 3.0.
""",
DeprecationWarning,
)
# Hint: checking is delegated to the called functions
fields = toSequence(field)
qc = self
fields_ = []
for f in fields:
field_ = str(uuid.uuid4())
self = self.copyField(field=f, target=field_)
self = self.transform(field=field_, func=trafo, freq=window)
qc = qc.copyField(field=f, target=field_)
qc = qc.transform(field=field_, func=trafo, freq=window)
fields_.append(field_)
knn_field = str(uuid.uuid4())
self = self.assignKNNScore(
qc = qc.assignKNNScore(
field=fields_,
target=knn_field,
n=n,
@@ -614,9 +704,9 @@ class OutliersMixin:
**kwargs,
)
for field_ in fields_:
self = self.dropField(field_)
qc = qc.dropField(field_)
self = self.flagByStray(
qc = qc.flagByStray(
field=knn_field,
freq=window,
min_periods=min_periods,
@@ -626,11 +716,11 @@ class OutliersMixin:
**kwargs,
)
self._data, self._flags = _evalStrayLabels(
data=self._data,
qc._data, qc._flags = _evalStrayLabels(
data=qc._data,
field=knn_field,
target=fields,
flags=self._flags,
flags=qc._flags,
reduction_range=stray_range,
reduction_drop_flagged=drop_flagged,
reduction_thresh=thresh,
@@ -638,7 +728,7 @@ class OutliersMixin:
flag=flag,
**kwargs,
)
return self.dropField(knn_field)
return qc.dropField(knn_field)
@flagging()
def flagRaise(
@@ -655,17 +745,17 @@ class OutliersMixin:
**kwargs,
) -> "SaQC":
"""
The function flags raises and drops in value courses, that exceed a certain threshold
within a certain timespan.
The function flags raises and drops in value courses, that exceed a certain
threshold within a certain timespan.
The parameter variety of the function is owned to the intriguing case of values, that
"return" from outlierish or anomalious value levels and thus exceed the threshold,
while actually being usual values.
The parameter variety of the function is owed to the intriguing case of
values that "return" from outlierish or anomalous value levels and thus
exceed the threshold, while actually being usual values.
Notes
-----
The dataset is NOT supposed to be harmonized to a time series with an equidistant
requency grid.
The dataset is NOT supposed to be harmonized to a time series with an
equidistant frequency grid.
The value :math:`x_{k}` of a time series :math:`x` with associated
timestamps :math:`t_i`, is flagged a raise, if:
@@ -674,35 +764,38 @@ class OutliersMixin:
:py:attr:`raise_window` range, so that
:math:`M = |x_k - x_s | >` :py:attr:`thresh` :math:`> 0`
2. The weighted average :math:`\\mu^{*}` of the values, preceding :math:`x_{k}`
within :py:attr:`average_window` range indicates, that :math:`x_{k}` does not
return from an "outlierish" value course, meaning that
2. The weighted average :math:`\\mu^{*}` of the values, preceding
:math:`x_{k}` within :py:attr:`average_window` range indicates,
that :math:`x_{k}` does not return from an "outlierish" value
course, meaning that
:math:`x_k > \\mu^* + ( M` / :py:attr:`raise_factor` :math:`)`
3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}` is checked for being
sufficiently divergent from its very predecessor :math:`x_{k-1}`, meaning that, it
is additionally checked if:
3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}`
is checked for being sufficiently divergent from its very predecessor
:math:`x_{k-1}`, meaning that, it is additionally checked if:
* :math:`x_k - x_{k-1} >` :py:attr:`slope`
* :math:`t_k - t_{k-1} >` :py:attr:`weight` :math:`\\times` :py:attr:`freq`
Parameters
----------
thresh :
The threshold, for the total rise (:py:attr:`thresh` ``> 0``), or total drop
(:py:attr:`thresh` ``< 0``), value courses must not exceed within a timespan
of length :py:attr:`raise_window`.
The threshold, for the total rise (:py:attr:`thresh` ``> 0``),
or total drop (:py:attr:`thresh` ``< 0``), value courses must
not exceed within a timespan of length :py:attr:`raise_window`.
raise_window :
An offset string, determining the timespan, the rise/drop thresholding refers
to. Window is inclusively defined.
An offset string, determining the timespan, the rise/drop
thresholding refers to. Window is inclusively defined.
freq :
An offset string, determining the frequency, the timeseries to flag is supposed
to be sampled at. The window is inclusively defined.
An offset string, determining the frequency, the timeseries
to flag is supposed to be sampled at. The window is inclusively
defined.
average_window :
See condition (2) of the description given in the Notes. Window is
inclusively defined, defaults to 1.5 times the size of :py:attr:`raise_window`.
See condition (2) of the description given in the Notes. Window
is inclusively defined, defaults to 1.5 times the size of
:py:attr:`raise_window`.
raise_factor :
See condition (2).
@@ -713,6 +806,10 @@ class OutliersMixin:
weight :
See condition (3).
"""
validateWindow(raise_window, "raise_window", allow_int=False)
validateWindow(freq, "freq", allow_int=False)
validateWindow(average_window, "average_window", allow_int=False, optional=True)
# prepare input args
dataseries = self._data[field].dropna()
raise_window_td = pd.Timedelta(raise_window)
@@ -834,15 +931,15 @@ class OutliersMixin:
----------
[1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
"""
msg = """
The method `flagMAD` is deprecated and will be removed in verion 3.0 of saqc.
To achieve the same behavior use:
"""
call = f"qc.flagZScore(field={field}, window={window}, method='modified', thresh={z}, min_residuals={min_residuals}, min_periods={min_periods}, center={center})"
warnings.warn(f"{msg}`{call}`", DeprecationWarning)
self = self.flagZScore(
warnings.warn(
f"The method `flagMAD` is deprecated and will be removed in "
"version 3.0 of saqc. To achieve the same behavior use: "
f"`qc.flagZScore(field={field}, window={window}, method='modified', "
f"thresh={z}, min_residuals={min_residuals}, min_periods={min_periods}, "
f"center={center})`",
DeprecationWarning,
)
return self.flagZScore(
field,
window=window,
thresh=z,
@@ -856,8 +953,6 @@ class OutliersMixin:
flag=flag,
)
return self
@flagging()
def flagOffset(
self: "SaQC",
@@ -1004,10 +1099,12 @@ class OutliersMixin:
>>> qc = qc.flagOffset("data", thresh=2, thresh_relative=-.5, tolerance=1.5, window='6H')
>>> qc.plot('data') # doctest: +SKIP
"""
if (thresh is None) and (thresh_relative is None):
validateWindow(window)
if thresh is None and thresh_relative is None:
raise ValueError(
"At least one of parameters 'thresh' and 'thresh_relative' has to be given. Got 'thresh'=None, "
"'thresh_relative'=None instead."
"At least one of parameters 'thresh' and 'thresh_relative' "
"has to be given. Got 'thresh'=None, 'thresh_relative'=None "
"instead."
)
if thresh is None:
thresh = 0
@@ -1110,6 +1207,10 @@ class OutliersMixin:
[1] https://en.wikipedia.org/wiki/Grubbs%27s_test_for_outliers
"""
validateWindow(window)
validateFraction(alpha, "alpha")
validateMinPeriods(min_periods, optional=False)
datcol = self._data[field].copy()
rate = getFreqDelta(datcol.index)
@@ -1215,25 +1316,29 @@ class OutliersMixin:
----------
[1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
"""
msg = """
The method `flagCrossStatistics` is deprecated and will be removed in verion 3.0 of saqc.
To achieve the same behavior use:
"""
new_method_string = {
"modZscore": "modified",
"Zscore": "standard",
np.mean: "standard",
np.median: "modified",
}
call = f"qc.flagZScore(field={field}, window=1, method={new_method_string[method]}, thresh={thresh}, axis=1)"
warnings.warn(f"{msg}`{call}`", DeprecationWarning)
call = (
f"qc.flagZScore(field={field}, window=1, "
f"method={new_method_string[method]}, "
f"thresh={thresh}, axis=1)"
)
warnings.warn(
f"The method `flagCrossStatistics` is deprecated and will "
f"be removed in version 3.0 of saqc. To achieve the same behavior "
f"use: `{call}`",
DeprecationWarning,
)
return self.flagZScore(
field={field},
field=field,
window=1,
method={new_method_string[method]},
thresh={thresh},
method=new_method_string[method],
thresh=thresh,
axis=1,
flag=flag,
)
@@ -1347,21 +1452,22 @@ class OutliersMixin:
method = "modified"
else:
raise ValueError(
"Support for scoring with functions not similar to either Zscore or modified Zscore is "
"not supported anymore"
"Scoring with functions not similar to either Zscore or "
"modified Zscore is no longer supported"
)
dat = self._data[field].to_pandas(how="outer")
validateChoice(method, "method", ["standard", "modified"])
validateWindow(window, optional=True)
validateMinPeriods(min_periods)
if min_residuals is None:
min_residuals = 0
min_residuals = min_residuals or 0
min_periods = min_periods or 0
dat = self._data[field].to_pandas(how="outer")
if dat.empty:
return self
if min_periods is None:
min_periods = 0
if window is None:
if dat.notna().sum().sum() >= min_periods:
if method == "standard":
@@ -1382,6 +1488,7 @@ class OutliersMixin:
)
else:
return self
else: # window is not None
if axis == 0:
if method == "standard":
@@ -1437,7 +1544,7 @@ def _evalStrayLabels(
field: str,
flags: Flags,
target: Sequence[str],
reduction_range: Optional[str] = None,
reduction_range: str | None = None,
reduction_drop_flagged: bool = False, # TODO: still a case ?
reduction_thresh: float = 3.5,
reduction_min_periods: int = 1,
Loading