From 775d401e1ddea9299277a8e5703b08b3b9289d83 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Mon, 3 Jul 2023 15:48:08 +0200 Subject: [PATCH] Check inputs --- CHANGELOG.md | 5 + saqc/funcs/breaks.py | 6 + saqc/funcs/changepoints.py | 99 ++++-- saqc/funcs/constants.py | 43 ++- saqc/funcs/curvefit.py | 25 +- saqc/funcs/drift.py | 118 ++++--- saqc/funcs/flagtools.py | 123 ++++--- saqc/funcs/interpolation.py | 199 +++++++----- saqc/funcs/noise.py | 29 +- saqc/funcs/outliers.py | 627 +++++++++++++++++++++--------------- saqc/funcs/pattern.py | 74 ++--- saqc/funcs/resampling.py | 60 ++-- saqc/funcs/residuals.py | 2 + saqc/funcs/rolling.py | 8 + saqc/funcs/scores.py | 34 +- saqc/funcs/tools.py | 3 + saqc/lib/checking.py | 352 ++++++++++++++++++++ saqc/lib/exceptions.py | 50 --- saqc/lib/rolling.py | 26 +- saqc/lib/tools.py | 140 ++++---- saqc/lib/ts_operators.py | 82 +++-- tests/lib/test_tools.py | 15 +- 22 files changed, 1390 insertions(+), 730 deletions(-) create mode 100644 saqc/lib/checking.py delete mode 100644 saqc/lib/exceptions.py diff --git a/CHANGELOG.md b/CHANGELOG.md index dbd9594e6..4bb4402a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,11 +9,16 @@ SPDX-License-Identifier: GPL-3.0-or-later ## Unreleased [List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.0...develop) ### Added +- added checks and unified error message for common inputs. ### Changed - pin pandas to versions >= 2.0 +- parameter `fill_na` of `SaQC.flagUniLOF` and `SaQC.assignUniLOF` is now of type + `bool` instead of one of `[None, "linear"]` ### Removed - removed deprecated `DictOfSeries.to_df` ### Fixed +- Bug in `SaQC.assignChangePointCluster` and `SaQC.flagChangePoints`: A tuple passed + to `min_period` was only recognised if also `window` was a tuple. ### Deprecated ## [2.4.1](https://git.ufz.de/rdm-software/saqc/-/tags/v2.4.1) - 2023-06-22 diff --git a/saqc/funcs/breaks.py b/saqc/funcs/breaks.py index d37b0999d..9648a4d1e 100644 --- a/saqc/funcs/breaks.py +++ b/saqc/funcs/breaks.py @@ -25,6 +25,7 @@ import pandas as pd from saqc import BAD, FILTER_ALL from saqc.core import flagging, register from saqc.funcs.changepoints import _getChangePoints +from saqc.lib.checking import validateMinPeriods, validateWindow from saqc.lib.tools import isunflagged if TYPE_CHECKING: @@ -97,6 +98,8 @@ class BreaksMixin: 3. None of the :math:`x_j` with :math:`0 < t_j - t_(k+n) <` `gap_window`, is valid (succeding gap). """ + validateWindow(gap_window, name="gap_window", allow_int=False) + validateWindow(group_window, name="group_window", allow_int=False) dat = self._data[field].dropna() if dat.empty: @@ -180,6 +183,9 @@ class BreaksMixin: Jumps that are not distanced to each other by more than three fourth (3/4) of the selected window size, will not be detected reliably. 
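Review note: `saqc/lib/checking.py` is added by this patch (352 lines) but its body is not shown in this excerpt. A minimal sketch of the two most used helpers, with signatures inferred from the call sites (`validateWindow(gap_window, name="gap_window", allow_int=False)`, `validateMinPeriods(min_periods, minimum=2, optional=False)`) — an approximation, not the shipped implementation:

```python
# Approximate semantics of two helpers from the new saqc/lib/checking.py,
# inferred from the call sites in this patch -- NOT the shipped code.
import pandas as pd

def validateWindow(value, name="window", allow_int=True, index=None):
    # windows are offset strings ("3D", "20min", ...) or, where allowed,
    # positive integers counting periods
    if isinstance(value, int) and not isinstance(value, bool):
        if not allow_int or value <= 0:
            raise ValueError(f"{name!r} must be a positive integer or an "
                             f"offset string, got {value!r}")
        return
    if not isinstance(value, str):
        raise TypeError(f"{name!r} must be an offset string, got {value!r}")
    try:
        pd.Timedelta(value)  # offset strings like "3D" parse to a Timedelta
    except ValueError:
        raise ValueError(f"{name!r} is not a valid offset string: {value!r}")
    if index is not None and not pd.api.types.is_datetime64_any_dtype(index):
        raise ValueError(f"an offset {name!r} requires a datetime index")

def validateMinPeriods(value, name="min_periods", minimum=0, optional=True):
    # non-negative integer (or None, where the parameter is optional)
    if value is None and optional:
        return
    if not isinstance(value, int) or value < minimum:
        raise ValueError(f"{name!r} must be an integer >= {minimum}, got {value!r}")
```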
""" + validateWindow(window, allow_int=False) + validateMinPeriods(min_periods) + mask = _getChangePoints( data=self._data[field], stat_func=lambda x, y: np.abs(np.mean(x) - np.mean(y)), diff --git a/saqc/funcs/changepoints.py b/saqc/funcs/changepoints.py index 8037b4345..ff706f691 100644 --- a/saqc/funcs/changepoints.py +++ b/saqc/funcs/changepoints.py @@ -7,7 +7,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import typing from typing import TYPE_CHECKING, Callable, Literal, Tuple import numpy as np @@ -15,6 +14,14 @@ import pandas as pd from saqc import BAD, UNFLAGGED from saqc.core import DictOfSeries, Flags, flagging, register +from saqc.lib.checking import ( + isInBounds, + validateCallable, + validateChoice, + validateMinPeriods, + validateValueBounds, + validateWindow, +) if TYPE_CHECKING: from saqc import SaQC @@ -43,11 +50,9 @@ class ChangepointsMixin: Parameters ---------- stat_func : - * If callable: A function that assigns a scalar value to every twin window. The backward-facing - window content will be passed as the first array, the forward-facing window - content as the second. - * If string: The respective statistic will be calculated for both the windows and the absolute difference of - the results will be returned. + A function that assigns a value to every twin window. The backward-facing + window content will be passed as the first array, the forward-facing window + content as the second. thresh_func : A function that determines the value level, exceeding wich qualifies a @@ -88,6 +93,11 @@ class ChangepointsMixin: The default reduction function just selects the value that maximizes the `stat_func`. """ + validateCallable(stat_func, "stat_func") + validateCallable(thresh_func, "thresh_func") + validateCallable(reduce_func, "reduce_func") + # Hint: windows are checked in _getChangePoints + mask = _getChangePoints( data=self._data[field], stat_func=stat_func, @@ -169,6 +179,11 @@ class ChangepointsMixin: model_by_resids : If True, the results of `stat_funcs` are written, otherwise the regime labels. 
""" + validateCallable(stat_func, "stat_func") + validateCallable(thresh_func, "thresh_func") + validateCallable(reduce_func, "reduce_func") + # Hint: windows are checked in _getChangePoints + rtyp = "residual" if model_by_resids else "cluster" cluster = _getChangePoints( data=self._data[field], @@ -195,19 +210,44 @@ def _getChangePoints( min_periods: int | Tuple[int, int], reduce_window: str | None = None, reduce_func: Callable[[np.ndarray, np.ndarray], float] = lambda x, _: x.argmax(), - result: typing.Literal["cluster", "residual", "mask"] = "mask", + result: Literal["cluster", "residual", "mask"] = "mask", ) -> pd.Series: + """ + TODO: missing docstring + + Parameters + ---------- + data : + stat_func : + thresh_func : + window : + min_periods : + reduce_window : + reduce_func : + result : + + Returns + ------- + """ + validateChoice(result, "result", ["cluster", "residual", "mask"]) + orig_index = data.index data = data.dropna() # implicit copy if isinstance(window, (list, tuple)): bwd_window, fwd_window = window + validateWindow(fwd_window, name="window[0]", allow_int=False) + validateWindow(bwd_window, name="window[1]", allow_int=False) else: + validateWindow(window, name="window", allow_int=False) bwd_window = fwd_window = window - if isinstance(window, (list, tuple)): + if isinstance(min_periods, (list, tuple)): bwd_min_periods, fwd_min_periods = min_periods + validateMinPeriods(bwd_min_periods, "min_periods[0]") + validateMinPeriods(fwd_min_periods, "min_periods[1]") else: + validateMinPeriods(min_periods) bwd_min_periods = fwd_min_periods = min_periods if reduce_window is None: @@ -216,13 +256,7 @@ def _getChangePoints( + pd.Timedelta(fwd_window).total_seconds() ) reduce_window = f"{s}s" - - for window in [fwd_window, bwd_window, reduce_window]: - if isinstance(window, int): - raise TypeError( - "all parameter defining a size of a window " - "must be time-offsets, not integer." - ) + validateWindow(reduce_window, name="reduce_window", allow_int=False) # find window bounds arrays.. num_index = pd.Series(range(len(data)), index=data.index, dtype=int) @@ -258,24 +292,23 @@ def _getChangePoints( det_index = masked_index[result_arr] detected = pd.Series(True, index=det_index) - if reduce_window: - length = len(detected) - - # find window bounds arrays - num_index = pd.Series(range(length), index=detected.index, dtype=int) - rolling = num_index.rolling(window=reduce_window, closed="both", center=True) - start = rolling.min().to_numpy(dtype=int) - end = (rolling.max() + 1).to_numpy(dtype=int) - - detected = _reduceCPCluster( - stat_arr[result_arr], - thresh_arr[result_arr], - start, - end, - reduce_func, - length, - ) - det_index = det_index[detected] + + length = len(detected) + # find window bounds arrays + num_index = pd.Series(range(length), index=detected.index, dtype=int) + rolling = num_index.rolling(window=reduce_window, closed="both", center=True) + start = rolling.min().to_numpy(dtype=int) + end = (rolling.max() + 1).to_numpy(dtype=int) + + detected = _reduceCPCluster( + stat_arr[result_arr], + thresh_arr[result_arr], + start, + end, + reduce_func, + length, + ) + det_index = det_index[detected] # The changepoint is the point "after" the change. 
# So the detected index has to be shifted by one diff --git a/saqc/funcs/constants.py b/saqc/funcs/constants.py index a4bbf6885..a3af13d61 100644 --- a/saqc/funcs/constants.py +++ b/saqc/funcs/constants.py @@ -16,6 +16,7 @@ import pandas as pd from saqc import BAD from saqc.core import flagging +from saqc.lib.checking import validateMinPeriods, validateValueBounds, validateWindow from saqc.lib.rolling import removeRollingRamps from saqc.lib.tools import getFreqDelta, statPass from saqc.lib.ts_operators import varQC @@ -50,30 +51,22 @@ class ConstantsMixin: thresh : Maximum total change allowed per window. + min_periods : + Minimum number of observations in window required to generate + a flag. Must be an integer greater or equal `2`, because a + single value would always be considered constant. + Defaults to `2`. + window : Size of the moving window. This is the number of observations used for calculating the statistic. Each window will be a fixed size. - If its an offset then this will be the time period of each window. + If it is an offset then this will be the time period of each window. Each window will be a variable sized based on the observations included in the time-period. """ - if not isinstance(window, (str, int)): - raise TypeError("window must be offset string or int.") - d: pd.Series = self._data[field] - - if not isinstance(window, int) and not pd.api.types.is_datetime64_any_dtype( - d.index - ): - raise ValueError( - f"A time based value for 'window' is only possible for variables " - f"with a datetime based index, but variable '{field}' has an index " - f"of dtype {d.index.dtype}. Use an integer window instead." - ) - - # min_periods=2 ensures that at least two non-nan values are present - # in each window and also min() == max() == d[i] is not possible. - min_periods = max(min_periods, 2) + validateWindow(window, index=d.index) + validateMinPeriods(min_periods, minimum=2, optional=False) # 1. find starting points of consecutive constant values as a boolean mask # 2. fill the whole window with True's @@ -133,21 +126,27 @@ class ConstantsMixin: maxna_group : Same as `maxna` but for consecutive NaNs. 
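Review note on the `_getChangePoints` hunk above: `bwd_window, fwd_window = window` unpacks `window[0]` into the backward window, but the validation names are attached the other way around, so the error messages would point at the wrong tuple element. Presumably intended:

```python
# Suggested relabeling for the tuple branch: window[0] is unpacked into
# bwd_window, so the names in the error messages should follow the
# unpacking order.
if isinstance(window, (list, tuple)):
    bwd_window, fwd_window = window
    validateWindow(bwd_window, name="window[0]", allow_int=False)
    validateWindow(fwd_window, name="window[1]", allow_int=False)
```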
""" - dataseries = self._data[field] - delta = getFreqDelta(dataseries.index) + d: pd.Series = self._data[field] + validateWindow(window, allow_int=False, index=d.index) + window = pd.Timedelta(window) + + delta = getFreqDelta(d.index) if not delta: raise IndexError("Timeseries irregularly sampled!") if maxna is None: maxna = np.inf - if maxna_group is None: maxna_group = np.inf + validateValueBounds(maxna, "maxna", 0, closed="both", strict_int=True) + validateValueBounds( + maxna_group, "maxna_group", 0, closed="both", strict_int=True + ) + min_periods = int(np.ceil(pd.Timedelta(window) / pd.Timedelta(delta))) - window = pd.Timedelta(window) to_set = statPass( - dataseries, + d, lambda x: varQC(x, maxna, maxna_group), window, thresh, diff --git a/saqc/funcs/curvefit.py b/saqc/funcs/curvefit.py index f79250808..ec385ba46 100644 --- a/saqc/funcs/curvefit.py +++ b/saqc/funcs/curvefit.py @@ -7,14 +7,19 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from typing import TYPE_CHECKING, Tuple, Union +from typing import TYPE_CHECKING, Literal, Tuple, Union import numpy as np import pandas as pd -from typing_extensions import Literal from saqc.core import DictOfSeries, Flags, register -from saqc.lib.tools import getFreqDelta +from saqc.lib.checking import ( + validateChoice, + validateMinPeriods, + validateValueBounds, + validateWindow, +) +from saqc.lib.tools import extractLiteral, getFreqDelta from saqc.lib.ts_operators import ( butterFilter, polyRoller, @@ -89,6 +94,9 @@ class CurvefitMixin: Passing 0, disables the feature and will result in over-fitting for too sparse windows. """ + validateWindow(window) + validateMinPeriods(min_periods) + validateValueBounds(order, "order", left=0, strict_int=True) self._data, self._flags = _fitPolynomial( data=self._data, field=field, @@ -129,10 +137,10 @@ class CurvefitMixin: Fill method to be applied on the data before filtering (butterfilter cant handle ''np.nan''). See documentation of pandas.Series.interpolate method for details on the methods associated with the different keywords. - - filter_type : - The type of filter. Default is ‘lowpass’. 
""" + validateValueBounds(filter_order, "filter_order", strict_int=True) + validateChoice(fill_method, fill_method, FILL_METHODS) + self._data[field] = butterFilter( self._data[field], cutoff=cutoff, @@ -154,6 +162,11 @@ def _fitPolynomial( **kwargs, ) -> Tuple[DictOfSeries, Flags]: # TODO: some (rather large) parts are functional similar to saqc.funcs.rolling.roll + + validateWindow(window) + validateValueBounds(order, "order", 0, strict_int=True) + validateMinPeriods(min_periods) + if data[field].empty: return data, flags diff --git a/saqc/funcs/drift.py b/saqc/funcs/drift.py index 98c2b3352..a0d2ef1d9 100644 --- a/saqc/funcs/drift.py +++ b/saqc/funcs/drift.py @@ -22,9 +22,15 @@ from typing_extensions import Literal from saqc import BAD from saqc.core import DictOfSeries, Flags, flagging, register from saqc.funcs.changepoints import _getChangePoints +from saqc.lib.checking import ( + validateCallable, + validateChoice, + validateFrequency, + validateValueBounds, + validateWindow, +) from saqc.lib.docs import DOC_TEMPLATES -from saqc.lib.exceptions import ParameterOutOfBounds -from saqc.lib.tools import detectDeviants, filterKwargs, isInBounds, toSequence +from saqc.lib.tools import detectDeviants, filterKwargs, toSequence from saqc.lib.ts_operators import expDriftModel, linearDriftModel from saqc.lib.types import CurveFitter @@ -36,7 +42,7 @@ LinkageString = Literal[ "single", "complete", "average", "weighted", "centroid", "median", "ward" ] -MODELDICT = {"linear": linearDriftModel, "exponential": expDriftModel} +DRIFT_MODELS = {"linear": linearDriftModel, "exponential": expDriftModel} def cityblock(x: np.ndarray | pd.Series, y: np.ndarray | pd.Series) -> np.ndarray: @@ -55,7 +61,7 @@ class DriftMixin: def flagDriftFromNorm( self: "SaQC", field: Sequence[str], - window: str, + window: str, # TODO: this should be named 'freq' spread: float, frac: float = 0.5, metric: Callable[ @@ -132,19 +138,20 @@ class DriftMixin: Introduction to Hierarchical clustering: [2] https://en.wikipedia.org/wiki/Hierarchical_clustering """ - if not isInBounds(frac, (0.5, 1), closed="both"): - raise ParameterOutOfBounds(frac, "frac", (0.5, 1), "both") + validateValueBounds(frac, "frac", left=0, right=1, closed="both") + validateCallable(metric, "metric") + validateChoice(method, "method", LinkageString) if "freq" in kwargs: warnings.warn( - """ - The parameter `freq` is deprecated and will be removed in version 3.0 of saqc. - Please us the parameter `window` instead.' - """, + "The parameter `freq` is deprecated and will be " + "removed in version 3.0 of saqc. Please us the " + "parameter `window` instead.'", DeprecationWarning, ) window = kwargs["freq"] + validateFrequency(window, "window") fields = toSequence(field) data = self._data[fields].to_pandas() @@ -211,9 +218,10 @@ class DriftMixin: default, since it corresponds to the averaged value distance, two data sets have (as opposed by euclidean, for example). """ + validateFrequency(freq, "freq") + validateCallable(metric, "metric") fields = toSequence(field) - if reference not in fields: fields.append(reference) @@ -314,13 +322,14 @@ class DriftMixin: ``expDriftModel`` and ``linearDriftModel``. 
""" - # extract model func: if isinstance(model, str): - if model not in MODELDICT: + model = DRIFT_MODELS.get(model, None) + if model is None: raise ValueError( - f"invalid model '{model}', choose one of '{MODELDICT.keys()}'" + f"unknown model {model!r}, available models: {list(DRIFT_MODELS)}" ) - model = MODELDICT[model] + validateCallable(model, "model") + validateValueBounds(cal_range, "cal_range", left=0, strict_int=True) # 1: extract fit intervals: if self._data[maintenance_field].empty: @@ -403,14 +412,19 @@ class DriftMixin: tolerance : If an offset string is passed, a data chunk of length `offset` right at the - start and right at the end is ignored when fitting the model. This is to account - for the unreliability of data near the changepoints of regimes. + start and right at the end is ignored when fitting the model. This is to + account for the unreliability of data near the changepoints of regimes. + Defaults to None. epoch : If True, use "seconds from epoch" as x input to the model func, instead of "seconds from regime start". """ + validateCallable(model, "model") + if tolerance is not None: + validateWindow(tolerance, name="tolerance", allow_int=False) + cluster_ser = self._data[cluster_field] unique_successive = pd.unique(cluster_ser.values) data_ser = self._data[field] @@ -419,7 +433,6 @@ class DriftMixin: x_dict = {} x_mask = {} if tolerance is not None: - # get seconds tolerance = pd.Timedelta(tolerance).total_seconds() for label, regime in regimes: if epoch is False: @@ -512,33 +525,34 @@ class DriftMixin: This is to account for the unrelyability of data near the changepoints of regimes. """ - # Hint: This whole function does not set any flags - + # Hint: + # - This whole function does not set any flags + # - Checking is delegated to the called functions cluster_field = field + "_CPcluster" - self = self.copyField(field, cluster_field) - self.data[cluster_field] = _getChangePoints( - data=self._data[cluster_field], + qc = self.copyField(field, cluster_field) + qc.data[cluster_field] = _getChangePoints( + data=qc._data[cluster_field], stat_func=lambda x, y: np.abs(np.mean(x) - np.mean(y)), thresh_func=lambda x, y: max_jump, window=window, min_periods=min_periods, result="cluster", ) - self._data, self._flags = _assignRegimeAnomaly( - data=self._data, + qc._data, qc._flags = _assignRegimeAnomaly( + data=qc._data, field=field, - flags=self._flags, + flags=qc._flags, cluster_field=cluster_field, spread=spread, ) - self = self.correctRegimeAnomaly( + qc = qc.correctRegimeAnomaly( field, cluster_field, lambda x, p1: np.array([p1] * x.shape[0]), tolerance=tolerance, ) - self = self.dropField(cluster_field) - return self + qc = qc.dropField(cluster_field) + return qc @flagging() def flagRegimeAnomaly( @@ -581,16 +595,21 @@ class DriftMixin: method : The linkage method for hierarchical (agglomerative) clustering of the variables. - metric : default absolute difference of means + metric : A metric function for calculating the dissimilarity between 2 regimes. - Defaults to the difference in mean. + Defaults to the absolute difference in mean. frac : - Has to be in [0,1]. Determines the minimum percentage of samples, - the "normal" group has to comprise to be the normal group actually. + The minimum percentage of samples, the "normal" group has to comprise to + actually be the normal group. Must be in the closed interval `[0,1]`, + otherwise a ValueError is raised. 
""" reserverd = ["set_cluster", "set_flags"] kwargs = filterKwargs(kwargs, reserverd) + validateChoice(method, "method", LinkageString) + validateCallable(metric, "metric") + validateValueBounds(frac, "frac", left=0, right=1, closed="both") + self._data, self._flags = _assignRegimeAnomaly( data=self._data, field=field, @@ -648,16 +667,21 @@ class DriftMixin: method : The linkage method for hierarchical (agglomerative) clustering of the variables. - metric : default absolute difference of means + metric : A metric function for calculating the dissimilarity between 2 regimes. - Defaults to the difference in mean. + Defaults to the absolute difference in mean. frac : - Has to be in [0,1]. Determines the minimum percentage of samples, - the "normal" group has to comprise to be the normal group actually. + The minimum percentage of samples, the "normal" group has to comprise to + actually be the normal group. Must be in the closed interval `[0,1]`, + otherwise a ValueError is raised. """ reserverd = ["set_cluster", "set_flags", "flag"] kwargs = filterKwargs(kwargs, reserverd) + validateChoice(method, "method", LinkageString) + validateCallable(metric, "metric") + validateValueBounds(frac, "frac", left=0, right=1, closed="both") + self._data, self._flags = _assignRegimeAnomaly( data=self._data, field=field, @@ -675,7 +699,10 @@ class DriftMixin: return self -def _driftFit(x, shift_target, cal_mean, driftModel): +def _driftFit( + x: pd.Series, shift_target: pd.Series, cal_mean: int, drift_model: callable +): + """TODO: Docstring""" x_index = x.index - x.index[0] x_data = x_index.total_seconds().values x_data = x_data / x_data[-1] if len(x_data) > 1 else x_data @@ -683,14 +710,14 @@ def _driftFit(x, shift_target, cal_mean, driftModel): origin_mean = np.mean(y_data[:cal_mean]) target_mean = np.mean(y_data[-cal_mean:]) - dataFitFunc = functools.partial(driftModel, origin=origin_mean, target=target_mean) + dataFitFunc = functools.partial(drift_model, origin=origin_mean, target=target_mean) # if drift model has free parameters: if len(inspect.getfullargspec(dataFitFunc).args) > 1: try: # try fitting free parameters fit_paras, *_ = curve_fit(dataFitFunc, x_data, y_data) data_fit = dataFitFunc(x_data, *fit_paras) - data_shift = driftModel( + data_shift = drift_model( x_data, *fit_paras, origin=origin_mean, target=shift_target ) except RuntimeError: @@ -700,7 +727,7 @@ def _driftFit(x, shift_target, cal_mean, driftModel): # when there are no free parameters in the model: else: data_fit = dataFitFunc(x_data) - data_shift = driftModel(x_data, origin=origin_mean, target=shift_target) + data_shift = drift_model(x_data, origin=origin_mean, target=shift_target) return data_fit, data_shift @@ -721,14 +748,15 @@ def _assignRegimeAnomaly( flag: float = BAD, **kwargs, ) -> Tuple[DictOfSeries, Flags]: + """TODO: Docstring.""" series = data[cluster_field] cluster = np.unique(series) - cluster_dios = DictOfSeries({str(i): data[field][series == i] for i in cluster}) - plateaus = detectDeviants(cluster_dios, metric, spread, frac, method, "samples") + cluster_frame = DictOfSeries({str(i): data[field][series == i] for i in cluster}) + plateaus = detectDeviants(cluster_frame, metric, spread, frac, method, "samples") if set_flags: - for p, cols in zip(plateaus, cluster_dios.columns[plateaus]): - flags[cluster_dios[cols].index, field] = flag + for p, cols in zip(plateaus, cluster_frame.columns[plateaus]): + flags[cluster_frame[cols].index, field] = flag if set_cluster: for p in plateaus: diff --git a/saqc/funcs/flagtools.py 
b/saqc/funcs/flagtools.py index ca1c3a85a..99dbd59b1 100644 --- a/saqc/funcs/flagtools.py +++ b/saqc/funcs/flagtools.py @@ -17,6 +17,7 @@ from typing_extensions import Literal from saqc import BAD, FILTER_ALL, UNFLAGGED from saqc.core import DictOfSeries, flagging, register +from saqc.lib.checking import validateChoice, validateWindow from saqc.lib.tools import isflagged, toSequence if TYPE_CHECKING: @@ -51,10 +52,10 @@ class FlagtoolsMixin: Notes ----- - This function ignores the ``dfilter`` keyword, because the data is not relevant - for processing. - A warning is triggered if the ``flag`` keyword is given, because the flags are - always set to `UNFLAGGED`. + This function ignores the ``dfilter`` keyword, because the data + is not relevant for processing. + A warning is triggered if the ``flag`` keyword is given, because + the flags are always set to `UNFLAGGED`. See Also -------- @@ -81,7 +82,8 @@ class FlagtoolsMixin: Notes ----- - This function ignores the ``dfilter`` keyword, because the data is not relevant for processing. + This function ignores the ``dfilter`` keyword, because the + data is not relevant for processing. """ unflagged = self._flags[field].isna() | (self._flags[field] == UNFLAGGED) self._flags[unflagged, field] = flag @@ -103,43 +105,55 @@ class FlagtoolsMixin: """ Flag data by given, "manually generated" data. - The data is flagged at locations where `mdata` is equal to a provided flag (`mflag`). - The format of mdata can be an indexed object, like pd.Series, pd.Dataframe or dios.DictOfSeries, - but also can be a plain list- or array-like. - How indexed mdata is aligned to data is specified via the `method` parameter. + The data is flagged at locations where `mdata` is equal to a provided + flag (`mflag`). The format of mdata can be an indexed object, + like pd.Series, pd.DataFrame or dios.DictOfSeries, but it can also + be a plain list- or array-like. How indexed mdata is aligned to + data is specified via the `method` parameter. Parameters ---------- mdata : - The Data determining, wich intervals are to be flagged, or a string, denoting under which field the data is + The data determining which intervals are to be flagged, or a + string denoting under which field the data is + accessible. method : - Defines how mdata is projected on data. Except for the 'plain' method, the methods assume mdata to have an - index. + Defines how mdata is projected on data. Except for the 'plain' + method, the methods assume mdata to have an index. - * 'plain': mdata must have the same length as data and is projected one-to-one on data. - * 'ontime': works only with indexed mdata. mdata entries are matched with data entries that have the same index. - * 'right-open': mdata defines intervals, values are to be projected on. - The intervals are defined, + * 'plain': mdata must have the same length as data and is + projected one-to-one on data. + * 'ontime': works only with indexed mdata. mdata entries are + matched with data entries that have the same index. + * 'right-open': mdata defines intervals, values are to be + projected on. The intervals are defined, - (1) Either, by any two consecutive timestamps t_1 and 1_2 where t_1 is valued with mflag, or by a series, - (2) Or, a Series, where the index contains in the t1 timestamps nd the values the respective t2 stamps. + (1) either, by any two consecutive timestamps t_1 and t_2, + where t_1 is valued with mflag, + (2) or, by a Series, where the index contains the t_1 timestamps + and the values the respective t_2 stamps.
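The doctests below only exercise the `'mflag'` format; a sketch of the `'start-end'` format just described (the series name `"temp"` is made up): each `mdata` entry marks one interval to flag, with the index as left bound and the value as right bound, and an interval method is required:

```python
# Hypothetical 'start-end' usage of flagManual; `data` is assumed to
# hold a datetime-indexed series named "temp".
import pandas as pd
import saqc

mdata = pd.Series(
    pd.to_datetime(["2000-02-01", "2000-04-01"]),        # right bounds
    index=pd.to_datetime(["2000-01-15", "2000-03-15"]),  # left bounds
)
qc = saqc.SaQC(data)
qc = qc.flagManual("temp", mdata, mformat="start-end", method="closed")
```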
- The value at t_1 gets projected onto all data timestamps t with t_1 <= t < t_2. + The value at t_1 gets projected onto all data timestamps t, + with t_1 <= t < t_2. - * 'left-open': like 'right-open', but the projected interval now covers all t with t_1 < t <= t_2. - * 'closed': like 'right-open', but the projected interval now covers all t with t_1 <= t <= t_2. + * 'left-open': like 'right-open', but the projected interval + now covers all t with t_1 < t <= t_2. + * 'closed': like 'right-open', but the projected interval + now covers all t with t_1 <= t <= t_2. mformat : - * "start-end": mdata is a Series, where every entry indicates an interval to-flag. The index defines the left - bound, the value defines the right bound. - * "mflag": mdata is an array like, with entries containing 'mflag',where flags shall be set. See documentation - for examples. + * "start-end": mdata is a Series, where every entry indicates + an interval to-flag. The index defines the left bound, + the value defines the right bound. + * "mflag": mdata is an array like, with entries containing + 'mflag',where flags shall be set. See documentation for + examples. mflag : - The flag that indicates data points in `mdata`, of wich the projection in data should be flagged. + The flag that indicates data points in `mdata`, of wich the + projection in data should be flagged. Examples -------- @@ -155,15 +169,15 @@ class FlagtoolsMixin: 2000-05-01 1 dtype: int64 - On *dayly* data, with the 'ontime' method, only the provided timestamps are used. - Bear in mind that only exact timestamps apply, any offset will result in ignoring - the timestamp. + On *dayly* data, with the 'ontime' method, only the provided timestamps + are used. Bear in mind that only exact timestamps apply, any offset + will result in ignoring the timestamp. .. doctest:: ExampleFlagManual >>> data = pd.Series(0, index=pd.to_datetime(['2000-01-31', '2000-02-01', '2000-02-02', '2000-03-01', '2000-05-01']), name='daily_data') >>> qc = saqc.SaQC(data) - >>> qc = qc.flagManual('daily_data', mdata, mflag=1, mformat='mdata', method='ontime') + >>> qc = qc.flagManual('daily_data', mdata, mflag=1, mformat='mflag', method='ontime') >>> qc.flags['daily_data'] > UNFLAGGED 2000-01-31 False 2000-02-01 True @@ -176,7 +190,7 @@ class FlagtoolsMixin: .. doctest:: ExampleFlagManual - >>> qc = qc.flagManual('daily_data', mdata, mflag=1, mformat='mdata', method='right-open') + >>> qc = qc.flagManual('daily_data', mdata, mflag=1, mformat='mflag', method='right-open') >>> qc.flags['daily_data'] > UNFLAGGED 2000-01-31 False 2000-02-01 True @@ -189,7 +203,7 @@ class FlagtoolsMixin: .. 
doctest:: ExampleFlagManual - >>> qc = qc.flagManual('daily_data', mdata, mflag=1, mformat='mdata', method='left-open') + >>> qc = qc.flagManual('daily_data', mdata, mflag=1, mformat='mflag', method='left-open') >>> qc.flags['daily_data'] > UNFLAGGED 2000-01-31 False 2000-02-01 True @@ -198,6 +212,11 @@ class FlagtoolsMixin: 2000-05-01 True dtype: bool """ + validateChoice( + method, "method", ["left-open", "right-open", "closed", "plain", "ontime"] + ) + validateChoice(mformat, "mformat", ["start-end", "mflag"]) + dat = self._data[field] # internal not-mflag-value -> cant go for np.nan not_mflag = -1 if mflag == 0 else 0 @@ -218,7 +237,8 @@ class FlagtoolsMixin: if mformat == "start-end": if method in ["plain", "ontime"]: raise ValueError( - "'Start-End' formatting not compatible to 'plain' or 'ontime' methods" + "'start-end'-format is not compatible " + "with methods 'plain' or 'ontime'" ) else: mdata = pd.Series( @@ -227,7 +247,8 @@ class FlagtoolsMixin: ) mdata[::2] = mflag - # get rid of values that are neither mflag nor not_mflag (for bw-compatibillity mainly) + # get rid of values that are neither mflag + # nor not_mflag (for bw-compatibility mainly) mdata[mdata != mflag] = not_mflag # evaluate methods @@ -236,7 +257,6 @@ class FlagtoolsMixin: # reindex will do the job later elif method == "ontime": pass - elif method in ["left-open", "right-open", "closed"]: mdata = mdata.drop(mdata.index[mdata.diff() == 0]) app_entry = pd.Series(mdata[-1], dat.index.shift(freq="1min")[-1:]) @@ -316,8 +336,8 @@ class FlagtoolsMixin: >>> qc = qc.transferFlags('a', 'b') - To project the flags of `a` to both the variables `b` and `c` in one call, align the field and target variables in - 2 lists: + To project the flags of `a` to both the variables `b` and `c` + in one call, align the field and target variables in 2 lists: .. doctest:: exampleTransfer @@ -330,10 +350,9 @@ class FlagtoolsMixin: import warnings warnings.warn( - f"""The method 'transferFlags' is deprecated and - will be removed in version 2.5 of SaQC. Please use - 'SaQC.concatFlags(field={field}, target={target}, method="match", squeeze=False)' - instead""", + f"The method 'transferFlags' is deprecated and will be removed " + f"in version 2.5 of SaQC. Please use `SaQC.concatFlags(field={field}, " + f"target={target}, method='match', squeeze=False)` instead", DeprecationWarning, ) return self.concatFlags(field, target=target, method="match", squeeze=False) @@ -354,12 +373,13 @@ class FlagtoolsMixin: Parameters ---------- window : - Size of the repetition window. An integer defines the exact number of repetitions, - strings are interpreted as time offsets to fill with . + Size of the repetition window. An integer defines the exact + number of repetitions, strings are interpreted as time offsets + to fill with. method : - Direction of repetetion. With "ffill" the subsequent values receive the flag to - repeat, with "bfill" the previous values. + Direction of repetetion. With "ffill" the subsequent values + receive the flag to repeat, with "bfill" the previous values. Examples -------- @@ -381,7 +401,8 @@ class FlagtoolsMixin: 6 -inf dtype: float64 - Now, to repeat the flag '255.0' two times in direction of ascending indices, execute: + Now, to repeat the flag '255.0' two times in direction of ascending + indices, execute: .. 
doctest:: propagateFlags @@ -409,7 +430,8 @@ class FlagtoolsMixin: 6 -inf dtype: float64 - If an explicit flag is passed, it will be used to fill the repetition window + If an explicit flag is passed, it will be used to fill the + repetition window .. doctest:: propagateFlags @@ -423,9 +445,8 @@ class FlagtoolsMixin: 6 -inf dtype: float64 """ - - if method not in {"bfill", "ffill"}: - raise ValueError(f"supported methods are 'bfill', 'ffill', got '{method}'") + validateWindow(window) + validateChoice(method, "method", ["bfill", "ffill"]) # get the last history column hc = self._flags.history[field].hist.iloc[:, -1].astype(float) @@ -467,7 +488,7 @@ class FlagtoolsMixin: **kwargs, ) -> "SaQC": """ - Flag all values, if all of the given ``field`` values are already flagged. + Flag all values, if all the given ``field`` values are already flagged. Parameters ---------- diff --git a/saqc/funcs/interpolation.py b/saqc/funcs/interpolation.py index 6682ea592..e842ddeca 100644 --- a/saqc/funcs/interpolation.py +++ b/saqc/funcs/interpolation.py @@ -17,6 +17,14 @@ from typing_extensions import Literal from saqc import UNFLAGGED from saqc.core import register from saqc.core.history import History +from saqc.lib.checking import ( + isValidChoice, + validateCallable, + validateChoice, + validateMinPeriods, + validateValueBounds, + validateWindow, +) from saqc.lib.tools import isflagged from saqc.lib.ts_operators import interpolateNANs, shift2Freq @@ -64,7 +72,7 @@ class InterpolationMixin: def interpolateByRolling( self: "SaQC", field: str, - window: Union[str, int], + window: str | int, func: Callable[[pd.Series], float] = np.median, center: bool = True, min_periods: int = 0, @@ -72,25 +80,33 @@ class InterpolationMixin: **kwargs, ) -> "SaQC": """ - Interpolates nan-values in the data by assigning them the aggregation result of the window surrounding them. + Replace NaN by the aggregation result of the surrounding window. Parameters ---------- window : - The size of the window, the aggregation is computed from. An integer define the number of periods to be used, - an string is interpreted as an offset. ( see `pandas.rolling` for more information). - Integer windows may result in screwed aggregations if called on none-harmonized or irregular data. + The size of the window, the aggregation is computed from. + An integer define the number of periods to be used, a string + is interpreted as an offset. ( see `pandas.rolling` for more + information). Integer windows may result in screwed aggregations + if called on none-harmonized or irregular data. func : default median The function used for aggregation. center : - Center the window around the value. Can only be used with integer windows, otherwise it is silently ignored. + Center the window around the value. Can only be used with + integer windows, otherwise it is silently ignored. min_periods : - Minimum number of valid (not np.nan) values that have to be available in a window for its aggregation to be + Minimum number of valid (not np.nan) values that have to be + available in a window for its aggregation to be computed. 
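What `interpolateByRolling` boils down to, as a standalone pandas sketch (the body below builds the same roller and writes the aggregation into the NaN positions only):

```python
# Fill each NaN with the rolling aggregation of its surrounding window.
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0, np.nan, 5.0])
filled = s.fillna(s.rolling(window=3, center=True, min_periods=1).median())
print(filled)  # NaNs replaced by the window median: [1, 2, 3, 4, 5]
```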
""" + validateWindow(window) + validateCallable(func, "func") + validateMinPeriods(min_periods) + datcol = self._data[field] roller = datcol.rolling(window=window, center=center, min_periods=min_periods) try: @@ -109,7 +125,6 @@ class InterpolationMixin: flagcol = pd.Series(np.nan, index=self._flags[field].index) flagcol.loc[interpolated] = np.nan if flag is None else flag - # todo kwargs must have all passed args except data,field,flags meta = { "func": "interpolateByRolling", "args": (field,), @@ -165,16 +180,19 @@ class InterpolationMixin: * ‘from_derivatives’: Refers to scipy.interpolate.BPoly.from_derivatives order : - Order of the interpolation method, ignored if not supported by the chosen ``method`` + Order of the interpolation method, ignored if not supported + by the chosen ``method`` limit : - Maximum number of missing values to interpolate. Only gaps smaller than ``limit`` will be filled. - The gap size can be given as a number of values (integer) or a temporal extensions (offset string). - With ``None``, all missing values will be interpolated. + Maximum number of missing values to interpolate. Only gaps + smaller than ``limit`` will be filled. The gap size can be + given as a number of values (integer) or a temporal extensions + (offset string). With ``None``, all missing values will be + interpolated. extrapolate : - Use parameter to perform extrapolation instead of interpolation onto the trailing and/or leading chunks of - NaN values in data series. + Use parameter to perform extrapolation instead of interpolation + onto the trailing and/or leading chunks of NaN values in data series. * 'None' (default) - perform interpolation * 'forward'/'backward' - perform forward/backward extrapolation @@ -205,7 +223,8 @@ class InterpolationMixin: 2000-01-01 11:00:00 NaN 2000-01-01 12:00:00 NaN - Use :py:meth:`~saqc.SaQC.interpolate` to do linear interpolation of up to 2 consecutive missing values: + Use :py:meth:`~saqc.SaQC.interpolate` to do linear interpolation + of up to 2 consecutive missing values: .. doctest:: interpolate @@ -230,7 +249,8 @@ class InterpolationMixin: <BLANKLINE> - Use :py:meth:`~saqc.SaQC.interpolate` to do linear extrapolaiton of up to 1 consecutive missing values: + Use :py:meth:`~saqc.SaQC.interpolate` to do linear extrapolaiton + of up to 1 consecutive missing values: .. doctest:: interpolate @@ -254,15 +274,21 @@ class InterpolationMixin: 2000-01-01 12:00:00 NaN | <BLANKLINE> """ + if limit is not None: + validateWindow(limit, "limit") + + validateValueBounds(order, "order", left=0, strict_int=True) + validateChoice( + extrapolate, "extrapolate", ["forward", "backward", "both", None] + ) if "freq" in kwargs: # the old interpolate version warnings.warn( - f""" - The method `intepolate` is deprecated and will be removed in version 3.0 of saqc. - To achieve the same behaviour please use: - `qc.align(field={field}, freq={kwargs["freq"]}, method={method}, order={order}, flag={flag})` - """, + f"The method `intepolate` is deprecated and will be removed " + f"in version 3.0 of saqc. 
To achieve the same behaviour " + f"please use: `qc.align(field={field}, freq={kwargs['freq']}, " + f"method={method}, order={order}, flag={flag})`", DeprecationWarning, ) return self.align( @@ -291,7 +317,6 @@ class InterpolationMixin: self._flags.history[field].append( new_col, {"func": "interpolateInvalid", "args": (), "kwargs": kwargs} ) - return self @register(mask=["field"], demask=[], squeeze=[]) @@ -306,8 +331,8 @@ class InterpolationMixin: **kwargs, ) -> "SaQC": """ - Convert time series to specified frequency. Values affected by frequency - changes will be inteprolated using the given method. + Convert time series to specified frequency. Values affected by + frequency changes will be inteprolated using the given method. Parameters ---------- @@ -317,25 +342,36 @@ class InterpolationMixin: method : Interpolation technique to use. One of: - * ``'nshift'``: shift grid points to the nearest time stamp in the range = +/- 0.5 * ``freq`` - * ``'bshift'``: shift grid points to the first succeeding time stamp (if any) - * ``'fshift'``: shift grid points to the last preceeding time stamp (if any) - * ``'linear'``: Ignore the index and treat the values as equally spaced. - * ``'time'``, ``'index'``, 'values': Use the actual numerical values of the index. + * ``'nshift'``: shift grid points to the nearest time stamp + in the range = +/- 0.5 * ``freq`` + * ``'bshift'``: shift grid points to the first succeeding + time stamp (if any) + * ``'fshift'``: shift grid points to the last preceeding time + stamp (if any) + * ``'linear'``: Ignore the index and treat the values as equally + spaced. + * ``'time'``, ``'index'``, 'values': Use the actual numerical + values of the index. * ``'pad'``: Fill in NaNs using existing values. - * ``'nearest'``, ``'zero'``, ``'slinear'``, ``'quadratic'``, ``'cubic'``, ``'spline'``, ``'barycentric'``, ``'polynomial'``: - Passed to ``scipy.interpolate.interp1d``. These methods use the numerical values of the index. Both ``'polynomial'`` and - ``'spline'`` require that you also specify an ``order``, e.g. ``qc.interpolate(method='polynomial', order=5)``. + * ``'spline'``, ``'polynomial'``: + Passed to ``scipy.interpolate.interp1d``. These methods + use the numerical values of the index. An ``order`` must be + specified, e.g. ``qc.interpolate(method='polynomial', order=5)``. + * ``'nearest'``, ``'zero'``, ``'slinear'``, ``'quadratic'``, ``'cubic'``, ``'barycentric'``: + Passed to ``scipy.interpolate.interp1d``. These methods use + the numerical values of the index. * ``'krogh'``, ``'spline'``, ``'pchip'``, ``'akima'``, ``'cubicspline'``: - Wrappers around the SciPy interpolation methods of similar names. + Wrappers around the SciPy interpolation methods of similar + names. * ``'from_derivatives'``: Refers to ``scipy.interpolate.BPoly.from_derivatives`` order : - Order of the interpolation method, ignored if not supported by the chosen ``method`` + Order of the interpolation method, ignored if not supported + by the chosen ``method`` extrapolate : - Use parameter to perform extrapolation instead of interpolation onto the trailing and/or leading chunks of - NaN values in data series. + Use parameter to perform extrapolation instead of interpolation + onto the trailing and/or leading chunks of NaN values in data series. 
* ``None`` (default) - perform interpolation * ``'forward'``/``'backward'`` - perform forward/backward extrapolation @@ -348,6 +384,12 @@ class InterpolationMixin: # TODO: # - should we keep `extrapolate` + validateWindow(freq, "freq", allow_int=False) + validateValueBounds(order, "order", left=0, strict_int=True) + validateChoice( + extrapolate, "extrapolate", ["forward", "backward", "both", None] + ) + if self._data[field].empty: return self @@ -377,16 +419,15 @@ class InterpolationMixin: **kwargs, }, } - flagcol = pd.Series(UNFLAGGED if overwrite else np.nan, index=history.index) history.append(flagcol, meta) - self._data[field] = datacol self._flags.history[field] = history - return self + # ============================================================ ### Deprecated functions + # ============================================================ @register(mask=["field"], demask=[], squeeze=[]) def interpolateIndex( @@ -400,10 +441,11 @@ class InterpolationMixin: **kwargs, ) -> "SaQC": """ - Function to interpolate the data at regular (äquidistant) timestamps (or Grid points). + Function to interpolate the data at regular (equidistant) + timestamps also known as or grid points. - .. deprecated:: 2.4.0 - Use :py:meth:`~saqc.SaQC.align` instead. + .. deprecated:: 2.4.0 + Use :py:meth:`~saqc.SaQC.align` instead. Parameters ---------- @@ -415,33 +457,38 @@ class InterpolationMixin: The interpolation method you want to apply. order : - If your selected interpolation method can be performed at different 'orders' - here you pass the desired - order. + If your selected interpolation method can be performed at + different 'orders' - here you pass the desired order. limit : - Upper limit of missing index values (with respect to ``freq``) to fill. The limit can either be expressed - as the number of consecutive missing values (integer) or temporal extension of the gaps to be filled - (Offset String). - If ``None`` is passed, no limit is set. + Upper limit of missing index values (with respect to ``freq``) + to fill. The limit can either be expressed as the number of + consecutive missing values (integer) or temporal extension + of the gaps to be filled (Offset String). If ``None`` is passed, + no limit is set. - extraplate : - Use parameter to perform extrapolation instead of interpolation onto the trailing and/or leading chunks of - NaN values in data series. + extrapolate : + Use parameter to perform extrapolation instead of interpolation + onto the trailing and/or leading chunks of NaN values in data + series. * ``None`` (default) - perform interpolation * ``'forward'``/``'backward'`` - perform forward/backward extrapolation * ``'both'`` - perform forward and backward extrapolation """ - - msg = """ - The method `interpolateIndex` is deprecated and will be removed in verion 3.0 of saqc. 
- To achieve the same behavior use: - """ - call = "qc.align(field={field}, freq={freq}, method={method}, order={order}, extrapolate={extrapolate})" + call = ( + f"qc.align(field={field}, freq={freq}, method={method}, " + f"order={order}, extrapolate={extrapolate})" + ) if limit != 2: - call = f"{call}.interpolate(field={field}, method={method}, order={order}, limit={limit}, extrapolate={extrapolate})" - + call = ( + f"{call}.interpolate(field={field}, method={method}, " + f"order={order}, limit={limit}, extrapolate={extrapolate})" + ) warnings.warn(f"{msg}`{call}`", DeprecationWarning) + + # HINT: checking is delegated to called functions + out = self.align( field=field, freq=freq, @@ -482,16 +529,14 @@ class InterpolationMixin: Use :py:meth:`~saqc.SaQC.interpolate` instead. """ warnings.warn( - f""" - The method `intepolateInvalid` is deprecated and will be removed - with version 3.0 of saqc. To achieve the same behavior, please use - `qc.interpolate( - field={field}, method={method}, order={order}, - limit={limit}, extrapolate={extrapolate}, flag={flag} - )` - """ + "The method `intepolateInvalid` is deprecated and will be removed " + "with version 3.0 of saqc. To achieve the same behavior, please " + f"use `qc.interpolate(field={field}, method={method}, order={order}, " + f"limit={limit}, extrapolate={extrapolate}, flag={flag})`", + DeprecationWarning, ) + # HINT: checking is delegated to called function return self.interpolate( field=field, method=method, @@ -526,19 +571,14 @@ def _shift( * 'nshift' : shift grid points to the nearest time stamp in the range = +/- 0.5 * ``freq`` * 'bshift' : shift grid points to the first succeeding time stamp (if any) - * 'fshift' : shift grid points to the last preceeding time stamp (if any) - - freq_check : - * ``None`` : do not validate the ``freq`` string. - * 'check' : check ``freq`` against an frequency estimation, produces a warning in case of miss matches. - * 'auto' : estimate frequency, `freq` is ignored. + * 'fshift' : shift grid points to the last preceding time stamp (if any) Returns ------- saqc.SaQC """ - # TODO - # - Do we need `freq_check`? If so could we move it to `align`? + validateChoice(method, "method", ["fshift", "bshift", "nshift"]) + validateWindow(freq, "freq", allow_int=False) datcol = saqc._data[field] if datcol.empty: @@ -567,8 +607,15 @@ def _interpolate( method: str, order: int | None, dfilter: float, - extrapolate: Literal["forward", "backward", "both"] | None = None, + extrapolate: Literal["forward", "backward", "both", None] = None, ) -> Tuple[pd.Series, History]: + """TODO: Docstring""" + + validateChoice(extrapolate, "extrapolate", ["forward", "backward", "both", None]) + validateWindow(freq, "freq", allow_int=False) + if order is not None: + validateValueBounds(order, "order", 0, strict_int=True) + datcol = saqc._data[field].copy() start, end = datcol.index[0].floor(freq), datcol.index[-1].ceil(freq) diff --git a/saqc/funcs/noise.py b/saqc/funcs/noise.py index 751e5b068..0137d4a27 100644 --- a/saqc/funcs/noise.py +++ b/saqc/funcs/noise.py @@ -15,6 +15,7 @@ import pandas as pd from saqc.constants import BAD from saqc.core.register import flagging +from saqc.lib.checking import validateCallable, validateMinPeriods, validateWindow from saqc.lib.tools import isunflagged, statPass if TYPE_CHECKING: @@ -64,26 +65,22 @@ class NoiseMixin: Minimum number of values needed in a chunk to perfom the test. Ignored if ``window`` is an integer. 
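Review note on the `interpolateIndex` hunk above: the assignment to `msg` is removed, but `warnings.warn(f"{msg}`{call}`", ...)` still interpolates it, which would raise a `NameError`. A sketch that inlines the removed text (`call` as built in the hunk):

```python
# `msg` was deleted above but is still referenced; inline it instead.
warnings.warn(
    f"The method `interpolateIndex` is deprecated and will be removed "
    f"in version 3.0 of saqc. To achieve the same behavior use: `{call}`",
    DeprecationWarning,
)
```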
""" - - datcol = self._data[field] - if not min_periods: - min_periods = 0 - if not sub_thresh: - sub_thresh = thresh - window = pd.Timedelta(window) - + validateCallable(func, "func") + validateWindow(window, allow_int=False) + validateMinPeriods(min_periods) if sub_window is not None: + validateWindow(sub_window, "sub_window", allow_int=False) sub_window = pd.Timedelta(sub_window) to_set = statPass( - datcol, - func, - window, - thresh, - operator.gt, - sub_window, - sub_thresh, - min_periods, + datcol=self._data[field], + stat=func, + winsz=pd.Timedelta(window), + thresh=thresh, + comparator=operator.gt, + sub_winsz=sub_window, + sub_thresh=sub_thresh or thresh, + min_periods=min_periods or 0, ) mask = isunflagged(self._flags[field], kwargs["dfilter"]) & to_set self._flags[mask, field] = flag diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index 78ad2486b..3be87afc9 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -10,7 +10,7 @@ from __future__ import annotations import uuid import warnings -from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple +from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Tuple import numpy as np import numpy.polynomial.polynomial as poly @@ -21,7 +21,17 @@ from typing_extensions import Literal from saqc import BAD, UNFLAGGED from saqc.core import DictOfSeries, Flags, flagging, register -from saqc.funcs.scores import _univarScoring +from saqc.lib.checking import ( + isCallable, + isFloatLike, + validateCallable, + validateChoice, + validateFraction, + validateFrequency, + validateMinPeriods, + validateValueBounds, + validateWindow, +) from saqc.lib.docs import DOC_TEMPLATES from saqc.lib.rolling import windowRoller from saqc.lib.tools import getFreqDelta, isflagged, toSequence @@ -31,6 +41,19 @@ if TYPE_CHECKING: class OutliersMixin: + @staticmethod + def _validateLOF(algorithm, n, p, density): + """validate parameter for LOF and UniLOF""" + validateValueBounds(n, "n", left=0, strict_int=True) + validateValueBounds(p, "p", left=0, strict_int=True) + validateChoice( + algorithm, "algorithm", ["ball_tree", "kd_tree", "brute", "auto"] + ) + if density != "auto" and not isFloatLike(density) and not isCallable(density): + raise ValueError( + f"'density' must be 'auto' or a float or a function, not {density}" + ) + @register( mask=["field"], demask=["field"], @@ -55,53 +78,79 @@ class OutliersMixin: Parameters ---------- n : - Number of neighbors to be included into the LOF calculation. Defaults to ``20``, which is a + Number of neighbors to be included into the LOF calculation. + Defaults to ``20``, which is a value found to be suitable in the literature. - * :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors) - and sets the upper limit to the number of values in outlier clusters (i.e. consecutive outliers). Outlier - clusters of size greater than :py:attr:`n`/2 may not be detected reliably. - * The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small - or singleton outliers points. Higher values greatly increase numerical costs. + * :py:attr:`n` determines the "locality" of an observation + (its :py:attr:`n` nearest neighbors) and sets the upper + limit to the number of values in outlier clusters (i.e. + consecutive outliers). Outlier clusters of size greater + than :py:attr:`n`/2 may not be detected reliably. 
+ * The larger :py:attr:`n`, the lesser the algorithm's sensitivity + to local outliers and small or singleton outliers points. + Higher values greatly increase numerical costs. thresh : - The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and - most likely corresponds to inlier points. + The threshold for flagging the calculated LOF. A LOF of around + ``1`` is considered normal and most likely corresponds to + inlier points. - * The "automatic" threshing introduced with the publication of the algorithm defaults to ``1.5``. - * In this implementation, :py:attr:`thresh` defaults (``'auto'``) to flagging the scores with a - modified 3-sigma rule, resulting in a :py:attr:`thresh` `` > 1.5`` which usually mitigates - overflagging compared to the literature recommendation. + * The "automatic" threshing introduced with the publication + of the algorithm defaults to ``1.5``. + * In this implementation, :py:attr:`thresh` defaults (``'auto'``) + to flagging the scores with a modified 3-sigma rule, resulting + in a :py:attr:`thresh` `` > 1.5`` which usually mitigates + over-flagging compared to the literature recommendation. algorithm : Algorithm used for calculating the :py:attr:`n`-nearest neighbors. p : - Degree of the metric ("Minkowski"), according to which the distance to neighbors is determined. - Most important values are: + Degree of the metric ("Minkowski"), according to which the + distance to neighbors is determined. Most important values are: - * ``1`` - Manhatten Metric + * ``1`` - Manhattan Metric * ``2`` - Euclidian Metric + density : + How to calculate the temporal distance/density for the variable to flag. + + * ``'auto'`` - introduces linear density with an increment + equal to the median of the absolute diff of the variable to flag. + * ``float`` - introduces linear density with an increment + equal to :py:attr:`density` + * Callable - calculates the density by applying the function + passed onto the variable to flag (passed as Series). + Notes ----- - * The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local Outlier Factor (LOF) for every point - in the input timeseries. The *LOF* is a scalar value, that roughly correlates to the *reachability*, - or "outlierishnes" of the evaluated datapoint. If a point is as reachable, as all its - :py:attr:`n`-nearest neighbors, the *LOF* score evaluates to around ``1``. If it is only as half as - reachable as all its ``n``-nearest neighbors are (so to say, as double as "outlierish"), the score - is about ``2``. So, the Local Outlier *Factor* relates a point's *reachability* to the *reachability* - of its :py:attr:`n`-nearest neighbors in a multiplicative fashion (as a "factor"). - * The *reachability* of a point thereby is determined as an aggregation of the points distances to its - :py:attr:`n`-nearest neighbors, measured with regard to the minkowski metric of degree :py:attr:`p` + * The :py:meth:`~saqc.SaQC.flagLOF` function calculates the Local + Outlier Factor (LOF) for every point in the input timeseries. + The *LOF* is a scalar value, that roughly correlates to the + *reachability*, or "outlierishnes" of the evaluated datapoint. + If a point is as reachable, as all its :py:attr:`n`-nearest + neighbors, the *LOF* score evaluates to around ``1``. If it + is only as half as reachable as all its ``n``-nearest neighbors + are (so to say, as double as "outlierish"), the score is about + ``2``. 
So, the Local Outlier *Factor* relates a point's *reachability* + to the *reachability* of its :py:attr:`n`-nearest neighbors + in a multiplicative fashion (as a "factor"). + * The *reachability* of a point thereby is determined as an aggregation + of the points distances to its :py:attr:`n`-nearest neighbors, + measured with regard to the minkowski metric of degree :py:attr:`p` (usually euclidean). - * To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut off at a level, - determined by :py:attr:`thresh`. + * To derive a binary label for every point (outlier: *yes*, or *no*), + the scores are cut off at a level, determined by :py:attr:`thresh`. """ + self._validateLOF(algorithm, n, p, density) + if thresh != "auto" and not isFloatLike(thresh): + raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}") + fields = toSequence(field) field_ = str(uuid.uuid4()) - self = self.assignLOF( + qc = self.assignLOF( field=fields, target=field_, n=n, @@ -109,7 +158,7 @@ class OutliersMixin: p=p, density=density, ) - s = self.data[field_] + s = qc.data[field_] if thresh == "auto": s = pd.concat([s, (-s - 2)]) s_mask = (s - s.mean() / s.std())[: len(s) // 2].abs() > 3 @@ -117,10 +166,10 @@ class OutliersMixin: s_mask = s < abs(thresh) for f in fields: - mask = ~isflagged(self._flags[f], kwargs["dfilter"]) & s_mask - self._flags[mask, f] = flag + mask = ~isflagged(qc._flags[f], kwargs["dfilter"]) & s_mask + qc._flags[mask, f] = flag - return self.dropField(field_) + return qc.dropField(field_) @flagging() def flagUniLOF( @@ -131,95 +180,120 @@ class OutliersMixin: algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree", p: int = 1, density: Literal["auto"] | float | Callable = "auto", - fill_na: str = "linear", + fill_na: bool = True, flag: float = BAD, **kwargs, ) -> "SaQC": """ Flag "univariate" Local Outlier Factor (LOF) exceeding cutoff. - The function is a wrapper around a usual LOF implementation, aiming for an easy to use, - parameter minimal outlier detection function for single variables, that does not necessitate - prior modelling of the variable. LOF is applied onto a concatenation of the `field` variable - and a "temporal density", or "penalty" variable, that measures temporal distance between data - points. See notes Section for a more exhaustive explaination. - - See the Notes section for more details on the algorithm. + The function is a wrapper around a usual LOF implementation, aiming + for an easy to use, parameter minimal outlier detection function + for single variables, that does not necessitate prior modelling + of the variable. LOF is applied onto a concatenation of the `field` + variable and a "temporal density", or "penalty" variable, that + measures temporal distance between data points. See notes Section + for a more exhaustive explaination. See the Notes section for + more details on the algorithm. Parameters ---------- n : - Number of periods to be included into the LOF calculation. Defaults to `20`, which is a - value found to be suitable in the literature. - - * :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors) - and sets the upper limit to the number of values in an outlier clusters (i.e. consecutive outliers). Outlier - clusters of size greater than :py:attr:`n`/2 may not be detected reliably. - * The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small - or singleton outlier points. Higher values greatly increase numerical costs. 
+            Number of periods to be included into the LOF calculation.
+            Defaults to `20`, which is a value found to be suitable in
+            the literature.
+
+            * :py:attr:`n` determines the "locality" of an observation
+              (its :py:attr:`n` nearest neighbors) and sets the upper
+              limit to the number of values in an outlier cluster (i.e.
+              consecutive outliers). Outlier clusters of size greater
+              than :py:attr:`n`/2 may not be detected reliably.
+            * The larger :py:attr:`n`, the lower the algorithm's sensitivity
+              to local outliers and small or singleton outlier points.
+              Higher values greatly increase numerical costs.

        thresh :
-            The threshold for flagging the calculated LOF. A LOF of around ``1`` is considered normal and
-            most likely corresponds to inlier points. This parameter is considered the main calibration
+            The threshold for flagging the calculated LOF. A LOF of around
+            ``1`` is considered normal and most likely corresponds to
+            inlier points. This parameter is considered the main calibration
            parameter of the algorithm.

-            * The threshing defaults to ``1.5``, wich is the default value found to be suitable in the literature.
-            * ``'auto'`` enables flagging the scores with a modified 3-sigma rule,
-              resulting in a thresh around ``4``, which usually greatly mitigates overflagging compared to the
-              literature recommendation, but often is too high.
-            * sensitive range for the parameter may be ``[1,15]``, assuming default settings for the other parameters.
+            * The thresholding defaults to ``1.5``, which is the value
+              found to be suitable in the literature.
+            * ``'auto'`` enables flagging the scores with a modified 3-sigma
+              rule, resulting in a thresh around ``4``, which usually
+              greatly mitigates overflagging compared to the literature
+              recommendation, but is often too high.
+            * A sensitive range for the parameter may be ``[1,15]``, assuming
+              default settings for the other parameters.

        algorithm :
-            Algorithm used for calculating the :py:attr:`n`-nearest neighbors needed for LOF calculation.
+            Algorithm used for calculating the :py:attr:`n`-nearest neighbors
+            needed for LOF calculation.
+
        p :
-            Degree of the metric ("Minkowski"), according to which distance to neighbors is determined.
-            Most important values are:
+            Degree of the metric ("Minkowski"), according to which distance
+            to neighbors is determined. Most important values are:

            * ``1`` - Manhatten Metric
            * ``2`` - Euclidian Metric
+
        density :
-            How to calculate the temporal distance/density for the variable to flag.
+            How to calculate the temporal distance/density for the variable
+            to flag.

-            * ``'auto'`` - introduces linear density with an increment equal to the median of the absolute
-              diff of the variable to flag.
-            * ``float`` - introduces linear density with an increment equal to :py:attr:`density`
-            * Callable - calculates the density by applying the function passed onto the variable to flag
-              (passed as Series).
+            * ``'auto'`` - introduces linear density with an increment
+              equal to the median of the absolute diff of the variable to flag.
+            * ``float`` - introduces linear density with an increment
+              equal to :py:attr:`density`
+            * Callable - calculates the density by applying the function
+              passed onto the variable to flag (passed as Series).

        fill_na :
-            Weather or not to fill NaN values in the data with a linear interpolation.
+            If True, NaNs in the data are filled with a linear interpolation.
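+
+        A rough sketch of the preprocessing the defaults imply, assuming
+        a data series ``vals`` (the names are illustrative only, not the
+        actual implementation)::
+
+            vals = vals.interpolate('linear')  # fill_na=True: close NaN gaps first
+            inc = vals.diff().abs().median()   # density='auto': time-axis increment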
        See Also
        --------
-        :ref:`introduction to outlier detection with saqc <cookbooks/OutlierDetection:Outlier Detection>`
+        :ref:`introduction to outlier detection with
+        saqc <cookbooks/OutlierDetection:Outlier Detection>`

        Notes
        -----
-        * The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates an univariate
-          Local Outlier Factor (UniLOF) - score for every point in the one dimensional input
-          data series.
-          The *UniLOF* score of any data point is a scalar value, that roughly correlates to
-          its *reachability*, or "outlierishnes" in the 2-dimensional space constituted by the
-          data-values and the time axis. So the Algorithm basically operates on the "graph",
-          or the "plot" of the input timeseries.
+
+        * The :py:meth:`~saqc.SaQC.flagUniLOF` function calculates a
+          univariate Local Outlier Factor (UniLOF) score for every
+          point in the one dimensional input data series. The *UniLOF*
+          score of any data point is a scalar value that roughly correlates
+          to its *reachability*, or "outlierishness", in the 2-dimensional
+          space constituted by the data-values and the time axis. So
+          the algorithm basically operates on the "graph", or the "plot",
+          of the input timeseries.
+
        * If a point in this "graph" is as reachable, as all its :py:attr:`n`-nearest
-          neighbors, its *UniLOF* score evaluates to around ``1``. If it is only as half as
-          reachable as all its :py:attr:`n` neighbors are
-          (so to say, as double as "outlierish"), its score evaluates to ``2`` roughly.
-          So, the Univariate Local Outlier *Factor* relates a points *reachability* to the
-          *reachability* of its :py:attr:`n`-nearest neighbors in a multiplicative fashion
+          neighbors, its *UniLOF* score evaluates to around ``1``. If
+          it is only half as reachable as its :py:attr:`n` neighbors
+          (twice as "outlierish", so to speak), its score evaluates
+          to ``2`` roughly. So, the Univariate Local Outlier *Factor*
+          relates a point's *reachability* to the *reachability* of its
+          :py:attr:`n`-nearest neighbors in a multiplicative fashion
          (as a "factor").
-        * The *reachability* of a point thereby is derived as an aggregation of the points
-          distance to its :py:attr:`n`-nearest neighbors, measured with regard to the minkowski
-          metric of degree :py:attr:`p` (usually euclidean).
-        * The parameter :py:attr:`density` thereby determines how dimensionality of the time is
-          removed, to make it a dimensionless, real valued coordinate.
-        * To derive a binary label for every point (outlier: *yes*, or *no*), the scores are cut
-          off at a level, determined by :py:attr:`thresh`.
+
+        * The *reachability* of a point thereby is derived as an aggregation
+          of the point's distance to its :py:attr:`n`-nearest neighbors,
+          measured with regard to the Minkowski metric of degree :py:attr:`p`
+          (usually euclidean).
+
+        * The parameter :py:attr:`density` thereby determines how the
+          dimensionality of time is removed, to make it a dimensionless,
+          real-valued coordinate.
+
+        * To derive a binary label for every point (outlier: *yes*, or
+          *no*), the scores are cut off at a level determined by :py:attr:`thresh`.

        Examples
        --------
-        See the :ref:`outlier detection cookbook <cookbooks/OutlierDetection:Outlier Detection>` for a detailed
+        See the :ref:`outlier detection cookbook
+        <cookbooks/OutlierDetection:Outlier Detection>` for a detailed
        introduction into the usage and tuning of the function.

        ..
plot::

@@ -236,8 +310,10 @@ class OutliersMixin:

        Example usage with default parameter configuration:

-        Loading data via pandas csv file parser, casting index to DateTime, generating a :py:class:`~saqc.SaQC`
-        instance from the data and plotting the variable representing light scattering at 254 nanometers wavelength.
+        Loading data via pandas csv file parser, casting index to DateTime,
+        generating a :py:class:`~saqc.SaQC` instance from the data and
+        plotting the variable representing light scattering at 254 nanometers
+        wavelength.

        .. doctest:: flagUniLOFExample

           >>> import saqc
           >>> data = pd.read_csv('./resources/data/hydro_data.csv')
           >>> data = data.set_index('Timestamp')
           >>> data.index = pd.DatetimeIndex(data.index)
           >>> qc = saqc.SaQC(data)

           qc.plot('sac254_raw')

-        We apply :py:meth:`~saqc.SaqC.flagUniLOF` in with default parameter values. Meaning, that the main
-        calibration paramters :py:attr:`n` and :py:attr:`thresh` evaluate to `20` and `1.5` respectively.
+        We apply :py:meth:`~saqc.SaQC.flagUniLOF` with default parameter
+        values, meaning that the main calibration parameters :py:attr:`n`
+        and :py:attr:`thresh` evaluate to `20` and `1.5` respectively.

        .. doctest:: flagUniLOFExample

           >>> qc = qc.flagUniLOF('sac254_raw')

           qc.plot('sac254_raw')
        """
-        field_ = str(uuid.uuid4())
-        self = self.assignUniLOF(
+        self._validateLOF(algorithm, n, p, density)
+        if thresh != "auto" and not isFloatLike(thresh):
+            raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
+
+        tmp_field = str(uuid.uuid4())
+        qc = self.assignUniLOF(
            field=field,
-            target=field_,
+            target=tmp_field,
            n=n,
            algorithm=algorithm,
            p=p,
            density=density,
            fill_na=fill_na,
        )
-        s = self.data[field_]
+        s = qc.data[tmp_field]
        if thresh == "auto":
            _s = pd.concat([s, (-s - 2)])
            s_mask = ((_s - _s.mean()) / _s.std()).iloc[: int(s.shape[0])].abs() > 3
        else:
            s_mask = s < -abs(thresh)

-        s_mask = ~isflagged(self._flags[field], kwargs["dfilter"]) & s_mask
-        self._flags[s_mask, field] = flag
-        self = self.dropField(field_)
-        return self
+        s_mask = ~isflagged(qc._flags[field], kwargs["dfilter"]) & s_mask
+        qc._flags[s_mask, field] = flag
+        qc = qc.dropField(tmp_field)
+        return qc

    @flagging()
    def flagRange(
        self: "SaQC",
@@ -305,7 +386,8 @@
        **kwargs,
    ) -> "SaQC":
        """
-        Function flags values exceeding the closed interval [:py:attr:`min`, :py:attr:`max`].
+        Function flags values exceeding the closed
+        interval [:py:attr:`min`, :py:attr:`max`].

        Parameters
        ----------
@@ -314,7 +396,6 @@
        max :
            Upper bound for valid data.
        """
-        # using .values is much faster
        datacol = self._data[field].to_numpy()
        mask = (datacol < min) | (datacol > max)
@@ -341,28 +422,28 @@
        ----------

        window :
-            Determines the segmentation of the data into partitions, the kNN algorithm is
-            applied onto individually.
+            Determines the segmentation of the data into partitions that
+            the kNN algorithm is applied to individually.

            * ``None``: Apply Scoring on whole data set at once
-            * ``int``: Apply scoring on successive data chunks of periods with the given length.
-              Must be greater than 0.
-            * Offset String : Apply scoring on successive partitions of temporal extension
-              matching the passed offset string
+            * ``int``: Apply scoring on successive data chunks of periods
+              with the given length. Must be greater than 0.
+            * Offset string: Apply scoring on successive partitions of
+              temporal extension matching the passed offset string

        min_periods :
-            Minimum number of periods per partition that have to be present for a valid
-            outlier detection to be made in this partition (only of effect, if :py:attr:`freq`
-            is an integer).
+            Minimum number of periods per partition that have to be present
+            for a valid outlier detection to be made in this partition.

        iter_start :
-            Float in ``[0, 1]`` that determines which percentage of data is considered
-            "normal". ``0.5`` results in the stray algorithm to search only the upper 50% of
-            the scores for the cut off point. (See reference section for more information)
+            Float in ``[0, 1]`` that determines which percentage of data
+            is considered "normal". ``0.5`` makes the stray algorithm
+            search only the upper 50% of the scores for the cut-off
+            point. (See reference section for more information)

        alpha :
-            Level of significance by which it is tested, if a score might be drawn from
-            another distribution than the majority of the data.
+            Level of significance by which it is tested whether a score
+            might be drawn from a distribution other than that of the
+            majority of the data.

        References
        ----------
@@ -373,36 +454,36 @@
        """
        scores = self._data[field].dropna()

+        if window is None:
+            window = len(scores)
+        if not isinstance(window, int):
+            validateFrequency(window, "window")
+
+        validateMinPeriods(min_periods)
+        validateValueBounds(iter_start, "iter_start", left=0, right=1, closed="both")
+
        if scores.empty:
            return self

-        if not window:
-            window = len(scores)
-
-        if isinstance(window, str):
+        if isinstance(window, int):
+            s = pd.Series(data=np.arange(0, len(scores)), index=scores.index)
+            s = s.transform(lambda x: int(np.floor(x / window)))
+            partitions = scores.groupby(s)
+        else:  # pd.Timedelta pd.DateOffset or str
            partitions = scores.groupby(pd.Grouper(freq=window))
-        else:
-            grouper_series = pd.Series(
-                data=np.arange(0, len(scores)), index=scores.index
-            )
-            grouper_series = grouper_series.transform(
-                lambda x: int(np.floor(x / window))
-            )
-            partitions = scores.groupby(grouper_series)

-        # calculate flags for every window
        for _, partition in partitions:
-            if partition.empty | (len(partition) < min_periods):
-                continue
-
            sample_size = len(partition)
+            if partition.empty or sample_size < min_periods:
+                continue
+
            sorted_i = partition.values.argsort()
            resids = partition.values[sorted_i]
            gaps = np.append(0, np.diff(resids))

-            tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
+            tail_size = int(max(min(np.floor(sample_size / 4), 50), 2))
            tail_indices = np.arange(2, tail_size + 1)

            i_start = int(max(np.floor(sample_size * iter_start), 1) + 1)
@@ -448,18 +529,19 @@
        **kwargs,
    ) -> "SaQC":
        """
-        The algorithm implements a 3-step outlier detection procedure for simultaneously
-        flagging of higher dimensional data (dimensions > 3).
+        The algorithm implements a 3-step outlier detection procedure for
+        simultaneous flagging of higher-dimensional data (dimensions > 3).

-        In [1], the procedure is introduced and exemplified with an application on hydrological
-        data. See the notes section for an overview over the algorithms basic steps.
+        In [1], the procedure is introduced and exemplified with an application on
+        hydrological data. See the notes section for an overview of the algorithm's
+        basic steps.

        Parameters
        ----------
-        trafo : default identity
-            Transformation to be applied onto every column before scoring. For more fine-grained
-            control, the data could also be transformed before :py:meth:`~saqc.SaQC.flagMVScores`
-            is called.
+        trafo :
+            Transformation to be applied onto every column before scoring. For more
+            fine-grained control, the data could also be transformed before
+            :py:meth:`~saqc.SaQC.flagMVScores` is called.
        alpha :
            Level of significance by which it is tested, if an observations score might
@@ -468,48 +550,52 @@

        n :
            Number of neighbors included in the scoring process for every datapoint.

-        func : default sum
-            Function that aggregates a value's k-smallest distances, returning a scalar score.
+        func :
+            Function that aggregates a value's k-smallest distances, returning a
+            scalar score.

        iter_start :
            Value in ``[0,1]`` that determines which percentage of data is considered
-            "normal". 0.5 results in the threshing algorithm to search only the upper 50%
-            of the scores for the cut off point. (See reference section for more
+            "normal". 0.5 makes the threshing algorithm search only the upper
+            50% of the scores for the cut-off point. (See reference section for more
            information)

        window :
-            Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines the
-            size of the data partitions, the data is decomposed into. Each partition is checked
-            seperately for outliers.
-            Either given as an Offset String, denoting the windows temporal extension or
-            as an integer, denoting the windows number of periods. ``NaN`` also count as periods.
-            If ``None``, all data points share the same scoring window, which than equals the whole
-            data.
+            Only effective if :py:attr:`threshing` is set to ``'stray'``. Determines
+            the size of the data partitions the data is decomposed into. Each
+            partition is checked separately for outliers. Either given as an Offset
+            String, denoting the window's temporal extension, or as an integer,
+            denoting the window's number of periods. ``NaN`` values also count as
+            periods. If ``None``, all data points share the same scoring window,
+            which then equals the whole data.

        min_periods :
-            Only effective if :py:attr:`threshing` is set to ``'stray'`` and :py:attr:`partition` is an integer.
-            Minimum number of periods per :py:attr:`partition` that have to be present for a valid outlier
+            Only effective if :py:attr:`threshing` is set to ``'stray'`` and
+            :py:attr:`partition` is an integer. Minimum number of periods per
+            :py:attr:`partition` that have to be present for a valid outlier
            detection to be made in this partition.

        stray_range :
-            If not ``None``, it is tried to reduce the stray result onto single outlier components
-            of the input :py:attr:`field`. The offset string denotes the range of the
-            temporal surrounding to include into the MAD testing while trying to reduce
-            flags.
+            If not ``None``, an attempt is made to reduce the stray result onto
+            single outlier components of the input :py:attr:`field`. The offset
+            string denotes the range of the temporal surrounding to include into
+            the MAD testing while trying to reduce flags.

        drop_flagged :
-            Only effective when :py:attr:`stray_range` is not ``None``. Whether or not to drop flagged
-            values from the temporal surroundings.
+            Only effective when :py:attr:`stray_range` is not ``None``. Whether or
+            not to drop flagged values from the temporal surroundings.

        thresh :
-            Only effective when :py:attr:`stray_range` is not ``None``. The 'critical' value,
-            controlling wheather the MAD score is considered referring to an outlier or
-            not. Higher values result in less rigid flagging. The default value is widely
-            considered apropriate in the literature.
+            Only effective when :py:attr:`stray_range` is not ``None``. The
+            'critical' value, controlling whether the MAD score is considered
+            referring to an outlier or not. Higher values result in less rigid
+            flagging.
The default value is widely considered appropriate in the
+            literature.

        min_periods_r :
-            Only effective when :py:attr:`stray_range` is not ``None``. Minimum number of measurements
-            necessary in an interval to actually perform the reduction step.
+            Only effective when :py:attr:`stray_range` is not ``None``. Minimum
+            number of measurements necessary in an interval to actually perform the
+            reduction step.

        Notes
        -----
@@ -521,33 +607,33 @@
        (a) make them comparable and (b) make outliers more stand out.

-        This step is usually subject to a phase of research/try and error. See [1] for more
-        details.
+        This step is usually subject to a phase of research/trial and error. See [1]
+        for more details.

-        Note, that the data transformation as an built-in step of the algorithm, will likely
-        get deprecated in the future. Its better to transform the data in a processing
-        step, preceeding the multivariate flagging process. Also, by doing so, one gets
-        mutch more control and variety in the transformation applied, since the `trafo`
-        parameter only allows for application of the same transformation to all of the
-        variables involved.
+        Note that the data transformation, as a built-in step of the algorithm,
+        will likely get deprecated in the future. It's better to transform the data in
+        a processing step preceding the multivariate flagging process. Also,
+        by doing so, one gets much more control and variety in the transformation
+        applied, since the `trafo` parameter only allows for application of the same
+        transformation to all the variables involved.

        2. scoring

-        Every observation gets assigned a score depending on its k nearest neighbors. See
-        the `scoring_method` parameter description for details on the different scoring
-        methods. Furthermore [1] may give some insight in the pro and cons of the
-        different methods.
+        Every observation gets assigned a score depending on its k nearest neighbors.
+        See the `scoring_method` parameter description for details on the different
+        scoring methods. Furthermore, [1] may give some insight into the pros and
+        cons of the different methods.

        3. threshing

-        The gaps between the (greatest) scores are tested for beeing drawn from the same
-        distribution as the majority of the scores. If a gap is encountered, that,
-        with sufficient significance, can be said to not be drawn from the same
-        distribution as the one all the smaller gaps are drawn from, than the observation
-        belonging to this gap, and all the observations belonging to gaps larger then
-        this gap, get flagged outliers. See description of the `threshing` parameter for
-        more details. Although [1] gives a fully detailed overview over the `stray`
-        algorithm.
+        The gaps between the (greatest) scores are tested for being drawn from the
+        same distribution as the majority of the scores. If a gap is encountered
+        that, with sufficient significance, can be said to not be drawn from the same
+        distribution as the one all the smaller gaps are drawn from, then the
+        observation belonging to this gap, and all the observations belonging to gaps
+        larger than this gap, get flagged as outliers. See the description of the
+        `threshing` parameter for more details. Also, [1] gives a fully detailed
+        overview of the `stray` algorithm.
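+
+        The three steps above translate into a single call. A rough usage
+        sketch, with invented field names and tuning values::
+
+            import numpy as np
+
+            qc = qc.flagMVScores(
+                field=['level', 'temperature', 'conductivity'],
+                trafo=np.log,    # 1. transformation
+                n=10,            # 2. scoring over the 10 nearest neighbors
+                iter_start=0.5,  # 3. threshing on the upper 50% of scores
+                window='3D',
+            )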
        References
        ----------
@@ -555,7 +641,6 @@
        Anomaly Detection in High-Dimensional Data, Journal of Computational and Graphical
        Statistics, 30:2, 360-374, DOI: 10.1080/10618600.2020.1807997
-
        """

        # parameter deprecations
@@ -563,8 +648,8 @@
        if "partition" in kwargs:
            warnings.warn(
                """
-                The parameter `partition` is deprecated and will be removed in version 3.0 of saqc.
-                Please us the parameter `window` instead.'
+                The parameter `partition` is deprecated and will be removed in version
+                3.0 of saqc. Please use the parameter `window` instead.
                """,
                DeprecationWarning,
            )
@@ -573,8 +658,8 @@
        if "partition_min" in kwargs:
            warnings.warn(
                """
-                The parameter `partition_min` is deprecated and will be removed in version 3.0 of saqc.
-                Please us the parameter `min_periods` instead.'
+                The parameter `partition_min` is deprecated and will be removed in
+                version 3.0 of saqc. Please use the parameter `min_periods` instead.
                """,
                DeprecationWarning,
            )
@@ -583,27 +668,32 @@
        if min_periods != 11:
            warnings.warn(
                """
-                You were setting a customary value for the `min_periods` parameter: note that this parameter
-                does no longer refer to the reduction interval length, but now controls the number of periods
-                having to be present in an interval of size `window` (deprecated:`partition`) for the algorithm to be
-                performed in that interval.
-                To alter the size of the reduction window, use the parameter `min_periods_r`. Changes readily apply.
-                Warning will be removed in saqc version 3.0.
+                You were setting a custom value for the `min_periods` parameter:
+                note that this parameter no longer refers to the reduction interval
+                length, but now controls the number of periods that have to be present
+                in an interval of size `window` (deprecated: `partition`) for the
+                algorithm to be performed in that interval.
+                To alter the size of the reduction window, use the parameter
+                `min_periods_r`. Changes readily apply.
+                This warning will be removed in saqc version 3.0.
                """,
                DeprecationWarning,
            )

+        # Hint: checking is delegated to the called functions
+
        fields = toSequence(field)
+        qc = self
        fields_ = []
        for f in fields:
            field_ = str(uuid.uuid4())
-            self = self.copyField(field=f, target=field_)
-            self = self.transform(field=field_, func=trafo, freq=window)
+            qc = qc.copyField(field=f, target=field_)
+            qc = qc.transform(field=field_, func=trafo, freq=window)
            fields_.append(field_)

        knn_field = str(uuid.uuid4())
-        self = self.assignKNNScore(
+        qc = qc.assignKNNScore(
            field=fields_,
            target=knn_field,
            n=n,
@@ -614,9 +704,9 @@
            **kwargs,
        )
        for field_ in fields_:
-            self = self.dropField(field_)
+            qc = qc.dropField(field_)

-        self = self.flagByStray(
+        qc = qc.flagByStray(
            field=knn_field,
            freq=window,
            min_periods=min_periods,
@@ -626,11 +716,11 @@
            **kwargs,
        )

-        self._data, self._flags = _evalStrayLabels(
-            data=self._data,
+        qc._data, qc._flags = _evalStrayLabels(
+            data=qc._data,
            field=knn_field,
            target=fields,
-            flags=self._flags,
+            flags=qc._flags,
            reduction_range=stray_range,
            reduction_drop_flagged=drop_flagged,
            reduction_thresh=thresh,
@@ -638,7 +728,7 @@
            flag=flag,
            **kwargs,
        )
-        return self.dropField(knn_field)
+        return qc.dropField(knn_field)

    @flagging()
    def flagRaise(
        self: "SaQC",
@@ -655,17 +745,17 @@
        **kwargs,
    ) -> "SaQC":
        """
-        The function flags raises and drops in value courses, that exceed a certain threshold
-        within a certain timespan.
+        The function flags raises and drops in value courses that exceed a certain
+        threshold within a certain timespan.

-        The parameter variety of the function is owned to the intriguing case of values, that
-        "return" from outlierish or anomalious value levels and thus exceed the threshold,
-        while actually being usual values.
+        The parameter variety of the function is owed to the intriguing case of
+        values that "return" from outlierish or anomalous value levels and thus
+        exceed the threshold, while actually being usual values.

        Notes
        -----
-        The dataset is NOT supposed to be harmonized to a time series with an equidistant
-        requency grid.
+        The dataset is NOT supposed to be harmonized to a time series with an
+        equidistant frequency grid.

        The value :math:`x_{k}` of a time series :math:`x` with associated timestamps
        :math:`t_i`, is flagged a raise, if:

        1. There is any value :math:`x_{s}`, preceding :math:`x_{k}` within
        :py:attr:`raise_window` range, so that :math:`M = |x_k - x_s | >` :py:attr:`thresh` :math:`> 0`

-        2. The weighted average :math:`\\mu^{*}` of the values, preceding :math:`x_{k}`
-           within :py:attr:`average_window` range indicates, that :math:`x_{k}` does not
-           return from an "outlierish" value course, meaning that
+        2. The weighted average :math:`\\mu^{*}` of the values, preceding
+           :math:`x_{k}` within :py:attr:`average_window` range, indicates
+           that :math:`x_{k}` does not return from an "outlierish" value
+           course, meaning that

           :math:`x_k > \\mu^* + ( M` / :py:attr:`raise_factor` :math:`)`

-        3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}` is checked for being
-           sufficiently divergent from its very predecessor :math:`x_{k-1}`, meaning that, it
-           is additionally checked if:
+        3. Additionally, if :py:attr:`slope` is not ``None``, :math:`x_{k}`
+           is checked for being sufficiently divergent from its very predecessor
+           :math:`x_{k-1}`, meaning that it is additionally checked if:

           * :math:`x_k - x_{k-1} >` :py:attr:`slope`

           * :math:`t_k - t_{k-1} >` :py:attr:`weight` :math:`\\times` :py:attr:`freq`

        Parameters
        ----------
        thresh :
-            The threshold, for the total rise (:py:attr:`thresh` ``> 0``), or total drop
-            (:py:attr:`thresh` ``< 0``), value courses must not exceed within a timespan
-            of length :py:attr:`raise_window`.
+            The threshold for the total rise (:py:attr:`thresh` ``> 0``),
+            or total drop (:py:attr:`thresh` ``< 0``), that value courses
+            must not exceed within a timespan of length :py:attr:`raise_window`.

        raise_window :
-            An offset string, determining the timespan, the rise/drop thresholding refers
-            to. Window is inclusively defined.
+            An offset string determining the timespan the rise/drop
+            thresholding refers to. Window is inclusively defined.

        freq :
-            An offset string, determining the frequency, the timeseries to flag is supposed
-            to be sampled at. The window is inclusively defined.
+            An offset string determining the frequency the timeseries
+            to flag is supposed to be sampled at. The window is inclusively
+            defined.

        average_window :
-            See condition (2) of the description given in the Notes. Window is
-            inclusively defined, defaults to 1.5 times the size of :py:attr:`raise_window`.
+            See condition (2) of the description given in the Notes. Window
+            is inclusively defined, defaults to 1.5 times the size of
+            :py:attr:`raise_window`.

        raise_factor :
            See condition (2).

        slope :
            See condition (3).

        weight :
            See condition (3).
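+
+        A rough usage sketch tying the conditions together (field name and
+        values invented for illustration): on a series sampled at roughly
+        ``'30s'``, a rise of more than ``10`` units within ``'5min'`` is
+        flagged, unless the weighted average of the preceding ``7.5min``
+        (``average_window`` defaulting to 1.5 times :py:attr:`raise_window`)
+        indicates the value merely returns from an outlierish level::
+
+            qc = qc.flagRaise('level', thresh=10, raise_window='5min', freq='30s')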
""" + validateWindow(raise_window, "raise_window", allow_int=False) + validateWindow(freq, "freq", allow_int=False) + validateWindow(average_window, "average_window", allow_int=False, optional=True) + # prepare input args dataseries = self._data[field].dropna() raise_window_td = pd.Timedelta(raise_window) @@ -834,15 +931,15 @@ class OutliersMixin: ---------- [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm """ - msg = """ - The method `flagMAD` is deprecated and will be removed in verion 3.0 of saqc. - To achieve the same behavior use: - """ - call = f"qc.flagZScore(field={field}, window={window}, method='modified', thresh={z}, min_residuals={min_residuals}, min_periods={min_periods}, center={center})" - - warnings.warn(f"{msg}`{call}`", DeprecationWarning) - - self = self.flagZScore( + warnings.warn( + f"The method `flagMAD` is deprecated and will be removed in " + "version 3.0 of saqc. To achieve the same behavior use:" + f"`qc.flagZScore(field={field}, window={window}, method='modified', " + f"thresh={z}, min_residuals={min_residuals}, min_periods={min_periods}, " + f"center={center})`", + DeprecationWarning, + ) + return self.flagZScore( field, window=window, thresh=z, @@ -856,8 +953,6 @@ class OutliersMixin: flag=flag, ) - return self - @flagging() def flagOffset( self: "SaQC", @@ -1004,10 +1099,12 @@ class OutliersMixin: >>> qc = qc.flagOffset("data", thresh=2, thresh_relative=-.5, tolerance=1.5, window='6H') >>> qc.plot('data') # doctest: +SKIP """ - if (thresh is None) and (thresh_relative is None): + validateWindow(window) + if thresh is None and thresh_relative is None: raise ValueError( - "At least one of parameters 'thresh' and 'thresh_relative' has to be given. Got 'thresh'=None, " - "'thresh_relative'=None instead." + "At least one of parameters 'thresh' and 'thresh_relative' " + "has to be given. Got 'thresh'=None, 'thresh_relative'=None " + "instead." ) if thresh is None: thresh = 0 @@ -1110,6 +1207,10 @@ class OutliersMixin: [1] https://en.wikipedia.org/wiki/Grubbs%27s_test_for_outliers """ + validateWindow(window) + validateFraction(alpha, "alpha") + validateMinPeriods(min_periods, optional=False) + datcol = self._data[field].copy() rate = getFreqDelta(datcol.index) @@ -1215,25 +1316,29 @@ class OutliersMixin: ---------- [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm """ - msg = """ - The method `flagCrossStatistics` is deprecated and will be removed in verion 3.0 of saqc. - To achieve the same behavior use: - """ new_method_string = { "modZscore": "modified", "Zscore": "standard", np.mean: "standard", np.median: "modified", } - call = f"qc.flagZScore(field={field}, window=1, method={new_method_string[method]}, thresh={thresh}, axis=1)" - - warnings.warn(f"{msg}`{call}`", DeprecationWarning) + call = ( + f"qc.flagZScore(field={field}, window=1, " + f"method={new_method_string[method]}, " + f"thresh={thresh}, axis=1)" + ) + warnings.warn( + f"The method `flagCrossStatistics` is deprecated and will " + f"be removed in verion 3.0 of saqc. 
To achieve the same behavior "
+            f"use: `{call}`",
+            DeprecationWarning,
+        )

        return self.flagZScore(
-            field={field},
+            field=field,
            window=1,
-            method={new_method_string[method]},
-            thresh={thresh},
+            method=new_method_string[method],
+            thresh=thresh,
            axis=1,
            flag=flag,
        )
@@ -1347,21 +1452,22 @@ class OutliersMixin:
                method = "modified"
            else:
                raise ValueError(
-                    "Support for scoring with functions not similar to either Zscore or modified Zscore is "
-                    "not supported anymore"
+                    "Scoring with functions not similar to either "
+                    "Zscore or modified Zscore is no longer "
+                    "supported"
                )

-        dat = self._data[field].to_pandas(how="outer")
+        validateChoice(method, "method", ["standard", "modified"])
+        validateWindow(window, optional=True)
+        validateMinPeriods(min_periods)

-        if min_residuals is None:
-            min_residuals = 0
+        min_residuals = min_residuals or 0
+        min_periods = min_periods or 0

+        dat = self._data[field].to_pandas(how="outer")
        if dat.empty:
            return self

-        if min_periods is None:
-            min_periods = 0
-
        if window is None:
            if dat.notna().sum().sum() >= min_periods:
                if method == "standard":
@@ -1382,6 +1488,7 @@
                )
            else:
                return self
+
        else:  # window is not None
            if axis == 0:
                if method == "standard":
@@ -1437,7 +1544,7 @@ def _evalStrayLabels(
    field: str,
    flags: Flags,
    target: Sequence[str],
-    reduction_range: Optional[str] = None,
+    reduction_range: str | None = None,
    reduction_drop_flagged: bool = False,  # TODO: still a case ?
    reduction_thresh: float = 3.5,
    reduction_min_periods: int = 1,
diff --git a/saqc/funcs/pattern.py b/saqc/funcs/pattern.py
index b9869c8a1..fe910aa35 100644
--- a/saqc/funcs/pattern.py
+++ b/saqc/funcs/pattern.py
@@ -1,16 +1,12 @@
 #! /usr/bin/env python
-
 # SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
-#
 # SPDX-License-Identifier: GPL-3.0-or-later
-
 # -*- coding: utf-8 -*-
 from __future__ import annotations

 from typing import TYPE_CHECKING

 import dtw
-import numpy as np
 import pandas as pd

 from saqc import BAD
@@ -22,7 +18,7 @@ if TYPE_CHECKING:


 def calculateDistanceByDTW(
-    data: pd.Series, reference: pd.Series, forward=True, normalize=True
+    data: pd.Series, reference: pd.Series, forward: bool = True, normalize: bool = True
 ):
    """
    Calculate the DTW-distance of data to pattern in a rolling calculation.
@@ -35,19 +31,19 @@
    Parameters
    ----------
-    data : pd.Series
+    data :
        Data series. Must have datetime-like index, and must be regularly sampled.

-    reference : : pd.Series
+    reference :
        Reference series. Must have datetime-like index, must not contain NaNs
        and must not be empty.

-    forward: bool, default True
+    forward:
        If `True`, the distance value is set on the left edge of the data chunk. This
        means, with a perfect match, `0.0` marks the beginning of the pattern in the
        data. If `False`, `0.0` would mark the end of the pattern.

-    normalize : bool, default True
+    normalize :
        If `False`, return unmodified distances.
        If `True`, normalize distances by the number of observations in the reference.
        This helps to make it easier to find a good cutoff threshold for further
@@ -97,12 +93,12 @@ class PatternMixin:
    @flagging()
    def flagPatternByDTW(
        self: "SaQC",
-        field,
-        reference,
-        max_distance=0.0,
-        normalize=True,
-        plot=False,
-        flag=BAD,
+        field: str,
+        reference: str,
+        max_distance: float = 0.0,
+        normalize: bool = True,
+        plot: bool = False,
+        flag: float = BAD,
        **kwargs,
    ) -> "SaQC":
        """
        Pattern Recognition via Dynamic Time Warping.

        The steps are:
        1. work on a moving window

-        2.
for each data chunk extracted from each window, a distance to the given pattern
-           is calculated, by the dynamic time warping algorithm [1]
+        2. for each data chunk extracted from each window, a distance
+           to the given pattern is calculated by the dynamic time warping
+           algorithm [1]

-        3. if the distance is below the threshold, all the data in the window gets flagged
+        3. if the distance is below the threshold, all the data in the
+           window gets flagged

        Parameters
        ----------
        reference :
-            The name in `data` which holds the pattern. The pattern must not have NaNs,
-            have a datetime index and must not be empty.
+            The name in `data` which holds the pattern. The pattern must
+            not have NaNs, have a datetime index and must not be empty.

        max_distance :
-            Maximum dtw-distance between chunk and pattern, if the distance is lower than
-            ``max_distance`` the data gets flagged. With default, ``0.0``, only exact
-            matches are flagged.
+            Maximum dtw-distance between chunk and pattern; if the distance
+            is lower than ``max_distance``, the data gets flagged. With
+            the default, ``0.0``, only exact matches are flagged.

        normalize :
            If `False`, return unmodified distances.
-            If `True`, normalize distances by the number of observations of the reference.
-            This helps to make it easier to find a good cutoff threshold for further
-            processing. The distances then refer to the mean distance per datapoint,
-            expressed in the datas units.
+            If `True`, normalize distances by the number of observations
+            of the reference. This helps to make it easier to find a
+            good cutoff threshold for further processing. The distances
+            then refer to the mean distance per datapoint, expressed
+            in the data's units.

        plot :
-            Show a calibration plot, which can be quite helpful to find the right threshold
-            for `max_distance`. It works best with `normalize=True`. Do not use in automatic
-            setups / pipelines. The plot show three lines:
+            Show a calibration plot, which can be quite helpful to find
+            the right threshold for `max_distance`. It works best with
+            `normalize=True`. Do not use in automatic setups / pipelines.
+            The plot shows three lines:

            - data: the data the function was called on
            - distances: the calculated distances by the algorithm
-            - indicator: have to distinct levels: `0` and the value of `max_distance`.
-              If `max_distance` is `0.0` it defaults to `1`. Everywhere where the
-              indicator is not `0` the data will be flagged.
+            - indicator: has two distinct levels: `0` and the value of
+              `max_distance`. If `max_distance` is `0.0` it defaults to
+              `1`. Wherever the indicator is not `0`, the data will
+              be flagged.

        Notes
        -----
-        The window size of the moving window is set to equal the temporal extension of the
-        reference datas datetime index.
+        The window size of the moving window is set to equal the temporal
+        extension of the reference data's datetime index.
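+
+        A rough usage sketch (variable names invented for illustration):
+        assuming a variable ``'pattern'`` holds one occurrence of the sought
+        shape, one could first calibrate a threshold and then flag::
+
+            # inspect the distance course, pick a cutoff
+            qc = qc.flagPatternByDTW('sensor', reference='pattern',
+                                     normalize=True, plot=True)
+            # flag everything closer to the pattern than the chosen cutoff
+            qc = qc.flagPatternByDTW('sensor', reference='pattern',
+                                     max_distance=1.5, normalize=True)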
        References
        ----------
-        Find a nice description of underlying the Dynamic Time Warping Algorithm here:
+        Find a nice description of the underlying Dynamic Time Warping
+        algorithm here:

        [1] https://cran.r-project.org/web/packages/dtw/dtw.pdf
        """
diff --git a/saqc/funcs/resampling.py b/saqc/funcs/resampling.py
index 3f397133b..817a974a5 100644
--- a/saqc/funcs/resampling.py
+++ b/saqc/funcs/resampling.py
@@ -17,6 +17,7 @@ from typing_extensions import Literal
from saqc.constants import UNFLAGGED
from saqc.core import register
from saqc.core.history import History
+from saqc.lib.checking import validateCallable, validateChoice
from saqc.lib.docs import DOC_TEMPLATES
from saqc.lib.tools import filterKwargs, getFreqDelta, isflagged
from saqc.lib.ts_operators import aggregate2Freq
@@ -45,22 +46,27 @@ class ResamplingMixin:
        **kwargs,
    ) -> "SaQC":
        """
-        A method to "regularize" data by interpolating linearly the data at regular timestamp.
-
-        .. deprecated:: 2.4.0
-           Use :py:meth:`~saqc.SaQC.align` with ``method="linear"`` instead.
-
-        A series of data is considered "regular", if it is sampled regularly (= having uniform sampling rate).
-        Interpolated values will get assigned the worst flag within freq-range.
-        Note, that the data only gets interpolated at those (regular) timestamps, that have a valid (existing and
-        not-na) datapoint preceeding them and one succeeding them within freq range.
-        Regular timestamp that do not suffice this condition get nan assigned AND The associated flag will be of value
-        ``UNFLAGGED``.
+        A method to "regularize" data by linearly interpolating the data
+        at regular timestamps.
+
+        .. deprecated:: 2.4.0
+           Use :py:meth:`~saqc.SaQC.align` with ``method="linear"``
+           instead.
+
+        A series of data is considered "regular" if it is sampled regularly
+        (= having a uniform sampling rate). Interpolated values will get
+        assigned the worst flag within freq-range. Note that the data
+        only gets interpolated at those (regular) timestamps that have
+        a valid (existing and not-na) datapoint preceding them and one
+        succeeding them within freq range. Regular timestamps that do
+        not satisfy this condition get NaN assigned, and the associated
+        flag will be ``UNFLAGGED``.

        Parameters
        ----------
        freq :
-            An offset string. The frequency of the grid you want to interpolate your data at.
+            An offset string. The frequency of the grid you want to interpolate
+            your data at.
        """
        warnings.warn(
            f"""
@@ -70,7 +76,6 @@
            """,
            DeprecationWarning,
        )
-
        reserved = ["method", "order", "limit", "downgrade"]
        kwargs = filterKwargs(kwargs, reserved)
        return self.interpolateIndex(field, freq, "time", **kwargs)
@@ -86,8 +91,8 @@
        """
        Shift data points and flags to a regular frequency grid.

-        .. deprecated:: 2.4.0
-           Use :py:meth:`~saqc.SaQC.align` instead.
+        .. deprecated:: 2.4.0
+           Use :py:meth:`~saqc.SaQC.align` instead.

        Parameters
        ----------
@@ -104,12 +109,11 @@
        warnings.warn(
            f"""
            The method `shift` is deprecated and will be removed with version 2.6 of saqc.
-            To achieve the same behavior please use:
-            `qc.align(field={field}, freq={freq}. method={method})`
+            To achieve the same behavior please use: `qc.align(field={field}, freq={freq},
method={method})` """, DeprecationWarning, ) - + validateChoice(method, "method", ["fshift", "bshift", "nshift"]) return self.align(field=field, freq=freq, method=method, **kwargs) @register(mask=["field"], demask=[], squeeze=[]) @@ -165,11 +169,12 @@ class ResamplingMixin: maxna_group : Same as `maxna` but for consecutive NaNs. """ + validateChoice(method, "method", ["fagg", "bagg", "nagg"]) + validateCallable(func, "func") datcol = self._data[field] - if datcol.empty: - # see for #GL-374 + # see #GL-374 datcol = pd.Series(index=pd.DatetimeIndex([]), dtype=datcol.dtype) datcol = aggregate2Freq( @@ -288,6 +293,21 @@ class ResamplingMixin: overwrite : Overwrite existing flags if ``True`` """ + validateChoice( + method, + "method", + [ + "inverse_fagg", + "inverse_bagg", + "inverse_nagg", + "inverse_fshift", + "inverse_bshift", + "inverse_nshift", + "inverse_interpolation", + "match", + "auto", + ], + ) if target is None: target = field diff --git a/saqc/funcs/residuals.py b/saqc/funcs/residuals.py index 006e61ee2..d83545e3c 100644 --- a/saqc/funcs/residuals.py +++ b/saqc/funcs/residuals.py @@ -71,6 +71,7 @@ class ResidualsMixin: sparse intervals). To automatically set the minimum number of periods to the number of values in an offset defined window size, pass np.nan. """ + # HINT: checking in _fitPolynomial orig = self._data[field] data, _ = _fitPolynomial( data=self._data, @@ -117,6 +118,7 @@ class ResidualsMixin: center : If True, center the rolling window. """ + # HINT: checking in _roll orig = self._data[field].copy() data, _ = _roll( data=self._data, diff --git a/saqc/funcs/rolling.py b/saqc/funcs/rolling.py index e8ee4cb3d..70f786c5d 100644 --- a/saqc/funcs/rolling.py +++ b/saqc/funcs/rolling.py @@ -13,6 +13,7 @@ import numpy as np import pandas as pd from saqc.core import DictOfSeries, Flags, register +from saqc.lib.checking import validateCallable, validateMinPeriods, validateWindow from saqc.lib.tools import getFreqDelta if TYPE_CHECKING: @@ -53,6 +54,7 @@ class RollingMixin: center : If True, center the rolling window. """ + # HINT: checking in _roll self._data, self._flags = _roll( data=self._data, field=field, @@ -109,6 +111,8 @@ class RollingMixin: """, DeprecationWarning, ) + + # HINT: checking in _roll self._data, self._flags = _roll( data=self._data, field=field, @@ -132,6 +136,10 @@ def _roll( center: bool = True, **kwargs, ): + validateWindow(window) + validateMinPeriods(min_periods) + validateCallable(func, "func") + to_fit = data[field].copy() if to_fit.empty: return data, flags diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py index 5110d824f..5aca31c51 100644 --- a/saqc/funcs/scores.py +++ b/saqc/funcs/scores.py @@ -16,6 +16,12 @@ from typing_extensions import Literal from saqc import UNFLAGGED from saqc.core import register +from saqc.lib.checking import ( + validateCallable, + validateChoice, + validateMinPeriods, + validateWindow, +) from saqc.lib.docs import DOC_TEMPLATES from saqc.lib.tools import getApply, toSequence from saqc.lib.ts_operators import kNN @@ -50,7 +56,7 @@ def _groupedScoring( min_periods: int = 2, score_func: Callable = _LOFApply, score_kwargs: Optional[dict] = None, -) -> Tuple[pd.Series, pd.Series, pd.Series]: +) -> pd.Series: score_kwargs = score_kwargs or {} score_index = val_frame.index score_ser = pd.Series(np.nan, index=score_index) @@ -118,6 +124,11 @@ def _univarScoring( min_periods Minimum number of valid meassurements in a scoring window, to consider the resulting score valid. 
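+
+    For orientation, a rough sketch of a typical model/norm pairing (rolling
+    mean as ``model_func``, rolling standard deviation as ``norm_func``),
+    assuming a series ``s`` (simplified, not the actual implementation)::
+
+        m = s.rolling(window, min_periods=min_periods).mean()
+        d = s.rolling(window, min_periods=min_periods).std()
+        score = (s - m) / d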
""" + validateWindow(window, optional=True) + validateCallable(model_func, "model_func") + validateCallable(norm_func, "norm_func") + validateMinPeriods(min_periods, optional=True) + if data.empty: return data, data, data if min_periods is None: @@ -227,6 +238,11 @@ class ScoresMixin: ---------- [1] https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html """ + validateChoice( + algorithm, "algorithm", ["ball_tree", "kd_tree", "brute", "auto"] + ) + validateCallable(func, "func") + if isinstance(target, list): if len(target) > 1: raise ValueError( @@ -366,6 +382,11 @@ class ScoresMixin: * `1` - Manhatten Metric * `2` - Euclidian Metric """ + from saqc.funcs.outliers import OutliersMixin + + validateMinPeriods(min_periods) + OutliersMixin._validateLOF(algorithm, n, p, 1.0) + if isinstance(target, list): if len(target) > 1: raise ValueError( @@ -405,7 +426,7 @@ class ScoresMixin: algorithm: Literal["ball_tree", "kd_tree", "brute", "auto"] = "ball_tree", p: int = 1, density: Literal["auto"] | float | Callable = "auto", - fill_na: str = "linear", + fill_na: bool = True, **kwargs, ) -> "SaQC": """ @@ -449,7 +470,7 @@ class ScoresMixin: (passed as Series). fill_na : - Weather or not to fill NaN values in the data with a linear interpolation. + If True, NaNs in the data are filled with a linear interpolation. Notes ----- @@ -463,10 +484,13 @@ class ScoresMixin: -------- """ + from saqc.funcs.outliers import OutliersMixin + + OutliersMixin._validateLOF(algorithm, n, p, density) vals = self._data[field] - if fill_na is not None: - vals = vals.interpolate(fill_na) + if fill_na: + vals = vals.interpolate("linear") if density == "auto": density = vals.diff().abs().median() diff --git a/saqc/funcs/tools.py b/saqc/funcs/tools.py index 11aac29f0..bce714086 100644 --- a/saqc/funcs/tools.py +++ b/saqc/funcs/tools.py @@ -17,6 +17,7 @@ from typing_extensions import Literal from saqc import FILTER_NONE, UNFLAGGED from saqc.core import processing, register +from saqc.lib.checking import validateChoice from saqc.lib.docs import DOC_TEMPLATES from saqc.lib.plotting import makeFig from saqc.lib.tools import periodicMask @@ -177,6 +178,8 @@ class ToolsMixin: >>> start = "22:00:00" >>> end = "06:00:00" """ + validateChoice(mode, "mode", ["periodic", "selection_field"]) + datcol_idx = self._data[field].index if mode == "periodic": diff --git a/saqc/lib/checking.py b/saqc/lib/checking.py new file mode 100644 index 000000000..e2ac2aa90 --- /dev/null +++ b/saqc/lib/checking.py @@ -0,0 +1,352 @@ +#! /usr/bin/env python +# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ +# SPDX-License-Identifier: GPL-3.0-or-later +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import Any, Collection, Iterable, Literal, TypeVar, get_origin + +import numpy as np +import pandas as pd + +T = TypeVar("T") + +# ==================================================================== +# `isSomething`-Checks: must not raise Exceptions by checking the value (but +# might rise Exceptions on wrong usage) and should return a boolean +# value +# ==================================================================== + + +def isBoolLike(obj: Any, optional: bool = False) -> bool: + """Return True if obj is a boolean or one of the integers 0 or 1. + If optional is True, `None` also is considered a valid boolean. 
+ """ + return ( + isinstance(obj, bool) + or optional + and obj is None + or isinstance(obj, int) + and obj in [0, 1] + ) + + +def isFloatLike(obj: Any) -> bool: + return isinstance(obj, (float, int)) + + +def isIterable(obj: Any) -> bool: + if isinstance(obj, Iterable) or pd.api.types.is_iterator(obj): + return True + try: + iter(obj) + except TypeError: + return False + return True + + +def isScalar(obj: Any, optional: bool = False) -> bool: + return optional and obj is None or np.isscalar(obj) + + +def isCallable(obj: Any, optional: bool = False) -> bool: + return optional and obj is None or callable(obj) + + +def isFixedFrequencyOffset(obj: Any): + """Check if obj is a `pd.DateOffset` and have a fixed frequency. + + Motivation: + pd.Timedelta always considered to have a fixed frequency, but + a date-offset might or might not have a fixed frequency. + Operations like `pd.Series.rolling` need a window with a fixed + frequency, but other operations like `pd.Series.resample`, for + example can handle any frequencies (fixed and non-fixed) + + This function return True if the object is a subclass of + `pd.offsets.BaseOffset` and have a fixed frequency. + """ + return isinstance(obj, pd.offsets.BaseOffset) and not obj.base.is_anchored() + + +def isFrequencyString(obj: Any, fixed_only=False) -> bool: + if not isinstance(obj, str): + return False + try: + offset = pd.tseries.frequencies.to_offset(obj) + if fixed_only: + return isFixedFrequencyOffset(offset) + return True + except ValueError: + return False + + +def isTimedeltaString(obj: Any, allow_NaT: bool = False) -> bool: + if not isinstance(obj, str): + return False + try: + return pd.notna(pd.Timedelta(obj)) or allow_NaT + except (ValueError, TypeError): + return False + + +def isValidFrequency(obj: Any, allow_str: bool = True, fixed_only: bool = False): + return ( + not fixed_only + and isinstance(obj, pd.offsets.BaseOffset) + or fixed_only + and isFixedFrequencyOffset(obj) + or allow_str + and isFrequencyString(obj, fixed_only=fixed_only) + ) + + +def isValidWindow(obj: Any, allow_int: bool = True, allow_str: bool = True) -> bool: + return ( + isinstance(obj, pd.Timedelta) + or isFixedFrequencyOffset(obj) + or allow_int + and isinstance(obj, int) + and isInBounds(obj, 0) + or allow_str + and isTimedeltaString(obj) + ) + + +def isValidChoice(value: T, choices: Collection[T]) -> bool: + """Return if value is in choices. + + Raises + ====== + TypeError: if choices is not a kind of collection. + """ + if not isinstance(choices, Collection): + raise TypeError("'choices' must be some kind of collection") + return value in choices + + +def isInBounds( + val: int | float, + left: int | float = -np.inf, + right: int | float = np.inf, + closed: Literal["left", "right", "both", "neither"] = "left", +): + """ + Check if a value is in a given interval. + + val : + value to check + + left : + The left or lower bound, defaults to `-inf` + + right : + The right or upper bound, defaults to `+inf` + + closed : default "left" + Defines where the interval has closed or open bounds, defaults to `"left"`. 
+ * `"left"`: to include left bound [left, right) + * `"right"`: to include right bound (left, right] + * `"both"`: closed interval [left, right] + * `"neither"`: (default) open interval (left, right) + """ + validateChoice(closed, "closed", ["left", "right", "both", "neither"]) + if closed == "neither": + return left < val < right + if closed == "left": + return left <= val < right + if closed == "right": + return left < val <= right + if closed == "both": + return left <= val <= right + + +# ==================================================================== +# Validation-functions: +# They should raise an Exceptions if conditions are not fulfilled and +# should return None. +# ==================================================================== + + +def validateScalar(value, name: str, optional: bool = False): + if not isScalar(value, optional=optional): + raise ValueError( + f"{name!r} must be a scalar{' or None' if optional else ''}, " + f"not of type {type(value).__qualname__!r}" + ) + + +def validateCallable(func, name: str, optional: bool = False): + if not isCallable(func, optional=optional): + raise TypeError( + f"{name!r} must be a callable{' or None' if optional else ''}, " + f"not of type {type(func).__qualname__!r}" + ) + + +def _isLiteral(obj: Any) -> bool: + # issubclass and isinstance does not work + # for SpecialTypes, like Literal + return get_origin(obj) == Literal + + +def validateChoice(value: T, name: str, choices: Collection[T] | type(Literal)): + from saqc.lib.tools import extractLiteral + + if _isLiteral(choices): + choices = extractLiteral(choices) + if not isValidChoice(value, choices): + raise ValueError(f"{name!r} must be one of {set(choices)}, not {value!r}") + + +def isIntOrInf(obj: int | float) -> bool: + return isinstance(obj, int) or isinstance(obj, float) and np.isinf(obj) + + +def validateValueBounds( + value: int | float, + name: str, + left: int | float = -np.inf, + right: int | float = np.inf, + closed: Literal["left", "right", "both", "neither"] = "left", + strict_int: bool = False, +): + if ( + not pd.api.types.is_number(value) + or not isInBounds(value, left, right, closed) + or strict_int + and not isIntOrInf(value) + ): + ival_str = dict( + left="right-open interval [{}, {})", + right="left-open interval ({}, {}]", + both="closed interval [{}, {}]", + neither="open interval ({}, {})", + ).get(closed, "interval |{}, {}|") + raise ValueError( + f"{name!r} must be an int{'' if strict_int else ' or float'} " + f"in the {ival_str.format(left, right)}, not {value!r}" + ) + + +def validateFraction( + value: int | float, + name: str, + closed: Literal["left", "right", "both", "neither"] = "both", +): + """Raise a ValueError if value is not in the interval |0, 1|""" + return validateValueBounds( + value, name, left=0, right=1, closed=closed, strict_int=False + ) + + +def validateFrequency( + value: str | pd.offsets.BaseOffset | pd.Timedelta, + name: str, + allow_str: bool = True, + fixed_only=False, +): + # we might want to use checking.py in tools.py, so we use a + # late import here, to avoid circular import errors + from saqc.lib.tools import joinExt + + types = ["a Timedelta", "a BaseOffset"] + if allow_str: + types.append("an offset-string") + msg = f"{name!r} must be {joinExt(', ', types, ' or ')}, not {value!r}" + + if not isValidFrequency(value, allow_str=allow_str, fixed_only=fixed_only): + raise ValueError(msg) + + +def validateWindow( + value: int | str | pd.offsets.BaseOffset | pd.Timedelta, + name: str = "window", + allow_int: bool = True, 
+    allow_str: bool = True,
+    optional: bool = False,
+    index: pd.Index | None = None,
+):
+    """
+    Check if a `window` parameter is valid.
+
+    Parameters
+    ----------
+    value :
+        The value of the window to check.
+
+    name :
+        The name of the window variable to use in error messages.
+
+    allow_int :
+        If ``True``, integer windows are considered valid.
+        Default is ``True``.
+
+    allow_str :
+        If ``True``, offset-string windows are considered valid.
+        Default is ``True``.
+
+    optional :
+        If ``True``, allow window to be ``None``.
+
+    index :
+        A pandas Index that is checked to be datetime-like if a
+        datetime-like window is used. If `None` or if an integer window
+        is used, this check is ignored. Default is `None`.
+    """
+    # we might want to use checking.py in tools.py, so we use a
+    # late import here, to avoid circular import errors
+    from saqc.lib.tools import joinExt
+
+    # first ensure we're called correctly
+    if index is not None and not isinstance(index, pd.Index):
+        raise TypeError(
+            f"'index' must be None or of type pd.Index, "
+            f"not of type {type(index).__qualname__!r}"
+        )
+
+    types = ["a Timedelta", "a BaseOffset"]
+    if allow_int:
+        types.append("a positive integer")
+    if allow_str:
+        types.append("an offset-string")
+    if optional:
+        types.append("None")
+    msg = f"{name!r} must be {joinExt(', ', types, ' or ')}, not {value!r}"
+
+    if optional and value is None:
+        return
+
+    if not isValidWindow(value, allow_int=allow_int, allow_str=allow_str):
+        # try to get a bit more detail for the error message
+        if isinstance(value, str) and allow_str:
+            try:
+                if pd.isna(pd.Timedelta(value)):
+                    raise ValueError("Timedelta conversion resulted in 'NaT'")
+            except Exception as e:
+                raise ValueError(
+                    f"{name!r} is not a valid offset-string, because: " + str(e)
+                ) from e
+        raise ValueError(msg)
+
+    if (
+        index is not None
+        and isinstance(value, (str, pd.offsets.BaseOffset, pd.Timedelta))
+        and not pd.api.types.is_datetime64_any_dtype(index)
+    ):
+        raise ValueError(
+            f"Data must have a datetime based index, if a time based {name!r} "
+            f"is used, but data has an index of dtype {index.dtype}."
+            # parenthesized, so only the hint is conditional; without the
+            # parentheses the whole message degrades to "" if allow_int is False
+            + (" Use an integer instead." if allow_int else "")
+        )
+
+
+def validateMinPeriods(
+    value: int | None, name="min_periods", minimum=0, maximum=np.inf, optional=True
+):
+    """Check if `min_periods` is in the right-open interval [minimum, maximum)"""
+    if optional and value is None:
+        return
+    validateValueBounds(value, name=name, left=minimum, right=maximum, strict_int=True)
diff --git a/saqc/lib/exceptions.py b/saqc/lib/exceptions.py
deleted file mode 100644
index 1b8748fd3..000000000
--- a/saqc/lib/exceptions.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!
/usr/bin/env python - -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -# -*- coding: utf-8 -*- -from __future__ import annotations - -from typing import ( - Any, - Callable, - Collection, - List, - Literal, - Sequence, - Tuple, - TypeVar, - Union, -) - -CLOSURE_TO_NOTION = { - None: "interval ({}, {})", - "left": "right-open interval [{}, {})]", - "right": "left-open interval ({}, {}]", - "both": "closed interval [{}, {}]", -} - - -class ParameterOutOfBounds(Exception): - def __init__( - self, - value: int | float, - para_name: str, - bounds: Tuple[str], - closed: Literal["right", "left", "both"] = None, - ): - Exception.__init__(self) - self.value = value - self.para_name = para_name - self.bounds = bounds - self.closed = closed - self.msg = "Parameter '{}' has to be in the {}, but {} was passed." - - def __str__(self): - return self.msg.format( - self.para_name, - CLOSURE_TO_NOTION[self.closed].format(self.bounds[0], self.bounds[1]), - self.value, - ) diff --git a/saqc/lib/rolling.py b/saqc/lib/rolling.py index 6016d29d6..5c4ae6f9c 100644 --- a/saqc/lib/rolling.py +++ b/saqc/lib/rolling.py @@ -10,7 +10,9 @@ from typing import Literal import numpy as np import pandas as pd +from numpy.lib.stride_tricks import sliding_window_view +from saqc.lib.checking import validateChoice, validateMinPeriods, validateWindow from saqc.lib.tools import getFreqDelta @@ -26,35 +28,35 @@ def windowRoller( * implements efficient 2d rolling in case of regular timestamps or integer defined window * else: dispatches to not optimized (no-numba) version in case of irregular timestamp """ - supportedFuncs = ["mean", "median", "std", "var", "sum"] - if func not in supportedFuncs: - raise ValueError(f'"func" has to be one of {supportedFuncs}. 
Got {func}.')
+    validateWindow(window)
+    validateMinPeriods(min_periods, optional=False)
+    validateChoice(func, "func", ["mean", "median", "std", "var", "sum"])
+
     func_kwargs = {}
     if func in ["std", "var"]:
         func_kwargs.update({"ddof": 1})
+
     roll_func = getattr(np, "nan" + func)
     regularFreq = getFreqDelta(data.index)
-    vals = data.values
     if regularFreq is not None:
-        window = (
-            int(pd.Timedelta(window) / pd.Timedelta(regularFreq))
-            if isinstance(window, str)
-            else window
-        )
+        if isinstance(window, str):
+            window = int(pd.Timedelta(window) / pd.Timedelta(regularFreq))
+        vals = data.values
         ramp = np.empty(((window - 1), vals.shape[1]))
         ramp.fill(np.nan)
         vals = np.concatenate([ramp, vals])
         if center:
             vals = np.roll(vals, axis=0, shift=-int(window / 2))
-        views = np.lib.stride_tricks.sliding_window_view(
-            vals, (window, vals.shape[1])
-        ).squeeze()
+        views = sliding_window_view(vals, (window, vals.shape[1])).squeeze()
         result = roll_func(views, axis=(1, 2), **func_kwargs)
+
         if min_periods > 0:
             invalid_wins = (~np.isnan(views)).sum(axis=(1, 2)) < min_periods
             result[invalid_wins] = np.nan
+
         out = pd.Series(result, index=data.index, name="result")
+
     else:  # regularFreq is None
         i_ser = pd.Series(range(data.shape[0]), index=data.index, name="result")
         result = i_ser.rolling(window=window, center=center).apply(
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 5c719aa53..7abb412c1 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -17,12 +17,16 @@ from typing import (
     Any,
     Callable,
     Collection,
+    Iterable,
     List,
     Literal,
     Sequence,
     Tuple,
     TypeVar,
     Union,
+    get_args,
+    get_origin,
+    overload,
 )
 
 import numpy as np
@@ -30,63 +34,31 @@ import pandas as pd
 from scipy import fft
 from scipy.cluster.hierarchy import fcluster, linkage
 
+from saqc.lib.checking import _isLiteral
 from saqc.lib.types import CompT
 
-T = TypeVar("T", str, float, int)
-BOUND_OPERATORS = {
-    None: (op.le, op.ge),
-    "both": (op.lt, op.gt),
-    "right": (op.le, op.gt),
-    "left": (op.gt, op.le),
-}
+T = TypeVar("T")
 
 
-def isInBounds(
-    val: int | float,
-    bounds: Tuple[int | float],
-    closed: Literal["left", "right", "both"] = None,
-):
-    """
-    check if val falls into the interval [left, right] and return boolean accordingly
-
-    val :
-        value to check
-
-    bounds :
-        Tuple containing left and right interval bounds. Pass `(a, b)` to define the interval [`a`, `b`].
-        Set `a=-inf` or `b=+inf` to set one sided restriction.
-
-    closed :
-        Enclosure includes the interval bounds into the constraint interval. By default, the bounds
-        are not included. Pass:
-        * `"both"`: to include both sides of the interval
-        * `"left"`: to include left bound
-        * `"right"`: to include right bound
-    """
-    ops = BOUND_OPERATORS[closed]
-    if ops[0](val, bounds[0]) or ops[1](val, bounds[1]):
-        return False
-    return True
-
+def extractLiteral(lit) -> List:
+    """Return a list of values from a typing.Literal[...] at runtime."""
+    if not _isLiteral(lit):
+        raise TypeError("'lit' must be a typing.Literal")
+    return list(get_args(lit))
 
-def assertScalar(name, value, optional=False):
-    if optional and value is None:
-        return
-    if np.isscalar(value):
-        return
-    msg = f"'{name}' needs to be a scalar"
-    if optional:
-        msg += " or 'None'"
-    raise ValueError(msg)
-
-
-def toSequence(value: T | Sequence[T]) -> List[T]:
-    if value is None:  # special case
-        return [None]
-    if isinstance(value, T.__constraints__):
+# fmt: off
+@overload
+def toSequence(value: T) -> List[T]:
+    ...
+@overload
+def toSequence(value: Sequence[T]) -> List[T]:
+    ... 
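+# Hint: the overloads above only narrow the types for static checkers;
+# at runtime the single implementation below handles both cases, e.g.
+# toSequence("ab") -> ["ab"], while toSequence(["a", "b"]) -> ["a", "b"]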
+def toSequence(value) -> List:
+    if value is None or isinstance(value, (str, float, int)):
+        return [value]
+    return list(value)
+# fmt: on
 
 
 def squeezeSequence(value: Sequence[T]) -> Union[T, Sequence[T]]:
@@ -95,7 +67,9 @@ def squeezeSequence(value: Sequence[T]) -> Union[T, Sequence[T]]:
         return value
 
 
-def periodicMask(dtindex, season_start, season_end, include_bounds):
+def periodicMask(
+    dtindex: pd.Index, season_start: str, season_end: str, include_bounds: bool
+):
     """
     This function generates date-periodic/seasonal masks from an index passed.
 
@@ -216,7 +190,7 @@ def isQuoted(string):
     return bool(re.search(r"'.*'|\".*\"", string))
 
 
-def mutateIndex(index, old_name, new_name):
+def mutateIndex(index: pd.Index, old_name, new_name):
     pos = index.get_loc(old_name)
     index = index.drop(index[pos])
     index = index.insert(pos, new_name)
@@ -224,13 +198,13 @@ def mutateIndex(index, old_name, new_name):
 
 
 def estimateFrequency(
-    index,
-    delta_precision=-1,
-    max_rate="10s",
-    min_rate="1D",
-    optimize=True,
-    min_energy=0.2,
-    max_freqs=10,
+    index: pd.Index,
+    delta_precision: int = -1,
+    max_rate: str = "10s",
+    min_rate: str = "1D",
+    optimize: bool = True,
+    min_energy: float = 0.2,
+    max_freqs: int = 10,
     bins=None,
 ):
     """
@@ -601,3 +575,53 @@ def isAllBoolean(obj: Any):
         if not pd.api.types.is_bool_dtype(obj[c]):
             return False
     return True
+
+
+def joinExt(sep: str, iterable: Iterable[str], last_sep: str | None = None) -> str:
+    """
+    Return a string which is the concatenation of the strings in iterable.
+    A TypeError will be raised if there are any non-string values in iterable,
+    including bytes objects. This works exactly like the built-in `str.join`,
+    but extends it with an optional last separator.
+
+    Parameters
+    ----------
+    sep :
+        Separator for default concatenation.
+
+    iterable :
+        Iterable to concatenate.
+
+    last_sep :
+        Separator for the second last and last element.
+
+    Returns
+    -------
+    str
+        The concatenated strings.
+
+    Examples
+    --------
+
+    >>> joinExt(', ', ['a', 'b', 'c'], ' or ')
+    'a, b or c'
+
+    >>> joinExt(', ', ['a', 'b'], ' or ')
+    'a or b'
+
+    >>> joinExt(', ', ['a'], ' or ')
+    'a'
+
+    >>> joinExt(', ', ['a', 'b', 'c'])
+    'a, b, c'
+    """
+    if last_sep is None:
+        last_sep = sep
+    if not isinstance(sep, str):
+        raise TypeError("'sep' must be string")
+    if not isinstance(last_sep, str):
+        raise TypeError("'last_sep' must be string or None")
+    iterable = list(iterable)  # ensure __len__ and __getitem__
+    if len(iterable) < 2:
+        return sep.join(iterable)
+    return f"{sep.join(iterable[:-1])}{last_sep}{iterable[-1]}"
diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py
index 8fbb27e61..f8f7f660a 100644
--- a/saqc/lib/ts_operators.py
+++ b/saqc/lib/ts_operators.py
@@ -12,7 +12,7 @@ The module gathers all kinds of timeseries tranformations. 
 import re
 import sys
 import warnings
-from typing import Union
+from typing import Literal, Union
 
 import numpy as np
 import numpy.polynomial.polynomial as poly
@@ -21,6 +21,7 @@ from scipy.signal import butter, filtfilt
 from scipy.stats import iqr, median_abs_deviation
 from sklearn.neighbors import NearestNeighbors
 
+from saqc.lib.checking import validateChoice, validateWindow
 from saqc.lib.tools import getFreqDelta
 
 
@@ -328,46 +329,53 @@ def interpolateNANs(data, method, order=2, gap_limit=2, extrapolate=None):
 
     :return:
     """
-    # helper variable for checking numerical value of gap limit, if its a numeric value (to avoid comparison to str)
+    # TODO: IMAO, this code desperately needs a refactoring/rewrite --palmb
+
     gap_check = np.nan if isinstance(gap_limit, str) else gap_limit
     data = pd.Series(data, copy=True)
     limit_area = None if extrapolate else "inside"
     if gap_check is None:
-        # if there is actually no limit set to the gaps to-be interpolated, generate a dummy mask for the gaps
+        # if there is actually no limit set to the gaps to be interpolated,
+        # generate a dummy mask for the gaps
         gap_mask = pd.Series(True, index=data.index, name=data.name)
+    elif gap_check < 2:
+        # breaks execution down the line and is thus caught here, since
+        # it basically means "do nothing"
+        return data
     else:
-        if gap_check < 2:
-            # breaks execution down the line and is thus catched here since it basically means "do nothing"
-            return data
+        # if there is a limit to the gaps to be interpolated, generate
+        # a mask that evaluates to False at the right side of each too-large
+        # gap with a rolling.sum combo
+        gap_mask = data.rolling(gap_limit, min_periods=0).count() > 0
+
+        # correction for initial gap
+        if isinstance(gap_limit, int):
+            gap_mask.iloc[:gap_limit] = True
+
+        if gap_limit == 2:
+            # for the common case of gap_limit=2 (default "harmonisation"),
+            # we efficiently back-propagate the False value to fill the
+            # whole too-large gap by a shift and a conjunction
+            gap_mask = gap_mask & gap_mask.shift(-1, fill_value=True)
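+            # e.g. an isolated NaN keeps a True mask (and so gets
+            # interpolated), while for two consecutive NaNs the rolling
+            # count yields False at the second one and the shift copies
+            # that onto the first, masking the whole gap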
-                gap_mask = gap_mask & gap_mask.shift(-1, fill_value=True)
-            else:
-                # If the gap_size is bigger we make a flip-rolling combo to backpropagate the False values
-                gap_mask = ~(
-                    (~gap_mask[::-1]).rolling(gap_limit, min_periods=0).sum() > 0
-                )[::-1]
+        else:
+            # if the gap size is bigger, we use a flip-rolling combo to
+            # back-propagate the False values
+            gap_mask = ~((~gap_mask[::-1]).rolling(gap_limit, min_periods=0).sum() > 0)[
+                ::-1
+            ]
 
     # memorizing the index for later reindexing
     pre_index = data.index
-    # drop the gaps that are too large with regard to the gap_limit from the data-to-be interpolated
+    # drop the gaps that are too large with regard to the gap_limit from
+    # the data to be interpolated
    data = data[gap_mask]
 
     if data.empty:
         return data
 
     if method in ["linear", "time"]:
-        # in the case of linear interpolation, not much can go wrong/break so this conditional branch has efficient
-        # finish by just calling pandas interpolation routine to fill the gaps remaining in the data:
+        # in the case of linear interpolation, not much can go wrong/break,
+        # so this branch finishes efficiently by just calling the pandas
+        # interpolation routine to fill the gaps remaining in the data:
         data.interpolate(
             method=method,
             inplace=True,
@@ -376,10 +384,12 @@
         )
     else:
-        # if the method that is interpolated with, depends on not only the left and right border points of any gap,
-        # but includes more points, it has to be applied on any data chunk seperated by the too-big gaps individually.
-        # So we use the gap_mask to group the data into chunks and perform the interpolation on every chunk seperatly
-        # with the .transform method of the grouper.
+        # if the interpolation method depends not only on the left and
+        # right border points of a gap, but on more points, it has to be
+        # applied to every data chunk separated by the too-large gaps
+        # individually. So we use the gap_mask to group the data into
+        # chunks and perform the interpolation on every chunk separately
+        # with the .transform method of the grouper.
         gap_mask = (~gap_mask).cumsum()[data.index]
         chunk_groups = data.groupby(by=gap_mask)
         data = chunk_groups.transform(
@@ -410,13 +420,15 @@
     Timestamps that gets no values projected, get filled with the fill-value. It
     also serves as a replacement for "invalid" intervals.
     """
+    validateChoice(method, "method", ["nagg", "bagg", "fagg"])
+    validateWindow(freq, "freq", allow_int=False)
+
     methods = {
         # offset, closed, label
         "nagg": lambda f: (f / 2, "left", "left"),
         "bagg": lambda _: (pd.Timedelta(0), "left", "left"),
        "fagg": lambda _: (pd.Timedelta(0), "right", "right"),
     }
-
     # filter data for invalid patterns (since filtering is expensive we pre-check if
     # it is demanded)
     if max_invalid_total is not None or max_invalid_consec is not None:
@@ -466,13 +478,17 @@
 def shift2Freq(
-    data: Union[pd.Series, pd.DataFrame], method: str, freq: str, fill_value
+    data: Union[pd.Series, pd.DataFrame],
+    method: Literal["fshift", "bshift", "nshift"],
+    freq: str,
+    fill_value,
 ):
     """
     shift timestamps backwards/forwards in order to align them with an
     equidistant frequency grid. Resulting Nan's are replaced with the fill-value. 
""" - + validateWindow(freq, "freq", allow_int=False) + validateChoice(method, "method", ["fshift", "bshift", "nshift"]) methods = { "fshift": lambda freq: ("ffill", pd.Timedelta(freq)), "bshift": lambda freq: ("bfill", pd.Timedelta(freq)), diff --git a/tests/lib/test_tools.py b/tests/lib/test_tools.py index 2e1534ce3..54afae663 100644 --- a/tests/lib/test_tools.py +++ b/tests/lib/test_tools.py @@ -6,25 +6,26 @@ import pandas as pd import pytest import saqc.lib.tools as tools +from saqc.lib.checking import validateScalar @pytest.mark.parametrize("optional", [False, True]) @pytest.mark.parametrize("value", [1, 0, "foo", np.nan, np.inf]) -def test_assertScalar(value, optional): - tools.assertScalar("value", value, optional) +def test_validateScalar(value, optional): + validateScalar(value, "value", optional) @pytest.mark.parametrize("optional", [False, True]) @pytest.mark.parametrize("value", [[1], [0, 1], {}, {1, 2}, pd.Series([1, 2])]) -def test_assertScalar_raises(value, optional): +def test_validateScalar(value, optional): with pytest.raises(ValueError): - tools.assertScalar("value", value, optional) + validateScalar(value, "value", optional) -def test_assertScalar_optional(): - tools.assertScalar("value", None, optional=True) +def test_validateScalar(): + validateScalar("value", None, optional=True) with pytest.raises(ValueError): - tools.assertScalar("value", None, optional=False) + validateScalar(None, "value", optional=False) class _ListLike(list): -- GitLab