diff --git a/saqc/funcs/interpolation.py b/saqc/funcs/interpolation.py
index 24483f85f78143891a714ed813931188a2cc19dd..516c7090e592eeb7d67ad0c73b72eadc2058fc53 100644
--- a/saqc/funcs/interpolation.py
+++ b/saqc/funcs/interpolation.py
@@ -445,6 +445,9 @@ class InterpolationMixin:
         Function to interpolate the data at regular (equidistant) timestamps
         also known as or grid points.
 
+        .. deprecated:: 2.4.0
+           Use :py:meth:`~saqc.SaQC.align` instead.
+
         Parameters
         ----------
         freq :
@@ -474,13 +477,9 @@ class InterpolationMixin:
            * ``'forward'``/``'backward'`` - perform forward/backward extrapolation
            * ``'both'`` - perform forward and backward extrapolation
         """
-        msg = (
-            "The method `interpolateIndex` is deprecated and will be removed "
-            "in version 3.0 of saqc. To achieve the same behavior use: "
-        )
         call = (
-            "qc.align(field={field}, freq={freq}, method={method}, "
-            "order={order}, extrapolate={extrapolate})"
+            f"qc.align(field={field}, freq={freq}, method={method}, "
+            f"order={order}, extrapolate={extrapolate})"
         )
         if limit != 2:
             call = (
diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py
index 64ac08fbd1c4d356581f314734a435843385d313..3be87afc9763e272f4b3975db2ff4c90c784fd98 100644
--- a/saqc/funcs/outliers.py
+++ b/saqc/funcs/outliers.py
@@ -42,18 +42,13 @@ if TYPE_CHECKING:
 
 class OutliersMixin:
     @staticmethod
-    def _validateLOF(n, thresh, algorithm, p, density):
+    def _validateLOF(algorithm, n, p, density):
         """validate parameter for LOF and UniLOF"""
         validateValueBounds(n, "n", left=0, strict_int=True)
         validateValueBounds(p, "p", left=0, strict_int=True)
-
         validateChoice(
             algorithm, "algorithm", ["ball_tree", "kd_tree", "brute", "auto"]
         )
-
-        if thresh != "auto" and not isFloatLike(thresh):
-            raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
-
         if density != "auto" and not isFloatLike(density) and not isCallable(density):
             raise ValueError(
                 f"'density' must be 'auto' or a float or a function, not {density}"
@@ -149,7 +144,10 @@
         the scores are cut off at a level, determined by
         :py:attr:`thresh`.
         """
-        self._validateLOF(n, thresh, algorithm, p, density)
+        self._validateLOF(algorithm, n, p, density)
+        if thresh != "auto" and not isFloatLike(thresh):
+            raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
+
         fields = toSequence(field)
         field_ = str(uuid.uuid4())
         qc = self.assignLOF(
@@ -352,7 +350,9 @@
 
            qc.plot('sac254_raw')
         """
-        self._validateLOF(n, thresh, algorithm, p, density)
+        self._validateLOF(algorithm, n, p, density)
+        if thresh != "auto" and not isFloatLike(thresh):
+            raise ValueError(f"'thresh' must be 'auto' or a float, not {thresh}")
 
         tmp_field = str(uuid.uuid4())
         qc = self.assignUniLOF(
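The `thresh` check moves out of the shared `_validateLOF` helper and into the two flagging methods that actually accept that parameter, so score-assignment methods can reuse the validator without it. A hypothetical call sketch of how the relocated check surfaces to users (field name and data are invented; `flagUniLOF` and its `n`/`thresh` parameters are taken from the saqc API):

```python
import numpy as np
import pandas as pd
import saqc

idx = pd.date_range("2021-01-01", periods=100, freq="10min")
qc = saqc.SaQC(pd.DataFrame({"x": np.sin(np.arange(100.0))}, index=idx))

qc = qc.flagUniLOF("x", n=20, thresh=1.5)  # ok: float-like
try:
    qc.flagUniLOF("x", n=20, thresh="high")  # neither 'auto' nor float-like
except ValueError as err:
    print(err)  # 'thresh' must be 'auto' or a float, not high
```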
diff --git a/saqc/funcs/pattern.py b/saqc/funcs/pattern.py
index b9869c8a1640955db25c9d5c56a1c041ebc7c3aa..fe910aa35408416c5f1d94069cf12b2c17f1c0be 100644
--- a/saqc/funcs/pattern.py
+++ b/saqc/funcs/pattern.py
@@ -1,16 +1,12 @@
 #! /usr/bin/env python
-
 # SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
-#
 # SPDX-License-Identifier: GPL-3.0-or-later
-
 # -*- coding: utf-8 -*-
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
 import dtw
-import numpy as np
 import pandas as pd
 
 from saqc import BAD
@@ -22,7 +18,7 @@ if TYPE_CHECKING:
 
 
 def calculateDistanceByDTW(
-    data: pd.Series, reference: pd.Series, forward=True, normalize=True
+    data: pd.Series, reference: pd.Series, forward: bool = True, normalize: bool = True
 ):
     """
     Calculate the DTW-distance of data to pattern in a rolling calculation.
@@ -35,19 +31,19 @@
     Parameters
     ----------
-    data : pd.Series
+    data :
         Data series. Must have datetime-like index, and must be regularly sampled.
 
-    reference : : pd.Series
+    reference :
         Reference series.
         Must have datetime-like index, must not contain NaNs
         and must not be empty.
 
-    forward: bool, default True
+    forward :
         If `True`, the distance value is set on the left edge of the data chunk. This
         means, with a perfect match, `0.0` marks the beginning of the pattern in the
         data. If `False`, `0.0` would mark the end of the pattern.
 
-    normalize : bool, default True
+    normalize :
         If `False`, return unmodified distances.
         If `True`, normalize distances by the number of observations in the reference.
         This helps to make it easier to find a good cutoff threshold for further
@@ -97,12 +93,12 @@ class PatternMixin:
     @flagging()
     def flagPatternByDTW(
         self: "SaQC",
-        field,
-        reference,
-        max_distance=0.0,
-        normalize=True,
-        plot=False,
-        flag=BAD,
+        field: str,
+        reference: str,
+        max_distance: float = 0.0,
+        normalize: bool = True,
+        plot: bool = False,
+        flag: float = BAD,
         **kwargs,
     ) -> "SaQC":
         """
@@ -111,48 +107,54 @@
         The steps are:
         1. work on a moving window
 
-        2. for each data chunk extracted from each window, a distance to the given pattern
-           is calculated, by the dynamic time warping algorithm [1]
+        2. for each data chunk extracted from each window, a distance
+           to the given pattern is calculated, by the dynamic time warping
+           algorithm [1]
 
-        3. if the distance is below the threshold, all the data in the window gets flagged
+        3. if the distance is below the threshold, all the data in the
+           window gets flagged
 
         Parameters
         ----------
         reference :
-            The name in `data` which holds the pattern. The pattern must not have NaNs,
-            have a datetime index and must not be empty.
+            The name in `data` which holds the pattern. The pattern must
+            not have NaNs, have a datetime index and must not be empty.
 
         max_distance :
-            Maximum dtw-distance between chunk and pattern, if the distance is lower than
-            ``max_distance`` the data gets flagged. With default, ``0.0``, only exact
-            matches are flagged.
+            Maximum dtw-distance between chunk and pattern; if the distance
+            is lower than ``max_distance``, the data gets flagged. With
+            the default, ``0.0``, only exact matches are flagged.
 
         normalize :
             If `False`, return unmodified distances.
-            If `True`, normalize distances by the number of observations of the reference.
-            This helps to make it easier to find a good cutoff threshold for further
-            processing. The distances then refer to the mean distance per datapoint,
-            expressed in the datas units.
+            If `True`, normalize distances by the number of observations
+            of the reference. This makes it easier to find a good
+            cutoff threshold for further processing. The distances
+            then refer to the mean distance per datapoint, expressed
+            in the data's units.
 
         plot :
-            Show a calibration plot, which can be quite helpful to find the right threshold
-            for `max_distance`. It works best with `normalize=True`. Do not use in automatic
-            setups / pipelines. The plot show three lines:
+            Show a calibration plot, which can be quite helpful to find
+            the right threshold for `max_distance`. It works best with
+            `normalize=True`. Do not use in automatic setups / pipelines.
+            The plot shows three lines:
 
             - data: the data the function was called on
             - distances: the calculated distances by the algorithm
-            - indicator: have to distinct levels: `0` and the value of `max_distance`.
-              If `max_distance` is `0.0` it defaults to `1`. Everywhere where the
-              indicator is not `0` the data will be flagged.
+            - indicator: has two distinct levels: `0` and the value of
+              `max_distance`. If `max_distance` is `0.0` it defaults to
+              `1`. Everywhere the indicator is not `0` the data will
+              be flagged.
 
         Notes
         -----
-        The window size of the moving window is set to equal the temporal extension of the
-        reference datas datetime index.
+        The window size of the moving window is set to equal the temporal
+        extension of the reference data's datetime index.
 
         References
         ----------
-        Find a nice description of underlying the Dynamic Time Warping Algorithm here:
+        Find a nice description of the underlying Dynamic Time Warping
+        Algorithm here:
 
         [1] https://cran.r-project.org/web/packages/dtw/dtw.pdf
         """
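For orientation, a usage sketch of the now-annotated module-level helper. The toy series is invented, the data sits on a regular 10min grid as the docstring requires, and the optional `dtw` dependency that pattern.py imports is assumed to be installed:

```python
import numpy as np
import pandas as pd
from saqc.funcs.pattern import calculateDistanceByDTW

idx = pd.date_range("2021-01-01", periods=200, freq="10min")
data = pd.Series(np.sin(np.linspace(0, 20, 200)), index=idx, name="signal")

# the shape we search for: a slice of the series itself, so an exact
# match (distance 0.0) must exist at its own position
reference = data.iloc[30:50]

distances = calculateDistanceByDTW(data, reference, forward=True, normalize=True)
print(distances.idxmin())  # left edge of the best-matching chunk
```

With `normalize=True` the distances are per-datapoint means in the data's units, which is what makes eyeballing a `max_distance` cutoff practical.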
diff --git a/saqc/funcs/resampling.py b/saqc/funcs/resampling.py
index 3f397133b24b3075f8687ff3d89fec49c3c07c55..817a974a5350ee2fc0cbdc8d3b966adc98b281e9 100644
--- a/saqc/funcs/resampling.py
+++ b/saqc/funcs/resampling.py
@@ -17,6 +17,7 @@ from typing_extensions import Literal
 from saqc.constants import UNFLAGGED
 from saqc.core import register
 from saqc.core.history import History
+from saqc.lib.checking import validateCallable, validateChoice
 from saqc.lib.docs import DOC_TEMPLATES
 from saqc.lib.tools import filterKwargs, getFreqDelta, isflagged
 from saqc.lib.ts_operators import aggregate2Freq
@@ -45,22 +46,27 @@ class ResamplingMixin:
         **kwargs,
     ) -> "SaQC":
         """
-        A method to "regularize" data by interpolating linearly the data at regular timestamp.
-
-        .. deprecated:: 2.4.0
-           Use :py:meth:`~saqc.SaQC.align` with ``method="linear"`` instead.
-
-        A series of data is considered "regular", if it is sampled regularly (= having uniform sampling rate).
-        Interpolated values will get assigned the worst flag within freq-range.
-        Note, that the data only gets interpolated at those (regular) timestamps, that have a valid (existing and
-        not-na) datapoint preceeding them and one succeeding them within freq range.
-        Regular timestamp that do not suffice this condition get nan assigned AND The associated flag will be of value
-        ``UNFLAGGED``.
+        A method to "regularize" data by interpolating the data linearly
+        at regular timestamps.
+
+        .. deprecated:: 2.4.0
+           Use :py:meth:`~saqc.SaQC.align` with ``method="linear"``
+           instead.
+
+        A series of data is considered "regular" if it is sampled regularly
+        (= has a uniform sampling rate). Interpolated values will get
+        assigned the worst flag within freq-range. Note that the data
+        only gets interpolated at those (regular) timestamps that have
+        a valid (existing and not-NaN) datapoint preceding them and one
+        succeeding them within freq range. Regular timestamps that do
+        not satisfy this condition get NaN assigned, and the associated
+        flag will be of value ``UNFLAGGED``.
 
         Parameters
         ----------
         freq :
-            An offset string. The frequency of the grid you want to interpolate your data at.
+            An offset string. The frequency of the grid you want to
+            interpolate your data at.
         """
         warnings.warn(
             f"""
@@ -70,7 +76,6 @@ class ResamplingMixin:
             """,
             DeprecationWarning,
         )
-
         reserved = ["method", "order", "limit", "downgrade"]
         kwargs = filterKwargs(kwargs, reserved)
         return self.interpolateIndex(field, freq, "time", **kwargs)
@@ -86,8 +91,8 @@
         """
         Shift data points and flags to a regular frequency grid.
 
-        .. deprecated:: 2.4.0
-            Use :py:meth:`~saqc.SaQC.align` instead.
+        .. deprecated:: 2.4.0
+           Use :py:meth:`~saqc.SaQC.align` instead.
 
         Parameters
         ----------
@@ -104,12 +109,11 @@
         warnings.warn(
             f"""
             The method `shift` is deprecated and will be removed with version 2.6 of saqc.
-            To achieve the same behavior please use:
-            `qc.align(field={field}, freq={freq}. method={method})`
+            To achieve the same behavior please use: `qc.align(field={field}, freq={freq}, method={method})`
             """,
             DeprecationWarning,
         )
-
+        validateChoice(method, "method", ["fshift", "bshift", "nshift"])
         return self.align(field=field, freq=freq, method=method, **kwargs)
@@ -165,11 +169,12 @@
         maxna_group :
             Same as `maxna` but for consecutive NaNs.
         """
+        validateChoice(method, "method", ["fagg", "bagg", "nagg"])
+        validateCallable(func, "func")
         datcol = self._data[field]
-
         if datcol.empty:
-            # see for #GL-374
+            # see #GL-374
             datcol = pd.Series(index=pd.DatetimeIndex([]), dtype=datcol.dtype)
 
         datcol = aggregate2Freq(
@@ -288,6 +293,21 @@
         overwrite :
             Overwrite existing flags if ``True``
         """
+        validateChoice(
+            method,
+            "method",
+            [
+                "inverse_fagg",
+                "inverse_bagg",
+                "inverse_nagg",
+                "inverse_fshift",
+                "inverse_bshift",
+                "inverse_nshift",
+                "inverse_interpolation",
+                "match",
+                "auto",
+            ],
+        )
 
         if target is None:
             target = field
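Both deprecation messages above point users at `align`. A migration sketch with invented data and field name, following exactly what the warnings recommend:

```python
import numpy as np
import pandas as pd
import saqc

idx = pd.date_range("2021-01-01", periods=96, freq="13min")
qc = saqc.SaQC(pd.DataFrame({"temp": np.random.randn(96)}, index=idx))

# before (deprecated since 2.4.0):  qc.linear("temp", freq="15min")
qc = qc.align("temp", freq="15min", method="linear")

# before (deprecated since 2.4.0):  qc.shift("temp", freq="15min", method="nshift")
qc = qc.align("temp", freq="15min", method="nshift")
```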
diff --git a/saqc/funcs/residuals.py b/saqc/funcs/residuals.py
index 006e61ee275a34c1722dcbd8714c10a0553a8b06..d83545e3cf374047ddb099b9690dea61362b492d 100644
--- a/saqc/funcs/residuals.py
+++ b/saqc/funcs/residuals.py
@@ -71,6 +71,7 @@ class ResidualsMixin:
             sparse intervals). To automatically set the minimum number of periods to the
             number of values in an offset defined window size, pass np.nan.
         """
+        # HINT: checking in _fitPolynomial
         orig = self._data[field]
         data, _ = _fitPolynomial(
             data=self._data,
@@ -117,6 +118,7 @@
         center :
             If True, center the rolling window.
         """
+        # HINT: checking in _roll
         orig = self._data[field].copy()
         data, _ = _roll(
             data=self._data,
diff --git a/saqc/funcs/rolling.py b/saqc/funcs/rolling.py
index e8ee4cb3d269924c7e020dd2e0e2e1046636d633..70f786c5de9fa6be291a1f76307e2bd8f029e235 100644
--- a/saqc/funcs/rolling.py
+++ b/saqc/funcs/rolling.py
@@ -13,6 +13,7 @@ import numpy as np
 import pandas as pd
 
 from saqc.core import DictOfSeries, Flags, register
+from saqc.lib.checking import validateCallable, validateMinPeriods, validateWindow
 from saqc.lib.tools import getFreqDelta
 
 if TYPE_CHECKING:
@@ -53,6 +54,7 @@ class RollingMixin:
         center :
             If True, center the rolling window.
         """
+        # HINT: checking in _roll
         self._data, self._flags = _roll(
             data=self._data,
             field=field,
@@ -109,6 +111,8 @@
             """,
             DeprecationWarning,
         )
+
+        # HINT: checking in _roll
         self._data, self._flags = _roll(
             data=self._data,
             field=field,
@@ -132,6 +136,10 @@ def _roll(
     center: bool = True,
     **kwargs,
 ):
+    validateWindow(window)
+    validateMinPeriods(min_periods)
+    validateCallable(func, "func")
+
     to_fit = data[field].copy()
     if to_fit.empty:
         return data, flags
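Since the public `rolling` and the deprecated `roll` both funnel into `_roll`, the new checks fire once for either entry point, which is what the `# HINT` comments record. A hypothetical call sketch (data and field name invented):

```python
import numpy as np
import pandas as pd
import saqc

idx = pd.date_range("2021-01-01", periods=48, freq="30min")
qc = saqc.SaQC(pd.DataFrame({"level": np.random.randn(48)}, index=idx))

qc = qc.rolling("level", window="3h", func=np.nanmedian)  # ok: offset window
try:
    qc.rolling("level", window=-5, func=np.nanmedian)  # not a valid window
except ValueError as err:
    print(err)  # raised by validateWindow inside _roll, before any rolling
```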
""" + # HINT: checking in _roll self._data, self._flags = _roll( data=self._data, field=field, @@ -109,6 +111,8 @@ class RollingMixin: """, DeprecationWarning, ) + + # HINT: checking in _roll self._data, self._flags = _roll( data=self._data, field=field, @@ -132,6 +136,10 @@ def _roll( center: bool = True, **kwargs, ): + validateWindow(window) + validateMinPeriods(min_periods) + validateCallable(func, "func") + to_fit = data[field].copy() if to_fit.empty: return data, flags diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py index d0aa2daa2f2114bc9b586a7cd4eefba57bb94c8e..5aca31c51cdf212c6733b6eb9355ba720918581e 100644 --- a/saqc/funcs/scores.py +++ b/saqc/funcs/scores.py @@ -16,6 +16,12 @@ from typing_extensions import Literal from saqc import UNFLAGGED from saqc.core import register +from saqc.lib.checking import ( + validateCallable, + validateChoice, + validateMinPeriods, + validateWindow, +) from saqc.lib.docs import DOC_TEMPLATES from saqc.lib.tools import getApply, toSequence from saqc.lib.ts_operators import kNN @@ -50,7 +56,7 @@ def _groupedScoring( min_periods: int = 2, score_func: Callable = _LOFApply, score_kwargs: Optional[dict] = None, -) -> Tuple[pd.Series, pd.Series, pd.Series]: +) -> pd.Series: score_kwargs = score_kwargs or {} score_index = val_frame.index score_ser = pd.Series(np.nan, index=score_index) @@ -118,6 +124,11 @@ def _univarScoring( min_periods Minimum number of valid meassurements in a scoring window, to consider the resulting score valid. """ + validateWindow(window, optional=True) + validateCallable(model_func, "model_func") + validateCallable(norm_func, "norm_func") + validateMinPeriods(min_periods, optional=True) + if data.empty: return data, data, data if min_periods is None: @@ -227,6 +238,11 @@ class ScoresMixin: ---------- [1] https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html """ + validateChoice( + algorithm, "algorithm", ["ball_tree", "kd_tree", "brute", "auto"] + ) + validateCallable(func, "func") + if isinstance(target, list): if len(target) > 1: raise ValueError( @@ -366,6 +382,11 @@ class ScoresMixin: * `1` - Manhatten Metric * `2` - Euclidian Metric """ + from saqc.funcs.outliers import OutliersMixin + + validateMinPeriods(min_periods) + OutliersMixin._validateLOF(algorithm, n, p, 1.0) + if isinstance(target, list): if len(target) > 1: raise ValueError( @@ -463,6 +484,9 @@ class ScoresMixin: -------- """ + from saqc.funcs.outliers import OutliersMixin + + OutliersMixin._validateLOF(algorithm, n, p, density) vals = self._data[field] if fill_na: diff --git a/saqc/funcs/tools.py b/saqc/funcs/tools.py index 11aac29f0fb3130ca91277cb5e80b2a6247f96b6..bce714086ec864a2d17e2a124a9917d10bd457a3 100644 --- a/saqc/funcs/tools.py +++ b/saqc/funcs/tools.py @@ -17,6 +17,7 @@ from typing_extensions import Literal from saqc import FILTER_NONE, UNFLAGGED from saqc.core import processing, register +from saqc.lib.checking import validateChoice from saqc.lib.docs import DOC_TEMPLATES from saqc.lib.plotting import makeFig from saqc.lib.tools import periodicMask @@ -177,6 +178,8 @@ class ToolsMixin: >>> start = "22:00:00" >>> end = "06:00:00" """ + validateChoice(mode, "mode", ["periodic", "selection_field"]) + datcol_idx = self._data[field].index if mode == "periodic": diff --git a/saqc/lib/rolling.py b/saqc/lib/rolling.py index 6016d29d6f12b59fd29af060b5a5363ccb8df73b..5c4ae6f9cef35c23d07669a305c1adba23d9189e 100644 --- a/saqc/lib/rolling.py +++ b/saqc/lib/rolling.py @@ -10,7 +10,9 @@ from typing import Literal import 
diff --git a/saqc/funcs/tools.py b/saqc/funcs/tools.py
index 11aac29f0fb3130ca91277cb5e80b2a6247f96b6..bce714086ec864a2d17e2a124a9917d10bd457a3 100644
--- a/saqc/funcs/tools.py
+++ b/saqc/funcs/tools.py
@@ -17,6 +17,7 @@ from typing_extensions import Literal
 
 from saqc import FILTER_NONE, UNFLAGGED
 from saqc.core import processing, register
+from saqc.lib.checking import validateChoice
 from saqc.lib.docs import DOC_TEMPLATES
 from saqc.lib.plotting import makeFig
 from saqc.lib.tools import periodicMask
@@ -177,6 +178,8 @@
         >>> start = "22:00:00"
         >>> end = "06:00:00"
         """
+        validateChoice(mode, "mode", ["periodic", "selection_field"])
+
         datcol_idx = self._data[field].index
 
         if mode == "periodic":
diff --git a/saqc/lib/rolling.py b/saqc/lib/rolling.py
index 6016d29d6f12b59fd29af060b5a5363ccb8df73b..5c4ae6f9cef35c23d07669a305c1adba23d9189e 100644
--- a/saqc/lib/rolling.py
+++ b/saqc/lib/rolling.py
@@ -10,7 +10,9 @@ from typing import Literal
 
 import numpy as np
 import pandas as pd
+from numpy.lib.stride_tricks import sliding_window_view
 
+from saqc.lib.checking import validateChoice, validateMinPeriods, validateWindow
 from saqc.lib.tools import getFreqDelta
 
 
@@ -26,35 +28,35 @@ def windowRoller(
     * implements efficient 2d rolling in case of regular timestamps or integer defined window
     * else: dispatches to not optimized (no-numba) version in case of irregular timestamp
     """
-    supportedFuncs = ["mean", "median", "std", "var", "sum"]
-    if func not in supportedFuncs:
-        raise ValueError(f'"func" has to be one of {supportedFuncs}. Got {func}.')
+    validateWindow(window)
+    validateMinPeriods(min_periods, optional=False)
+    validateChoice(func, "func", ["mean", "median", "std", "var", "sum"])
+
     func_kwargs = {}
     if func in ["std", "var"]:
         func_kwargs.update({"ddof": 1})
+
     roll_func = getattr(np, "nan" + func)
     regularFreq = getFreqDelta(data.index)
-    vals = data.values
     if regularFreq is not None:
-        window = (
-            int(pd.Timedelta(window) / pd.Timedelta(regularFreq))
-            if isinstance(window, str)
-            else window
-        )
+        if isinstance(window, str):
+            window = int(pd.Timedelta(window) / pd.Timedelta(regularFreq))
+        vals = data.values
         ramp = np.empty(((window - 1), vals.shape[1]))
         ramp.fill(np.nan)
         vals = np.concatenate([ramp, vals])
         if center:
            vals = np.roll(vals, axis=0, shift=-int(window / 2))
-        views = np.lib.stride_tricks.sliding_window_view(
-            vals, (window, vals.shape[1])
-        ).squeeze()
+        views = sliding_window_view(vals, (window, vals.shape[1])).squeeze()
         result = roll_func(views, axis=(1, 2), **func_kwargs)
+
         if min_periods > 0:
             invalid_wins = (~np.isnan(views)).sum(axis=(1, 2)) < min_periods
             result[invalid_wins] = np.nan
+
         out = pd.Series(result, index=data.index, name="result")
+
     else:  # regularFreq is None
         i_ser = pd.Series(range(data.shape[0]), index=data.index, name="result")
         result = i_ser.rolling(window=window, center=center).apply(
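An illustrative call of the refactored helper on the fast (regular-grid) path. Column names and values are invented, and the keyword names follow the parameters used in the hunk above; the exact signature lives in saqc/lib/rolling.py and may differ in detail:

```python
import numpy as np
import pandas as pd
from saqc.lib.rolling import windowRoller

idx = pd.date_range("2021-01-01", periods=100, freq="1h")
df = pd.DataFrame(
    {"a": np.random.randn(100), "b": np.random.randn(100)}, index=idx
)

# on this regular 1h grid, "1D" is translated to a 24-sample window, and the
# sliding_window_view branch computes one joint rolling nan-median over both
# columns; windows with fewer than min_periods valid values come back as NaN
out = windowRoller(df, window="1D", func="median", min_periods=12, center=True)
```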
""" + validateChoice(method, "method", ["nagg", "bagg", "fagg"]) + validateWindow(freq, "freq", allow_int=False) + methods = { # offset, closed, label "nagg": lambda f: (f / 2, "left", "left"), "bagg": lambda _: (pd.Timedelta(0), "left", "left"), "fagg": lambda _: (pd.Timedelta(0), "right", "right"), } - # filter data for invalid patterns (since filtering is expensive we pre-check if # it is demanded) if max_invalid_total is not None or max_invalid_consec is not None: