diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bad2b405ee49661de3b268513b7bf6cf8c6f36c..c36faca39e8d1cb67e9adf195c6e67732c7dbd8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,11 @@ SPDX-License-Identifier: GPL-3.0-or-later - Methods `logicalAnd` and `logicalOr` - `Flags` supports slicing and column selection with `list` or a `pd.Index`. ### Changed +- Deprecate `interpolate`, `linear` and `shift` in favor of `align` +- Rename `interpolateInvalid` to `interpolate` +- Rename `interpolateIndex` to `align` ### Removed +- Parameter `limit` from `align` ### Fixed - fail on duplicated arguments to test methods diff --git a/docs/resources/data/config.csv b/docs/resources/data/config.csv index 261482b57c42ea28e9e0c34df5223ff413012843..77914d8035af74daa7e2669378c1ebfd6bfa8819 100644 --- a/docs/resources/data/config.csv +++ b/docs/resources/data/config.csv @@ -1,6 +1,6 @@ varname ; test #----------;----------------------------- -SM2 ; shift(freq="15Min") +SM2 ; align(freq="15Min", method="nshift") SM2 ; flagMissing() 'SM(1|2)+' ; flagRange(min=10, max=60) SM2 ; flagMAD(window="30d", z=3.5) diff --git a/saqc/funcs/interpolation.py b/saqc/funcs/interpolation.py index 9def348d0594a7528536675ff33a3e18aa5e459f..0a37748907c0dde54dd5ee5349573e1b20d31331 100644 --- a/saqc/funcs/interpolation.py +++ b/saqc/funcs/interpolation.py @@ -7,6 +7,7 @@ # -*- coding: utf-8 -*- from __future__ import annotations +import warnings from typing import TYPE_CHECKING, Callable, Union import numpy as np @@ -16,12 +17,13 @@ from typing_extensions import Literal from saqc import UNFLAGGED from saqc.core import register from saqc.lib.tools import isflagged -from saqc.lib.ts_operators import interpolateNANs +from saqc.lib.ts_operators import interpolateNANs, shift2Freq if TYPE_CHECKING: from saqc import SaQC +# TODO: remove, when `interpolateIndex` and `interpolateInvalid` are removed _SUPPORTED_METHODS = Literal[ "linear", "time", @@ -136,52 +138,81 @@ class InterpolationMixin: @register(
mask=["field"], - demask=["field"], + demask=[], squeeze=[], # func handles history by itself ) - def interpolateInvalid( + def interpolate( self: "SaQC", field: str, - method: _SUPPORTED_METHODS, + method: Literal[ + "linear", + "time", + "index", + "values", + "pad", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "spline", + "barycentric", + "polynomial", + "krogh", + "spline", + "pchip", + "akima", + "cubicspline", + "from_derivatives", + ], order: int = 2, - limit: int | None = None, - extrapolate: Literal["forward", "backward", "both"] = None, + limit: int | str | None = None, + extrapolate: Literal["forward", "backward", "both"] | None = None, flag: float = UNFLAGGED, **kwargs, ) -> "SaQC": """ - Function to interpolate nan values in data. - - There are available all the interpolation methods from the pandas.interpolate method and they are applicable by - the very same key words, that you would pass to the ``pd.Series.interpolate``'s method parameter. + Fill NaN and flagged values using an interpolation method. Parameters ---------- - field : str - Name of the column, holding the data-to-be-interpolated. - - method : {"linear", "time", "nearest", "zero", "slinear", "quadratic", "cubic", "spline", "barycentric", - "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"} - The interpolation method to use. - - order : int, default 2 - If there your selected interpolation method can be performed at different 'orders' - here you pass the desired - order. + field: + Column(s) to interpolate. + + method: + Interpolation technique to use. One of: + + * ‘linear’: Ignore the index and treat the values as equally spaced. + * ‘time’: Works on daily and higher resolution data to interpolate given length of interval. + * ‘index’, ‘values’: Use the actual numerical values of the index. + * ‘pad’: Fill in NaNs using existing values. 
+ * ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’, ‘polynomial’: + Passed to scipy.interpolate.interp1d. These methods use the numerical values of the index. + Both ‘polynomial’ and ‘spline’ require that you also specify an order (int), e.g. + ``qc.interpolate(method='polynomial', order=5)``. + * ‘krogh’, ‘spline’, ‘pchip’, ‘akima’, ‘cubicspline’: + Wrappers around the SciPy interpolation methods of similar names. + * ‘from_derivatives’: Refers to scipy.interpolate.BPoly.from_derivatives + + order: + Order of the interpolation method, ignored if not supported by the chosen ``method`` + + limit: + Maximum number of missing values to interpolate. Only gaps smaller than ``limit`` will be filled. + The gap size can be given as a number of values (integer) or a temporal extensions (offset string). + With ``None``, all missing values will be interpolated. + + extrapolate: + Use parameter to perform extrapolation instead of interpolation onto the trailing and/or leading chunks of + NaN values in data series. - limit : int or str, default None - Upper limit of missing index values (with respect to `freq`) to fill. The limit can either be expressed - as the number of consecutive missing values (integer) or temporal extension of the gaps to be filled - (Offset String). - If `None` is passed, no Limit is set. + * 'None' (default) - perform interpolation + * 'forward'/'backward' - perform forward/backward extrapolation + * 'both' - perform forward and backward extrapolation flag : float or None, default UNFLAGGED Flag that is set for interpolated values. If ``None``, no flags are set at all. - downgrade : bool, default False - If `True` and the interpolation can not be performed at current order, retry with a lower order. - This can happen, because the chosen ``method`` does not support the passed ``order``, or - simply because not enough values are present in a interval. 
- Returns ------- saqc.SaQC @@ -192,7 +223,7 @@ class InterpolationMixin: Lets generate some dummy data: - .. doctest:: interpolateInvalid + .. doctest:: interpolate >>> data = pd.DataFrame({'data':np.array([np.nan, 0, np.nan, np.nan, np.nan, 4, 5, np.nan, np.nan, 8, 9, np.nan, np.nan])}, index=pd.date_range('2000',freq='1H', periods=13)) >>> data @@ -211,12 +242,12 @@ class InterpolationMixin: 2000-01-01 11:00:00 NaN 2000-01-01 12:00:00 NaN - Use :py:meth:`~saqc.SaQC.interpolateInvalid` to do linear interpolation of up to 2 consecutive missing values: + Use :py:meth:`~saqc.SaQC.interpolate` to do linear interpolation of up to 2 consecutive missing values: - .. doctest:: interpolateInvalid + .. doctest:: interpolate >>> qc = saqc.SaQC(data) - >>> qc = qc.interpolateInvalid("data", limit=3, method='time') + >>> qc = qc.interpolate("data", limit=3, method='time') >>> qc.data # doctest:+NORMALIZE_WHITESPACE data | ======================== | @@ -236,12 +267,12 @@ class InterpolationMixin: <BLANKLINE> - Use :py:meth:`~saqc.SaQC.interpolateInvalid` to do linear extrapolaiton of up to 1 consecutive missing values: + Use :py:meth:`~saqc.SaQC.interpolate` to do linear extrapolation of up to 1 consecutive missing values: - .. doctest:: interpolateInvalid + .. doctest:: interpolate >>> qc = saqc.SaQC(data) - >>> qc = qc.interpolateInvalid("data", limit=2, method='time', extrapolate='both') + >>> qc = qc.interpolate("data", limit=2, method='time', extrapolate='both') >>> qc.data # doctest:+NORMALIZE_WHITESPACE data | ======================== | @@ -260,6 +291,26 @@ class InterpolationMixin: 2000-01-01 12:00:00 NaN | <BLANKLINE> """ + + if "freq" in kwargs: + # the old interpolate version + warnings.warn( + f""" + The method `intepolate` is deprecated and will be removed in version 3.0 of saqc.
+ To achieve the same behaviour please use: + `qc.align(field={field}, freq={kwargs["freq"]}, method={method}, order={order}, flag={flag})` + """, + DeprecationWarning, + ) + return self.align( + field=field, + freq=kwargs.pop("freq", method), + method=method, + order=order, + flag=flag, + **kwargs, + ) + inter_data = interpolateNANs( self._data[field], method, @@ -281,41 +332,69 @@ class InterpolationMixin: return self @register(mask=["field"], demask=[], squeeze=[]) - def interpolateIndex( + def align( self: "SaQC", field: str, freq: str, - method: _SUPPORTED_METHODS, + method: Literal[ + "nshift", + "bshift", + "fshift", + "linear", + "time", + "index", + "values", + "pad", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "spline", + "barycentric", + "polynomial", + "krogh", + "spline", + "pchip", + "akima", + "cubicspline", + "from_derivatives", + ] = "time", order: int = 2, - limit: int | None = 2, extrapolate: Literal["forward", "backward", "both"] = None, **kwargs, ) -> "SaQC": """ - Function to interpolate the data at regular (äquidistant) timestamps (or Grid points). + Convert time series to specified frequency. Values affected by frequency + changes will be interpolated using the given method. Parameters ---------- - field : str - Name of the column, holding the data-to-be-interpolated. - - freq : str - An Offset String, interpreted as the frequency of - the grid you want to interpolate your data at. - - method : {"linear", "time", "nearest", "zero", "slinear", "quadratic", "cubic", "spline", "barycentric", - "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"}: string - The interpolation method you want to apply. - - order : int, default 2 - If your selected interpolation method can be performed at different 'orders' - here you pass the desired - order. - - limit : int, optional - Upper limit of missing index values (with respect to `freq`) to fill.
The limit can either be expressed - as the number of consecutive missing values (integer) or temporal extension of the gaps to be filled - (Offset String). - If `None` is passed, no Limit is set. + field: + Column(s) to align. + + freq: + Target frequency. + + method: + Interpolation technique to use. One of: + + * 'nshift': shift grid points to the nearest time stamp in the range = +/- 0.5 * ``freq`` + * 'bshift' : shift grid points to the first succeeding time stamp (if any) + * 'fshift' : shift grid points to the last preceeding time stamp (if any) + * ‘linear’: Ignore the index and treat the values as equally spaced. + * 'time', ‘index’, ‘values’: Use the actual numerical values of the index. + * ‘pad’: Fill in NaNs using existing values. + * ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’, ‘polynomial’: + Passed to scipy.interpolate.interp1d. These methods use the numerical values of the index. + Both ‘polynomial’ and ‘spline’ require that you also specify an order (int), e.g. + ``qc.interpolate(method='polynomial', order=5)``. + * ‘krogh’, ‘spline’, ‘pchip’, ‘akima’, ‘cubicspline’: + Wrappers around the SciPy interpolation methods of similar names. 
+ * ‘from_derivatives’: Refers to scipy.interpolate.BPoly.from_derivatives + + order: + Order of the interpolation method, ignored if not supported by the chosen ``method`` extraplate : {'forward', 'backward', 'both'}, default None Use parameter to perform extrapolation instead of interpolation onto the trailing and/or leading chunks of @@ -329,9 +408,16 @@ class InterpolationMixin: ------- saqc.SaQC """ + + # TODO: + # - should we keep `extrapolate` + if self._data[field].empty: return self + if method in ("fshift", "bshift", "nshift"): + return _shift(saqc=self, field=field, freq=freq, method=method, **kwargs) + datcol = self._data[field].copy() start, end = datcol.index[0].floor(freq), datcol.index[-1].ceil(freq) @@ -339,9 +425,6 @@ class InterpolationMixin: start=start, end=end, freq=freq, name=datcol.index.name ) - # TODO: - # in future we could use `register(mask=[field], [], [])` - # and dont handle masking manually here flagged = isflagged(self._flags[field], kwargs["dfilter"]) # drop all points that hold no relevant grid information @@ -361,7 +444,7 @@ class InterpolationMixin: data=datcol, method=method, order=order, - gap_limit=limit, + gap_limit=2, extrapolate=extrapolate, ) @@ -378,13 +461,12 @@ class InterpolationMixin: ) meta = { - "func": "interpolateIndex", + "func": "align", "args": (field,), "kwargs": { "freq": freq, "method": method, "order": order, - "limit": limit, "extrapolate": extrapolate, **kwargs, }, @@ -395,3 +477,179 @@ class InterpolationMixin: self._flags.history[field] = history return self + + ### Deprecated functions + + @register(mask=["field"], demask=[], squeeze=[]) + def interpolateIndex( + self: "SaQC", + field: str, + freq: str, + method: _SUPPORTED_METHODS, + order: int = 2, + limit: int | None = 2, + extrapolate: Literal["forward", "backward", "both"] = None, + **kwargs, + ) -> "SaQC": + """ + Function to interpolate the data at regular (äquidistant) timestamps (or Grid points). 
+ + Parameters + ---------- + field : str + Name of the column, holding the data-to-be-interpolated. + + freq : str + An Offset String, interpreted as the frequency of + the grid you want to interpolate your data at. + + method : {"linear", "time", "nearest", "zero", "slinear", "quadratic", "cubic", "spline", "barycentric", + "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"}: string + The interpolation method you want to apply. + + order : int, default 2 + If your selected interpolation method can be performed at different 'orders' - here you pass the desired + order. + + limit : int, optional + Upper limit of missing index values (with respect to `freq`) to fill. The limit can either be expressed + as the number of consecutive missing values (integer) or temporal extension of the gaps to be filled + (Offset String). + If `None` is passed, no Limit is set. + + extrapolate : {'forward', 'backward', 'both'}, default None + Use parameter to perform extrapolation instead of interpolation onto the trailing and/or leading chunks of + NaN values in data series. + + * 'None' (default) - perform interpolation + * 'forward'/'backward' - perform forward/backward extrapolation + * 'both' - perform forward and backward extrapolation + + Returns + ------- + saqc.SaQC + """ + + msg = """ + The method `interpolateIndex` is deprecated and will be removed in verion 3.0 of saqc.
+ To achieve the same behavior use: + """ + call = "qc.align(field={field}, freq={freq}, method={method}, order={order}, extrapolate={extrapolate})" + if limit != 2: + call = f"{call}.interpolate(field={field}, method={method}, order={order}, limit={limit}, extrapolate={extrapolate})" + + warnings.warn(f"{msg}`{call}`", DeprecationWarning) + out = self.align( + field=field, + freq=freq, + method=method, + order=order, + extrapolate=extrapolate, + **kwargs, + ) + if limit != 2: + out = out.interpolate( + field=field, + freq=freq, + method=method, + order=order, + limit=limit, + extrapolate=extrapolate, + **kwargs, + ) + return out + + @register( + mask=["field"], + demask=["field"], + squeeze=[], # func handles history by itself + ) + def interpolateInvalid( + self: "SaQC", + field: str, + method: _SUPPORTED_METHODS, + order: int = 2, + limit: int | None = None, + extrapolate: Literal["forward", "backward", "both"] | None = None, + flag: float = UNFLAGGED, + **kwargs, + ) -> "SaQC": + warnings.warn( + f""" + The method `intepolateInvalid` is deprecated and will be removed + with version 3.0 of saqc. To achieve the same behavior, please use + `qc.interpolate( + field={field}, method={method}, order={order}, + limit={limit}, extrapolate={extrapolate}, flag={flag} + )` + """ + ) + + return self.interpolate( + field=field, + method=method, + order=order, + limit=limit, + extrapolate=extrapolate, + flag=flag, + **kwargs, + ) + + +def _shift( + saqc: "SaQC", + field: str, + freq: str, + method: Literal["fshift", "bshift", "nshift"] = "nshift", + **kwargs, +) -> "SaQC": + """ + Shift data points and flags to a regular frequency grid. + + Parameters + ---------- + field : str + The fieldname of the column, holding the data-to-be-shifted. + + freq : str + Offset string. Sampling rate of the target frequency. 
+ + method : {'fshift', 'bshift', 'nshift'}, default 'nshift' + Method to propagate values: + + * 'nshift' : shift grid points to the nearest time stamp in the range = +/- 0.5 * ``freq`` + * 'bshift' : shift grid points to the first succeeding time stamp (if any) + * 'fshift' : shift grid points to the last preceeding time stamp (if any) + + freq_check : {None, 'check', 'auto'}, default None + * ``None`` : do not validate the ``freq`` string. + * 'check' : check ``freq`` against an frequency estimation, produces a warning in case of miss matches. + * 'auto' : estimate frequency, `freq` is ignored. + + Returns + ------- + saqc.SaQC + """ + # TODO + # - Do we need `freq_check`? If so could we move it to `align`? + + datcol = saqc._data[field] + if datcol.empty: + return saqc + + # do the shift + datcol = shift2Freq(datcol, method, freq, fill_value=np.nan) + + # do the shift on the history + kws = dict(method=method, freq=freq) + + history = saqc._flags.history[field].apply( + index=datcol.index, + func_handle_df=True, + func=shift2Freq, + func_kws={**kws, "fill_value": np.nan}, + ) + + saqc._flags.history[field] = history + saqc._data[field] = datcol + return saqc diff --git a/saqc/funcs/resampling.py b/saqc/funcs/resampling.py index ebc02bd4bf0ad1e890050a42138c8c36fcefa253..52b2a5322fd0e9f8ac20d25d969aaea731292471 100644 --- a/saqc/funcs/resampling.py +++ b/saqc/funcs/resampling.py @@ -8,6 +8,7 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING, Callable, Optional, Union import numpy as np @@ -70,57 +71,6 @@ class ResamplingMixin: kwargs = filterKwargs(kwargs, reserved) return self.interpolateIndex(field, freq, "time", **kwargs) - @register(mask=["field"], demask=[], squeeze=[]) - def interpolate( - self: "SaQC", - field: str, - freq: str, - method: _SUPPORTED_METHODS, - order: int = 1, - **kwargs, - ) -> "SaQC": - """ - A method to "regularize" data by interpolating the data at regular timestamp. 
- - A series of data is considered "regular", if it is sampled regularly (= having uniform sampling rate). - - Interpolated values will get assigned the worst flag within freq-range. - - There are available all the interpolations from the pandas.Series.interpolate method and they are called by - the very same keywords. - - Note, that, to perform a timestamp aware, linear interpolation, you have to pass ``'time'`` as `method`, - and NOT ``'linear'``. - - Note, that the data only gets interpolated at those (regular) timestamps, that have a valid (existing and - not-na) datapoint preceeding them and one succeeding them within freq range. - Regular timestamp that do not suffice this condition get nan assigned AND The associated flag will be of value - ``UNFLAGGED``. - - Parameters - ---------- - field : str - The fieldname of the column, holding the data-to-be-regularized. - - freq : str - An offset string. The frequency of the grid you want to interpolate your data at. - - method : {"linear", "time", "nearest", "zero", "slinear", "quadratic", "cubic", "spline", "barycentric", - "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"} - The interpolation method you want to apply. - - order : int, default 1 - If your selected interpolation method can be performed at different *orders* - here you pass the desired - order. 
- - Returns - ------- - saqc.SaQC - """ - reserved = ["limit", "downgrade"] - kwargs = filterKwargs(kwargs, reserved) - return self.interpolateIndex(field, freq, method=method, order=order, **kwargs) - @register(mask=["field"], demask=[], squeeze=[]) def shift( self: "SaQC", @@ -157,28 +107,16 @@ class ResamplingMixin: ------- saqc.SaQC """ - datcol = self._data[field] - if datcol.empty: - return self - - freq = evalFreqStr(freq, freq_check, datcol.index) - - # do the shift - datcol = shift2Freq(datcol, method, freq, fill_value=np.nan) - - # do the shift on the history - kws = dict(method=method, freq=freq) - - history = self._flags.history[field].apply( - index=datcol.index, - func_handle_df=True, - func=shift2Freq, - func_kws={**kws, "fill_value": np.nan}, + warnings.warn( + f""" + The method `shift` is deprecated and will be removed with version 2.6 of saqc. + To achieve the same behavior please use: + `qc.align(field={field}, freq={freq}. method={method})` + """, + DeprecationWarning, ) - - self._flags.history[field] = history - self._data[field] = datcol - return self + freq = evalFreqStr(freq, freq_check, self._data[field].index) + return self.align(field=field, freq=freq, method=method, **kwargs) @register(mask=["field"], demask=[], squeeze=[]) def resample( diff --git a/tests/common.py b/tests/common.py index 819f5d4f38cc96d1d025da5ec5654c0fc46a9258..e9375c73e77a10c4add8616e80f6c37f494939cf 100644 --- a/tests/common.py +++ b/tests/common.py @@ -49,7 +49,7 @@ def writeIO(content): return f -def checkDataFlagsInvariants(data, flags, field, identical=True): +def checkInvariants(data, flags, field, identical=True): """ Check all invariants that must hold at any point for * field diff --git a/tests/funcs/test_proc_functions.py b/tests/funcs/test_proc_functions.py index 1e4ddceba2c6b38ddb3328cba3caa8ae5e607a93..f0f1a62bc05b031e69e3b88649681b8dafda867c 100644 --- a/tests/funcs/test_proc_functions.py +++ b/tests/funcs/test_proc_functions.py @@ -45,24 +45,24 @@ def 
test_rollingInterpolateMissing(course_5): assert qc.data[field][characteristics["missing"]].isna().all() -def test_interpolateMissing(course_5): +def test_interpolate(course_5): data, characteristics = course_5(periods=10, nan_slice=[5]) field = data.columns[0] data = DictOfSeries(data) flags = initFlagsLike(data) qc = SaQC(data, flags) - qc_lin = qc.interpolateInvalid(field, method="linear") - qc_poly = qc.interpolateInvalid(field, method="polynomial") + qc_lin = qc.interpolate(field, method="linear") + qc_poly = qc.interpolate(field, method="polynomial") assert qc_lin.data[field][characteristics["missing"]].notna().all() assert qc_poly.data[field][characteristics["missing"]].notna().all() data, characteristics = course_5(periods=10, nan_slice=[5, 6, 7]) qc = SaQC(data, flags) - qc_lin_1 = qc.interpolateInvalid(field, method="linear", limit=2) - qc_lin_2 = qc.interpolateInvalid(field, method="linear", limit=3) - qc_lin_3 = qc.interpolateInvalid(field, method="linear", limit=4) + qc_lin_1 = qc.interpolate(field, method="linear", limit=2) + qc_lin_2 = qc.interpolate(field, method="linear", limit=3) + qc_lin_3 = qc.interpolate(field, method="linear", limit=4) assert qc_lin_1.data[field][characteristics["missing"]].isna().all() assert qc_lin_2.data[field][characteristics["missing"]].isna().all() @@ -111,9 +111,7 @@ def test_interpolateGrid(course_5, course_3): data_grid, _ = course_3() data["grid"] = data_grid["data"] flags = initFlagsLike(data) - SaQC(data, flags).interpolateIndex( - "data", "1h", "time", grid_field="grid", limit=10 - ) + SaQC(data, flags).align("data", "1h", "time", grid_field="grid", limit=10) @pytest.mark.slow diff --git a/tests/funcs/test_resampling.py b/tests/funcs/test_resampling.py index 2fc0623e23ed7f37bc10ffeaa23534482d949c8d..860ef654c32208849af9bce4eb089319c0958087 100644 --- a/tests/funcs/test_resampling.py +++ b/tests/funcs/test_resampling.py @@ -12,7 +12,7 @@ import pytest from saqc import BAD, UNFLAGGED, SaQC from saqc.core import 
DictOfSeries, initFlagsLike -from tests.common import checkDataFlagsInvariants +from tests.common import checkInvariants @pytest.fixture @@ -33,116 +33,7 @@ def data(): return data -@pytest.mark.parametrize( - "func, kws", - [ - ("linear", dict()), - ("shift", dict(method="nshift")), - ("interpolate", dict(method="spline")), - ("resample", dict(func=np.nansum, method="nagg")), - ], -) -def test_wrapper(data, func, kws): - field = "data" - freq = "15T" - flags = initFlagsLike(data) - - # GL-#352 - # make a History, otherwise nothing important is tested - for c in flags.columns: - flags[:, c] = BAD - - qc = SaQC(data, flags) - - qc = getattr(qc, func)(field, freq, **kws) - - # check minimal requirements - checkDataFlagsInvariants(qc._data, qc._flags, field) - assert qc.data[field].index.inferred_freq == freq - - -_SUPPORTED_METHODS = [ - "linear", - "time", - "nearest", - "zero", - "slinear", - "quadratic", - "cubic", - "spline", - "barycentric", - "polynomial", - "krogh", - "piecewise_polynomial", - "spline", - "pchip", - "akima", -] - - -@pytest.mark.parametrize("method", _SUPPORTED_METHODS) -@pytest.mark.parametrize("fill_history", ["some", "all", "none"]) -def test_gridInterpolation(data, method, fill_history): - freq = "15T" - field = "data" - data = data[field] - data = pd.concat([data * np.sin(data), data.shift(1, "2h")]).shift(1, "3s") - data = DictOfSeries(data=data) - flags = initFlagsLike(data) - - if fill_history == "none": - pass - - if fill_history == "all": - for c in flags.columns: - flags[:, c] = BAD - - if fill_history == "some": - for c in flags.columns: - flags[::2, c] = UNFLAGGED - - qc = SaQC(data, flags) - - # we are just testing if the interpolation gets passed to the series without - # causing an error: - res = qc.interpolate( - field, - freq, - method=method, - downcast_interpolation=True, - ) - - if method == "polynomial": - res = qc.interpolate( - field, - freq, - order=2, - method=method, - downcast_interpolation=True, - ) - res = 
qc.interpolate( - field, - freq, - order=9, - method=method, - downcast_interpolation=True, - ) - - # check minimal requirements - checkDataFlagsInvariants(res._data, res._flags, field, identical=False) - assert res.data[field].index.inferred_freq == freq - - -@pytest.mark.parametrize( - "func, kws", - [ - ("linear", dict()), - ("shift", dict(method="nshift")), - ("interpolate", dict(method="spline")), - ("aggregate", dict(value_func=np.nansum, method="nagg")), - ], -) -def test_flagsSurviveReshaping(func, kws): +def test_flagsSurviveReshaping(): """ flagging -> reshaping -> test (flags also was reshaped correctly) """ @@ -163,76 +54,11 @@ def test_flagsSurviveBackprojection(): @pytest.mark.parametrize( - "reshaper", ["nshift", "fshift", "bshift", "nagg", "bagg", "fagg", "interpolation"] -) -def test_harmSingleVarIntermediateFlagging(data, reshaper): - flags = initFlagsLike(data) - field = "data" - freq = "15T" - - pre_data = data.copy() - pre_flags = flags.copy() - qc = SaQC(data, flags) - - qc = qc.copyField(field, field + "_interpolated") - qc = qc.linear(field + "_interpolated", freq=freq) - checkDataFlagsInvariants( - qc._data, qc._flags, field + "_interpolated", identical=True - ) - assert qc._data[field + "_interpolated"].index.inferred_freq == freq - - # flag something bad - qc._flags[ - qc._data[field + "_interpolated"].index[3:4], field + "_interpolated" - ] = BAD - qc = qc.concatFlags( - field + "_interpolated", method="inverse_" + reshaper, target=field - ) - qc = qc.dropField(field + "_interpolated") - - assert len(qc.data[field]) == len(qc.flags[field]) - assert qc.data[field].equals(pre_data[field]) - assert qc.flags[field].index.equals(pre_flags[field].index) - - if "agg" in reshaper: - if reshaper == "nagg": - start, end = 3, 7 - elif reshaper == "fagg": - start, end = 3, 5 - elif reshaper == "bagg": - start, end = 5, 7 - else: - raise NotImplementedError("untested test case") - - assert all(qc._flags[field].iloc[start:end] > UNFLAGGED) - assert 
all(qc._flags[field].iloc[:start] == UNFLAGGED) - assert all(qc._flags[field].iloc[end:] == UNFLAGGED) - - elif "shift" in reshaper: - if reshaper == "nshift": - exp = [False, False, False, False, True, False, False, False, False] - elif reshaper == "fshift": - exp = [False, False, False, False, True, False, False, False, False] - elif reshaper == "bshift": - exp = [False, False, False, False, False, True, False, False, False] - else: - raise NotImplementedError("untested test case") - - flagged = qc._flags[field] > UNFLAGGED - assert all(flagged == exp) - - elif reshaper == "interpolation": - pytest.skip("no testcase for interpolation") - - else: - raise NotImplementedError("untested test case") - - -@pytest.mark.parametrize( - "params, expected", + "method, freq, expected", [ ( - ("nagg", "15Min"), + "nagg", + "15Min", pd.Series( data=[-87.5, -25.0, 0.0, 37.5, 50.0], index=pd.date_range( @@ -241,7 +67,8 @@ def test_harmSingleVarIntermediateFlagging(data, reshaper): ), ), ( - ("nagg", "30Min"), + "nagg", + "30Min", pd.Series( data=[-87.5, -25.0, 87.5], index=pd.date_range( @@ -250,7 +77,8 @@ def test_harmSingleVarIntermediateFlagging(data, reshaper): ), ), ( - ("bagg", "15Min"), + "bagg", + "15Min", pd.Series( data=[-50.0, -37.5, -37.5, 12.5, 37.5, 50.0], index=pd.date_range( @@ -259,7 +87,8 @@ def test_harmSingleVarIntermediateFlagging(data, reshaper): ), ), ( - ("bagg", "30Min"), + "bagg", + "30Min", pd.Series( data=[-50.0, -75.0, 50.0, 50.0], index=pd.date_range( @@ -269,36 +98,93 @@ def test_harmSingleVarIntermediateFlagging(data, reshaper): ), ], ) -def test_harmSingleVarInterpolationAgg(data, params, expected): +def test_resampleAggregateInvert(data, method, freq, expected): flags = initFlagsLike(data) field = "data" - h_field = "data_harm" + field_aggregated = "data_aggregated" pre_data = data.copy() pre_flaggger = flags.copy() - method, freq = params qc = SaQC(data, flags) - qc = qc.copyField("data", "data_harm") - qc = qc.resample(h_field, freq, 
func=np.sum, method=method) + qc = qc.copyField(field, field_aggregated) - checkDataFlagsInvariants(qc._data, qc._flags, h_field, identical=True) - assert qc._data[h_field].index.freq == pd.Timedelta(freq) - assert qc._data[h_field].equals(expected) + qc = qc.resample(field_aggregated, freq, func=np.sum, method=method) + assert qc._data[field_aggregated].index.freq == pd.Timedelta(freq) + assert qc._data[field_aggregated].equals(expected) + checkInvariants(qc._data, qc._flags, field_aggregated, identical=True) - qc = qc.concatFlags(h_field, target=field, method="inverse_" + method) - qc = qc.dropField(h_field) - checkDataFlagsInvariants(qc._data, qc._flags, field, identical=True) + qc = qc.concatFlags(field_aggregated, target=field, method="inverse_" + method) assert qc.data[field].equals(pre_data[field]) assert qc.flags[field].equals(pre_flaggger[field]) + checkInvariants(qc._data, qc._flags, field, identical=True) @pytest.mark.parametrize( - "params, expected", + "method, freq, expected", [ ( - ("bshift", "15Min"), + "linear", + "15Min", + pd.Series( + data=[np.nan, -37.5, -25, 6.25, 37.50, 50], + index=pd.date_range( + "2010-12-31 23:45:00", "2011-01-01 01:00:00", freq="15Min" + ), + ), + ), + ( + "time", + "30Min", + pd.Series( + data=[np.nan, -37.5, 6.25, 50.0], + index=pd.date_range( + "2010-12-31 23:30:00", "2011-01-01 01:00:00", freq="30Min" + ), + ), + ), + ( + "pad", + "30Min", + pd.Series( + data=[np.nan, -37.5, 0, 50.0], + index=pd.date_range( + "2010-12-31 23:30:00", "2011-01-01 01:00:00", freq="30Min" + ), + ), + ), + ], +) +def test_alignInterpolateInvert(data, method, freq, expected): + flags = initFlagsLike(data) + + field = "data" + field_aligned = "data_aligned" + + pre_data = data.copy() + pre_flags = flags.copy() + + qc = SaQC(data, flags) + + qc = qc.copyField(field, field_aligned) + qc = qc.align(field_aligned, freq, method=method) + + assert qc.data[field_aligned].equals(expected) + checkInvariants(qc._data, qc._flags, field, identical=True) 
+ + qc = qc.concatFlags(field_aligned, target=field, method="inverse_interpolation") + assert qc.data[field].equals(pre_data[field]) + assert qc.flags[field].equals(pre_flags[field]) + checkInvariants(qc._data, qc._flags, field, identical=True) + + +@pytest.mark.parametrize( + "method, freq, expected", + [ + ( + "bshift", + "15Min", pd.Series( data=[-50.0, -37.5, -25.0, 12.5, 37.5, 50.0], index=pd.date_range( @@ -307,7 +193,8 @@ def test_harmSingleVarInterpolationAgg(data, params, expected): ), ), ( - ("fshift", "15Min"), + "fshift", + "15Min", pd.Series( data=[np.nan, -37.5, -25.0, 0.0, 37.5, 50.0], index=pd.date_range( @@ -316,7 +203,8 @@ def test_harmSingleVarInterpolationAgg(data, params, expected): ), ), ( - ("nshift", "15min"), + "nshift", + "15min", pd.Series( data=[np.nan, -37.5, -25.0, 12.5, 37.5, 50.0], index=pd.date_range( @@ -325,7 +213,8 @@ def test_harmSingleVarInterpolationAgg(data, params, expected): ), ), ( - ("bshift", "30Min"), + "bshift", + "30Min", pd.Series( data=[-50.0, -37.5, 12.5, 50.0], index=pd.date_range( @@ -334,7 +223,8 @@ def test_harmSingleVarInterpolationAgg(data, params, expected): ), ), ( - ("fshift", "30Min"), + "fshift", + "30Min", pd.Series( data=[np.nan, -37.5, 0.0, 50.0], index=pd.date_range( @@ -343,7 +233,8 @@ def test_harmSingleVarInterpolationAgg(data, params, expected): ), ), ( - ("nshift", "30min"), + "nshift", + "30min", pd.Series( data=[np.nan, -37.5, 12.5, 50.0], index=pd.date_range( @@ -353,76 +244,56 @@ def test_harmSingleVarInterpolationAgg(data, params, expected): ), ], ) -def test_harmSingleVarInterpolationShift(data, params, expected): +def test_alignShiftInvert(data, method, freq, expected): flags = initFlagsLike(data) + field = "data" - h_field = "data_harm" + field_aligned = "data_aligned" + pre_data = data.copy() pre_flags = flags.copy() - method, freq = params qc = SaQC(data, flags) - qc = qc.copyField("data", "data_harm") - qc = qc.shift(h_field, freq, method=method) - assert 
qc.data[h_field].equals(expected) - checkDataFlagsInvariants(qc._data, qc._flags, field, identical=True) + qc = qc.copyField(field, field_aligned) + qc = qc.align(field_aligned, freq, method=method) - qc = qc.concatFlags(h_field, target=field, method="inverse_" + method) - checkDataFlagsInvariants(qc._data, qc._flags, field, identical=True) + assert qc.data[field_aligned].equals(expected) + checkInvariants(qc._data, qc._flags, field, identical=True) - qc = qc.dropField(h_field) + qc = qc.concatFlags(field_aligned, target=field, method="inverse_" + method) assert qc.data[field].equals(pre_data[field]) assert qc.flags[field].equals(pre_flags[field]) + checkInvariants(qc._data, qc._flags, field, identical=True) -def test_concatFlags(): - index = pd.to_datetime( - [ - "2020-01-01 00:00", - "2020-01-01 00:10", - "2020-01-01 00:30", - "2020-01-01 00:40", - "2020-01-01 01:00", - ] - ) - - df = pd.DataFrame( - data={ - "a": [ - 1, - 2, - 5, - 4, - 3, - ] - }, - index=index, - ) - - qc = SaQC(df) +@pytest.mark.parametrize( + "overwrite, expected_col0, expected_col1", + [ + ( + True, + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 255, 255], + [np.nan, np.nan, np.nan, np.nan, np.nan, 255, np.nan, 255, 255], + ), + ( + False, + [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 255, 255], + [np.nan, np.nan, np.nan, np.nan, np.nan, 255, np.nan, np.nan, np.nan], + ), + ], +) +def test_concatFlags(data, overwrite, expected_col0, expected_col1): + qc = SaQC(data) - qc = qc.flagRange(field="a", max=4) + qc = qc.flagRange(field="data", max=20) # branch out to another variable - qc = qc.flagRange(field="a", target="b", max=3) - - # bring the flags back again - qc_overwrite = qc.concatFlags("b", target="a", overwrite=True, squeeze=True) - hist_overwrite = qc_overwrite._flags.history["a"].hist.astype(float) - assert hist_overwrite[0].equals( - pd.Series([np.nan, np.nan, 255.0, np.nan, np.nan], index=index) - ) - assert hist_overwrite[1].equals( - pd.Series([np.nan, 
np.nan, 255.0, 255.0, np.nan], index=index) - ) + qc = qc.flagRange(field="data", target="data_", max=3) - # bring the flags back again - qc_respect = qc.concatFlags("b", target="a", overwrite=False, squeeze=True) - hist_respect = qc_respect._flags.history["a"].hist.astype(float) - assert hist_respect[0].equals( - pd.Series([np.nan, np.nan, 255.0, np.nan, np.nan], index=index) - ) - assert hist_respect[1].equals( - pd.Series([np.nan, np.nan, np.nan, 255.0, np.nan], index=index) + # bring the flags back again - with or without overwriting, depending on `overwrite` + qc_concat = qc.concatFlags( + "data_", target="data", overwrite=overwrite, squeeze=True ) + hist_concat = qc_concat._flags.history["data"].hist.astype(float) + assert hist_concat[0].equals(pd.Series(expected_col0, index=data["data"].index)) + assert hist_concat[1].equals(pd.Series(expected_col1, index=data["data"].index))