From 7670fcb9292dec8357c3175685621b892a0c6392 Mon Sep 17 00:00:00 2001 From: luenensc <peter.luenenschloss@ufz.de> Date: Wed, 11 Jan 2023 17:58:03 +0100 Subject: [PATCH] streamlined/documented interpolate NaN funcion --- saqc/lib/ts_operators.py | 108 ++++++++++++++++++--------------- tests/lib/test_ts_operators.py | 73 +++++++++++++++++++++- 2 files changed, 132 insertions(+), 49 deletions(-) diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py index 84f8f568d..5aa69e347 100644 --- a/saqc/lib/ts_operators.py +++ b/saqc/lib/ts_operators.py @@ -276,6 +276,30 @@ def meanQC(data, max_nan_total=np.inf, max_nan_consec=np.inf): ) +def _interpolWrapper(x, order=2, method='time', downgrade_interpolation=False): + """ + Function that automatically modifies the interpolation level or returns uninterpolated + input data if the data configuration breaks the interpolation method at the selected degree. + """ + if order < 0: + return x + elif x.count() > order: + try: + return x.interpolate(method=method, order=int(order)) + except (NotImplementedError, ValueError): + warnings.warn( + f"Interpolation with method {method} is not supported at order " + f"{order}. and will be performed at order {order - 1}" + ) + return _interpolWrapper(x, int(order - 1), method) + elif x.size < 3: + return x + else: + if downgrade_interpolation: + return _interpolWrapper(x, int(x.count() - 1), method) + else: + return x + def interpolateNANs( data, method, order=2, inter_limit=2, downgrade_interpolation=False ): @@ -301,66 +325,54 @@ def interpolateNANs( :return: """ - inter_limit = int(inter_limit or len(data) + 1) data = pd.Series(data, copy=True) - gap_mask = data.isna().rolling(inter_limit, min_periods=0).sum() != inter_limit - if inter_limit == 2: - gap_mask = gap_mask & gap_mask.shift(-1, fill_value=True) + if inter_limit is None: + # if there is actually no limit set to the gaps to-be interpolated, generate a dummy mask for the gaps + gap_mask = pd.Series(True, index=data.index, name=data.name) else: - gap_mask = ( - gap_mask.replace(True, np.nan) - .fillna(method="bfill", limit=inter_limit - 1) - .replace(np.nan, True) - .astype(bool) - ) - + # if there is a limit to the gaps to be interpolated, generate a mask that evaluates to False at the right side + # of each too-large gap with a rolling.sum combo + gap_mask = data.isna().rolling(inter_limit, min_periods=0).sum() != inter_limit + if inter_limit < 20: + # for the common case of inter_limit=2 (default "harmonisation"), we efficiently bag propagate the False + # value to fill the whole too-large gap by a shift and a conjunction. + gap_mask &= gap_mask & gap_mask.shift(-1, fill_value=True) + else: + # If the gap_size is bigger we use pandas backfill-interpolation to propagate the False values back. + # Therefor we replace the True values with np.nan so hat they are interpreted as missing periods. + gap_mask = ( + gap_mask.replace(True, np.nan) + .fillna(method="bfill", limit=inter_limit - 1) + .replace(np.nan, True) + .astype(bool) + ) + + # memorizing the index for later reindexing pre_index = data.index - - if data[gap_mask].empty: + # drop the gaps that are too large with regard to the inter_limit from the data-to-be interpolated + data = data[gap_mask] + if data.empty: return data - else: - data = data[gap_mask] - if method in ["linear", "time"]: - + # in the case of linear interpolation, not much can go wrong/break so this conditional branch has efficient + # finish by just calling pandas interpolation routine to fill the gaps remaining in the data: data.interpolate( - method=method, inplace=True, limit=inter_limit - 1, limit_area="inside" + method=method, inplace=True, limit_area="inside" ) else: - dat_name = data.name - gap_mask = (~gap_mask).cumsum() - data = pd.merge(gap_mask, data, how="inner", left_index=True, right_index=True) - - def _interpolWrapper(x, wrap_order=order, wrap_method=method): - if wrap_order < 0: - return x - elif x.count() > wrap_order: - try: - return x.interpolate(method=wrap_method, order=int(wrap_order)) - except (NotImplementedError, ValueError): - warnings.warn( - f"Interpolation with method {method} is not supported at order " - f"{wrap_order}. and will be performed at order {wrap_order - 1}" - ) - return _interpolWrapper(x, int(wrap_order - 1), wrap_method) - elif x.size < 3: - return x - else: - if downgrade_interpolation: - return _interpolWrapper(x, int(x.count() - 1), wrap_method) - else: - return x - - data = data.groupby(data.columns[0]).transform(_interpolWrapper) - # squeezing the 1-dimensional frame resulting from groupby for consistency - # reasons - data = data.squeeze(axis=1) - data.name = dat_name + # if the method that is inerpolated with depends on not only the left and right border points of any gap, + # but includes more points, it has to be applied on any data chunk seperated by the too-big gaps individually. + # So we use the gap_mask to group the data into chunks and perform the interpolation on every chunk seperatly + # with the .transform method of the grouper. + gap_mask = (~gap_mask).cumsum()[data.index] + data = data.groupby(by=gap_mask).transform(_interpolWrapper, **{'order':order, + 'method':method, + 'downgrade_inerpolation':downgrade_interpolation}) + # finally reinsert the dropped data gaps data = data.reindex(pre_index) - return data diff --git a/tests/lib/test_ts_operators.py b/tests/lib/test_ts_operators.py index f5f8b3ef2..51e6ded81 100644 --- a/tests/lib/test_ts_operators.py +++ b/tests/lib/test_ts_operators.py @@ -1,13 +1,15 @@ # SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ # # SPDX-License-Identifier: GPL-3.0-or-later +from __future__ import annotations + import numpy as np import pandas as pd import pytest -from numpy.testing import assert_array_equal, assert_equal from pandas.testing import assert_series_equal import saqc.lib.ts_operators as tsops +from saqc.lib.ts_operators import interpolateNANs def test_butterFilter(): @@ -193,3 +195,72 @@ def test_rateOfChange(data, expected): result = rateOfChange(data) assert_series_equal(result, expected, check_names=False) + + +@pytest.mark.parametrize( + "limit,area,direction,data,expected", + [ + ( + 1, + "inside", + None, + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + ), + ( + 2, + "inside", + None, + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + ), + ( + 3, + "inside", + None, + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + [np.nan, 0, 1, 2, 3, 4, np.nan], + ), + ( + None, + "inside", + None, + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + [np.nan, 0, 1, 2, 3, 4, np.nan], + ), + ( + None, + "outside", + "forward", + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + [np.nan, 0, np.nan, np.nan, np.nan, 4, 4], + ), + ( + None, + "outside", + "backward", + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + [0, 0, np.nan, np.nan, np.nan, 4, np.nan], + ), + ( + None, + "outside", + "both", + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + [0, 0, np.nan, np.nan, np.nan, 4, 4], + ), + ( + None, + None, + "both", + [np.nan, 0, np.nan, np.nan, np.nan, 4, np.nan], + [0, 0, 1, 2, 3, 4, 4], + ), + ], +) +def test_interpolatNANs(limit, area, direction, data, expected): + + got = interpolateNANs( + pd.Series(data), inter_limit=limit + ) + assert got.equals(pd.Series(expected, dtype=float)) -- GitLab