diff --git a/saqc/funcs/curvefit.py b/saqc/funcs/curvefit.py index 1633dba68e234db69e3cf094a1c4fae6c54b1d90..cef464e858dbccd9f4860ee5ad50c92776eff95e 100644 --- a/saqc/funcs/curvefit.py +++ b/saqc/funcs/curvefit.py @@ -12,10 +12,13 @@ import pandas as pd from dios import DictOfSeries from saqc.core.register import register + +from saqc.lib.tools import getFreqDelta from saqc.flagger.baseflagger import BaseFlagger from saqc.lib.ts_operators import polyRollerIrregular, polyRollerNumba, polyRoller, polyRollerNoMissingNumba, polyRollerNoMissing + @register(masking='field') def fitPolynomial(data: DictOfSeries, field: str, flagger: BaseFlagger, winsz: Union[int, str], @@ -105,9 +108,8 @@ def fitPolynomial(data: DictOfSeries, field: str, flagger: BaseFlagger, data = data.copy() to_fit = data[field] flags = flagger.getFlags(field) - i = to_fit.index - # checking if index is regular here (index.freqstr property is not reliable) - if not pd.date_range(i[0], i[-1], len(i)).equals(i): + regular = getFreqDelta(to_fit.index) + if not regular: if isinstance(winsz, int): raise NotImplementedError("Integer based window size is not supported for not-harmonized" "sample series.") # get interval centers @@ -130,7 +132,7 @@ def fitPolynomial(data: DictOfSeries, field: str, flagger: BaseFlagger, residues[residues.index[centers_iloc[-1]] : residues.index[-1]] = np.nan else: if isinstance(winsz, str): - winsz = pd.Timedelta(winsz) // pd.Timedelta(to_fit.index.freqstr) + winsz = pd.Timedelta(winsz) // regular if winsz % 2 == 0: winsz = int(winsz - 1) if min_periods is None: diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index 0f2ff3181260e88775921cecac133a30b4c29558..df617514a8aae402505274bc7ecc85e1b2dc06fd 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -11,7 +11,8 @@ import numpy.polynomial.polynomial as poly import numba from saqc.lib.tools import ( customRoller, - findIndex + findIndex, + getFreqDelta ) from saqc.funcs.scores import assignKNNScore 
from outliers import smirnov_grubbs @@ -845,14 +846,20 @@ def flagByGrubbs(data, field, flagger, winsz, alpha=0.05, min_periods=8, check_l data = data.copy() datcol = data[field] + rate = getFreqDelta(datcol.index) + + # if the analyzed timeseries is regularly sampled, a string window size can be converted to a number of periods: + if rate and isinstance(winsz, str): + winsz = pd.Timedelta(winsz) // rate + to_group = pd.DataFrame(data={"ts": datcol.index, "data": datcol}) to_flag = pd.Series(False, index=datcol.index) if isinstance(winsz, int): # period number defined test intervals grouper_series = pd.Series(data=np.arange(0, datcol.shape[0]), index=datcol.index) grouper_series_lagged = grouper_series + (winsz / 2) - grouper_series = grouper_series.transform(lambda x: int(np.floor(x / winsz))) - grouper_series_lagged = grouper_series_lagged.transform(lambda x: int(np.floor(x / winsz))) + grouper_series = grouper_series.transform(lambda x: x // winsz) + grouper_series_lagged = grouper_series_lagged.transform(lambda x: x // winsz) partitions = to_group.groupby(grouper_series) partitions_lagged = to_group.groupby(grouper_series_lagged) else: diff --git a/saqc/funcs/rolling.py b/saqc/funcs/rolling.py index 47966bd33141876c6c677f4ec346a6d5e07241ca..e0986ddc71c8365d9ab42bdfc630aac35d3ff144 100644 --- a/saqc/funcs/rolling.py +++ b/saqc/funcs/rolling.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd from saqc.core.register import register +from saqc.lib.tools import getFreqDelta @register(masking='field') @@ -59,8 +60,9 @@ def roll(data, field, flagger, winsz, func=np.mean, eval_flags=True, min_periods if to_fit.empty: return data, flagger + regular = getFreqDelta(to_fit.index) # starting with the annoying case: finding the rolling interval centers of not-harmonized input time series: - if (to_fit.index.freqstr is None) and center: + if center and not regular: if isinstance(winsz, int): raise NotImplementedError( "Integer based window size is not supported for not-harmonized" 
@@ -90,7 +92,7 @@ def roll(data, field, flagger, winsz, func=np.mean, eval_flags=True, min_periods # everything is more easy if data[field] is harmonized: else: if isinstance(winsz, str): - winsz = int(np.floor(pd.Timedelta(winsz) / pd.Timedelta(to_fit.index.freqstr))) + winsz = pd.Timedelta(winsz) // regular if (winsz % 2 == 0) & center: winsz = int(winsz - 1) @@ -101,9 +103,9 @@ def roll(data, field, flagger, winsz, func=np.mean, eval_flags=True, min_periods means = to_fit.rolling(window=winsz, center=center, closed="both").apply(func) if _return_residues: - residues = means - to_fit + means = means - to_fit - data[field] = residues + data[field] = means if eval_flags: num_cats, codes = flags.factorize() num_cats = pd.Series(num_cats, index=flags.index).rolling(winsz, center=True, min_periods=min_periods).max() diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index 2ee355c7b0bccefe39c730f1235f591fb7e6880d..769a322dbf52d0e7c65cdd8e3b62c24262c9c5f4 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -568,3 +568,20 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single' return [i for i, x in enumerate(cluster) if x != norm_cluster] +def getFreqDelta(index): + """ + Check whether the passed index is regularly sampled. + + If the index carries a ``freq`` attribute that is not ``None``, that attribute is returned unchanged; otherwise a non-empty index is compared to an evenly spaced ``pd.date_range`` over the same start, end and length and, if both are equal, the spacing between the first two timestamps is returned. + + In every other case ``None`` is returned (this includes an empty index without ``freq``). + + (``None`` will also be returned for pd.RangeIndex type.) + NOTE(review): ``i[1]`` presumably requires at least two timestamps - confirm the behaviour for single-element indexes. + """ + delta = getattr(index, 'freq', None) + if delta is None and not index.empty: + i = pd.date_range(index[0], index[-1], len(index)) + if i.equals(index): + return i[1] - i[0] + return delta