Skip to content
Snippets Groups Projects
Commit f2d98441 authored by Peter Lünenschloß's avatar Peter Lünenschloß
Browse files

check if timeseries is regularly sampled

parent 477d015e
No related branches found
No related tags found
No related merge requests found
......@@ -12,10 +12,13 @@ import pandas as pd
from dios import DictOfSeries
from saqc.core.register import register
from saqc.lib.tools import getFreqDelta
from saqc.flagger.baseflagger import BaseFlagger
from saqc.lib.ts_operators import polyRollerIrregular, polyRollerNumba, polyRoller, polyRollerNoMissingNumba, polyRollerNoMissing
@register(masking='field')
def fitPolynomial(data: DictOfSeries, field: str, flagger: BaseFlagger,
winsz: Union[int, str],
......@@ -105,9 +108,8 @@ def fitPolynomial(data: DictOfSeries, field: str, flagger: BaseFlagger,
data = data.copy()
to_fit = data[field]
flags = flagger.getFlags(field)
i = to_fit.index
# checking if index is regular here (index.freqstr property is not reliable)
if not pd.date_range(i[0], i[-1], len(i)).equals(i):
regular = getFreqDelta(to_fit.index)
if not regular:
if isinstance(winsz, int):
raise NotImplementedError("Integer based window size is not supported for not-harmonized" "sample series.")
# get interval centers
......@@ -130,7 +132,7 @@ def fitPolynomial(data: DictOfSeries, field: str, flagger: BaseFlagger,
residues[residues.index[centers_iloc[-1]] : residues.index[-1]] = np.nan
else:
if isinstance(winsz, str):
winsz = pd.Timedelta(winsz) // pd.Timedelta(to_fit.index.freqstr)
winsz = pd.Timedelta(winsz) // regular
if winsz % 2 == 0:
winsz = int(winsz - 1)
if min_periods is None:
......
......@@ -11,7 +11,8 @@ import numpy.polynomial.polynomial as poly
import numba
from saqc.lib.tools import (
customRoller,
findIndex
findIndex,
getFreqDelta
)
from saqc.funcs.scores import assignKNNScore
from outliers import smirnov_grubbs
......@@ -845,14 +846,20 @@ def flagByGrubbs(data, field, flagger, winsz, alpha=0.05, min_periods=8, check_l
data = data.copy()
datcol = data[field]
rate = getFreqDelta(datcol.index)
# if the analyzed timeseries is regular, the window size can be transformed to a number of periods:
if rate and isinstance(winsz, str):
winsz = pd.Timedelta(winsz) // rate
to_group = pd.DataFrame(data={"ts": datcol.index, "data": datcol})
to_flag = pd.Series(False, index=datcol.index)
if isinstance(winsz, int):
# period number defined test intervals
grouper_series = pd.Series(data=np.arange(0, datcol.shape[0]), index=datcol.index)
grouper_series_lagged = grouper_series + (winsz / 2)
grouper_series = grouper_series.transform(lambda x: int(np.floor(x / winsz)))
grouper_series_lagged = grouper_series_lagged.transform(lambda x: int(np.floor(x / winsz)))
grouper_series = grouper_series.transform(lambda x: x // winsz)
grouper_series_lagged = grouper_series_lagged.transform(lambda x: x // winsz)
partitions = to_group.groupby(grouper_series)
partitions_lagged = to_group.groupby(grouper_series_lagged)
else:
......
......@@ -4,6 +4,7 @@ import numpy as np
import pandas as pd
from saqc.core.register import register
from saqc.lib.tools import getFreqDelta
@register(masking='field')
......@@ -59,8 +60,9 @@ def roll(data, field, flagger, winsz, func=np.mean, eval_flags=True, min_periods
if to_fit.empty:
return data, flagger
regular = getFreqDelta(to_fit.index)
# starting with the annoying case: finding the rolling interval centers of not-harmonized input time series:
if (to_fit.index.freqstr is None) and center:
if center and not regular:
if isinstance(winsz, int):
raise NotImplementedError(
"Integer based window size is not supported for not-harmonized"
......@@ -90,7 +92,7 @@ def roll(data, field, flagger, winsz, func=np.mean, eval_flags=True, min_periods
# everything is easier if data[field] is harmonized:
else:
if isinstance(winsz, str):
winsz = int(np.floor(pd.Timedelta(winsz) / pd.Timedelta(to_fit.index.freqstr)))
winsz = pd.Timedelta(winsz) // regular
if (winsz % 2 == 0) & center:
winsz = int(winsz - 1)
......@@ -101,9 +103,9 @@ def roll(data, field, flagger, winsz, func=np.mean, eval_flags=True, min_periods
means = to_fit.rolling(window=winsz, center=center, closed="both").apply(func)
if _return_residues:
residues = means - to_fit
means = means - to_fit
data[field] = residues
data[field] = means
if eval_flags:
num_cats, codes = flags.factorize()
num_cats = pd.Series(num_cats, index=flags.index).rolling(winsz, center=True, min_periods=min_periods).max()
......
......@@ -568,3 +568,20 @@ def detectDeviants(data, metric, norm_spread, norm_frac, linkage_method='single'
return [i for i, x in enumerate(cluster) if x != norm_cluster]
def getFreqDelta(index):
    """
    Check whether the passed index is regularly sampled.

    If the index exposes a non-``None`` ``freq`` attribute, that value is
    returned unchanged. Otherwise the index is compared against an evenly
    spaced ``pd.date_range`` that spans its first and last entries with the
    same number of points; if both are equal, the spacing between two
    consecutive points is returned.

    Parameters
    ----------
    index :
        The index to check.

    Returns
    -------
    The ``freq`` attribute of the index if it is set, the spacing between
    consecutive points if the index turns out to be evenly spaced, and
    ``None`` otherwise. ``None`` is also returned for empty indices and for
    indices holding a single value, since no spacing can be derived from
    fewer than two points.
    (``None`` will also be returned for pd.RangeIndex type.)
    """
    delta = getattr(index, 'freq', None)
    # At least two points are needed to derive a spacing; a one-element index
    # would trivially equal its reference range and then fail on ``[1]``.
    if delta is None and len(index) > 1:
        reference = pd.date_range(index[0], index[-1], periods=len(index))
        if reference.equals(index):
            return reference[1] - reference[0]
    return delta
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment