added new modelling func: rolling mean.

af25b889 · Peter Lünenschloß · 42f8cdf5 · af25b889
Commit af25b889 authored 4 years ago by Peter Lünenschloß
--- a/saqc/funcs/data_modelling.py
+++ b/saqc/funcs/data_modelling.py
@@ -21,7 +21,6 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f

    Note, that calculating the residues tends to be quite cost intensive - because a function fitting is perfomed for every
    sample. To improve performance, consider the following possibillities:
-data, flagger = modelling_polyFit(SEEFOdata, 'efield', flagger, '1h', 2, eval_flags=False)

    In case your data is sampled at an equidistant frequency grid:

@@ -88,7 +87,7 @@ data, flagger = modelling_polyFit(SEEFOdata, 'efield', flagger, '1h', 2, eval_fl
            pos = x.index[int(len(x) - y[x.index[-1]])]
            return y.index.get_loc(pos)

-        centers_iloc = centers.rolling('1h', closed='both').apply(center_func, raw=False).astype(int)
+        centers_iloc = centers.rolling(winsz, closed='both').apply(center_func, raw=False).astype(int)
        temp = residues.copy()
        for k in centers_iloc.iteritems():
            residues.iloc[k[1]] = temp[k[0]]
@@ -157,3 +156,83 @@ data, flagger = modelling_polyFit(SEEFOdata, 'efield', flagger, '1h', 2, eval_fl
        flagger = flagger.setFlags(field, flags=to_flag, **kwargs)

    return data, flagger
+
+
+@register
+def modelling_rollingMean(data, field, flagger, winsz, eval_flags=True, min_periods=0, center=True, **kwargs):
+    """
+    Models the timeseries passed with the rolling mean and replaces it with the residues (mean - value).
+
+    Parameters
+    ----------
+    data, field, flagger
+        The data container, the name of the column to model and the flagger holding that column's flags.
+    winsz : integer or offset String
+        The size of the window you want to roll with. If an integer is passed, the size
+        refers to the number of periods for every fitting window. If an offset string is passed,
+        the size refers to the total temporal extension.
+        For regularly sampled timeseries, the period number is cast down to an odd number if center = True.
+        An integer size raises NotImplementedError for not-harmonized timeseries if center = True.
+    eval_flags : boolean, default True
+        Whether or not to assign new flags to the calculated residuals. If True, a residual gets assigned the worst
+        flag present in the interval, the data for its calculation was obtained from.
+        Currently not implemented in combination with not-harmonized timeseries.
+    min_periods : integer, default 0
+        The minimum number of periods that have to be available in every value's fitting surrounding for the mean
+        fitting to be performed. If there are not enough values, np.nan gets assigned. Default (0) results in fitting
+        regardless of the number of values present.
+    center : boolean, default True
+        Whether or not to center the window the mean is calculated of around the reference value. If False,
+        the reference value is placed to the right of the window (classic rolling mean with lag.)
+    kwargs
+        Passed on to ``flagger.setFlags`` if eval_flags is True.
+
+    Returns
+    -------
+    data, flagger : a copy of data with data[field] replaced by the residues, and the (possibly updated) flagger.
+    """
+    data = data.copy()
+    to_fit = data[field]
+    flags = flagger.getFlags(field)
+    # starting with the annoying case: finding the rolling interval centers of not-harmonized input time series:
+    if (to_fit.index.freqstr is None) and center:
+        if isinstance(winsz, int):
+            raise NotImplementedError('Integer based window size is not supported for not-harmonized '
+                                      'sample series when rolling with "center=True".')
+        # get interval centers: number of values in the trailing half window ending at every timestamp
+        centers = np.floor((to_fit.rolling(pd.Timedelta(winsz) / 2, closed='both', min_periods=min_periods).count()))
+        centers = centers.drop(centers[centers.isna()].index).astype(int)
+        means = to_fit.rolling(pd.Timedelta(winsz), closed='both', min_periods=min_periods).mean()
+
+        def center_func(x, y=centers):
+            pos = x.index[int(len(x) - y[x.index[-1]])]
+            return y.index.get_loc(pos)
+
+        centers_iloc = centers.rolling(winsz, closed='both').apply(center_func, raw=False).astype(int)
+        temp = means.copy()
+        for stamp, pos in centers_iloc.items():  # items(): iteritems() is gone in pandas >= 2.0
+            means.iloc[pos] = temp[stamp]
+        # last values are false, due to structural reasons:
+        means[means.index[centers_iloc.iloc[-1]]:means.index[-1]] = np.nan
+    # everything is more easy if data[field] is harmonized:
+    else:
+        if isinstance(winsz, str):
+            winsz = int(np.floor(pd.Timedelta(winsz) / pd.Timedelta(to_fit.index.freqstr)))
+        if (winsz % 2 == 0) & center:
+            winsz = int(winsz - 1)
+        means = to_fit.rolling(window=winsz, center=center, closed='both', min_periods=min_periods).mean()
+
+    residues = means - to_fit
+    data[field] = residues
+    if eval_flags:
+        # factorize() yields (integer codes, unique flags); NOTE(review): max() assumes higher code == worse flag
+        num_cats, codes = flags.factorize()
+        num_cats = pd.Series(num_cats, index=flags.index).rolling(winsz, center=center, min_periods=min_periods).max()
+        nan_samples = num_cats[num_cats.isna()]
+        num_cats.drop(nan_samples.index, inplace=True)
+        to_flag = pd.Series(codes[num_cats.astype(int)], index=num_cats.index)
+        to_flag = to_flag.align(nan_samples)[0]
+        to_flag[nan_samples.index] = flags[nan_samples.index]
+        flagger = flagger.setFlags(field, flags=to_flag, **kwargs)
+
+    return data, flagger