Commit ed8fed6d authored by Peter Lünenschloß
poly fitting function implemented and documented

parent 0a813ccf
@@ -5,50 +5,130 @@ import pandas as pd
import numpy as np
from saqc.core.register import register
from saqc.lib.ts_operators import polyRoller, polyRollerNoMissing, polyRoller_numba, polyRollerNoMissing_numba, \
    polyRollerIrregular, validationAgg


@register
def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_flags=True, min_periods=0, **kwargs):
"""
Function fits a polynomial model to the data and returns the residues. (field gets overridden).
The residue for value x is calculated by fitting a polynomial of degree "polydeg" to a data slice
of size "winsz", wich has x at its center.
Note, that if data[field] is not alligned to an equidistant frequency grid, the window size passed,
has to be an offset string. Also numba boost options dont apply for irregularly sampled
timeseries.
Note, that calculating the residues tends to be quite cost intensive - because a function fitting is perfomed for every
sample. To improve performance, consider the following possibillities:
In case your data is sampled at an equidistant frequency grid:
(1) If you know your data to have no significant number of missing values, or if you do not want to
calculate residues for windows containing missing values any way, performance can be increased by setting
min_periods=winsz.
(2) If your data consists of more then around 200000 samples, setting numba=True, will boost the
calculations up to a factor of 5 (for samplesize > 300000) - however for lower sample sizes,
numba will slow down the calculations, also, up to a factor of 5, for sample_size < 50000.
By default (numba='auto'), numba is set to true, if the data sample size exceeds 200000.
in case your data is not sampled at an equidistant frequency grid:
(1) Harmonization/resampling of your data will have a noticable impact on polyfittings performance - since
numba_boost doesnt apply for irregularly sampled data in the current implementation.
Parameters
----------
winsz : integer or offset String
The size of the window you want to use for fitting. If an integer is passed, the size
refers to the number of periods for every fitting window. If an offset string is passed,
the size refers to the total temporal extension. The window will be centered around the vaule-to-be-fitted.
For regularly sampled timeseries the period number will be casted down to an odd number if
even.
polydeg : integer
The degree of the polynomial used for fitting
numba : {True, False, "auto"}, default "auto"
Wheather or not to apply numbas just-in-time compilation onto the poly fit function. This will noticably
increase the speed of calculation, if the sample size is sufficiently high.
If "auto" is selected, numba compatible fit functions get applied for data consisiting of > 200000 samples.
eval_flags : boolean, default True
Wheather or not to assign new flags to the calculated residuals. If True, a residual gets assigned the worst
flag present in the interval, the data for its calculation was obtained from.
min_periods : integer, default 0
The minimum number of periods, that has to be available in every values fitting surrounding for the polynomial
fit to be performed. If there are not enough intervals, np.nan gets assigned. Default (0) results in fitting
regardless of the number of values present (results in overfitting for to sparse intervals).
kwargs
Returns
-------
"""
    data = data.copy()
    to_fit = data[field]
    flags = flagger.getFlags(field)

    if not to_fit.index.freqstr:
        # irregularly sampled data: the window size has to be an offset string
        if isinstance(winsz, int):
            raise NotImplementedError('Integer based window size is not supported for not-harmonized '
                                      'sample series (because it makes no sense).')
        # get interval centers: for every sample, count the values in the trailing half window
        centers = np.floor((to_fit.rolling(pd.Timedelta(winsz) / 2, closed='both',
                                           min_periods=min_periods).count()))
        centers = centers.drop(centers[centers.isna()].index)
        centers = centers.astype(int)
        residues = to_fit.rolling(pd.Timedelta(winsz), closed='both',
                                  min_periods=min_periods).apply(polyRollerIrregular,
                                                                 args=(centers, polydeg))

        def center_func(x, y=centers):
            pos = x.index[int(len(x) - y[x.index[-1]])]
            return y.index.get_loc(pos)

        # map the fit results from the window ends (where rolling writes them) back to the window centers
        centers_iloc = centers.rolling(winsz, closed='both').apply(center_func, raw=False).astype(int)
        temp = residues.copy()
        for k in centers_iloc.iteritems():
            residues.iloc[k[1]] = temp[k[0]]
        # the margins, where no full window fits, do not get a valid residue
        residues[residues.index[0]:residues.index[centers_iloc[0]]] = np.nan
        residues[residues.index[centers_iloc[-1]]:residues.index[-1]] = np.nan
    else:
        if isinstance(winsz, str):
            winsz = int(np.floor(pd.Timedelta(winsz) / pd.Timedelta(to_fit.index.freqstr)))
        if winsz % 2 == 0:
            # for centered windows the period number is cast down to an odd number
            winsz = winsz - 1
        if numba == 'auto':
            if to_fit.shape[0] < 200000:
                numba = False
            else:
                numba = True
        val_range = np.arange(0, winsz)
        center_index = int(np.floor(winsz / 2))
        if min_periods < winsz:
            if min_periods > 0:
                max_nan_total = winsz - min_periods
                to_fit = to_fit.rolling(winsz, center=True).apply(validationAgg, raw=True,
                                                                  args=(max_nan_total,))
            # we need a missing value marker that is not nan,
            # because nan values do not get passed by pandas' rolling method
            miss_marker = to_fit.min()
            miss_marker = np.floor(miss_marker - 1)
            na_mask = to_fit.isna()
            to_fit[na_mask] = miss_marker
            if numba:
                residues = to_fit.rolling(winsz, center=True).apply(polyRoller_numba,
                                                                    args=(miss_marker, val_range,
                                                                          center_index, polydeg),
                                                                    engine='numba',
                                                                    engine_kwargs={'no_python': True},
                                                                    raw=True)
            else:
                residues = to_fit.rolling(winsz, center=True).apply(polyRoller,
                                                                    args=(miss_marker, val_range,
                                                                          center_index, polydeg),
                                                                    raw=True)
            residues[na_mask] = np.nan
        else:
            # we only fit fully populated intervals:
            if numba:
                residues = to_fit.rolling(winsz, center=True).apply(polyRollerNoMissing_numba,
                                                                    args=(val_range, center_index, polydeg),
                                                                    engine='numba',
                                                                    engine_kwargs={'no_python': True},
                                                                    raw=True)
            else:
                residues = to_fit.rolling(winsz, center=True).apply(polyRollerNoMissing,
                                                                    args=(val_range, center_index, polydeg),
                                                                    raw=True)

    residues = residues - to_fit
    data[field] = residues
    if eval_flags:
        num_cats, codes = flags.factorize()
......
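The per-window residue computation that the docstring describes can be mirrored on a small regular toy series. The following standalone sketch reproduces the no-missing-values code path (what polyRollerNoMissing does per window) with plain pandas and numpy.polynomial; the names toy and fit_center are illustrative and not part of the commit, and winsz/polydeg are picked arbitrarily.

# Minimal sketch of the per-window residue computation, assuming an equidistant toy series.
import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd

toy = pd.Series(
    np.sin(np.linspace(0, 3, 30)) + np.random.normal(0, 0.05, 30),
    index=pd.date_range("2020-01-01", periods=30, freq="10min"),
)
winsz, polydeg = 5, 2                    # odd window size, quadratic fit
val_range = np.arange(0, winsz)          # x values of one window: 0, 1, ..., winsz - 1
center_index = winsz // 2                # position of the value-to-be-fitted

def fit_center(in_slice):
    # fit a polynomial of degree polydeg to the window and evaluate it at the center
    coeffs = poly.polyfit(val_range, in_slice, polydeg)
    return poly.polyval(center_index, coeffs)

fitted = toy.rolling(winsz, center=True, min_periods=winsz).apply(fit_center, raw=True)
residues = fitted - toy                  # same orientation as "residues = residues - to_fit" above

Only fully populated windows produce a value here (min_periods=winsz); the margins stay NaN, just like the fully-populated-intervals branch of modelling_polyFit.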
@@ -412,4 +412,13 @@ def polyRollerNoMissing(in_slice, val_range, center_index, poly_deg):
    # function to roll with when modelling data with a polynomial model
    # it is assumed that in_slice is an equidistant sample
    fitted = poly.polyfit(x=val_range, y=in_slice, deg=poly_deg)
    return poly.polyval(center_index, fitted)

def polyRollerIrregular(in_slice, center_index_ser, poly_deg):
    # a function to roll with, for polynomial fitting of data that does not have an equidistant
    # frequency grid (expects to get passed a pandas timeseries, so the raw parameter of
    # rolling.apply should be set to False)
    x_data = ((in_slice.index - in_slice.index[0]).total_seconds()) / 60
    fitted = poly.polyfit(x_data, in_slice.values, poly_deg)
    center_pos = int(len(in_slice) - center_index_ser[in_slice.index[-1]])
    return poly.polyval(x_data[center_pos], fitted)
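For a single irregular window, the body of polyRollerIrregular can be traced by hand. In the sketch below the window slice and the center bookkeeping series are constructed manually and their contents are assumptions for illustration only; in modelling_polyFit the centers series comes from the half-window rolling count shown above.

# Minimal sketch of one polyRollerIrregular-style evaluation on an irregular window.
import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd

idx = pd.DatetimeIndex(
    ["2020-01-01 00:00", "2020-01-01 00:07", "2020-01-01 00:11",
     "2020-01-01 00:26", "2020-01-01 00:30"]
)
in_slice = pd.Series([1.0, 1.4, 1.3, 2.1, 2.4], index=idx)

# center_index_ser maps the window's last timestamp to (roughly) the number of samples
# lying in the second half of the window -- here assumed to be 2 of the 5 samples
center_index_ser = pd.Series({idx[-1]: 2})

x_data = (in_slice.index - in_slice.index[0]).total_seconds() / 60   # minutes since window start
fitted = poly.polyfit(x_data, in_slice.values, 2)
center_pos = int(len(in_slice) - center_index_ser[in_slice.index[-1]])
print(poly.polyval(x_data[center_pos], fitted))   # fitted value at the window center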