Commit ed8fed6d authored by Peter Lünenschloß
poly fitting function implemented and documented

parent 0a813ccf
@@ -5,50 +5,130 @@ import pandas as pd
import numpy as np
from saqc.core.register import register
from saqc.lib.ts_operators import polyRoller, polyRollerNoMissing, polyRoller_numba, polyRollerNoMissing_numba, \
    polyRollerIrregular, validationAgg


@register
def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_flags=True, min_periods=0, **kwargs):
"""
Function fits a polynomial model to the data and returns the residues. (field gets overridden).
The residue for value x is calculated by fitting a polynomial of degree "polydeg" to a data slice
of size "winsz", wich has x at its center.
Note, that if data[field] is not alligned to an equidistant frequency grid, the window size passed,
has to be an offset string. Also numba boost options dont apply for irregularly sampled
timeseries.
Note, that calculating the residues tends to be quite cost intensive - because a function fitting is perfomed for every
sample. To improve performance, consider the following possibillities:
In case your data is sampled at an equidistant frequency grid:
(1) If you know your data to have no significant number of missing values, or if you do not want to
calculate residues for windows containing missing values any way, performance can be increased by setting
min_periods=winsz.
(2) If your data consists of more then around 200000 samples, setting numba=True, will boost the
calculations up to a factor of 5 (for samplesize > 300000) - however for lower sample sizes,
numba will slow down the calculations, also, up to a factor of 5, for sample_size < 50000.
By default (numba='auto'), numba is set to true, if the data sample size exceeds 200000.
in case your data is not sampled at an equidistant frequency grid:
(1) Harmonization/resampling of your data will have a noticable impact on polyfittings performance - since
numba_boost doesnt apply for irregularly sampled data in the current implementation.
Parameters
----------
winsz : integer or offset String
The size of the window you want to use for fitting. If an integer is passed, the size
refers to the number of periods for every fitting window. If an offset string is passed,
the size refers to the total temporal extension. The window will be centered around the vaule-to-be-fitted.
For regularly sampled timeseries the period number will be casted down to an odd number if
even.
polydeg : integer
The degree of the polynomial used for fitting
numba : {True, False, "auto"}, default "auto"
Wheather or not to apply numbas just-in-time compilation onto the poly fit function. This will noticably
increase the speed of calculation, if the sample size is sufficiently high.
If "auto" is selected, numba compatible fit functions get applied for data consisiting of > 200000 samples.
eval_flags : boolean, default True
Wheather or not to assign new flags to the calculated residuals. If True, a residual gets assigned the worst
flag present in the interval, the data for its calculation was obtained from.
min_periods : integer, default 0
The minimum number of periods, that has to be available in every values fitting surrounding for the polynomial
fit to be performed. If there are not enough intervals, np.nan gets assigned. Default (0) results in fitting
regardless of the number of values present (results in overfitting for to sparse intervals).
kwargs
Returns
-------
"""
    data = data.copy()
    to_fit = data[field]
    flags = flagger.getFlags(field)

    if not to_fit.index.freqstr:
        # irregularly sampled data: the window size has to be an offset string
        if isinstance(winsz, int):
            raise NotImplementedError('Integer based window size is not supported for not-harmonized '
                                      'sample series (because it makes no sense).')
        # get interval centers: for every sample, count the values in the trailing half window
        centers = np.floor((to_fit.rolling(pd.Timedelta(winsz) / 2, closed='both',
                                           min_periods=min_periods).count()))
        centers = centers.drop(centers[centers.isna()].index)
        centers = centers.astype(int)
        residues = to_fit.rolling(pd.Timedelta(winsz), closed='both',
                                  min_periods=min_periods).apply(polyRollerIrregular,
                                                                 args=(centers, polydeg))

        def center_func(x, y=centers):
            pos = x.index[int(len(x) - y[x.index[-1]])]
            return y.index.get_loc(pos)

        # map the fit results from the window ends (where rolling writes them) back to the window centers
        centers_iloc = centers.rolling(winsz, closed='both').apply(center_func, raw=False).astype(int)
        temp = residues.copy()
        for k in centers_iloc.iteritems():
            residues.iloc[k[1]] = temp[k[0]]
        # the margins, where no full window fits, do not get a valid residue
        residues[residues.index[0]:residues.index[centers_iloc[0]]] = np.nan
        residues[residues.index[centers_iloc[-1]]:residues.index[-1]] = np.nan
    else:
        if isinstance(winsz, str):
            winsz = int(np.floor(pd.Timedelta(winsz) / pd.Timedelta(to_fit.index.freqstr)))
        if winsz % 2 == 0:
            # for centered windows the period number is cast down to an odd number
            winsz = winsz - 1
        if numba == 'auto':
            if to_fit.shape[0] < 200000:
                numba = False
            else:
                numba = True
        val_range = np.arange(0, winsz)
        center_index = int(np.floor(winsz / 2))
        if min_periods < winsz:
            if min_periods > 0:
                max_nan_total = winsz - min_periods
                to_fit = to_fit.rolling(winsz, center=True).apply(validationAgg, raw=True,
                                                                  args=(max_nan_total,))
            # we need a missing value marker that is not nan,
            # because nan values do not get passed by pandas' rolling method
            miss_marker = to_fit.min()
            miss_marker = np.floor(miss_marker - 1)
            na_mask = to_fit.isna()
            to_fit[na_mask] = miss_marker
            if numba:
                residues = to_fit.rolling(winsz, center=True).apply(polyRoller_numba,
                                                                    args=(miss_marker, val_range,
                                                                          center_index, polydeg),
                                                                    engine='numba',
                                                                    engine_kwargs={'no_python': True},
                                                                    raw=True)
            else:
                residues = to_fit.rolling(winsz, center=True).apply(polyRoller,
                                                                    args=(miss_marker, val_range,
                                                                          center_index, polydeg),
                                                                    raw=True)
            residues[na_mask] = np.nan
        else:
            # we only fit fully populated intervals:
            if numba:
                residues = to_fit.rolling(winsz, center=True).apply(polyRollerNoMissing_numba,
                                                                    args=(val_range, center_index, polydeg),
                                                                    engine='numba',
                                                                    engine_kwargs={'no_python': True},
                                                                    raw=True)
            else:
                residues = to_fit.rolling(winsz, center=True).apply(polyRollerNoMissing,
                                                                    args=(val_range, center_index, polydeg),
                                                                    raw=True)

    residues = residues - to_fit
    data[field] = residues
    if eval_flags:
        num_cats, codes = flags.factorize()
......
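The per-window residue computation that the docstring describes can be mirrored on a small regular toy series. The following standalone sketch reproduces the no-missing-values code path (what polyRollerNoMissing does per window) with plain pandas and numpy.polynomial; the names toy and fit_center are illustrative and not part of the commit, and winsz/polydeg are picked arbitrarily.

# Minimal sketch of the per-window residue computation, assuming an equidistant toy series.
import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd

toy = pd.Series(
    np.sin(np.linspace(0, 3, 30)) + np.random.normal(0, 0.05, 30),
    index=pd.date_range("2020-01-01", periods=30, freq="10min"),
)
winsz, polydeg = 5, 2                    # odd window size, quadratic fit
val_range = np.arange(0, winsz)          # x values of one window: 0, 1, ..., winsz - 1
center_index = winsz // 2                # position of the value-to-be-fitted

def fit_center(in_slice):
    # fit a polynomial of degree polydeg to the window and evaluate it at the center
    coeffs = poly.polyfit(val_range, in_slice, polydeg)
    return poly.polyval(center_index, coeffs)

fitted = toy.rolling(winsz, center=True, min_periods=winsz).apply(fit_center, raw=True)
residues = fitted - toy                  # same orientation as "residues = residues - to_fit" above

Only fully populated windows produce a value here (min_periods=winsz); the margins stay NaN, just like the fully-populated-intervals branch of modelling_polyFit.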
@@ -412,4 +412,13 @@ def polyRollerNoMissing(in_slice, val_range, center_index, poly_deg):
    # function to roll with when modelling data with a polynomial model
    # it is assumed that in_slice is an equidistant sample
    fitted = poly.polyfit(x=val_range, y=in_slice, deg=poly_deg)
    return poly.polyval(center_index, fitted)

def polyRollerIrregular(in_slice, center_index_ser, poly_deg):
    # a function to roll with, for polynomial fitting of data that does not have an equidistant
    # frequency grid (expects to get passed a pandas timeseries, so the raw parameter of
    # rolling.apply should be set to False)
    x_data = ((in_slice.index - in_slice.index[0]).total_seconds()) / 60
    fitted = poly.polyfit(x_data, in_slice.values, poly_deg)
    center_pos = int(len(in_slice) - center_index_ser[in_slice.index[-1]])
    return poly.polyval(x_data[center_pos], fitted)
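For a single irregular window, the body of polyRollerIrregular can be traced by hand. In the sketch below the window slice and the center bookkeeping series are constructed manually and their contents are assumptions for illustration only; in modelling_polyFit the centers series comes from the half-window rolling count shown above.

# Minimal sketch of one polyRollerIrregular-style evaluation on an irregular window.
import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd

idx = pd.DatetimeIndex(
    ["2020-01-01 00:00", "2020-01-01 00:07", "2020-01-01 00:11",
     "2020-01-01 00:26", "2020-01-01 00:30"]
)
in_slice = pd.Series([1.0, 1.4, 1.3, 2.1, 2.4], index=idx)

# center_index_ser maps the window's last timestamp to (roughly) the number of samples
# lying in the second half of the window -- here assumed to be 2 of the 5 samples
center_index_ser = pd.Series({idx[-1]: 2})

x_data = (in_slice.index - in_slice.index[0]).total_seconds() / 60   # minutes since window start
fitted = poly.polyfit(x_data, in_slice.values, 2)
center_pos = int(len(in_slice) - center_index_ser[in_slice.index[-1]])
print(poly.polyval(x_data[center_pos], fitted))   # fitted value at the window center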