Skip to content
Snippets Groups Projects
Commit c80b919e authored by Peter Lünenschloß's avatar Peter Lünenschloß
Browse files

test module for data modelling function under construction

parent dc26a553
No related branches found
No related tags found
3 merge requests!193Release 1.4,!188Release 1.4,!49Dataprocessing features
......@@ -58,7 +58,7 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f
min_periods : integer, default 0
The minimum number of periods, that has to be available in every values fitting surrounding for the polynomial
fit to be performed. If there are not enough values, np.nan gets assigned. Default (0) results in fitting
regardless of the number of values present (results in overfitting for to sparse intervals).
regardless of the number of values present (results in overfitting for too sparse intervals).
kwargs
Returns
......@@ -105,7 +105,8 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f
if min_periods > 0:
max_nan_total = winsz - min_periods
to_fit = to_fit.rolling(winsz, center=True).apply(validationAgg, raw=True, args=(max_nan_total))
# we need a missing value marker that is not nan, because nan values dont get passed by pandas rolling method
# we need a missing value marker that is not nan, because nan values dont get passed by pandas rolling
# method
miss_marker = to_fit.min()
miss_marker = np.floor(miss_marker - 1)
na_mask = to_fit.isna()
......@@ -123,7 +124,8 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f
if numba:
residues = to_fit.rolling(winsz, center=True).apply(polyRollerNoMissing_numba, args=(val_range,
center_index, polydeg),
engine='numba', engine_kwargs={'no_python': True}, raw=True)
engine='numba', engine_kwargs={'no_python': True},
raw=True)
else:
residues = to_fit.rolling(winsz, center=True).apply(polyRollerNoMissing,
args=(val_range, center_index, polydeg), raw=True)
......
......@@ -699,14 +699,14 @@ def spikes_flagGrubbs(data, field, flagger, winsz, alpha=0.05, min_periods=8, **
(https://en.wikipedia.org/wiki/Grubbs%27s_test_for_outliers)
The (two-sided) test gets applied onto data chunks of size "winsz". The tests appliance will
be iterated onto each data chunk, till no more outliers are detected in that chunk.
The (two-sided) test gets applied onto data chunks of size "winsz". The test's application will
be iterated on each data chunk under test, till no more outliers are detected in that chunk.
Note, that the test performs poorly for small data chunks (resulting in heavy overflagging).
Therefore you should select "winsz" so that every window contains more than 8 values and also
adjust the min_periods values accordingly.
Note, that the data to be tested by the grubbs test are expected to be "normalish" distributed.
Note, that the data to be tested by the grubbs test are expected to be approximately normally distributed.
Parameters
----------
......@@ -719,29 +719,28 @@ def spikes_flagGrubbs(data, field, flagger, winsz, alpha=0.05, min_periods=8, **
The level of significance the grubbs test is to be performed at. (between 0 and 1)
min_periods : Integer
The minimum number of values present in a testing interval for a grubbs test result to be accepted. Only
effective when winsz is an offset string.
kwargs
makes sense in case "winsz" is an offset string.
Returns
-------
"""
data = data.copy()
to_flag = data[field]
to_group = pd.DataFrame(data={'ts': to_flag.index, 'data': to_flag})
datcol = data[field]
to_group = pd.DataFrame(data={'ts': datcol.index, 'data': datcol})
if isinstance(winsz, int):
# period number defined test intervals
grouper_series = pd.Series(data=np.arange(0, to_flag.shape[0]), index=to_flag.index)
grouper_series = pd.Series(data=np.arange(0, datcol.shape[0]), index=datcol.index)
grouper_series = grouper_series.transform(lambda x: int(np.floor(x / winsz)))
partitions = to_group.groupby(grouper_series)
else:
# offset defined test intervals:
partitions = to_group.groupby(pd.Grouper(freq=winsz))
for _, partition in partitions:
if partition.shape[0] > min_periods:
to_flag = smirnov_grubbs.two_sided_test_indices(partition['data'].values, alpha=alpha)
to_flag = partition['ts'].iloc[to_flag]
if not to_flag.empty:
print(to_flag)
flagger = flagger.setFlags(field, loc=to_flag, **kwargs)
return data, flagger
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import pytest
import numpy as np
import pandas as pd
from dios import dios
from test.common import TESTFLAGGER
from saqc.funcs.data_modelling import (
modelling_polyFit
)
import numpy.polynomial.polynomial as poly
@pytest.mark.parametrize("flagger", TESTFLAGGER)
@pytest.mark.parametrize("dat", [pytest.lazy_fixture("course_2")])
def test_modelling_polyFit_forRegular(dat, flagger):
    """Smoke test for modelling_polyFit on a regularly sampled series.

    Runs the polynomial fit with both an integer window and an offset-string
    window, each with and without the numba engine. Currently only checks that
    all four variants execute without raising — no result values are asserted
    yet (module under construction).
    """
    data, _ = dat(freq='10min', periods=100, initial_level=0, final_level=100, out_val=-100)
    # add some nice sine distortion
    data += np.sin(np.arange(0, len(data)))
    data = dios.DictOfSeries(data)
    flagger = flagger.initFlags(data)
    result1, _ = modelling_polyFit(data, 'data', flagger, 11, numba=False)
    result2, _ = modelling_polyFit(data, 'data', flagger, 11, numba=True)
    # BUG FIX: the original called the undefined name "modelling_polyfit"
    # (lowercase "f"), which is not imported and would raise a NameError.
    result3, _ = modelling_polyFit(data, 'data', flagger, '2h', numba=False)
    result4, _ = modelling_polyFit(data, 'data', flagger, '2h', numba=True)
    # TODO(review): assert on result1..result4 (e.g. numba vs. non-numba
    # agreement) once the modelling function's output contract is fixed.
\ No newline at end of file
......@@ -7,7 +7,7 @@ import numpy as np
import pandas as pd
from dios import dios
from test.common import TESTFLAGGER, initData
from test.common import TESTFLAGGER
from saqc.funcs.harm_functions import (
harm_linear2Grid,
......@@ -16,7 +16,6 @@ from saqc.funcs.harm_functions import (
harm_aggregate2Grid,
harm_deharmonize
)
from saqc.funcs.proc_functions import ORIGINAL_SUFFIX
RESHAPERS = ["nshift", "fshift", "bshift", "nagg", "bagg", "fagg"]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment