From fd3890b2e206694953cc9b3c137991d28d021817 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Thu, 30 Apr 2020 17:09:47 +0200
Subject: [PATCH] added test module

- fixed minor bugs encountered while writing the test module
---
 saqc/funcs/proc_functions.py        | 14 +++++++--
 saqc/funcs/spikes_detection.py      |  4 ++-
 saqc/lib/tools.py                   |  1 +
 saqc/lib/ts_operators.py            |  2 +-
 test/funcs/conftest.py              | 12 ++++----
 test/funcs/test_proc_functions.py   | 46 ++++++++++++++++++++++++++---
 test/funcs/test_spikes_detection.py |  2 +-
 7 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/saqc/funcs/proc_functions.py b/saqc/funcs/proc_functions.py
index 26dc78a7f..d6a1f2cf8 100644
--- a/saqc/funcs/proc_functions.py
+++ b/saqc/funcs/proc_functions.py
@@ -41,9 +41,11 @@ def proc_resample(data, field, flagger, freq, func="mean", max_invalid_total=Non
                   flag_agg_func='max', **kwargs):
     data = data.copy()
     datcol = data[field]
+    d_start = datcol.index[0].floor(freq)
+    d_end = datcol.index[-1].ceil(freq)
 
     # filter data for invalid patterns
-    if (max_invalid_total is None) | (max_invalid_consec is None):
+    if (max_invalid_total is not None) | (max_invalid_consec is not None):
         if not max_invalid_total:
             max_invalid_total = np.inf
         if not max_invalid_consec:
@@ -72,6 +74,11 @@ def proc_resample(data, field, flagger, freq, func="mean", max_invalid_total=Non
     flag_agg_func = composeFunction(flag_agg_func)
     datflags = flagsresampler.apply(flag_agg_func)
 
+    # insert freq grid (for consistency reasons -> in the step above, starting and ending chunks can get lost due to
+    # invalid intervals):
+    grid = pd.date_range(d_start, d_end, freq=freq)
+    datcol = datcol.reindex(grid)
+    datflags = datflags.reindex(grid)
     # data/flags reshaping:
     data[field] = datcol
     reshape_flagger = flagger.initFlags(datcol).setFlags(field, flag=datflags, force=True, **kwargs)
@@ -83,5 +90,8 @@ def proc_resample(data, field, flagger, freq, func="mean", max_invalid_total=Non
 def proc_transform(data, field, flagger, func, **kwargs):
     data = data.copy()
     func = composeFunction(func)
-    data[field] = data[field].transform(func)
+    # NOTE: avoiding pd.Series.transform() in the line below, because transform processes columns element-wise
+    # (so interpolations wouldn't work)
+    new_col = pd.Series(func(data[field]), index=data[field].index)
+    data[field] = new_col
     return data, flagger
\ No newline at end of file
diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py
index 666f97da7..c8ba91e7b 100644
--- a/saqc/funcs/spikes_detection.py
+++ b/saqc/funcs/spikes_detection.py
@@ -158,7 +158,9 @@ def spikes_flagMultivarScores(data, field, flagger, fields, trafo='normScale', a
     )
 
     val_frame.dropna(inplace=True)
 
-    val_frame = val_frame.transform(trafo_dict)
+    for field in val_frame.columns:
+        val_frame[field] = trafo_dict[field](val_frame[field])
+
     if threshing == 'stray':
         to_flag_index = _stray(val_frame,
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 9825b3934..70c32da8f 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -84,6 +84,7 @@ def evalFuncString(full_func_string):
     kwarg_dict = {}
 
     if len(paras) > 0:
+        paras = [float(x) if x.isnumeric() else x for x in paras]
         para_names = inspect.getfullargspec(func).args[1:1 + len(paras)]
         kwarg_dict.update(dict(zip(para_names, paras)))
 
diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py
index ebfa042e0..2f847387b 100644
--- a/saqc/lib/ts_operators.py
+++ b/saqc/lib/ts_operators.py
@@ -167,7 +167,7 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolatio
 
     :return:
     """
-
+    inter_limit = int(inter_limit)
     data = pd.Series(data).copy()
     gap_mask = (data.rolling(inter_limit, min_periods=0).apply(lambda x: np.sum(np.isnan(x)), raw=True)) != inter_limit
 
diff --git a/test/funcs/conftest.py b/test/funcs/conftest.py
index 430aaae51..c19280c01 100644
--- a/test/funcs/conftest.py
+++ b/test/funcs/conftest.py
@@ -128,22 +128,20 @@ def course_4(char_dict):
 
 
 @pytest.fixture
 def course_5(char_dict):
-    # NAN_holes values , that remain on value level "base_level" and than begin exposing an outlierish or
-    # spikey value of magnitude "out_val" every second timestep, starting at periods/2, with the first spike. number
+    # NAN_holes values that ascend linearly from initial_level to final_level and have missing data (=nan)
+    # at positions "nan_slice" (=a slice or a list, for iloc indexing)
     # periods better be even!
     # periods better be greater 5
-    def fix_funk(freq='10min', periods=100, nan_slice=slice(0, None, 5), initial_level=0, final_level=10,
+    def fix_funk(freq='10min', periods=10, nan_slice=slice(0, None, 5), initial_level=0, final_level=10,
                  initial_index=pd.Timestamp(2000, 1, 1, 0, 0, 0), char_dict=char_dict):
 
         t_index = pd.date_range(initial_index, freq=freq, periods=periods)
         values = np.linspace(initial_level, final_level, periods)
         s = pd.Series(values, index=t_index)
-        s[nan_slice] = np.nan
-        char_dict['missing'] = s[nan_slice].index
+        s.iloc[nan_slice] = np.nan
+        char_dict['missing'] = s.iloc[nan_slice].index
         data = DictOfSeries(data=s, columns=['data'])
         return data, char_dict
 
     return fix_funk
-
-    return fix_funk
\ No newline at end of file
diff --git a/test/funcs/test_proc_functions.py b/test/funcs/test_proc_functions.py
index 48777e7da..5b9ef99c7 100644
--- a/test/funcs/test_proc_functions.py
+++ b/test/funcs/test_proc_functions.py
@@ -15,7 +15,45 @@ from saqc.funcs.proc_functions import (
 from test.common import TESTFLAGGER
 
 @pytest.mark.parametrize("flagger", TESTFLAGGER)
-def test_interpolateMissing(course_1, flagger):
-    data, *_ = course_1(periods=100)
-    data[1] = np.nan
-    data[]
+def test_interpolateMissing(course_5, flagger):
+    data, characteristics = course_5(periods=10, nan_slice=[5])
+    field = data.columns[0]
+    data = dios.DictOfSeries(data)
+    flagger = flagger.initFlags(data)
+    dataLin, *_ = proc_interpolateMissing(data, field, flagger, method='linear')
+    dataPoly, *_ = proc_interpolateMissing(data, field, flagger, method='polynomial')
+    assert dataLin[field][characteristics['missing']].notna().all()
+    assert dataPoly[field][characteristics['missing']].notna().all()
+    data, characteristics = course_5(periods=10, nan_slice=[5, 6, 7])
+    dataLin1, *_ = proc_interpolateMissing(data, field, flagger, method='linear', inter_limit=2)
+    dataLin2, *_ = proc_interpolateMissing(data, field, flagger, method='linear', inter_limit=3)
+    dataLin3, *_ = proc_interpolateMissing(data, field, flagger, method='linear', inter_limit=4)
+    assert dataLin1[field][characteristics['missing']].isna().all()
+    assert dataLin2[field][characteristics['missing']].isna().all()
+    assert dataLin3[field][characteristics['missing']].notna().all()
+
+
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_transform(course_5, flagger):
+    data, characteristics = course_5(periods=10, nan_slice=[5, 6])
+    field = data.columns[0]
+    data = dios.DictOfSeries(data)
+    flagger = flagger.initFlags(data)
+    data1, *_ = proc_transform(data, field, flagger, func='linear')
+    assert data1[field][characteristics['missing']].isna().all()
+    data1, *_ = proc_transform(data, field, flagger, func='linear$3')
+    assert data1[field][characteristics['missing']].notna().all()
+    data1, *_ = proc_transform(data, field, flagger, func='polynomial$3$3')
+    assert data1[field][characteristics['missing']].notna().all()
+
+
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_resample(course_5, flagger):
+    data, characteristics = course_5(freq='1min', periods=30, nan_slice=[1, 11, 12, 22, 24, 26])
+    field = data.columns[0]
+    data = dios.DictOfSeries(data)
+    flagger = flagger.initFlags(data)
+    data1, *_ = proc_resample(data, field, flagger, '10min', 'mean', max_invalid_total=2, max_invalid_consec=1)
+    assert ~np.isnan(data1[field].iloc[0])
+    assert np.isnan(data1[field].iloc[1])
+    assert np.isnan(data1[field].iloc[2])
\ No newline at end of file
diff --git a/test/funcs/test_spikes_detection.py b/test/funcs/test_spikes_detection.py
index 77f729c2a..3688f07e6 100644
--- a/test/funcs/test_spikes_detection.py
+++ b/test/funcs/test_spikes_detection.py
@@ -101,7 +101,7 @@ def test_flagSpikesLimitRaise(dat, flagger):
 # see test/functs/conftest.py for the 'course_N'
 @pytest.mark.parametrize("flagger", TESTFLAGGER)
 @pytest.mark.parametrize("dat", [pytest.lazy_fixture("course_3")])
-def test_flagSpikesOddWater(dat, flagger):
+def test_flagMultivarScores(dat, flagger):
     data1, characteristics = dat(periods=1000, initial_level=5, final_level=15, out_val=50)
     data2, characteristics = dat(periods=1000, initial_level=20, final_level=1, out_val=30)
     field = "dummy"
-- 
GitLab