From d3f881b71e66bc9e05cfb2f924682949f1f869c3 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Sat, 4 Apr 2020 08:06:12 +0200
Subject: [PATCH 01/11] minor play-arounds

---
 saqc/funcs/spike_detection.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index 6c8102790..848e641b5 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -55,15 +55,14 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
         tail_size = int(max(min(50, np.floor(sample_size/4)), 2))
         tail_indices = np.arange(2, tail_size + 1)
         i_start = int(max(np.floor(sample_size*iter_start), 1) + 1)
-        sum(tail_indices/(tail_size-1)*gaps[i_start-tail_indices+1])
         ghat = np.array([np.nan]*sample_size)
         for i in range(i_start-1, sample_size):
-            ghat[i] = sum(tail_indices/(tail_size-1)*gaps[i-tail_indices+1])
+            ghat[i] = sum((tail_indices/(tail_size-1))*gaps[i-tail_indices+1])
 
         log_alpha = np.log(1/alpha)
         for iter_index in range(i_start-1, sample_size):
-            if gaps[iter_index] > log_alpha*ghat[iter_index]:
-                break
+            if gaps[iter_index] > log_alpha*ghat[iter_index]:
+                break
 
     else:
--
GitLab

From 51d358baf28c07d3cf674e32669d0aeab4998d58 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 6 Apr 2020 19:53:37 +0200
Subject: [PATCH 02/11] minor extensions (trafos)

---
 saqc/funcs/spike_detection.py |  7 ++++---
 saqc/lib/tools.py             |  9 +++++++--
 saqc/lib/ts_operators.py      | 14 ++++++++++++++
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index 848e641b5..a8b3d5ce9 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -27,17 +27,18 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
-    val_frame = trafo(data[fields[0]])
+    val_frame = data[fields[0]]
 
     for var in fields[1:]:
-        val_frame = pd.merge(val_frame, trafo(data[var]),
-                             how='outer',
+        val_frame = pd.merge(val_frame, data[var],
+                             how='inner',
                              left_index=True,
                              right_index=True
                              )
 
     data_len = val_frame.index.size
     val_frame.dropna(inplace=True)
+    val_frame = val_frame.transform(trafo)
 
     # KNN calculation
     kNNfunc = getattr(ts_ops, scoring_method)
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 2e8f13598..9fefbc989 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -9,6 +9,7 @@ import pandas as pd
 import numba as nb
 import saqc.lib.ts_operators as ts_ops
 import scipy
+import sklearn
 from functools import reduce, partial
 
 from saqc.lib.types import T, PandasLike
@@ -31,13 +32,17 @@ SAQC_OPERATORS = {
     "deriv": ts_ops.derivative,
     "roc": ts_ops.rateOfChange,
     "scale": ts_ops.scale,
-    "normScale": ts_ops.normScale
+    "normScale": ts_ops.normScale,
+    "stdByMean": ts_ops.standardizeByMean,
+    "stdByMedian": ts_ops.standardizeByMedian,
+    "zlog": ts_ops.zeroLog
 }

 OP_MODULES = {'pd': pd,
               'np': np,
-              'scipy': scipy
+              'scipy': scipy,
+              'sklearn': sklearn
               }

diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py
index 1e1a4545c..3a3034f59 100644
--- a/saqc/lib/ts_operators.py
+++ b/saqc/lib/ts_operators.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import numpy as np
 from sklearn.neighbors import NearestNeighbors
+from scipy.stats import iqr


@@ -28,6 +29,11 @@ def identity(ts):
     return ts


+def zeroLog(ts):
+    log_ts = np.log(ts)
+    log_ts[log_ts == -np.inf] = np.nan
+    return log_ts
+
 def difference(ts):
     return pd.Series.diff(ts)
@@ -84,6 +90,14 @@ def kNN(in_arr, n_neighbors, algorithm='ball_tree'):
     return nbrs.kneighbors()


+def standardizeByMean(ts):
+    return (ts - ts.mean())/ts.std()
+
+
+def standardizeByMedian(ts):
+    return (ts - ts.median())/iqr(ts, nan_policy='omit')
+
+
 def kNNMaxGap(in_arr, n_neighbors, algorithm='ball_tree'):
     dist, *_ = kNN(in_arr, n_neighbors, algorithm=algorithm)
     sample_size = dist.shape[0]
--
GitLab

From b89266d236062b17e56fb4dac9027482271c46d4 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Thu, 9 Apr 2020 18:38:19 +0200
Subject: [PATCH 03/11] made stray capable of looping over data slices

---
 dios                          |   1 +
 saqc/funcs/spike_detection.py | 194 +++++++++++++++++++---------
 2 files changed, 111 insertions(+), 84 deletions(-)
 create mode 160000 dios

diff --git a/dios b/dios
new file mode 160000
index 000000000..d5a80af46
--- /dev/null
+++ b/dios
@@ -0,0 +1 @@
+Subproject commit d5a80af469c0540c91e5b4ddd1c0327b11a0d214
diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index a8b3d5ce9..fa80027b4 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -4,11 +4,9 @@
 import numpy as np
 import pandas as pd
-
 from scipy.signal import savgol_filter
 from scipy.stats import zscore
 from scipy.optimize import curve_fit
-from sklearn.neighbors import NearestNeighbors
 from saqc.funcs.register import register
 import numpy.polynomial.polynomial as poly
 import numba
@@ -21,9 +19,106 @@ from saqc.lib.tools import (
     composeFunction
 )

+
+def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.5,
+           alpha=0.05):
+
+    kNNfunc = getattr(ts_ops, scoring_method)
+    partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
+    to_flag = []
+    for _, partition in partitions:
+        resids = kNNfunc(partition.values, n_neighbors=n_neighbors, algorithm='ball_tree')
+        sorted_i = resids.argsort()
+        resids = resids[sorted_i]
+        sample_size = resids.shape[0]
+        gaps = np.append(0, np.diff(resids))
+
+        tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
+        tail_indices = np.arange(2, tail_size + 1)
+        i_start = int(max(np.floor(sample_size * iter_start), 1) + 1)
+        ghat = np.array([np.nan] * sample_size)
+        for i in range(i_start - 1, sample_size):
+            ghat[i] = sum((tail_indices / (tail_size - 1)) * gaps[i - tail_indices + 1])
+
+        log_alpha = np.log(1 / alpha)
+        for iter_index in range(i_start - 1, sample_size):
+            if gaps[iter_index] > log_alpha * ghat[iter_index]:
+                break
+
+        to_flag = np.append(to_flag, list(partition.index[sorted_i[iter_index:]]))
+
+    return to_flag
+
+
+def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.5,
+            alpha=0.05, bin_frac=10):
+
+
+    kNNfunc = getattr(ts_ops, scoring_method)
+    resids = kNNfunc(val_frame.values, n_neighbors=n_neighbors, algorithm='ball_tree')
+    data_len = resids.shape[0]
+    # sorting
+    sorted_i = resids.argsort()
+    resids = resids[sorted_i]
+    iter_index = int(np.floor(resids.size * iter_start))
+    # initialize condition variables:
+    crit_val = np.inf
+    test_val = 0
+    neg_log_alpha = - np.log(alpha)
+
+    # define exponential dist density function:
+    def fit_function(x, lambd):
+        return lambd * np.exp(-lambd * x)
+    # initialise sampling bins
+    binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(data_len / bin_frac)))
+    binzenters = np.array([0.5 * (binz[i] + binz[i + 1]) for i in range(len(binz) - 1)])
+    # initialize full histogram:
+    full_hist, binz = np.histogram(resids, bins=binz)
+    # check if start index is sufficiently high (it must point at a resids value beyond the histogram maximum):
+    hist_argmax = full_hist.argmax()
+
+    if hist_argmax >= findIndex(binz, resids[iter_index-1], 0):
+        raise ValueError("Either the data histogram is too strangely shaped for oddWater OD detection - "
+                         "or a too low value for 'iter_start' was passed "
+                         "(iter_start should be much greater than 0.5)")
+    # GO!
+    iter_max_bin_index = findIndex(binz, resids[iter_index-1], 0)
+    upper_tail_index = int(np.floor(0.5 * hist_argmax + 0.5 * iter_max_bin_index))
+    resids_tail_index = findIndex(resids, binz[upper_tail_index], 0)
+    upper_tail_hist, bins = np.histogram(resids[resids_tail_index:iter_index],
+                                         bins=binz[upper_tail_index:iter_max_bin_index + 1])
+
+    while (test_val < crit_val) & (iter_index < resids.size-1):
+        iter_index += 1
+        new_iter_max_bin_index = findIndex(binz, resids[iter_index-1], 0)
+
+        # the following if/else block "manually" expands the data histogram and avoids recalculating the
+        # complete histogram in every new iteration.
+        if new_iter_max_bin_index == iter_max_bin_index:
+            upper_tail_hist[-1] += 1
+        else:
+            upper_tail_hist = np.append(upper_tail_hist, np.zeros([new_iter_max_bin_index-iter_max_bin_index]))
+            upper_tail_hist[-1] += 1
+            iter_max_bin_index = new_iter_max_bin_index
+            upper_tail_index_new = int(np.floor(0.5 * hist_argmax + 0.5 * iter_max_bin_index))
+            upper_tail_hist = upper_tail_hist[upper_tail_index_new-upper_tail_index:]
+            upper_tail_index = upper_tail_index_new
+
+        # fitting
+        lambdA, _ = curve_fit(fit_function, xdata=binzenters[upper_tail_index:iter_max_bin_index],
+                              ydata=upper_tail_hist,
+                              p0=[-np.log(alpha/resids[iter_index])])
+
+        crit_val = neg_log_alpha / lambdA
+        test_val = resids[iter_index]
+
+    return val_frame.index[sorted_i[iter_index:]]
+
+
 @register("spikes_oddWater")
 def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0.05, bin_frac=10, n_neighbors=2,
-                        iter_start=0.5, scoring_method='kNNMaxGap', lambda_estimator='gap_average', **kwargs):
+                        iter_start=0.5, scoring_method='kNNMaxGap', thresholding='stray', stray_partition=None,
+                        **kwargs):
 
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
@@ -36,98 +131,29 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
                              right_index=True
                              )
 
-    data_len = val_frame.index.size
     val_frame.dropna(inplace=True)
     val_frame = val_frame.transform(trafo)
 
-    # KNN calculation
-    kNNfunc = getattr(ts_ops, scoring_method)
-    resids = kNNfunc(val_frame.values, n_neighbors=n_neighbors, algorithm='ball_tree')
-
-    # sorting
-    sorted_i = resids.argsort()
-    resids = resids[sorted_i]
-
-    # iter_start
-
-    if lambda_estimator == 'gap_average':
-        sample_size = resids.shape[0]
-        gaps = np.append(0, np.diff(resids))
-        tail_size = int(max(min(50, np.floor(sample_size/4)), 2))
-        tail_indices = np.arange(2, tail_size + 1)
-        i_start = int(max(np.floor(sample_size*iter_start), 1) + 1)
-        ghat = np.array([np.nan]*sample_size)
-        for i in range(i_start-1, sample_size):
-            ghat[i] = sum((tail_indices/(tail_size-1))*gaps[i-tail_indices+1])
-
-        log_alpha = np.log(1/alpha)
-        for iter_index in range(i_start-1, sample_size):
-            if gaps[iter_index] > log_alpha*ghat[iter_index]:
-                break
-
+    if thresholding == 'stray':
+        to_flag_index =_stray(val_frame,
+                              partition_freq=stray_partition,
+                              scoring_method=scoring_method,
+                              n_neighbors=n_neighbors,
+                              iter_start=iter_start)
     else:
-        # (estimator == 'exponential_fit')
-        iter_index = int(np.floor(resids.size * iter_start))
-        # initialize condition variables:
-        crit_val = np.inf
-        test_val = 0
-        neg_log_alpha = - np.log(alpha)
-
-        # define exponential dist density function:
-        def fit_function(x, lambd):
-            return lambd * np.exp(-lambd * x)
-        # initialise sampling bins
-        binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(data_len / bin_frac)))
-        binzenters = np.array([0.5 * (binz[i] + binz[i + 1]) for i in range(len(binz) - 1)])
-        # inititialize full histogram:
-        full_hist, binz = np.histogram(resids, bins=binz)
-        # check if start index is sufficiently high (pointing at resids value beyond histogram maximum at least):
-        hist_argmax = full_hist.argmax()
-
-        if hist_argmax >= findIndex(binz, resids[iter_index-1], 0):
-            raise ValueError("Either the data histogram is too strangely shaped for oddWater OD detection - "
-                             "or a too low value for iter_start was passed (iter_start better be greater 0.5)")
-        # GO!
-        iter_max_bin_index = findIndex(binz, resids[iter_index-1], 0)
-        upper_tail_index = int(np.floor(0.5 * hist_argmax + 0.5 * iter_max_bin_index))
-        resids_tail_index = findIndex(resids, binz[upper_tail_index], 0)
-        upper_tail_hist, bins = np.histogram(resids[resids_tail_index:iter_index],
-                                             bins=binz[upper_tail_index:iter_max_bin_index + 1])
-
-        while (test_val < crit_val) & (iter_index < resids.size-1):
-            iter_index += 1
-            new_iter_max_bin_index = findIndex(binz, resids[iter_index-1], 0)
-
-            # following if/else block "manually" expands the data histogram and circumvents calculation of the complete
-            # histogram in any new iteration.
-            if new_iter_max_bin_index == iter_max_bin_index:
-                upper_tail_hist[-1] += 1
-            else:
-                upper_tail_hist = np.append(upper_tail_hist, np.zeros([new_iter_max_bin_index-iter_max_bin_index]))
-                upper_tail_hist[-1] += 1
-                iter_max_bin_index = new_iter_max_bin_index
-                upper_tail_index_new = int(np.floor(0.5 * hist_argmax + 0.5 * iter_max_bin_index))
-                upper_tail_hist = upper_tail_hist[upper_tail_index_new-upper_tail_index:]
-                upper_tail_index = upper_tail_index_new
-
-            # fitting
-            lambdA, _ = curve_fit(fit_function, xdata=binzenters[upper_tail_index:iter_max_bin_index],
-                                  ydata=upper_tail_hist,
-                                  p0=[-np.log(alpha/resids[iter_index])])
-
-            crit_val = neg_log_alpha / lambdA
-            test_val = resids[iter_index]
-
-    # flag them!
-    to_flag_index = val_frame.index[sorted_i[iter_index:]]
+        to_flag_index = _expFit(val_frame,
+                                scoring_method=scoring_method,
+                                n_neighbors=n_neighbors,
+                                iter_start=iter_start,
+                                alpha=alpha,
+                                bin_frac=bin_frac)
 
     for var in fields:
         flagger = flagger.setFlags(var, to_flag_index, **kwargs)
 
     return data, flagger
 
-
 @register("spikes_limitRaise")
 def flagSpikes_limitRaise(
     data, field, flagger, thresh, raise_window, intended_freq, average_window=None, mean_raise_factor=2, min_slope=None,
--
GitLab

From 34c13850f2be2e6f0d13d26719daf5292453b9ac Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Thu, 9 Apr 2020 19:54:56 +0200
Subject: [PATCH 04/11] added optimized binning procedure to expFit

---
 saqc/funcs/spike_detection.py      | 23 ++++++++++++++-------
 test/funcs/test_spike_detection.py |  2 +-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index fa80027b4..e3d4db26e 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -27,10 +27,13 @@ def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbo
     partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
     to_flag = []
     for _, partition in partitions:
-        resids = kNNfunc(partition.values, n_neighbors=n_neighbors, algorithm='ball_tree')
+        if partition.empty:
+            continue
+        sample_size = partition.shape[0]
+        nn_neighbors = min(n_neighbors, sample_size)
+        resids = kNNfunc(partition.values, n_neighbors=nn_neighbors-1, algorithm='ball_tree')
         sorted_i = resids.argsort()
         resids = resids[sorted_i]
-        sample_size = resids.shape[0]
         gaps = np.append(0, np.diff(resids))
 
         tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
@@ -70,7 +73,13 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
     def fit_function(x, lambd):
         return lambd * np.exp(-lambd * x)
     # initialise sampling bins
-    binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(data_len / bin_frac)))
+    if isinstance(bin_frac, int):
+        binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(data_len / bin_frac)))
+    elif bin_frac in ['auto', 'fd', 'doane', 'scott', 'stone', 'rice', 'sturges', 'sqrt']:
+        binz = np.histogram_bin_edges(resids, bins=bin_frac)
+    else:
+        raise ValueError("Can't interpret {} as a binning technique.".format(bin_frac))
+
     binzenters = np.array([0.5 * (binz[i] + binz[i + 1]) for i in range(len(binz) - 1)])
     # initialize full histogram:
     full_hist, binz = np.histogram(resids, bins=binz)
@@ -116,8 +125,8 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
@register("spikes_oddWater") -def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0.05, bin_frac=10, n_neighbors=2, - iter_start=0.5, scoring_method='kNNMaxGap', thresholding='stray', stray_partition=None, +def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0.05, binning='auto', n_neighbors=2, + iter_start=0.5, scoring_method='kNNMaxGap', threshing='stray', stray_partition=None, **kwargs): trafo = composeFunction(trafo.split(',')) @@ -134,7 +143,7 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0 val_frame.dropna(inplace=True) val_frame = val_frame.transform(trafo) - if thresholding == 'stray': + if threshing == 'stray': to_flag_index =_stray(val_frame, partition_freq=stray_partition, scoring_method=scoring_method, @@ -147,7 +156,7 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0 n_neighbors=n_neighbors, iter_start=iter_start, alpha=alpha, - bin_frac=bin_frac) + bin_frac=binning) for var in fields: flagger = flagger.setFlags(var, to_flag_index, **kwargs) diff --git a/test/funcs/test_spike_detection.py b/test/funcs/test_spike_detection.py index 01679c695..8f434bcf8 100644 --- a/test/funcs/test_spike_detection.py +++ b/test/funcs/test_spike_detection.py @@ -113,7 +113,7 @@ def test_flagSpikesOddWater(dat, flagger): data = pd.DataFrame({'data1': data1.squeeze(), 'data2': data2.squeeze()}, index=data1.index) flagger = flagger.initFlags(data) _, flagger_result = flagSpikes_oddWater( - data, field, flagger, fields=fields, bin_frac=50, trafo='np.log', + data, field, flagger, fields=fields, binning=50, trafo='np.log', iter_start=0.95, n_neighbors=10 ) assert flagger_result.isFlagged(fields[0])[characteristics['raise']].all() -- GitLab From a897ec77b33d65e2c9b4e73a67574aa2db2736f7 Mon Sep 17 00:00:00 2001 From: Peter Luenenschloss <peter.luenenschloss@ufz.de> Date: Tue, 14 Apr 2020 13:38:52 +0200 Subject: [PATCH 05/11] changed spikedetection architecture --- saqc/funcs/spike_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py index e3d4db26e..c209cdac5 100644 --- a/saqc/funcs/spike_detection.py +++ b/saqc/funcs/spike_detection.py @@ -44,10 +44,12 @@ def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbo ghat[i] = sum((tail_indices / (tail_size - 1)) * gaps[i - tail_indices + 1]) log_alpha = np.log(1 / alpha) + log_inv_alpha = np.log(1 / (1-alpha)) for iter_index in range(i_start - 1, sample_size): if gaps[iter_index] > log_alpha * ghat[iter_index]: break + to_flag = np.append(to_flag, list(partition.index[sorted_i[iter_index:]])) return to_flag -- GitLab From 3a30e5a6c2850f0f18bc1235dcd407bdd4e222f2 Mon Sep 17 00:00:00 2001 From: Peter Luenenschloss <peter.luenenschloss@ufz.de> Date: Mon, 20 Apr 2020 12:00:39 +0200 Subject: [PATCH 06/11] added periods number based slicing to oddWaters Stray algorithm --- saqc/funcs/spike_detection.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py index c209cdac5..0c04e8ba9 100644 --- a/saqc/funcs/spike_detection.py +++ b/saqc/funcs/spike_detection.py @@ -20,14 +20,20 @@ from saqc.lib.tools import ( ) -def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.5, +def _stray(val_frame, partition_freq=None, partition_min=0, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.5, alpha=0.05): 
     kNNfunc = getattr(ts_ops, scoring_method)
-    partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
+    if isinstance(partition_freq, str):
+        partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
+    else:
+        grouper_series = pd.Series(data=np.arange(0, val_frame.shape[0]), index=val_frame.index)
+        grouper_series = grouper_series.transform(lambda x: int(np.floor(x / partition_freq)))
+        partitions = val_frame.groupby(grouper_series)
+
     to_flag = []
     for _, partition in partitions:
-        if partition.empty:
+        if partition.empty | (partition.shape[0] < partition_min):
             continue
         sample_size = partition.shape[0]
         nn_neighbors = min(n_neighbors, sample_size)
@@ -50,7 +56,6 @@ def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbo
             ghat[i] = sum((tail_indices / (tail_size - 1)) * gaps[i - tail_indices + 1])
 
         log_alpha = np.log(1 / alpha)
-        log_inv_alpha = np.log(1 / (1-alpha))
         for iter_index in range(i_start - 1, sample_size):
             if gaps[iter_index] > log_alpha * ghat[iter_index]:
                 break
@@ -146,11 +151,11 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
     val_frame = val_frame.transform(trafo)
 
     if threshing == 'stray':
-        to_flag_index =_stray(val_frame,
-                              partition_freq=stray_partition,
-                              scoring_method=scoring_method,
-                              n_neighbors=n_neighbors,
-                              iter_start=iter_start)
+        to_flag_index = _stray(val_frame,
+                               partition_freq=stray_partition,
+                               scoring_method=scoring_method,
+                               n_neighbors=n_neighbors,
+                               iter_start=iter_start)
 
     else:
         to_flag_index = _expFit(val_frame,
--
GitLab

From 73210d6afeade8a066835ba897e24b6816d86039 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 20 Apr 2020 12:16:09 +0200
Subject: [PATCH 07/11] minor cleanup

---
 saqc/funcs/spike_detection.py      | 13 +++++++++----
 test/funcs/test_spike_detection.py |  4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index 0c04e8ba9..ee823198e 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -24,6 +24,10 @@ def _stray(val_frame, partition_freq=None, partition_min=0, scoring_method='kNNM
            alpha=0.05):
 
     kNNfunc = getattr(ts_ops, scoring_method)
+    # partitioning
+    if not partition_freq:
+        partition_freq = val_frame.shape[0]
+
     if isinstance(partition_freq, str):
         partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
     else:
@@ -31,6 +35,7 @@ def _stray(val_frame, partition_freq=None, partition_min=0, scoring_method='kNNM
         grouper_series = grouper_series.transform(lambda x: int(np.floor(x / partition_freq)))
         partitions = val_frame.groupby(grouper_series)
 
+    # calculate flags for every partition
     to_flag = []
     for _, partition in partitions:
         if partition.empty | (partition.shape[0] < partition_min):
@@ -131,10 +136,10 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
     return val_frame.index[sorted_i[iter_index:]]
 
 
-@register("spikes_oddWater")
-def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0.05, binning='auto', n_neighbors=2,
-                        iter_start=0.5, scoring_method='kNNMaxGap', threshing='stray', stray_partition=None,
-                        **kwargs):
+@register("spikes_multivariateKNNScoring")
+def flagSpikes_multivariateKNNScoring(data, field, flagger, fields, trafo='normScale', alpha=0.05, binning='auto', n_neighbors=2,
+                                      iter_start=0.5, scoring_method='kNNMaxGap', threshing='stray', stray_partition=None,
+                                      **kwargs):
diff --git a/test/funcs/test_spike_detection.py b/test/funcs/test_spike_detection.py
index 8f434bcf8..878f1ce05 100644
--- a/test/funcs/test_spike_detection.py
+++ b/test/funcs/test_spike_detection.py
@@ -11,7 +11,7 @@ from saqc.funcs.spike_detection import (
     flagSpikes_slidingZscore,
     flagSpikes_basic,
     flagSpikes_limitRaise,
-    flagSpikes_oddWater
+    flagSpikes_multivariateKNNScoring
 )
 
 from test.common import TESTFLAGGER
@@ -112,7 +112,7 @@ def test_flagSpikesOddWater(dat, flagger):
     fields = ['data1', 'data2']
     data = pd.DataFrame({'data1': data1.squeeze(), 'data2': data2.squeeze()}, index=data1.index)
     flagger = flagger.initFlags(data)
-    _, flagger_result = flagSpikes_oddWater(
+    _, flagger_result = flagSpikes_multivariateKNNScoring(
         data, field, flagger, fields=fields, binning=50, trafo='np.log',
         iter_start=0.95, n_neighbors=10
     )
--
GitLab

From 0dd2f40f6ca99f8c6d112b344d69ce89a2acddba Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 20 Apr 2020 13:42:35 +0200
Subject: [PATCH 08/11] multivariateFlagging parameter order/names

---
 saqc/funcs/spikes_detection.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py
index 22af71889..3465053f3 100644
--- a/saqc/funcs/spikes_detection.py
+++ b/saqc/funcs/spikes_detection.py
@@ -136,9 +136,10 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
 @register()
-def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, binning='auto', n_neighbors=2,
-                                     iter_start=0.5, scoring_method='kNNMaxGap', threshing='stray', stray_partition=None,
-                                     **kwargs):
+def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, n_neighbors=10,
+                                     scoring_method='kNNMaxGap', iter_start=0.5, threshing='stray',
+                                     expfit_binning='auto', stray_partition=None, stray_partition_min=0,
+                                     **kwargs):
 
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
@@ -157,6 +158,7 @@ def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normSc
     if threshing == 'stray':
         to_flag_index = _stray(val_frame,
                                partition_freq=stray_partition,
+                               partition_min=stray_partition_min,
                                scoring_method=scoring_method,
                                n_neighbors=n_neighbors,
                                iter_start=iter_start)
@@ -167,7 +169,7 @@ def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normSc
                                 n_neighbors=n_neighbors,
                                 iter_start=iter_start,
                                 alpha=alpha,
-                                bin_frac=binning)
+                                bin_frac=expfit_binning)
 
     for var in fields:
         flagger = flagger.setFlags(var, to_flag_index, **kwargs)
--
GitLab

From 07453251856fc41c83667aeb10a357c6b8a566ef Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 20 Apr 2020 14:02:02 +0200
Subject: [PATCH 09/11] submodule confusion

---
 dios | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dios b/dios
index d5a80af46..0665773c1 160000
--- a/dios
+++ b/dios
@@ -1 +1 @@
-Subproject commit d5a80af469c0540c91e5b4ddd1c0327b11a0d214
+Subproject commit 0665773c1a86b018f79db7983494c198b77620fc
--
GitLab

From 2579967a02f35da8d566942d8ec5524b2e5adfc3 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Tue, 21 Apr 2020 10:19:13 +0200
Subject: [PATCH 10/11] fixed dios import bug

---
 test/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/common.py b/test/common.py
index c56007fad..c0da2a204 100644
--- a/test/common.py
+++ b/test/common.py
@@ -6,7 +6,7 @@ import re
 
 import numpy as np
 import pandas as pd
-import dios.dios as dios
+import dios
 
 from saqc.core.core import readConfig
 from saqc.flagger import (
--
GitLab

From 6cbea7bd4fd626be60dff67d73fc2100b499eebf Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Tue, 21 Apr 2020 10:28:22 +0200
Subject: [PATCH 11/11] some renaming

---
 saqc/funcs/spikes_detection.py      |  8 ++++----
 saqc/lib/tools.py                   | 10 +++++-----
 test/funcs/test_spikes_detection.py |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py
index 3465053f3..61eb4fb49 100644
--- a/saqc/funcs/spikes_detection.py
+++ b/saqc/funcs/spikes_detection.py
@@ -136,10 +136,10 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
 @register()
-def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, n_neighbors=10,
-                                     scoring_method='kNNMaxGap', iter_start=0.5, threshing='stray',
-                                     expfit_binning='auto', stray_partition=None, stray_partition_min=0,
-                                     **kwargs):
+def spikes_flagMultivarScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, n_neighbors=10,
+                              scoring_method='kNNMaxGap', iter_start=0.5, threshing='stray',
+                              expfit_binning='auto', stray_partition=None, stray_partition_min=0,
+                              **kwargs):
 
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 374c880ec..18d9048a7 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -27,17 +27,17 @@ SAQC_OPERATORS = {
     "max": np.max,
     "first": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").first,
     "last": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").last,
-    "delta_t": ts_ops.deltaT,
+    "deltaT": ts_ops.deltaT,
     "id": ts_ops.identity,
     "diff": ts_ops.difference,
     "relDiff": ts_ops.relativeDifference,
     "deriv": ts_ops.derivative,
-    "roc": ts_ops.rateOfChange,
+    "rateOfChange": ts_ops.rateOfChange,
     "scale": ts_ops.scale,
     "normScale": ts_ops.normScale,
-    "stdByMean": ts_ops.standardizeByMean,
-    "stdByMedian": ts_ops.standardizeByMedian,
-    "zlog": ts_ops.zeroLog
+    "meanStandardize": ts_ops.standardizeByMean,
+    "medianStandardize": ts_ops.standardizeByMedian,
+    "zLog": ts_ops.zeroLog
 }

diff --git a/test/funcs/test_spikes_detection.py b/test/funcs/test_spikes_detection.py
index 06b3f3fcc..77f729c2a 100644
--- a/test/funcs/test_spikes_detection.py
+++ b/test/funcs/test_spikes_detection.py
@@ -12,7 +12,7 @@ from saqc.funcs.spikes_detection import (
     spikes_flagSlidingZscore,
     spikes_flagBasic,
     spikes_flagRaise,
-    spikes_flagMultivariateKNNScores
+    spikes_flagMultivarScores
 )
 
 from test.common import TESTFLAGGER
@@ -111,7 +111,7 @@ def test_flagSpikesOddWater(dat, flagger):
     s2 = pd.Series(data=s2.values, index=s1.index)
     data = dios.DictOfSeries([s1, s2], columns=["data1", "data2"])
     flagger = flagger.initFlags(data)
-    _, flagger_result = spikes_flagMultivariateKNNScores(
+    _, flagger_result = spikes_flagMultivarScores(
         data, field, flagger, fields=fields, binning=50, trafo='np.log',
         iter_start=0.95, n_neighbors=10
     )
--
GitLab
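
The pieces these patches assemble can each be exercised in isolation. The gap-based cutoff that patches 01 and 03 iterate on works on a sorted vector of outlier scores: take first differences ("gaps"), estimate the expected gap at each position as a weighted average of the preceding tail_size gaps (ghat), and start flagging at the first gap exceeding log(1/alpha) times that estimate. The following is a minimal, self-contained sketch of that rule; the variable names mirror the diffs, but stray_cutoff and the demo scores are invented for illustration and are not part of the saqc API.

import numpy as np

def stray_cutoff(resids, iter_start=0.5, alpha=0.05):
    """Return the index into the sorted scores where flagging starts."""
    resids = np.sort(resids)
    sample_size = resids.shape[0]
    gaps = np.append(0, np.diff(resids))

    tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
    tail_indices = np.arange(2, tail_size + 1)
    i_start = int(max(np.floor(sample_size * iter_start), 1) + 1)

    # ghat[i]: weighted average of the tail_size gaps preceding position i
    ghat = np.full(sample_size, np.nan)
    for i in range(i_start - 1, sample_size):
        ghat[i] = sum((tail_indices / (tail_size - 1)) * gaps[i - tail_indices + 1])

    # flag from the first gap that is "too large" relative to the estimate
    log_alpha = np.log(1 / alpha)
    for iter_index in range(i_start - 1, sample_size):
        if gaps[iter_index] > log_alpha * ghat[iter_index]:
            break
    return iter_index

# ten tightly packed scores and two clearly detached ones
scores = np.array([0.10, 0.11, 0.12, 0.13, 0.15, 0.16, 0.18, 0.20, 0.21, 0.22, 1.5, 1.7])
cut = stray_cutoff(scores)
print(np.sort(scores)[cut:])   # -> [1.5 1.7], the detached tail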
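
The scoring side lives in ts_operators.kNN/kNNMaxGap, which wrap sklearn's NearestNeighbors; the diffs show the query (nbrs.kneighbors()) but cut off before the aggregation. A plausible reading of the name — score each point by the largest jump in its sorted neighbor-distance vector — looks like the sketch below; the "max gap" aggregation and the helper name knn_max_gap are assumptions, not confirmed by the visible diff.

import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_max_gap(in_arr, n_neighbors=10, algorithm='ball_tree'):
    in_arr = np.asarray(in_arr, dtype=float)
    if in_arr.ndim == 1:
        in_arr = in_arr.reshape(-1, 1)
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm=algorithm).fit(in_arr)
    dist, _ = nbrs.kneighbors()  # distances to the n nearest neighbors (self excluded)
    # prepend a zero column so the jump to the first neighbor counts as a gap too
    gaps = np.diff(np.concatenate([np.zeros((dist.shape[0], 1)), dist], axis=1), axis=1)
    return gaps.max(axis=1)

rng = np.random.default_rng(1)
vals = np.r_[rng.normal(0, 1, 100), 12.0]      # one far-out point at index 100
print(knn_max_gap(vals, n_neighbors=10).argmax())  # -> 100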
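
The _expFit branch is harder to excerpt because it maintains the upper-tail histogram incrementally, but the underlying idea is compact: fit an exponential density to the histogram of scores, take -log(alpha)/lambda as the critical value, and grow the inlier set until a score exceeds it. Here is a deliberately simplified sketch that refits the whole (normalized) histogram each step instead of expanding a raw tail histogram in place — both simplifications deviate from the patch, and exp_fit_cutoff is an illustrative name only.

import numpy as np
from scipy.optimize import curve_fit

def exp_fit_cutoff(resids, iter_start=0.9, alpha=0.05, bins='auto'):
    resids = np.sort(resids)
    iter_index = int(np.floor(resids.size * iter_start))

    def fit_function(x, lambd):
        # exponential probability density
        return lambd * np.exp(-lambd * x)

    binz = np.histogram_bin_edges(resids, bins=bins)
    centers = 0.5 * (binz[:-1] + binz[1:])

    crit_val, test_val = np.inf, 0.0
    while (test_val < crit_val) and (iter_index < resids.size - 1):
        iter_index += 1
        # refit on the scores accepted so far; density=True normalizes the counts
        hist, _ = np.histogram(resids[:iter_index], bins=binz, density=True)
        lambd, _ = curve_fit(fit_function, xdata=centers, ydata=hist,
                             p0=[1.0 / np.mean(resids[:iter_index])])
        crit_val = -np.log(alpha) / lambd[0]
        test_val = resids[iter_index]
    return iter_index

rng = np.random.default_rng(0)
scores = np.r_[rng.exponential(1.0, 200), 8.0, 9.0]
cut = exp_fit_cutoff(scores)
print(np.sort(scores)[cut:])  # everything beyond the fitted critical value

With alpha=0.05 a small fraction of legitimate exponential tail values lands above the critical value alongside the planted outliers — that is the method's intended false-positive budget, not a bug in the sketch.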
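
Patch 06's dispatch on the partition argument — a pandas offset alias means calendar slices, an integer means fixed-size row chunks — can be sketched as follows. This uses a floor-divide to build the group labels, which computes the same labels as the patch's transform(lambda x: int(np.floor(x / partition_freq))); make_partitions is an illustrative name, not the saqc API.

import numpy as np
import pandas as pd

def make_partitions(val_frame, partition_freq):
    if isinstance(partition_freq, str):
        # temporal slicing, e.g. '10D' -> one partition per 10 days
        return val_frame.groupby(pd.Grouper(freq=partition_freq))
    # integer -> consecutive chunks of partition_freq rows
    labels = pd.Series(np.arange(val_frame.shape[0]) // partition_freq,
                       index=val_frame.index)
    return val_frame.groupby(labels)

idx = pd.date_range('2020-01-01', periods=10, freq='D')
df = pd.DataFrame({'x': np.arange(10)}, index=idx)
print([len(g) for _, g in make_partitions(df, '5D')])  # -> [5, 5]
print([len(g) for _, g in make_partitions(df, 4)])     # -> [4, 4, 2]

Patch 07 then adds the fallback that an unset partition defaults to the full frame length, i.e. a single partition, which is why stray_partition=None keeps the original single-pass behaviour.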
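
Finally, patch 02 wires three new transformations into SAQC_OPERATORS (renamed again in patch 11). Their effect is easy to demonstrate stand-alone: median/IQR standardization keeps an outlier visibly extreme without letting it inflate the scale estimate, and zeroLog maps zeros to NaN instead of -inf. The function bodies below are taken from the diff; the snake_case names and the demo series are invented for the example.

import numpy as np
import pandas as pd
from scipy.stats import iqr

def standardize_by_median(ts):
    # robust z-score: centre with the median, scale with the IQR
    return (ts - ts.median()) / iqr(ts, nan_policy='omit')

def zero_log(ts):
    # log-transform that turns -inf (from zero values) into NaN
    log_ts = np.log(ts)
    log_ts[log_ts == -np.inf] = np.nan
    return log_ts

ts = pd.Series([1.0, 2.0, 0.0, 3.0, 1000.0])
print(zero_log(ts))               # the 0.0 becomes NaN rather than -inf
print(standardize_by_median(ts))  # the 1000.0 stays extreme; the scale does not blow up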