From d3f881b71e66bc9e05cfb2f924682949f1f869c3 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Sat, 4 Apr 2020 08:06:12 +0200
Subject: [PATCH 01/11] minor cleanup in spike detection

---
 saqc/funcs/spike_detection.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index 6c8102790..848e641b5 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -55,15 +55,14 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
         tail_size = int(max(min(50, np.floor(sample_size/4)), 2))
         tail_indices = np.arange(2, tail_size + 1)
         i_start = int(max(np.floor(sample_size*iter_start), 1) + 1)
-        sum(tail_indices/(tail_size-1)*gaps[i_start-tail_indices+1])
         ghat = np.array([np.nan]*sample_size)
         for i in range(i_start-1, sample_size):
-            ghat[i] = sum(tail_indices/(tail_size-1)*gaps[i-tail_indices+1])
+            ghat[i] = sum((tail_indices/(tail_size-1))*gaps[i-tail_indices+1])
 
         log_alpha = np.log(1/alpha)
         for iter_index in range(i_start-1, sample_size):
-           if gaps[iter_index] > log_alpha*ghat[iter_index]:
-               break
+            if gaps[iter_index] > log_alpha*ghat[iter_index]:
+                break
 
 
     else:
-- 
GitLab
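
For context, the loop touched above is the gap-average tail estimator used by the STRAY-style thresholding introduced later in this series. A minimal standalone sketch (the function name is illustrative; `resids` is assumed to hold kNN-based outlier scores):

    import numpy as np

    def gapAverageCutoff(resids, iter_start=0.5, alpha=0.05):
        # the estimator works on the scores in ascending order
        resids = np.sort(np.asarray(resids, dtype=float))
        sample_size = resids.shape[0]
        gaps = np.append(0, np.diff(resids))  # gaps[i] = resids[i] - resids[i-1]

        # ghat[i] is a weighted average over a trailing window of score gaps
        tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
        tail_indices = np.arange(2, tail_size + 1)
        i_start = int(max(np.floor(sample_size * iter_start), 1) + 1)
        ghat = np.full(sample_size, np.nan)
        for i in range(i_start - 1, sample_size):
            ghat[i] = sum((tail_indices / (tail_size - 1)) * gaps[i - tail_indices + 1])

        # stop at the first gap exceeding log(1/alpha) times the local gap average
        log_alpha = np.log(1 / alpha)
        for iter_index in range(i_start - 1, sample_size):
            if gaps[iter_index] > log_alpha * ghat[iter_index]:
                break
        return iter_index  # scores from this sorted position onward count as outliers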


From 51d358baf28c07d3cf674e32669d0aeab4998d58 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 6 Apr 2020 19:53:37 +0200
Subject: [PATCH 02/11] minor extensions (trafos)

---
 saqc/funcs/spike_detection.py |  7 ++++---
 saqc/lib/tools.py             |  9 +++++++--
 saqc/lib/ts_operators.py      | 14 ++++++++++++++
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index 848e641b5..a8b3d5ce9 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -27,17 +27,18 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
 
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
-    val_frame = trafo(data[fields[0]])
+    val_frame = data[fields[0]]
 
     for var in fields[1:]:
-        val_frame = pd.merge(val_frame, trafo(data[var]),
-                             how='outer',
+        val_frame = pd.merge(val_frame, data[var],
+                             how='inner',
                              left_index=True,
                              right_index=True
                              )
 
     data_len = val_frame.index.size
     val_frame.dropna(inplace=True)
+    val_frame = val_frame.transform(trafo)
 
     # KNN calculation
     kNNfunc = getattr(ts_ops, scoring_method)
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 2e8f13598..9fefbc989 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -9,6 +9,7 @@ import pandas as pd
 import numba as nb
 import saqc.lib.ts_operators as ts_ops
 import scipy
+import sklearn
 from functools import reduce, partial
 from saqc.lib.types import T, PandasLike
 
@@ -31,13 +32,17 @@ SAQC_OPERATORS = {
     "deriv": ts_ops.derivative,
     "roc": ts_ops.rateOfChange,
     "scale": ts_ops.scale,
-    "normScale": ts_ops.normScale
+    "normScale": ts_ops.normScale,
+    "stdByMean": ts_ops.standardizeByMean,
+    "stdByMedian": ts_ops.standardizeByMedian,
+    "zlog": ts_ops.zeroLog
 }
 
 
 OP_MODULES = {'pd': pd,
               'np': np,
-              'scipy': scipy
+              'scipy': scipy,
+              'sklearn': sklearn
               }
 
 
diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py
index 1e1a4545c..3a3034f59 100644
--- a/saqc/lib/ts_operators.py
+++ b/saqc/lib/ts_operators.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import numpy as np
 from sklearn.neighbors import NearestNeighbors
+from scipy.stats import iqr
 
 
 
@@ -28,6 +29,11 @@ def identity(ts):
     return ts
 
 
+def zeroLog(ts):
+    log_ts = np.log(ts)
+    log_ts[log_ts == -np.inf] = np.nan
+    return log_ts
+
 def difference(ts):
     return pd.Series.diff(ts)
 
@@ -84,6 +90,14 @@ def kNN(in_arr, n_neighbors, algorithm='ball_tree'):
     return nbrs.kneighbors()
 
 
+def standardizeByMean(ts):
+    return (ts - ts.mean())/ts.std()
+
+
+def standardizeByMedian(ts):
+    return (ts - ts.median())/iqr(ts, nan_policy='omit')
+
+
 def kNNMaxGap(in_arr, n_neighbors, algorithm='ball_tree'):
     dist, *_ = kNN(in_arr, n_neighbors, algorithm=algorithm)
     sample_size = dist.shape[0]
-- 
GitLab
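
The three operators registered in this patch are plain columnwise transforms. A standalone sketch of what each does (example series invented):

    import numpy as np
    import pandas as pd
    from scipy.stats import iqr

    ts = pd.Series([0.0, 1.0, 10.0, 100.0])

    # zeroLog: log-transform, mapping log(0) = -inf to NaN
    log_ts = np.log(ts)
    log_ts[log_ts == -np.inf] = np.nan

    # standardizeByMean: classic z-scoring
    z_mean = (ts - ts.mean()) / ts.std()

    # standardizeByMedian: robust alternative, scaling by the interquartile range
    z_median = (ts - ts.median()) / iqr(ts, nan_policy='omit')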


From b89266d236062b17e56fb4dac9027482271c46d4 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Thu, 9 Apr 2020 18:38:19 +0200
Subject: [PATCH 03/11] made stray capable of looping over data slices

---
 dios                          |   1 +
 saqc/funcs/spike_detection.py | 194 +++++++++++++++++++---------------
 2 files changed, 111 insertions(+), 84 deletions(-)
 create mode 160000 dios

diff --git a/dios b/dios
new file mode 160000
index 000000000..d5a80af46
--- /dev/null
+++ b/dios
@@ -0,0 +1 @@
+Subproject commit d5a80af469c0540c91e5b4ddd1c0327b11a0d214
diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index a8b3d5ce9..fa80027b4 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -4,11 +4,9 @@
 
 import numpy as np
 import pandas as pd
-
 from scipy.signal import savgol_filter
 from scipy.stats import zscore
 from scipy.optimize import curve_fit
-from sklearn.neighbors import NearestNeighbors
 from saqc.funcs.register import register
 import numpy.polynomial.polynomial as poly
 import numba
@@ -21,9 +19,106 @@ from saqc.lib.tools import (
     composeFunction
 )
 
+
+def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.5,
+           alpha=0.05):
+
+    kNNfunc = getattr(ts_ops, scoring_method)
+    partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
+    to_flag = []
+    for _, partition in partitions:
+        resids = kNNfunc(partition.values, n_neighbors=n_neighbors, algorithm='ball_tree')
+        sorted_i = resids.argsort()
+        resids = resids[sorted_i]
+        sample_size = resids.shape[0]
+        gaps = np.append(0, np.diff(resids))
+
+        tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
+        tail_indices = np.arange(2, tail_size + 1)
+        i_start = int(max(np.floor(sample_size * iter_start), 1) + 1)
+        ghat = np.array([np.nan] * sample_size)
+        for i in range(i_start - 1, sample_size):
+            ghat[i] = sum((tail_indices / (tail_size - 1)) * gaps[i - tail_indices + 1])
+
+        log_alpha = np.log(1 / alpha)
+        for iter_index in range(i_start - 1, sample_size):
+            if gaps[iter_index] > log_alpha * ghat[iter_index]:
+                break
+
+        to_flag = np.append(to_flag, list(partition.index[sorted_i[iter_index:]]))
+
+    return to_flag
+
+
+def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.5,
+           alpha=0.05, bin_frac=10):
+
+
+    kNNfunc = getattr(ts_ops, scoring_method)
+    resids = kNNfunc(val_frame.values, n_neighbors=n_neighbors, algorithm='ball_tree')
+    data_len = resids.shape[0]
+    # sorting
+    sorted_i = resids.argsort()
+    resids = resids[sorted_i]
+    iter_index = int(np.floor(resids.size * iter_start))
+    # initialize condition variables:
+    crit_val = np.inf
+    test_val = 0
+    neg_log_alpha = - np.log(alpha)
+
+    # define exponential dist density function:
+    def fit_function(x, lambd):
+        return lambd * np.exp(-lambd * x)
+    # initialize sampling bins
+    binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(data_len / bin_frac)))
+    binzenters = np.array([0.5 * (binz[i] + binz[i + 1]) for i in range(len(binz) - 1)])
+    # initialize full histogram:
+    full_hist, binz = np.histogram(resids, bins=binz)
+    # check that the start index is high enough (it must point at a resids value beyond the histogram maximum):
+    hist_argmax = full_hist.argmax()
+
+    if hist_argmax >= findIndex(binz, resids[iter_index-1], 0):
+        raise ValueError("Either the data histogram is too strangely shaped for oddWater OD detection - "
+                         "or a too low value for 'iter_start' was passed "
+                         "(iter_start better be much greater 0.5)")
+    # GO!
+    iter_max_bin_index = findIndex(binz, resids[iter_index-1], 0)
+    upper_tail_index = int(np.floor(0.5 * hist_argmax + 0.5 * iter_max_bin_index))
+    resids_tail_index = findIndex(resids, binz[upper_tail_index], 0)
+    upper_tail_hist, bins = np.histogram(resids[resids_tail_index:iter_index],
+                                             bins=binz[upper_tail_index:iter_max_bin_index + 1])
+
+    while (test_val < crit_val) & (iter_index < resids.size-1):
+        iter_index += 1
+        new_iter_max_bin_index = findIndex(binz, resids[iter_index-1], 0)
+
+        # the following if/else block "manually" extends the data histogram, avoiding recomputation
+        # of the complete histogram in every iteration.
+        if new_iter_max_bin_index == iter_max_bin_index:
+            upper_tail_hist[-1] += 1
+        else:
+            upper_tail_hist = np.append(upper_tail_hist, np.zeros([new_iter_max_bin_index-iter_max_bin_index]))
+            upper_tail_hist[-1] += 1
+            iter_max_bin_index = new_iter_max_bin_index
+            upper_tail_index_new = int(np.floor(0.5 * hist_argmax + 0.5 * iter_max_bin_index))
+            upper_tail_hist = upper_tail_hist[upper_tail_index_new-upper_tail_index:]
+            upper_tail_index = upper_tail_index_new
+
+        # fitting
+        lambdA, _ = curve_fit(fit_function, xdata=binzenters[upper_tail_index:iter_max_bin_index],
+                                ydata=upper_tail_hist,
+                                p0=[-np.log(alpha/resids[iter_index])])
+
+        crit_val = neg_log_alpha / lambdA
+        test_val = resids[iter_index]
+
+    return val_frame.index[sorted_i[iter_index:]]
+
+
 @register("spikes_oddWater")
 def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0.05, bin_frac=10, n_neighbors=2,
-                        iter_start=0.5, scoring_method='kNNMaxGap', lambda_estimator='gap_average', **kwargs):
+                        iter_start=0.5, scoring_method='kNNMaxGap', thresholding='stray', stray_partition=None,
+                        **kwargs):
 
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
@@ -36,98 +131,29 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
                              right_index=True
                              )
 
-    data_len = val_frame.index.size
     val_frame.dropna(inplace=True)
     val_frame = val_frame.transform(trafo)
 
-    # KNN calculation
-    kNNfunc = getattr(ts_ops, scoring_method)
-    resids = kNNfunc(val_frame.values, n_neighbors=n_neighbors, algorithm='ball_tree')
-
-    # sorting
-    sorted_i = resids.argsort()
-    resids = resids[sorted_i]
-
-    # iter_start
-
-    if lambda_estimator == 'gap_average':
-        sample_size = resids.shape[0]
-        gaps = np.append(0, np.diff(resids))
-        tail_size = int(max(min(50, np.floor(sample_size/4)), 2))
-        tail_indices = np.arange(2, tail_size + 1)
-        i_start = int(max(np.floor(sample_size*iter_start), 1) + 1)
-        ghat = np.array([np.nan]*sample_size)
-        for i in range(i_start-1, sample_size):
-            ghat[i] = sum((tail_indices/(tail_size-1))*gaps[i-tail_indices+1])
-
-        log_alpha = np.log(1/alpha)
-        for iter_index in range(i_start-1, sample_size):
-            if gaps[iter_index] > log_alpha*ghat[iter_index]:
-                break
-
+    if thresholding == 'stray':
+        to_flag_index =_stray(val_frame,
+                              partition_freq=stray_partition,
+                              scoring_method=scoring_method,
+                              n_neighbors=n_neighbors,
+                              iter_start=iter_start)
 
     else:
-        # (estimator == 'exponential_fit')
-        iter_index = int(np.floor(resids.size * iter_start))
-        # initialize condition variables:
-        crit_val = np.inf
-        test_val = 0
-        neg_log_alpha = - np.log(alpha)
-
-        # define exponential dist density function:
-        def fit_function(x, lambd):
-            return lambd * np.exp(-lambd * x)
-        # initialise sampling bins
-        binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(data_len / bin_frac)))
-        binzenters = np.array([0.5 * (binz[i] + binz[i + 1]) for i in range(len(binz) - 1)])
-        # inititialize full histogram:
-        full_hist, binz = np.histogram(resids, bins=binz)
-        # check if start index is sufficiently high (pointing at resids value beyond histogram maximum at least):
-        hist_argmax = full_hist.argmax()
-
-        if hist_argmax >= findIndex(binz, resids[iter_index-1], 0):
-            raise ValueError("Either the data histogram is too strangely shaped for oddWater OD detection - "
-                             "or a too low value for iter_start was passed (iter_start better be greater 0.5)")
-        # GO!
-        iter_max_bin_index = findIndex(binz, resids[iter_index-1], 0)
-        upper_tail_index = int(np.floor(0.5 * hist_argmax + 0.5 * iter_max_bin_index))
-        resids_tail_index = findIndex(resids, binz[upper_tail_index], 0)
-        upper_tail_hist, bins = np.histogram(resids[resids_tail_index:iter_index],
-                                             bins=binz[upper_tail_index:iter_max_bin_index + 1])
-
-        while (test_val < crit_val) & (iter_index < resids.size-1):
-            iter_index += 1
-            new_iter_max_bin_index = findIndex(binz, resids[iter_index-1], 0)
-
-            # following if/else block "manually" expands the data histogram and circumvents calculation of the complete
-            # histogram in any new iteration.
-            if new_iter_max_bin_index == iter_max_bin_index:
-                upper_tail_hist[-1] += 1
-            else:
-                upper_tail_hist = np.append(upper_tail_hist, np.zeros([new_iter_max_bin_index-iter_max_bin_index]))
-                upper_tail_hist[-1] += 1
-                iter_max_bin_index = new_iter_max_bin_index
-                upper_tail_index_new = int(np.floor(0.5 * hist_argmax + 0.5 * iter_max_bin_index))
-                upper_tail_hist = upper_tail_hist[upper_tail_index_new-upper_tail_index:]
-                upper_tail_index = upper_tail_index_new
-
-            # fitting
-            lambdA, _ = curve_fit(fit_function, xdata=binzenters[upper_tail_index:iter_max_bin_index],
-                                  ydata=upper_tail_hist,
-                                  p0=[-np.log(alpha/resids[iter_index])])
-
-            crit_val = neg_log_alpha / lambdA
-            test_val = resids[iter_index]
-
-    # flag them!
-    to_flag_index = val_frame.index[sorted_i[iter_index:]]
+        to_flag_index = _expFit(val_frame,
+                                scoring_method=scoring_method,
+                                n_neighbors=n_neighbors,
+                                iter_start=iter_start,
+                                alpha=alpha,
+                                bin_frac=bin_frac)
     for var in fields:
         flagger = flagger.setFlags(var, to_flag_index, **kwargs)
 
     return data, flagger
 
 
-
 @register("spikes_limitRaise")
 def flagSpikes_limitRaise(
     data, field, flagger, thresh, raise_window, intended_freq, average_window=None, mean_raise_factor=2, min_slope=None,
-- 
GitLab
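
A note on the `_expFit` branch factored out here: it fits an exponential density lambd * exp(-lambd * x) to the histogram of the upper score tail and stops iterating once a sorted score exceeds the fitted distribution's alpha tail cutoff. The stopping rule in isolation (a sketch; the `lambd` value is an assumption standing in for the `curve_fit` result):

    import numpy as np

    alpha = 0.05
    lambd = 2.0  # assumption: rate parameter already fitted to the tail histogram

    # for an exponential tail, P(X > x) = exp(-lambd * x); solving
    # exp(-lambd * crit_val) = alpha yields the cutoff tested in the loop
    crit_val = -np.log(alpha) / lambd

    test_val = 1.2  # next sorted kNN score under test
    print(test_val < crit_val)  # True: keep iterating; False: outlier region reached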


From 34c13850f2be2e6f0d13d26719daf5292453b9ac Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Thu, 9 Apr 2020 19:54:56 +0200
Subject: [PATCH 04/11] added optimized binning procedure to expFit

---
 saqc/funcs/spike_detection.py      | 23 ++++++++++++++++-------
 test/funcs/test_spike_detection.py |  2 +-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index fa80027b4..e3d4db26e 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -27,10 +27,13 @@ def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbo
     partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
     to_flag = []
     for _, partition in partitions:
-        resids = kNNfunc(partition.values, n_neighbors=n_neighbors, algorithm='ball_tree')
+        if partition.empty:
+            continue
+        sample_size = partition.shape[0]
+        nn_neighbors = min(n_neighbors, sample_size)
+        resids = kNNfunc(partition.values, n_neighbors=nn_neighbors-1, algorithm='ball_tree')
         sorted_i = resids.argsort()
         resids = resids[sorted_i]
-        sample_size = resids.shape[0]
         gaps = np.append(0, np.diff(resids))
 
         tail_size = int(max(min(50, np.floor(sample_size / 4)), 2))
@@ -70,7 +73,13 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
     def fit_function(x, lambd):
         return lambd * np.exp(-lambd * x)
     # initialize sampling bins
-    binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(data_len / bin_frac)))
+    if isinstance(bin_frac, int):
+        binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(data_len / bin_frac)))
+    elif bin_frac in ['auto', 'fd', 'doane', 'scott', 'stone', 'rice', 'sturges', 'sqrt']:
+        binz = np.histogram_bin_edges(resids, bins=bin_frac)
+    else:
+        raise ValueError('Cannot interpret {} as a binning technique.'.format(bin_frac))
+
     binzenters = np.array([0.5 * (binz[i] + binz[i + 1]) for i in range(len(binz) - 1)])
     # initialize full histogram:
     full_hist, binz = np.histogram(resids, bins=binz)
@@ -116,8 +125,8 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
 
 
 @register("spikes_oddWater")
-def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0.05, bin_frac=10, n_neighbors=2,
-                        iter_start=0.5, scoring_method='kNNMaxGap', thresholding='stray', stray_partition=None,
+def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0.05, binning='auto', n_neighbors=2,
+                        iter_start=0.5, scoring_method='kNNMaxGap', threshing='stray', stray_partition=None,
                         **kwargs):
 
     trafo = composeFunction(trafo.split(','))
@@ -134,7 +143,7 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
     val_frame.dropna(inplace=True)
     val_frame = val_frame.transform(trafo)
 
-    if thresholding == 'stray':
+    if threshing == 'stray':
         to_flag_index =_stray(val_frame,
                               partition_freq=stray_partition,
                               scoring_method=scoring_method,
@@ -147,7 +156,7 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
                                 n_neighbors=n_neighbors,
                                 iter_start=iter_start,
                                 alpha=alpha,
-                                bin_frac=bin_frac)
+                                bin_frac=binning)
     for var in fields:
         flagger = flagger.setFlags(var, to_flag_index, **kwargs)
 
diff --git a/test/funcs/test_spike_detection.py b/test/funcs/test_spike_detection.py
index 01679c695..8f434bcf8 100644
--- a/test/funcs/test_spike_detection.py
+++ b/test/funcs/test_spike_detection.py
@@ -113,7 +113,7 @@ def test_flagSpikesOddWater(dat, flagger):
     data = pd.DataFrame({'data1': data1.squeeze(), 'data2': data2.squeeze()}, index=data1.index)
     flagger = flagger.initFlags(data)
     _, flagger_result = flagSpikes_oddWater(
-        data, field, flagger, fields=fields, bin_frac=50, trafo='np.log',
+        data, field, flagger, fields=fields, binning=50, trafo='np.log',
         iter_start=0.95, n_neighbors=10
     )
     assert flagger_result.isFlagged(fields[0])[characteristics['raise']].all()
-- 
GitLab
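
With this patch, the binning argument accepts either an integer (the previous fixed `bin_frac` behaviour) or the name of one of numpy's bin-width estimators, which is forwarded to `np.histogram_bin_edges`. Both paths sketched (sample scores invented):

    import numpy as np

    resids = np.sort(np.random.default_rng(0).exponential(size=200))

    # integer: evenly spaced edges, edge count derived from the sample size
    bin_frac = 10
    binz = np.linspace(resids[0], resids[-1], 10 * int(np.ceil(resids.size / bin_frac)))

    # string: delegate bin-width selection to numpy ('auto', 'fd', 'sturges', ...)
    binz_auto = np.histogram_bin_edges(resids, bins='auto')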


From a897ec77b33d65e2c9b4e73a67574aa2db2736f7 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Tue, 14 Apr 2020 13:38:52 +0200
Subject: [PATCH 05/11] changed spike detection architecture

---
 saqc/funcs/spike_detection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index e3d4db26e..c209cdac5 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -44,10 +44,12 @@ def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbo
             ghat[i] = sum((tail_indices / (tail_size - 1)) * gaps[i - tail_indices + 1])
 
         log_alpha = np.log(1 / alpha)
+        log_inv_alpha = np.log(1 / (1-alpha))
         for iter_index in range(i_start - 1, sample_size):
             if gaps[iter_index] > log_alpha * ghat[iter_index]:
                 break
 
+
         to_flag = np.append(to_flag, list(partition.index[sorted_i[iter_index:]]))
 
     return to_flag
-- 
GitLab


From 3a30e5a6c2850f0f18bc1235dcd407bdd4e222f2 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 20 Apr 2020 12:00:39 +0200
Subject: [PATCH 06/11] added period-count based slicing to oddWater's STRAY algorithm

---
 saqc/funcs/spike_detection.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index c209cdac5..0c04e8ba9 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -20,14 +20,20 @@ from saqc.lib.tools import (
 )
 
 
-def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.5,
+def _stray(val_frame, partition_freq=None, partition_min=0, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.5,
            alpha=0.05):
 
     kNNfunc = getattr(ts_ops, scoring_method)
-    partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
+    if isinstance(partition_freq, str):
+        partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
+    else:
+        grouper_series = pd.Series(data=np.arange(0, val_frame.shape[0]), index=val_frame.index)
+        grouper_series = grouper_series.transform(lambda x: int(np.floor(x / partition_freq)))
+        partitions = val_frame.groupby(grouper_series)
+
     to_flag = []
     for _, partition in partitions:
-        if partition.empty:
+        if partition.empty or (partition.shape[0] < partition_min):
             continue
         sample_size = partition.shape[0]
         nn_neighbors = min(n_neighbors, sample_size)
@@ -44,7 +50,6 @@ def _stray(val_frame, partition_freq=None, scoring_method='kNNMaxGap', n_neighbo
             ghat[i] = sum((tail_indices / (tail_size - 1)) * gaps[i - tail_indices + 1])
 
         log_alpha = np.log(1 / alpha)
-        log_inv_alpha = np.log(1 / (1-alpha))
         for iter_index in range(i_start - 1, sample_size):
             if gaps[iter_index] > log_alpha * ghat[iter_index]:
                 break
@@ -146,11 +151,11 @@ def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0
     val_frame = val_frame.transform(trafo)
 
     if threshing == 'stray':
-        to_flag_index =_stray(val_frame,
-                              partition_freq=stray_partition,
-                              scoring_method=scoring_method,
-                              n_neighbors=n_neighbors,
-                              iter_start=iter_start)
+        to_flag_index = _stray(val_frame,
+                               partition_freq=stray_partition,
+                               scoring_method=scoring_method,
+                               n_neighbors=n_neighbors,
+                               iter_start=iter_start)
 
     else:
         to_flag_index = _expFit(val_frame,
-- 
GitLab
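
The two partitioning paths added here, side by side; the integer branch is shown with an equivalent floor-division grouper instead of the `Series.transform` round-trip (example frame invented):

    import numpy as np
    import pandas as pd

    val_frame = pd.DataFrame(
        {'x': np.arange(10.0)},
        index=pd.date_range('2020-01-01', periods=10, freq='D')
    )

    # frequency string: calendar-based partitions
    by_freq = val_frame.groupby(pd.Grouper(freq='5D'))

    # integer: partitions of a fixed number of periods
    partition_freq = 4
    grouper = np.floor(np.arange(val_frame.shape[0]) / partition_freq).astype(int)
    by_count = val_frame.groupby(grouper)
    print([len(g) for _, g in by_count])  # [4, 4, 2]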


From 73210d6afeade8a066835ba897e24b6816d86039 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 20 Apr 2020 12:16:09 +0200
Subject: [PATCH 07/11] minor cleanup

---
 saqc/funcs/spike_detection.py      | 13 +++++++++----
 test/funcs/test_spike_detection.py |  4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/saqc/funcs/spike_detection.py b/saqc/funcs/spike_detection.py
index 0c04e8ba9..ee823198e 100644
--- a/saqc/funcs/spike_detection.py
+++ b/saqc/funcs/spike_detection.py
@@ -24,6 +24,10 @@ def _stray(val_frame, partition_freq=None, partition_min=0, scoring_method='kNNM
            alpha=0.05):
 
     kNNfunc = getattr(ts_ops, scoring_method)
+    # partitioning
+    if not partition_freq:
+        partition_freq = val_frame.shape[0]
+
     if isinstance(partition_freq, str):
         partitions = val_frame.groupby(pd.Grouper(freq=partition_freq))
     else:
@@ -31,6 +35,7 @@ def _stray(val_frame, partition_freq=None, partition_min=0, scoring_method='kNNM
         grouper_series = grouper_series.transform(lambda x: int(np.floor(x / partition_freq)))
         partitions = val_frame.groupby(grouper_series)
 
+    # calculate flags for every partition
     to_flag = []
     for _, partition in partitions:
         if partition.empty or (partition.shape[0] < partition_min):
@@ -131,10 +136,10 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
     return val_frame.index[sorted_i[iter_index:]]
 
 
-@register("spikes_oddWater")
-def flagSpikes_oddWater(data, field, flagger, fields, trafo='normScale', alpha=0.05, binning='auto', n_neighbors=2,
-                        iter_start=0.5, scoring_method='kNNMaxGap', threshing='stray', stray_partition=None,
-                        **kwargs):
+@register("spikes_multivariateKNNScoring")
+def flagSpikes_multivariateKNNScoring(data, field, flagger, fields, trafo='normScale', alpha=0.05, binning='auto', n_neighbors=2,
+                                      iter_start=0.5, scoring_method='kNNMaxGap', threshing='stray', stray_partition=None,
+                                      **kwargs):
 
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
diff --git a/test/funcs/test_spike_detection.py b/test/funcs/test_spike_detection.py
index 8f434bcf8..878f1ce05 100644
--- a/test/funcs/test_spike_detection.py
+++ b/test/funcs/test_spike_detection.py
@@ -11,7 +11,7 @@ from saqc.funcs.spike_detection import (
     flagSpikes_slidingZscore,
     flagSpikes_basic,
     flagSpikes_limitRaise,
-    flagSpikes_oddWater
+    flagSpikes_multivariateKNNScoring
 )
 
 from test.common import TESTFLAGGER
@@ -112,7 +112,7 @@ def test_flagSpikesOddWater(dat, flagger):
     fields = ['data1', 'data2']
     data = pd.DataFrame({'data1': data1.squeeze(), 'data2': data2.squeeze()}, index=data1.index)
     flagger = flagger.initFlags(data)
-    _, flagger_result = flagSpikes_oddWater(
+    _, flagger_result = flagSpikes_multivariateKNNScoring(
         data, field, flagger, fields=fields, binning=50, trafo='np.log',
         iter_start=0.95, n_neighbors=10
     )
-- 
GitLab
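
A side effect of the fallback added above: when no `partition_freq` is passed, it defaults to the frame length, so the integer grouper maps every row to partition 0 and the whole frame is scored as a single slice. Sketch (row count invented):

    import numpy as np

    n_rows = 5
    partition_freq = n_rows  # the fallback value when none is passed
    grouper = np.floor(np.arange(n_rows) / partition_freq).astype(int)
    print(set(grouper))  # {0}: a single partition spanning the entire frame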


From 0dd2f40f6ca99f8c6d112b344d69ce89a2acddba Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 20 Apr 2020 13:42:35 +0200
Subject: [PATCH 08/11] multivariateFlagging parameter order/names

---
 saqc/funcs/spikes_detection.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py
index 22af71889..3465053f3 100644
--- a/saqc/funcs/spikes_detection.py
+++ b/saqc/funcs/spikes_detection.py
@@ -136,9 +136,10 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
 
 
 @register()
-def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, binning='auto', n_neighbors=2,
-                                      iter_start=0.5, scoring_method='kNNMaxGap', threshing='stray', stray_partition=None,
-                                      **kwargs):
+def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, n_neighbors=10,
+                                     scoring_method='kNNMaxGap', iter_start=0.5, threshing='stray',
+                                     expfit_binning='auto', stray_partition=None, stray_partition_min=0,
+                                     **kwargs):
 
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
@@ -157,6 +158,7 @@ def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normSc
     if threshing == 'stray':
         to_flag_index = _stray(val_frame,
                                partition_freq=stray_partition,
+                               partition_min=stray_partition_min,
                                scoring_method=scoring_method,
                                n_neighbors=n_neighbors,
                                iter_start=iter_start)
@@ -167,7 +169,7 @@ def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normSc
                                 n_neighbors=n_neighbors,
                                 iter_start=iter_start,
                                 alpha=alpha,
-                                bin_frac=binning)
+                                bin_frac=expfit_binning)
 
     for var in fields:
         flagger = flagger.setFlags(var, to_flag_index, **kwargs)
-- 
GitLab


From 07453251856fc41c83667aeb10a357c6b8a566ef Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Mon, 20 Apr 2020 14:02:02 +0200
Subject: [PATCH 09/11] submodule confusion

---
 dios | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dios b/dios
index d5a80af46..0665773c1 160000
--- a/dios
+++ b/dios
@@ -1 +1 @@
-Subproject commit d5a80af469c0540c91e5b4ddd1c0327b11a0d214
+Subproject commit 0665773c1a86b018f79db7983494c198b77620fc
-- 
GitLab


From 2579967a02f35da8d566942d8ec5524b2e5adfc3 Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Tue, 21 Apr 2020 10:19:13 +0200
Subject: [PATCH 10/11] fixed dios import bug

---
 test/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/common.py b/test/common.py
index c56007fad..c0da2a204 100644
--- a/test/common.py
+++ b/test/common.py
@@ -6,7 +6,7 @@ import re
 
 import numpy as np
 import pandas as pd
-import dios.dios as dios
+import dios
 
 from saqc.core.core import readConfig
 from saqc.flagger import (
-- 
GitLab


From 6cbea7bd4fd626be60dff67d73fc2100b499eebf Mon Sep 17 00:00:00 2001
From: Peter Luenenschloss <peter.luenenschloss@ufz.de>
Date: Tue, 21 Apr 2020 10:28:22 +0200
Subject: [PATCH 11/11] some renaming

---
 saqc/funcs/spikes_detection.py      |  8 ++++----
 saqc/lib/tools.py                   | 10 +++++-----
 test/funcs/test_spikes_detection.py |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py
index 3465053f3..61eb4fb49 100644
--- a/saqc/funcs/spikes_detection.py
+++ b/saqc/funcs/spikes_detection.py
@@ -136,10 +136,10 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0.
 
 
 @register()
-def spikes_flagMultivariateKNNScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, n_neighbors=10,
-                                     scoring_method='kNNMaxGap', iter_start=0.5, threshing='stray',
-                                     expfit_binning='auto', stray_partition=None, stray_partition_min=0,
-                                     **kwargs):
+def spikes_flagMultivarScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, n_neighbors=10,
+                              scoring_method='kNNMaxGap', iter_start=0.5, threshing='stray',
+                              expfit_binning='auto', stray_partition=None, stray_partition_min=0,
+                              **kwargs):
 
     trafo = composeFunction(trafo.split(','))
     # data transformation/extraction
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 374c880ec..18d9048a7 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -27,17 +27,17 @@ SAQC_OPERATORS = {
     "max": np.max,
     "first": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").first,
     "last": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").last,
-    "delta_t": ts_ops.deltaT,
+    "deltaT": ts_ops.deltaT,
     "id": ts_ops.identity,
     "diff": ts_ops.difference,
     "relDiff": ts_ops.relativeDifference,
     "deriv": ts_ops.derivative,
-    "roc": ts_ops.rateOfChange,
+    "rateOfChange": ts_ops.rateOfChange,
     "scale": ts_ops.scale,
     "normScale": ts_ops.normScale,
-    "stdByMean": ts_ops.standardizeByMean,
-    "stdByMedian": ts_ops.standardizeByMedian,
-    "zlog": ts_ops.zeroLog
+    "meanStandardize": ts_ops.standardizeByMean,
+    "medianStandardize": ts_ops.standardizeByMedian,
+    "zLog": ts_ops.zeroLog
 }
 
 
diff --git a/test/funcs/test_spikes_detection.py b/test/funcs/test_spikes_detection.py
index 06b3f3fcc..77f729c2a 100644
--- a/test/funcs/test_spikes_detection.py
+++ b/test/funcs/test_spikes_detection.py
@@ -12,7 +12,7 @@ from saqc.funcs.spikes_detection import (
     spikes_flagSlidingZscore,
     spikes_flagBasic,
     spikes_flagRaise,
-    spikes_flagMultivariateKNNScores
+    spikes_flagMultivarScores
 )
 
 from test.common import TESTFLAGGER
@@ -111,7 +111,7 @@ def test_flagSpikesOddWater(dat, flagger):
     s2 = pd.Series(data=s2.values, index=s1.index)
     data = dios.DictOfSeries([s1, s2], columns=["data1", "data2"])
     flagger = flagger.initFlags(data)
-    _, flagger_result = spikes_flagMultivariateKNNScores(
+    _, flagger_result = spikes_flagMultivarScores(
         data, field, flagger, fields=fields, expfit_binning=50, trafo='np.log',
         iter_start=0.95, n_neighbors=10
     )
-- 
GitLab
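
The renamed keys are what `trafo` strings resolve against, e.g. `trafo='zLog,meanStandardize'`. A minimal sketch of that resolution chain (illustrative only; saqc's actual `composeFunction` may differ in details such as application order):

    import numpy as np
    import pandas as pd
    from functools import reduce

    def zeroLog(ts):
        log_ts = np.log(ts)
        log_ts[log_ts == -np.inf] = np.nan
        return log_ts

    def standardizeByMean(ts):
        return (ts - ts.mean()) / ts.std()

    OPERATORS = {"zLog": zeroLog, "meanStandardize": standardizeByMean}

    def composeTrafo(names):
        # apply the named operators left to right (assumed order)
        return lambda ts: reduce(lambda acc, name: OPERATORS[name](acc), names, ts)

    trafo = composeTrafo("zLog,meanStandardize".split(','))
    print(trafo(pd.Series([1.0, 10.0, 100.0])))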