Commit bacef79f authored by David Schäfer

Merge branch 'develop' into 'dependabot/pip/zipp-3.1.0'

# Conflicts:
#   requirements.txt
parents d09d9313 2117d4f9
Merge request !76: Bump zipp from 2.2.0 to 3.1.0
Pipeline #6083 passed in 9 minutes and 11 seconds
requirements.txt:
 attrs==19.3.0
-Click==7.0
+Click==7.1.2
 cycler==0.10.0
 dtw==1.4.0
-importlib-metadata==1.5.0
-joblib==0.14.1
-kiwisolver==1.1.0
+kiwisolver==1.2.0
+importlib-metadata==1.7.0
+joblib==0.16.0
 llvmlite==0.31.0
-matplotlib==3.1.3
-mlxtend==0.17.2
-more-itertools==8.2.0
+mlxtend==0.17.3
+matplotlib==3.3.0
+more-itertools==8.4.0
 numba==0.48.0
-numpy==1.18.1
+numpy==1.19.1
-outlier==0.2
-utils==1.0.1
+outlier-utils==0.0.3
-packaging==20.1
+packaging==20.4
 pandas==1.0.1
 pluggy==0.13.1
-py==1.8.1
-pyarrow==0.16.0
-pyparsing==2.4.6
+pyparsing==2.4.7
+py==1.9.0
+pyarrow==1.0.0
 pytest-lazy-fixture==0.6.3
-pytest==5.3.5
+pytest==6.0.1
 python-dateutil==2.8.1
-python-intervals==1.10.0
-pytz==2019.3
+python-intervals==1.10.0.post1
+pytz==2020.1
 PyWavelets==1.1.1
-scikit-learn==0.22.1
-scipy==1.4.1
-six==1.14.0
-wcwidth==0.1.8
 zipp==3.1.0
+wcwidth==0.2.5
+scipy==1.5.2
+scikit-learn==0.23.1
+six==1.15.0
+astor==0.8.1
@@ -18,12 +18,10 @@ from dios import DictOfSeries
 from typing import Any

-def _dslIsFlagged(flagger, var, flag=None, comparator=None):
+def _dslIsFlagged(flagger, var, flag=None, comparator=">="):
     """
     helper function for `flagGeneric`
     """
-    if comparator is None:
-        return flagger.isFlagged(var.name, flag=flag)
     return flagger.isFlagged(var.name, flag=flag, comparator=comparator)
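With the new default, every call routes through flagger.isFlagged with an explicit comparator, so flagging reduces to an ordered comparison of flag values. A minimal sketch of those semantics on plain pandas objects (illustrative names, not the SaQC flagger API):

import operator
import pandas as pd

# hypothetical stand-in for flagger.isFlagged: flags are ordered values,
# and the comparator string selects the elementwise test
_OPS = {"==": operator.eq, ">=": operator.ge, ">": operator.gt,
        "<=": operator.le, "<": operator.lt}

def is_flagged(flags: pd.Series, flag: float, comparator: str = ">=") -> pd.Series:
    return _OPS[comparator](flags, flag)

flags = pd.Series([0.0, 255.0, 10.0])
print(is_flagged(flags, flag=10.0))                  # default ">=": [False, True, True]
print(is_flagged(flags, flag=0.0, comparator="=="))  # matches only entries equal to 0.0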
@@ -441,13 +439,25 @@ range_dict.keys()
 @register
 def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat=np.median, **kwargs):
-    val_frame = data.loc[data.index_of("shared")].to_df()
-    try:
-        stat = getattr(val_frame, cross_stat.__name__)(axis=1)
-    except AttributeError:
-        stat = val_frame.aggregate(cross_stat, axis=1)
-    diff_scores = val_frame.subtract(stat, axis=0).abs()
-    diff_scores = diff_scores > thresh
+    df = data[fields].loc[data[fields].index_of('shared')].to_df()
+    if isinstance(cross_stat, str):
+        if cross_stat == 'modZscore':
+            MAD_series = df.subtract(df.median(axis=1), axis=0).abs().median(axis=1)
+            diff_scores = ((0.6745 * (df.subtract(df.median(axis=1), axis=0))).divide(MAD_series, axis=0)).abs()
+        elif cross_stat == 'Zscore':
+            diff_scores = (df.subtract(df.mean(axis=1), axis=0)).divide(df.std(axis=1), axis=0).abs()
+        else:
+            raise ValueError(cross_stat)
+    else:
+        try:
+            stat = getattr(df, cross_stat.__name__)(axis=1)
+        except AttributeError:
+            stat = df.aggregate(cross_stat, axis=1)
+        diff_scores = df.subtract(stat, axis=0).abs()
+    mask = diff_scores > thresh
     for var in fields:
-        flagger = flagger.setFlags(var, diff_scores[var].values, **kwargs)
+        flagger = flagger.setFlags(var, mask[var], **kwargs)
     return data, flagger
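The two new string options implement classic cross-variable scores: 'Zscore' measures each value's distance from the row mean in units of the row standard deviation, while 'modZscore' uses the median and the MAD, with 0.6745 rescaling the MAD to be comparable to a standard deviation under normality (the Iglewicz-Hoaglin modified Z-score). A self-contained sketch of the same computation on toy data (plain pandas, not SaQC objects):

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [1.1, 2.1, 9.0], "c": [0.9, 1.9, 3.1]})

# modified Z-score across variables at each timestamp (row-wise)
med = df.median(axis=1)
mad = df.subtract(med, axis=0).abs().median(axis=1)
mod_z = (0.6745 * df.subtract(med, axis=0)).divide(mad, axis=0).abs()

# plain Z-score across variables
z = df.subtract(df.mean(axis=1), axis=0).divide(df.std(axis=1), axis=0).abs()

print(mod_z > 3.5)  # 3.5 is the cutoff Iglewicz & Hoaglin suggest for the modified score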
@@ -19,7 +19,6 @@ from saqc.funcs.proc_functions import (
 logger = logging.getLogger("SaQC")

 @register
 def harm_shift2Grid(data, field, flagger, freq, method="nshift", drop_flags=None, **kwargs):
......
@@ -18,9 +18,7 @@ def __importHelper():
     # needed for datetime conversion
     register_matplotlib_converters()

-    if _interactive:
-        mpl.use("TkAgg")
-    else:
+    if not _interactive:
         # Import plot libs without interactivity, if not needed.
         # This ensures that we can produce a plot.png even if
         # tkinter is not installed, e.g. if one wants to run this
......
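The change drops the explicit TkAgg selection and only forces a backend when no GUI is needed; a minimal standalone sketch of that pattern (the _interactive flag is this module's own, "Agg" is the usual non-interactive choice and an assumption here, since the forced backend name is outside the shown hunk):

import matplotlib as mpl

_interactive = False  # e.g. derived from a --plot command line switch

if not _interactive:
    # "Agg" renders to raster buffers only, so a plot.png can be written
    # even when tkinter (and thus TkAgg) is not installed
    mpl.use("Agg")

import matplotlib.pyplot as plt  # import pyplot only after the backend is fixed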
@@ -6,6 +6,8 @@ The module gathers all kinds of timeseries transformations.
 """
 import logging
+import re

 import pandas as pd
 import numpy as np
 import numba as nb
@@ -163,13 +165,12 @@ def validationTrafo(data, max_nan_total, max_nan_consec):
             return data
         elif _maxConsecutiveNan(np.asarray(data), max_nan_consec):
             data[:] = False
-            return data
         else:
             data[:] = True
-            return data
     else:
         data[:] = True
-        return data
+
+    return data
def stdQC(data, max_nan_total=np.inf, max_nan_consec=np.inf):
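The branch logic above hinges on _maxConsecutiveNan, whose body is outside this diff; a hedged sketch of what such a helper typically computes, assuming data is a boolean array in which False marks a NaN (illustrative, not the SaQC original):

import numba as nb
import numpy as np

@nb.njit
def _max_consecutive_nan_exceeds(arr, max_consec):
    # walk the array once, tracking the current run of False (= NaN) entries;
    # report True as soon as a run grows longer than max_consec
    run = 0
    for ok in arr:
        run = 0 if ok else run + 1
        if run > max_consec:
            return True
    return False

print(_max_consecutive_nan_exceeds(np.array([True, False, False, True]), 1))  # True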
@@ -248,10 +249,8 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolation
             return x.interpolate(method=wrap_method, order=int(wrap_order))
         except (NotImplementedError, ValueError):
             logger.warning(
-                "Interpolation with method {} is not supported at order {}. "
-                "Interpolation will be performed at order {}".format(
-                    method, str(wrap_order), str(wrap_order - 1)
-                )
+                f"Interpolation with method {method} is not supported at order {wrap_order} "
+                f"and will be performed at order {wrap_order - 1}"
             )
             return _interpolWrapper(x, int(wrap_order - 1), wrap_method)
     elif x.size < 3:
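The rewritten warning sits inside a retry wrapper that steps the interpolation order down until scipy accepts it; a standalone sketch of that downgrade pattern (assumed names, not the SaQC original):

import numpy as np
import pandas as pd

def interpolate_with_downgrade(x: pd.Series, method: str = "spline", order: int = 3) -> pd.Series:
    # try the requested order first, then step down until the
    # underlying scipy routine stops rejecting it
    while order > 1:
        try:
            return x.interpolate(method=method, order=order)
        except (NotImplementedError, ValueError):
            order -= 1
    return x.interpolate(method=method, order=1)

s = pd.Series([0.0, np.nan, np.nan, 3.0, 4.0])
print(interpolate_with_downgrade(s, method="polynomial", order=2))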
@@ -269,8 +268,7 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolation
     data = data.reindex(pre_index)
     if return_chunk_bounds:
         return data, chunk_bounds
-    else:
-        return data
+    else: return data
def aggregate2Freq(
@@ -280,6 +278,12 @@ def aggregate2Freq(
     # Timestamps that have no values projected on them, get "fill_value" assigned. Also,
     # "fill_value" serves as replacement for "invalid" intervals
+    methods = {
+        "nagg": lambda seconds_total: (seconds_total/2, "left", "left"),
+        "bagg": lambda _: (0, "left", "left"),
+        "fagg": lambda _: (0, "right", "right"),
+    }

     # filter data for invalid patterns (since filtering is expensive we pre-check if it is demanded)
     if (max_invalid_total is not np.inf) | (max_invalid_consec is not np.inf):
         if pd.isnull(fill_value):
@@ -292,24 +296,8 @@
             )
         data[temp_mask] = fill_value

     # some timestamp acrobatics to feed pd.resample`s base keyword properly
     seconds_total = pd.Timedelta(freq).total_seconds()
-    freq_string = str(int(seconds_total)) + "s"
-    if method == "nagg":
-        # all values within a grid points range (+/- freq/2, closed to the left) get aggregated with 'agg method'
-        base = seconds_total / 2
-        label = "left"
-        closed = "left"
-    elif method == "bagg":
-        # all values in a sampling interval get aggregated with agg_method and assigned to the last grid point
-        base = 0
-        label = "left"
-        closed = "left"
-    else:
-        # all values in a sampling interval get aggregated with agg_method and assigned to the next grid point
-        base = 0
-        label = "right"
-        closed = "right"
+    base, label, closed = methods[method](seconds_total)
     # In the following, we check for empty intervals outside resample.apply, because:
     # - resample AND groupBy do insert value zero for empty intervals if resampling with any kind of "sum" application -
@@ -317,23 +305,16 @@
     # - we are aggregating data and flags with this function and empty intervals usually would get assigned flagger.BAD
     #   flag (where resample inserts np.nan or 0)
-    data_resampler = data.resample(freq_string, base=base, closed=closed, label=label)
+    data_resampler = data.resample(f"{seconds_total:.0f}s", base=base, closed=closed, label=label)
     empty_intervals = data_resampler.count() == 0

     # a great performance gain can be achieved when avoiding .apply and using pd.resampler
     # methods instead (this covers all the basic func aggregations, such as median, mean, sum, count, ...)
     try:
-        # get rid of nan_prefix attached to numpys nanfuncs ("ignore nan" is pointless down here -
-        # resample doesnt pass any nans to the func applied)
-        if agg_func.__name__[:3] == "nan":
-            check_name = agg_func.__name__[3:]
-        else:
-            check_name = agg_func.__name__
-        # another nasty special case: if function "count" was passed, we do not want empty intervals to be replaced by nan:
-        if check_name == "count":
+        check_name = re.sub("^nan", "", agg_func.__name__)
+        # a nasty special case: if function "count" was passed, we do not want empty intervals to be replaced by nan:
+        if check_name == 'count':
             empty_intervals[:] = False
         data = getattr(data_resampler, check_name)()
     except AttributeError:
         data = data_resampler.apply(agg_func)
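Two things happen here: the method string now resolves to (base, label, closed) through the lookup table introduced above, and the aggregation tries the resampler's built-in method (with any numpy "nan" prefix stripped) before falling back to the slower generic apply. A reduced sketch of that fast path on a toy series (base omitted, since it only offsets the grid):

import re
import numpy as np
import pandas as pd

s = pd.Series(np.arange(6.0), index=pd.date_range("2020-01-01", periods=6, freq="4min"))
agg_func = np.nanmean

resampler = s.resample("600s", closed="left", label="left")

# "nanmean" -> "mean": resample does not pass NaNs to the applied function anyway,
# and getattr dispatches to the fast built-in implementation when one exists
check_name = re.sub("^nan", "", agg_func.__name__)
try:
    out = getattr(resampler, check_name)()
except AttributeError:
    out = resampler.apply(agg_func)  # generic fallback for arbitrary callables
print(out)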
@@ -352,26 +333,16 @@ def shift2Freq(data, method, freq, fill_value=np.nan):
     # shift timestamps backwards/forwards in order to align them with an equidistant
     # frequency grid.

-    # Shifts
-    if method == "fshift":
-        direction = "ffill"
-        tolerance = pd.Timedelta(freq)
-    elif method == "bshift":
-        direction = "bfill"
-        tolerance = pd.Timedelta(freq)
-    elif method == "nshift":
-        direction = "nearest"
-        tolerance = pd.Timedelta(freq) / 2
-    else:
-        # method == nearest2
-        direction = "nearest"
-        tolerance = pd.Timedelta(freq)
+    methods = {
+        "fshift": lambda freq: ("ffill", pd.Timedelta(freq)),
+        "bshift": lambda freq: ("bfill", pd.Timedelta(freq)),
+        "nshift": lambda freq: ("nearest", pd.Timedelta(freq)/2),
+    }
+    direction, tolerance = methods[method](freq)

     target_ind = pd.date_range(
-        start=data.index[0].floor(freq), end=data.index[-1].ceil(freq), freq=freq, name=data.index.name
+        start=data.index[0].floor(freq),
+        end=data.index[-1].ceil(freq),
+        freq=freq,
+        name=data.index.name
     )
     return data.reindex(target_ind, method=direction, tolerance=tolerance, fill_value=fill_value)
......
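After the dispatch, the shift itself is just a reindex against a regular grid, with tolerance capping how far a timestamp may travel to reach its grid point. A toy example of the "nshift" case (standalone, not the SaQC function):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0],
              index=pd.to_datetime(["2020-01-01 00:00:40",
                                    "2020-01-01 00:09:20",
                                    "2020-01-01 00:21:00"]))
freq = "10min"

target_ind = pd.date_range(s.index[0].floor(freq), s.index[-1].ceil(freq), freq=freq)

# "nshift": snap each value to its nearest grid point, but only when it lies
# within half a grid interval; 00:21:00 reaches 00:20:00, nothing reaches 00:30:00
shifted = s.reindex(target_ind, method="nearest", tolerance=pd.Timedelta(freq) / 2,
                    fill_value=np.nan)
print(shifted)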
@@ -216,7 +216,7 @@ def test_isflagged(data, flagger):
     tests = [
         (f"isflagged({var1})", flagger.isFlagged(var1)),
-        (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD)),
+        (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")),
         (f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")),
         (f"~isflagged({var2})", ~flagger.isFlagged(var2)),
         (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))),
......