diff --git a/requirements.txt b/requirements.txt
index 43b6bb008eaec3b582943dce110267b2b301607e..81ed970110b012bbd43c6fa3b5937f575ebec526 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,34 +1,34 @@
 attrs==19.3.0
-Click==7.0
+Click==7.1.2
 cycler==0.10.0
 dtw==1.4.0
-importlib-metadata==1.5.0
-joblib==0.14.1
-kiwisolver==1.1.0
+kiwisolver==1.2.0
+importlib-metadata==1.7.0
+joblib==0.16.0
 llvmlite==0.31.0
-matplotlib==3.1.3
-mlxtend==0.17.2
-more-itertools==8.2.0
+mlxtend==0.17.3
+matplotlib==3.3.0
+more-itertools==8.4.0
 numba==0.48.0
 numpy==1.19.1
 outlier==0.2
 utils==1.0.1
 outlier-utils==0.0.3
-packaging==20.1
+packaging==20.4
 pandas==1.0.1
 pluggy==0.13.1
-py==1.8.1
+pyparsing==2.4.7
+py==1.9.0
 pyarrow==1.0.0
-pyparsing==2.4.6
 pytest-lazy-fixture==0.6.3
-pytest==5.3.5
+pytest==6.0.1
 python-dateutil==2.8.1
-python-intervals==1.10.0
-pytz==2019.3
+python-intervals==1.10.0.post1
+pytz==2020.1
 PyWavelets==1.1.1
+zipp==3.1.0
+wcwidth==0.2.5
+scipy==1.5.2
 scikit-learn==0.23.1
-scipy==1.4.1
 six==1.15.0
-wcwidth==0.1.8
-zipp==2.2.0
 astor==0.8.1
diff --git a/saqc/core/register.py b/saqc/core/register.py
index e5163c3b6ba24f13ab6d8a4ed69e432b7d4137c8..2863f182556a0a0ca7cd2e0a20ae8c2f73e173b6 100644
--- a/saqc/core/register.py
+++ b/saqc/core/register.py
@@ -179,8 +179,8 @@ class SaQCFunc(Func):

     def _unmaskData(self, data_old, flagger_old, data_new, flagger_new):
         to_mask = flagger_old.BAD if self.to_mask is None else self.to_mask
-        mask_old = flagger_old.isFlagged(flag=to_mask)
-        mask_new = flagger_new.isFlagged(flag=to_mask)
+        mask_old = flagger_old.isFlagged(flag=to_mask, comparator="==")
+        mask_new = flagger_new.isFlagged(flag=to_mask, comparator="==")

         for col, left in data_new.indexes.iteritems():
             if col not in mask_old:
diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py
index 2b454aaf77e292d0d62f37c1245ebe74accb8054..831b65b4677b7cb28a40e3f36292d229473fd178 100644
--- a/saqc/funcs/functions.py
+++ b/saqc/funcs/functions.py
@@ -18,12 +18,10 @@ from dios import DictOfSeries
 from typing import Any


-def _dslIsFlagged(flagger, var, flag=None, comparator=None):
+def _dslIsFlagged(flagger, var, flag=None, comparator=">="):
     """
     helper function for `flagGeneric`
     """
-    if comparator is None:
-        return flagger.isFlagged(var.name, flag=flag)
     return flagger.isFlagged(var.name, flag=flag, comparator=comparator)


diff --git a/saqc/lib/plotting.py b/saqc/lib/plotting.py
index 859653899f935be0786063d156ca1c9aed3e8f3b..9b780d70198b3e6085b6fda21f5524c7d9c619a6 100644
--- a/saqc/lib/plotting.py
+++ b/saqc/lib/plotting.py
@@ -165,11 +165,18 @@ def _plotMultipleVariables(
         ncols += [ncols_rest]

     gs_kw = dict(width_ratios=_layout_data_to_table_ratio)
-    layout = dict(figsize=_figsize, sharex=True, tight_layout=True, squeeze=False, gridspec_kw=gs_kw)
+    layout = dict(
+        figsize=_figsize,
+        sharex=True,
+        tight_layout=True,
+        squeeze=False,
+        gridspec_kw=gs_kw if show_tab else {}
+    )

     # plot max. 4 plots per figure
     allaxs = []
     for n in range(nfig):
+
         fig, axs = plt.subplots(nrows=ncols[n], ncols=2 if show_tab else 1, **layout)

         for ax in axs:
@@ -180,7 +187,7 @@ def _plotMultipleVariables(
                 plot_ax, tab_ax = ax
                 _plotInfoTable(tab_ax, tar, _plotstyle, len(tar["data"]))
             else:
-                plot_ax = ax
+                plot_ax = ax[0]

             _plotFromDicts(plot_ax, tar, _plotstyle)
diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py
index 437e3255f5252a7e4c460272ad454d1c3845d924..30ce15899d4d5c21999f1e686f2ec2bf83598bbb 100644
--- a/saqc/lib/ts_operators.py
+++ b/saqc/lib/ts_operators.py
@@ -6,6 +6,8 @@ The module gathers all kinds of timeseries tranformations.
 """
 import logging

+import re
+
 import pandas as pd
 import numpy as np
 import numba as nb
@@ -163,13 +165,12 @@ def validationTrafo(data, max_nan_total, max_nan_consec):
             return data
         elif _maxConsecutiveNan(np.asarray(data), max_nan_consec):
             data[:] = False
-            return data
         else:
             data[:] = True
-            return data
     else:
         data[:] = True
-        return data
+
+    return data


 def stdQC(data, max_nan_total=np.inf, max_nan_consec=np.inf):
@@ -248,10 +249,8 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolatio
                     return x.interpolate(method=wrap_method, order=int(wrap_order))
                 except (NotImplementedError, ValueError):
                     logger.warning(
-                        "Interpolation with method {} is not supported at order {}. "
-                        "Interpolation will be performed at order {}".format(
-                            method, str(wrap_order), str(wrap_order - 1)
-                        )
+                        f"Interpolation with method {method} is not supported at order {wrap_order} "
+                        f"and will be performed at order {wrap_order - 1}"
                     )
                     return _interpolWrapper(x, int(wrap_order - 1), wrap_method)
             elif x.size < 3:
@@ -269,8 +268,7 @@
             data = data.reindex(pre_index)
     if return_chunk_bounds:
         return data, chunk_bounds
-    else:
-        return data
+    else: return data


 def aggregate2Freq(
@@ -280,6 +278,12 @@
     # Timestamps that have no values projected on them, get "fill_value" assigned. Also,
     # "fill_value" serves as replacement for "invalid" intervals

+    methods = {
+        "nagg": lambda seconds_total: (seconds_total/2, "left", "left"),
+        "bagg": lambda _: (0, "left", "left"),
+        "fagg": lambda _: (0, "right", "right"),
+    }
+
     # filter data for invalid patterns (since filtering is expensive we pre-check if it is demanded)
     if (max_invalid_total is not np.inf) | (max_invalid_consec is not np.inf):
         if pd.isnull(fill_value):
@@ -292,24 +296,8 @@
             )
             data[temp_mask] = fill_value

-    # some timestamp acrobatics to feed pd.resample`s base keyword properly
     seconds_total = pd.Timedelta(freq).total_seconds()
-    freq_string = str(int(seconds_total)) + "s"
-    if method == "nagg":
-        # all values within a grid points range (+/- freq/2, closed to the left) get aggregated with 'agg method'
-        base = seconds_total / 2
-        label = "left"
-        closed = "left"
-    elif method == "bagg":
-        # all values in a sampling interval get aggregated with agg_method and assigned to the last grid point
-        base = 0
-        label = "left"
-        closed = "left"
-    else:
-        # all values in a sampling interval get aggregated with agg_method and assigned to the next grid point
-        base = 0
-        label = "right"
-        closed = "right"
+    base, label, closed = methods[method](seconds_total)

     # In the following, we check for empty intervals outside resample.apply, because:
     # - resample AND groupBy do insert value zero for empty intervals if resampling with any kind of "sum" application -
@@ -317,23 +305,16 @@
     # - we are aggregating data and flags with this function and empty intervals usually would get assigned flagger.BAD
     # flag (where resample inserts np.nan or 0)

-    data_resampler = data.resample(freq_string, base=base, closed=closed, label=label)
+    data_resampler = data.resample(f"{seconds_total:.0f}s", base=base, closed=closed, label=label)
     empty_intervals = data_resampler.count() == 0

     # great performance gain can be achieved, when avoiding .apply and using pd.resampler
     # methods instead. (this covers all the basic func aggregations, such as median, mean, sum, count, ...)
     try:
-        # get rid of nan_prefix attached to numpys nanfuncs ("ignore nan is pointless down here -
-        # resample doesnt pass no nans to the func applied)
-        if agg_func.__name__[:3] == "nan":
-            check_name = agg_func.__name__[3:]
-        else:
-            check_name = agg_func.__name__
-
-        # another nasty special case: if function "count" was passed, we do not want empty intervals to be replaced by nan:
-        if check_name == "count":
+        check_name = re.sub("^nan", "", agg_func.__name__)
+        # a nasty special case: if function "count" was passed, we do not want empty intervals to be replaced by nan:
+        if check_name == 'count':
             empty_intervals[:] = False
-
         data = getattr(data_resampler, check_name)()
     except AttributeError:
         data = data_resampler.apply(agg_func)
@@ -352,26 +333,16 @@ def shift2Freq(data, method, freq, fill_value=np.nan):
     # shift timestamps backwards/forwards in order to allign them with an equidistant
     # frequencie grid.

-    # Shifts
-    if method == "fshift":
-        direction = "ffill"
-        tolerance = pd.Timedelta(freq)
-
-    elif method == "bshift":
-        direction = "bfill"
-        tolerance = pd.Timedelta(freq)
-
-    elif method == "nshift":
-        direction = "nearest"
-        tolerance = pd.Timedelta(freq) / 2
-
-    else:
-        # method == nearest2
-        direction = "nearest"
-        tolerance = pd.Timedelta(freq)
-
+    methods = {
+        "fshift": lambda freq: ("ffill", pd.Timedelta(freq)),
+        "bshift": lambda freq: ("bfill", pd.Timedelta(freq)),
+        "nshift": lambda freq: ("nearest", pd.Timedelta(freq)/2),
+    }
+    direction, tolerance = methods[method](freq)
     target_ind = pd.date_range(
-        start=data.index[0].floor(freq), end=data.index[-1].ceil(freq), freq=freq, name=data.index.name
+        start=data.index[0].floor(freq), end=data.index[-1].ceil(freq),
+        freq=freq,
+        name=data.index.name
     )
     return data.reindex(target_ind, method=direction, tolerance=tolerance, fill_value=fill_value)
diff --git a/test/funcs/test_generic_config_functions.py b/test/funcs/test_generic_config_functions.py
index d4917d29bf966202418f8adaf548fbdb377698f8..4f4f759903f54f11d96a58720500c1a25c037630 100644
--- a/test/funcs/test_generic_config_functions.py
+++ b/test/funcs/test_generic_config_functions.py
@@ -216,7 +216,7 @@ def test_isflagged(data, flagger):

     tests = [
         (f"isflagged({var1})", flagger.isFlagged(var1)),
-        (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD)),
+        (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")),
         (f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")),
         (f"~isflagged({var2})", ~flagger.isFlagged(var2)),
         (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))),
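
A note on the `comparator` changes above (in `register.py`, `functions.py`, and the test): `flagger.isFlagged` compares each stored flag against a reference flag with a configurable operator, and the diff makes that operator explicit. `_dslIsFlagged` now defaults to `">="`, so a value counts as flagged when its flag is at or above the given level, while `_unmaskData` asks for exact equality. A minimal sketch of the two semantics, using made-up numeric flag levels rather than saqc's actual flagger types:

```python
import operator

import pandas as pd

# hypothetical flag levels, ordered by severity (illustrative only)
UNFLAGGED, DOUBTFUL, BAD = 0, 25, 255


def is_flagged(flags: pd.Series, flag: int, comparator: str = ">=") -> pd.Series:
    # ">=" treats anything at or above `flag` as flagged,
    # "==" matches exactly the given flag level
    ops = {">=": operator.ge, ">": operator.gt, "==": operator.eq}
    return ops[comparator](flags, flag)


flags = pd.Series([UNFLAGGED, DOUBTFUL, BAD])
print(is_flagged(flags, DOUBTFUL).tolist())   # [False, True, True]
print(is_flagged(flags, BAD, "==").tolist())  # [False, False, True]
```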
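
The if/elif chains in `aggregate2Freq` and `shift2Freq` are replaced by dispatch dictionaries mapping each method name to a lambda that yields the resampling parameters. One behavioral edge worth noting: the old `else` branches caught any remaining method string (`"fagg"` in `aggregate2Freq`, `"nearest2"` in `shift2Freq`), whereas the dictionaries raise a `KeyError` for unlisted names. A standalone sketch of the pattern as used in `aggregate2Freq`:

```python
import pandas as pd

# method name -> (base, label, closed), as introduced in aggregate2Freq
methods = {
    "nagg": lambda seconds_total: (seconds_total / 2, "left", "left"),
    "bagg": lambda _: (0, "left", "left"),
    "fagg": lambda _: (0, "right", "right"),
}

seconds_total = pd.Timedelta("1h").total_seconds()
base, label, closed = methods["nagg"](seconds_total)
print(base, label, closed)  # 1800.0 left left

# unknown method names now fail loudly instead of falling into an else branch
try:
    methods["nearest2"](seconds_total)
except KeyError as err:
    print("unsupported method:", err)
```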
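
Finally, the nan-prefix handling in `aggregate2Freq` collapses into a single `re.sub` call: stripping a leading `"nan"` from `agg_func.__name__` yields the name of the matching resampler method (resample does not pass NaNs to the applied function anyway, as the removed comment notes), and the `except AttributeError` branch keeps `.apply(agg_func)` as the fallback for functions without a built-in counterpart. A self-contained illustration with made-up sample data; `span` is a hypothetical aggregation used only to trigger the fallback path:

```python
import re

import numpy as np
import pandas as pd

data = pd.Series(
    [1.0, np.nan, 3.0, 4.0],
    index=pd.date_range("2020-01-01", periods=4, freq="30min"),
)


def span(x):
    # no Resampler method of this name exists -> exercises the fallback
    return x.max() - x.min()


for agg_func in (np.nanmean, span):
    # "nanmean" -> "mean"; "span" stays "span"
    check_name = re.sub("^nan", "", agg_func.__name__)
    resampler = data.resample("1h")
    try:
        data_agg = getattr(resampler, check_name)()  # fast built-in path
    except AttributeError:
        data_agg = resampler.apply(agg_func)         # generic fallback
    print(check_name, data_agg.tolist())
```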