diff --git a/requirements.txt b/requirements.txt index adf1d2efa9fa81a635b7fe920b4024a62c283263..81ed970110b012bbd43c6fa3b5937f575ebec526 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,34 +1,34 @@ attrs==19.3.0 -Click==7.0 +Click==7.1.2 cycler==0.10.0 dtw==1.4.0 -importlib-metadata==1.5.0 -joblib==0.14.1 -kiwisolver==1.1.0 +kiwisolver==1.2.0 +importlib-metadata==1.7.0 +joblib==0.16.0 llvmlite==0.31.0 -matplotlib==3.1.3 -mlxtend==0.17.2 -more-itertools==8.2.0 +mlxtend==0.17.3 +matplotlib==3.3.0 +more-itertools==8.4.0 numba==0.48.0 -numpy==1.18.1 +numpy==1.19.1 outlier==0.2 utils==1.0.1 outlier-utils==0.0.3 -packaging==20.1 +packaging==20.4 pandas==1.0.1 pluggy==0.13.1 -py==1.8.1 -pyarrow==0.16.0 -pyparsing==2.4.6 +pyparsing==2.4.7 +py==1.9.0 +pyarrow==1.0.0 pytest-lazy-fixture==0.6.3 -pytest==5.3.5 +pytest==6.0.1 python-dateutil==2.8.1 -python-intervals==1.10.0 -pytz==2019.3 +python-intervals==1.10.0.post1 +pytz==2020.1 PyWavelets==1.1.1 -scikit-learn==0.22.1 -scipy==1.4.1 -six==1.14.0 -wcwidth==0.1.8 zipp==3.1.0 +wcwidth==0.2.5 +scipy==1.5.2 +scikit-learn==0.23.1 +six==1.15.0 astor==0.8.1 diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py index efa39136fe8acbc94174506b7c171453b8c3870a..4ae400486f611aed5548ffd89ccd40da9798d066 100644 --- a/saqc/funcs/functions.py +++ b/saqc/funcs/functions.py @@ -18,12 +18,10 @@ from dios import DictOfSeries from typing import Any -def _dslIsFlagged(flagger, var, flag=None, comparator=None): +def _dslIsFlagged(flagger, var, flag=None, comparator=">="): """ helper function for `flagGeneric` """ - if comparator is None: - return flagger.isFlagged(var.name, flag=flag) return flagger.isFlagged(var.name, flag=flag, comparator=comparator) @@ -441,13 +439,25 @@ range_dict.keys() @register def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat=np.median, **kwargs): - val_frame = data.loc[data.index_of("shared")].to_df() - try: - stat = getattr(val_frame, cross_stat.__name__)(axis=1) - except AttributeError: - stat = val_frame.aggregate(cross_stat, axis=1) - diff_scores = val_frame.subtract(stat, axis=0).abs() - diff_scores = diff_scores > thresh + df = data[fields].loc[data[fields].index_of('shared')].to_df() + + if isinstance(cross_stat, str): + if cross_stat == 'modZscore': + MAD_series = df.subtract(df.median(axis=1), axis=0).abs().median(axis=1) + diff_scores = ((0.6745 * (df.subtract(df.median(axis=1), axis=0))).divide(MAD_series, axis=0)).abs() + elif cross_stat == 'Zscore': + diff_scores = (df.subtract(df.mean(axis=1), axis=0)).divide(df.std(axis=1), axis=0).abs() + else: + raise ValueError(cross_stat) + else: + try: + stat = getattr(df, cross_stat.__name__)(axis=1) + except AttributeError: + stat = df.aggregate(cross_stat, axis=1) + diff_scores = df.subtract(stat, axis=0).abs() + + mask = diff_scores > thresh for var in fields: - flagger = flagger.setFlags(var, diff_scores[var].values, **kwargs) + flagger = flagger.setFlags(var, mask[var], **kwargs) + return data, flagger diff --git a/saqc/funcs/harm_functions.py b/saqc/funcs/harm_functions.py index e56cfd06e80651fc3f502e86dc0c9bc63f4b2f50..37f745c2a92011ea9be02f86dc82d5326fbeffec 100644 --- a/saqc/funcs/harm_functions.py +++ b/saqc/funcs/harm_functions.py @@ -19,7 +19,6 @@ from saqc.funcs.proc_functions import ( logger = logging.getLogger("SaQC") - @register def harm_shift2Grid(data, field, flagger, freq, method="nshift", drop_flags=None, **kwargs): diff --git a/saqc/lib/plotting.py b/saqc/lib/plotting.py index 3c8c83c0d0b10d850f445385ec0ff02f6ea24926..859653899f935be0786063d156ca1c9aed3e8f3b 100644 --- a/saqc/lib/plotting.py +++ b/saqc/lib/plotting.py @@ -18,9 +18,7 @@ def __importHelper(): # needed for datetime conversion register_matplotlib_converters() - if _interactive: - mpl.use("TkAgg") - else: + if not _interactive: # Import plot libs without interactivity, if not needed. # This ensures that we can produce an plot.png even if # tkinter is not installed. E.g. if one want to run this diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py index 437e3255f5252a7e4c460272ad454d1c3845d924..30ce15899d4d5c21999f1e686f2ec2bf83598bbb 100644 --- a/saqc/lib/ts_operators.py +++ b/saqc/lib/ts_operators.py @@ -6,6 +6,8 @@ The module gathers all kinds of timeseries tranformations. """ import logging +import re + import pandas as pd import numpy as np import numba as nb @@ -163,13 +165,12 @@ def validationTrafo(data, max_nan_total, max_nan_consec): return data elif _maxConsecutiveNan(np.asarray(data), max_nan_consec): data[:] = False - return data else: data[:] = True - return data else: data[:] = True - return data + + return data def stdQC(data, max_nan_total=np.inf, max_nan_consec=np.inf): @@ -248,10 +249,8 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolatio return x.interpolate(method=wrap_method, order=int(wrap_order)) except (NotImplementedError, ValueError): logger.warning( - "Interpolation with method {} is not supported at order {}. " - "Interpolation will be performed at order {}".format( - method, str(wrap_order), str(wrap_order - 1) - ) + f"Interpolation with method {method} is not supported at order {wrap_order}. " + f"and will be performed at order {wrap_order-1}" ) return _interpolWrapper(x, int(wrap_order - 1), wrap_method) elif x.size < 3: @@ -269,8 +268,7 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolatio data = data.reindex(pre_index) if return_chunk_bounds: return data, chunk_bounds - else: - return data + else: return data def aggregate2Freq( @@ -280,6 +278,12 @@ def aggregate2Freq( # Timestamps that have no values projected on them, get "fill_value" assigned. Also, # "fill_value" serves as replacement for "invalid" intervals + methods = { + "nagg": lambda seconds_total: (seconds_total/2, "left", "left"), + "bagg": lambda _: (0, "left", "left"), + "fagg": lambda _: (0, "right", "right"), + } + # filter data for invalid patterns (since filtering is expensive we pre-check if it is demanded) if (max_invalid_total is not np.inf) | (max_invalid_consec is not np.inf): if pd.isnull(fill_value): @@ -292,24 +296,8 @@ def aggregate2Freq( ) data[temp_mask] = fill_value - # some timestamp acrobatics to feed pd.resample`s base keyword properly seconds_total = pd.Timedelta(freq).total_seconds() - freq_string = str(int(seconds_total)) + "s" - if method == "nagg": - # all values within a grid points range (+/- freq/2, closed to the left) get aggregated with 'agg method' - base = seconds_total / 2 - label = "left" - closed = "left" - elif method == "bagg": - # all values in a sampling interval get aggregated with agg_method and assigned to the last grid point - base = 0 - label = "left" - closed = "left" - else: - # all values in a sampling interval get aggregated with agg_method and assigned to the next grid point - base = 0 - label = "right" - closed = "right" + base, label, closed = methods[method](seconds_total) # In the following, we check for empty intervals outside resample.apply, because: # - resample AND groupBy do insert value zero for empty intervals if resampling with any kind of "sum" application - @@ -317,23 +305,16 @@ def aggregate2Freq( # - we are aggregating data and flags with this function and empty intervals usually would get assigned flagger.BAD # flag (where resample inserts np.nan or 0) - data_resampler = data.resample(freq_string, base=base, closed=closed, label=label) + data_resampler = data.resample(f"{seconds_total:.0f}s", base=base, closed=closed, label=label) empty_intervals = data_resampler.count() == 0 # great performance gain can be achieved, when avoiding .apply and using pd.resampler # methods instead. (this covers all the basic func aggregations, such as median, mean, sum, count, ...) try: - # get rid of nan_prefix attached to numpys nanfuncs ("ignore nan is pointless down here - - # resample doesnt pass no nans to the func applied) - if agg_func.__name__[:3] == "nan": - check_name = agg_func.__name__[3:] - else: - check_name = agg_func.__name__ - - # another nasty special case: if function "count" was passed, we not want empty intervals to be replaced by nan: - if check_name == "count": + check_name = re.sub("^nan", "", agg_func.__name__) + # a nasty special case: if function "count" was passed, we not want empty intervals to be replaced by nan: + if check_name == 'count': empty_intervals[:] = False - data = getattr(data_resampler, check_name)() except AttributeError: data = data_resampler.apply(agg_func) @@ -352,26 +333,16 @@ def shift2Freq(data, method, freq, fill_value=np.nan): # shift timestamps backwards/forwards in order to allign them with an equidistant # frequencie grid. - # Shifts - if method == "fshift": - direction = "ffill" - tolerance = pd.Timedelta(freq) - - elif method == "bshift": - direction = "bfill" - tolerance = pd.Timedelta(freq) - - elif method == "nshift": - direction = "nearest" - tolerance = pd.Timedelta(freq) / 2 - - else: - # method == nearest2 - direction = "nearest" - tolerance = pd.Timedelta(freq) - + methods = { + "fshift": lambda freq: ("ffill", pd.Timedelta(freq)), + "bshift": lambda freq: ("bfill", pd.Timedelta(freq)), + "nshift": lambda freq: ("nearest", pd.Timedelta(freq)/2), + } + direction, tolerance = methods[method](freq) target_ind = pd.date_range( - start=data.index[0].floor(freq), end=data.index[-1].ceil(freq), freq=freq, name=data.index.name + start=data.index[0].floor(freq), end=data.index[-1].ceil(freq), + freq=freq, + name=data.index.name ) return data.reindex(target_ind, method=direction, tolerance=tolerance, fill_value=fill_value) diff --git a/test/funcs/test_generic_config_functions.py b/test/funcs/test_generic_config_functions.py index d4917d29bf966202418f8adaf548fbdb377698f8..4f4f759903f54f11d96a58720500c1a25c037630 100644 --- a/test/funcs/test_generic_config_functions.py +++ b/test/funcs/test_generic_config_functions.py @@ -216,7 +216,7 @@ def test_isflagged(data, flagger): tests = [ (f"isflagged({var1})", flagger.isFlagged(var1)), - (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD)), + (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")), (f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")), (f"~isflagged({var2})", ~flagger.isFlagged(var2)), (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))),