Skip to content
Snippets Groups Projects
Commit bacef79f authored by David Schäfer
Browse files

Merge branch 'develop' into 'dependabot/pip/zipp-3.1.0'

# Conflicts:
#   requirements.txt
parents d09d9313 2117d4f9
No related branches found
No related tags found
1 merge request !76: Bump zipp from 2.2.0 to 3.1.0
Pipeline #6083 passed with stage
in 9 minutes and 11 seconds
attrs==19.3.0 attrs==19.3.0
Click==7.0 Click==7.1.2
cycler==0.10.0 cycler==0.10.0
dtw==1.4.0 dtw==1.4.0
importlib-metadata==1.5.0 kiwisolver==1.2.0
joblib==0.14.1 importlib-metadata==1.7.0
kiwisolver==1.1.0 joblib==0.16.0
llvmlite==0.31.0 llvmlite==0.31.0
matplotlib==3.1.3 mlxtend==0.17.3
mlxtend==0.17.2 matplotlib==3.3.0
more-itertools==8.2.0 more-itertools==8.4.0
numba==0.48.0 numba==0.48.0
numpy==1.18.1 numpy==1.19.1
outlier==0.2 outlier==0.2
utils==1.0.1 utils==1.0.1
outlier-utils==0.0.3 outlier-utils==0.0.3
packaging==20.1 packaging==20.4
pandas==1.0.1 pandas==1.0.1
pluggy==0.13.1 pluggy==0.13.1
py==1.8.1 pyparsing==2.4.7
pyarrow==0.16.0 py==1.9.0
pyparsing==2.4.6 pyarrow==1.0.0
pytest-lazy-fixture==0.6.3 pytest-lazy-fixture==0.6.3
pytest==5.3.5 pytest==6.0.1
python-dateutil==2.8.1 python-dateutil==2.8.1
python-intervals==1.10.0 python-intervals==1.10.0.post1
pytz==2019.3 pytz==2020.1
PyWavelets==1.1.1 PyWavelets==1.1.1
scikit-learn==0.22.1
scipy==1.4.1
six==1.14.0
wcwidth==0.1.8
zipp==3.1.0 zipp==3.1.0
wcwidth==0.2.5
scipy==1.5.2
scikit-learn==0.23.1
six==1.15.0
astor==0.8.1 astor==0.8.1
...@@ -18,12 +18,10 @@ from dios import DictOfSeries ...@@ -18,12 +18,10 @@ from dios import DictOfSeries
from typing import Any from typing import Any
def _dslIsFlagged(flagger, var, flag=None, comparator=None): def _dslIsFlagged(flagger, var, flag=None, comparator=">="):
""" """
helper function for `flagGeneric` helper function for `flagGeneric`
""" """
if comparator is None:
return flagger.isFlagged(var.name, flag=flag)
return flagger.isFlagged(var.name, flag=flag, comparator=comparator) return flagger.isFlagged(var.name, flag=flag, comparator=comparator)
...@@ -441,13 +439,25 @@ range_dict.keys() ...@@ -441,13 +439,25 @@ range_dict.keys()
@register
def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat=np.median, **kwargs):
    """
    Flag timestamps whose cross-variable deviation score exceeds ``thresh``.

    The variables listed in ``fields`` are aligned on their shared timestamps
    and, per timestamp (row), a deviation score is computed for every variable:

    * ``cross_stat='modZscore'`` -- modified Z-score: 0.6745 * (value - row
      median) scaled by the row median absolute deviation
    * ``cross_stat='Zscore'`` -- classic Z-score: (value - row mean) scaled by
      the row standard deviation
    * any callable (default ``np.median``) -- absolute deviation of the value
      from the row statistic computed by the callable

    :param data: container of the input series (project type; assumed to
                 support ``data[fields]``, ``index_of('shared')`` and
                 ``to_df()`` -- TODO confirm against DictOfSeries)
    :param field: unused here; kept for the generic registration interface
    :param flagger: project flagger object providing ``setFlags``
    :param fields: variable names to score against each other
    :param thresh: scalar threshold on the deviation scores
    :param cross_stat: 'modZscore', 'Zscore' or a row-wise statistic callable
    :return: the (unchanged) data and the updated flagger
    :raises ValueError: if ``cross_stat`` is an unrecognized string
    """
    # restrict scoring to timestamps shared by all requested fields
    df = data[fields].loc[data[fields].index_of('shared')].to_df()

    if isinstance(cross_stat, str):
        if cross_stat == 'modZscore':
            # row-wise median absolute deviation
            MAD_series = df.subtract(df.median(axis=1), axis=0).abs().median(axis=1)
            diff_scores = ((0.6745 * (df.subtract(df.median(axis=1), axis=0))).divide(MAD_series, axis=0)).abs()
        elif cross_stat == 'Zscore':
            diff_scores = (df.subtract(df.mean(axis=1), axis=0)).divide(df.std(axis=1), axis=0).abs()
        else:
            raise ValueError(cross_stat)
    else:
        try:
            # prefer the pandas built-in of the same name (fast C path) ...
            stat = getattr(df, cross_stat.__name__)(axis=1)
        except AttributeError:
            # ... and fall back to a generic row-wise aggregation
            stat = df.aggregate(cross_stat, axis=1)
        diff_scores = df.subtract(stat, axis=0).abs()

    mask = diff_scores > thresh
    for var in fields:
        flagger = flagger.setFlags(var, mask[var], **kwargs)
    return data, flagger
...@@ -19,7 +19,6 @@ from saqc.funcs.proc_functions import ( ...@@ -19,7 +19,6 @@ from saqc.funcs.proc_functions import (
logger = logging.getLogger("SaQC") logger = logging.getLogger("SaQC")
@register @register
def harm_shift2Grid(data, field, flagger, freq, method="nshift", drop_flags=None, **kwargs): def harm_shift2Grid(data, field, flagger, freq, method="nshift", drop_flags=None, **kwargs):
......
...@@ -18,9 +18,7 @@ def __importHelper(): ...@@ -18,9 +18,7 @@ def __importHelper():
# needed for datetime conversion # needed for datetime conversion
register_matplotlib_converters() register_matplotlib_converters()
if _interactive: if not _interactive:
mpl.use("TkAgg")
else:
# Import plot libs without interactivity, if not needed. # Import plot libs without interactivity, if not needed.
# This ensures that we can produce an plot.png even if # This ensures that we can produce an plot.png even if
# tkinter is not installed. E.g. if one want to run this # tkinter is not installed. E.g. if one want to run this
......
...@@ -6,6 +6,8 @@ The module gathers all kinds of timeseries tranformations. ...@@ -6,6 +6,8 @@ The module gathers all kinds of timeseries tranformations.
""" """
import logging import logging
import re
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import numba as nb import numba as nb
...@@ -163,13 +165,12 @@ def validationTrafo(data, max_nan_total, max_nan_consec): ...@@ -163,13 +165,12 @@ def validationTrafo(data, max_nan_total, max_nan_consec):
return data return data
elif _maxConsecutiveNan(np.asarray(data), max_nan_consec): elif _maxConsecutiveNan(np.asarray(data), max_nan_consec):
data[:] = False data[:] = False
return data
else: else:
data[:] = True data[:] = True
return data
else: else:
data[:] = True data[:] = True
return data
return data
def stdQC(data, max_nan_total=np.inf, max_nan_consec=np.inf): def stdQC(data, max_nan_total=np.inf, max_nan_consec=np.inf):
...@@ -248,10 +249,8 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolatio ...@@ -248,10 +249,8 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolatio
return x.interpolate(method=wrap_method, order=int(wrap_order)) return x.interpolate(method=wrap_method, order=int(wrap_order))
except (NotImplementedError, ValueError): except (NotImplementedError, ValueError):
logger.warning( logger.warning(
"Interpolation with method {} is not supported at order {}. " f"Interpolation with method {method} is not supported at order {wrap_order}. "
"Interpolation will be performed at order {}".format( f"and will be performed at order {wrap_order-1}"
method, str(wrap_order), str(wrap_order - 1)
)
) )
return _interpolWrapper(x, int(wrap_order - 1), wrap_method) return _interpolWrapper(x, int(wrap_order - 1), wrap_method)
elif x.size < 3: elif x.size < 3:
...@@ -269,8 +268,7 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolatio ...@@ -269,8 +268,7 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolatio
data = data.reindex(pre_index) data = data.reindex(pre_index)
if return_chunk_bounds: if return_chunk_bounds:
return data, chunk_bounds return data, chunk_bounds
else: else: return data
return data
def aggregate2Freq( def aggregate2Freq(
...@@ -280,6 +278,12 @@ def aggregate2Freq( ...@@ -280,6 +278,12 @@ def aggregate2Freq(
# Timestamps that have no values projected on them, get "fill_value" assigned. Also, # Timestamps that have no values projected on them, get "fill_value" assigned. Also,
# "fill_value" serves as replacement for "invalid" intervals # "fill_value" serves as replacement for "invalid" intervals
methods = {
"nagg": lambda seconds_total: (seconds_total/2, "left", "left"),
"bagg": lambda _: (0, "left", "left"),
"fagg": lambda _: (0, "right", "right"),
}
# filter data for invalid patterns (since filtering is expensive we pre-check if it is demanded) # filter data for invalid patterns (since filtering is expensive we pre-check if it is demanded)
if (max_invalid_total is not np.inf) | (max_invalid_consec is not np.inf): if (max_invalid_total is not np.inf) | (max_invalid_consec is not np.inf):
if pd.isnull(fill_value): if pd.isnull(fill_value):
...@@ -292,24 +296,8 @@ def aggregate2Freq( ...@@ -292,24 +296,8 @@ def aggregate2Freq(
) )
data[temp_mask] = fill_value data[temp_mask] = fill_value
# some timestamp acrobatics to feed pd.resample`s base keyword properly
seconds_total = pd.Timedelta(freq).total_seconds() seconds_total = pd.Timedelta(freq).total_seconds()
freq_string = str(int(seconds_total)) + "s" base, label, closed = methods[method](seconds_total)
if method == "nagg":
# all values within a grid points range (+/- freq/2, closed to the left) get aggregated with 'agg method'
base = seconds_total / 2
label = "left"
closed = "left"
elif method == "bagg":
# all values in a sampling interval get aggregated with agg_method and assigned to the last grid point
base = 0
label = "left"
closed = "left"
else:
# all values in a sampling interval get aggregated with agg_method and assigned to the next grid point
base = 0
label = "right"
closed = "right"
# In the following, we check for empty intervals outside resample.apply, because: # In the following, we check for empty intervals outside resample.apply, because:
# - resample AND groupBy do insert value zero for empty intervals if resampling with any kind of "sum" application - # - resample AND groupBy do insert value zero for empty intervals if resampling with any kind of "sum" application -
...@@ -317,23 +305,16 @@ def aggregate2Freq( ...@@ -317,23 +305,16 @@ def aggregate2Freq(
# - we are aggregating data and flags with this function and empty intervals usually would get assigned flagger.BAD # - we are aggregating data and flags with this function and empty intervals usually would get assigned flagger.BAD
# flag (where resample inserts np.nan or 0) # flag (where resample inserts np.nan or 0)
data_resampler = data.resample(freq_string, base=base, closed=closed, label=label) data_resampler = data.resample(f"{seconds_total:.0f}s", base=base, closed=closed, label=label)
empty_intervals = data_resampler.count() == 0 empty_intervals = data_resampler.count() == 0
# great performance gain can be achieved, when avoiding .apply and using pd.resampler # great performance gain can be achieved, when avoiding .apply and using pd.resampler
# methods instead. (this covers all the basic func aggregations, such as median, mean, sum, count, ...) # methods instead. (this covers all the basic func aggregations, such as median, mean, sum, count, ...)
try: try:
# get rid of nan_prefix attached to numpys nanfuncs ("ignore nan is pointless down here - check_name = re.sub("^nan", "", agg_func.__name__)
# resample doesnt pass no nans to the func applied) # a nasty special case: if function "count" was passed, we not want empty intervals to be replaced by nan:
if agg_func.__name__[:3] == "nan": if check_name == 'count':
check_name = agg_func.__name__[3:]
else:
check_name = agg_func.__name__
# another nasty special case: if function "count" was passed, we not want empty intervals to be replaced by nan:
if check_name == "count":
empty_intervals[:] = False empty_intervals[:] = False
data = getattr(data_resampler, check_name)() data = getattr(data_resampler, check_name)()
except AttributeError: except AttributeError:
data = data_resampler.apply(agg_func) data = data_resampler.apply(agg_func)
def shift2Freq(data, method, freq, fill_value=np.nan):
    """
    Shift timestamps backwards/forwards to align them with an equidistant
    frequency grid.

    :param data: pd.Series with a datetime index
    :param method: one of
        * "fshift": each grid point takes the last value at or before it
          (within one ``freq`` interval)
        * "bshift": each grid point takes the first value at or after it
          (within one ``freq`` interval)
        * "nshift": each grid point takes the nearest value within ``freq/2``
    :param freq: frequency string/offset defining the target grid
    :param fill_value: value assigned to grid points with no source value
        within tolerance (default NaN)
    :return: pd.Series reindexed onto the frequency grid
    :raises KeyError: if ``method`` is not one of the supported shift methods
    """
    # dispatch table: shift method -> (reindex direction, match tolerance)
    methods = {
        "fshift": lambda freq: ("ffill", pd.Timedelta(freq)),
        "bshift": lambda freq: ("bfill", pd.Timedelta(freq)),
        "nshift": lambda freq: ("nearest", pd.Timedelta(freq) / 2),
    }
    direction, tolerance = methods[method](freq)
    # the grid spans the floored/ceiled extent of the original index
    target_ind = pd.date_range(
        start=data.index[0].floor(freq),
        end=data.index[-1].ceil(freq),
        freq=freq,
        name=data.index.name,
    )
    return data.reindex(target_ind, method=direction, tolerance=tolerance, fill_value=fill_value)
......
...@@ -216,7 +216,7 @@ def test_isflagged(data, flagger): ...@@ -216,7 +216,7 @@ def test_isflagged(data, flagger):
tests = [ tests = [
(f"isflagged({var1})", flagger.isFlagged(var1)), (f"isflagged({var1})", flagger.isFlagged(var1)),
(f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD)), (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")),
(f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")), (f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")),
(f"~isflagged({var2})", ~flagger.isFlagged(var2)), (f"~isflagged({var2})", ~flagger.isFlagged(var2)),
(f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))), (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))),
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment