Commit bacef79f authored by David Schäfer

Merge branch 'develop' into 'dependabot/pip/zipp-3.1.0'

# Conflicts:
#   requirements.txt
parents d09d9313 2117d4f9
Merge request !76: Bump zipp from 2.2.0 to 3.1.0
Pipeline #6083 passed in 9 minutes and 11 seconds
requirements.txt:
 attrs==19.3.0
-Click==7.0
+Click==7.1.2
 cycler==0.10.0
 dtw==1.4.0
-importlib-metadata==1.5.0
-joblib==0.14.1
-kiwisolver==1.1.0
+kiwisolver==1.2.0
+importlib-metadata==1.7.0
+joblib==0.16.0
 llvmlite==0.31.0
-matplotlib==3.1.3
-mlxtend==0.17.2
-more-itertools==8.2.0
+mlxtend==0.17.3
+matplotlib==3.3.0
+more-itertools==8.4.0
 numba==0.48.0
-numpy==1.18.1
+numpy==1.19.1
-outlier==0.2
-utils==1.0.1
+outlier-utils==0.0.3
-packaging==20.1
+packaging==20.4
 pandas==1.0.1
 pluggy==0.13.1
-py==1.8.1
-pyarrow==0.16.0
-pyparsing==2.4.6
+pyparsing==2.4.7
+py==1.9.0
+pyarrow==1.0.0
 pytest-lazy-fixture==0.6.3
-pytest==5.3.5
+pytest==6.0.1
 python-dateutil==2.8.1
-python-intervals==1.10.0
-pytz==2019.3
+python-intervals==1.10.0.post1
+pytz==2020.1
 PyWavelets==1.1.1
-scikit-learn==0.22.1
-scipy==1.4.1
-six==1.14.0
-wcwidth==0.1.8
 zipp==3.1.0
+wcwidth==0.2.5
+scipy==1.5.2
+scikit-learn==0.23.1
+six==1.15.0
+astor==0.8.1
@@ -18,12 +18,10 @@ from dios import DictOfSeries
 from typing import Any

-def _dslIsFlagged(flagger, var, flag=None, comparator=None):
+def _dslIsFlagged(flagger, var, flag=None, comparator=">="):
     """
     helper function for `flagGeneric`
     """
-    if comparator is None:
-        return flagger.isFlagged(var.name, flag=flag)
     return flagger.isFlagged(var.name, flag=flag, comparator=comparator)
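With the new default, every call routes through flagger.isFlagged with an explicit comparator, so flagging reduces to an ordered comparison of flag values. A minimal sketch of those semantics on plain pandas objects (illustrative names, not the SaQC flagger API):

import operator
import pandas as pd

# hypothetical stand-in for flagger.isFlagged: flags are ordered values,
# and the comparator string selects the elementwise test
_OPS = {"==": operator.eq, ">=": operator.ge, ">": operator.gt,
        "<=": operator.le, "<": operator.lt}

def is_flagged(flags: pd.Series, flag: float, comparator: str = ">=") -> pd.Series:
    return _OPS[comparator](flags, flag)

flags = pd.Series([0.0, 255.0, 10.0])
print(is_flagged(flags, flag=10.0))                  # default ">=": [False, True, True]
print(is_flagged(flags, flag=0.0, comparator="=="))  # matches only entries equal to 0.0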
@@ -441,13 +439,25 @@ range_dict.keys()
 @register
 def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat=np.median, **kwargs):
-    val_frame = data.loc[data.index_of("shared")].to_df()
-    try:
-        stat = getattr(val_frame, cross_stat.__name__)(axis=1)
-    except AttributeError:
-        stat = val_frame.aggregate(cross_stat, axis=1)
-    diff_scores = val_frame.subtract(stat, axis=0).abs()
-    diff_scores = diff_scores > thresh
+    df = data[fields].loc[data[fields].index_of('shared')].to_df()
+    if isinstance(cross_stat, str):
+        if cross_stat == 'modZscore':
+            MAD_series = df.subtract(df.median(axis=1), axis=0).abs().median(axis=1)
+            diff_scores = ((0.6745 * (df.subtract(df.median(axis=1), axis=0))).divide(MAD_series, axis=0)).abs()
+        elif cross_stat == 'Zscore':
+            diff_scores = (df.subtract(df.mean(axis=1), axis=0)).divide(df.std(axis=1), axis=0).abs()
+        else:
+            raise ValueError(cross_stat)
+    else:
+        try:
+            stat = getattr(df, cross_stat.__name__)(axis=1)
+        except AttributeError:
+            stat = df.aggregate(cross_stat, axis=1)
+        diff_scores = df.subtract(stat, axis=0).abs()
+    mask = diff_scores > thresh
     for var in fields:
-        flagger = flagger.setFlags(var, diff_scores[var].values, **kwargs)
+        flagger = flagger.setFlags(var, mask[var], **kwargs)
     return data, flagger
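The two new string options implement classic cross-variable scores: 'Zscore' measures each value's distance from the row mean in units of the row standard deviation, while 'modZscore' uses the median and the MAD, with 0.6745 rescaling the MAD to be comparable to a standard deviation under normality (the Iglewicz-Hoaglin modified Z-score). A self-contained sketch of the same computation on toy data (plain pandas, not SaQC objects):

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [1.1, 2.1, 9.0], "c": [0.9, 1.9, 3.1]})

# modified Z-score across variables at each timestamp (row-wise)
med = df.median(axis=1)
mad = df.subtract(med, axis=0).abs().median(axis=1)
mod_z = (0.6745 * df.subtract(med, axis=0)).divide(mad, axis=0).abs()

# plain Z-score across variables
z = df.subtract(df.mean(axis=1), axis=0).divide(df.std(axis=1), axis=0).abs()

print(mod_z > 3.5)  # 3.5 is the cutoff Iglewicz & Hoaglin suggest for the modified score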
@@ -19,7 +19,6 @@ from saqc.funcs.proc_functions import (
 logger = logging.getLogger("SaQC")

 @register
 def harm_shift2Grid(data, field, flagger, freq, method="nshift", drop_flags=None, **kwargs):
......
@@ -18,9 +18,7 @@ def __importHelper():
     # needed for datetime conversion
     register_matplotlib_converters()

-    if _interactive:
-        mpl.use("TkAgg")
-    else:
+    if not _interactive:
         # Import plot libs without interactivity, if not needed.
         # This ensures that we can produce a plot.png even if
         # tkinter is not installed, e.g. if one wants to run this
......
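The change drops the explicit TkAgg selection and only forces a backend when no GUI is needed; a minimal standalone sketch of that pattern (the _interactive flag is this module's own, "Agg" is the usual non-interactive choice and an assumption here, since the forced backend name is outside the shown hunk):

import matplotlib as mpl

_interactive = False  # e.g. derived from a --plot command line switch

if not _interactive:
    # "Agg" renders to raster buffers only, so a plot.png can be written
    # even when tkinter (and thus TkAgg) is not installed
    mpl.use("Agg")

import matplotlib.pyplot as plt  # import pyplot only after the backend is fixed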
@@ -6,6 +6,8 @@ The module gathers all kinds of timeseries transformations.
 """
 import logging
+import re

 import pandas as pd
 import numpy as np
 import numba as nb
@@ -163,13 +165,12 @@ def validationTrafo(data, max_nan_total, max_nan_consec):
             return data
         elif _maxConsecutiveNan(np.asarray(data), max_nan_consec):
             data[:] = False
-            return data
         else:
             data[:] = True
-            return data
     else:
         data[:] = True
-        return data
+
+    return data
def stdQC(data, max_nan_total=np.inf, max_nan_consec=np.inf):
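The branch logic above hinges on _maxConsecutiveNan, whose body is outside this diff; a hedged sketch of what such a helper typically computes, assuming data is a boolean array in which False marks a NaN (illustrative, not the SaQC original):

import numba as nb
import numpy as np

@nb.njit
def _max_consecutive_nan_exceeds(arr, max_consec):
    # walk the array once, tracking the current run of False (= NaN) entries;
    # report True as soon as a run grows longer than max_consec
    run = 0
    for ok in arr:
        run = 0 if ok else run + 1
        if run > max_consec:
            return True
    return False

print(_max_consecutive_nan_exceeds(np.array([True, False, False, True]), 1))  # True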
@@ -248,10 +249,8 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolation
             return x.interpolate(method=wrap_method, order=int(wrap_order))
         except (NotImplementedError, ValueError):
             logger.warning(
-                "Interpolation with method {} is not supported at order {}. "
-                "Interpolation will be performed at order {}".format(
-                    method, str(wrap_order), str(wrap_order - 1)
-                )
+                f"Interpolation with method {method} is not supported at order {wrap_order} "
+                f"and will be performed at order {wrap_order - 1}"
             )
             return _interpolWrapper(x, int(wrap_order - 1), wrap_method)
     elif x.size < 3:
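The rewritten warning sits inside a retry wrapper that steps the interpolation order down until scipy accepts it; a standalone sketch of that downgrade pattern (assumed names, not the SaQC original):

import numpy as np
import pandas as pd

def interpolate_with_downgrade(x: pd.Series, method: str = "spline", order: int = 3) -> pd.Series:
    # try the requested order first, then step down until the
    # underlying scipy routine stops rejecting it
    while order > 1:
        try:
            return x.interpolate(method=method, order=order)
        except (NotImplementedError, ValueError):
            order -= 1
    return x.interpolate(method=method, order=1)

s = pd.Series([0.0, np.nan, np.nan, 3.0, 4.0])
print(interpolate_with_downgrade(s, method="polynomial", order=2))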
@@ -269,8 +268,7 @@ def interpolateNANs(data, method, order=2, inter_limit=2, downgrade_interpolation
     data = data.reindex(pre_index)
     if return_chunk_bounds:
         return data, chunk_bounds
-    else:
-        return data
+    else: return data
def aggregate2Freq(
@@ -280,6 +278,12 @@ def aggregate2Freq(
     # Timestamps that have no values projected on them, get "fill_value" assigned. Also,
     # "fill_value" serves as replacement for "invalid" intervals
+    methods = {
+        "nagg": lambda seconds_total: (seconds_total/2, "left", "left"),
+        "bagg": lambda _: (0, "left", "left"),
+        "fagg": lambda _: (0, "right", "right"),
+    }

     # filter data for invalid patterns (since filtering is expensive we pre-check if it is demanded)
     if (max_invalid_total is not np.inf) | (max_invalid_consec is not np.inf):
         if pd.isnull(fill_value):
@@ -292,24 +296,8 @@
             )
         data[temp_mask] = fill_value

     # some timestamp acrobatics to feed pd.resample`s base keyword properly
     seconds_total = pd.Timedelta(freq).total_seconds()
-    freq_string = str(int(seconds_total)) + "s"
-    if method == "nagg":
-        # all values within a grid points range (+/- freq/2, closed to the left) get aggregated with 'agg method'
-        base = seconds_total / 2
-        label = "left"
-        closed = "left"
-    elif method == "bagg":
-        # all values in a sampling interval get aggregated with agg_method and assigned to the last grid point
-        base = 0
-        label = "left"
-        closed = "left"
-    else:
-        # all values in a sampling interval get aggregated with agg_method and assigned to the next grid point
-        base = 0
-        label = "right"
-        closed = "right"
+    base, label, closed = methods[method](seconds_total)
     # In the following, we check for empty intervals outside resample.apply, because:
     # - resample AND groupBy do insert value zero for empty intervals if resampling with any kind of "sum" application -
@@ -317,23 +305,16 @@
     # - we are aggregating data and flags with this function and empty intervals usually would get assigned flagger.BAD
     #   flag (where resample inserts np.nan or 0)
-    data_resampler = data.resample(freq_string, base=base, closed=closed, label=label)
+    data_resampler = data.resample(f"{seconds_total:.0f}s", base=base, closed=closed, label=label)
     empty_intervals = data_resampler.count() == 0

     # a great performance gain can be achieved when avoiding .apply and using pd.resampler
     # methods instead (this covers all the basic func aggregations, such as median, mean, sum, count, ...)
     try:
-        # get rid of nan_prefix attached to numpys nanfuncs ("ignore nan" is pointless down here -
-        # resample doesnt pass any nans to the func applied)
-        if agg_func.__name__[:3] == "nan":
-            check_name = agg_func.__name__[3:]
-        else:
-            check_name = agg_func.__name__
-        # another nasty special case: if function "count" was passed, we do not want empty intervals to be replaced by nan:
-        if check_name == "count":
+        check_name = re.sub("^nan", "", agg_func.__name__)
+        # a nasty special case: if function "count" was passed, we do not want empty intervals to be replaced by nan:
+        if check_name == 'count':
             empty_intervals[:] = False
         data = getattr(data_resampler, check_name)()
     except AttributeError:
         data = data_resampler.apply(agg_func)
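Two things happen here: the method string now resolves to (base, label, closed) through the lookup table introduced above, and the aggregation tries the resampler's built-in method (with any numpy "nan" prefix stripped) before falling back to the slower generic apply. A reduced sketch of that fast path on a toy series (base omitted, since it only offsets the grid):

import re
import numpy as np
import pandas as pd

s = pd.Series(np.arange(6.0), index=pd.date_range("2020-01-01", periods=6, freq="4min"))
agg_func = np.nanmean

resampler = s.resample("600s", closed="left", label="left")

# "nanmean" -> "mean": resample does not pass NaNs to the applied function anyway,
# and getattr dispatches to the fast built-in implementation when one exists
check_name = re.sub("^nan", "", agg_func.__name__)
try:
    out = getattr(resampler, check_name)()
except AttributeError:
    out = resampler.apply(agg_func)  # generic fallback for arbitrary callables
print(out)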
@@ -352,26 +333,16 @@ def shift2Freq(data, method, freq, fill_value=np.nan):
     # shift timestamps backwards/forwards in order to align them with an equidistant
     # frequency grid.

-    # Shifts
-    if method == "fshift":
-        direction = "ffill"
-        tolerance = pd.Timedelta(freq)
-    elif method == "bshift":
-        direction = "bfill"
-        tolerance = pd.Timedelta(freq)
-    elif method == "nshift":
-        direction = "nearest"
-        tolerance = pd.Timedelta(freq) / 2
-    else:
-        # method == nearest2
-        direction = "nearest"
-        tolerance = pd.Timedelta(freq)
+    methods = {
+        "fshift": lambda freq: ("ffill", pd.Timedelta(freq)),
+        "bshift": lambda freq: ("bfill", pd.Timedelta(freq)),
+        "nshift": lambda freq: ("nearest", pd.Timedelta(freq)/2),
+    }
+    direction, tolerance = methods[method](freq)

     target_ind = pd.date_range(
-        start=data.index[0].floor(freq), end=data.index[-1].ceil(freq), freq=freq, name=data.index.name
+        start=data.index[0].floor(freq),
+        end=data.index[-1].ceil(freq),
+        freq=freq,
+        name=data.index.name
     )
     return data.reindex(target_ind, method=direction, tolerance=tolerance, fill_value=fill_value)
......
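After the dispatch, the shift itself is just a reindex against a regular grid, with tolerance capping how far a timestamp may travel to reach its grid point. A toy example of the "nshift" case (standalone, not the SaQC function):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0],
              index=pd.to_datetime(["2020-01-01 00:00:40",
                                    "2020-01-01 00:09:20",
                                    "2020-01-01 00:21:00"]))
freq = "10min"

target_ind = pd.date_range(s.index[0].floor(freq), s.index[-1].ceil(freq), freq=freq)

# "nshift": snap each value to its nearest grid point, but only when it lies
# within half a grid interval; 00:21:00 reaches 00:20:00, nothing reaches 00:30:00
shifted = s.reindex(target_ind, method="nearest", tolerance=pd.Timedelta(freq) / 2,
                    fill_value=np.nan)
print(shifted)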
@@ -216,7 +216,7 @@ def test_isflagged(data, flagger):
     tests = [
         (f"isflagged({var1})", flagger.isFlagged(var1)),
-        (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD)),
+        (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")),
         (f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")),
         (f"~isflagged({var2})", ~flagger.isFlagged(var2)),
         (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))),
......