diff --git a/requirements.txt b/requirements.txt
index c68f7d45f0e9971bc27530c8228c7eaee6bed4c9..5b65ca50517521fd025609c817f99f6f7adf14dd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ cycler==0.10.0
 dios==0.6.0
 dtw==1.4.0
 kiwisolver==1.2.0
-importlib-metadata==1.7.0
+importlib-metadata==2.0.0
 joblib==0.16.0
 llvmlite==0.34.0
 mlxtend==0.17.3
@@ -22,12 +22,12 @@ pyparsing==2.4.7
 py==1.9.0
 pyarrow==1.0.1
 pytest-lazy-fixture==0.6.3
-pytest==6.0.1
+pytest==6.1.0
 python-dateutil==2.8.1
 python-intervals==1.10.0.post1
 pytz==2020.1
 PyWavelets==1.1.1
-zipp==3.1.0
+zipp==3.2.0
 wcwidth==0.2.5
 scipy==1.5.2
 scikit-learn==0.23.2
diff --git a/saqc/core/core.py b/saqc/core/core.py
index 0d86af504539443cca2ef59c7b6d384d677cc9b7..356b94f4d22850bc4df28a96f05b83c9e24a22c8 100644
--- a/saqc/core/core.py
+++ b/saqc/core/core.py
@@ -15,6 +15,7 @@ import pandas as pd
 import dios
 import numpy as np
 import timeit
+import inspect
 
 from saqc.lib.plotting import plotHook, plotAllHook
 from saqc.lib.tools import isQuoted
@@ -231,7 +232,6 @@ class SaQC:
 
         kwargs.setdefault('nodata', self._nodata)
 
-        # to_mask is a control keyword
         ctrl_kws = {
             **(FUNC_MAP[func_name]["ctrl_kws"]),
             'to_mask': to_mask or self._to_mask,
@@ -314,6 +314,10 @@ def _saqcCallFunc(func_dump, data, flagger):
     data_result, flagger_result = func(data_in, field, flagger, *func_args, func_name=func_name, **func_kws)
     data_result = _unmaskData(data, mask, data_result, flagger_result, to_mask)
 
+    # we check the passed function kwargs after the actual call, because by now any "hard" errors would
+    # already have been raised (e.g. `TypeError: got multiple values for argument 'data'`, when the user passes data=...)
+    _warnForUnusedKwargs(func_dump, flagger)
+
     return data_result, flagger_result
 
 
@@ -360,3 +364,43 @@ def _unmaskData(data_old, mask_old, data_new, flagger_new, to_mask):
 
     return data_new
 
+
+def _warnForUnusedKwargs(func_dump, flagger):
+    """ Warn about unused kwargs passed to a SaQC function.
+
+    Parameters
+    ----------
+    func_dump: dict
+        Internal saqc data structure that holds all function information.
+    flagger: saqc.flagger.BaseFlagger
+        Flagger object.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    A single warning is emitted via the logging module if any unused keyword
+    arguments are detected, naming each of them.
+    """
+    passed_kws = func_dump['func_kws']
+    func = func_dump['func']
+    sig_kws = inspect.signature(func).parameters
+
+    # we need to ignore kwargs that are injected or
+    # used to control the flagger
+    ignore = flagger.signature + ('nodata',)
+
+    missing = []
+    for kw in passed_kws:
+        # there is no need to check the parameter kind
+        # (KEYWORD_ONLY, VAR_KEYWORD or POSITIONAL_OR_KEYWORD),
+        # because a mismatch would have raised an error during the call already
+        if kw not in sig_kws and kw not in ignore:
+            missing.append(kw)
+
+    if missing:
+        missing = ', '.join(missing)
+        logging.warning(f"Unused argument(s): {missing}")
+
+
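The check above reduces to comparing the passed keywords against `inspect.signature`. A minimal standalone sketch of the same idea, using plain functions instead of saqc's `func_dump` structure (all names here are illustrative, not part of the patch):

```python
import inspect
import logging

def warnForUnusedKwargs(func, passed_kws, ignore=()):
    """Emit a single warning naming every keyword that `func` does not declare."""
    sig_kws = inspect.signature(func).parameters
    unused = [kw for kw in passed_kws if kw not in sig_kws and kw not in ignore]
    if unused:
        logging.warning(f"Unused argument(s): {', '.join(unused)}")

def flagRangeDemo(data, field, flagger, min=0, max=1):
    return data, flagger

# 'threshold' is not in flagRangeDemo's signature -> one warning, naming it
warnForUnusedKwargs(flagRangeDemo, {"min": 0, "max": 10, "threshold": 3}, ignore=("nodata",))
```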
diff --git a/saqc/flagger/baseflagger.py b/saqc/flagger/baseflagger.py
index 2d86d9719096c9b1a9c3ceca756af0cf75003325..b3c576e2458cfbe7e94c93991ab1371a31245b86 100644
--- a/saqc/flagger/baseflagger.py
+++ b/saqc/flagger/baseflagger.py
@@ -153,7 +153,7 @@ class BaseFlagger(ABC):
         flag = self.BAD if flag is None else flag
 
         if force:
-            row_indexer = loc
+            row_indexer = slice(None) if loc is None else loc
         else:
             # trim flags to loc, we always get a pd.Series returned
             this = self.getFlags(field=field, loc=loc)
diff --git a/saqc/flagger/dmpflagger.py b/saqc/flagger/dmpflagger.py
index 5df4d9de146bd2f86fd301d3d5c0df0befa9f3c6..9e5442577b289e45f47c56910584eeceaa376c5c 100644
--- a/saqc/flagger/dmpflagger.py
+++ b/saqc/flagger/dmpflagger.py
@@ -117,7 +117,7 @@ class DmpFlagger(CategoricalFlagger):
         )
 
         if force:
-            row_indexer = loc
+            row_indexer = slice(None) if loc is None else loc
         else:
             # trim flags to loc, we always get a pd.Series returned
             this = self.getFlags(field=field, loc=loc)
diff --git a/saqc/funcs/__init__.py b/saqc/funcs/__init__.py
index b85dc8ba939ff8b07efa39a5d6f9309ce6fceb55..e5c5153cbfef7a22c55982abbf7ae8cb369ffe74 100644
--- a/saqc/funcs/__init__.py
+++ b/saqc/funcs/__init__.py
@@ -9,5 +9,5 @@ from saqc.funcs.constants_detection import *
 from saqc.funcs.soil_moisture_tests import *
 from saqc.funcs.spikes_detection import *
 from saqc.funcs.harm_functions import *
-from saqc.funcs.data_modelling import *
+from saqc.funcs.modelling import *
 from saqc.funcs.proc_functions import *
diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py
index 2b2778ba05d3684871d335638eed4ab0fc244b8b..ad0652d5d162f5d627cb9f104fc0ffad821235de 100644
--- a/saqc/funcs/functions.py
+++ b/saqc/funcs/functions.py
@@ -16,7 +16,9 @@ from mlxtend.evaluate import permutation_test
 from scipy.cluster.hierarchy import linkage, fcluster
 
 
-from saqc.lib.tools import groupConsecutives, sesonalMask, FreqIndexer, customRolling
+from saqc.lib.tools import groupConsecutives, seasonalMask, FreqIndexer, customRolling
+from saqc.funcs.proc_functions import proc_fork, proc_drop, proc_projectFlags
+from saqc.funcs.modelling import modelling_mask
 from saqc.core.register import register
 from saqc.core.visitor import ENVIRONMENT
 
@@ -474,18 +476,15 @@ def flagSesonalRange(
         The flagger object, holding flags and additional information related to `data`.
         Flags values may have changed relative to the flagger input.
     """
-    smask = sesonalMask(data[field].index, startmonth, startday, endmonth, endday)
-    d = data.loc[smask, [field]]
-    if d.empty:
-        return data, flagger
-
-    _, flagger_range = flagRange(d, field, flagger.slice(loc=d[field].index), min=min, max=max, **kwargs)
-
-    if not flagger_range.isFlagged(field).any():
-        return data, flagger
-
-    flagger = flagger.merge(flagger_range)
+    data, flagger = proc_fork(data, field, flagger, suffix="_masked")
+    data, flagger = modelling_mask(data, field + "_masked", flagger, mode='seasonal',
+                                   season_start=f"{startmonth:02}-{startday:02}T00:00:00",
+                                   season_end=f"{endmonth:02}-{endday:02}T00:00:00",
+                                   include_bounds=True)
+    data, flagger = flagRange(data, field + "_masked", flagger, min=min, max=max, **kwargs)
+    data, flagger = proc_projectFlags(data, field, flagger, method='match', source=field + "_masked")
+    data, flagger = proc_drop(data, field + "_masked", flagger)
     return data, flagger
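Spelled out once more outside the diff context: the mask-then-flag recipe this rewrite follows, as a sketch built from the functions the patch imports (the field name "temp" and the bounds are made up):

```python
from saqc.funcs.functions import flagRange
from saqc.funcs.modelling import modelling_mask
from saqc.funcs.proc_functions import proc_fork, proc_drop, proc_projectFlags

# assuming `data` is a dios.DictOfSeries holding a "temp" column and
# `flagger` is an initialized saqc flagger
data, flagger = proc_fork(data, "temp", flagger, suffix="_masked")        # 1. duplicate
data, flagger = modelling_mask(data, "temp_masked", flagger,              # 2. mask
                               mode='seasonal',
                               season_start="06-01T00:00:00",
                               season_end="08-31T23:59:59",
                               include_bounds=True)
data, flagger = flagRange(data, "temp_masked", flagger, min=-10, max=60)  # 3. test
data, flagger = proc_projectFlags(data, "temp", flagger,                  # 4. project
                                  method='match', source="temp_masked")
data, flagger = proc_drop(data, "temp_masked", flagger)                   # 5. drop
```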
""" - smask = sesonalMask(data[field].index, startmonth, startday, endmonth, endday) - d = data.loc[smask, [field]] - if d.empty: - return data, flagger - - _, flagger_range = flagRange(d, field, flagger.slice(loc=d[field].index), min=min, max=max, **kwargs) - - if not flagger_range.isFlagged(field).any(): - return data, flagger - - flagger = flagger.merge(flagger_range) + data, flagger = proc_fork(data, field, flagger, suffix="_masked") + data, flagger = modelling_mask(data, field + "_masked", flagger, mode='seasonal', + season_start=f"{startmonth:02}-{startday:02}T00:00:00", + season_end=f"{endmonth:02}-{endday:02}T00:00:00", + include_bounds=True) + data, flagger = flagRange(data, field + "_masked", flagger, min=min, max=max, **kwargs) + data, flagger = proc_projectFlags(data, field, flagger, method='match', source=field + "_masked") + data, flagger = proc_drop(data, field + "_masked", flagger) return data, flagger diff --git a/saqc/funcs/data_modelling.py b/saqc/funcs/modelling.py similarity index 70% rename from saqc/funcs/data_modelling.py rename to saqc/funcs/modelling.py index bc239817ac2b6ec795a40a52f4233178097aa346..2b3ceee8c6aaaa2925c696bbfeb00dbc42d17649 100644 --- a/saqc/funcs/data_modelling.py +++ b/saqc/funcs/modelling.py @@ -11,6 +11,7 @@ from saqc.lib.ts_operators import ( polyRollerNoMissingNumba, polyRollerIrregular, ) +from saqc.lib.tools import seasonalMask @register(masking='field') @@ -282,3 +283,122 @@ def modelling_rollingMean(data, field, flagger, winsz, eval_flags=True, min_peri flagger = flagger.setFlags(field, to_flag.values, **kwargs) return data, flagger + + +def modelling_mask(data, field, flagger, mode, mask_var=None, season_start=None, season_end=None, + include_bounds=True): + """ + This function realizes masking within saqc. + + Due to some inner saqc mechanics, it is not straight forwardly possible to exclude + values or datachunks from flagging routines. This function replaces flags with np.nan + value, wherever values are to get masked. Furthermore, the masked values get replaced by + np.nan, so that they dont effect calculations. + + Here comes a recipe on how to apply a flagging function only on a masked chunk of the variable field: + + 1. dublicate "field" in the input data (proc_fork) + 2. mask the dublicated data (modelling_mask) + 3. apply the tests you only want to be applied onto the masked data chunks (saqc_tests) + 4. project the flags, calculated on the dublicated and masked data onto the original field data + (proc_projectFlags or flagGeneric) + 5. drop the dublicated data (proc_drop) + + To see an implemented example, checkout flagSeasonalRange in the saqc.functions module + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-masked. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + mode : {"seasonal", "mask_var"} + The masking mode. + - "seasonal": parameters "season_start", "season_end" are evaluated to generate a seasonal (periodical) mask + - "mask_var": data[mask_var] is expected to be a boolean valued timeseries and is used as mask. + mask_var : {None, str}, default None + Only effective if mode == "mask_var" + Fieldname of the column, holding the data that is to be used as mask. (must be moolean series) + Neither the series` length nor its labels have to match data[field]`s index and length. 
+        An inner join of the indices is calculated, and values get masked wherever the joined mask is True.
+    season_start : {None, str}, default None
+        Only effective if mode == "seasonal".
+        String denoting the starting point of every period. Formally, it has to be a truncated instance of
+        "mm-ddTHH:MM:SS". Has to be of the same length as the `season_end` parameter.
+        See the examples section below for some examples.
+    season_end : {None, str}, default None
+        Only effective if mode == "seasonal".
+        String denoting the end point of every period. Formally, it has to be a truncated instance of
+        "mm-ddTHH:MM:SS". Has to be of the same length as the `season_start` parameter.
+        See the examples section below for some examples.
+    include_bounds : boolean
+        Whether or not to include the mask-defining bounds in the mask.
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+        Data values may have changed relative to the data input.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional information related to `data`.
+        Flags values may have changed relative to the flagger input.
+
+
+    Examples
+    --------
+    The `season_start` and `season_end` parameters provide a convenient way to generate seasonal / date-periodic
+    masks. They have to be strings of the forms "mm-ddTHH:MM:SS", "ddTHH:MM:SS", "HH:MM:SS", "MM:SS" or "SS"
+    (mm=month, dd=day, HH=hour, MM=minute, SS=second).
+    Single digit specifications have to be given with leading zeros.
+    `season_start` and `season_end` strings have to be of the same length (refer to the same periodicity).
+    The highest date unit gives the period.
+    For example:
+
+    >>> season_start = "01T15:00:00"
+    >>> season_end = "13T17:30:00"
+
+    will result in all values sampled between 15:00 on the 1st and 17:30 on the 13th of every month getting masked.
+
+    >>> season_start = "01:00"
+    >>> season_end = "04:00"
+
+    All the values between the first and fourth minute of every hour get masked.
+
+    >>> season_start = "01-01T00:00:00"
+    >>> season_end = "01-03T00:00:00"
+
+    Mask January and February of every year. Masking is always inclusive, so in this case the mask will
+    include 00:00:00 on the first of March. To exclude this value, pass:
+
+    >>> season_start = "01-01T00:00:00"
+    >>> season_end = "02-28T23:59:59"
+
+    To mask intervals that reach over a period's frame, like nights or winter, exchange the sequence of season
+    start and season end. For example, to mask the night hours between 22:00:00 in the evening and 06:00:00 in the
+    morning, pass:
+
+    >>> season_start = "22:00:00"
+    >>> season_end = "06:00:00"
+
+    When inclusive_selection="season", all the above examples work the same way, only that you now
+    determine which values NOT to mask (i.e. which values are to constitute the "seasons").
+    """
+    data = data.copy()
+    datcol = data[field]
+    if mode == 'seasonal':
+        to_mask = seasonalMask(datcol.index, season_start, season_end, include_bounds)
+
+    elif mode == 'mask_var':
+        to_mask = data[mask_var]
+        to_mask_i = to_mask.index.join(datcol.index, how='inner')
+        to_mask = to_mask[to_mask_i]
+    else:
+        raise ValueError("Keyword passed as masking mode is unknown ({})!".format(mode))
+
+    datcol[to_mask] = np.nan
+    flags_to_block = pd.Series(np.nan, index=datcol.index[to_mask]).astype(flagger.dtype)
+    data[field] = datcol
+    flagger = flagger.setFlags(field, loc=datcol.index[to_mask], flag=flags_to_block, force=True)
+
+    return data, flagger
\ No newline at end of file
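A hypothetical direct call of `modelling_mask` in "seasonal" mode — a sketch only, with the data setup mirroring the test file at the end of this patch (the `flagger` instance is assumed to come from one of saqc's flagger classes):

```python
import numpy as np
import pandas as pd
import dios

from saqc.funcs.modelling import modelling_mask

idx = pd.date_range("2020-01-01", periods=288, freq="10min")  # two days of data
data = dios.DictOfSeries({"data": pd.Series(np.random.rand(len(idx)), index=idx)})
flagger = flagger.initFlags(data)  # `flagger` assumed to be at hand, cf. the tests below

# with include_bounds=False the night hours themselves end up masked
# (cf. the first assertion in test_modelling_mask below)
data, flagger = modelling_mask(data, "data", flagger, mode='seasonal',
                               season_start="22:00:00", season_end="06:00:00",
                               include_bounds=False)
```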
+ """ + data = data.copy() + datcol = data[field] + if mode == 'seasonal': + to_mask = seasonalMask(datcol.index, season_start, season_end, include_bounds) + + elif mode == 'mask_var': + to_mask = data[mask_var] + to_mask_i = to_mask.index.join(datcol.index, how='inner') + to_mask = to_mask[to_mask_i] + else: + raise ValueError("Keyword passed as masking mode is unknown ({})!".format(mode)) + + datcol[to_mask] = np.nan + flags_to_block = pd.Series(np.nan, index=datcol.index[to_mask]).astype(flagger.dtype) + data[field] = datcol + flagger = flagger.setFlags(field, loc=datcol.index[to_mask], flag=flags_to_block, force=True) + + return data, flagger \ No newline at end of file diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py index 65ddbd58cbebe3010af966b3a3bf9a9346bd969d..93a23c6d0d55da53fe48221ec2057c408c8512a6 100644 --- a/saqc/funcs/spikes_detection.py +++ b/saqc/funcs/spikes_detection.py @@ -465,17 +465,21 @@ def spikes_flagMultivarScores( val_frame = data[fields] val_frame = val_frame.loc[val_frame.index_of("shared")].to_df() val_frame.dropna(inplace=True) + val_frame = val_frame.apply(trafo) + if val_frame.empty: return data, flagger - if threshing == 'stray': - to_flag_index = _stray(val_frame, - partition_freq=stray_partition, - partition_min=stray_partition_min, - scoring_method=scoring_method, - n_neighbors=n_neighbors, - iter_start=iter_start, - trafo=trafo) + if threshing == "stray": + to_flag_index = _stray( + val_frame, + partition_freq=stray_partition, + partition_min=stray_partition_min, + scoring_method=scoring_method, + n_neighbors=n_neighbors, + iter_start=iter_start, + alpha=alpha + ) else: val_frame = val_frame.apply(trafo) @@ -936,7 +940,7 @@ def spikes_flagBasic(data, field, flagger, thresh, tolerance, window, numba_kick to_roll = post_jumps.reindex(dataseries.index, method="bfill", tolerance=window, fill_value=False).dropna() # define spike testing function to roll with: - def spike_tester(chunk, thresh=thresh, tol=tolerance): + def spikeTester(chunk, thresh=thresh, tol=tolerance): # signum change!!! 
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 11c1adb45ad1fa6aac72824bc924d39909ebb475..79f956c59511d6072526509ce52dd9aba69e7908 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -202,69 +202,103 @@ def flagWindow(flagger_old, flagger_new, field, direction="fw", window=0, **kwar
     return flagger_new.setFlags(field, fmask, **kwargs)
 
 
-def sesonalMask(dtindex, month0=1, day0=1, month1=12, day1=None):
+def seasonalMask(dtindex, season_start, season_end, include_bounds):
     """
-    This function provides a mask for a sesonal time range in the given dtindex.
-    This means the interval is applied again on every year and even over the change of a year.
-    Note that both edges are inclusive.
+    This function generates date-periodic/seasonal masks from a passed index.
 
-    Examples:
-        sesonal(dtindex, 1, 1, 3, 1) -> [jan-mar]
-        sesonal(dtindex, 8, 1, 8, 15) -> [1.aug-15.aug]
+    For example, you could mask all the values of an index that are sampled in winter, or between 6 and 9 o'clock.
+    See the examples section for more details.
 
+    Parameters
+    ----------
+    dtindex : pandas.DatetimeIndex
+        The index according to which you want to generate a mask
+        (i.e. the resulting mask will be indexed with 'dtindex').
+    season_start : str
+        String denoting the starting point of every period. Formally, it has to be a truncated instance of
+        "mm-ddTHH:MM:SS". Has to be of the same length as the `season_end` parameter.
+        See the examples section below for some examples.
+    season_end : str
+        String denoting the end point of every period. Formally, it has to be a truncated instance of
+        "mm-ddTHH:MM:SS". Has to be of the same length as the `season_start` parameter.
+        See the examples section below for some examples.
+    include_bounds : boolean
+        Whether or not to include the mask-defining bounds in the mask.
+
+    Returns
+    -------
+    to_mask : pandas.Series[bool]
+        A series, indexed with the input index and holding the value `True` for all the values that are to be masked.
+
+    Examples
+    --------
+    The `season_start` and `season_end` parameters provide a convenient way to generate seasonal / date-periodic
+    masks. They have to be strings of the forms "mm-ddTHH:MM:SS", "ddTHH:MM:SS", "HH:MM:SS", "MM:SS" or "SS"
+    (mm=month, dd=day, HH=hour, MM=minute, SS=second).
+    Single digit specifications have to be given with leading zeros.
+    `season_start` and `season_end` strings have to be of the same length (refer to the same periodicity).
+    The highest date unit gives the period.
+    For example:
+
+    >>> season_start = "01T15:00:00"
+    >>> season_end = "13T17:30:00"
+
+    will result in all values sampled between 15:00 on the 1st and 17:30 on the 13th of every month getting masked.
+
+    >>> season_start = "01:00"
+    >>> season_end = "04:00"
+
+    All the values between the first and fourth minute of every hour get masked.
+
+    >>> season_start = "01-01T00:00:00"
+    >>> season_end = "01-03T00:00:00"
 
-    This also works, if the second border is smaller then the first
+    Mask January and February of every year. Masking is always inclusive, so in this case the mask will
+    include 00:00:00 on the first of March. To exclude this value, pass:
 
-    Examples:
-        sesonal(dtindex, 10, 1, 2, 1) -> [1.nov-1.feb (following year)]
-        sesonal(dtindex, 1, 10, 1, 1) -> [10.jan-1.jan(following year)] like everything except ]1.jan-10.jan[
+    >>> season_start = "01-01T00:00:00"
+    >>> season_end = "02-28T23:59:59"
 
+    To mask intervals that reach over a period's frame, like nights or winter, exchange the sequence of season
+    start and season end. For example, to mask the night hours between 22:00:00 in the evening and 06:00:00 in the
+    morning, pass:
+
+    >>> season_start = "22:00:00"
+    >>> season_end = "06:00:00"
+
+    When inclusive_selection="season", all the above examples work the same way, only that you now
+    determine which values NOT to mask (i.e. which values are to constitute the "seasons").
     """
-    if day1 is None:
-        day1 = 31 if month1 in [1, 3, 5, 7, 8, 10, 12] else 29 if month1 == 2 else 30
-
-    # test plausibility of date
-    try:
-        f = "%Y-%m-%d"
-        t0 = pd.to_datetime(f"2001-{month0}-{day0}", format=f)
-        t1 = pd.to_datetime(f"2001-{month1}-{day1}", format=f)
-    except ValueError:
-        raise ValueError("Given datelike parameter not logical")
-
-    # swap
-    if t1 < t0:
-        # we create the same mask as we would do if not inverted
-        # but the borders need special treatment..
-        # ===end]....................[start====
-        # ======]end+1........start-1[=========
-        # ......[end+1========start-1]......... + invert
-        # ......[start`========= end`]......... + invert
-        t0 -= pd.to_timedelta("1d")
-        t1 += pd.to_timedelta("1d")
-        invert = True
-        # only swap id condition is still true
-        t0, t1 = t1, t0 if t1 < t0 else (t0, t1)
-
-        month0, day0 = t0.month, t0.day
-        month1, day1 = t1.month, t1.day
-    else:
-        invert = False
+    def _replaceBuilder(stamp):
+        keys = ("second", "minute", "hour", "day", "month", "year")
+        stamp_list = map(int, re.split(r"[-T:]", stamp)[::-1])
+        stamp_kwargs = dict(zip(keys, stamp_list))
+
+        def _replace(index):
+            if "day" in stamp_kwargs:
+                stamp_kwargs["day"] = min(stamp_kwargs["day"], index[0].daysinmonth)
+
+            out = index[0].replace(**stamp_kwargs)
+            return out.strftime("%Y-%m-%dT%H:%M:%S")
 
-    month = [m for m in range(month0, month1 + 1)]
+        return _replace
 
-    # make a mask for [start:end]
-    mask = dtindex.month.isin(month)
-    if day0 > 1:
-        exclude = [d for d in range(1, day0)]
-        mask &= ~(dtindex.month.isin([month0]) & dtindex.day.isin(exclude))
-    if day1 < 31:
-        exclude = [d for d in range(day1 + 1, 31 + 1)]
-        mask &= ~(dtindex.month.isin([month1]) & dtindex.day.isin(exclude))
+    mask = pd.Series(include_bounds, index=dtindex)
 
-    if invert:
-        return ~mask
+    start_replacer = _replaceBuilder(season_start)
+    end_replacer = _replaceBuilder(season_end)
+
+    if pd.Timestamp(start_replacer(dtindex)) <= pd.Timestamp(end_replacer(dtindex)):
+        def _selector(x, base_bool=include_bounds):
+            x[start_replacer(x.index):end_replacer(x.index)] = not base_bool
+            return x
     else:
-        return mask
+        def _selector(x, base_bool=include_bounds):
+            x[:end_replacer(x.index)] = not base_bool
+            x[start_replacer(x.index):] = not base_bool
+            return x
+
+    freq = '1' + 'mmmhhhdddMMMYYY'[len(season_start)]
+    return mask.groupby(pd.Grouper(freq=freq)).transform(_selector)
 
 
 def assertDictOfSeries(df: Any, argname: str = "arg") -> None:
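A hypothetical stand-alone call of the new helper (a sketch; whether the period itself or its complement comes out `True` is governed by the `include_bounds` mechanics above):

```python
import pandas as pd
from saqc.lib.tools import seasonalMask

idx = pd.date_range("2020-01-01", periods=6 * 24 * 7, freq="10min")  # one week
# daily period from 06:00:00 to 09:00:00
mask = seasonalMask(idx, season_start="06:00:00", season_end="09:00:00",
                    include_bounds=True)
# mask is a boolean pd.Series aligned with idx; len("06:00:00") == 8 selects
# the daily ('1d') grouping frequency via 'mmmhhhdddMMMYYY'
```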
@@ -441,26 +475,27 @@ def customRolling(to_roll, winsz, func, roll_mask, min_periods=1, center=False,
     """
 
     i_roll = to_roll.copy()
-    i_roll.index = np.arange(to_roll.shape[0])
+    i_roll.index = np.arange(to_roll.shape[0], dtype=np.int64)
     if isinstance(winsz, str):
-        winsz = int(pd.Timedelta(winsz).total_seconds()*10**9)
+        winsz = np.int64(pd.Timedelta(winsz).total_seconds()*10**9)
         indexer = FreqIndexer(window_size=winsz,
                               win_points=roll_mask,
-                              index_array=to_roll.index.to_numpy(int),
+                              index_array=to_roll.index.to_numpy(np.int64),
                               center=center,
                               closed=closed,
                               forward=forward)
 
     elif isinstance(winsz, int):
         indexer = PeriodsIndexer(window_size=winsz,
-                                win_points=roll_mask,
-                                center=center,
-                                closed=closed)
+                                 win_points=roll_mask,
+                                 center=center,
+                                 closed=closed)
 
-    i_roller = i_roll.rolling(indexer,
+    i_roll = i_roll.rolling(indexer,
                             min_periods=min_periods,
                             center=center,
-                            closed=closed)
+                            closed=closed).apply(func, raw=raw, engine=engine)
+    return pd.Series(i_roll.values, index=to_roll.index)
 
     if hasattr(i_roller, func.__name__):
         i_roll = getattr(i_roller, func.__name__)
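`customRolling` now funnels everything through `rolling(indexer).apply(...)`; the machinery underneath is pandas' custom-window API. A minimal self-contained analogue with the public `pandas.api.indexers.BaseIndexer` (not saqc's `FreqIndexer`/`PeriodsIndexer`, which this patch keeps):

```python
import numpy as np
import pandas as pd
from pandas.api.indexers import BaseIndexer

class MaskedWindowIndexer(BaseIndexer):
    """Fixed-size trailing windows, but an empty window wherever win_points is False."""
    def get_window_bounds(self, num_values=0, min_periods=None, center=None,
                          closed=None, step=None):
        end = np.arange(1, num_values + 1, dtype=np.int64)
        start = np.clip(end - self.window_size, 0, None)
        start[~self.win_points] = end[~self.win_points]  # empty window -> NaN result
        return start, end

s = pd.Series(np.arange(10, dtype=float))
win_points = np.ones(len(s), dtype=bool)
win_points[::2] = False  # only evaluate every other position
indexer = MaskedWindowIndexer(window_size=3, win_points=win_points)
res = s.rolling(indexer, min_periods=1).apply(np.nanmax, raw=True)
```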
diff --git a/sphinx-doc/requirements_sphinx.txt b/sphinx-doc/requirements_sphinx.txt
index 2480e078d388b2b7c069e83be7490652d71707ad..995b80b46404c04f3e35969b44f7d2175842a079 100644
--- a/sphinx-doc/requirements_sphinx.txt
+++ b/sphinx-doc/requirements_sphinx.txt
@@ -15,7 +15,7 @@ docutils==0.16
 dtw==1.4.0
 idna==2.10
 imagesize==1.2.0
-importlib-metadata==1.5.0
+importlib-metadata==2.0.0
 Jinja2==2.11.2
 joblib==0.14.1
 kiwisolver==1.1.0
@@ -35,23 +35,23 @@ pathspec==0.8.0
 pluggy==0.13.1
 py==1.8.1
 pyarrow==1.0.1
-Pygments==2.6.1
+Pygments==2.7.1
 pyparsing==2.4.6
-pytest==5.3.5
+pytest==6.1.0
 pytest-lazy-fixture==0.6.3
 python-dateutil==2.8.1
 python-intervals==1.10.0
 pytz==2019.3
 PyWavelets==1.1.1
 recommonmark==0.6.0
-regex==2020.7.14
+regex==2020.9.27
 requests==2.24.0
 scikit-learn==0.22.1
 scipy==1.4.1
 six==1.14.0
 snowballstemmer==2.0.0
 Sphinx==3.2.1
-sphinx-automodapi==0.12
+sphinx-automodapi==0.13
 sphinx-markdown-tables==0.0.15
 sphinxcontrib-applehelp==1.0.2
 sphinxcontrib-devhelp==1.0.2
@@ -65,4 +65,4 @@ typed-ast==1.4.1
 urllib3==1.25.10
 utils==1.0.1
 wcwidth==0.1.8
-zipp==2.2.0
+zipp==3.2.0
diff --git a/test/funcs/test_data_modelling.py b/test/funcs/test_modelling.py
similarity index 51%
rename from test/funcs/test_data_modelling.py
rename to test/funcs/test_modelling.py
index 9cfe75ff74c2059ac73df43dc802c0b06e1cd3cb..f221944f1c6c2fcfd1c23acba4dd13f552b9063f 100644
--- a/test/funcs/test_data_modelling.py
+++ b/test/funcs/test_modelling.py
@@ -7,11 +7,12 @@
 import pytest
 import numpy as np
+import pandas as pd
 
 import dios
 
 from test.common import TESTFLAGGER
-from saqc.funcs.data_modelling import modelling_polyFit, modelling_rollingMean
+from saqc.funcs.modelling import modelling_polyFit, modelling_rollingMean, modelling_mask
 
 TF = TESTFLAGGER[:1]
 
@@ -44,3 +45,30 @@ def test_modelling_rollingMean_forRegular(dat, flagger):
     flagger = flagger.initFlags(data)
     modelling_rollingMean(data, "data", flagger, 5, eval_flags=True, min_periods=0, center=True)
     modelling_rollingMean(data, "data", flagger, 5, eval_flags=True, min_periods=0, center=False)
+
+@pytest.mark.parametrize("flagger", TF)
+@pytest.mark.parametrize("dat", [pytest.lazy_fixture("course_1")])
+def test_modelling_mask(dat, flagger):
+    data, _ = dat()
+    data = dios.DictOfSeries(data)
+    flagger = flagger.initFlags(data)
+    data_seasonal, flagger_seasonal = modelling_mask(data, "data", flagger, mode='seasonal', season_start="20:00",
+                                                     season_end="40:00", include_bounds=False)
+    flaggs = flagger_seasonal._flags["data"]
+    assert flaggs[np.logical_and(20 <= flaggs.index.minute, 40 >= flaggs.index.minute)].isna().all()
+    data_seasonal, flagger_seasonal = modelling_mask(data, "data", flagger, mode='seasonal', season_start="15:00:00",
+                                                     season_end="02:00:00")
+    flaggs = flagger_seasonal._flags["data"]
+    assert flaggs[np.logical_and(15 <= flaggs.index.hour, 2 >= flaggs.index.hour)].isna().all()
+    data_seasonal, flagger_seasonal = modelling_mask(data, "data", flagger, mode='seasonal', season_start="03T00:00:00",
+                                                     season_end="10T00:00:00")
+    flaggs = flagger_seasonal._flags["data"]
+    assert flaggs[np.logical_and(3 <= flaggs.index.day, 10 >= flaggs.index.day)].isna().all()
+
+    mask_ser = pd.Series(False, index=data["data"].index)
+    mask_ser[::5] = True
+    data["mask_ser"] = mask_ser
+    flagger = flagger.initFlags(data)
+    data_masked, flagger_masked = modelling_mask(data, "data", flagger, mode='mask_var', mask_var="mask_ser")
+    flaggs = flagger_masked._flags["data"]
+    assert flaggs[data_masked['mask_ser']].isna().all()
\ No newline at end of file
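And a hypothetical complement to the "mask_var" test above: deriving the mask from a second variable instead of constructing it by hand (the field names "temperature" and "soil_moisture" are made up; `data` and `flagger` as in the test):

```python
# mask the soil moisture series wherever the frost indicator is True;
# the indicator's index need not match, an inner join is performed internally
data["frost"] = data["temperature"] < 0.0
data, flagger = modelling_mask(data, "soil_moisture", flagger,
                               mode='mask_var', mask_var="frost")
```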