diff --git a/saqc/funcs/__init__.py b/saqc/funcs/__init__.py index b85dc8ba939ff8b07efa39a5d6f9309ce6fceb55..e5c5153cbfef7a22c55982abbf7ae8cb369ffe74 100644 --- a/saqc/funcs/__init__.py +++ b/saqc/funcs/__init__.py @@ -9,5 +9,5 @@ from saqc.funcs.constants_detection import * from saqc.funcs.soil_moisture_tests import * from saqc.funcs.spikes_detection import * from saqc.funcs.harm_functions import * -from saqc.funcs.data_modelling import * +from saqc.funcs.modelling import * from saqc.funcs.proc_functions import * diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py index db0e5f7348dce9ce0a1d97e92f9cbd6f1d2c6b83..2f870e9d14b560ff23d4530a573cc91fdf0ca72c 100644 --- a/saqc/funcs/functions.py +++ b/saqc/funcs/functions.py @@ -15,7 +15,9 @@ from mlxtend.evaluate import permutation_test from scipy.cluster.hierarchy import linkage, fcluster -from saqc.lib.tools import groupConsecutives, sesonalMask +from saqc.lib.tools import groupConsecutives, seasonalMask +from saqc.funcs.proc_functions import proc_fork, proc_drop, proc_projectFlags +from saqc.funcs.modelling import modelling_mask from saqc.core.register import register from saqc.core.visitor import ENVIRONMENT @@ -473,18 +475,15 @@ def flagSesonalRange( The flagger object, holding flags and additional Informations related to `data`. Flags values may have changed relatively to the flagger input. 
""" - smask = sesonalMask(data[field].index, startmonth, startday, endmonth, endday) - d = data.loc[smask, [field]] - if d.empty: - return data, flagger - - _, flagger_range = flagRange(d, field, flagger.slice(loc=d[field].index), min=min, max=max, **kwargs) - - if not flagger_range.isFlagged(field).any(): - return data, flagger - - flagger = flagger.merge(flagger_range) + data, flagger = proc_fork(data, field, flagger, suffix="_masked") + data, flagger = modelling_mask(data, field + "_masked", flagger, mode='seasonal', + season_start=f"{startmonth:02}-{startday:02}T00:00:00", + season_end=f"{endmonth:02}-{endday:02}T00:00:00", + include_bounds=True) + data, flagger = flagRange(data, field + "_masked", flagger, min=min, max=max, **kwargs) + data, flagger = proc_projectFlags(data, field, flagger, method='match', source=field + "_masked") + data, flagger = proc_drop(data, field + "_masked", flagger) return data, flagger diff --git a/saqc/funcs/data_modelling.py b/saqc/funcs/modelling.py similarity index 70% rename from saqc/funcs/data_modelling.py rename to saqc/funcs/modelling.py index bc239817ac2b6ec795a40a52f4233178097aa346..2b3ceee8c6aaaa2925c696bbfeb00dbc42d17649 100644 --- a/saqc/funcs/data_modelling.py +++ b/saqc/funcs/modelling.py @@ -11,6 +11,7 @@ from saqc.lib.ts_operators import ( polyRollerNoMissingNumba, polyRollerIrregular, ) +from saqc.lib.tools import seasonalMask @register(masking='field') @@ -282,3 +283,122 @@ def modelling_rollingMean(data, field, flagger, winsz, eval_flags=True, min_peri flagger = flagger.setFlags(field, to_flag.values, **kwargs) return data, flagger + + +def modelling_mask(data, field, flagger, mode, mask_var=None, season_start=None, season_end=None, + include_bounds=True): + """ + This function realizes masking within saqc. + + Due to some inner saqc mechanics, it is not straight forwardly possible to exclude + values or datachunks from flagging routines. 
This function replaces flags with np.nan + value, wherever values are to get masked. Furthermore, the masked values get replaced by + np.nan, so that they do not affect calculations. + + Here comes a recipe on how to apply a flagging function only on a masked chunk of the variable field: + + 1. duplicate "field" in the input data (proc_fork) + 2. mask the duplicated data (modelling_mask) + 3. apply the tests you only want to be applied onto the masked data chunks (saqc_tests) + 4. project the flags, calculated on the duplicated and masked data onto the original field data + (proc_projectFlags or flagGeneric) + 5. drop the duplicated data (proc_drop) + + To see an implemented example, check out flagSesonalRange in the saqc.funcs.functions module + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-masked. + flagger : saqc.flagger + A flagger object, holding flags and additional information related to `data`. + mode : {"seasonal", "mask_var"} + The masking mode. + - "seasonal": parameters "season_start", "season_end" are evaluated to generate a seasonal (periodical) mask + - "mask_var": data[mask_var] is expected to be a boolean valued timeseries and is used as mask. + mask_var : {None, str}, default None + Only effective if mode == "mask_var" + Fieldname of the column, holding the data that is to be used as mask. (must be boolean series) + Neither the series' length nor its labels have to match data[field]'s index and length. An inner join of the + indices will be calculated and values get masked where the values of the inner join are "True". + season_start : {None, str}, default None + Only effective if mode == "seasonal" + String denoting starting point of every period. Formally, it has to be a truncated instance of "mm-ddTHH:MM:SS". + Has to be of same length as `season_end` parameter. + See examples section below for some examples.
+ season_end : {None, str}, default None + Only effective if mode == "seasonal" + String denoting ending point of every period. Formally, it has to be a truncated instance of "mm-ddTHH:MM:SS". + Has to be of same length as `season_start` parameter. + See examples section below for some examples. + include_bounds : boolean + Whether or not to include the mask-defining bounds in the mask. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional information related to `data`. + Flags values may have changed relatively to the flagger input. + + + Examples + -------- + The `season_start` and `season_end` parameters provide a convenient way to generate seasonal / date-periodic masks. + They have to be strings of the forms: "mm-ddTHH:MM:SS", "ddTHH:MM:SS" , "HH:MM:SS", "MM:SS" or "SS" + (mm=month, dd=day, HH=hour, MM=minute, SS=second) + Single digit specifications have to be given with leading zeros. + `season_start` and `season_end` strings have to be of same length (refer to the same periodicity) + The highest date unit gives the period. + For example: + + >>> season_start = "01T15:00:00" + >>> season_end = "13T17:30:00" + + Will result in all values sampled between 15:00 at the first and 17:30 at the 13th of every month being masked + + >>> season_start = "01:00" + >>> season_end = "04:00" + + All the values between the first and 4th minute of every hour get masked. + + >>> season_start = "01-01T00:00:00" + >>> season_end = "03-01T00:00:00" + + Mask january and february of every year. Masking is inclusive always, so in this case the mask will + include 00:00:00 at the first of march.
To exclude this one, pass: + + >>> season_start = "01-01T00:00:00" + >>> season_end = "02-28T23:59:59" + + To mask intervals that lap over a seasons frame, like nights, or winter, exchange sequence of season start and + season end. For example, to mask night hours between 22:00:00 in the evening and 06:00:00 in the morning, pass: + + >>> season_start = "22:00:00" + >>> season_end = "06:00:00" + + When inclusive_selection="season", all above examples work the same way, only that you now + determine wich values NOT TO mask (=wich values are to constitute the "seasons"). + """ + data = data.copy() + datcol = data[field] + if mode == 'seasonal': + to_mask = seasonalMask(datcol.index, season_start, season_end, include_bounds) + + elif mode == 'mask_var': + to_mask = data[mask_var] + to_mask_i = to_mask.index.join(datcol.index, how='inner') + to_mask = to_mask[to_mask_i] + else: + raise ValueError("Keyword passed as masking mode is unknown ({})!".format(mode)) + + datcol[to_mask] = np.nan + flags_to_block = pd.Series(np.nan, index=datcol.index[to_mask]).astype(flagger.dtype) + data[field] = datcol + flagger = flagger.setFlags(field, loc=datcol.index[to_mask], flag=flags_to_block, force=True) + + return data, flagger \ No newline at end of file diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py index 65ddbd58cbebe3010af966b3a3bf9a9346bd969d..e20a4960b31e262f9c8e36157f3c4e6efb6339a6 100644 --- a/saqc/funcs/spikes_detection.py +++ b/saqc/funcs/spikes_detection.py @@ -465,17 +465,21 @@ def spikes_flagMultivarScores( val_frame = data[fields] val_frame = val_frame.loc[val_frame.index_of("shared")].to_df() val_frame.dropna(inplace=True) + val_frame = val_frame.apply(trafo) + if val_frame.empty: return data, flagger - if threshing == 'stray': - to_flag_index = _stray(val_frame, - partition_freq=stray_partition, - partition_min=stray_partition_min, - scoring_method=scoring_method, - n_neighbors=n_neighbors, - iter_start=iter_start, - trafo=trafo) + if 
threshing == "stray": + to_flag_index = _stray( + val_frame, + partition_freq=stray_partition, + partition_min=stray_partition_min, + scoring_method=scoring_method, + n_neighbors=n_neighbors, + iter_start=iter_start, + alpha=alpha + ) else: val_frame = val_frame.apply(trafo) diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index 8d21d8dcd95b2ebce770f2c329cdacb9211b5fa8..646ddec08abf2b0a899cf293b2f5628a66d4de1a 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -202,69 +202,103 @@ def flagWindow(flagger_old, flagger_new, field, direction="fw", window=0, **kwar return flagger_new.setFlags(field, fmask, **kwargs) -def sesonalMask(dtindex, month0=1, day0=1, month1=12, day1=None): +def seasonalMask(dtindex, season_start, season_end, include_bounds): """ - This function provides a mask for a sesonal time range in the given dtindex. - This means the interval is applied again on every year and even over the change of a year. - Note that both edges are inclusive. + This function generates date-periodic/seasonal masks from an index passed. - Examples: - sesonal(dtindex, 1, 1, 3, 1) -> [jan-mar] - sesonal(dtindex, 8, 1, 8, 15) -> [1.aug-15.aug] + For example you could mask all the values of an index, that are sampled in winter, or between 6 and 9 o'clock. + See the examples section for more details. + Parameters + ---------- + dtindex : pandas.DatetimeIndex + The index according to wich you want to generate a mask. + (=resulting mask will be indexed with 'dtindex') + season_start : str + String denoting starting point of every period. Formally, it has to be a truncated instance of "mm-ddTHH:MM:SS". + Has to be of same length as `season_end` parameter. + See examples section below for some examples. + season_end : str + String denoting starting point of every period. Formally, it has to be a truncated instance of "mm-ddTHH:MM:SS". + Has to be of same length as `season_end` parameter. + See examples section below for some examples. 
+ include_bounds : boolean + Whether or not to include the mask-defining bounds in the mask. + + Returns + ------- + to_mask : pandas.Series[bool] + A series, indexed with the input index and having value `True` for all the values that are to be masked. + + Examples + -------- + The `season_start` and `season_end` parameters provide a convenient way to generate seasonal / date-periodic masks. + They have to be strings of the forms: "mm-ddTHH:MM:SS", "ddTHH:MM:SS" , "HH:MM:SS", "MM:SS" or "SS" + (mm=month, dd=day, HH=hour, MM=minute, SS=second) + Single digit specifications have to be given with leading zeros. + `season_start` and `season_end` strings have to be of same length (refer to the same periodicity) + The highest date unit gives the period. + For example: + + >>> season_start = "01T15:00:00" + >>> season_end = "13T17:30:00" + + Will result in all values sampled between 15:00 at the first and 17:30 at the 13th of every month being masked + + >>> season_start = "01:00" + >>> season_end = "04:00" - This also works, if the second border is smaller then the first + All the values between the first and 4th minute of every hour get masked. - Examples: - sesonal(dtindex, 10, 1, 2, 1) -> [1.nov-1.feb (following year)] - sesonal(dtindex, 1, 10, 1, 1) -> [10.jan-1.jan(following year)] like everything except ]1.jan-10.jan[ + >>> season_start = "01-01T00:00:00" + >>> season_end = "03-01T00:00:00" + Mask january and february of every year. Masking is inclusive always, so in this case the mask will + include 00:00:00 at the first of march. To exclude this one, pass: + + >>> season_start = "01-01T00:00:00" + >>> season_end = "02-28T23:59:59" + + To mask intervals that lap over a seasons frame, like nights, or winter, exchange sequence of season start and + season end.
For example, to mask night hours between 22:00:00 in the evening and 06:00:00 in the morning, pass: + + >>> season_start = "22:00:00" + >>> season_end = "06:00:00" + + When inclusive_selection="season", all above examples work the same way, only that you now + determine wich values NOT TO mask (=wich values are to constitute the "seasons"). """ - if day1 is None: - day1 = 31 if month1 in [1, 3, 5, 7, 8, 10, 12] else 29 if month1 == 2 else 30 - - # test plausibility of date - try: - f = "%Y-%m-%d" - t0 = pd.to_datetime(f"2001-{month0}-{day0}", format=f) - t1 = pd.to_datetime(f"2001-{month1}-{day1}", format=f) - except ValueError: - raise ValueError("Given datelike parameter not logical") - - # swap - if t1 < t0: - # we create the same mask as we would do if not inverted - # but the borders need special treatment.. - # ===end]....................[start==== - # ======]end+1........start-1[========= - # ......[end+1========start-1]......... + invert - # ......[start`========= end`]......... 
+ invert - t0 -= pd.to_timedelta("1d") - t1 += pd.to_timedelta("1d") - invert = True - # only swap id condition is still true - t0, t1 = t1, t0 if t1 < t0 else (t0, t1) - - month0, day0 = t0.month, t0.day - month1, day1 = t1.month, t1.day - else: - invert = False + def _replaceBuilder(stamp): + keys = ("second", "minute", "hour", "day", "month", "year") + stamp_list = map(int, re.split(r"[-T:]", stamp)[::-1]) + stamp_kwargs = dict(zip(keys, stamp_list)) - month = [m for m in range(month0, month1 + 1)] + def _replace(index): + if "day" in stamp_kwargs: + stamp_kwargs["day"] = min(stamp_kwargs["day"], index[0].daysinmonth) - # make a mask for [start:end] - mask = dtindex.month.isin(month) - if day0 > 1: - exclude = [d for d in range(1, day0)] - mask &= ~(dtindex.month.isin([month0]) & dtindex.day.isin(exclude)) - if day1 < 31: - exclude = [d for d in range(day1 + 1, 31 + 1)] - mask &= ~(dtindex.month.isin([month1]) & dtindex.day.isin(exclude)) + out = index[0].replace(**stamp_kwargs) + return out.strftime("%Y-%m-%dT%H:%M:%S") - if invert: - return ~mask + return _replace + + mask = pd.Series(include_bounds, index=dtindex) + + start_replacer = _replaceBuilder(season_start) + end_replacer = _replaceBuilder(season_end) + + if pd.Timestamp(start_replacer(dtindex)) <= pd.Timestamp(end_replacer(dtindex)): + def _selector(x, base_bool=include_bounds): + x[start_replacer(x.index):end_replacer(x.index)] = not base_bool + return x else: - return mask + def _selector(x, base_bool=include_bounds): + x[:end_replacer(x.index)] = not base_bool + x[start_replacer(x.index):] = not base_bool + return x + + freq = '1' + 'mmmhhhdddMMMYYY'[len(season_start)] + return mask.groupby(pd.Grouper(freq=freq)).transform(_selector) def assertDictOfSeries(df: Any, argname: str = "arg") -> None: diff --git a/test/funcs/test_data_modelling.py b/test/funcs/test_modelling.py similarity index 51% rename from test/funcs/test_data_modelling.py rename to test/funcs/test_modelling.py index 
9cfe75ff74c2059ac73df43dc802c0b06e1cd3cb..f221944f1c6c2fcfd1c23acba4dd13f552b9063f 100644 --- a/test/funcs/test_data_modelling.py +++ b/test/funcs/test_modelling.py @@ -7,11 +7,12 @@ import pytest import numpy as np +import pandas as pd import dios from test.common import TESTFLAGGER -from saqc.funcs.data_modelling import modelling_polyFit, modelling_rollingMean +from saqc.funcs.modelling import modelling_polyFit, modelling_rollingMean, modelling_mask TF = TESTFLAGGER[:1] @@ -44,3 +45,30 @@ def test_modelling_rollingMean_forRegular(dat, flagger): flagger = flagger.initFlags(data) modelling_rollingMean(data, "data", flagger, 5, eval_flags=True, min_periods=0, center=True) modelling_rollingMean(data, "data", flagger, 5, eval_flags=True, min_periods=0, center=False) + +@pytest.mark.parametrize("flagger", TF) +@pytest.mark.parametrize("dat", [pytest.lazy_fixture("course_1")]) +def test_modelling_mask(dat, flagger): + data, _ = dat() + data = dios.DictOfSeries(data) + flagger = flagger.initFlags(data) + data_seasonal, flagger_seasonal = modelling_mask(data, "data", flagger, mode='seasonal', season_start="20:00", + season_end="40:00", include_bounds=False) + flaggs = flagger_seasonal._flags["data"] + assert flaggs[np.logical_and(20 <= flaggs.index.minute, 40 >= flaggs.index.minute)].isna().all() + data_seasonal, flagger_seasonal = modelling_mask(data, "data", flagger, mode='seasonal', season_start="15:00:00", + season_end="02:00:00") + flaggs = flagger_seasonal._flags["data"] + assert flaggs[np.logical_and(15 <= flaggs.index.hour, 2 >= flaggs.index.hour)].isna().all() + data_seasonal, flagger_seasonal = modelling_mask(data, "data", flagger, mode='seasonal', season_start="03T00:00:00", + season_end="10T00:00:00") + flaggs = flagger_seasonal._flags["data"] + assert flaggs[np.logical_and(3 <= flaggs.index.hour, 10 >= flaggs.index.hour)].isna().all() + + mask_ser = pd.Series(False, index=data["data"].index) + mask_ser[::5] = True + data["mask_ser"] = mask_ser + flagger = 
flagger.initFlags(data) + data_masked, flagger_masked = modelling_mask(data, "data", flagger, mode='mask_var', mask_var="mask_ser") + flaggs = flagger_masked._flags["data"] + assert flaggs[data_masked['mask_ser']].isna().all() \ No newline at end of file