Skip to content
Snippets Groups Projects
Commit af1247e4 authored by Peter Lünenschloß's avatar Peter Lünenschloß
Browse files

modelling_mask documented and tested

parent 58b9e904
No related branches found
No related tags found
3 merge requests!193Release 1.4,!188Release 1.4,!85Seasonal and custom data masking
Pipeline #6628 failed with stage
in 5 minutes and 40 seconds
......@@ -251,22 +251,109 @@ def modelling_rollingMean(data, field, flagger, winsz, eval_flags=True, min_peri
def modelling_mask(data, field, flagger, mode, mask_var=None, season_start=None, season_end=None):
"""
Parameters
----------
data : dios.DictOfSeries
A dictionary of pandas.Series, holding all the data.
field : str
The fieldname of the column, holding the data-to-be-masked.
flagger : saqc.flagger
A flagger object, holding flags and additional information related to `data`.
mode : {"seasonal", "mask_var"}
The masking mode.
- "seasonal": parameters "season_start", "season_end" are evaluated to generate a seasonal (periodical) mask
- "mask_var": data[mask_var] is expected to be a boolean valued timeseries and is used as mask.
mask_var : {None, str}, default None
Only effective if mode == "mask_var"
Fieldname of the column, holding the data that is to be used as mask. (must be a boolean series)
Neither the series' length nor its labels have to match data[field]'s index and length. An inner join of the
indices will be calculated and values get masked where the values of the inner join are "True".
season_start : {None, str}, default None
Only effective if mode == "seasonal"
String denoting starting point of every period. Formally, it has to be a truncated instance of "mm-ddTHH:MM:SS".
Has to be of same length as `season_end` parameter.
See examples section below for some examples.
season_end : {None, str}, default None
Only effective if mode == "seasonal"
String denoting ending point of every period. Formally, it has to be a truncated instance of "mm-ddTHH:MM:SS".
Has to be of same length as `season_start` parameter.
See examples section below for some examples.
Returns
-------
data : dios.DictOfSeries
A dictionary of pandas.Series, holding all the data.
Data values may have changed relatively to the data input.
flagger : saqc.flagger
The flagger object, holding flags and additional information related to `data`.
Flags values may have changed relatively to the flagger input.
Examples
--------
The `season_start` and `season_end` parameters provide a convenient way to generate seasonal / date-periodic masks.
They have to be strings of the forms: "mm-ddTHH:MM:SS", "ddTHH:MM:SS" , "HH:MM:SS", "MM:SS" or "SS"
(mm=month, dd=day, HH=hour, MM=minute, SS=second)
Single digit specifications have to be given with leading zeros.
`season_start` and `season_end` strings have to be of same length (refer to the same periodicity)
The highest date unit gives the period.
For example:
>>> season_start = "01T15:00:00"
>>> season_end = "13T17:30:00"
Will result in all values sampled between 15:00 on the 1st and 17:30 on the 13th of every month getting masked.
>>> season_start = "01:00"
>>> season_end = "04:00"
All the values between the first and 4th minute of every hour get masked.
>>> season_start = "01-01T00:00:00"
>>> season_end = "03-01T00:00:00"
Mask January and February of every year. Masking is always inclusive, so in this case the mask will
include 00:00:00 at the first of March. To exclude this one, pass:
>>> season_start = "01-01T00:00:00"
>>> season_end = "02-28T23:59:59"
To mask intervals that wrap around a season's frame, like nights or winter, swap the order of season start and
season end. For example, to mask night hours between 22:00:00 in the evening and 06:00:00 in the morning, pass:
>>> season_start = "22:00:00"
>>> season_end = "06:00:00"
"""
data = data.copy()
datcol = data[field]
mask = pd.Series(False, index=datcol.index)
if mode == 'seasonal':
def _composeStamp(index, stamp):
if len(stamp) == 2:
if len(season_start) == 2:
def _composeStamp(index, stamp):
return '{}-{}-{} {}:{}:'.format(index.year[0], index.month[0], index.day[0], index.hour[0],
index.minute[0]) + stamp
if len(stamp) == 5:
elif len(season_start) == 5:
def _composeStamp(index, stamp):
return '{}-{}-{} {}:'.format(index.year[0], index.month[0], index.day[0], index.hour[0]) + stamp
if len(stamp) == 8:
elif len(season_start) == 8:
def _composeStamp(index, stamp):
return '{}-{}-{} '.format(index.year[0], index.month[0], index.day[0]) + stamp
if len(stamp) == 11:
elif len(season_start) == 11:
def _composeStamp(index, stamp):
# some hick-hack ahead, to account for the strange fact that not all the month are of same length in
# this world.
max_days = [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
max_day_count = min(int(stamp[:2]), max_days[int(index.month[0] - 1)])
stamp = str(max_day_count) + stamp[2:]
return '{}-{}-'.format(index.year[0], index.month[0]) + stamp
if len(stamp) == 14:
elif len(season_start) == 14:
def _composeStamp(index, stamp):
return '{}-'.format(index.year[0]) + stamp
else:
raise ValueError("Whats this?: {}".format(season_start))
if pd.Timestamp(_composeStamp(datcol.index, season_start)) <= pd.Timestamp(_composeStamp(datcol.index,
season_end)):
......@@ -283,10 +370,13 @@ def modelling_mask(data, field, flagger, mode, mask_var=None, season_start=None,
to_mask = mask.groupby(pd.Grouper(freq=freq)).transform(_selector)
elif mode == 'mask_var':
to_mask = data[mask_var]
to_mask = to_mask.index.join(datcol.index, how='inner')
to_mask_i = to_mask.index.join(datcol.index, how='inner')
to_mask = to_mask[to_mask_i]
else:
raise ValueError("Keyword passed as masking mode is unknown ({})!".format(mode))
datcol[~to_mask] = np.nan
flags_to_block = pd.Series(np.nan, index=datcol.index[~to_mask]).astype(flagger.dtype)
flagger = flagger.setFlags(field, loc=datcol.index[~to_mask], flag=flags_to_block, force=True)
datcol[to_mask] = np.nan
flags_to_block = pd.Series(np.nan, index=datcol.index[to_mask]).astype(flagger.dtype)
flagger = flagger.setFlags(field, loc=datcol.index[to_mask], flag=flags_to_block, force=True)
return data, flagger
\ No newline at end of file
......@@ -7,11 +7,12 @@
import pytest
import numpy as np
import pandas as pd
from dios import dios
from test.common import TESTFLAGGER
from saqc.funcs.data_modelling import modelling_polyFit, modelling_rollingMean
from saqc.funcs.data_modelling import modelling_polyFit, modelling_rollingMean, modelling_mask
TF = TESTFLAGGER[:1]
......@@ -44,3 +45,30 @@ def test_modelling_rollingMean_forRegular(dat, flagger):
flagger = flagger.initFlags(data)
modelling_rollingMean(data, "data", flagger, 5, eval_flags=True, min_periods=0, center=True)
modelling_rollingMean(data, "data", flagger, 5, eval_flags=True, min_periods=0, center=False)
@pytest.mark.parametrize("flagger", TF)
@pytest.mark.parametrize("dat", [pytest.lazy_fixture("course_1")])
def test_modelling_mask(dat, flagger):
    """Check that `modelling_mask` sets the flags of masked timestamps to NaN.

    Covers three seasonal masks (minute-, hour- and day-resolved stamps) and
    one explicit boolean mask passed via `mask_var`.
    """
    data, _ = dat()
    data = dios.DictOfSeries(data)
    flagger = flagger.initFlags(data)

    # "MM:SS" stamps: minutes 20 to 40 of every hour are masked.
    data_seasonal, flagger_seasonal = modelling_mask(
        data, "data", flagger, mode='seasonal', season_start="20:00", season_end="40:00"
    )
    flaggs = flagger_seasonal._flags["data"]
    assert flaggs[np.logical_and(20 <= flaggs.index.minute, 40 >= flaggs.index.minute)].isna().all()

    # "HH:MM:SS" stamps with start > end: the season wraps around midnight, so
    # the masked hours are "15 or later" OR "before 2". A logical AND of both
    # conditions can never hold and would make the assertion vacuous. The upper
    # bound is strict because the season ends at 02:00:00 sharp.
    data_seasonal, flagger_seasonal = modelling_mask(
        data, "data", flagger, mode='seasonal', season_start="15:00:00", season_end="02:00:00"
    )
    flaggs = flagger_seasonal._flags["data"]
    assert flaggs[np.logical_or(15 <= flaggs.index.hour, 2 > flaggs.index.hour)].isna().all()

    # "ddTHH:MM:SS" stamps: the season is defined on the day of month, so the
    # check has to use `index.day`, not `index.hour`. The upper bound is strict
    # because the season ends at 00:00:00 of the 10th.
    data_seasonal, flagger_seasonal = modelling_mask(
        data, "data", flagger, mode='seasonal', season_start="03T00:00:00", season_end="10T00:00:00"
    )
    flaggs = flagger_seasonal._flags["data"]
    assert flaggs[np.logical_and(3 <= flaggs.index.day, 10 > flaggs.index.day)].isna().all()

    # Explicit mask: every 5th value is marked True in a helper column.
    mask_ser = pd.Series(False, index=data["data"].index)
    mask_ser[::5] = True
    data["mask_ser"] = mask_ser
    # Re-initialise the flags so the flagger also knows the new column.
    flagger = flagger.initFlags(data)
    data_masked, flagger_masked = modelling_mask(data, "data", flagger, mode='mask_var', mask_var="mask_ser")
    flaggs = flagger_masked._flags["data"]
    assert flaggs[data_masked['mask_ser']].isna().all()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment