diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 71348bd8c5555833196ea45d5239ec1d1201518c..4aa45078f7b5b0ca082627a0ab12a76fb4730ae1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,6 @@ +variables: + GIT_SUBMODULE_STRATEGY: recursive + before_script: - export DEBIAN_FRONTEND=noninteractive - apt-get -qq update @@ -9,8 +12,6 @@ before_script: - export PYENV_ROOT="$HOME/.pyenv" - export PATH="$PYENV_ROOT/bin:$PATH" - eval "$(pyenv init -)" - - git clone https://git.ufz.de/rdm/dios ~/dios - - export PYTHONPATH="$HOME/dios":$PYTHONPATH test:python36: @@ -41,3 +42,21 @@ test:python38: - pip install -r requirements.txt - python -m pytest test - python -m saqc --config ressources/data/config_ci.csv --data ressources/data/data.csv --outfile /tmp/test.csv + +# Make html docu with sphinx +pages: + stage: deploy + script: + - pyenv install 3.6.9 + - pyenv shell 3.6.9 + - pip install --upgrade pip + - pip install -r requirements.txt + - cd sphinx-doc/ + - pip install -r requirements_sphinx.txt + - make html + - cp -r _build/html ../public + artifacts: + paths: + - public + only: + - develop diff --git a/.gitmodules b/.gitmodules index 68397aa969304a3dcc129f93735b949971fc9a89..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "dios"] - path = dios - url = https://git.ufz.de/rdm/dios.git diff --git a/CHANGELOG.md b/CHANGELOG.md index a5fd5fe9f103e900ac7051000dafa55a49d2edc0..9120d74be275a9fcb709f60ecd752c3facfbe1b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,7 +67,6 @@ coming soon ... - added the data processing module `proc_functions` - `flagCrossValidation` implemented - ## Bugfixes - `spikes_flagRaise` - overestimation of value courses average fixed - `spikes_flagRaise` - raise check window now closed on both sides @@ -76,5 +75,7 @@ coming soon ... - renamed `spikes_oddWater` to `spikes_flagMultivarScores` - added STRAY auto treshing algorithm to `spikes_flagMultivarScores` - added "unflagging" - postprocess to `spikes_flagMultivarScores` +- improved and extended masking ## Breaking Changes +- register is now a decorator instead of a wrapper diff --git a/README.md b/README.md index 96f878eb5da3dfd5d3e1aac58a6e03376cc07fce..aab7a5506acc3e99b58db25d982e66c2f9d2036a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[](https://git.ufz.de/rdm-software/saqc/-/commits/develop) + # System for automated Quality Control (SaQC) Quality Control of numerical data requires a significant amount of diff --git a/dios b/dios deleted file mode 160000 index e9a80225b02799fa668882149a39f4a734b4f280..0000000000000000000000000000000000000000 --- a/dios +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e9a80225b02799fa668882149a39f4a734b4f280 diff --git a/docs/ConfigurationFiles.md b/docs/ConfigurationFiles.md index b660206945cf20993f7c0d8ad9b6dc3a19fc8122..64743bb40d211db2a51adfb5ad07c64bbcbdd7be 100644 --- a/docs/ConfigurationFiles.md +++ b/docs/ConfigurationFiles.md @@ -1,8 +1,8 @@ # Configuration Files -The behaviour of SaQC is completely controlled by a text based configuration file. +The behaviour of SaQC can be completely controlled by a text based configuration file. ## Format -SaQC expects its configuration files to be semicolon-separated text files with a +SaQC expects configuration files to be semicolon-separated text files with a fixed header. Each row of the configuration file lists one variable and one or several test functions that are applied on the given variable. 
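For orientation, a minimal configuration file in the format described above could look like the following sketch. The rows are taken from the `ressources/data/config_ci.csv` touched by this patch; the concrete tests and parameters are illustrative only. Note that the patch switches the `flagRange` row from the single variable `SM1` to the quoted pattern `'.*'`, so that test is applied to every variable matching the regex:

```csv
varname;test;plot
SM2;harm_shift2Grid(freq="15Min");False
'.*';flagRange(min=10, max=60);False
SM2;flagMissing(nodata=NAN);False
SM2;spikes_flagMad(window="30d", z=3.5);False
```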
@@ -12,11 +12,11 @@ one variable and one or several test functions that are applied on the given var The header names are basically fixed, but if you really insist in custom configuration headers have a look [here](saqc/core/config.py). -| Name | Data Type | Description | Optional | +| Name | Data Type | Description | Required | |---------|----------------------------------------------|------------------------|----------| -| varname | string | name of a variable | no | -| test | [function notation](#test-function-notation) | test function | no | -| plot | boolean (`True`/`False`) | plot the test's result | yes | +| varname | string | name of a variable | yes | +| test | [function notation](#test-function-notation) | test function | yes | +| plot | boolean (`True`/`False`) | plot the test's result | no | ### Test function notation diff --git a/docs/funcs/FormalDescriptions.md b/docs/funcs/FormalDescriptions.md new file mode 100644 index 0000000000000000000000000000000000000000..12f286add9f9c5a488025d00b6d232eb02c2f38c --- /dev/null +++ b/docs/funcs/FormalDescriptions.md @@ -0,0 +1,86 @@ +# Mathematical descriptions + +A collection of detailed mathematical descriptions. + +## Index + +- [spikes_flagRaise](#spikes_flagraise) +- [spikes_flagSpektrumBased](#spikes_flagspektrumbased) +- [breaks_flagSpektrumBased](#breaks_flagspektrumbased) +- [sm_flagConstants](#sm_flagconstants) + + +## spikes_flagRaise + +The value $`x_{k}`$ of a time series $`x`$ with associated +timestamps $`t_i`$, is flagged a rise, if: + +1. There is any value $`x_{s}`$, preceeding $`x_{k}`$ within `raise_window` range, so that: + * $` M = |x_k - x_s | > `$ `thresh` $` > 0`$ +2. The weighted average $`\mu^*`$ of the values, preceeding $`x_{k}`$ within `average_window` range indicates, that $`x_{k}`$ doesnt return from an outliererish value course, meaning that: + * $` x_k > \mu^* + ( M `$ / `mean_raise_factor` $`)`$ +3. Additionally, if `min_slope` is not `None`, $`x_{k}`$ is checked for being sufficiently divergent from its very predecessor $`x_{k-1}`$, meaning that, it is additionally checked if: + * $`x_k - x_{k-1} > `$ `min_slope` + * $`t_k - t_{k-1} > `$ `min_slope_weight`*`intended_freq` + +The weighted average $`\mu^*`$ was calculated with weights $`w_{i}`$, defined by: +* $`w_{i} = (t_i - t_{i-1})`$ / `intended_freq`, if $`(t_i - t_{i-1})`$ < `intended_freq` and $`w_i =1`$ otherwise. + + + +The value $`x_{k}`$ of a time series $`x_t`$ with +timestamps $`t_i`$ is considered a spikes, if: + + +## spikes_flagSpektrumBased + + +1. The quotient to its preceding data point exceeds a certain bound: + * $` |\frac{x_k}{x_{k-1}}| > 1 + `$ `raise_factor`, or + * $` |\frac{x_k}{x_{k-1}}| < 1 - `$ `raise_factor` +2. The quotient of the second derivative $`x''`$, at the preceding + and subsequent timestamps is close enough to 1: + * $` |\frac{x''_{k-1}}{x''_{k+1}} | > 1 - `$ `deriv_factor`, and + * $` |\frac{x''_{k-1}}{x''_{k+1}} | < 1 + `$ `deriv_factor` +3. The dataset $`X = x_i, ..., x_{k-1}, x_{k+1}, ..., x_j`$, with + $`|t_{k-1} - t_i| = |t_j - t_{k+1}| =`$ `noise_window` fulfills the + following condition: + `noise_func`$`(X) <`$ `noise_thresh` + +## breaks_flagSpektrumBased + +A value $`x_k`$ of a time series $`x_t`$ with timestamps $`t_i`$, is considered to be a break, if: + +1. $`x_k`$ represents a sufficiently large relative jump: + + $`|\frac{x_k - x_{k-1}}{x_k}| >`$ `thresh_rel` + +2. $`x_k`$ represents a sufficient absolute jump: + + $`|x_k - x_{k-1}| >`$ `thresh_abs` + +3. 
The dataset $`X = x_i, ..., x_{k-1}, x_{k+1}, ..., x_j`$, with $`|t_{k-1} - t_i| = |t_j - t_{k+1}| =`$ `first_der_window` + fulfills the following condition: + + $`|x'_k| >`$ `first_der_factor` $` \cdot \bar{X} `$ + + where $`\bar{X}`$ denotes the arithmetic mean of $`X`$. + +4. The ratio (last/this) of the second derivatives is close to 1: + + $` 1 -`$ `scnd_der_ratio_margin_1` $`< |\frac{x''_{k-1}}{x_{k''}}| < 1 + `$`scnd_der_ratio_margin_1` + +5. The ratio (this/next) of the second derivatives is sufficiently height: + + $`|\frac{x''_{k}}{x''_{k+1}}| > `$`scnd_der_ratio_margin_2` + +## sm_flagConstants + +Any set of consecutive values +$`x_k,..., x_{k+n}`$, of a time series $`x`$ is flagged, if: + +1. $`n > `$`window` +2. $`\sigma(x_k, x_{k+1},..., x_{k+n}) < `$`thresh` +3. $`\max(x'_{k-n-s}, x'_{k-n-s+1},..., x'_{k-n+s}) \geq`$ `deriv_min`, with $`s`$ denoting periods per `precipitation_window` +4. $`\min(x'_{k-n-s}, x'_{k-n-s+1},..., x'_{k-n+s}) \leq`$ `deriv_max`, with $`s`$ denoting periods per `precipitation_window` +5. $`\mu(x_k, x_{k+1},..., x_{k+n}) \le \max(x) \cdot`$ `tolerance` \ No newline at end of file diff --git a/environment.yml b/environment.yml index c08ddee3afdb51bd620f7b7c36e338dac72d42df..aabe58efcf1dab30195217d3e75e53abe969cf63 100644 --- a/environment.yml +++ b/environment.yml @@ -10,6 +10,10 @@ dependencies: - click - scikit-learn - pyarrow + - PyWavelets - pip - pip: - python-intervals + - dtw + - mlxtend + - outlier-utils diff --git a/requirements.txt b/requirements.txt index 5a8333fca668a0c43377f5ae0467bd4140c10209..45132e2397cf607c5b17e106d94eed3a0b0ce753 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,25 +1,26 @@ -attrs==19.3.0 +attrs==20.2.0 Click==7.1.2 cycler==0.10.0 +dios==0.6.0 dtw==1.4.0 kiwisolver==1.2.0 importlib-metadata==1.7.0 joblib==0.16.0 -llvmlite==0.31.0 +llvmlite==0.34.0 mlxtend==0.17.3 -matplotlib==3.3.0 -more-itertools==8.4.0 -numba==0.48.0 -numpy==1.19.1 +matplotlib==3.3.1 +more-itertools==8.5.0 +numba==0.51.2 +numpy==1.19.2 outlier==0.2 utils==1.0.1 outlier-utils==0.0.3 packaging==20.4 -pandas==1.0.1 +pandas==1.1.2 pluggy==0.13.1 pyparsing==2.4.7 py==1.9.0 -pyarrow==1.0.0 +pyarrow==1.0.1 pytest-lazy-fixture==0.6.3 pytest==6.0.1 python-dateutil==2.8.1 diff --git a/ressources/data/config_ci.csv b/ressources/data/config_ci.csv index f613a9385629b662fe18293ee5ea5de015dbbb98..f631338ade105552e37c61d16ea72aab50dab106 100644 --- a/ressources/data/config_ci.csv +++ b/ressources/data/config_ci.csv @@ -1,6 +1,6 @@ varname;test;plot SM2;harm_shift2Grid(freq="15Min");False -SM1;flagRange(min=10, max=60);False +'.*';flagRange(min=10, max=60);False SM2;flagMissing(nodata=NAN);False SM2;flagRange(min=10, max=60);False SM2;spikes_flagMad(window="30d", z=3.5);False diff --git a/saqc/__main__.py b/saqc/__main__.py index 65e7c868643e681eb3e066198f2b34a1f6fdde15..e8acb5aff3864117f723be35a8ee3a7f2bc13260 100644 --- a/saqc/__main__.py +++ b/saqc/__main__.py @@ -57,6 +57,7 @@ def main(config, data, flagger, outfile, nodata, log_level, fail): data_result, flagger_result = saqc.readConfig(config).getResult() + return if outfile: data_result = data_result.to_df() flags = flagger_result.getFlags().to_df() diff --git a/saqc/core/core.py b/saqc/core/core.py index d53d2b5a137164f7df8cdb89ef591cc07da33d78..d00752a8edd058b8b44a955bdc41153f448d78de 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -9,7 +9,7 @@ TODOS: import logging from copy import deepcopy -from typing import List, Tuple +from typing import Any, Dict, List import pandas as pd import 
dios @@ -18,20 +18,26 @@ import timeit from saqc.lib.plotting import plotHook, plotAllHook from saqc.lib.tools import isQuoted -from saqc.core.register import FUNC_MAP, SaQCFunc from saqc.core.reader import readConfig from saqc.flagger import BaseFlagger, CategoricalFlagger, SimpleFlagger, DmpFlagger +from saqc.core.register import FUNC_MAP logger = logging.getLogger("SaQC") -def _handleErrors(exc, func, field, policy): +def _handleErrors(exc, func_dump, policy): + func_name = func_dump['func_name'] + func_kws = func_dump['func_kws'] + field = func_dump['field'] + ctrl_kws = func_dump['ctrl_kws'] + lineno = ctrl_kws['lineno'] + func_expr = ctrl_kws['lineno'] msg = f"Execution failed. Variable: '{field}', " - if func.lineno is not None and func.expr is not None: - msg += f"Config line {func.lineno}: '{func.expr}', " + if lineno is not None and func_expr is not None: + msg += f"Config line {lineno}: '{func_expr}', " else: - msg += f"Function: {func.func.__name__ }(), parameters: '{func.kwargs}', " + msg += f"Function: {func_name}(), parameters: '{func_kws}', " msg += f"Exception:\n{type(exc).__name__}: {exc}" if policy == "ignore": @@ -54,6 +60,9 @@ def _prepInput(flagger, data, flags): raise TypeError("data should not use MultiIndex") data = dios.to_dios(data) + if not hasattr(data.columns, "str"): + raise TypeError("expected dataframe columns of type string") + if not isinstance(flagger, BaseFlagger): # NOTE: we should generate that list automatically, # it won't ever be complete otherwise @@ -96,14 +105,15 @@ _setup() class SaQC: - def __init__(self, flagger, data, flags=None, nodata=np.nan, error_policy="raise"): + def __init__(self, flagger, data, flags=None, nodata=np.nan, to_mask=None, error_policy="raise"): data, flags = _prepInput(flagger, data, flags) self._data = data self._nodata = nodata + self._to_mask = to_mask self._flagger = self._initFlagger(data, flagger, flags) self._error_policy = error_policy # NOTE: will be filled by calls to `_wrap` - self._to_call: List[Tuple[str, SaQCFunc]] = [] + self._to_call: List[Dict[str, Any]] = [] # todo fix the access everywhere def _initFlagger(self, data, flagger, flags): """ Init the internal flagger object. 
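Since `_to_call` changes here from a list of `(field, SaQCFunc)` tuples to a plain list of dictionaries, the following sketch shows the shape of one such entry as it is assembled by `_wrap()`/`inner()` further down in this file. The concrete values are made up for illustration; only the keys follow the patch:

```python
import numpy as np
from saqc.core.register import FUNC_MAP  # populated once the saqc.funcs modules are imported

# Illustrative layout of one func_dump entry appended to SaQC._to_call
# (keys as assembled by _wrap()/inner(); the concrete values are examples only).
func_dump = {
    "func_name": "flagRange",                     # key into FUNC_MAP
    "func": FUNC_MAP["flagRange"]["func"],        # the registered callable
    "func_args": (),                              # positional arguments of the queued call
    "func_kws": {"min": 10, "max": 60, "nodata": np.nan},  # nodata injected via setdefault
    "ctrl_kws": {
        "masking": "field",                       # from @register(masking=...)
        "to_mask": None,                          # falls back to SaQC._to_mask
        "plot": False,
        "inplace": False,
        "lineno": None,
        "expr": None,
    },
    "field": "SM2",                               # one entry per matched field
}
```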
@@ -115,14 +125,14 @@ class SaQC: # ensure all data columns merged = flagger.initFlags(data) if flags is not None: - merged = merged.merge(flagger.initFlags(flags=flags)) + merged = merged.merge(flagger.initFlags(flags=flags), inplace=True) if flagger.initialized: - merged = merged.merge(flagger) + merged = merged.merge(flagger, inplace=True) return merged def readConfig(self, fname): - config = readConfig(fname) + config = readConfig(fname, self._flagger) out = deepcopy(self) for func, field, kwargs, plot, lineno, expr in config: @@ -150,22 +160,27 @@ class SaQC: # method instead of intermingling it with the computation data, flagger = self._data, self._flagger - for field, func in self._to_call: - logger.debug(f"processing: {field}, {func.__name__}, {func.kwargs}") + for func_dump in self._to_call: + func_name = func_dump['func_name'] + func_kws = func_dump['func_kws'] + field = func_dump['field'] + plot = func_dump["ctrl_kws"]["plot"] + logger.debug(f"processing: {field}, {func_name}, {func_kws}") try: t0 = timeit.default_timer() - data_result, flagger_result = func(data=data, flagger=flagger, field=field) + data_result, flagger_result = _saqcCallFunc(func_dump, data, flagger) + except Exception as e: t1 = timeit.default_timer() - logger.debug(f"{func.__name__} failed after {t1-t0} sec") - _handleErrors(e, func, field, self._error_policy) + logger.debug(f"{func_name} failed after {t1 - t0} sec") + _handleErrors(e, func_dump, self._error_policy) continue else: t1 = timeit.default_timer() - logger.debug(f"{func.__name__} finished after {t1-t0} sec") + logger.debug(f"{func_name} finished after {t1 - t0} sec") - if func.plot: + if plot: plotHook( data_old=data, data_new=data_result, @@ -173,16 +188,20 @@ class SaQC: flagger_new=flagger_result, sources=[], targets=[field], - plot_name=func.__name__, + plot_name=func_name, ) data = data_result flagger = flagger_result - if any([func.plot for _, func in self._to_call]): + if any([fdump["ctrl_kws"]["plot"] for fdump in self._to_call]): plotAllHook(data, flagger) - return SaQC(flagger, data, nodata=self._nodata, error_policy=self._error_policy) + # This is much faster for big datasets that to throw everything in the constructor. + # Simply because of _initFlagger -> merge() -> mergeDios() over all columns. + new = SaQC(SimpleFlagger(), dios.DictOfSeries(), nodata=self._nodata, error_policy=self._error_policy) + new._flagger, new._data = flagger, data + return new def getResult(self): """ @@ -196,24 +215,36 @@ class SaQC: realization = self.evaluate() return realization._data, realization._flagger - def _wrap(self, func, lineno=None, expr=None): - def inner(field: str, *args, regex: bool = False, to_mask=None, **kwargs): - + def _wrap(self, func_name, lineno=None, expr=None): + def inner(field: str, *args, regex: bool = False, to_mask=None, plot=False, inplace=False, **kwargs): fields = [field] if not regex else self._data.columns[self._data.columns.str.match(field)] - if func.__name__ in ("flagGeneric", "procGeneric"): - # NOTE: - # We need to pass `nodata` to the generic functions - # (to implement stuff like `ismissing`). As we - # should not interfere with proper nodata attributes - # of other test functions (e.g. 
`flagMissing`) we - # special case the injection - kwargs["nodata"] = kwargs.get("nodata", self._nodata) + kwargs.setdefault('nodata', self._nodata) + + # to_mask is a control keyword + ctrl_kws = { + **(FUNC_MAP[func_name]["ctrl_kws"]), + 'to_mask': to_mask or self._to_mask, + 'plot': plot, + 'inplace': inplace, + 'lineno': lineno, + 'expr': expr + } + func = FUNC_MAP[func_name]["func"] + + func_dump = { + "func_name": func_name, + "func": func, + "func_args": args, + "func_kws": kwargs, + "ctrl_kws": ctrl_kws, + } + + out = self if inplace else self.copy() - out = deepcopy(self) for field in fields: - f = SaQCFunc(func, *args, lineno=lineno, expression=expr, to_mask=to_mask, **kwargs) - out._to_call.append((field, f)) + dump_copy = {**func_dump, "field": field} + out._to_call.append(dump_copy) return out return inner @@ -222,9 +253,97 @@ class SaQC: """ All failing attribute accesses are redirected to __getattr__. We use this mechanism to make the - `RegisterFunc`s appear as `SaQC`-methods with + registered functions as `SaQC`-methods without actually implementing them. """ if key not in FUNC_MAP: raise AttributeError(f"no such attribute: '{key}'") - return self._wrap(FUNC_MAP[key]) + return self._wrap(key) + + def copy(self): + return deepcopy(self) + + +def _saqcCallFunc(func_dump, data, flagger): + func = func_dump['func'] + func_name = func_dump['func_name'] + func_args = func_dump['func_args'] + func_kws = func_dump['func_kws'] + field = func_dump['field'] + ctrl_kws = func_dump['ctrl_kws'] + to_mask = ctrl_kws['to_mask'] + masking = ctrl_kws['masking'] + + if masking == 'all': + columns = data.columns + elif masking == 'none': + columns = [] + elif masking == 'field': + columns = [field] + else: + raise ValueError(f"masking: {masking}") + to_mask = flagger.BAD if to_mask is None else to_mask + + # NOTE: + # when assigning new variables to `data`, the respective + # field is missing in `flags`, so we add it if necessary in + # order to keep the columns from `data` and `flags` in sync. + # NOTE: + # Also assigning a new variable to `flags` only, is possible. + # This is also is handled here. + # NOTE: + # Any newly assigned column can safely be ignored by masking, thus + # this check comes after setting `columns` + if field not in flagger.getFlags(): + flagger = flagger.merge(flagger.initFlags(data=pd.Series(name=field, dtype=np.float64))) + + data_in, mask = _maskData(data, flagger, columns, to_mask) + data_result, flagger_result = func(data_in, field, flagger, *func_args, func_name=func_name, **func_kws) + data_result = _unmaskData(data, mask, data_result, flagger_result, to_mask) + + return data_result, flagger_result + + +def _maskData(data, flagger, columns, to_mask): + # TODO: this is heavily undertested + mask = flagger.isFlagged(field=columns, flag=to_mask, comparator='==') + data = data.copy() + for c in columns: + col_mask = mask[c].values + if np.any(col_mask): + col_data = data[c].values.astype(np.float64) + col_data[col_mask] = np.nan + data[c] = col_data + return data, mask + + +def _unmaskData(data_old, mask_old, data_new, flagger_new, to_mask): + # TODO: this is heavily undertested + + # NOTE: + # we only need to respect columns, that were masked, + # and are also still present in new data. 
+ # this throws out: + # - any newly assigned columns + # - columns that were excluded from masking + columns = mask_old.dropempty().columns.intersection(data_new.dropempty().columns) + mask_new = flagger_new.isFlagged(field=columns, flag=to_mask, comparator="==") + + for col in columns: + was_masked = mask_old[col] + is_masked = mask_new[col] + + # if index changed we just go with the new data. + # A test should use `register(masking='none')` if it changes + # the index but, does not want to have all NaNs on flagged locations. + if was_masked.index.equals(is_masked.index): + mask = was_masked.values & is_masked.values & data_new[col].isna().values + + # reapplying old values on masked positions + if np.any(mask): + data = np.where(mask, data_old[col].values, data_new[col].values) + data_new[col] = pd.Series(data=data, index=is_masked.index) + + return data_new + + diff --git a/saqc/core/reader.py b/saqc/core/reader.py index 7f0ba93f3b106fb1caad3d90955146beff74704f..12e8728fb658d431eac407de748f6301459db1b6 100644 --- a/saqc/core/reader.py +++ b/saqc/core/reader.py @@ -33,7 +33,13 @@ def _handleComments(df): df.loc[df[F.VARNAME].str.startswith(COMMENT)] = EMPTY for col in df: - df[col] = df[col].str.split(COMMENT, expand=True).iloc[:, 0].str.strip() + try: + df[col] = df[col].str.split(COMMENT, expand=True).iloc[:, 0].str.strip() + except AttributeError: + # NOTE: + # if `df[col]` is not of type string, we know, that + # there are no comments and the `.str` access fails + pass return df @@ -47,7 +53,7 @@ def _injectOptionalColumns(df): return df -def _parseConfig(df): +def _parseConfig(df, flagger): to_call = [] for lineno, (_, field, expr, plot) in enumerate(df.itertuples()): if field == "None": @@ -57,12 +63,12 @@ def _parseConfig(df): if pd.isnull(expr): raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing") tree = ast.parse(expr, mode="eval") - cp = ConfigFunctionParser(tree.body) + cp = ConfigFunctionParser(tree.body, flagger) to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr)) return to_call -def readConfig(fname): +def readConfig(fname, flagger): df = pd.read_csv( fname, sep=r"\s*;\s*", @@ -79,8 +85,8 @@ def readConfig(fname): df[F.VARNAME] = df[F.VARNAME].replace(r"^\s*$", np.nan, regex=True) df[F.TEST] = df[F.TEST].replace(r"^\s*$", np.nan, regex=True) - df[F.PLOT] = df[F.PLOT].replace({"False": "", EMPTY: ""}) + df[F.PLOT] = df[F.PLOT].replace({"False": "", EMPTY: "", np.nan: ""}) df = df.astype({F.PLOT: bool}) - df = _parseConfig(df) + df = _parseConfig(df, flagger) return df diff --git a/saqc/core/register.py b/saqc/core/register.py index 2863f182556a0a0ca7cd2e0a20ae8c2f73e173b6..15a56868829750d0d0daeef58c05eb7d4bd525a6 100644 --- a/saqc/core/register.py +++ b/saqc/core/register.py @@ -1,211 +1,17 @@ #!/usr/bin/env python -from operator import itemgetter -from inspect import signature, Parameter, _VAR_POSITIONAL, _VAR_KEYWORD -from typing import Tuple, Dict, Generator, Any, Set - -import numpy as np -import pandas as pd - - -class Func: - """ - This class is basically extends functool.partial` and in - fact, most of the constructor implementation is taken - directly from the python standard lib. Not messing with the - partial class directly proved to be an easier aproach though. - - Besides the implementation of a partial like functionality - `Func` provides a couple of properties/methods used to check - the passed arguments before the actual function call happens. 
- """ - - def __init__(self, *args, **kwargs): - if len(args) < 1: - raise TypeError("'Func' takes at least one argument") - func, *args = args - if not callable(func): - raise TypeError("the first argument must be callable") - - if isinstance(func, Func): - args = func.args + args - kwargs = {**func.kwargs, **kwargs} - func = func.func - - self._signature = signature(func) - # NOTE: - # bind_partial comes with a validity check, so let's use it - self._signature.bind_partial(*args, **kwargs) - - self.__name__ = func.__name__ - self.func = func - self.args = args - self.kwargs = kwargs - - def __repr__(self): - return f"{self.__class__.__name__}({self.__name__}, {self.args}, {self.kwargs})" - - def __call__(self, *args, **kwargs): - keywords = {**self.kwargs, **kwargs} - return self.func(*self.args, *args, **keywords) - - @property - def _parameters(self) -> Generator[Tuple[str, Parameter], None, None]: - """ - yield all 'normal' parameters and their names, skipping - VAR_POSITIONALs (*args) and VAR_KEYWORDs (**kwargs) as - the don't help evaluating the correctness of the passed - arguments. - """ - for k, v in self._signature.parameters.items(): - if v.kind in (_VAR_POSITIONAL, _VAR_KEYWORD): - continue - yield k, v - - @property - def parameters(self) -> Tuple[str]: - """ - return the names of all parameters, i.e. positional - and keyword arguments without varargs - """ - return tuple(map(itemgetter(0), self._parameters)) - - @property - def optionals(self) -> Tuple[str]: - """ - return the names of all optional parameters without varargs - """ - return tuple(k for k, v in self._parameters if v.default is not Parameter.empty) - - def _getPositionals(self): - """ - return the names of all positional parameters without varargs - """ - return tuple(k for k, v in self._parameters if v.default is Parameter.empty) - - positionals = property(_getPositionals) - - def addGlobals(self, globs: Dict[str, Any]): - """ - Add the given key-value pairs to the function's global - scope. We abuse the __globals__ mechanism mainly to - make certain other functions (dis-)available within the - 'Func' body. - """ - self.func.__globals__.update(globs) - return self - - def getUnbounds(self) -> Set[str]: - """ - returns all the names of all unbound variables, - i.e. not yet `partialed` parameters - """ - return set(self.positionals[len(self.args):]) - set(self.kwargs.keys()) - - -class RegisterFunc(Func): - """ - This class acts as a simple wrapper around all registered - functions. Currently its sole purpose is to inject additional - call arguments - """ - - def __call__(self, *args, **kwargs): - # NOTE: - # injecting the function name into the - # keywords is sort of hacky - kwargs = {"func_name": self.__name__, **kwargs} - return super().__call__(*args, **kwargs) - - -class SaQCFunc(Func): - """ - This class represents all test-, process and horminzation functions - provided through `SaQC`. Every call to an `SaQC` object will be wrapped - with all its non-dynamic arguments. 
- - `SaQCFunc`s are callable and expose the signature `data`, `field` and - `flagger` - """ - - # NOTE: - # we should formalize the function interface somehow, somewhere - _injectables = ("data", "field", "flagger") - - def __init__(self, *args, plot=False, lineno=None, expression=None, to_mask=None, **kwargs): - super().__init__(*args, **kwargs) - - unbound = self.getUnbounds() - if unbound: - raise TypeError(f"missing required arguments: {', '.join(unbound)}") - - self.plot = plot - self.lineno = lineno - self.expr = expression - self.to_mask = to_mask - - def _getPositionals(self) -> Tuple[int]: - """ - Returns all positional (i.e. non-optional arguments) - without the `data`, `field` and `flagger` - """ - positionals = super()._getPositionals() - return tuple(k for k in positionals if k not in self._injectables) - - positionals = property(_getPositionals) - - def __call__(self, data, field, flagger): - # NOTE: - # when assigning new variables to `data`, the respective - # field is missing in `flags`, so we add it if necessary in - # order to keep the columns from `data` and `flags` in sync - if field not in flagger.getFlags(): - flagger = flagger.merge(flagger.initFlags(data=pd.Series(name=field))) - - data_in = self._maskData(data, flagger) - - data_result, flagger_result = self.func(data_in, field, flagger, *self.args, **self.kwargs) - - data_result = self._unmaskData(data, flagger, data_result, flagger_result) - - return data_result, flagger_result - - def _maskData(self, data, flagger): - to_mask = flagger.BAD if self.to_mask is None else self.to_mask - mask = flagger.isFlagged(flag=to_mask, comparator='==') - data = data.copy() - data[mask] = np.nan - return data - - def _unmaskData(self, data_old, flagger_old, data_new, flagger_new): - to_mask = flagger_old.BAD if self.to_mask is None else self.to_mask - mask_old = flagger_old.isFlagged(flag=to_mask, comparator="==") - mask_new = flagger_new.isFlagged(flag=to_mask, comparator="==") - - for col, left in data_new.indexes.iteritems(): - if col not in mask_old: - continue - right = mask_old[col].index - # NOTE: ignore columns with changed indices (assumption: harmonization) - if left.equals(right): - # NOTE: Don't overwrite data, that was masked, but is not considered - # flagged anymore and also respect newly set data on masked locations. 
- mask = mask_old[col] & mask_new[col] & data_new[col].isna() - data_new.loc[mask, col] = data_old.loc[mask, col] - return data_new - +from typing import Dict, Any # NOTE: # the global SaQC function store, # will be filled by calls to register -FUNC_MAP: Dict[str, RegisterFunc] = {} +FUNC_MAP: Dict[str, Any] = {} + +def register(masking='all'): + def inner(func): + ctrl_kws = dict(masking=masking) + FUNC_MAP[func.__name__] = dict(func=func, ctrl_kws=ctrl_kws) + return func + return inner -def register(func, name=None): - if name is None: - name = func.__name__ - else: - func.__name__ = name - func = RegisterFunc(func) - FUNC_MAP[name] = func - return func diff --git a/saqc/core/visitor.py b/saqc/core/visitor.py index 9ca875d802c507f16eea9bc89cb1fbdabd6f863d..67a4f0d745ed9f67e159a3a335b6c41f2c1fe4e6 100644 --- a/saqc/core/visitor.py +++ b/saqc/core/visitor.py @@ -136,9 +136,15 @@ class ConfigFunctionParser(ast.NodeVisitor): ast.List, ) - def __init__(self, node): + def __init__(self, node, flagger): self.kwargs = {} + self.environment = { + "GOOD": flagger.GOOD, + "BAD": flagger.BAD, + "UNFLAGGED": flagger.UNFLAGGED, + **ENVIRONMENT, + } self.func = self.visit_Call(node) def visit_Call(self, node): @@ -153,14 +159,15 @@ class ConfigFunctionParser(ast.NodeVisitor): raise NameError(f"unknown function '{func_name}'") self.generic_visit(node) - return FUNC_MAP[func_name] + return func_name def visit_keyword(self, node): k, v = node.arg, node.value check_tree = True - # NOTE: not a constant or variable, should be function call + # NOTE: `node` is not a constant or a variable, + # so it should be a function call try: visitor = ConfigExpressionParser(v) args = ast.arguments( @@ -189,9 +196,14 @@ class ConfigFunctionParser(ast.NodeVisitor): # -> after all keywords where visited we end up with # a dictionary holding all the passed arguments as # real python objects - co = compile(ast.fix_missing_locations(ast.Interactive(body=[vnode])), "<ast>", mode="single") - # NOTE: only pass a copy to not clutter the ENVIRONMENT - exec(co, {**ENVIRONMENT}, self.kwargs) + co = compile( + ast.fix_missing_locations(ast.Interactive(body=[vnode])), + "<ast>", + mode="single" + ) + # NOTE: only pass a copy to not clutter the self.environment + exec(co, {**self.environment}, self.kwargs) + # let's do some more validity checks if check_tree: diff --git a/saqc/flagger/baseflagger.py b/saqc/flagger/baseflagger.py index dd3016de9ee711cd7a763f71c10ba18e6fd0148c..2d86d9719096c9b1a9c3ceca756af0cf75003325 100644 --- a/saqc/flagger/baseflagger.py +++ b/saqc/flagger/baseflagger.py @@ -8,7 +8,7 @@ from abc import ABC, abstractmethod from typing import TypeVar, Union, Any, List, Optional import pandas as pd -import dios.dios as dios +import dios from saqc.lib.tools import assertScalar, mergeDios, toSequence, mutateIndex @@ -64,22 +64,27 @@ class BaseFlagger(ABC): if data is not None: if not isinstance(data, diosT): data = dios.DictOfSeries(data) - flags = data.copy() - flags[:] = self.UNFLAGGED + + flags = dios.DictOfSeries(columns=data.columns) + for c in flags.columns: + flags[c] = pd.Series(self.UNFLAGGED, index=data[c].index) + flags = flags.astype(self.dtype) else: - if not isinstance(data, diosT): + if not isinstance(flags, diosT): flags = dios.DictOfSeries(flags) - newflagger = self.copy() - newflagger._flags = flags.astype(self.dtype) + newflagger = self.copy(flags=flags) return newflagger - def rename(self, field: str, new_name: str): - newflagger = self.copy() - newflagger._flags.columns = 
mutateIndex(newflagger._flags.columns, field, new_name) - return newflagger + def rename(self, field: str, new_name: str, inplace=False): + if inplace: + out = self + else: + out = self.copy() + out._flags.columns = mutateIndex(out._flags.columns, field, new_name) + return out - def merge(self, other: BaseFlaggerT, join: str = "merge"): + def merge(self, other: BaseFlaggerT, subset: Optional[List] = None, join: str = "merge", inplace=False): """ Merge the given flagger 'other' into self """ @@ -87,19 +92,25 @@ class BaseFlagger(ABC): if not isinstance(other, self.__class__): raise TypeError(f"flagger of type '{self.__class__}' needed") - newflagger = self.copy(flags=mergeDios(self.flags, other.flags, join=join)) - return newflagger + if inplace: + self._flags = mergeDios(self._flags, other._flags, subset=subset, join=join) + return self + else: + return self.copy(flags=mergeDios(self._flags, other._flags, subset=subset, join=join)) - def slice(self, field: FieldsT = None, loc: LocT = None, drop: FieldsT = None) -> BaseFlaggerT: + def slice(self, field: FieldsT = None, loc: LocT = None, drop: FieldsT = None, inplace=False) -> BaseFlaggerT: """ Return a potentially trimmed down copy of self. """ if drop is not None: if field is not None: raise TypeError("either 'field' or 'drop' can be given, but not both") field = self._flags.columns.drop(drop, errors="ignore") - flags = self.getFlags(field=field, loc=loc) - flags = dios.to_dios(flags) - newflagger = self.copy(flags=flags) - return newflagger + flags = self.getFlags(field=field, loc=loc).to_dios() + + if inplace: + self._flags = flags + return self + else: + return self.copy(flags=flags) def getFlags(self, field: FieldsT = None, loc: LocT = None) -> PandasT: """ Return a potentially, to `loc`, trimmed down version of flags. @@ -126,9 +137,11 @@ class BaseFlagger(ABC): field = slice(None) if field is None else self._check_field(field) indexer = (loc, field) - return self.flags.aloc[indexer] + # this is a bug in `dios.aloc`, which may return a shallow copied dios, if `slice(None)` is passed + # as row indexer. Thus is because pandas `.loc` return a shallow copy if a null-slice is passed to a series. + return self._flags.copy().aloc[indexer] - def setFlags(self, field: str, loc: LocT = None, flag: FlagT = None, force: bool = False, **kwargs) -> BaseFlaggerT: + def setFlags(self, field: str, loc: LocT = None, flag: FlagT = None, force: bool = False, inplace=False, **kwargs) -> BaseFlaggerT: """Overwrite existing flags at loc. 
If `force=False` (default) only flags with a lower priority are overwritten, @@ -146,17 +159,21 @@ class BaseFlagger(ABC): this = self.getFlags(field=field, loc=loc) row_indexer = this < flag - out = deepcopy(self) + if inplace: + out = self + else: + out = deepcopy(self) + out._flags.aloc[row_indexer, field] = flag return out - def clearFlags(self, field: str, loc: LocT = None, **kwargs) -> BaseFlaggerT: + def clearFlags(self, field: str, loc: LocT = None, inplace=False, **kwargs) -> BaseFlaggerT: assertScalar("field", field, optional=False) if "force" in kwargs: raise ValueError("Keyword 'force' is not allowed here.") if "flag" in kwargs: raise ValueError("Keyword 'flag' is not allowed here.") - return self.setFlags(field=field, loc=loc, flag=self.UNFLAGGED, force=True, **kwargs) + return self.setFlags(field=field, loc=loc, flag=self.UNFLAGGED, force=True, inplace=inplace, **kwargs) def isFlagged(self, field=None, loc: LocT = None, flag: FlagT = None, comparator: str = ">") -> PandasT: """ @@ -211,9 +228,16 @@ class BaseFlagger(ABC): return flagged def copy(self, flags=None) -> BaseFlaggerT: - out = deepcopy(self) - if flags is not None: + if flags is None: + out = deepcopy(self) + else: + # if flags is given and self.flags is big, + # this hack will bring some speed improvement + saved = self._flags + self._flags = None + out = deepcopy(self) out._flags = flags + self._flags = saved return out def isValidFlag(self, flag: FlagT) -> bool: @@ -241,12 +265,12 @@ class BaseFlagger(ABC): # https://git.ufz.de/rdm-software/saqc/issues/46 failed = [] if isinstance(field, str): - if field not in self.flags: + if field not in self._flags: failed += [field] else: try: for f in field: - if f not in self.flags: + if f not in self._flags: failed += [f] # not iterable, probably a slice or # any indexer we dont have to check diff --git a/saqc/flagger/dmpflagger.py b/saqc/flagger/dmpflagger.py index 4778d384c421992f0d445984de683c3a12f13df9..0a0acd9e37fe6b7a5b52dc5382c1303fda61dd74 100644 --- a/saqc/flagger/dmpflagger.py +++ b/saqc/flagger/dmpflagger.py @@ -4,11 +4,11 @@ import subprocess import json from copy import deepcopy -from typing import TypeVar +from typing import TypeVar, Optional, List import pandas as pd -import dios.dios as dios +import dios from saqc.flagger.categoricalflagger import CategoricalFlagger from saqc.lib.tools import assertScalar, mergeDios, mutateIndex @@ -74,32 +74,38 @@ class DmpFlagger(CategoricalFlagger): # implicit set self._flags, and make deepcopy of self aka. DmpFlagger newflagger = super().initFlags(data=data, flags=flags) - newflagger._causes = newflagger.flags.astype(str) - newflagger._comments = newflagger.flags.astype(str) + newflagger._causes = newflagger._flags.astype(str) + newflagger._comments = newflagger._flags.astype(str) newflagger._causes[:], newflagger._comments[:] = "", "" return newflagger - def slice(self, field=None, loc=None, drop=None): - newflagger = super().slice(field=field, loc=loc, drop=drop) - flags = newflagger.flags + def slice(self, field=None, loc=None, drop=None, inplace=False): + newflagger = super().slice(field=field, loc=loc, drop=drop, inplace=inplace) + flags = newflagger._flags newflagger._causes = self._causes.aloc[flags, ...] newflagger._comments = self._comments.aloc[flags, ...] 
return newflagger - def rename(self, field: str, new_name: str): - newflagger = super().rename(field, new_name) + def rename(self, field: str, new_name: str, inplace=False): + newflagger = super().rename(field, new_name, inplace=inplace) newflagger._causes.columns = newflagger._flags.columns newflagger._comments.columns = newflagger._flags.columns return newflagger - def merge(self, other: DmpFlaggerT, join: str = "merge"): + def merge(self, other: DmpFlaggerT, subset: Optional[List] = None, join: str = "merge", inplace=False): assert isinstance(other, DmpFlagger) - out = super().merge(other, join) - out._causes = mergeDios(out._causes, other._causes, join=join) - out._comments = mergeDios(out._comments, other._comments, join=join) - return out + flags = mergeDios(self._flags, other._flags, subset=subset, join=join) + causes = mergeDios(self._causes, other._causes, subset=subset, join=join) + comments = mergeDios(self._comments, other._comments, subset=subset, join=join) + if inplace: + self._flags = flags + self._causes = causes + self._comments = comments + return self + else: + return self._construct_new(flags, causes, comments) - def setFlags(self, field, loc=None, flag=None, force=False, comment="", cause="", **kwargs): + def setFlags(self, field, loc=None, flag=None, force=False, comment="", cause="", inplace=False, **kwargs): assert "iloc" not in kwargs, "deprecated keyword, iloc" assertScalar("field", field, optional=False) @@ -113,8 +119,20 @@ class DmpFlagger(CategoricalFlagger): this = self.getFlags(field=field, loc=loc) row_indexer = this < flag - out = deepcopy(self) + if inplace: + out = self + else: + out = deepcopy(self) + out._flags.aloc[row_indexer, field] = flag out._causes.aloc[row_indexer, field] = cause out._comments.aloc[row_indexer, field] = comment return out + + def _construct_new(self, flags, causes, comments) -> DmpFlaggerT: + new = DmpFlagger() + new.project_version = self.project_version + new._flags = flags + new._causes = causes + new._comments = comments + return new diff --git a/saqc/funcs/breaks_detection.py b/saqc/funcs/breaks_detection.py index ec507f5dd4a24687c8693ef1bff1a30a8fcef312..3dd756860d25368cbe0c7456f14bd9195153e341 100644 --- a/saqc/funcs/breaks_detection.py +++ b/saqc/funcs/breaks_detection.py @@ -10,7 +10,7 @@ from saqc.core.register import register from saqc.lib.tools import retrieveTrustworthyOriginal -@register +@register(masking='field') def breaks_flagSpektrumBased( data, field, @@ -27,57 +27,77 @@ def breaks_flagSpektrumBased( **kwargs ): - """ This Function is a generalization of the Spectrum based break flagging mechanism as presented in: - - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. + """ + The Function is a generalization of the Spectrum based break flagging mechanism as presented in: The function flags breaks (jumps/drops) in input measurement series by evaluating its derivatives. A measurement y_t is flagged a, break, if: - (1) y_t is changing relatively to its preceeding value by at least (100*rel_change_rate_min) percent - (2) y_(t-1) is difffering from its preceeding value, by a margin of at least "thresh_abs" - (3) Absolute first derivative |(y_t)'| has to be at least "first_der_factor" times as big as the arithmetic middle - over all the first derivative values within a 2 times "first_der_window_size" hours window, centered at t. 
+ (1) y_t is changing relatively to its preceeding value by at least (100*`rel_change_rate_min`) percent + (2) y_(t-1) is difffering from its preceeding value, by a margin of at least `thresh_abs` + (3) Absolute first derivative |(y_t)'| has to be at least `first_der_factor` times as big as the arithmetic middle + over all the first derivative values within a 2 times `first_der_window_size` hours window, centered at t. (4) The ratio of the second derivatives at t and t+1 has to be "aproximately" 1. - ([1-scnd__der_ration_margin_1, 1+scnd_ratio_margin_1]) - (5) The ratio of the second derivatives at t+1 and t+2 has to be larger than scnd_der_ratio_margin_2 + ([1-`scnd_der_ration_margin_1`, 1+`scnd_ratio_margin_1`]) + (5) The ratio of the second derivatives at t+1 and t+2 has to be larger than `scnd_der_ratio_margin_2` NOTE 1: As no reliable statement about the plausibility of the meassurements before and after the jump is possible, only the jump itself is flagged. For flagging constant values following upon a jump, use a flagConstants test. NOTE 2: All derivatives in the reference publication are obtained by applying a Savitzky-Golay filter to the data - before differentiating. However, i was not able to reproduce satisfaction of all the conditions for synthetically - constructed breaks. - Especially condition [4] and [5]! This is because smoothing distributes the harshness of the break over the - smoothing window. Since just taking the differences as derivatives did work well for my empirical data set, - the parameter "smooth" defaults to "raw". That means, that derivatives will be obtained by just using the - differences series. - You are free of course, to change this parameter to "savgol" and play around with the associated filter options. - (see parameter description below) - - - - - :param data: The pandas dataframe holding the data-to-be flagged. - Data must be indexed by a datetime series and be harmonized onto a - time raster with seconds precision (skips allowed). - :param flags: A dataframe holding the flags/flag-entries associated with "data". - :param field: Fieldname of the Soil moisture measurements field in data. - :param flagger: A flagger - object. (saqc.flagger.X) - :param smooth: Bool. Method for obtaining dataseries' derivatives. - False: Just take series step differences (default) - True: Smooth data with a Savitzky Golay Filter before differentiating. - :param smooth_window: Offset string. Size of the filter window, used to calculate the derivatives. - (relevant only, if: smooth is True) - :param smooth_poly_deg: Integer. Polynomial order, used for smoothing with savitzk golay filter. - (relevant only, if: smooth_func='savgol') - :param thresh_rel Float in [0,1]. See (1) of function descritpion above to learn more - :param thresh_abs Float > 0. See (2) of function descritpion above to learn more. - :param first_der_factor Float > 0. See (3) of function descritpion above to learn more. - :param first_der_window_range Offset_String. See (3) of function description to learn more. - :param scnd_der_ratio_margin_1 Float in [0,1]. See (4) of function descritpion above to learn more. - :param scnd_der_ratio_margin_2 Float in [0,1]. See (5) of function descritpion above to learn more. + before differentiating. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. 
+ flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + thresh_rel : float, default 0.1 + Float in [0,1]. See (1) of function description above to learn more + thresh_abs : float, default 0.01 + Float > 0. See (2) of function descritpion above to learn more. + first_der_factor : float, default 10 + Float > 0. See (3) of function descritpion above to learn more. + first_der_window_range : str, default '12h' + Offset string. See (3) of function description to learn more. + scnd_der_ratio_margin_1 : float, default 0.05 + Float in [0,1]. See (4) of function descritpion above to learn more. + scnd_der_ratio_margin_2 : float, default 10 + Float in [0,1]. See (5) of function descritpion above to learn more. + smooth : bool, default True + Method for obtaining dataseries' derivatives. + * False: Just take series step differences (default) + * True: Smooth data with a Savitzky Golay Filter before differentiating. + smooth_window : {None, str}, default 2 + Effective only if `smooth` = True + Offset string. Size of the filter window, used to calculate the derivatives. + smooth_poly_deg : int, default 2 + Effective only, if `smooth` = True + Polynomial order, used for smoothing with savitzk golay filter. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional informations related to `data`. + Flags values may have changed, relatively to the flagger input. + + References + ---------- + The Function is a generalization of the Spectrum based break flagging mechanism as presented in: + + [1] Dorigo,W. et al.: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadoze Zone J. + doi:10.2136/vzj2012.0097. + + Find a brief mathematical description of the function here: + + [2] https://git.ufz.de/rdm-software/saqc/-/blob/testfuncDocs/docs/funcs + /FormalDescriptions.md#breaks_flagspektrumbased """ # retrieve data series input at its original sampling rate diff --git a/saqc/funcs/constants_detection.py b/saqc/funcs/constants_detection.py index d5c9da71f0fce9b12792157a1bfa683dedb7465b..1ba51474f3e4f95a9fcaf2f3a992215f91512e7c 100644 --- a/saqc/funcs/constants_detection.py +++ b/saqc/funcs/constants_detection.py @@ -9,17 +9,41 @@ from saqc.lib.ts_operators import varQC from saqc.lib.tools import retrieveTrustworthyOriginal -@register +@register(masking='field') def constants_flagBasic(data, field, flagger, thresh, window, **kwargs): """ + This functions flags plateaus/series of constant values of length `window` if + their maximum total change is smaller than thresh. + + Function flags plateaus/series of constant values. Any interval of values y(t),..y(t+n) is flagged, if: + + (1) n > `window` + (2) |(y(t + i) - (t + j)| < `thresh`, for all i,j in [0, 1, ..., n] + Flag values are (semi-)constant. - :param data: dataframe - :param field: column in data - :param flagger: saqc flagger obj - :param thresh: the difference between two values must be below that - :param window: sliding window + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + thresh : float + Upper bound for the maximum total change of an interval to be flagged constant. 
+ window : str + Lower bound for the size of an interval to be flagged constant. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional informations related to `data`. + Flags values may have changed, relatively to the flagger input. """ + d = data[field] # find all constant values in a row with a forward search @@ -39,7 +63,7 @@ def constants_flagBasic(data, field, flagger, thresh, window, **kwargs): return data, flagger -@register +@register(masking='field') def constants_flagVarianceBased( data, field, flagger, window="12h", thresh=0.0005, max_missing=None, max_consec_missing=None, **kwargs ): @@ -47,24 +71,37 @@ def constants_flagVarianceBased( """ Function flags plateaus/series of constant values. Any interval of values y(t),..y(t+n) is flagged, if: - (1) n > "plateau_interval_min" - (2) variance(y(t),...,y(t+n) < thresh - - :param data: The pandas dataframe holding the data-to-be flagged. - Data must be indexed by a datetime series and be harmonized onto a - time raster with seconds precision (skips allowed). - :param field: Fieldname of the Soil moisture measurements field in data. - :param flagger: A flagger - object. (saqc.flagger.X) - :param window: Offset String. Only intervals of minimum size "window" have the - chance to get flagged as constant intervals - :param thresh: Float. The upper barrier, the variance of an interval mus not exceed, if the - interval wants to be flagged a plateau. - :param max_missing: maximum number of nan values tolerated in an interval, for retrieving a valid - variance from it. (Intervals with a number of nans exceeding "max_missing" - have no chance to get flagged a plateau!) - :param max_consec_missing: Maximum number of consecutive nan values allowed in an interval to retrieve a - valid variance from it. (Intervals with a number of nans exceeding - "max_missing" have no chance to get flagged a plateau!) + (1) n > `window` + (2) variance(y(t),...,y(t+n) < `thresh` + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + window : str + Only intervals of minimum size "window" have the chance to get flagged as constant intervals + thresh : float + The upper bound, the variance of an interval must not exceed, if the interval wants to be flagged a plateau. + max_missing : {None, int}, default None + Maximum number of nan values tolerated in an interval, for retrieving a valid + variance from it. (Intervals with a number of nans exceeding "max_missing" + have no chance to get flagged a plateau!) + max_consec_missing : {None, int}, default None + Maximum number of consecutive nan values allowed in an interval to retrieve a + valid variance from it. (Intervals with a number of nans exceeding + "max_consec_missing" have no chance to get flagged a plateau!) + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional informations related to `data`. + Flags values may have changed, relatively to the flagger input. 
""" dataseries, data_rate = retrieveTrustworthyOriginal(data, field, flagger) diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py index 1864ff377a2f8238f741d4c25e7cf925b7bc93db..58215f48f2b65e3cea03c4123035fc6377dff517 100644 --- a/saqc/funcs/functions.py +++ b/saqc/funcs/functions.py @@ -2,19 +2,19 @@ # -*- coding: utf-8 -*- from functools import partial +from inspect import signature import numpy as np import pandas as pd import dtw import pywt from mlxtend.evaluate import permutation_test -import datetime from saqc.lib.tools import groupConsecutives, seasonalMask from saqc.funcs.proc_functions import proc_fork, proc_drop, proc_projectFlags from saqc.funcs.modelling import modelling_mask -from saqc.core.register import register, Func +from saqc.core.register import register from saqc.core.visitor import ENVIRONMENT from dios import DictOfSeries from typing import Any @@ -33,12 +33,13 @@ def _execGeneric(flagger, data, func, field, nodata): # - field is only needed to translate 'this' parameters # -> maybe we could do the translation on the tree instead - func = Func(func) - for k in func.parameters: + sig = signature(func) + args = [] + for k, v in sig.parameters.items(): k = field if k == "this" else k if k not in data: raise NameError(f"variable '{k}' not found") - func = Func(func, data[k]) + args.append(data[k]) globs = { "isflagged": partial(_dslIsFlagged, flagger), @@ -50,19 +51,61 @@ def _execGeneric(flagger, data, func, field, nodata): "UNFLAGGED": flagger.UNFLAGGED, **ENVIRONMENT, } - func = func.addGlobals(globs) - return func() + func.__globals__.update(globs) + return func(*args) -@register +@register(masking='all') def procGeneric(data, field, flagger, func, nodata=np.nan, **kwargs): """ - Execute generic functions. - The **kwargs are needed to satisfy the test-function interface, - although they are of no use here. Usually they are abused to - transport the name of the test function (here: `procGeneric`) - into the flagger, but as we don't set flags here, we simply - ignore them + generate/process data with generically defined functions. + + The functions can depend on on any of the fields present in data. + + Formally, what the function does, is the following: + + 1. Let F be a Callable, depending on fields f_1, f_2,...f_K, (F = F(f_1, f_2,...f_K)) + Than, for every timestamp t_i that occurs in at least one of the timeseries data[f_j] (outer join), + The value v_i is computed via: + v_i = data([f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]), if all data[f_j][t_i] do exist + v_i = `nodata`, if at least one of the data[f_j][t_i] is missing. + 2. The result is stored to data[field] (gets generated if not present) + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, where you want the result from the generic expressions processing to be written to. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + func : Callable + The data processing function with parameter names that will be + interpreted as data column entries. + See the examples section to learn more. + nodata : any, default np.nan + The value that indicates missing/invalid data + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + The shape of the data may have changed relatively to the data input. 
+ flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + The flags shape may have changed relatively to the input flagger. + + Examples + -------- + Some examples on what to pass to the func parameter: + To compute the sum of the variables "temperature" and "uncertainty", you would pass the function: + + >>> lambda temperature, uncertainty: temperature + uncertainty + + You also can pass numpy and pandas functions: + + >>> lambda temperature, uncertainty: np.round(temperature) * np.sqrt(uncertainty) + """ data[field] = _execGeneric(flagger, data, func, field, nodata).squeeze() # NOTE: @@ -81,8 +124,82 @@ def procGeneric(data, field, flagger, func, nodata=np.nan, **kwargs): return data, flagger -@register +@register(masking='all') def flagGeneric(data, field, flagger, func, nodata=np.nan, **kwargs): + """ + a function to flag a data column by evaluation of a generic expression. + + The expression can depend on any of the fields present in data. + + Formally, what the function does, is the following: + + Let X be an expression, depending on fields f_1, f_2,...f_K, (X = X(f_1, f_2,...f_K)) + Than for every timestamp t_i in data[field]: + data[field][t_i] is flagged if X(data[f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]) is True. + + Note, that all value series included in the expression to evaluate must be labeled identically to field. + + Note, that the expression is passed in the form of a Callable and that this callables variable names are + interpreted as actual names in the data header. See the examples section to get an idea. + + Note, that all the numpy functions are available within the generic expressions. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, where you want the result from the generic expressions evaluation to be projected + to. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + func : Callable + The expression that is to be evaluated is passed in form of a callable, with parameter names that will be + interpreted as data column entries. The Callable must return an boolen array like. + See the examples section to learn more. + nodata : any, default np.nan + The value that indicates missing/invalid data + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. + + Examples + -------- + Some examples on what to pass to the func parameter: + To flag the variable `field`, if the sum of the variables + "temperature" and "uncertainty" is below zero, you would pass the function: + + >>> lambda temperature, uncertainty: temperature + uncertainty < 0 + + There is the reserved name 'This', that always refers to `field`. So, to flag field if field is negative, you can + also pass: + + >>> lambda this: this < 0 + + If you want to make dependent the flagging from flags already present in the data, you can use the built-in + ``isflagged`` method. For example, to flag the 'temperature', if 'level' is flagged, you would use: + + >>> lambda level: isflagged(level) + + You can furthermore specify a flagging level, you want to compare the flags to. 
For example, for flagging + 'temperature', if 'level' is flagged at a level named 'doubtfull' or worse, use: + + >>> lambda level: isflagged(level, flag='doubtfull', comparator='<=') + + If you are unsure about the used flaggers flagging level names, you can use the reserved key words BAD, UNFLAGGED + and GOOD, to refer to the worst (BAD), best(GOOD) or unflagged (UNFLAGGED) flagging levels. For example. + + >>> lambda level: isflagged(level, flag=UNFLAGGED, comparator='==') + + Your expression also is allowed to include pandas and numpy functions + + >>> lambda level: np.sqrt(level) > 7 + """ # NOTE: # The naming of the func parameter is pretty confusing # as it actually holds the result of a generic expression @@ -93,13 +210,40 @@ def flagGeneric(data, field, flagger, func, nodata=np.nan, **kwargs): raise TypeError(f"generic expression does not return a boolean array") if flagger.getFlags(field).empty: - flagger = flagger.merge(flagger.initFlags(data=pd.Series(name=field, index=mask.index))) + flagger = flagger.merge( + flagger.initFlags( + data=pd.Series(name=field, index=mask.index, dtype=np.float64))) flagger = flagger.setFlags(field, mask, **kwargs) return data, flagger -@register -def flagRange(data, field, flagger, min, max, **kwargs): +@register(masking='field') +def flagRange(data, field, flagger, min=-np.inf, max=np.inf, **kwargs): + """ + Function flags values not covered by the closed interval [`min`, `max`]. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + min : float + Lower bound for valid data. + max : float + Upper bound for valid data. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. + """ + # using .values is very much faster datacol = data[field].values mask = (datacol < min) | (datacol > max) @@ -107,34 +251,73 @@ def flagRange(data, field, flagger, min, max, **kwargs): return data, flagger -@register -def flagPattern(data, field, flagger, reference_field, method = 'dtw', partition_freq = "days", partition_offset = 0, max_distance = 0.03, normalized_distance = True, open_end = True, widths = (1, 2, 4, 8), waveform = 'mexh', **kwargs): - """ Implementation of two pattern recognition algorithms: +@register(masking='all') +def flagPattern(data, field, flagger, reference_field, method='dtw', partition_freq="days", partition_offset='0', + max_distance=0.03, normalized_distance=True, open_end=True, widths=(1, 2, 4, 8), + waveform='mexh', **kwargs): + """ + Implementation of two pattern recognition algorithms: - Dynamic Time Warping (dtw) [1] - Pattern recognition via wavelets [2] The steps are: - 1. Get the frequency of partitions, in which the time series has to be divided (for example: a pattern occurs daily, or every hour) + 1. Get the frequency of partitions, in which the time series has to be divided (for example: a pattern occurs daily, + or every hour) 2. Compare each partition with the given pattern 3. Check if the compared partition contains the pattern or not 4. Flag partition if it contains the pattern - :param data: pandas dataframe. 
holding the data
-    :param field: fieldname in `data`, which holds the series to be checked for patterns
-    :param flagger: flagger.
-    :param reference_field: fieldname in `data`, which holds the pattern
-    :param method: str. Pattern Recognition method to be used: 'dtw' or 'wavelets'. Default: 'dtw'
-    :param partition_freq: str. Frequency, in which the pattern occurs. If only "days" or "months" is given, then precise length of partition is calculated from pattern length. Default: "days"
-    :param partition_offset: str. If partition frequency is given, and pattern starts after a timely offset (e.g., partition frequency is "1 h", pattern starts at 10:15, then offset is "15 min"). Default: 0
-    :param max_distance: float. For dtw. Maximum dtw-distance between partition and pattern, so that partition is recognized as pattern. Default: 0.03
-    :param normalized_distance: boolean. For dtw. Normalizing dtw-distance (see [1]). Default: True
-    :param open_end: boolean. For dtw. End of pattern is matched with a value in the partition (not necessarily end of partition). Recommendation of [1]. Default: True
-    :param widths: tuple of int. For wavelets. Widths for wavelet decomposition. [2] recommends a dyadic scale. Default: (1,2,4,8)
-    :param waveform: str. For wavelets. Wavelet to be used for decomposition. Default: 'mexh'
-
-    Literature:
+    Parameters
+    ----------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    field : str
+        The fieldname of the column, holding the data-to-be-flagged.
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional Informations related to `data`.
+    reference_field : str
+        Fieldname in `data` that holds the pattern.
+    method : {'dtw', 'wavelets'}, default 'dtw'
+        Pattern recognition method to be used.
+    partition_freq : str, default 'days'
+        Frequency at which the pattern occurs.
+        Has to be an offset string or one out of {"days", "months"}. If 'days' or 'months' is passed,
+        the precise length of the partitions is calculated from the pattern length.
+    partition_offset : str, default '0'
+        If the partition frequency is given by an offset string and the pattern starts after a time offset,
+        this offset is given by `partition_offset`.
+        (e.g., partition frequency is "1h", pattern starts at 10:15, then the offset is "15min").
+    max_distance : float, default 0.03
+        Only effective if method = 'dtw'.
+        Maximum dtw-distance between partition and pattern, so that the partition is recognized as pattern
+        (and thus gets flagged).
+    normalized_distance : bool, default True
+        Only effective if method = 'dtw'. Normalize the dtw-distance (this does not refer to statistical
+        normalization, but to a normalization that makes dtw-distances of probes of different length
+        comparable, see [1] for more details).
+    open_end : bool, default True
+        Only effective if method = 'dtw'.
+        Whether or not the ending of the probe and of the pattern have to be projected onto each other in the
+        search for the optimal dtw-mapping. Recommendation of [1].
+    widths : tuple[int], default (1, 2, 4, 8)
+        Only effective if method = 'wavelets'.
+        Widths for the wavelet decomposition. [2] recommends a dyadic scale.
+    waveform : str, default 'mexh'
+        Only effective if method = 'wavelets'.
+        Wavelet to be used for the decomposition.
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional Informations related to `data`.
+        Flags values may have changed relatively to the flagger input.
+ + References + ---------- [1] https://cran.r-project.org/web/packages/dtw/dtw.pdf - [2] Maharaj, E.A. (2002): Pattern Recognition of Time Series using Wavelets. In: Härdle W., Rönz B. (eds) Compstat. Physica, Heidelberg, 978-3-7908-1517-7. + [2] Maharaj, E.A. (2002): Pattern Recognition of Time Series using Wavelets. In: Härdle W., Rönz B. (eds) Compstat. + Physica, Heidelberg, 978-3-7908-1517-7. """ test = data[field].copy() @@ -213,23 +396,80 @@ def flagPattern(data, field, flagger, reference_field, method = 'dtw', partition return data, flagger - -@register +@register(masking='field') def flagMissing(data, field, flagger, nodata=np.nan, **kwargs): + """ + The function flags all values indicating missing data. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + nodata : any, default np.nan + A value that defines missing data. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. + """ + datacol = data[field] if np.isnan(nodata): mask = datacol.isna() else: - mask = datacol[datacol == nodata] + mask = datacol == nodata flagger = flagger.setFlags(field, loc=mask, **kwargs) return data, flagger -@register +@register(masking='field') def flagSesonalRange( data, field, flagger, min, max, startmonth=1, endmonth=12, startday=1, endday=31, **kwargs, ): + """ + Function applies a range check onto data chunks (seasons). + + The data chunks to be tested are defined by annual seasons that range from a starting date, + to an ending date, wheras the dates are defined by month and day number. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + min : float + Lower bound for valid data. + max : float + Upper bound for valid data. + startmonth : int + Starting month of the season to flag. + endmonth : int + Ending month of the season to flag. + startday : int + Starting day of the season to flag. + endday : int + Ending day of the season to flag + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. 
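+
+    Examples
+    --------
+    An illustrative call (the variable name "temperature" and the thresholds are only examples): flag
+    "temperature" wherever it leaves the range [0, 45] between March 1st and October 31st of any year:
+
+    >>> data, flagger = flagSesonalRange(data, "temperature", flagger, min=0, max=45,
+    ...                                  startmonth=3, endmonth=10)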
+ """ data, flagger = proc_fork(data, field, flagger, suffix="_masked") data, flagger = modelling_mask(data, field + "_masked", flagger, mode='seasonal', @@ -241,22 +481,58 @@ def flagSesonalRange( data, flagger = proc_drop(data, field + "_masked", flagger) return data, flagger -@register + +@register(masking='field') def clearFlags(data, field, flagger, **kwargs): flagger = flagger.clearFlags(field, **kwargs) return data, flagger -@register +@register(masking='field') def forceFlags(data, field, flagger, flag, **kwargs): - flagger = flagger.clearFlags(field).setFlags(field, flag=flag, **kwargs) + flagger = flagger.clearFlags(field).setFlags(field, flag=flag, inplace=True, **kwargs) return data, flagger -@register +@register(masking='field') def flagIsolated( data, field, flagger, gap_window, group_window, **kwargs, ): + """ + The function flags arbitrary large groups of values, if they are surrounded by sufficiently + large data gaps. A gap is defined as group of missing and/or flagged values. + + A series of values x_k,x_(k+1),...,x_(k+n), with associated timestamps t_k,t_(k+1),...,t_(k+n), + is considered to be isolated, if: + + 1. t_(k+1) - t_n < `group_window` + 2. None of the x_j with 0 < t_k - t_j < `gap_window`, is valid or unflagged (preceeding gap). + 3. None of the x_j with 0 < t_j - t_(k+n) < `gap_window`, is valid or unflagged (succeding gap). + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional informations related to `data`. + gap_window : + The minimum size of the gap before and after a group of valid values, making this group considered an + isolated group. See condition (2) and (3) + group_window : + The maximum temporal extension allowed for a group that is isolated by gaps of size 'gap_window', + to be actually flagged as isolated group. See condition (1). + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. + """ + gap_window = pd.tseries.frequencies.to_offset(gap_window) group_window = pd.tseries.frequencies.to_offset(group_window) @@ -280,65 +556,139 @@ def flagIsolated( return data, flagger -@register +@register(masking='field') def flagDummy(data, field, flagger, **kwargs): - """ Do nothing """ + """ + Function does nothing but returning data and flagger. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional informations related to `data`. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + """ return data, flagger -@register +@register(masking='field') def flagForceFail(data, field, flagger, **kwargs): - """ Raise a RuntimeError. """ + """ + Function raises a runtime error. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. 
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional informations related to `data`.
+
+    """
     raise RuntimeError("Works as expected :D")
 
 
-@register
+@register(masking='field')
 def flagUnflagged(data, field, flagger, **kwargs):
+    """
+    Function sets the flagger.GOOD flag to all values flagged better than flagger.GOOD.
+    If there is an entry 'flag' in the kwargs dictionary passed, the
+    function sets the kwargs['flag'] flag to all values flagged better than kwargs['flag'].
+
+    Parameters
+    ----------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    field : str
+        The fieldname of the column, holding the data-to-be-flagged.
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional informations related to `data`.
+    kwargs : Dict
+        If kwargs contains a 'flag' entry, kwargs['flag'] is set; if no entry 'flag' is present,
+        'flagger.GOOD' is set.
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional Informations related to `data`.
+    """
+
     flag = kwargs.pop('flag', flagger.GOOD)
     flagger = flagger.setFlags(field, flag=flag, **kwargs)
     return data, flagger
 
 
-@register
+@register(masking='field')
 def flagGood(data, field, flagger, **kwargs):
+    """
+    Function sets the flagger.GOOD flag to all values flagged better than flagger.GOOD.
+
+    Parameters
+    ----------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    field : str
+        The fieldname of the column, holding the data-to-be-flagged.
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional informations related to `data`.
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional Informations related to `data`.
+
+    """
     kwargs.pop('flag', None)
-    return flagUnflagged(data, field, flagger)
+    return flagUnflagged(data, field, flagger, **kwargs)
 
 
-@register
+@register(masking='field')
 def flagManual(data, field, flagger, mdata, mflag: Any = 1, method="plain", **kwargs):
-    """ Flag data by given manual data.
+    """
+    Flag data by given, "manually generated" data.
 
     The data is flagged at locations where `mdata` is equal to a provided flag (`mflag`).
-    The format of mdata can be a indexed object, like pd.Series, pd.Dataframe or dios.DictOfSeries,
+    The format of mdata can be an indexed object, like pd.Series, pd.Dataframe or dios.DictOfSeries,
     but also can be a plain list- or array-like.
-    How indexed mdata is aligned to data is specified via `method` argument.
+    How indexed mdata is aligned to data is specified via the `method` parameter.
 
     Parameters
     ----------
-    data : DictOfSeries
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
     field : str
-        The field chooses the column in flags and data in question.
-        It also determine the column in mdata if its of type pd.Dataframe or dios.DictOfSeries.
-
-    flagger : flagger
-range_dict.keys()
+        The fieldname of the column, holding the data-to-be-flagged.
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional informations related to `data`.
     mdata : {pd.Series, pd.Dataframe, DictOfSeries, str}
-        The manual data
-
+        The "manually generated" data
     mflag : scalar
-        The flag that indicates data points in `mdata`, that should be flagged.
- + The flag that indicates data points in `mdata`, of wich the projection in data should be flagged. method : {'plain', 'ontime', 'left-open', 'right-open'}, default plain - Define how mdata is applied on data. Except 'plain' mdata must have a index. - * 'plain': mdata must have same length than data and is applied one-to-one on data. - * 'ontime': work only with indexed mdata, it is applied, where timestamps are match. - * 'right-open': mdata defines periods, which are defined by two consecutive timestamps, the - value of the first aka. left is applied on the whole period. - * 'left-open': like 'right-open' but the value is defined in the latter aka. right timestamp. - - kwargs : Any - passed to flagger + Defines how mdata is projected on data. Except for the 'plain' method, the methods assume mdata to have an + index. + * 'plain': mdata must have the same length as data and is projected one-to-one on data. + * 'ontime': works only with indexed mdata. mdata entries are matched with data entries that have the same index. + * 'right-open': mdata defines intervals, values are to be projected on. + The intervals are defined by any two consecutive timestamps t_1 and 1_2 in mdata. + the value at t_1 gets projected onto all data timestamps t with t_1 <= t < t_2. + * 'left-open': like 'right-open', but the projected interval now covers all t with t_1 < t <= t_2. Returns ------- @@ -435,8 +785,51 @@ range_dict.keys() return data, flagger -@register -def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat=np.median, **kwargs): +@register(masking='all') +def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat='modZscore', **kwargs): + """ + Function checks for outliers relatively to the "horizontal" input data axis. + + For fields=[f_1,f_2,...,f_N] and timestamps [t_1,t_2,...,t_K], the following steps are taken for outlier detection: + + 1. All timestamps t_i, where there is one f_k, with data[f_K] having no entry at t_i, are excluded from the + following process (inner join of the f_i fields.) + 2. for every 0 <= i <= K, the value m_j = median({data[f_1][t_i], data[f_2][t_i], ..., data[f_N][t_i]}) is + calculated + 2. for every 0 <= i <= K, the set {data[f_1][t_i] - m_j, data[f_2][t_i] - m_j, ..., data[f_N][t_i] - m_j} is tested + for outliers with the specified method (`cross_stat` parameter) + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + A dummy parameter. + flagger : saqc.flagger + A flagger object, holding flags and additional informations related to `data`. + fields : str + List of fieldnames in data, determining wich variables are to be included into the flagging process. + thresh : float + Threshold which the outlier score of an value must exceed, for being flagged an outlier. + cross_stat : {'modZscore', 'Zscore'}, default 'modZscore' + Method used for calculating the outlier scores. + * 'modZscore': Median based "sigma"-ish approach. See Referenecs [1]. + * 'Zscore': Score values by how many times the standard deviation they differ from the median. + See References [1] + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the input flagger. 
+ + References + ---------- + [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm + """ + df = data[fields].loc[data[fields].index_of('shared')].to_df() if isinstance(cross_stat, str): diff --git a/saqc/funcs/harm_functions.py b/saqc/funcs/harm_functions.py index 974ed3fc5fc54c959e0e008137fb5dd097b9ffb2..b6205ddbd4470aad97eb73f0b97d0dd0f87b0ed0 100644 --- a/saqc/funcs/harm_functions.py +++ b/saqc/funcs/harm_functions.py @@ -19,8 +19,60 @@ from saqc.funcs.proc_functions import ( logger = logging.getLogger("SaQC") -@register +@register(masking='none') def harm_shift2Grid(data, field, flagger, freq, method="nshift", to_drop=None, **kwargs): + """ + A method to "regularize" data by shifting data points forward/backward to a regular timestamp. + + A series of data is considered "regular", if it is sampled regularly (= having uniform sampling rate). + + Method keywords: + + 'nshift' - every grid point gets assigned the nearest value in its range ( range = +/-(freq/2) ) + 'bshift' - every grid point gets assigned its first succeeding value - if there is one available in the + succeeding sampling interval. (equals resampling wih "first") + 'fshift' - every grid point gets assigned its ultimately preceeding value - if there is one available in + the preceeding sampling interval. (equals resampling with "last") + + Note: the flags associated with every datapoint will just get shifted with them. + + Note: if there is no valid data (exisiing and not-na) available in a sampling interval assigned to a regular + timestamp by the selected method, nan gets assigned to this timestamp. The associated flag will be of value + flagger.UNFLAGGED. + + Note: all data nans get excluded defaultly from shifting. If to_drop is None - all BAD flagged values get + excluded as well. + + Note: the method will likely and significantly alter values and shape of data[field]. The original data is kept + in the data dios and assigned to the fieldname field + "_original". + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-regularized. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`.freq + freq : str + The frequency of the grid you want to shift your data to. + method : {'nshift', 'bshift', 'fshift'}, default 'nshift' + Specifies if datapoints get propagated forwards, backwards or to the nearest grid timestamp. + See description above for details + to_drop : {List[str], str}, default None + Flagtypes you want to drop before shifting - effectively excluding values that are flagged + with a flag in to_drop from the shifting process. Default - results in flagger.BAD + values being dropped initially. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values and shape may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values and shape may have changed relatively to the flagger input. 
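+
+    Examples
+    --------
+    An illustrative call (the variable name "temperature" and the frequency are only examples): shift
+    "temperature" onto a regular 15 minutes grid, assigning every grid point its nearest original value:
+
+    >>> data, flagger = harm_shift2Grid(data, "temperature", flagger, freq="15min", method="nshift")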
+ """ data, flagger = proc_fork(data, field, flagger) data, flagger = proc_shift( @@ -29,10 +81,69 @@ def harm_shift2Grid(data, field, flagger, freq, method="nshift", to_drop=None, * return data, flagger -@register +@register(masking='none') def harm_aggregate2Grid( data, field, flagger, freq, value_func, flag_func=np.nanmax, method="nagg", to_drop=None, **kwargs ): + """ + A method to "regularize" data by aggregating (resampling) data at a regular timestamp. + + A series of data is considered "regular", if it is sampled regularly (= having uniform sampling rate). + + The data will therefor get aggregated with a function, specified by the `value_func` parameter and + the result gets projected onto the new timestamps with a method, specified by "method". + + The following method (keywords) are available: + + 'nagg' (aggreagtion to nearest) - all values in the range (+/- freq/2) of a grid point get aggregated with agg_func + and assigned to it. + Flags get aggregated by `flag_func` and assigned the same way. + 'bagg' (backwards aggregation) - all values in a sampling interval get aggregated with agg_func and the result gets + assigned to the last regular timestamp. + Flags get aggregated by flag_func and assigned the same way. + 'fagg' (forward aggregation) - all values in a sampling interval get aggregated with agg_func and the result gets + assigned to the next regular timestamp. + Flags get aggregated by flag_func and assigned the same way. + + + Note, that, if there is no valid data (exisitng and not-na) available in a sampling interval assigned to a regular timestamp by the selected method, + nan gets assigned to this timestamp. The associated flag will be of value flagger.UNFLAGGED. + + Note: the method will likely and significantly alter values and shape of data[field]. The original data is kept + in the data dios and assigned to the fieldname field + "_original". + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-regularized. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`.freq + freq : str + The sampling frequency the data is to be aggregated (resampled) at. + value_func : Callable + The function you want to use for aggregation. + flag_func : Callable + The function you want to aggregate the flags with. It should be capable of operating on the flags dtype + (usually ordered categorical). + method : {'fagg', 'bagg', 'nagg'}, default 'nagg' + Specifies which intervals to be aggregated for a certain timestamp. (preceeding, succeeding or + "surrounding" interval). See description above for more details. + to_drop : {List[str], str}, default None + Flagtypes you want to drop before aggregation - effectively excluding values that are flagged + with a flag in to_drop from the aggregation process. Default results in flagger.BAD + values being dropped initially. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values and shape may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values and shape may have changed relatively to the flagger input. 
+ """ data, flagger = proc_fork(data, field, flagger) data, flagger = proc_resample( @@ -51,8 +162,48 @@ def harm_aggregate2Grid( return data, flagger -@register +@register(masking='none') def harm_linear2Grid(data, field, flagger, freq, to_drop=None, **kwargs): + """ + A method to "regularize" data by interpolating linearly the data at regular timestamp. + + A series of data is considered "regular", if it is sampled regularly (= having uniform sampling rate). + + Interpolated values will get assigned the worst flag within freq-range. + + Note: the method will likely and significantly alter values and shape of data[field]. The original data is kept + in the data dios and assigned to the fieldname field + "_original". + + Note, that the data only gets interpolated at those (regular) timestamps, that have a valid (existing and + not-na) datapoint preceeding them and one succeeding them within freq range. + Regular timestamp that do not suffice this condition get nan assigned AND The associated flag will be of value + flagger.UNFLAGGED. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-regularized. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`.freq + freq : str + An offset string. The frequency of the grid you want to interpolate your data at. + to_drop : {List[str], str}, default None + Flagtypes you want to drop before interpolation - effectively excluding values that are flagged + with a flag in to_drop from the interpolation process. Default results in flagger.BAD + values being dropped initially. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values and shape may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values and shape may have changed relatively to the flagger input. + """ + data, flagger = proc_fork(data, field, flagger) data, flagger = proc_interpolateGrid( data, field, flagger, freq, "time", to_drop=to_drop, empty_intervals_flag=flagger.UNFLAGGED, **kwargs @@ -60,10 +211,61 @@ def harm_linear2Grid(data, field, flagger, freq, to_drop=None, **kwargs): return data, flagger -@register +@register(masking='none') def harm_interpolate2Grid( data, field, flagger, freq, method, order=1, to_drop=None, **kwargs, ): + """ + A method to "regularize" data by interpolating the data at regular timestamp. + + A series of data is considered "regular", if it is sampled regularly (= having uniform sampling rate). + + Interpolated values will get assigned the worst flag within freq-range. + + There are available all the interpolations from the pandas.Series.interpolate method and they are called by + the very same keywords. + + Note, that, to perform a timestamp aware, linear interpolation, you have to pass 'time' as method, and NOT 'linear'. + + Note: the method will likely and significantly alter values and shape of data[field]. The original data is kept + in the data dios and assigned to the fieldname field + "_original". + + Note, that the data only gets interpolated at those (regular) timestamps, that have a valid (existing and + not-na) datapoint preceeding them and one succeeding them within freq range. + Regular timestamp that do not suffice this condition get nan assigned AND The associated flag will be of value + flagger.UNFLAGGED. 
+ + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-regularized. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`.freq + freq : str + An offset string. The frequency of the grid you want to interpolate your data at. + method : {"linear", "time", "nearest", "zero", "slinear", "quadratic", "cubic", "spline", "barycentric", + "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"}: string + The interpolation method you want to apply. + order : int, default 1 + If your selected interpolation method can be performed at different 'orders' - here you pass the desired + order. + to_drop : {List[str], str}, default None + Flagtypes you want to drop before interpolation - effectively excluding values that are flagged + with a flag in to_drop from the interpolation process. Default results in flagger.BAD + values being dropped initially. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values and shape may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values and shape may have changed relatively to the flagger input. + """ + data, flagger = proc_fork(data, field, flagger) data, flagger = proc_interpolateGrid( data, @@ -79,8 +281,72 @@ def harm_interpolate2Grid( return data, flagger -@register +@register(masking='none') def harm_deharmonize(data, field, flagger, method, to_drop=None, **kwargs): + """ + The Function function "undoes" regularization, by regaining the original data and projecting the + flags calculated for the regularized data onto the original ones. + + Afterwards the regularized data is removed from the data dios and 'field' will be associated + to the original data "again". + + Wherever the flags in the original data are "better" then the regularized flags projected on them, + they get overridden with this regularized flags value. + + Which regularized flags are to be projected on which original flags, is controlled by the "method" parameters. + + Generally, if you regularized with the method 'X', you should pass the method 'inverse_X' to the deharmonization. + If you regularized with an interpolation, the method 'inverse_interpolation' would be the appropriate choice. + Also you should pass the same drop flags keyword. + + The deharm methods in detail: + ("original_flags" are associated with the original data that is to be regained, + "regularized_flags" are associated with the regularized data that is to be "deharmonized", + "freq" refers to the regularized datas sampling frequencie) + + 'inverse_nagg' - all original_flags within the range +/- freq/2 of a regularized_flag, get assigned this + regularized flags value. (if regularized_flags > original_flag) + 'inverse_bagg' - all original_flags succeeding a regularized_flag within the range of "freq", get assigned this + regularized flags value. (if regularized_flag > original_flag) + 'inverse_fagg' - all original_flags preceeding a regularized_flag within the range of "freq", get assigned this + regularized flags value. (if regularized_flag > original_flag) + + 'inverse_interpolation' - all original_flags within the range +/- freq of a regularized_flag, get assigned this + regularized flags value (if regularized_flag > original_flag). 
+ + 'inverse_nshift' - That original_flag within the range +/- freq/2, that is nearest to a regularized_flag, gets the + regularized flags value. (if regularized_flag > original_flag) + 'inverse_bshift' - That original_flag succeeding a source flag within the range freq, that is nearest to a + regularized_flag, gets assigned this regularized flags value. (if regularized_flag > original_flag) + 'inverse_nshift' - That original_flag preceeding a regularized flag within the range freq, that is nearest to a + regularized_flag, gets assigned this regularized flags value. (if source_flag > original_flag) + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-deharmonized. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`.freq + method : {'inverse_fagg', 'inverse_bagg', 'inverse_nagg', 'inverse_fshift', 'inverse_bshift', 'inverse_nshift', + 'inverse_interpolation'} + The method used for projection of regularized flags onto opriginal flags. See description above for more + details. + to_drop : {List[str], str}, default None + Flagtypes you want to drop before interpolation - effectively excluding values that are flagged + with a flag in to_drop from the interpolation process. Default results in flagger.BAD + values being dropped initially. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values and shape may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values and shape may have changed relatively to the flagger input. + """ data, flagger = proc_projectFlags( data, str(field) + ORIGINAL_SUFFIX, flagger, method, source=field, to_drop=to_drop, **kwargs diff --git a/saqc/funcs/modelling.py b/saqc/funcs/modelling.py index 8fede4c050482d5e41c3439469acc81094a935d3..2b3ceee8c6aaaa2925c696bbfeb00dbc42d17649 100644 --- a/saqc/funcs/modelling.py +++ b/saqc/funcs/modelling.py @@ -14,18 +14,22 @@ from saqc.lib.ts_operators import ( from saqc.lib.tools import seasonalMask -@register +@register(masking='field') def modelling_polyFit(data, field, flagger, winsz, polydeg, numba="auto", eval_flags=True, min_periods=0, **kwargs): """ - Function fits a polynomial model to the data and returns the residues. (field gets overridden). + Function fits a polynomial model to the data and returns the residues. + The residue for value x is calculated by fitting a polynomial of degree "polydeg" to a data slice of size "winsz", wich has x at its center. - Note, that if data[field] is not alligned to an equidistant frequency grid, the window size passed, - has to be an offset string. Also numba boost options dont apply for irregularly sampled + Note, that the residues will be stored to the `field` field of the input data, so that the original data, the + polynomial is fitted to, gets overridden. + + Note, that, if data[field] is not alligned to an equidistant frequency grid, the window size passed, + has to be an offset string. Also numba boost options don`t apply for irregularly sampled timeseries. - Note, that calculating the residues tends to be quite cost intensive - because a function fitting is perfomed for every + Note, that calculating the residues tends to be quite costy, because a function fitting is perfomed for every sample. 
To improve performance, consider the following possibillities: In case your data is sampled at an equidistant frequency grid: @@ -48,30 +52,41 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba="auto", eval_f Parameters ---------- - winsz : integer or offset String + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-modelled. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + winsz : {str, int} The size of the window you want to use for fitting. If an integer is passed, the size refers to the number of periods for every fitting window. If an offset string is passed, the size refers to the total temporal extension. The window will be centered around the vaule-to-be-fitted. For regularly sampled timeseries the period number will be casted down to an odd number if even. - polydeg : integer + polydeg : int The degree of the polynomial used for fitting numba : {True, False, "auto"}, default "auto" Wheather or not to apply numbas just-in-time compilation onto the poly fit function. This will noticably increase the speed of calculation, if the sample size is sufficiently high. If "auto" is selected, numba compatible fit functions get applied for data consisiting of > 200000 samples. - eval_flags : boolean, default True + eval_flags : bool, default True Wheather or not to assign new flags to the calculated residuals. If True, a residual gets assigned the worst flag present in the interval, the data for its calculation was obtained from. - min_periods : integer or np.nan, default 0 + min_periods : {int, np.nan}, default 0 The minimum number of periods, that has to be available in every values fitting surrounding for the polynomial fit to be performed. If there are not enough values, np.nan gets assigned. Default (0) results in fitting regardless of the number of values present (results in overfitting for too sparse intervals). To automatically set the minimum number of periods to the number of values in an offset defined window size, pass np.nan. - kwargs Returns ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. """ if data[field].empty: @@ -172,31 +187,50 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba="auto", eval_f return data, flagger -@register +@register(masking='field') def modelling_rollingMean(data, field, flagger, winsz, eval_flags=True, min_periods=0, center=True, **kwargs): """ - Models the timeseries passed with the rolling mean. + Models the data with the rolling mean and returns the residues. + + Note, that the residues will be stored to the `field` field of the input data, so that the data that is modelled + gets overridden. Parameters ---------- - winsz : integer or offset String + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-modelled. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + winsz : {int, str} The size of the window you want to roll with. If an integer is passed, the size refers to the number of periods for every fitting window. 
If an offset string is passed, the size refers to the total temporal extension. For regularly sampled timeseries, the period number will be casted down to an odd number if center = True. - eval_flags : boolean, default True + eval_flags : bool, default True Wheather or not to assign new flags to the calculated residuals. If True, a residual gets assigned the worst flag present in the interval, the data for its calculation was obtained from. Currently not implemented in combination with not-harmonized timeseries. - min_periods : integer, default 0 + min_periods : int, default 0 The minimum number of periods, that has to be available in every values fitting surrounding for the mean fitting to be performed. If there are not enough values, np.nan gets assigned. Default (0) results in fitting regardless of the number of values present. - center : boolean, default True + center : bool, default True Wheather or not to center the window the mean is calculated of around the reference value. If False, the reference value is placed to the right of the window (classic rolling mean with lag.) + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. """ + data = data.copy() to_fit = data[field] flags = flagger.getFlags(field) diff --git a/saqc/funcs/proc_functions.py b/saqc/funcs/proc_functions.py index 9d889603402eafa8ff9428ddc0ba0ec98e30d0b8..ac3b19499ca458d918b334fa5fd8a89e6c58d4b4 100644 --- a/saqc/funcs/proc_functions.py +++ b/saqc/funcs/proc_functions.py @@ -8,9 +8,8 @@ from saqc.lib.ts_operators import interpolateNANs, aggregate2Freq, shift2Freq, e from saqc.lib.tools import toSequence, mergeDios, dropper, mutateIndex import dios import functools -import matplotlib.pyplot as plt from scipy.optimize import curve_fit -import pickle +from sklearn.linear_model import LinearRegression ORIGINAL_SUFFIX = "_original" @@ -25,7 +24,7 @@ METHOD2ARGS = { } -@register +@register(masking='field') def proc_rollingInterpolateMissing( data, field, flagger, winsz, func=np.median, center=True, min_periods=0, interpol_flag="UNFLAGGED", **kwargs ): @@ -36,28 +35,38 @@ def proc_rollingInterpolateMissing( Note, that in the current implementation, center=True can only be used with integer window sizes - furthermore note, that integer window sizes can yield screwed aggregation results for not-harmonized or irregular data. - - Parameters ---------- - winsz : Integer or Offset String + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-interpolated. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + winsz : int, str The size of the window, the aggregation is computed from. Either counted in periods number (Integer passed), - or defined by a total temporal extension (offset String passed) - func : Function + or defined by a total temporal extension (offset String passed). + func : Callable The function used for aggregation. - center : boolean, default True - Wheather or not the window the aggregation is computed of is centered around the value to be interpolated. 
- min_periods : Integer - Minimum number of valid (not np.nan) values that have to be available in a window for its Aggregation to be + center : bool, default True + Wheather or not the window, the aggregation is computed of, is centered around the value to be interpolated. + min_periods : int + Minimum number of valid (not np.nan) values that have to be available in a window for its aggregation to be computed. - interpol_flag : {'GOOD', 'BAD', 'UNFLAGGED'} or String, default 'UNFLAGGED' + interpol_flag : {'GOOD', 'BAD', 'UNFLAGGED', str}, default 'UNFLAGGED' Flag that is to be inserted for the interpolated values. You can either pass one of the three major flag-classes or specify directly a certain flag from the passed flagger. Returns ------- - + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. """ + data = data.copy() datcol = data[field] roller = datcol.rolling(window=winsz, center=center, min_periods=min_periods) @@ -84,7 +93,7 @@ def proc_rollingInterpolateMissing( return data, flagger -@register +@register(masking='field') def proc_interpolateMissing( data, field, @@ -99,39 +108,50 @@ def proc_interpolateMissing( ): """ - function to interpolate nan values in the data. - There are available all the interpolation methods from the pandas.interpolate() method and they are applicable by + Function to interpolate nan values in the data. + + There are available all the interpolation methods from the pandas.interpolate method and they are applicable by the very same key words, that you would pass to pd.Series.interpolates's method parameter. - Note, that the inter_limit keyword really restricts the interpolation to chunks, not containing more than - "inter_limit" successive nan entries. + Note, that the `inter_limit` keyword really restricts the interpolation to chunks, not containing more than + `inter_limit` successive nan entries. - Note, that the function differs from proc_interpolateGrid, in its behaviour to ONLY interpolate nan values that + Note, that the function differs from ``proc_interpolateGrid``, in its behaviour to ONLY interpolate nan values that were already present in the data passed. Parameters - --------- + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-interpolated. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. method : {"linear", "time", "nearest", "zero", "slinear", "quadratic", "cubic", "spline", "barycentric", - "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"}: string + "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"}: string The interpolation method you want to apply. - - inter_order : integer, default 2 + inter_order : int, default 2 If there your selected interpolation method can be performed at different 'orders' - here you pass the desired order. - - inter_limit : integer, default 2 + inter_limit : int, default 2 Maximum number of consecutive 'nan' values allowed for a gap to be interpolated. 
- - interpol_flag : {'GOOD', 'BAD', 'UNFLAGGED'} or String, default 'UNFLAGGED' + interpol_flag : {'GOOD', 'BAD', 'UNFLAGGED', str}, default 'UNFLAGGED' Flag that is to be inserted for the interpolated values. You can either pass one of the three major flag-classes or specify directly a certain flag from the passed flagger. - - downgrade_interpolation : boolean, default False + downgrade_interpolation : bool, default False If interpolation can not be performed at 'inter_order' - (not enough values or not implemented at this order) - automaticalyy try to interpolate at order 'inter_order' - 1. - - not_interpol_flags : list or String, default None + not_interpol_flags : {None, str, List[str]}, default None A list of flags or a single Flag, marking values, you want NOT to be interpolated. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. """ data = data.copy() @@ -165,19 +185,21 @@ def proc_interpolateMissing( return data, flagger -@register +@register(masking='field') def proc_interpolateGrid( - data, - field, - flagger, - freq, - method, - inter_order=2, - to_drop=None, - downgrade_interpolation=False, - empty_intervals_flag=None, - **kwargs -): + data, + field, + flagger, + freq, + method, + inter_order=2, + to_drop=None, + downgrade_interpolation=False, + empty_intervals_flag=None, + grid_field=None, + inter_limit=2, + **kwargs): + """ Function to interpolate the data at regular (equidistant) timestamps (or Grid points). @@ -187,33 +209,53 @@ def proc_interpolateGrid( Note, that the function differs from proc_interpolateMissing, by returning a whole new data set, only containing samples at the interpolated, equidistant timestamps (of frequency "freq"). + Note, it is possible to interpolate unregular "grids" (with no frequencies). In fact, any date index + can be target of the interpolation. Just pass the field name of the variable, holding the index + you want to interpolate, to "grid_field". 'freq' is then use to determine the maximum gap size for + a grid point to be interpolated. + Parameters - --------- - freq : Offset String + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-interpolated. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + freq : str The frequency of the grid you want to interpolate your data at. - method : {"linear", "time", "nearest", "zero", "slinear", "quadratic", "cubic", "spline", "barycentric", "polynomial", "krogh", "piecewise_polynomial", "spline", "pchip", "akima"}: string The interpolation method you want to apply. - inter_order : integer, default 2 If there your selected interpolation method can be performed at different 'orders' - here you pass the desired order. - - to_drop : list or string, default None + to_drop : {None, str, List[str]}, default None Flags that refer to values you want to drop before interpotion - effectively excluding grid points from interpolation, that are only surrounded by values having a flag in them, that is listed in drop flags. Default results in the flaggers 'BAD' flag to be the drop_flag. 
- - downgrade_interpolation : boolean, default False + downgrade_interpolation : bool, default False If interpolation can not be performed at 'inter_order' - (not enough values or not implemented at this order) - automaticalyy try to interpolate at order 'inter_order' - 1. - - empty_intervals_flag : String, default None + empty_intervals_flag : str, default None A Flag, that you want to assign to those values resulting equidistant sample grid, that were not surrounded by valid (flagged) data in the original dataset and thus werent interpolated. Default automatically assigns flagger.BAD flag to those values. - """ + grid_field : String, default None + Use the timestamp of another variable as (not nessecarily regular) "grid" to be interpolated. + inter_limit : Integer, default 2 + Maximum number of consecutive Grid values allowed for interpolation. If set + to "n", in the result, chunks of "n" consecutive grid values wont be interpolated. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values and shape may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values and shape may have changed relatively to the flagger input. + """ datcol = data[field] datcol = datcol.copy() @@ -228,8 +270,8 @@ def proc_interpolateGrid( datcol.dropna(inplace=True) if datcol.empty: data[field] = datcol - reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, **kwargs) - flagger = flagger.slice(drop=field).merge(reshaped_flagger) + reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, inplace=True, **kwargs) + flagger = flagger.slice(drop=field).merge(reshaped_flagger, subset=[field], inplace=True) return data, flagger # account for annoying case of subsequent frequency aligned values, differing exactly by the margin # 2*freq: @@ -243,9 +285,12 @@ def proc_interpolateGrid( spec_case_mask = spec_case_mask.tshift(-1, freq) # prepare grid interpolation: - grid_index = pd.date_range( - start=datcol.index[0].floor(freq), end=datcol.index[-1].ceil(freq), freq=freq, name=datcol.index.name - ) + if grid_field is None: + grid_index = pd.date_range(start=datcol.index[0].floor(freq), end=datcol.index[-1].ceil(freq), freq=freq, + name=datcol.index.name) + else: + grid_index = data[grid_field].index + aligned_start = datcol.index[0] == grid_index[0] aligned_end = datcol.index[-1] == grid_index[-1] @@ -253,24 +298,40 @@ def proc_interpolateGrid( # do the interpolation inter_data, chunk_bounds = interpolateNANs( - datcol, - method, - order=inter_order, - inter_limit=2, - downgrade_interpolation=downgrade_interpolation, - return_chunk_bounds=True, + datcol, method, order=inter_order, inter_limit=inter_limit, downgrade_interpolation=downgrade_interpolation, + return_chunk_bounds=True ) - # override falsely interpolated values: - inter_data[spec_case_mask.index] = np.nan + if grid_field is None: + # override falsely interpolated values: + inter_data[spec_case_mask.index] = np.nan # store interpolated grid - inter_data = inter_data.asfreq(freq) + inter_data = inter_data[grid_index] data[field] = inter_data # flags reshaping (dropping data drops): flagscol.drop(flagscol[drop_mask].index, inplace=True) + if grid_field is not None: + # only basic flag propagation supported for custom grids (take worst from preceeding/succeeding) + preceeding = flagscol.reindex(grid_index, method='ffill', 
tolerance=freq) + succeeding = flagscol.reindex(grid_index, method='bfill', tolerance=freq) + # check for too big gaps in the source data and drop the values interpolated in those too big gaps + na_mask = preceeding.isna() | succeeding.isna() + na_mask = na_mask[na_mask] + preceeding.drop(na_mask.index, inplace=True) + succeeding.drop(na_mask.index, inplace=True) + inter_data.drop(na_mask.index, inplace=True) + data[field] = inter_data + mask = succeeding > preceeding + preceeding.loc[mask] = succeeding.loc[mask] + flagscol = preceeding + flagger_new = flagger.initFlags(inter_data).setFlags(field, flag=flagscol, force=True, **kwargs) + flagger = flagger.slice(drop=field).merge(flagger_new) + return data, flagger + + # for freq defined grids, max-aggregate flags of every grid points freq-ranged surrounding # hack ahead! Resampling with overlapping intervals: # 1. -> no rolling over categories allowed in pandas, so we translate manually: cats = pd.CategoricalIndex(flagger.dtype.categories, ordered=True) @@ -300,17 +361,17 @@ def proc_interpolateGrid( if np.isnan(inter_data[-1]) and not aligned_end: chunk_bounds = chunk_bounds.append(pd.DatetimeIndex([inter_data.index[-1]])) chunk_bounds = chunk_bounds.unique() - flagger_new = flagger.initFlags(inter_data).setFlags(field, flag=flagscol, force=True, **kwargs) + flagger_new = flagger.initFlags(inter_data).setFlags(field, flag=flagscol, force=True, inplace=True, **kwargs) # block chunk ends of interpolation flags_to_block = pd.Series(np.nan, index=chunk_bounds).astype(flagger_new.dtype) - flagger_new = flagger_new.setFlags(field, loc=chunk_bounds, flag=flags_to_block, force=True) + flagger_new = flagger_new.setFlags(field, loc=chunk_bounds, flag=flags_to_block, force=True, inplace=True) - flagger = flagger.slice(drop=field).merge(flagger_new) + flagger = flagger.slice(drop=field).merge(flagger_new, subset=[field], inplace=True) return data, flagger -@register +@register(masking='field') def proc_resample( data, field, @@ -350,52 +411,58 @@ def proc_resample( anyway, so that there is no point in passing the nan functions. Parameters - --------- - freq : Offset String - The frequency of the grid you want to resample your data to. - - agg_func : function + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-resampled. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + freq : str + An offset string. The frequency of the grid you want to resample your data to. + agg_func : Callable The function you want to use for aggregation. -na_ser.resample('10min').apply(lambda x: x.count()) method: {'fagg', 'bagg', 'nagg'}, default 'bagg' Specifies which intervals to be aggregated for a certain timestamp. (preceeding, succeeding or "surrounding" interval). See description above for more details. - - max_invalid_total_d : integer, default np.inf + max_invalid_total_d : {np.inf, int}, np.inf Maximum number of invalid (nan) datapoints, allowed per resampling interval. If max_invalid_total_d is exceeded, the interval gets resampled to nan. 
By default (np.inf), there is no bound to the number of nan values in an interval and only intervals
        containing ONLY nan values, or those containing no values at all, get projected onto nan
-
-    max_invalid_consec_d : integer, default np.inf
+    max_invalid_consec_d : {np.inf, int}, default np.inf
        Maximum number of consecutive invalid (nan) data points allowed per resampling interval.
        If max_invalid_consec_d is exceeded, the interval gets resampled to nan. By default (np.inf),
        there is no bound to the number of consecutive nan values in an interval and only intervals
        containing ONLY nan values, or those containing no values at all, get projected onto nan.
-
-    max_invalid_total_f : integer, default np.inf
+    max_invalid_total_f : {np.inf, int}, default np.inf
        Same as "max_invalid_total_d", only applying to the flags. The flag regarded
        as "invalid" is the one passed to empty_intervals_flag (default=flagger.BAD).
        This is also the flag assigned to invalid/empty intervals.
-
-    max_invalid_total_f : integer, default np.inf
+    max_invalid_consec_f : {np.inf, int}, default np.inf
        Same as "max_invalid_consec_d", only applying to the flags. The flag regarded as "invalid"
        is the one passed to empty_intervals_flag (default=flagger.BAD). This is also the flag
        assigned to invalid/empty intervals.
-
-    flag_agg_func : function, default: max
+    flag_agg_func : Callable, default: max
        The function you want to aggregate the flags with. It should be capable of operating on the flags dtype
        (usually ordered categorical).
-
-    empty_intervals_flag : String, default None
+    empty_intervals_flag : {None, str}, default None
        A flag that you want to assign to invalid intervals. Invalid are those intervals that contain nan values
        only, or no values at all. Furthermore, the empty_intervals_flag serves as the "invalid" identifier when
        checking for "max_total_invalid_f" and "max_consec_invalid_f" patterns. Default triggers flagger.BAD to be
        assigned.
-
-    to_drop : list or string, default None
+    to_drop : {None, str, List[str]}, default None
        Flags that refer to values you want to drop before resampling - effectively excluding values that are flagged
        with a flag in to_drop from the resampling process - this means that they also will not be counted in the
        max_consec/max_total evaluation. to_drop = None results in NO flags being dropped initially.
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+        Data values and shape may have changed relative to the data input.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional information related to `data`.
+        Flags values and shape may have changed relative to the flagger input.
    """

    data = data.copy()
@@ -415,8 +482,8 @@ na_ser.resample('10min').apply(lambda x: x.count())
    # for consistency reasons - return empty data/flags column when there is no valid data left
    # after filtering.
data[field] = datcol - reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, **kwargs) - flagger = flagger.slice(drop=field).merge(reshaped_flagger) + reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, inplace=True, **kwargs) + flagger = flagger.slice(drop=field).merge(reshaped_flagger, subset=[field], inplace=True) return data, flagger datcol = aggregate2Freq( @@ -440,12 +507,12 @@ na_ser.resample('10min').apply(lambda x: x.count()) # data/flags reshaping: data[field] = datcol - reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, **kwargs) - flagger = flagger.slice(drop=field).merge(reshaped_flagger) + reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, inplace=True, **kwargs) + flagger = flagger.slice(drop=field).merge(reshaped_flagger, subset=[field], inplace=True) return data, flagger -@register +@register(masking='field') def proc_shift(data, field, flagger, freq, method, to_drop=None, empty_intervals_flag=None, **kwargs): """ Function to shift data points to regular (equidistant) timestamps. @@ -460,25 +527,35 @@ def proc_shift(data, field, flagger, freq, method, to_drop=None, empty_intervals 'fshift' - every grid point gets assigned its ultimately preceeding value - if there is one available in the preceeding sampling interval. (equals resampling with "last") - Parameters - --------- - freq : Offset String + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-shifted. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + freq : str The frequency of the grid you want to shift your data to. - method: {'fagg', 'bagg', 'nagg'}, default 'nshift' Specifies if datapoints get propagated forwards, backwards or to the nearest grid timestamp. See function description for more details. - - empty_intervals_flag : String, default None + empty_intervals_flag : {None, str}, default None A Flag, that you want to assign to grid points, where no values are avaible to be shifted to. Default triggers flagger.BAD to be assigned. - - to_drop : list or string, default None + to_drop : {None, str, List[str]}, default None Flags that refer to values you want to drop before shifting - effectively, excluding values that are flagged with a flag in to_drop from the shifting process. Default - to_drop = None - results in flagger.BAD values being dropped initially. + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values and shape may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values and shape may have changed relatively to the flagger input. 
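A minimal, pandas-only sketch of the `fshift`/`bshift`/`nshift` idea described above; the helper name `shift_to_grid` and the example values are made up here for illustration, and this is not the `shift2Freq` routine the function actually calls:

```python
import pandas as pd

def shift_to_grid(series: pd.Series, freq: str, method: str = "nshift") -> pd.Series:
    # build the regular target grid and pick the preceding/succeeding/nearest value,
    # but only if it lies within one grid period of the grid point
    grid = pd.date_range(series.index.min().floor(freq), series.index.max().ceil(freq), freq=freq)
    pandas_method = {"fshift": "ffill", "bshift": "bfill", "nshift": "nearest"}[method]
    return series.reindex(grid, method=pandas_method, tolerance=pd.Timedelta(freq))

idx = pd.DatetimeIndex(["2020-01-01 00:03", "2020-01-01 00:12", "2020-01-01 00:29"])
s = pd.Series([1.0, 2.0, 3.0], index=idx)
print(shift_to_grid(s, "15min", "nshift"))   # values land on 00:00, 00:15, 00:30
```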
""" data = data.copy() datcol = data[field] @@ -493,8 +570,8 @@ def proc_shift(data, field, flagger, freq, method, to_drop=None, empty_intervals datcol.dropna(inplace=True) if datcol.empty: data[field] = datcol - reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, **kwargs) - flagger = flagger.slice(drop=field).merge(reshaped_flagger) + reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, inplace=True, **kwargs) + flagger = flagger.slice(drop=field).merge(reshaped_flagger, subset=[field], inplace=True) return data, flagger flagscol.drop(drop_mask[drop_mask].index, inplace=True) @@ -502,12 +579,12 @@ def proc_shift(data, field, flagger, freq, method, to_drop=None, empty_intervals datcol = shift2Freq(datcol, method, freq, fill_value=np.nan) flagscol = shift2Freq(flagscol, method, freq, fill_value=empty_intervals_flag) data[field] = datcol - reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, **kwargs) - flagger = flagger.slice(drop=field).merge(reshaped_flagger) + reshaped_flagger = flagger.initFlags(datcol).setFlags(field, flag=flagscol, force=True, inplace=True, **kwargs) + flagger = flagger.slice(drop=field).merge(reshaped_flagger, subset=[field], inplace=True) return data, flagger -@register +@register(masking='field') def proc_transform(data, field, flagger, func, **kwargs): """ Function to transform data columns with a transformation that maps series onto series of the same length. @@ -515,10 +592,23 @@ def proc_transform(data, field, flagger, func, **kwargs): Note, that flags get preserved. Parameters - --------- - func : function + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-transformed. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + func : Callable Function to transform data[field] with. + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. """ data = data.copy() # NOTE: avoiding pd.Series.transform() in the line below, because transform does process columns element wise @@ -528,7 +618,7 @@ def proc_transform(data, field, flagger, func, **kwargs): return data, flagger -@register +@register(masking='field') def proc_projectFlags(data, field, flagger, method, source, freq=None, to_drop=None, **kwargs): """ @@ -563,22 +653,31 @@ def proc_projectFlags(data, field, flagger, method, source, freq=None, to_drop=N you can just pass the associated "inverse" method. Also you shoud pass the same drop flags keyword. Parameters - --------- - + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the data column, you want to project the source-flags at. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. method: {'inverse_fagg', 'inverse_bagg', 'inverse_nagg', 'inverse_fshift', 'inverse_bshift', 'inverse_nshift'} The method used for projection of source flags onto field flags. See description above for more details. - - source: String + source: str The source source of flags projection. 
- - freq: Offset, default None + freq: {None, str},default None The freq determines the projection range for the projection method. See above description for more details. Defaultly (None), the sampling frequency of source is used. - - to_drop: list or String + to_drop: {None, str, List[str]}, default None Flags referring to values that are to drop before flags projection. Relevant only when projecting wiht an inverted shift method. Defaultly flagger.BAD is listed. + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values and shape may have changed relatively to the flagger input. """ flagscol = flagger.getFlags(source) if flagscol.empty: @@ -651,31 +750,63 @@ def proc_projectFlags(data, field, flagger, method, source, freq=None, to_drop=N return data, flagger -@register +@register(masking='none') def proc_fork(data, field, flagger, suffix=ORIGINAL_SUFFIX, **kwargs): """ The function generates a copy of the data "field" and inserts it under the name field + suffix into the existing data. Parameters - --------- - - suffix: String - Sub string to append to the forked data variables name. + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the data column, you want to fork (copy). + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + suffix: str + Substring to append to the forked data variables name. + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + data shape may have changed relatively to the flagger input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags shape may have changed relatively to the flagger input. """ + fork_field = str(field) + suffix fork_dios = dios.DictOfSeries({fork_field: data[field]}) - fork_flagger = flagger.slice(drop=data.columns.drop(field)).rename(field, fork_field) + fork_flagger = flagger.slice(drop=data.columns.drop(field)).rename(field, fork_field, inplace=True) data = mergeDios(data, fork_dios) flagger = flagger.merge(fork_flagger) return data, flagger -@register +@register(masking='field') def proc_drop(data, field, flagger, **kwargs): """ The function drops field from the data dios and the flagger. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the data column, you want to drop. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + data shape may have changed relatively to the flagger input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags shape may have changed relatively to the flagger input. """ data = data[data.columns.drop(field)] @@ -683,10 +814,28 @@ def proc_drop(data, field, flagger, **kwargs): return data, flagger -@register +@register(masking='field') def proc_rename(data, field, flagger, new_name, **kwargs): """ The function renames field to new name (in both, the flagger and the data). + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. 
+ field : str + The fieldname of the data column, you want to rename. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + new_name : str + String, field is to be replaced with. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. """ data.columns = mutateIndex(data.columns, field, new_name) @@ -722,7 +871,7 @@ def _drift_fit(x, shift_target, cal_mean): return dataFit, dataShift -@register +@register(masking='all') def proc_seefoExpDriftCorrecture(data, field, flagger, maint_data_field, cal_mean=5, flag_maint_period=False, **kwargs): """ The function fits an exponential model to chunks of data[field]. @@ -756,15 +905,31 @@ def proc_seefoExpDriftCorrecture(data, field, flagger, maint_data_field, cal_mea Parameters ---------- - maint_data : pandas.Series - The timeseries holding the maintenance information. The series' timestamp itself represent the beginning of a - maintenance event, wheras the values represent the endings of the maintenance intervals - cal_mean : Integer, default 5 + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the data column, you want to correct. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + maint_data_field : str + The fieldname of the datacolumn holding the maintenance information. + The maint data is to expected to have following form: + The series' timestamp itself represents the beginning of a + maintenance event, wheras the values represent the endings of the maintenance intervals. + cal_mean : int, default 5 The number of values the mean is computed over, for obtaining the value level directly after and directly before maintenance event. This values are needed for shift calibration. (see above description) flag_maint_period : bool, default False Wheather or not to flag BAD the values directly obtained while maintenance. + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + Data values may have changed relatively to the data input. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. """ # 1: extract fit intervals: @@ -797,4 +962,37 @@ def proc_seefoExpDriftCorrecture(data, field, flagger, maint_data_field, cal_mea to_flag = to_flag.drop(to_flag[: maint_data.index[0]].index) to_flag = to_flag[to_flag.isna()] flagger = flagger.setFlags(field, loc=to_flag, **kwargs) + + data[field] = to_correct + + return data, flagger + + +@register +def proc_seefoLinearDriftCorrecture(data, field, flagger, x_field, y_field, **kwargs): + """ + Train a linear model that predicts data[y_field] by x_1*(data[x_field]) + x_0. (Least squares fit) + + Then correct the data[field] via: + + data[field] = data[field]*x_1 + x_0 + + Note, that data[x_field] and data[y_field] must be of equal length. + (Also, you might want them to be sampled at same timestamps.) + + Parameters + ---------- + x_field : String + Field name of x - data. + y_field : String + Field name of y - data. 
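A self-contained sketch of the least-squares correction described above, using `numpy.polyfit` instead of scikit-learn; the toy series below stand in for data[x_field], data[y_field] and data[field]:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# x/y pair used to train the linear model: y is roughly 2*x + 1
x = pd.Series(np.linspace(0, 10, 50))
y = 2.0 * x + 1.0 + rng.normal(scale=0.1, size=len(x))

# least squares fit y ~ slope*x + intercept (np.polyfit returns highest degree first)
slope, intercept = np.polyfit(x.values, y.values, deg=1)

# series to be corrected, rescaled with the fitted coefficients
field = pd.Series(np.linspace(0, 10, 50))
corrected = field * slope + intercept
```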
+ + """ + data = data.copy() + datcol = data[field] + reg = LinearRegression() + reg.fit(data[x_field].values.reshape(-1,1), data[y_field].values) + datcol = (datcol * reg.coef_[0]) + reg.intercept_ + data[field] = datcol return data, flagger + diff --git a/saqc/funcs/soil_moisture_tests.py b/saqc/funcs/soil_moisture_tests.py index bff6344c553af72e8b91b440d7145250e33d4396..424b618b87be57d9b7d0a8b63a44f89cb2ce13dc 100644 --- a/saqc/funcs/soil_moisture_tests.py +++ b/saqc/funcs/soil_moisture_tests.py @@ -14,7 +14,7 @@ from saqc.core.register import register from saqc.lib.tools import retrieveTrustworthyOriginal -@register +@register(masking='field') def sm_flagSpikes( data, field, @@ -30,10 +30,55 @@ def sm_flagSpikes( ): """ - The Function provides just a call to flagSpikes_spektrumBased, with parameter defaults, that refer to: + The Function provides just a call to ``flagSpikes_spektrumBased``, with parameter defaults, + that refer to References [1]. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + raise_factor : float, default 0.15 + Minimum relative value difference between two values to consider the latter as a spike candidate. + See condition (1) (or reference [2]). + deriv_factor : float, default 0.2 + See condition (2) (or reference [2]). + noise_func : {'CoVar', 'rVar'}, default 'CoVar' + Function to calculate noisiness of the data surrounding potential spikes. + ``'CoVar'``: Coefficient of Variation + ``'rVar'``: Relative Variance + noise_window : str, default '12h' + An offset string that determines the range of the time window of the "surrounding" data of a potential spike. + See condition (3) (or reference [2]). + noise_thresh : float, default 1 + Upper threshold for noisiness of data surrounding potential spikes. See condition (3) (or reference [2]). + smooth_window : {None, str}, default None + Size of the smoothing window of the Savitsky-Golay filter. + The default value ``None`` results in a window of two times the sampling rate (i.e. containing three values). + smooth_poly_deg : int, default 2 + Degree of the polynomial used for fitting with the Savitsky-Golay filter. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. + + References + ---------- + This Function is a generalization of the Spectrum based Spike flagging mechanism as presented in: + + [1] Dorigo, W. et al: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadoze Zone J. + doi:10.2136/vzj2012.0097. + + [2] https://git.ufz.de/rdm-software/saqc/-/blob/testfuncDocs/docs/funcs/FormalDescriptions.md#spikes_flagspektrumbased - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. 
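The ``'CoVar'``/``'rVar'`` options of `noise_func` can be read as coefficient of variation and relative variance. The formulas below are a plausible reading of those names, stated as an assumption, and not necessarily the exact implementation shipped with the package:

```python
import numpy as np

def co_var(x: np.ndarray) -> float:
    # coefficient of variation: standard deviation relative to the mean (assumed formula)
    return np.std(x) / np.mean(x)

def r_var(x: np.ndarray) -> float:
    # relative variance: variance relative to the mean (assumed formula)
    return np.var(x) / np.mean(x)

window = np.array([0.301, 0.303, 0.299, 0.302])
print(co_var(window), r_var(window))   # both small -> the surrounding data is "quiet"
```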
""" return spikes_flagSpektrumBased( @@ -51,7 +96,7 @@ def sm_flagSpikes( ) -@register +@register(masking='field') def sm_flagBreaks( data, field, @@ -69,10 +114,57 @@ def sm_flagBreaks( ): """ - The Function provides just a call to flagBreaks_spektrumBased, with parameter defaults that refer to: - - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. + The Function provides just a call to flagBreaks_spektrumBased, with parameter defaults that refer to references [1]. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + thresh_rel : float, default 0.1 + Float in [0,1]. See (1) of function description above to learn more + thresh_abs : float, default 0.01 + Float > 0. See (2) of function descritpion above to learn more. + first_der_factor : float, default 10 + Float > 0. See (3) of function descritpion above to learn more. + first_der_window_range : str, default '12h' + Offset string. See (3) of function description to learn more. + scnd_der_ratio_margin_1 : float, default 0.05 + Float in [0,1]. See (4) of function descritpion above to learn more. + scnd_der_ratio_margin_2 : float, default 10 + Float in [0,1]. See (5) of function descritpion above to learn more. + smooth : bool, default True + Method for obtaining dataseries' derivatives. + * False: Just take series step differences (default) + * True: Smooth data with a Savitzky Golay Filter before differentiating. + smooth_window : {None, str}, default 2 + Effective only if `smooth` = True + Offset string. Size of the filter window, used to calculate the derivatives. + smooth_poly_deg : int, default 2 + Effective only, if `smooth` = True + Polynomial order, used for smoothing with savitzk golay filter. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional informations related to `data`. + Flags values may have changed, relatively to the flagger input. + + References + ---------- + [1] Dorigo,W. et al.: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadoze Zone J. + doi:10.2136/vzj2012.0097. + + Find a brief mathematical description of the function here: + + [2] https://git.ufz.de/rdm-software/saqc/-/blob/testfuncDocs/docs/funcs + /FormalDescriptions.md#breaks_flagspektrumbased """ return breaks_flagSpektrumBased( @@ -92,31 +184,48 @@ def sm_flagBreaks( ) -@register +@register(masking='all') def sm_flagFrost(data, field, flagger, soil_temp_variable, window="1h", frost_thresh=0, **kwargs): - """This Function is an implementation of the soil temperature based Soil Moisture flagging, as presented in: - - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. + """ + This Function is an implementation of the soil temperature based Soil Moisture flagging, as presented in + references [1]: All parameters default to the values, suggested in this publication. Function flags Soil moisture measurements by evaluating the soil-frost-level in the moment of measurement. 
Soil temperatures below "frost_level" are regarded as denoting frozen soil state. - :param data: The pandas dataframe holding the data-to-be flagged, as well as the reference - series. Data must be indexed by a datetime series. - :param field: Fieldname of the Soil moisture measurements field in data. - :param flagger: A flagger - object. - like thingies that refer to the data(including datestrings). - :param tolerated_deviation: Offset String. Denoting the maximal temporal deviation, - the soil frost states timestamp is allowed to have, relative to the - data point to-be-flagged. - :param soil_temp_reference: A STRING, denoting the fields name in data, - that holds the data series of soil temperature values, - the to-be-flagged values shall be checked against. - :param frost_level: Value level, the flagger shall check against, when evaluating soil frost level. + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + soil_temp_variable : str, + An offset string, denoting the fields name in data, that holds the data series of soil temperature values, + the to-be-flagged values shall be checked against. + window : str + An offset string denoting the maximal temporal deviation, the soil frost states timestamp is allowed to have, + relative to the data point to-be-flagged. + frost_thresh : float + Value level, the flagger shall check against, when evaluating soil frost level. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional informations related to `data`. + Flags values may have changed, relatively to the flagger input. + + References + ---------- + [1] Dorigo,W. et al.: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadoze Zone J. + doi:10.2136/vzj2012.0097. """ # retrieve reference series @@ -139,7 +248,7 @@ def sm_flagFrost(data, field, flagger, soil_temp_variable, window="1h", frost_th return data, flagger -@register +@register(masking='all') def sm_flagPrecipitation( data, field, @@ -155,10 +264,9 @@ def sm_flagPrecipitation( **kwargs, ): - """This Function is an implementation of the precipitation based Soil Moisture flagging, as presented in: - - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. + """ + This Function is an implementation of the precipitation based Soil Moisture flagging, as presented in + references [1]. All parameters default to the values, suggested in this publication. 
(excluding porosity,sensor accuracy and sensor depth) @@ -172,9 +280,9 @@ def sm_flagPrecipitation( A data point y_t is flagged an invalid soil moisture raise, if: - (1) y_t > y_(t-raise_window) - (2) y_t - y_(t-"std_factor_range") > "std_factor" * std(y_(t-"std_factor_range"),...,y_t) - (3) sum(prec(t-24h),...,prec(t)) > sensor_depth * sensor_accuracy * soil_porosity + (1) y_t > y_(t-`raise_window`) + (2) y_t - y_(t-`std_factor_range`) > `std_factor` * std(y_(t-`std_factor_range`),...,y_t) + (3) sum(prec(t-24h),...,prec(t)) > `sensor_depth` * `sensor_accuracy` * `soil_porosity` NOTE1: np.nan entries in the input precipitation series will be regarded as susipicious and the test will be omited for every 24h interval including a np.nan entrie in the original precipitation sampling rate. @@ -183,27 +291,57 @@ def sm_flagPrecipitation( NOTE2: The function wont test any values that are flagged suspicious anyway - this may change in a future version. - :param data: The pandas dataframe holding the data-to-be flagged, as well as the reference - series. Data must be indexed by a datetime series and be harmonized onto a - time raster with seconds precision. - :param field: Fieldname of the Soil moisture measurements field in data. - :param flagger: A flagger - object. (saqc.flagger.X) - :param prec_variable: Fieldname of the precipitation meassurements column in data. - :param sensor_depth: Measurement depth of the soil moisture sensor, [m]. - :param sensor_accuracy: Accuracy of the soil moisture sensor, [-]. - :param soil_porosity: Porosity of moisture sensors surrounding soil, [-]. - :param std_factor: The value determines by which rule it is decided, weather a raise in soil - moisture is significant enough to trigger the flag test or not: - Significants is assumed, if the raise is greater then "std_factor" multiplied - with the last 24 hours standart deviation. - :param std_factor_range: Offset String. Denotes the range over witch the standart deviation is obtained, - to test condition [2]. (Should be a multiple of the sampling rate) - :param raise_window: Offset String. Denotes the distance to the datapoint, relatively to witch - it is decided if the current datapoint is a raise or not. Equation [1]. - It defaults to None. When None is passed, raise_window is just the sample - rate of the data. Any raise reference must be a multiple of the (intended) - sample rate and below std_factor_range. - :param ignore_missing: + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional informations related to `data`. + prec_variable : str + Fieldname of the precipitation meassurements column in data. + raise_window: {None, str}, default None + Denotes the distance to the datapoint, relatively to witch + it is decided if the current datapoint is a raise or not. Equation [1]. + It defaults to None. When None is passed, raise_window is just the sample + rate of the data. Any raise reference must be a multiple of the (intended) + sample rate and below std_factor_range. + sensor_depth : float, default 0 + Measurement depth of the soil moisture sensor, [m]. + sensor_accuracy : float, default 0 + Accuracy of the soil moisture sensor, [-]. + soil_porosity : float, default 0 + Porosity of moisture sensors surrounding soil, [-]. 
+    std_factor : int, default 2
+        The value determines by which rule it is decided whether a raise in soil
+        moisture is significant enough to trigger the flag test or not:
+        Significance is assumed, if the raise is greater than "std_factor" multiplied
+        with the last 24 hours standard deviation.
+    std_window: str, default '24h'
+        An offset string that denotes the range over which the standard deviation is obtained,
+        to test condition [2]. (Should be a multiple of the sampling rate.)
+    ignore_missing: bool, default False
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional information related to `data`.
+        Flags values may have changed relative to the flagger input.
+
+    References
+    ----------
+    [1] Dorigo, W. et al.: Global Automated Quality Control of In Situ Soil Moisture
+        Data from the international Soil Moisture Network. 2013. Vadoze Zone J.
+        doi:10.2136/vzj2012.0097.
    """

    dataseries, moist_rate = retrieveTrustworthyOriginal(data, field, flagger)
@@ -246,7 +384,7 @@ def sm_flagPrecipitation(
    return data, flagger


-@register
+@register(masking='field')
 def sm_flagConstants(
    data,
    field,
@@ -265,16 +403,62 @@
 ):

    """
-
-    Note, function has to be harmonized to equidistant freq_grid
-
-    Note, in current implementation, it has to hold that: (rainfall_window_range >= plateau_window_min)
-
-    :param data: The pandas dataframe holding the data-to-be flagged.
-                 Data must be indexed by a datetime series and be harmonized onto a
-                 time raster with seconds precision (skips allowed).
-    :param field: Fieldname of the Soil moisture measurements field in data.
-    :param flagger: A flagger - object. (saqc.flagger.X)
+    This function flags plateaus/series of constant values in soil moisture data.
+
+    Mentions of "conditions" in the following explanation refer to reference [2].
+
+    The function represents a stricter version of constants_flagVarianceBased.
+
+    The additional constraints (3)-(5) are designed to match the special case of constant
+    values in soil moisture measurements, mainly by accounting for preceding precipitation events
+    (conditions (3) and (4)) and for the plateau level (condition (5)).
+
+    Parameters
+    ----------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    field : str
+        The fieldname of the column, holding the data-to-be-flagged.
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional information related to `data`.
+    window : str, default '12h'
+        Minimum duration during which values need to be identical to become plateau candidates. See condition (1).
+    thresh : float, default 0.0005
+        Maximum variance of a group of values to still consider them constant. 
See condition (2) + precipitation_window : str, default '12h' + See condition (3) and (4) + tolerance : float, default 0.95 + Tolerance factor, see condition (5) + deriv_max : float, default 0 + See condition (4) + deriv_min : float, default 0.0025 + See condition (3) + max_missing : {None, int}, default None + Maximum number of missing values allowed in window, by default this condition is ignored + max_consec_missing : {None, int}, default None + Maximum number of consecutive missing values allowed in window, by default this condition is ignored + smooth_window : {None, str}, default None + Size of the smoothing window of the Savitsky-Golay filter. The default value None results in a window of two + times the sampling rate (i.e. three values) + smooth_poly_deg : int, default 2 + Degree of the polynomial used for smoothing with the Savitsky-Golay filter + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional informations related to `data`. + Flags values may have changed, relatively to the flagger input. + + References + ---------- + [1] Dorigo,W. et al.: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadoze Zone J. + doi:10.2136/vzj2012.0097. + + [2] https://git.ufz.de/rdm-software/saqc/-/edit/testfuncDocs/docs/funcs/FormalDescriptions.md#sm_flagconstants """ # get plateaus: @@ -340,7 +524,7 @@ def sm_flagConstants( return data, flagger -@register +@register(masking='all') def sm_flagRandomForest(data, field, flagger, references, window_values: int, window_flags: int, path: str, **kwargs): """ This Function uses pre-trained machine-learning model objects for flagging of a specific variable. The model is @@ -351,19 +535,33 @@ def sm_flagRandomForest(data, field, flagger, references, window_values: int, wi model training. For the model to work, the parameters 'references', 'window_values' and 'window_flags' have to be set to the same values as during training. - :param data: The pandas dataframe holding the data-to-be flagged, as well as the reference - series. Data must be indexed by a datetime index. - :param flags: A dataframe holding the flags - :param field: Fieldname of the field in data that is to be flagged. - :param flagger: A flagger - object. - :param references: A string or list of strings, denoting the fieldnames of the data series that - should be used as reference variables - :param window_values: An integer, denoting the window size that is used to derive the gradients of - both the field- and reference-series inside the moving window - :param window_flags: An integer, denoting the window size that is used to count the surrounding - automatic flags that have been set before - :param path: A string giving the path to the respective model object, i.e. its name and - the respective value of the grouping variable. e.g. "models/model_0.2.pkl" + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. 
+ references : {str, List[str]} + List or list of strings, denoting the fieldnames of the data series that should be used as reference variables + window_values : int + An integer, denoting the window size that is used to derive the gradients of both the field- and + reference-series inside the moving window + window_flags : int + An integer, denoting the window size that is used to count the surrounding automatic flags that have been set + before + path : str + A string giving the path to the respective model object, i.e. its name and + the respective value of the grouping variable. e.g. "models/model_0.2.pkl" + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional informations related to `data`. + Flags values may have changed, relatively to the flagger input. """ def _refCalc(reference, window_values): diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py index 47086b81f3d493e89bb8ce48fd6968ad934246dc..8c6f954143e5daf1529b069dba92c9568ad1be85 100644 --- a/saqc/funcs/spikes_detection.py +++ b/saqc/funcs/spikes_detection.py @@ -16,10 +16,10 @@ from saqc.lib.tools import ( offset2seconds, slidingWindowIndices, findIndex, + toSequence ) from outliers import smirnov_grubbs - def _stray( val_frame, partition_freq=None, @@ -28,8 +28,59 @@ def _stray( n_neighbors=10, iter_start=0.5, alpha=0.05, + trafo=lambda x: x + ): + """ + Find outliers in multi dimensional observations. + + The general idea is to assigning scores to every observation based on the observations neighborhood in the space + of observations. Then, the gaps between the (greatest) scores are tested for beeing drawn from the same + distribution, as the majority of the scores. + + See the References section for a link to a detailed description of the algorithm. + + Note, that the flagging result depends on the size of the partition under test and the distribution of the outliers + in it. For "normalish" and/or slightly "erratic" datasets, 5000 - 10000, periods turned out to be a good guess. + Note, that no normalizations/transformations are applied to the different components (data columns) + - those are expected to be applied previously, if necessary. + + Parameters + ---------- + val_frame : (N,M) ndarray + Input NxM array of observations, where N is the number of observations and M the number of components per + observation. + partition_freq : {None, str, int}, default None + Determines the size of the data partitions, the data is decomposed into. Each partition is checked seperately + for outliers. If a String is passed, it has to be an offset string and it results in partitioning the data into + parts of according temporal length. If an integer is passed, the data is simply split up into continous chunks + of `partition_freq` periods. if ``None`` is passed (default), all the data will be tested in one run. + partition_min : int, default 0 + Minimum number of periods per partition that have to be present for a valid outlier dettection to be made in + this partition. (Only of effect, if `partition_freq` is an integer.) + scoring_method : {'kNNSum', 'kNNMaxGap'}, default 'kNNMaxGap' + Scoring method applied. + `'kNNSum'`: Assign to every point the sum of the distances to its 'n_neighbors' nearest neighbors. + `'kNNMaxGap'`: Assign to every point the distance to the neighbor with the "maximum gap" to its predecessor + in the hierarchy of the `n_neighbors` nearest neighbors. 
(see reference section for further descriptions) + n_neighbors : int, default 10 + Number of neighbors included in the scoring process for every datapoint. + iter_start : float, default 0.5 + Float in [0,1] that determines which percentage of data is considered "normal". 0.5 results in the stray + algorithm to search only the upper 50 % of the scores for the cut off point. (See reference section for more + information) + alpha : float, default 0.05 + Niveau of significance by which it is tested, if a score might be drawn from another distribution, than the + majority of the data. + + References + ---------- + Detailed description of the Stray algorithm is covered here: + + [1] Talagala, P. D., Hyndman, R. J., & Smith-Miles, K. (2019). Anomaly detection in high dimensional data. + arXiv preprint arXiv:1908.04000. + """ kNNfunc = getattr(ts_ops, scoring_method) # partitioning @@ -48,6 +99,7 @@ def _stray( for _, partition in partitions: if partition.empty | (partition.shape[0] < partition_min): continue + partition = partition.apply(trafo) sample_size = partition.shape[0] nn_neighbors = min(n_neighbors, max(sample_size, 2)) resids = kNNfunc(partition.values, n_neighbors=nn_neighbors - 1, algorithm="ball_tree") @@ -73,6 +125,40 @@ def _stray( def _expFit(val_frame, scoring_method="kNNMaxGap", n_neighbors=10, iter_start=0.5, alpha=0.05, bin_frac=10): + """ + Find outliers in multi dimensional observations. + + The general idea is to assigning scores to every observation based on the observations neighborhood in the space + of observations. Then, the gaps between the (greatest) scores are tested for beeing drawn from the same + distribution, as the majority of the scores. + + Note, that no normalizations/transformations are applied to the different components (data columns) + - those are expected to be applied previously, if necessary. + + Parameters + ---------- + val_frame : (N,M) ndarray + Input NxM array of observations, where N is the number of observations and M the number of components per + observation. + scoring_method : {'kNNSum', 'kNNMaxGap'}, default 'kNNMaxGap' + Scoring method applied. + `'kNNSum'`: Assign to every point the sum of the distances to its 'n_neighbors' nearest neighbors. + `'kNNMaxGap'`: Assign to every point the distance to the neighbor with the "maximum gap" to its predecessor + in the hierarchy of the `n_neighbors` nearest neighbors. (see reference section for further descriptions) + n_neighbors : int, default 10 + Number of neighbors included in the scoring process for every datapoint. + iter_start : float, default 0.5 + Float in [0,1] that determines which percentage of data is considered "normal". 0.5 results in the expfit + algorithm to search only the upper 50 % of the scores for the cut off point. (See reference section for more + information) + alpha : float, default 0.05 + Niveau of significance by which it is tested, if a score might be drawn from another distribution, than the + majority of the data. + bin_frac : {int, str}, default 10 + Controls the binning for the histogram in the fitting step. If an integer is passed, the residues will + equidistantly be covered by `bin_frac` bins, ranging from the minimum to the maximum of the residues. + If a string is passed, it will be passed on to the ``numpy.histogram_bin_edges`` method. 
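A rough, scikit-learn based sketch of the two scoring methods (``'kNNSum'``, ``'kNNMaxGap'``) described above; it is not the `ts_ops` implementation the module dispatches to, and the helper name `knn_scores` is made up:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def knn_scores(values: np.ndarray, n_neighbors: int = 10, method: str = "kNNSum") -> np.ndarray:
    # values: (N, M) array of observations; query the training points themselves,
    # so the first neighbor is always the point itself at distance zero
    nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, algorithm="ball_tree").fit(values)
    dist, _ = nbrs.kneighbors(values)
    dist = dist[:, 1:]                       # drop the zero distance to the point itself
    if method == "kNNSum":
        return dist.sum(axis=1)              # sum of distances to the k nearest neighbors
    # "kNNMaxGap": distance to the neighbor behind the largest gap in the sorted distances
    gaps = np.diff(dist, axis=1)
    max_gap_pos = gaps.argmax(axis=1) + 1
    return dist[np.arange(len(values)), max_gap_pos]

rng = np.random.default_rng(0)
obs = np.vstack([rng.normal(size=(200, 3)), [[8.0, 8.0, 8.0]]])   # one obvious outlier appended
print(knn_scores(obs, n_neighbors=10, method="kNNMaxGap")[-1])    # the outlier gets a large score
```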
+ """ kNNfunc = getattr(ts_ops, scoring_method) resids = kNNfunc(val_frame.values, n_neighbors=n_neighbors, algorithm="ball_tree") @@ -97,7 +183,7 @@ def _expFit(val_frame, scoring_method="kNNMaxGap", n_neighbors=10, iter_start=0. elif bin_frac in ["auto", "fd", "doane", "scott", "stone", "rice", "sturges", "sqrt"]: binz = np.histogram_bin_edges(resids, bins=bin_frac) else: - raise ValueError("Cant interpret {} as an binning technique.".format(bin_frac)) + raise ValueError(f"Can't interpret {bin_frac} as an binning technique.") binzenters = np.array([0.5 * (binz[i] + binz[i + 1]) for i in range(len(binz) - 1)]) # inititialize full histogram: @@ -150,23 +236,64 @@ def _expFit(val_frame, scoring_method="kNNMaxGap", n_neighbors=10, iter_start=0. def _reduceMVflags( - val_frame, fields, flagger, to_flag_frame, reduction_range, reduction_drop_flagged=False, reduction_thresh=3.5 + val_frame, fields, flagger, to_flag_frame, reduction_range, reduction_drop_flagged=False, reduction_thresh=3.5, + reduction_min_periods=1 ): + """ + Function called by "spikes_flagMultivarScores" to reduce the number of false positives that result from + the algorithms confinement to only flag complete observations (all of its variables/components). + + The function "reduces" an observations flag to components of it, by applying MAD (See references) + test onto every components temporal surrounding. + + Parameters + ---------- + val_frame : (N,M) pd.DataFrame + Input NxM DataFrame of observations, where N is the number of observations and M the number of components per + observation. + fields : str + Fieldnames of the components in `val_frame` that are to be tested for outlierishnes. + to_flag_frame : (K,M) pd.DataFrame + Input dataframe of observations to be tested, where N is the number of observations and M the number + of components per observation. + reduction_range : str + An offset string, denoting the range of the temporal surrounding to include into the MAD testing. + reduction_drop_flagged : bool, default False + Wheather or not to drop flagged values other than the value under test, from the temporal surrounding + before checking the value with MAD. + reduction_thresh : float, default 3.5 + The `critical` value, controlling wheather the MAD score is considered referring to an outlier or not. + Higher values result in less rigid flagging. The default value is widely used in the literature. See references + section for more details ([1]). 
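The default `reduction_thresh` of 3.5 stems from the modified Z-score (MAD) test of reference [1]. Below is a minimal sketch of that test on its own; the local polynomial detrending that `_reduceMVflags` performs around each candidate is omitted:

```python
import numpy as np

def mad_outliers(values: np.ndarray, thresh: float = 3.5) -> np.ndarray:
    # modified Z-score: 0.6745 * (x - median) / MAD, flag where it exceeds thresh
    med = np.median(values)
    mad = np.median(np.abs(values - med))
    if mad == 0:
        return np.zeros(len(values), dtype=bool)
    mod_z = 0.6745 * (values - med) / mad
    return np.abs(mod_z) > thresh

x = np.array([10.1, 10.2, 9.9, 10.0, 10.1, 14.5])
print(mad_outliers(x))   # only the last value exceeds the 3.5 cut-off
```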
+ + References + ---------- + [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm + """ + to_flag_frame[:] = False to_flag_index = to_flag_frame.index for var in fields: for index in enumerate(to_flag_index): index_slice = slice(index[1] - pd.Timedelta(reduction_range), index[1] + pd.Timedelta(reduction_range)) - test_slice = val_frame[var][index_slice] + test_slice = val_frame[var][index_slice].dropna() + # check, wheather value under test is sufficiently centered: + first_valid = test_slice.first_valid_index() + last_valid = test_slice.last_valid_index() + min_range = pd.Timedelta(reduction_range)/4 + polydeg = 2 + if ((pd.Timedelta(index[1] - first_valid) < min_range) | + (pd.Timedelta(last_valid - index[1]) < min_range)): + polydeg = 0 if reduction_drop_flagged: - test_slice = test_slice.drop(to_flag_index, errors="ignore") - if not test_slice.empty: - x = test_slice.index.values.astype(float) + test_slice = test_slice.drop(to_flag_index, errors='ignore') + if test_slice.shape[0] >= reduction_min_periods: + x = (test_slice.index.values.astype(float)) x_0 = x[0] - x = (x - x_0) / 10 ** 12 - polyfitted = poly.polyfit(y=test_slice.values, x=x, deg=2) - testval = poly.polyval((float(index[1].to_numpy()) - x_0) / 10 ** 12, polyfitted) + x = (x - x_0)/10**12 + polyfitted = poly.polyfit(y=test_slice.values, x=x, deg=polydeg) + testval = poly.polyval((float(index[1].to_numpy()) - x_0)/10**12, polyfitted) testval = val_frame[var][index[1]] - testval resids = test_slice.values - poly.polyval(x, polyfitted) med_resids = np.median(resids) @@ -180,7 +307,7 @@ def _reduceMVflags( return to_flag_frame -@register +@register(masking='all') def spikes_flagMultivarScores( data, field, @@ -195,19 +322,153 @@ def spikes_flagMultivarScores( expfit_binning="auto", stray_partition=None, stray_partition_min=0, - post_reduction=None, + post_reduction=False, reduction_range=None, reduction_drop_flagged=False, reduction_thresh=3.5, + reduction_min_periods=1, **kwargs, ): + """ + The algorithm implements a 3-step outlier detection procedure for simultaneously flagging of higher dimensional + data (dimensions > 3). + + In references [1], the procedure is introduced and exemplified with an application on hydrological data. + + See the notes section for an overview over the algorithms basic steps. + + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. (Here a dummy, for structural reasons) + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + fields : List[str] + List of fieldnames, corresponding to the variables that are to be included into the flagging process. + trafo : callable, default np.log + Transformation to be applied onto every column before scoring. Will likely get deprecated soon. Its better + to transform the data in a processing step, preceeeding the call to ``flagMultivarScores``. + alpha : float, default 0.05 + Level of significance by which it is tested, if an observations score might be drawn from another distribution + than the majority of the observation. + n_neighbors : int, default 10 + Number of neighbors included in the scoring process for every datapoint. + scoring_method : {'kNNSum', 'kNNMaxGap'}, default 'kNNMaxGap' + Scoring method applied. + ``'kNNSum'``: Assign to every point the sum of the distances to its 'n_neighbors' nearest neighbors. 
+ ``'kNNMaxGap'``: Assign to every point the distance to the neighbor with the "maximum gap" to its predecessor + in the hierarchy of the `n_neighbors` nearest neighbors. (see reference section for further descriptions) + iter_start : float, default 0.5 + Float in [0,1] that determines which percentage of data is considered "normal". 0.5 results in the threshing + algorithm to search only the upper 50 % of the scores for the cut off point. (See reference section for more + information) + threshing : {'stray', 'expfit'}, default 'stray' + A string, denoting the threshing algorithm to be applied on the observations scores. + See the documentations of the algorithms (``_stray``, ``_expfit``) and/or the references sections paragraph [2] + for more informations on the algorithms. + expfit_binning : {int, str}, default 'auto' + Controls the binning for the histogram in the ``expfit`` algorithms fitting step. + If an integer is passed, the residues will equidistantly be covered by `bin_frac` bins, ranging from the + minimum to the maximum of the residues. If a string is passed, it will be passed on to the + ``numpy.histogram_bin_edges`` method. + stray_partition : {None, str, int}, default None + Only effective when `threshing` = 'stray'. + Determines the size of the data partitions, the data is decomposed into. Each partition is checked seperately + for outliers. If a String is passed, it has to be an offset string and it results in partitioning the data into + parts of according temporal length. If an integer is passed, the data is simply split up into continous chunks + of `partition_freq` periods. if ``None`` is passed (default), all the data will be tested in one run. + stray_partition_min : int, default 0 + Only effective when `threshing` = 'stray'. + Minimum number of periods per partition that have to be present for a valid outlier detection to be made in + this partition. (Only of effect, if `stray_partition` is an integer.) + post_reduction : bool, default False + Wheather or not it should be tried to reduce the flag of an observation to one or more of its components. See + documentation of `_reduceMVflags` for more details. + reduction_range : {None, str}, default None + Only effective when `post_reduction` = True + An offset string, denoting the range of the temporal surrounding to include into the MAD testing while trying + to reduce flags. + reduction_drop_flagged : bool, default False + Only effective when `post_reduction` = True + Wheather or not to drop flagged values other than the value under test from the temporal surrounding + before checking the value with MAD. + reduction_thresh : float, default 3.5 + Only effective when `post_reduction` = True + The `critical` value, controlling wheather the MAD score is considered referring to an outlier or not. + Higher values result in less rigid flagging. The default value is widely considered apropriate in the + literature. + + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed, relatively to the flagger input. + + Notes + ----- + The basic steps are: + + 1. transforming + + The different data columns are transformed via timeseries transformations to + (a) make them comparable and + (b) make outliers more stand out. + + This step is usually subject to a phase of research/try and error. See [1] for more details. 
+ + Note, that the data transformation as an built-in step of the algorithm, will likely get deprecated soon. Its better + to transform the data in a processing step, preceeding the multivariate flagging process. Also, by doing so, one + gets mutch more control and variety in the transformation applied, since the `trafo` parameter only allows for + application of the same transformation to all of the variables involved. + + 2. scoring + + Every observation gets assigned a score depending on its k nearest neighbors. See the `scoring_method` parameter + description for details on the different scoring methods. Furthermore [1], [2] may give some insight in the + pro and cons of the different methods. + + 3. threshing + + The gaps between the (greatest) scores are tested for beeing drawn from the same + distribution as the majority of the scores. If a gap is encountered, that, with sufficient significance, can be + said to not be drawn from the same distribution as the one all the smaller gaps are drawn from, than + the observation belonging to this gap, and all the observations belonging to gaps larger then this gap, get flagged + outliers. See description of the `threshing` parameter for more details. Although [2] gives a fully detailed + overview over the `stray` algorithm. + + References + ---------- + Odd Water Algorithm: + + [1] Talagala, P.D. et al (2019): A Feature-Based Procedure for Detecting Technical Outliers in Water-Quality Data + From In Situ Sensors. Water Ressources Research, 55(11), 8547-8568. + + A detailed description of the stray algorithm: + + [2] Talagala, P. D., Hyndman, R. J., & Smith-Miles, K. (2019). Anomaly detection in high dimensional data. + arXiv preprint arXiv:1908.04000. + + A detailed description of the MAD outlier scoring: + + [3] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm + """ # data fransformation/extraction + data = data.copy() + fields = toSequence(fields) val_frame = data[fields] val_frame = val_frame.loc[val_frame.index_of("shared")].to_df() val_frame.dropna(inplace=True) val_frame = val_frame.apply(trafo) + if val_frame.empty: + return data, flagger + if threshing == "stray": to_flag_index = _stray( val_frame, @@ -220,26 +481,22 @@ def spikes_flagMultivarScores( ) else: - to_flag_index = _expFit( - val_frame, - scoring_method=scoring_method, - n_neighbors=n_neighbors, - iter_start=iter_start, - alpha=alpha, - bin_frac=expfit_binning, - ) + val_frame = val_frame.apply(trafo) + to_flag_index = _expFit(val_frame, + scoring_method=scoring_method, + n_neighbors=n_neighbors, + iter_start=iter_start, + alpha=alpha, + bin_frac=expfit_binning) to_flag_frame = pd.DataFrame({var_name: True for var_name in fields}, index=to_flag_index) if post_reduction: - to_flag_frame = _reduceMVflags( - val_frame, - fields, - flagger, - to_flag_frame, - reduction_range, - reduction_drop_flagged=reduction_drop_flagged, - reduction_thresh=reduction_thresh, - ) + val_frame = data[toSequence(fields)].to_df() + to_flag_frame = _reduceMVflags(val_frame, fields, flagger, to_flag_frame, reduction_range, + reduction_drop_flagged=reduction_drop_flagged, + reduction_thresh=reduction_thresh, + reduction_min_periods=reduction_min_periods) + for var in fields: to_flag_ind = to_flag_frame.loc[:, var] @@ -249,7 +506,7 @@ def spikes_flagMultivarScores( return data, flagger -@register +@register(masking='field') def spikes_flagRaise( data, field, @@ -264,13 +521,68 @@ def spikes_flagRaise( numba_boost=True, **kwargs, ): + """ + The function flags rises and drops in value 
courses that exceed a certain threshold
+    within a certain timespan.
+
+    The large parameter set of the function is owed to the tricky
+    case of values that "return" from outlier-like or anomalous value levels and
+    thus exceed the threshold, while actually being usual values.

-    # NOTE1: this implementation accounts for the case of "pseudo" spikes that result from checking against outliers
-    # NOTE2: the test is designed to work on raw data as well as on regularized
-    #
-    # See saqc documentation at:
-    # https://git.ufz.de/rdm-software/saqc/blob/develop/docs/funcs/SpikeDetection.md
-    # for more details
+    NOTE, the dataset does NOT need to be harmonized to a time series with an
+    equidistant frequency grid.
+
+    Parameters
+    ----------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    field : str
+        The fieldname of the column, holding the data-to-be-flagged.
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional information related to `data`.
+    thresh : float
+        The threshold for the total rise (thresh > 0), or total drop (thresh < 0), that value courses must
+        not exceed within a timespan of length `raise_window`.
+    raise_window : str
+        An offset string, determining the timespan the rise/drop thresholding refers to. The window is
+        inclusively defined.
+    intended_freq : str
+        An offset string, determining the frequency the timeseries to-be-flagged is expected to be sampled at.
+        The window is inclusively defined.
+    average_window : {None, str}, default None
+        See condition (2) of the description given in the notes. The window is inclusively defined and
+        defaults to 1.5 times the size of `raise_window`.
+    mean_raise_factor : float, default 2
+        See condition (2) of the description given in the notes.
+    min_slope : {None, float}, default None
+        See condition (3) of the description given in the notes.
+    min_slope_weight : float, default 0.8
+        See condition (3) of the description given in the notes.
+    numba_boost : bool, default True
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional information related to `data`.
+        Flags values may have changed relative to the flagger input.
+
+    Notes
+    -----
+    The value :math:`x_{k}` of a time series :math:`x` with associated
+    timestamps :math:`t_i` is flagged a rise, if:
+
+    1. There is any value :math:`x_{s}`, preceding :math:`x_{k}` within `raise_window` range, so that
+       :math:`M = |x_k - x_s| >` `thresh` :math:`> 0`.
+    2. The weighted average :math:`\mu^*` of the values preceding :math:`x_{k}` within `average_window`
+       range indicates that :math:`x_{k}` does not return from an outlier-like value course, meaning that
+       :math:`x_k > \mu^* + (M /` `mean_raise_factor` :math:`)`.
+    3. 
Additionally, if `min_slope` is not `None`, :math:`x_{k}` is checked for being sufficiently divergent from its + very predecessor $`x_{k-1}`$, meaning that, it is additionally checked if: + * :math:`x_k - x_{k-1} > ` `min_slope` + * :math:`t_k - t_{k-1} > ` `min_slope_weight`*`intended_freq` + + """ # prepare input args dataseries = data[field].dropna() @@ -309,7 +621,7 @@ def spikes_flagRaise( if raise_series.isna().all(): return data, flagger - # "unflag" values of unsifficient deviation to theire predecessors + # "unflag" values of insufficient deviation to their predecessors if min_slope is not None: w_mask = ( pd.Series(dataseries.index).diff().dt.total_seconds() / intended_freq.total_seconds() @@ -355,11 +667,12 @@ def spikes_flagRaise( return data, flagger -@register +@register(masking='field') def spikes_flagSlidingZscore( data, field, flagger, window, offset, count=1, polydeg=1, z=3.5, method="modZ", **kwargs, ): - """ An outlier detection in a sliding window. The method for detection can be a simple Z-score or the more robust + """ + An outlier detection in a sliding window. The method for detection can be a simple Z-score or the more robust modified Z-score, as introduced here [1]. The steps are: @@ -370,20 +683,40 @@ def spikes_flagSlidingZscore( 5. processing continue at 1. until end of data. 6. all potential outlier, that are detected `count`-many times, are promoted to real outlier and flagged by the `flagger` - :param data: pandas dataframe. holding the data - :param field: fieldname in `data`, which holds the relevant infos - :param flagger: flagger. - :param window: int or time-offset string (see [2]). The size of the window the outlier detection is run in. default: 1h - :param offset: int or time-offset string (see [2]). Stepsize the window is set further. default: 1h - :param method: str. `modZ` or `zscore`. see [1] at section `Z-Scores and Modified Z-Scores` - :param count: int. this many times, a datapoint needs to be detected in different windows, to be finally - flagged as outlier - :param polydeg: The degree for the polynomial fit, to calculate the residuum - :param z: float. the value the (mod.) Z-score is tested against. Defaulting to 3.5 (Recommendation of [1]) - - Links: + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + window: {int, str} + Integer or offset string (see [2]). The size of the window the outlier detection is run in. + offset: {int, str} + Integer or offset string (see [2]). Stepsize the window is set further. default: 1h + count: int, default 1 + Number of times a value has to be classified an outlier in different windows, to be finally flagged an outlier. + polydeg : int, default 1 + The degree for the polynomial that is fitted to the data in order to calculate the residues. + z : float, default 3.5 + The value the (mod.) Z-score is tested against. Defaulting to 3.5 (Recommendation of [1]) + method: {'modZ', zscore}, default 'modZ' + See section `Z-Scores and Modified Z-Scores` in [1]. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed, relatively to the flagger input. 
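(Editor's illustration, not part of the patch: the `modZ` method described above uses the MAD-based modified Z-score from reference [1]. A minimal, hedged sketch of that statistic in plain pandas follows; it deliberately omits the sliding window and the polynomial residue fitting the function performs, and the helper name is made up.)

```python
import pandas as pd

def mod_zscore_outliers(x: pd.Series, z: float = 3.5) -> pd.Series:
    """Boolean mask of outliers in one window, via the modified Z-score
    M_i = 0.6745 * (x_i - median) / MAD (see reference [1])."""
    med = x.median()
    mad = (x - med).abs().median()
    if mad == 0:
        # window is (nearly) constant -> nothing can be scored
        return pd.Series(False, index=x.index)
    m = 0.6745 * (x - med) / mad
    return m.abs() > z

s = pd.Series([10.1, 10.0, 9.9, 25.0, 10.2])
print(mod_zscore_outliers(s))  # only the 25.0 exceeds the threshold
```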
+
+    References
+    ----------
     [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
     [2] https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
+
     """
 
     use_offset = False
@@ -481,21 +814,42 @@ def spikes_flagSlidingZscore(
     return data, flagger
 
 
-@register
+@register(masking='field')
 def spikes_flagMad(data, field, flagger, window, z=3.5, **kwargs):
-    """ The function represents an implementation of the modyfied Z-score outlier detection method, as introduced here:
-    [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
+    """
 
-    The test needs the input data to be harmonized to an equidustant time stamp series (=have frequencie))
+    The function represents an implementation of the modified Z-score outlier detection method.
+
+    See reference [1] for more details on the algorithm.
+
+    Note that the test needs the input data to be sampled regularly (fixed sampling rate).
+
+    Parameters
+    ----------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    field : str
+        The fieldname of the column, holding the data-to-be-flagged.
+    flagger : saqc.flagger
+        A flagger object, holding flags and additional information related to `data`.
+    window : str
+        Offset string. Denoting the window size that the "Z-scored" values have to lie in.
+    z : float, default 3.5
+        The value the Z-score is tested against. Defaulting to 3.5 (recommendation of [1])
+
+    Returns
+    -------
+    data : dios.DictOfSeries
+        A dictionary of pandas.Series, holding all the data.
+    flagger : saqc.flagger
+        The flagger object, holding flags and additional information related to `data`.
+        Flag values may have changed relative to the flagger input.
+
+    References
+    ----------
+    [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
 
-    :param data: The pandas dataframe holding the data-to-be flagged.
-                 Data must be indexed by a datetime series and be harmonized onto a
-                 time raster with seconds precision.
-    :param field: Fieldname of the Soil moisture measurements field in data.
-    :param flagger: A flagger - object. (saqc.flagger.X)
-    :param winsz: Offset String. Denoting the windows size that that th "Z-scored" values have to lie in.
-    :param z: Float. The value the Z-score is tested against. Defaulting to 3.5 (Recommendation of [1])
     """
     d = data[field].copy().mask(flagger.isFlagged(field))
     median = d.rolling(window=window, closed="both").median()
@@ -519,36 +873,55 @@ def spikes_flagMad(data, field, flagger, window, z=3.5, **kwargs):
     return data, flagger
 
 
-@register
+@register(masking='field')
 def spikes_flagBasic(data, field, flagger, thresh=7, tolerance=0, window="15min", **kwargs):
     """
     A basic outlier test that is designed to work for harmonized and not harmonized data.
 
+    The test classifies values/value courses as outliers by detecting not only a rise in value, but also
+    checking for a return to the initial value level.
+
     Values x(n), x(n+1), .... , x(n+k) of a timeseries x are considered spikes, if
 
-    (1) |x(n-1) - x(n + s)| > "thresh", for all s in [0,1,2,...,k]
+    (1) |x(n-1) - x(n + s)| > `thresh`, for all s in [0,1,2,...,k]
 
-    (2) |x(n-1) - x(n+k+1)| < tol
+    (2) |x(n-1) - x(n+k+1)| < `tolerance`
 
-    (3) |x(n-1).index - x(n+k+1).index| < length
+    (3) |x(n-1).index - x(n+k+1).index| < `window`
 
     Note, that this definition of a "spike" not only includes one-value outliers, but also plateau-ish value courses.
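(Editor's illustration, not part of the patch: a minimal sketch of how conditions (1)-(3) above classify a candidate value course, using plain pandas and a made-up helper name; the library's actual, vectorized implementation differs.)

```python
import pandas as pd

def is_spike_course(x: pd.Series, n: int, k: int, thresh: float, tolerance: float, window: str) -> bool:
    """Check conditions (1)-(3) for the values x[n], ..., x[n+k] of a
    datetime-indexed series x (illustrative only)."""
    anchor = x.iloc[n - 1]                                   # last "normal" value before the course
    course = x.iloc[n:n + k + 1]
    cond1 = (course - anchor).abs().gt(thresh).all()         # (1) every value in the course jumped away
    cond2 = abs(anchor - x.iloc[n + k + 1]) < tolerance      # (2) the course returns to the initial level
    cond3 = (x.index[n + k + 1] - x.index[n - 1]) < pd.Timedelta(window)  # (3) ...within `window`
    return bool(cond1 and cond2 and cond3)

idx = pd.date_range("2020-01-01", periods=6, freq="4min")
x = pd.Series([10.0, 10.1, 25.0, 26.0, 10.2, 10.1], index=idx)
print(is_spike_course(x, n=2, k=1, thresh=7, tolerance=0.5, window="15min"))  # True: a two-value plateau spike
```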
+ + Parameters + ---------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. (Here a dummy, for structural reasons) + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + thresh : float, default 7 + Minimum difference between to values, to consider the latter one as a spike. See condition (1) + tolerance : float, default 0 + Maximum difference between pre-spike and post-spike values. See condition (2) + window : str, default '15min' + Maximum length of "spiky" value courses. See condition (3) + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed, relatively to the flagger input. + + References + ---------- The implementation is a time-window based version of an outlier test from the UFZ Python library, that can be found here: https://git.ufz.de/chs/python/blob/master/ufz/level1/spike.py - - :param data: Pandas-like. The pandas dataframe holding the data-to-be flagged. - :param field: String. Fieldname of the data column to be tested. - :param flagger: saqc.flagger. A flagger - object. - :param thresh: Float. The lower bound for a value jump, to be considered as initialising a spike. - (see condition (1) in function description). - :param tolerance: Float. Tolerance value. (see condition (2) in function description) - :param window_size: Offset String. The time span in wich the values of a spikey course have to return to the normal - value course (see condition (3) in function description). - :return: """ dataseries = data[field].dropna() @@ -605,7 +978,7 @@ def spikes_flagBasic(data, field, flagger, thresh=7, tolerance=0, window="15min" return data, flagger -@register +@register(masking='field') def spikes_flagSpektrumBased( data, field, @@ -620,69 +993,82 @@ def spikes_flagSpektrumBased( **kwargs, ): """ - This Function is a generalization of the Spectrum based Spike flagging mechanism as presented in: - - Dorigo,W,.... Global Automated Quality Control of In Situ Soil Moisture Data from the international - Soil Moisture Network. 2013. Vadoze Zone J. doi:10.2136/vzj2012.0097. Function detects and flags spikes in input data series by evaluating its derivatives and applying some conditions to it. A datapoint is considered a spike, if: (1) the quotient to its preceeding datapoint exceeds a certain bound - (controlled by param "raise_factor") + (controlled by param `raise_factor`) (2) the quotient of the datas second derivate at the preceeding and subsequent timestamps is close enough to 1. - (controlled by param "deriv_factor") + (controlled by param `deriv_factor`) (3) the surrounding data is not too noisy. (Coefficient of Variation[+/- noise_window] < 1) - (controlled by param "noise_thresh") - - Some things you should be conscious about when applying this test: - - NOTE1: You should run less complex tests, especially range-tests, or absolute spike tests previously to this one, - since the spike check for any potential, unflagged spike, is relatively costly - (1 x smoothing + 2 x deviating + 2 x condition application). - - NOTE2: Due to inconsistency in the paper that provided the concept of this test [paper:], its not really clear - weather to use the coefficient of variance or the relative variance for noise testing. 
- Since the relative variance was explicitly denoted in the formulas, the function defaults to relative variance, - but can be switched to coefficient of variance, by assignment to parameter "noise statistic". - - - :param data: The pandas dataframe holding the data-to-be flagged. - Data must be indexed by a datetime series and be harmonized onto a - time raster with seconds precision. - :param field: Fieldname of the Soil moisture measurements field in data. - :param flagger: A flagger - object. (saqc.flagger.X) - :param smooth_window: Offset string. Size of the filter window, used to calculate the derivatives. - (relevant only, if: diff_method='savgol') - :param smooth_poly_deg: Integer. Polynomial order, used for smoothing with savitzk golay filter. - (relevant only, if: diff_method='savgol') - :param raise_factor: A float, determinating the bound, the quotient of two consecutive values - has to exceed, to be regarded as potentially spike. A value of 0.x will - trigger the spike test for value y_t, if: - (y_t)/(y_t-1) > 1 + x or: - (y_t)/(y_t-1) < 1 - x. - :param deriv_factor: A float, determining the interval, the quotient of the datas second derivate - around a potential spike has to be part of, to trigger spike flagging for - this value. A datapoint y_t will pass this spike condition if, - for deriv_factor = 0.x, and the second derivative y'' of y, the condition: - 1 - x < abs((y''_t-1)/(y''_t+1)) < 1 + x - holds - :param noise_thresh: A float, determining the bound, the data noisy-ness around a potential spike - must not exceed, in order to guarantee a justifyed judgement: - Therefor the coefficient selected by parameter noise_func (COVA), - of all values within t +/- param "noise_window", - but excluding the point y_t itself, is evaluated and tested - for: COVA < noise_thresh. - :param noise_window: Offset string, determining the size of the window, the coefficient of - variation is calculated of, to determine data noisy-ness around a potential - spike. - The potential spike y_t will be centered in a window of expansion: - [y_t - noise_window_size, y_t + noise_window_size]. - :param noise_func: String. Determines, wheather to use - "relative variance" or "coefficient of variation" to check against the noise - barrier. - 'CoVar' -> "Coefficient of variation" - 'rVar' -> "relative Variance" + (controlled by param `noise_thresh`) + + Note, that the data-to-be-flagged is supposed to be sampled at an equidistant frequency grid + + Note, that the derivative is calculated after applying a Savitsky-Golay filter to the data. + + Parameters + ---------- + + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + raise_factor : float, default 0.15 + Minimum relative value difference between two values to consider the latter as a spike candidate. + See condition (1) (or reference [2]). + deriv_factor : float, default 0.2 + See condition (2) (or reference [2]). + noise_func : {'CoVar', 'rVar'}, default 'CoVar' + Function to calculate noisiness of the data surrounding potential spikes. + ``'CoVar'``: Coefficient of Variation + ``'rVar'``: Relative Variance + noise_window : str, default '12h' + An offset string that determines the range of the time window of the "surrounding" data of a potential spike. + See condition (3) (or reference [2]). 
+ noise_thresh : float, default 1 + Upper threshold for noisiness of data surrounding potential spikes. See condition (3) (or reference [2]). + smooth_window : {None, str}, default None + Size of the smoothing window of the Savitsky-Golay filter. + The default value ``None`` results in a window of two times the sampling rate (i.e. containing three values). + smooth_poly_deg : int, default 2 + Degree of the polynomial used for fitting with the Savitsky-Golay filter. + + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. + + References + ---------- + This Function is a generalization of the Spectrum based Spike flagging mechanism as presented in: + + [1] Dorigo, W. et al: Global Automated Quality Control of In Situ Soil Moisture + Data from the international Soil Moisture Network. 2013. Vadoze Zone J. + doi:10.2136/vzj2012.0097. + + Notes + ----- + A value is flagged a spike, if: + + 1. The quotient to its preceding data point exceeds a certain bound: + * :math:`|\frac{x_k}{x_{k-1}}| > 1 + ` `raise_factor`, or + * :math:`|\frac{x_k}{x_{k-1}}| < 1 - ` `raise_factor` +2. The quotient of the second derivative :math:`x''`, at the preceding + and subsequent timestamps is close enough to 1: + * :math:` |\frac{x''_{k-1}}{x''_{k+1}} | > 1 - ` `deriv_factor`, and + * :math:` |\frac{x''_{k-1}}{x''_{k+1}} | < 1 + ` `deriv_factor` +3. The dataset :math:`X = x_i, ..., x_{k-1}, x_{k+1}, ..., x_j`, with + :math:`|t_{k-1} - t_i| = |t_j - t_{k+1}| =` `noise_window` fulfills the + following condition: + `noise_func`:math:`(X) <` `noise_thresh` + """ dataseries, data_rate = retrieveTrustworthyOriginal(data, field, flagger) @@ -743,39 +1129,60 @@ def spikes_flagSpektrumBased( return data, flagger +@register(masking='field') def spikes_flagGrubbs(data, field, flagger, winsz, alpha=0.05, min_periods=8, check_lagged=False, **kwargs): """ The function flags values that are regarded outliers due to the grubbs test. - (https://en.wikipedia.org/wiki/Grubbs%27s_test_for_outliers) + See reference [1] for more information on the grubbs tests definition. - The (two-sided) test gets applied onto data chunks of size "winsz". The tests appliccation will + The (two-sided) test gets applied onto data chunks of size "winsz". The tests application will be iterated on each data-chunk under test, till no more outliers are detected in that chunk. Note, that the test performs poorely for small data chunks (resulting in heavy overflagging). Therefor you should select "winsz" so that every window contains at least > 8 values and also adjust the min_periods values accordingly. - Note, that the data to be tested by the grubbs test are expected to be "normallish" distributed. + Note, that the data to be tested by the grubbs test are expected to be distributed "normalish". Parameters ---------- - winsz : Integer or Offset String + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + field : str + The fieldname of the column, holding the data-to-be-flagged. + flagger : saqc.flagger + A flagger object, holding flags and additional Informations related to `data`. + winsz : {int, str} The size of the window you want to use for outlier testing. If an integer is passed, the size - refers to the number of periods of every testing window. 
If an offset string is passed, - the size refers to the total temporal extension of every window. - even. - alpha : float - The level of significance the grubbs test is to be performed at. (between 0 and 1) - min_periods : Integer - The minimum number of values present in a testing interval for a grubbs test result to be accepted. Only - makes sence in case "winsz" is an offset string. + refers to the number of periods of every testing window. If a string is passed, it has to be an offset string, + and will denote the total temporal extension of every window. + alpha : float, default 0.05 + The level of significance, the grubbs test is to be performed at. (between 0 and 1) + min_periods : int, default 8 + The minimum number of values that have to be present in an interval under test, for a grubbs test result to be + accepted. Only makes sence in case `winsz` is an offset string. check_lagged: boolean, default False - If True, every value gets checked twice for being an outlier, ones in the initial rolling window and one more time - in a rolling window that is lagged by half the windows delimeter (winsz/2). Recommended for avoiding false + If True, every value gets checked twice for being an outlier. Ones in the initial rolling window and one more + time in a rolling window that is lagged by half the windows delimeter (winsz/2). Recommended for avoiding false positives at the window edges. Only available when rolling with integer defined window size. + Returns + ------- + data : dios.DictOfSeries + A dictionary of pandas.Series, holding all the data. + flagger : saqc.flagger + The flagger object, holding flags and additional Informations related to `data`. + Flags values may have changed relatively to the flagger input. + + References + ---------- + introduction to the grubbs test: + + [1] https://en.wikipedia.org/wiki/Grubbs%27s_test_for_outliers + """ + data = data.copy() datcol = data[field] to_group = pd.DataFrame(data={"ts": datcol.index, "data": datcol}) diff --git a/saqc/lib/plotting.py b/saqc/lib/plotting.py index 9b780d70198b3e6085b6fda21f5524c7d9c619a6..0a9ac3066895e89575a84232dbef171d85ca12f4 100644 --- a/saqc/lib/plotting.py +++ b/saqc/lib/plotting.py @@ -5,7 +5,7 @@ import logging import numpy as np import pandas as pd -import dios.dios as dios +import dios import matplotlib.pyplot as plt from typing import List, Dict, Optional from saqc.flagger import BaseFlagger diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index 040f85a11f36ae7b84b97992dc48e5b90fad495f..6bc9a79a576261eeeda1e6a83d9c68bc1763c88b 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -7,13 +7,14 @@ from typing import Sequence, Union, Any, Iterator import numpy as np import numba as nb import pandas as pd - +import logging import dios -import inspect + # from saqc.flagger import BaseFlagger from saqc.lib.types import T +logger = logging.getLogger("SaQC") def assertScalar(name, value, optional=False): if (not np.isscalar(value)) and (value is not None) and (optional is True): @@ -337,12 +338,18 @@ def groupConsecutives(series: pd.Series) -> Iterator[pd.Series]: start = stop -def mergeDios(left, right, join="merge"): +def mergeDios(left, right, subset=None, join="merge"): # use dios.merge() as soon as it implemented # see https://git.ufz.de/rdm/dios/issues/15 merged = left.copy() - shared_cols = left.columns.intersection(right.columns) + if subset is not None: + right_subset_cols = right.columns.intersection(subset) + else: + right_subset_cols = right.columns + + shared_cols = 
left.columns.intersection(right_subset_cols) + for c in shared_cols: l, r = left[c], right[c] if join == "merge": @@ -357,7 +364,7 @@ def mergeDios(left, right, join="merge"): l, r = l.align(r, join=join) merged[c] = l.combine_first(r) - newcols = right.columns.difference(merged.columns) + newcols = right_subset_cols.difference(left.columns) for c in newcols: merged[c] = right[c].copy() @@ -383,3 +390,4 @@ def mutateIndex(index, old_name, new_name): index = index.drop(index[pos]) index = index.insert(pos, new_name) return index + diff --git a/saqc/lib/types.py b/saqc/lib/types.py index 0a6b9643bc59c3669f33e5c4f1c332dc225982c1..facebe59987a4d3a74352b7146a0ab2320f8a73f 100644 --- a/saqc/lib/types.py +++ b/saqc/lib/types.py @@ -5,7 +5,7 @@ from typing import TypeVar, Union import numpy as np import pandas as pd -import dios.dios as dios +import dios T = TypeVar("T") ArrayLike = TypeVar("ArrayLike", np.ndarray, pd.Series, pd.DataFrame) diff --git a/setup.py b/setup.py index 28dc64c8ba650fb232fca9cd5eff288ca746dc13..c152bfee6feab1ff885576b1e047d10c7b7b3426 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ setup( "pyarrow", "python-intervals", "astor", + "dios" ], license="GPLv3", entry_points={"console_scripts": ["saqc=saqc.__main__:main"],}, diff --git a/sphinx-doc/.gitignore b/sphinx-doc/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..16f05fe5bcb986845d46c131c0e10447581670b2 --- /dev/null +++ b/sphinx-doc/.gitignore @@ -0,0 +1,6 @@ + +_api/ +_build/ +_static/ +*.automodsumm +_static/* \ No newline at end of file diff --git a/sphinx-doc/FlagFunctions.rst b/sphinx-doc/FlagFunctions.rst new file mode 100644 index 0000000000000000000000000000000000000000..584d0dc5f48d0434a02d86ecbf884e47188d7236 --- /dev/null +++ b/sphinx-doc/FlagFunctions.rst @@ -0,0 +1,7 @@ + +Functions +========= + +.. automodapi:: saqc.funcs + :skip: register + diff --git a/sphinx-doc/Makefile b/sphinx-doc/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..efdfe91d931f29bf94f8b1f08c1b6f3d0661c8ab --- /dev/null +++ b/sphinx-doc/Makefile @@ -0,0 +1,27 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile clean + +# clean sphinx generated stuff +clean: + rm -rf _build _static _api + rm -f *.automodsumm + mkdir _static + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + diff --git a/sphinx-doc/conf.py b/sphinx-doc/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..77bdd67bc2a821b18f35e06557be2ea665b6a3c2 --- /dev/null +++ b/sphinx-doc/conf.py @@ -0,0 +1,111 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + + +# -- Project information ----------------------------------------------------- + +project = 'SaQC' +copyright = '2020, Bert Palm, David Schäfer, Peter Lünenschloß, Lennart Schmidt, Juliane Geller' +author = 'Bert Palm, David Schäfer, Peter Lünenschloß, Lennart Schmidt, Juliane Geller' + +# The full version, including alpha/beta/rc tags +release = 'develop' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + # "sphinx.ext.doctest", + # "sphinx.ext.extlinks", + # "sphinx.ext.todo", + # "sphinx.ext.intersphinx", + # "sphinx.ext.coverage", + # "sphinx.ext.mathjax", + # "sphinx.ext.ifconfig", + "sphinx.ext.autosectionlabel", + + # link source code + "sphinx.ext.viewcode", + + # add suupport for NumPy style docstrings + "sphinx.ext.napoleon", + + # Doc a whole module + # see https://sphinx-automodapi.readthedocs.io/en/latest/ + 'sphinx_automodapi.automodapi', + # 'sphinx_automodapi.smart_resolver', + # see https://sphinxcontrib-fulltoc.readthedocs.io/en/latest/ + 'sphinxcontrib.fulltoc', + + # Markdown sources support + # https://recommonmark.readthedocs.io/en/latest/ + 'recommonmark', + # https://github.com/ryanfox/sphinx-markdown-tables + 'sphinx_markdown_tables', +] + + +# -- Params of the extensions ------------------------------------------------ + +numpydoc_show_class_members = False + +automodsumm_inherited_members = True +# write out the files generated by automodapi, mainly for debugging +automodsumm_writereprocessed = True + +automodapi_inheritance_diagram = False +automodapi_toctreedirnm = '_api' +autosectionlabel_prefix_document = True + + +# -- Other options ----------------------------------------------------------- + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +source_suffix = ['.rst', '.md'] + + +# -- Options for HTML output ------------------------------------------------- + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = "nature" + +# use pandas theme +# html_theme = "pydata_sphinx_theme" + + +# html_theme_options = { +# } + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] \ No newline at end of file diff --git a/sphinx-doc/flagger.rst b/sphinx-doc/flagger.rst new file mode 100644 index 0000000000000000000000000000000000000000..d8536aa3e39c53d92aa11b9057b92ceef84b7535 --- /dev/null +++ b/sphinx-doc/flagger.rst @@ -0,0 +1,11 @@ + +Flagger +======= + +.. 
automodapi:: saqc.flagger + :include-all-objects: + :no-heading: + + + + diff --git a/sphinx-doc/index.rst b/sphinx-doc/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..c8d40fbd90e200486bdc243c67445ccd0e690015 --- /dev/null +++ b/sphinx-doc/index.rst @@ -0,0 +1,31 @@ +.. SaQC documentation master file, created by + sphinx-quickstart on Mon Aug 17 12:11:29 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to SaQC's documentation! +================================ + +Saqc is a great tool to clean data from rubbish. + +.. toctree:: + :hidden: + + Repository <https://git.ufz.de/rdm-software/saqc> + +.. toctree:: + :maxdepth: 2 + + flagger + +.. toctree:: + :maxdepth: 2 + + FlagFunctions + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/sphinx-doc/make.bat b/sphinx-doc/make.bat new file mode 100644 index 0000000000000000000000000000000000000000..6247f7e231716482115f34084ac61030743e0715 --- /dev/null +++ b/sphinx-doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/sphinx-doc/requirements_sphinx.txt b/sphinx-doc/requirements_sphinx.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c284aedaad5c1ca6bc481b58ff649dbae265ec0 --- /dev/null +++ b/sphinx-doc/requirements_sphinx.txt @@ -0,0 +1,68 @@ +alabaster==0.7.12 +appdirs==1.4.4 +astor==0.8.1 +attrs==20.2.0 +Babel==2.8.0 +black==20.8b1 +certifi==2020.6.20 +chardet==3.0.4 +Click==7.0 +commonmark==0.9.1 +cycler==0.10.0 +decorator==4.4.2 +dios==0.6.0 +docutils==0.16 +dtw==1.4.0 +idna==2.10 +imagesize==1.2.0 +importlib-metadata==1.5.0 +Jinja2==2.11.2 +joblib==0.14.1 +kiwisolver==1.1.0 +llvmlite==0.34.0 +Markdown==3.2.2 +MarkupSafe==1.1.1 +matplotlib==3.3.1 +mlxtend==0.17.2 +more-itertools==8.5.0 +numba==0.51.2 +numpy==1.19.2 +outlier==0.2 +outlier-utils==0.0.3 +packaging==20.1 +pandas==1.1.2 +pathspec==0.8.0 +pluggy==0.13.1 +py==1.8.1 +pyarrow==1.0.1 +Pygments==2.6.1 +pyparsing==2.4.6 +pytest==5.3.5 +pytest-lazy-fixture==0.6.3 +python-dateutil==2.8.1 +python-intervals==1.10.0 +pytz==2019.3 +PyWavelets==1.1.1 +recommonmark==0.6.0 +regex==2020.7.14 +requests==2.24.0 +scikit-learn==0.22.1 +scipy==1.4.1 +six==1.14.0 +snowballstemmer==2.0.0 +Sphinx==3.2.1 +sphinx-automodapi==0.12 +sphinx-markdown-tables==0.0.15 +sphinxcontrib-applehelp==1.0.2 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-fulltoc==1.2.0 +sphinxcontrib-htmlhelp==1.0.3 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-serializinghtml==1.1.4 +toml==0.10.1 +typed-ast==1.4.1 +urllib3==1.25.10 +utils==1.0.1 +wcwidth==0.1.8 +zipp==2.2.0 diff --git a/test/core/test_core.py b/test/core/test_core.py index 
21a9624a3d84e2c435a988e00d0fe1deed2be976..f6753980ab7a70fc3c90a6fe21cae37b7c761278 100644 --- a/test/core/test_core.py +++ b/test/core/test_core.py @@ -21,7 +21,7 @@ logging.disable(logging.CRITICAL) OPTIONAL = [False, True] -@register +@register(masking='field') def flagAll(data, field, flagger, **kwargs): # NOTE: remember to rename flag -> flag_values return data, flagger.setFlags(field=field, flag=flagger.BAD) @@ -38,10 +38,10 @@ def flags(flagger, data, optional): return flagger.initFlags(data[data.columns[::2]])._flags -@pytest.mark.skip(reason="does not make sense anymore") @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_errorHandling(data, flagger): - @register + + @register(masking='field') def raisingFunc(data, field, flagger, **kwargs): raise TypeError @@ -51,6 +51,9 @@ def test_errorHandling(data, flagger): # NOTE: should not fail, that's all we are testing here SaQC(flagger, data, error_policy=policy).raisingFunc(var1).getResult() + with pytest.raises(TypeError): + SaQC(flagger, data, error_policy='raise').raisingFunc(var1).getResult() + @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_duplicatedVariable(flagger): @@ -79,7 +82,7 @@ def test_assignVariable(flagger): pdata, pflagger = SaQC(flagger, data).flagAll(var1).flagAll(var2).getResult() pflags = pflagger.getFlags() - assert (pflags.columns == [var1, var2]).all() + assert (set(pflags.columns) == {var1, var2}) assert pflagger.isFlagged(var2).empty @@ -99,108 +102,6 @@ def test_dtypes(data, flagger, flags): assert dict(flags.dtypes) == dict(pflags.dtypes) -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_masking(data, flagger): - """ - test if flagged values are exluded during the preceding tests - """ - flagger = flagger.initFlags(data) - flags = flagger.getFlags() - var1 = 'var1' - mn = min(data[var1]) - mx = max(data[var1]) / 2 - - qc = SaQC(flagger, data) - qc = qc.flagRange(var1, mn, mx) - # min is not considered because its the smalles possible value. - # if masking works, `data > max` will be masked, - # so the following will deliver True for in range (data < max), - # otherwise False, like an inverse range-test - qc = qc.procGeneric("dummy", func=lambda var1: var1 >= mn) - - pdata, pflagger = qc.getResult() - out_of_range = pflagger.isFlagged(var1) - in_range = ~out_of_range - - assert (pdata.loc[out_of_range, "dummy"] == False).all() - assert (pdata.loc[in_range, "dummy"] == True).all() - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_masking_UnmaskingOnDataChange(data, flagger): - """ test if (un)masking works as expected on data-change. - - If the data change in the func, unmasking should respect this changes and - should not reapply original data, instead take the new data (and flags) as is. - Also if flags change, the data should be taken as is. 
- """ - FILLER = -9999 - - @register - def changeData(data, field, flagger, **kwargs): - mask = data.isna() - data.aloc[mask] = FILLER - return data, flagger - - @register - def changeFlags(data, field, flagger, **kwargs): - mask = data.isna() - flagger = flagger.setFlags(field, loc=mask[field], flag=flagger.UNFLAGGED, force=True) - return data, flagger - - var = data.columns[0] - var_data = data[var] - mn, mx = var_data.max() * .25, var_data.max() * .75 - range_mask = (var_data < mn) | (var_data > mx) - - qc = SaQC(flagger, data) - qc = qc.flagRange(var, mn, mx) - qcD = qc.changeData(var) - qcF = qc.changeFlags(var) - - data, flagger = qcD.getResult() - assert (data[var][range_mask] == FILLER).all(axis=None) - # only flags change so the data should be still NaN, because - # the unmasking was disabled, but the masking indeed was happening - data, flagger = qcF.getResult() - assert data[var][range_mask].isna().all(axis=None) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_shapeDiffUnmasking(data, flagger): - """ test if (un)masking works as expected on index-change. - - If the index of data (and flags) change in the func, the unmasking, - should not reapply original data, instead take the new data (and flags) as is. - """ - - FILLER = -1111 - - @register - def pseudoHarmo(data, field, flagger, **kwargs): - index = data[field].index.to_series() - index.iloc[-len(data[field])//2:] += pd.Timedelta("7.5Min") - - data[field] = pd.Series(data=FILLER, index=index) - - flags = flagger.getFlags() - flags[field] = pd.Series(data=flags[field].values, index=index) - - flagger = flagger.initFlags(flags=flags) - return data, flagger - - var = data.columns[0] - var_data = data[var] - mn, mx = var_data.max() * .25, var_data.max() * .75 - - qc = SaQC(flagger, data) - qc = qc.flagRange(var, mn, mx) - qc = qc.pseudoHarmo(var) - - data, flagger = qc.getResult() - assert (data[var] == FILLER).all(axis=None) - - @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_plotting(data, flagger): """ diff --git a/test/core/test_masking.py b/test/core/test_masking.py new file mode 100644 index 0000000000000000000000000000000000000000..fe332f28bcb0eec835057213d92f5c465909007a --- /dev/null +++ b/test/core/test_masking.py @@ -0,0 +1,119 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +import logging + +import pytest +import pandas as pd + +from saqc import SaQC, register +from test.common import initData, TESTFLAGGER + + +logging.disable(logging.CRITICAL) + + +@pytest.fixture +def data(): + return initData(3) + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_masking(data, flagger): + """ + test if flagged values are exluded during the preceding tests + """ + flagger = flagger.initFlags(data) + var1 = 'var1' + mn = min(data[var1]) + mx = max(data[var1]) / 2 + + qc = SaQC(flagger, data) + qc = qc.flagRange(var1, mn, mx) + # min is not considered because its the smalles possible value. 
+ # if masking works, `data > max` will be masked, + # so the following will deliver True for in range (data < max), + # otherwise False, like an inverse range-test + qc = qc.procGeneric("dummy", func=lambda var1: var1 >= mn) + + pdata, pflagger = qc.getResult() + out_of_range = pflagger.isFlagged(var1) + in_range = ~out_of_range + + assert (pdata.loc[out_of_range, "dummy"] == False).all() + assert (pdata.loc[in_range, "dummy"] == True).all() + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_masking_UnmaskingOnDataChange(data, flagger): + """ test if (un)masking works as expected on data-change. + + If the data change in the func, unmasking should respect this changes and + should not reapply original data, instead take the new data (and flags) as is. + Also if flags change, the data should be taken as is. + """ + FILLER = -9999 + + @register(masking='all') + def changeData(data, field, flagger, **kwargs): + mask = data.isna() + data.aloc[mask] = FILLER + return data, flagger + + @register(masking='all') + def changeFlags(data, field, flagger, **kwargs): + mask = data.isna() + flagger = flagger.setFlags(field, loc=mask[field], flag=flagger.UNFLAGGED, force=True) + return data, flagger + + var = data.columns[0] + var_data = data[var] + mn, mx = var_data.max() * .25, var_data.max() * .75 + range_mask = (var_data < mn) | (var_data > mx) + + qc = SaQC(flagger, data) + qc = qc.flagRange(var, mn, mx) + qcD = qc.changeData(var) + qcF = qc.changeFlags(var) + + data, flagger = qcD.getResult() + assert (data[var][range_mask] == FILLER).all(axis=None) + # only flags change so the data should be still NaN, because + # the unmasking was disabled, but the masking indeed was happening + data, flagger = qcF.getResult() + assert data[var][range_mask].isna().all(axis=None) + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_shapeDiffUnmasking(data, flagger): + """ test if (un)masking works as expected on index-change. + + If the index of data (and flags) change in the func, the unmasking, + should not reapply original data, instead take the new data (and flags) as is. 
+ """ + + FILLER = -1111 + + @register(masking='none') + def pseudoHarmo(data, field, flagger, **kwargs): + index = data[field].index.to_series() + index.iloc[-len(data[field])//2:] += pd.Timedelta("7.5Min") + + data[field] = pd.Series(data=FILLER, index=index) + + flags = flagger.getFlags() + flags[field] = pd.Series(data=flags[field].values, index=index) + + flagger = flagger.initFlags(flags=flags) + return data, flagger + + var = data.columns[0] + var_data = data[var] + mn, mx = var_data.max() * .25, var_data.max() * .75 + + qc = SaQC(flagger, data) + qc = qc.flagRange(var, mn, mx) + qc = qc.pseudoHarmo(var) + + data, flagger = qc.getResult() + assert (data[var] == FILLER).all(axis=None) diff --git a/test/core/test_reader.py b/test/core/test_reader.py index cb9bd717290c831ae3ae6b6dd38b1c350a220dba..fffb6418b5bacf56e26be7b1c6f04bce8a438fa7 100644 --- a/test/core/test_reader.py +++ b/test/core/test_reader.py @@ -6,16 +6,14 @@ from pathlib import Path import pytest import numpy as np import pandas as pd +import dios -from dios.dios import DictOfSeries from saqc.core.config import Fields as F from test.common import initData, writeIO from saqc.core.core import SaQC from saqc.flagger import SimpleFlagger -from saqc.funcs.functions import flagRange, flagDummy -from saqc.core.register import FUNC_MAP, register, SaQCFunc -import dios +from saqc.core.register import FUNC_MAP, register @pytest.fixture @@ -31,28 +29,10 @@ def test_packagedConfig(): data_path = path / "data.csv" data = pd.read_csv(data_path, index_col=0, parse_dates=True,) - saqc = SaQC(SimpleFlagger(), DictOfSeries(data)).readConfig(config_path) + saqc = SaQC(SimpleFlagger(), dios.DictOfSeries(data)).readConfig(config_path) data, flagger = saqc.getResult() -def test_configDefaults(data): - var1, var2, var3, *_ = data.columns - - header = f"{F.VARNAME};{F.TEST};{F.PLOT}" - tests = [ - (f"{var2};flagRange(min=3, max=6);True", SaQCFunc(flagRange, min=3, max=6, plot=True, lineno=2)), - (f"{var3};flagDummy()", SaQCFunc(flagDummy, plot=False, lineno=2)), - ] - - for config, expected in tests: - fobj = writeIO(header + "\n" + config) - saqc = SaQC(SimpleFlagger(), data).readConfig(fobj) - result = [func for _, func in saqc._to_call][0] - assert result.kwargs == expected.kwargs - assert result.lineno == expected.lineno - assert result.plot == expected.plot - - def test_variableRegex(data): header = f"{F.VARNAME};{F.TEST};{F.PLOT}" @@ -67,7 +47,7 @@ def test_variableRegex(data): for regex, expected in tests: fobj = writeIO(header + "\n" + f"{regex} ; flagDummy()") saqc = SaQC(SimpleFlagger(), data).readConfig(fobj) - result = [field for field, _ in saqc._to_call] + result = [f["field"] for f in saqc._to_call] assert np.all(result == expected) @@ -80,9 +60,9 @@ def test_inlineComments(data): pre2 ; flagDummy() # test ; False # test """ saqc = SaQC(SimpleFlagger(), data).readConfig(writeIO(config)) - result = [func for _, func in saqc._to_call][0] - assert result.plot == False - assert result.func == FUNC_MAP["flagDummy"].func + func_dump = saqc._to_call[0] + assert func_dump["ctrl_kws"]["plot"] is False + assert func_dump["func"] == FUNC_MAP["flagDummy"]["func"] def test_configReaderLineNumbers(data): @@ -98,7 +78,7 @@ def test_configReaderLineNumbers(data): SM1 ; flagDummy() """ saqc = SaQC(SimpleFlagger(), data).readConfig(writeIO(config)) - result = [func.lineno for _, func in saqc._to_call] + result = [f["ctrl_kws"]["lineno"] for f in saqc._to_call] expected = [3, 4, 5, 9] assert result == expected @@ -139,7 +119,7 @@ def 
test_configChecks(data): for test, expected in tests: fobj = writeIO(header + "\n" + test) with pytest.raises(expected): - SaQC(SimpleFlagger(), data).readConfig(fobj) + SaQC(SimpleFlagger(), data).readConfig(fobj).getResult() def test_supportedArguments(data): @@ -149,7 +129,7 @@ def test_supportedArguments(data): # TODO: necessary? - @register + @register(masking='field') def func(data, field, flagger, kwarg, **kwargs): return data, flagger diff --git a/test/flagger/test_flagger.py b/test/flagger/test_flagger.py index 94753bf7d5a4c63109e5d23621366edd23bf7b16..ee803ad2baf3ebec29e7f98f499fb640a2afeb82 100644 --- a/test/flagger/test_flagger.py +++ b/test/flagger/test_flagger.py @@ -5,7 +5,7 @@ import numpy as np import pandas as pd from pandas.api.types import is_bool_dtype -import dios.dios as dios +import dios from test.common import TESTFLAGGER, initData diff --git a/test/funcs/conftest.py b/test/funcs/conftest.py index 9c0fc594a6386c02a0d67ada0866a9f7c9c8a7f9..af99fd0df10b0169f72f574af355e23c811aaa21 100644 --- a/test/funcs/conftest.py +++ b/test/funcs/conftest.py @@ -2,7 +2,7 @@ import pytest import numpy as np import pandas as pd -from dios.dios import DictOfSeries +from dios import DictOfSeries @pytest.fixture diff --git a/test/funcs/test_generic_config_functions.py b/test/funcs/test_generic_config_functions.py index 4f4f759903f54f11d96a58720500c1a25c037630..b9b8e9531235707479a9a67c54515668653d650c 100644 --- a/test/funcs/test_generic_config_functions.py +++ b/test/funcs/test_generic_config_functions.py @@ -7,7 +7,7 @@ import pytest import numpy as np import pandas as pd -from dios.dios import DictOfSeries +from dios import DictOfSeries from test.common import TESTFLAGGER, TESTNODATA, initData, writeIO from saqc.core.visitor import ConfigFunctionParser @@ -32,9 +32,9 @@ def data_diff(): return DictOfSeries(data={col0.name: col0.iloc[: mid + offset], col1.name: col1.iloc[mid - offset :],}) -def _compileGeneric(expr): +def _compileGeneric(expr, flagger): tree = ast.parse(expr, mode="eval") - cp = ConfigFunctionParser(tree.body) + cp = ConfigFunctionParser(tree.body, flagger) return cp.kwargs["func"] @@ -49,7 +49,7 @@ def test_missingIdentifier(data, flagger): ] for test in tests: - func = _compileGeneric(f"flagGeneric(func={test})") + func = _compileGeneric(f"flagGeneric(func={test})", flagger) with pytest.raises(NameError): _execGeneric(flagger, data, func, field="", nodata=np.nan) @@ -65,7 +65,7 @@ def test_syntaxError(flagger): for test in tests: with pytest.raises(SyntaxError): - _compileGeneric(f"flagGeneric(func={test})") + _compileGeneric(f"flagGeneric(func={test})", flagger) @pytest.mark.parametrize("flagger", TESTFLAGGER) @@ -81,7 +81,7 @@ def test_typeError(flagger): for test in tests: with pytest.raises(TypeError): - _compileGeneric(f"flagGeneric(func={test})") + _compileGeneric(f"flagGeneric(func={test})", flagger) @pytest.mark.parametrize("flagger", TESTFLAGGER) @@ -100,7 +100,7 @@ def test_comparisonOperators(data, flagger): ] for test, expected in tests: - func = _compileGeneric(f"flagGeneric(func={test})") + func = _compileGeneric(f"flagGeneric(func={test})", flagger) result = _execGeneric(flagger, data, func, field=var1, nodata=np.nan) assert np.all(result == expected) @@ -121,7 +121,7 @@ def test_arithmeticOperators(data, flagger): ] for test, expected in tests: - func = _compileGeneric(f"procGeneric(func={test})") + func = _compileGeneric(f"procGeneric(func={test})", flagger) result = _execGeneric(flagger, data, func, field=var1, nodata=np.nan) assert np.all(result 
== expected)
 
@@ -139,7 +139,7 @@ def test_nonReduncingBuiltins(data, flagger):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"procGeneric(func={test})")
+        func = _compileGeneric(f"procGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, field=this, nodata=np.nan)
         assert (result == expected).all()
 
@@ -163,7 +163,7 @@ def test_reduncingBuiltins(data, flagger, nodata):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"procGeneric(func={test})")
+        func = _compileGeneric(f"procGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, field=this.name, nodata=nodata)
         assert result == expected
 
@@ -182,7 +182,7 @@ def test_ismissing(data, flagger, nodata):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"flagGeneric(func={test})")
+        func = _compileGeneric(f"flagGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, this.name, nodata)
         assert np.all(result == expected)
 
@@ -202,7 +202,7 @@ def test_bitOps(data, flagger, nodata):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"flagGeneric(func={test})")
+        func = _compileGeneric(f"flagGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, this, nodata)
         assert np.all(result == expected)
 
@@ -216,14 +216,14 @@ def test_isflagged(data, flagger):
 
     tests = [
         (f"isflagged({var1})", flagger.isFlagged(var1)),
-        (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")),
+        (f"isflagged({var1}, flag=BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")),
        (f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")),
         (f"~isflagged({var2})", ~flagger.isFlagged(var2)),
         (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))),
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"flagGeneric(func={test})")
+        func = _compileGeneric(f"flagGeneric(func={test}, flag=BAD)", flagger)
         result = _execGeneric(flagger, data, func, field=None, nodata=np.nan)
         assert np.all(result == expected)
 
@@ -269,7 +269,7 @@ def test_callableArgumentsUnary(data):
 
     window = 5
 
-    @register
+    @register(masking='field')
     def testFuncUnary(data, field, flagger, func, **kwargs):
         data[field] = data[field].rolling(window=window).apply(func)
         return data, flagger.initFlags(data=data)
@@ -301,7 +301,7 @@ def test_callableArgumentsBinary(data):
     flagger = SimpleFlagger()
     var1, var2 = data.columns[:2]
 
-    @register
+    @register(masking='field')
     def testFuncBinary(data, field, flagger, func, **kwargs):
         data[field] = func(data[var1], data[var2])
         return data, flagger.initFlags(data=data)
diff --git a/test/funcs/test_harm_funcs.py b/test/funcs/test_harm_funcs.py
index 97cd96f05aca1e3b2f92dc77624ff6865839921c..2717e3e3b9d2e100017f708fe6e0cbdd8576cfd9 100644
--- a/test/funcs/test_harm_funcs.py
+++ b/test/funcs/test_harm_funcs.py
@@ -7,8 +7,8 @@
 import pytest
 import numpy as np
 import pandas as pd
+import dios
 
-from dios import dios
 from test.common import TESTFLAGGER
 
 from saqc.funcs.harm_functions import (
diff --git a/test/funcs/test_modelling.py b/test/funcs/test_modelling.py
index 132067bf76d4f17b394a3c98c56b9706d851764f..3f6099028f033d3efe2b396e8bad4e3ea4ae9bc4 100644
--- a/test/funcs/test_modelling.py
+++ b/test/funcs/test_modelling.py
@@ -7,9 +7,9 @@
 
 import pytest
 
 import numpy as np
 import pandas as pd
+import dios
 
-from dios import dios
 from test.common import TESTFLAGGER
from saqc.funcs.modelling import modelling_polyFit, modelling_rollingMean, modelling_mask diff --git a/test/funcs/test_proc_functions.py b/test/funcs/test_proc_functions.py index 98e462c73d4c0f88ab957fd2e7afdf22619926cd..604c1a5032218ab3e6d567adff571162538a648b 100644 --- a/test/funcs/test_proc_functions.py +++ b/test/funcs/test_proc_functions.py @@ -14,6 +14,7 @@ from saqc.funcs.proc_functions import ( proc_resample, proc_transform, proc_rollingInterpolateMissing, + proc_interpolateGrid ) from saqc.lib.ts_operators import linearInterpolation, polynomialInterpolation @@ -83,3 +84,14 @@ def test_resample(course_5, flagger): assert ~np.isnan(data1[field].iloc[0]) assert np.isnan(data1[field].iloc[1]) assert np.isnan(data1[field].iloc[2]) + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_interpolateGrid(course_5, course_3, flagger): + data, _ = course_5() + data_grid, characteristics = course_3() + data['grid']=data_grid.to_df() + #data = dios.DictOfSeries(data) + flagger = flagger.initFlags(data) + dataInt, *_ = proc_interpolateGrid(data, 'data', flagger, '1h', 'time', grid_field='grid', inter_limit=10) + +
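(Editor's note: since this changeset replaces the bare `@register` decorator with `@register(masking=...)` throughout, here is a minimal, hedged sketch of how a custom test function is now registered, modelled on the `flagAll` and `testFuncUnary` examples in the tests above; the `flagAboveMean` name and its body are invented for illustration only.)

```python
from saqc import register

@register(masking='field')  # mask already-flagged values of `field` while this test runs
def flagAboveMean(data, field, flagger, factor=1.0, **kwargs):
    # hypothetical example test: flag every value above `factor` times the series mean
    values = data[field]
    to_flag = values > factor * values.mean()
    return data, flagger.setFlags(field, loc=to_flag, **kwargs)

# once registered, the test becomes callable on a SaQC object or from a config file, e.g.:
#   SaQC(flagger, data).flagAboveMean("var1", factor=1.2).getResult()
```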