diff --git a/CHANGELOG.md b/CHANGELOG.md index 2eceb7e6fc09ff58624ae06e9ac166cff6d0ec66..9a8b584f022e31b6bfa1b3d730cecabf9d5e4945 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,6 +64,7 @@ coming soon ... ## Features +- `flagCrossValidation` implemented ## Bugfixes - `spikes_flagRaise` - overestimation of value courses average fixed diff --git a/README.md b/README.md index 308250cbd333bffbff1d5aea96fcfc80a793929d..030ec1dbc1d66eddbb655f63f344cb7da86ef7aa 100644 --- a/README.md +++ b/README.md @@ -35,30 +35,61 @@ implementation of the algorithms is left to the respective developers. ## How? -The most import aspect of SaQC, the [general configuration](docs/ConfigurationFiles.md) -of the system, is text-based. All the magic takes place in a semicolon-separated -table file listing the variables within the dataset and the routines to inspect, -quality control and/or modify them. -``` -varname ; test ; plot -#----------;-------------------------------------;------ -SM2 ; harm_shift2Grid(freq="15Min") ; False -SM2 ; flagMissing(nodata=NAN) ; False -'SM(1|2)+' ; flagRange(min=10, max=60) ; False -SM2 ; spikes_flagMad(window="30d", z=3.5) ; True -``` +`SaQC` is both a command line application controlled by a text based configuration and a python +module with a simple API. While a good (but still growing) number of predefined and highly configurable [functions](docs/FunctionIndex.md) are included and ready to use, SaQC -additionally ships with a python based for quality control but also general -purpose data processing -[extension language](docs/GenericFunctions.md). +additionally ships with a python based +[extension language](docs/GenericFunctions.md) for quality control and general +purpose data processing. For a more specific round trip to some of SaQC's possibilities, please refer to our [GettingStarted](docs/GettingStarted.md). +### SaQC as a command line application +Most of the magic is controlled by a +[semicolon-separated table file](saqc/docs/ConfigurationFiles.md) listing the variables of the +dataset and the routines to inspect, quality control and/or process them.
+The content of such a configuration could look like this: + +``` +varname ; test +#----------;------------------------------------ +SM2 ; harm_shift2Grid(freq="15Min") +SM2 ; flagMissing(nodata=NAN) +'SM(1|2)+' ; flagRange(min=10, max=60) +SM2 ; spikes_flagMad(window="30d", z=3.5) +``` + +As soon as the basic inputs, a dataset and the configuration file are +prepared, running SaQC is as simple as: +```sh +saqc \ + --config path_to_configuration.txt \ + --data path_to_data.csv \ + --outfile path_to_output.csv +``` + +### SaQC as a python module + +The following snippet implements the same configuration given above through +the Python-API: + +```python +from saqc import SaQC, SimpleFlagger + +saqc = (SaQC(data, SimpleFlagger()) + .harm_shift2Grid("SM2", freq="15Min") + .flagMissing("SM2", nodata=np.nan) + .flagRange("SM(1|2)+", regex=True, min=10, max=60) + .spikes_flagMad("SM2", window="30d", z=3.5)) + +data, flagger = saqc.getResult() +``` + ## Installation ### Python Package Index diff --git a/docs/Customizations.md b/docs/Customizations.md index 156f8f1b938b1804b20a51db70638647a1ea2899..b5438944ce2181f0cc4b17733281ac1a96ed74af 100644 --- a/docs/Customizations.md +++ b/docs/Customizations.md @@ -54,7 +54,7 @@ look like that: ```python from saqc.functions.register import register -@register() +@register def yourTestFunction(data, field, flagger, *args, **kwargs): return data, flagger ``` diff --git a/docs/funcs/ConstantDetection.md b/docs/funcs/ConstantDetection.md index 9fd8185df05dde06af4acc9ba4f5cad9ebab8031..a0f4e580945ca503fdf8608f99fa7e974e72b575 100644 --- a/docs/funcs/ConstantDetection.md +++ b/docs/funcs/ConstantDetection.md @@ -17,7 +17,7 @@ constants_flagBasic(window, thresh=0) | parameter | data type | default value | description | |-----------|-----------------------------------------------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------| | window | integer/[offset string](docs/ParameterDescriptions.md#offset-strings) | | The minimum count or duration in which the values must be constant to be considered as plateau candidates. See condition (1) | -| thresh | float | 0 | The maximum difference between values to still considered as constant. See condition (2) | +| thresh | float | 0 | The maximum difference between values to be still considered as constant. See condition (2) | This functions flags plateaus/series of constant values of length `window` if their difference is smaller than `thresh`. diff --git a/docs/funcs/DTW.md b/docs/funcs/DTW.md deleted file mode 100644 index b0c5a5a597fc1532226d9cfcb650fc9014540089..0000000000000000000000000000000000000000 --- a/docs/funcs/DTW.md +++ /dev/null @@ -1,24 +0,0 @@ -## DTW - - -## Index -[flagDTW](#flagDTW) - -## flagDTW - -``` -flagDTW(refdatafield='SM1', window = 25, min_distance = 0.25, method_dtw = "fast") -``` - - -| parameter | data type | default value | description | -|-----------------------|---------------------------------------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| -| window | int | `25` |The number of datapoints to be included in each comparison window. | -| min_distance | float | `0.5` |The minimum distance of two graphs to be classified as "different". 
| -| method_dtw | string | `"fast"` |Implementation of DTW algorithm - "exact" for the normal implementation of DTW, "fast" for the fast implementation. | -| ref_datafield | string | |Name of the reference datafield ("correct" values) with which the actual datafield is compared. | - - -This function compares the data with a reference datafield (given in `ref_datafield`) of values we assume to be correct. The comparison is undertaken window-based, i.e. the two data fields are compared window by window, with overlapping windows. The function flags those values that lie in the middle of a window that exceeds a minimum distance value (given in `min_distance`). - -As comparison algorithm, we use the [Dynamic Time Warping (DTW) Algorithm](https://en.wikipedia.org/wiki/Dynamic_time_warping) that accounts for temporal and spacial offsets when calculating the distance. For a demonstration of the DTW, see the Wiki entry "Results for rain data set" in [Pattern Recognition with Wavelets](https://git.ufz.de/rdm-software/saqc/-/wikis/Pattern-Recognition-with-Wavelets#Results). diff --git a/docs/funcs/Miscellaneous.md b/docs/funcs/Miscellaneous.md index 8865a60606a6a1d72ef97047e9b2d5a85a3fa4fb..f5ad537241171171d76dc89e44f1a3b809198cd9 100644 --- a/docs/funcs/Miscellaneous.md +++ b/docs/funcs/Miscellaneous.md @@ -7,11 +7,13 @@ A collection of unrelated quality check functions. - [flagRange](#flagrange) - [flagSeasonalRange](#flagseasonalrange) - [flagIsolated](#flagisolated) +- [flagPattern](#flagpattern) - [flagMissing](#flagmissing) - [clearFlags](#clearflags) - [forceFlags](#forceflags) + ## flagRange ``` @@ -88,9 +90,32 @@ flagMissing(nodata=NaN) | --------- | ---------- | -------------- | ----------- | | nodata | any | `NAN` | A value that defines missing data | - The function flags all values indicating missing data. + + + +## flagPattern + +``` +flagPattern(ref_datafield, sample_freq = '15 Min', method = 'dtw', min_distance = None) +``` + + +| parameter | data type | default value | description | +|-----------------------|---------------------------------------------------------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ref_datafield | string | |Name of the reference datafield = "pattern" | +| sample_freq | string | `"15 Min"` |Sample frequency to harmonize the data | +| method | string | `"dtw "` |"dtw" for Dynamic Time Warping (DTW), "wavelet" for Wavelet Pattern Recognition Algorithm | +| min_distance | float | `None` |For DTW - alogrithm: the minimum distance of two graphs in order to be classified as "different" | + + +Implementation of the pattern recognition algorithms introduced in [Pattern Recognition](https://git.ufz.de/rdm-software/saqc/-/wikis/Pattern-Recognition). 
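A hypothetical configuration row illustrating the documented parameters of `flagPattern`; the variable names `SM2` and `SM1_pattern` are placeholders invented here, and the call is a sketch following the table above rather than a verified example:

```
varname ; test
#--------;---------------------------------------------------------------------------
SM2      ; flagPattern(ref_datafield="SM1_pattern", method="dtw", min_distance=0.25)
```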
+ + + + + ## clearFlags ``` diff --git a/saqc/__init__.py b/saqc/__init__.py index bf7de72ee0f4a61580ee9122d237fb0cb321e9f5..b348d8829a876ef8d4bae8a236ccc90ec5b3bbdb 100644 --- a/saqc/__init__.py +++ b/saqc/__init__.py @@ -3,6 +3,6 @@ __version__ = "1.3.0" -from saqc.core.core import run +from saqc.core.core import SaQC from saqc.flagger import * -from saqc.funcs import register +from saqc.core.register import register diff --git a/saqc/__main__.py b/saqc/__main__.py index 9db20bdf127d7bdb9039e4127d1300934b3e8a02..b962bdf79e7b47c7b2ed0efc5bdbf8b54a97f893 100644 --- a/saqc/__main__.py +++ b/saqc/__main__.py @@ -6,9 +6,9 @@ import click import numpy as np import pandas as pd -from saqc.core import run +from saqc.core import SaQC from saqc.flagger import CategoricalFlagger -from saqc.flagger.dmpflagger import DmpFlagger, FlagFields +from saqc.flagger.dmpflagger import DmpFlagger import dios @@ -40,15 +40,16 @@ def main(config, data, flagger, outfile, nodata, log_level, fail): data = pd.read_csv(data, index_col=0, parse_dates=True,) data = dios.DictOfSeries(data) - data_result, flagger_result = run( - config_file=config, + saqc = SaQC( flagger=FLAGGERS[flagger], data=data, - nodata=nodata, log_level=log_level, + nodata=nodata, error_policy="raise" if fail else "warn", ) + data_result, flagger_result = saqc.readConfig(config).getResult() + if outfile: data_result = data_result.to_df() flags = flagger_result.getFlags().to_df() diff --git a/saqc/core/__init__.py b/saqc/core/__init__.py index c319709cfae130c3d142f355e1ea43d99db29bdf..eb4a8abf1fb38dc855488a78855eaf2fb0316eb6 100644 --- a/saqc/core/__init__.py +++ b/saqc/core/__init__.py @@ -1,4 +1,5 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- -from saqc.core.core import run +from saqc.core.core import SaQC +from saqc.core.register import register diff --git a/saqc/core/config.py b/saqc/core/config.py index 89a598f86051a0e0904e49c3f66c90b90538577a..e6a9e9f6307f8b2de08f560df56dc691e30f8e86 100644 --- a/saqc/core/config.py +++ b/saqc/core/config.py @@ -6,13 +6,6 @@ class Fields: VARNAME = "varname" START = "start_date" END = "end_date" - TESTS = "test*" + TEST = "test" PLOT = "plot" LINENUMBER = "line" - - -class Params: - FLAG_GENERIC = "flagGeneric" - PROC_GENERIC = "procGeneric" - GENERIC_ARGS = "func_arguments" - FUNC = "func" diff --git a/saqc/core/core.py b/saqc/core/core.py index a0c25c9f1e7a57a6f7cc9906519d3abe87529567..de1eb134bf06291efacd165d622881b25f6c4723 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -1,56 +1,56 @@ #! 
/usr/bin/env python # -*- coding: utf-8 -*- + +""" +TODOS: + - integrate plotting into the api + - `data` and `flagger` as arguments to `getResult` +""" + import logging +from copy import deepcopy +from operator import attrgetter +from typing import List, Tuple -import numpy as np import pandas as pd import dios +import numpy as np -from saqc.core.reader import readConfig, checkConfig -from saqc.core.config import Fields -from saqc.core.evaluator import evalExpression from saqc.lib.plotting import plotHook, plotAllHook -from saqc.lib.types import DiosLikeT +from saqc.lib.tools import isQuoted +from saqc.core.register import FUNC_MAP, SaQCFunc +from saqc.core.reader import readConfig from saqc.flagger import BaseFlagger, CategoricalFlagger, SimpleFlagger, DmpFlagger logger = logging.getLogger("SaQC") -def _collectVariables(meta, data): - """ - find every relevant variable - """ - # NOTE: get to know every variable from meta - variables = list(data.columns) - for idx, configrow in meta.iterrows(): - varname = configrow[Fields.VARNAME] - # assign = configrow[Fields.ASSIGN] - if varname in variables: - continue - # if (varname in data): # or (varname not in variables and assign is True): - variables.append(varname) - return variables - +def _handleErrors(exc, func, policy): + msg = f"failed with:\n{type(exc).__name__}: {exc}" + if func.lineno is not None and func.expr is not None: + msg = f"config, line {func.lineno}: '{func.expr}' " + msg + else: + msg = f"function '{func.func}' with parameters '{func.kwargs}' " + msg -def _convertInput(data, flags): - if isinstance(data, pd.DataFrame): - data = dios.to_dios(data) - if isinstance(flags, pd.DataFrame): - flags = dios.to_dios(flags) + if policy == "ignore": + logger.debug(msg) + elif policy == "warn": + logger.warning(msg) + else: + logger.error(msg) + raise -def _checkAndConvertInput(data, flags, flagger): +def _prepInput(flagger, data, flags): dios_like = (dios.DictOfSeries, pd.DataFrame) if not isinstance(data, dios_like): raise TypeError("data must be of type dios.DictOfSeries or pd.DataFrame") if isinstance(data, pd.DataFrame): - if isinstance(data.index, pd.MultiIndex): - raise TypeError("the index of data is not allowed to be a multiindex") - if isinstance(data.columns, pd.MultiIndex): - raise TypeError("the columns of data is not allowed to be a multiindex") + if isinstance(data.index, pd.MultiIndex) or isinstance(data.columns, pd.MultiIndex): + raise TypeError("data should not use MultiIndex") data = dios.to_dios(data) if not isinstance(flagger, BaseFlagger): @@ -58,147 +58,167 @@ def _checkAndConvertInput(data, flags, flagger): raise TypeError(f"flagger must be of type {flaggerlist} or any inherit class from {BaseFlagger}") if flags is not None: - if not isinstance(flags, dios_like): raise TypeError("flags must be of type dios.DictOfSeries or pd.DataFrame") if isinstance(flags, pd.DataFrame): - if isinstance(flags.index, pd.MultiIndex): - raise TypeError("the index of flags is not allowed to be a multiindex") - if isinstance(flags.columns, pd.MultiIndex): - raise TypeError("the columns of flags is not allowed to be a multiindex") + if isinstance(flags.index, pd.MultiIndex) or isinstance(flags.columns, pd.MultiIndex): + raise TypeError("flags' should not use MultiIndex") flags = dios.to_dios(flags) # NOTE: do not test all columns as they not necessarily need to be the same cols = flags.columns & data.columns if not (flags[cols].lengths == data[cols].lengths).all(): - raise ValueError("the length of values in flags and data does not 
match.") + raise ValueError("the length of flags and data need to be equal") - return data, flags + if flagger.initialized: + err = "Flagger is not correctly initialized for given data. Call flagger.initFlags() on data or" \ + "do not call it at all." + fflags = flagger.getFlags() + if not fflags.columns.difference(data.columns).empty: + raise ValueError(err + " Detail: Columns missmatch.") + # flagger could have more columns than data + cols = fflags.columns & data.columns + if not (fflags[cols].lengths == data[cols].lengths).all(): + raise ValueError(err + " Detail: Length of flags does not match length of data.") + + return data, flags -def _handleErrors(exc, configrow, test, policy): - line = configrow[Fields.LINENUMBER] - msg = f"config, line {line}, test: '{test}' failed with:\n{type(exc).__name__}: {exc}" - if policy == "ignore": - logger.debug(msg) - elif policy == "warn": - logger.warning(msg) - else: - raise Exception(msg) +def _setup(log_level): + # NOTE: + # the import is needed to trigger the registration + # of the built-in (test-)functions + import saqc.funcs -def _setup(loglevel): + # warnings pd.set_option("mode.chained_assignment", "warn") np.seterr(invalid="ignore") - # logging setting - logger.setLevel(loglevel) + # logging + logger.setLevel(log_level) handler = logging.StreamHandler() formatter = logging.Formatter("[%(asctime)s][%(name)s][%(levelname)s]: %(message)s") handler.setFormatter(formatter) logger.addHandler(handler) -def run( - config_file: str, - flagger: BaseFlagger, - data: DiosLikeT, - flags: DiosLikeT = None, - nodata: float = np.nan, - log_level: str = "INFO", - error_policy: str = "raise", -) -> (dios.DictOfSeries, BaseFlagger): - - _setup(log_level) - data, flags = _checkAndConvertInput(data, flags, flagger) - config = readConfig(config_file, data) - - # split config into the test and some 'meta' data - tests = config.filter(regex=Fields.TESTS) - meta = config[config.columns.difference(tests.columns)] - - # prepapre the flags - flag_cols = _collectVariables(meta, data) - flagger = flagger.initFlags(dios.DictOfSeries(data=data, columns=flag_cols)) - if flags is not None: - flagger = flagger.setFlagger(flagger.initFlags(flags=flags)) - - # NOTE: - # this checks comes late, but the compilation of - # user-test needs fully prepared flags - checkConfig(config, data, flagger, nodata) - - # NOTE: - # the outer loop runs over the flag tests, the inner one over the - # variables. 
Switching the loop order would complicate the - # reference to flags from other variables within the dataset - for _, testcol in tests.iteritems(): - - # NOTE: just an optimization - if testcol.dropna().empty: - continue - - for idx, configrow in meta.iterrows(): - - # store config params in some handy variables - varname = configrow[Fields.VARNAME] - start_date = configrow[Fields.START] - end_date = configrow[Fields.END] - - func = testcol[idx] - if pd.isnull(func): - continue - - if varname not in data and varname not in flagger.getFlags(): - continue - - # NOTE: - # time slicing support is currently disabled - # prepare the data for the tests - # dtslice = slice(start_date, end_date) - dtslice = slice(None) - data_chunk = data.loc[dtslice] - if data_chunk.empty: - continue - flagger_chunk = flagger.getFlagger(loc=dtslice) +class SaQC: + def __init__(self, flagger, data, flags=None, nodata=np.nan, log_level=logging.INFO, error_policy="raise"): + _setup(log_level) + data, flags = _prepInput(flagger, data, flags) + self._data = data + self._nodata = nodata + self._flagger = self._initFlagger(data, flagger, flags) + self._error_policy = error_policy + # NOTE: will be filled by calls to `_wrap` + self._to_call: List[Tuple[str, SaQCFunc]] = [] + + def _initFlagger(self, data, flagger, flags): + """ Init the internal flagger object. + + Ensures that all data columns are present and user passed flags from + a flags frame and/or an already initialised flagger are used. + If columns overlap the passed flagger object is prioritised. + """ + # ensure all data columns + merged = flagger.initFlags(data) + if flags is not None: + merged = merged.merge(flagger.initFlags(flags=flags)) + if flagger.initialized: + merged = merged.merge(flagger) + return merged + + def readConfig(self, fname): + + config = readConfig(fname) + + out = deepcopy(self) + for func, field, kwargs, plot, lineno, expr in config: + if isQuoted(field): + kwargs["regex"] = True + field = field[1:-1] + kwargs["field"] = field + kwargs["plot"] = plot + out = out._wrap(func, lineno=lineno, expr=expr)(**kwargs) + return out + + def getResult(self, write_back=False): + """ Do the actual calculations and return the results. + + Parameters + ---------- + write_back: bool, default False + If False, every call will recalculate, eventually plot and return the result anew. + If True the resulting data is written back in the SaQC object itself, like if + the object would have been initialized with it. Further calls will then directly + return the result with no recalculation needed, but a replotting is not possible. + + Returns + ------- + data, flagger: (DictOfSeries, DictOfSeries) + """ + data, flagger = self._data, self._flagger + + for field, func in self._to_call: try: - # actually run the tests - data_chunk_result, flagger_chunk_result = evalExpression( - func, data=data_chunk, field=varname, flagger=flagger_chunk, nodata=nodata, - ) + data_result, flagger_result = func(data=data, flagger=flagger, field=field) except Exception as e: - _handleErrors(e, configrow, func, error_policy) + _handleErrors(e, func, self._error_policy) continue - if configrow[Fields.PLOT]: - try: - plotHook( - data_old=data_chunk, data_new=data_chunk_result, - flagger_old=flagger_chunk, flagger_new=flagger_chunk_result, - sources=[], targets=[varname], plot_name=func, - ) - except Exception: - logger.exception(f"Plotting failed. 
\n" - f" config line: {configrow[Fields.LINENUMBER]}\n" - f" expression: {func}\n" - f" variable(s): {[varname]}.") - - # NOTE: - # time slicing support is currently disabled - # flagger = flagger.setFlagger(flagger_chunk_result) - # data = combineDataFrames(data, data_chunk_result) - flagger = flagger_chunk_result - data = data_chunk_result - - plotfields = config[Fields.VARNAME][config[Fields.PLOT]] - if len(plotfields) > 0: - try: - # to only show variables that have set the plot-flag - # use: plotAllHook(data, flagger, targets=plotfields) + if func.plot: + plotHook( + data_old=data, data_new=data_result, + flagger_old=flagger, flagger_new=flagger_result, + sources=[], targets=[field], plot_name=func.__name__, + ) + + data = data_result + flagger = flagger_result + + if any([func.plot for _, func in self._to_call]): plotAllHook(data, flagger) - except Exception: - logger.exception(f"Final plotting failed.") - return data, flagger + if write_back: + self._data = data + self._flagger = flagger + self._to_call = [] + + return data, flagger + + def _wrap(self, func, lineno=None, expr=None): + + def inner(field: str, *args, regex: bool = False, **kwargs): + + fields = [field] if not regex else self._data.columns[self._data.columns.str.match(field)] + + if func.__name__ in ("flagGeneric", "procGeneric"): + # NOTE: + # We need to pass `nodata` to the generic functions + # (to implement stuff like `ismissing`). As we + # should not interfere with proper nodata attributes + # of other test functions (e.g. `flagMissing`) we + # special case the injection + kwargs["nodata"] = kwargs.get("nodata", self._nodata) + + out = deepcopy(self) + for field in fields: + f = SaQCFunc(func, *args, lineno=lineno, expression=expr, **kwargs) + out._to_call.append((field, f)) + return out + + return inner + + def __getattr__(self, key): + """ + All failing attribute accesses are redirected to + __getattr__. We use this mechanism to make the + `RegisterFunc`s appear as `SaQC`-methods with + actually implementing them. + """ + if key not in FUNC_MAP: + raise AttributeError(f"no such attribute: '{key}'") + return self._wrap(FUNC_MAP[key]) diff --git a/saqc/core/evaluator/__init__.py b/saqc/core/evaluator/__init__.py deleted file mode 100644 index 9376d550d128f9783216d32ed27f0c74617f6196..0000000000000000000000000000000000000000 --- a/saqc/core/evaluator/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- - -from saqc.core.evaluator.evaluator import ( - compileExpression, - evalExpression, - compileTree, - parseExpression, - initLocalEnv, - evalCode, -) - -from saqc.core.evaluator.checker import DslChecker, ConfigChecker - -from saqc.core.evaluator.transformer import DslTransformer, ConfigTransformer diff --git a/saqc/core/evaluator/checker.py b/saqc/core/evaluator/checker.py deleted file mode 100644 index 3e116a920a300f84a60fea807ddbe29c7eb1abb2..0000000000000000000000000000000000000000 --- a/saqc/core/evaluator/checker.py +++ /dev/null @@ -1,122 +0,0 @@ -#! 
/usr/bin/env python -# -*- coding: utf-8 -*- - -import ast - -from saqc.funcs.register import FUNC_MAP -from saqc.core.config import Params - - -class DslChecker(ast.NodeVisitor): - - SUPPORTED = ( - ast.Expression, - ast.UnaryOp, - ast.BinOp, - ast.BitOr, - ast.BitAnd, - ast.Num, - ast.Compare, - ast.Add, - ast.Sub, - ast.Mult, - ast.Div, - ast.Pow, - ast.Mod, - ast.USub, - ast.Eq, - ast.NotEq, - ast.Gt, - ast.Lt, - ast.GtE, - ast.LtE, - ast.Invert, - ast.Name, - ast.Load, - ast.Call, - ) - - def __init__(self, environment): - self.environment = environment - - def visit_Call(self, node): - func_name = node.func.id - if func_name not in self.environment: - raise NameError(f"unspported function: '{func_name}'") - self.generic_visit(node) - - def visit_Name(self, node): - name = node.id - if name not in self.environment and name not in self.environment["variables"]: - raise NameError(f"unknown variable: '{name}'") - self.generic_visit(node) - - def generic_visit(self, node): - if not isinstance(node, self.SUPPORTED): - raise TypeError(f"invalid expression: '{node}'") - return super().generic_visit(node) - - -class ConfigChecker(ast.NodeVisitor): - - SUPPORTED_NODES = ( - ast.Call, - ast.Num, - ast.Str, - ast.keyword, - ast.NameConstant, - ast.UnaryOp, - ast.Name, - ast.Load, - ast.Expression, - ast.Subscript, - ast.Index, - ast.USub, - ) - - SUPPORTED_ARGUMENTS = ( - ast.Str, - ast.Num, - ast.NameConstant, - ast.Call, - ast.UnaryOp, - ast.USub, - ast.Name, - ) - - def __init__(self, environment, pass_parameter): - self.pass_parameter = pass_parameter - self.environment = environment - self.func_name = None - - def visit_Call(self, node): - - func_name = node.func.id - if func_name not in FUNC_MAP: - raise NameError(f"unknown test function: '{func_name}'") - if node.args: - raise TypeError("only keyword arguments are supported") - self.func_name = func_name - return self.generic_visit(node) - - def visit_keyword(self, node): - key, value = node.arg, node.value - if self.func_name in (Params.FLAG_GENERIC, Params.PROC_GENERIC) and key == Params.FUNC: - DslChecker(self.environment).visit(value) - return - - if key not in FUNC_MAP[self.func_name].signature + self.pass_parameter: - raise TypeError(f"unknown function parameter '{node.arg}'") - - if not isinstance(value, self.SUPPORTED_ARGUMENTS): - raise TypeError(f"invalid argument type '{type(value)}'") - - if isinstance(value, ast.Name) and value.id not in self.environment: - raise NameError(f"unknown variable: {value.id}") - - return self.generic_visit(node) - - def generic_visit(self, node): - if not isinstance(node, self.SUPPORTED_NODES): - raise TypeError(f"invalid node: '{node}'") - return super().generic_visit(node) diff --git a/saqc/core/evaluator/evaluator.py b/saqc/core/evaluator/evaluator.py deleted file mode 100644 index d15431ecb999a0adcc5e1c13f829817c1744dc30..0000000000000000000000000000000000000000 --- a/saqc/core/evaluator/evaluator.py +++ /dev/null @@ -1,100 +0,0 @@ -#! 
/usr/bin/env python -# -*- coding: utf-8 -*- - -import ast -import logging - -from functools import partial -from typing import Any, Dict - -import astor -import numpy as np -import dios.dios as dios - -from saqc.flagger.baseflagger import BaseFlagger -from saqc.funcs.register import FUNC_MAP -from saqc.core.evaluator.checker import ConfigChecker -from saqc.core.evaluator.transformer import ConfigTransformer - - -logger = logging.getLogger("SaQC") - - -def _dslIsFlagged(flagger, field, flag=None, comparator=None): - if comparator is None: - return flagger.isFlagged(field, flag=flag) - return flagger.isFlagged(field, flag=flag, comparator=comparator) - - -def initLocalEnv(data: dios.DictOfSeries, field: str, flagger: BaseFlagger, nodata: float) -> Dict[str, Any]: - - return { - # general - "data": data, - "field": field, - "flagger": flagger, - "this": field, - # transformation only - "variables": set(flagger.getFlags().columns.tolist()), - "nolookup": set(["isflagged"]), # no variable lookup for flagger based functions, - # missing values/data - "NAN": np.nan, - "NODATA": nodata, - # flags - "GOOD": flagger.GOOD, - "BAD": flagger.BAD, - "UNFLAGGED": flagger.UNFLAGGED, - # special functions - "ismissing": lambda data: (data == nodata) | data.isna(), - "isflagged": partial(_dslIsFlagged, flagger), - # math - "abs": np.abs, - "exp": np.exp, - "log": np.log, - "sqrt": np.sqrt, - "sin": np.sin, - "cos": np.cos, - "tan": np.tan, - "max": np.nanmax, - "min": np.nanmin, - "mean": np.nanmean, - "sum": np.nansum, - "std": np.nanstd, - "len": lambda data: np.array(len(data)), - } - - -def parseExpression(expr: str) -> ast.AST: - tree = ast.parse(expr, mode="eval") - return tree - - -def compileTree(tree: ast.Expression): - return compile(ast.fix_missing_locations(tree), "<ast>", mode="eval") - - -def evalCode(code, global_env=None, local_env=None): - return eval(code, global_env or {}, local_env or {}) - - -def compileExpression(expr, data, field, flagger, nodata=np.nan): - local_env = initLocalEnv(data, field, flagger, nodata) - tree = parseExpression(expr) - ConfigChecker(local_env, flagger.signature).visit(tree) - transformed_tree = ConfigTransformer(local_env).visit(tree) - src = astor.to_source(transformed_tree).strip() - logger.debug(f"calling transformed function:\n{src}") - return local_env, compileTree(transformed_tree) - - -def evalExpression(expr, data, field, flagger, nodata=np.nan): - # mask the already flagged value to make all the functions - # called on the way through the evaluator ignore flagged values - mask = flagger.isFlagged() - data_in = data.copy() - data_in[mask] = np.nan - local_env, code = compileExpression(expr, data_in, field, flagger, nodata) - data_result, flagger_result = evalCode(code, FUNC_MAP, local_env) - # reinject the original values, as we don't want to loose them - data_result.aloc[mask] = data[mask] - return data_result, flagger_result diff --git a/saqc/core/evaluator/transformer.py b/saqc/core/evaluator/transformer.py deleted file mode 100644 index a304a010fc05c3e26a4c6110bf1caf1b4d85f8ee..0000000000000000000000000000000000000000 --- a/saqc/core/evaluator/transformer.py +++ /dev/null @@ -1,90 +0,0 @@ -#! 
/usr/bin/env python -# -*- coding: utf-8 -*- - -import ast - -from typing import Dict, Any -from contextlib import contextmanager - -from saqc.core.config import Params - - -class DslTransformer(ast.NodeTransformer): - def __init__(self, environment: Dict[str, Any]): - self.environment = environment - - def visit_Call(self, node): - new_args = node.args - for a in new_args: - a.lookup = node.func.id not in self.environment["nolookup"] - - node = ast.Call(func=node.func, args=new_args, keywords=[]) - return self.generic_visit(node) - - def visit_Name(self, node): - - # NOTE: - # - # There are different categories of name nodes: - # - # 1. Names that need a lookup in the global/local eval - # environment (e.g. function names, dsl constants, ...) - # -> nodes need to leave visit_Name unaltered - # 2. Names that need a lookup in the 'data' DataFrame - # -> nodes need to be rewritten int ast.Subscript - # 3. Names that should be treated as constants and be passed to - # functions requiring a 'field' parameter (e.g. 'isflagged') - # -> nodes need to be rewritten to ast.Constant/ast.Str - # - # TODO: - # - # The differentiation between these categories is done based - # on the two variables out of 'self.environment', namely - # 'nolookup' and 'variables' in two different methods - # ('vsisit_Call' and 'visit_Name'). This continues to feel hacky - # and I really like to see a cleaner solution for that problem - - name = node.id - - if name == "this": - name = self.environment["this"] - - if name in self.environment["variables"]: - # determine further tree-transformation path by target - if getattr(node, "lookup", True): - value = ast.Constant(value=name) - node = ast.Subscript( - value=ast.Name(id="data", ctx=ast.Load()), slice=ast.Index(value=value), ctx=ast.Load(), - ) - else: - node = ast.Constant(value=name) - - return node - - -class ConfigTransformer(ast.NodeTransformer): - def __init__(self, environment): - self.environment = environment - self.func_name = None - - def visit_Call(self, node): - self.func_name = node.func.id - - new_args = [ - ast.Name(id="data", ctx=ast.Load()), - ast.Name(id="field", ctx=ast.Load()), - ast.Name(id="flagger", ctx=ast.Load()), - ] - node = ast.Call(func=node.func, args=new_args + node.args, keywords=node.keywords) - - return self.generic_visit(node) - - def visit_keyword(self, node): - key, value = node.arg, node.value - - if self.func_name in (Params.FLAG_GENERIC, Params.PROC_GENERIC) and key == Params.FUNC: - dsl_transformer = DslTransformer(self.environment) - value = dsl_transformer.visit(value) - return ast.keyword(arg=key, value=value) - - return self.generic_visit(node) diff --git a/saqc/core/reader.py b/saqc/core/reader.py index 05d28bddb0202e17861347176df94b110e3c5946..06f323707ec02fe273cf5159d9fb2d9ea7d1c7b1 100644 --- a/saqc/core/reader.py +++ b/saqc/core/reader.py @@ -1,155 +1,85 @@ #! 
/usr/bin/env python # -*- coding: utf-8 -*- +import ast -import re -import logging -from csv import reader -from typing import Dict, List, Any, Union, Iterable, Iterator, Tuple -from contextlib import contextmanager -from io import StringIO, TextIOWrapper +import numpy as np import pandas as pd -import dios from saqc.core.config import Fields as F -from saqc.core.evaluator import compileExpression -from saqc.flagger import BaseFlagger - - -logger = logging.getLogger("SaQC") - - -# typing declarations -Config = Iterable[Dict[str, Any]] -Filename = Union[StringIO, str] - - -CONFIG_TYPES = { - F.VARNAME: str, - F.START: pd.to_datetime, - F.END: pd.to_datetime, - F.TESTS: str, - F.PLOT: lambda v: str(v).lower() == "true", - F.LINENUMBER: int, -} - - -def _raise(config_row, exc, msg, field=None): - line_number = config_row[F.LINENUMBER] - base_msg = f"configuration error in line {line_number}" - if field: - base_msg += f", column '{field}'" - msg = base_msg + ":\n" + msg - raise exc(msg) - - -@contextmanager -def _open(fname: Filename) -> Union[StringIO, TextIOWrapper]: - if isinstance(fname, StringIO): - yield fname - else: - f = open(fname) - yield f - f.close() - - -def _matchKey(keys: Iterable[str], fuzzy_key: str) -> str: - for key in keys: - if re.match(fuzzy_key, key): - return key - - -def _castRow(row: Dict[str, Any]): - out = {} - for row_key, row_value in row.items(): - for fuzzy_key, func in CONFIG_TYPES.items(): - if re.match(fuzzy_key, row_key): - try: - out[row_key] = func(row_value) - except ValueError: - _raise(row, ValueError, f"invalid value: '{row_value}'") - return out - - -def _expandVarnameWildcards(config: Config, data: dios.DictOfSeries) -> Config: - def isQuoted(string): - return bool(re.search(r"'.*'|\".*\"", string)) - - new = [] - for row in config: - varname = row[F.VARNAME] - if varname and isQuoted(varname): - pattern = varname[1:-1] - expansion = data.columns[data.columns.str.match(pattern)] - if not len(expansion): - logger.warning(f"no match for regular expression '{pattern}'") - for var in expansion: - new.append({**row, F.VARNAME: var}) - else: - new.append(row) - return new - - -def _clearRows(rows: Iterable[List[str]], comment: str = "#") -> Iterator[Tuple[str, List[Any]]]: - for i, row in enumerate(rows): - row = [c.strip() for c in row] - if any(row) and not row[0].lstrip().startswith(comment): - row = [c.split(comment)[0].strip() for c in row] - yield i, row - - -def readConfig(fname: Filename, data: dios.DictOfSeries, sep: str = ";", comment: str = "#") -> pd.DataFrame: - - defaults = { - F.VARNAME: "", - F.START: min(map(min, data.indexes)), - F.END: max(map(max, data.indexes)), - F.PLOT: False - } - - with _open(fname) as f: - rdr = reader(f, delimiter=";") - - rows = _clearRows(rdr) - _, header = next(rows) - - config = [] - for n, row in rows: - row = {**defaults, **dict(zip(header, row)), F.LINENUMBER: n + 1} - if row[F.VARNAME] in data: - index = data[row[F.VARNAME]].index - row = {**row, **{F.START: index.min(), F.END: index.max()}} - row = _castRow(row) - config.append(row) - - expanded = _expandVarnameWildcards(config, data) - return pd.DataFrame(expanded) - - -def checkConfig(config_df: pd.DataFrame, data: dios.DictOfSeries, flagger: BaseFlagger, nodata: float) -> pd.DataFrame: - - for _, config_row in config_df.iterrows(): - - var_name = config_row[F.VARNAME] - if pd.isnull(config_row[F.VARNAME]) or not var_name: - _raise( - config_row, SyntaxError, f"non-optional column '{F.VARNAME}' is missing or empty", - ) - - test_fields = 
config_row.filter(regex=F.TESTS).dropna() - if test_fields.empty: - _raise( - config_row, SyntaxError, f"at least one test needs to be given for variable", - ) - - for col, expr in test_fields.iteritems(): - if not expr: - _raise(config_row, SyntaxError, f"field '{col}' may not be empty") - try: - compileExpression(expr, data, var_name, flagger, nodata) - except (TypeError, NameError, SyntaxError) as exc: - _raise( - config_row, type(exc), exc.args[0] + f" (failing statement: '{expr}')", col, - ) - return config_df +from saqc.core.visitor import ConfigFunctionParser + +COMMENT = "#" +EMPTY = "None" + + +def _handleEmptyLines(df): + if F.VARNAME not in df.columns: + # at least the first line was empty, so we search the header + df = df.reset_index() + i = (df == F.VARNAME).first_valid_index() + df.columns = df.iloc[i] + df = df.iloc[i + 1:] + + # mark empty lines + mask = (df.isnull() | (df == "")).all(axis=1) + df.loc[mask] = EMPTY + return df + + +def _handleComments(df): + # mark commented lines + df.loc[df[F.VARNAME].str.startswith(COMMENT)] = EMPTY + + for col in df: + df[col] = df[col].str.split(COMMENT, expand=True).iloc[:, 0].str.strip() + + return df + + +def _injectOptionalColumns(df): + # inject optional columns + if F.PLOT not in df: + empty = (df == EMPTY).all(axis=1) + df[F.PLOT] = "False" + df[empty] = EMPTY + return df + + +def _parseConfig(df): + to_call = [] + for lineno, (_, field, expr, plot) in enumerate(df.itertuples()): + if field == "None": + continue + if pd.isnull(field): + raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing") + if pd.isnull(expr): + raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing") + tree = ast.parse(expr, mode="eval") + cp = ConfigFunctionParser(tree.body) + to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr)) + return to_call + + +def readConfig(fname): + df = pd.read_csv( + fname, + sep=r"\s*;\s*", engine="python", + dtype=str, + quoting=3, + keep_default_na=False, # don't replace "" by nan + skip_blank_lines=False, + ) + + df = _handleEmptyLines(df) + df = _injectOptionalColumns(df) + df = _handleComments(df) + + df[F.VARNAME] = df[F.VARNAME].replace(r"^\s*$", np.nan, regex=True) + df[F.TEST] = df[F.TEST].replace(r"^\s*$", np.nan, regex=True) + df[F.PLOT] = df[F.PLOT].replace({"False": "", EMPTY: ""}) + df = df.astype({F.PLOT: bool}) + df = _parseConfig(df) + + return df diff --git a/saqc/core/register.py b/saqc/core/register.py new file mode 100644 index 0000000000000000000000000000000000000000..7aa667cf2486a0b5e9b8821a39bb3c648ad90a9a --- /dev/null +++ b/saqc/core/register.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python + +from operator import itemgetter +from inspect import signature, Parameter, _VAR_POSITIONAL, _VAR_KEYWORD +from typing import Tuple, Dict, Generator, Any, Set + +import numpy as np +import pandas as pd + + +class Func: + """ + This class is basically extends functool.partial` and in + fact, most of the constructor implementation is taken + directly from the python standard lib. Not messing with the + partial class directly proved to be an easier aproach though. + + Besides the implementation of a partial like functionality + `Func` provides a couple of properties/methods used to check + the passed arguments before the actual function call happens. 
+ """ + def __init__(self, *args, **kwargs): + if len(args) < 1: + raise TypeError("'Func' takes at least one argument") + func, *args = args + if not callable(func): + raise TypeError("the first argument must be callable") + + if isinstance(func, Func): + args = func.args + args + kwargs = {**func.kwargs, **kwargs} + func = func.func + + self._signature = signature(func) + # NOTE: + # bind_partial comes with a validity check, so let's use it + self._signature.bind_partial(*args, **kwargs) + + self.__name__ = func.__name__ + self.func = func + self.args = args + self.kwargs = kwargs + + def __repr__(self): + return f"{self.__class__.__name__}({self.__name__}, {self.args}, {self.kwargs})" + + def __call__(self, *args, **kwargs): + keywords = {**self.kwargs, **kwargs} + return self.func(*self.args, *args, **keywords) + + @property + def _parameters(self) -> Generator[Tuple[str, Parameter], None, None]: + """ + yield all 'normal' parameters and their names, skipping + VAR_POSITIONALs (*args) and VAR_KEYWORDs (**kwargs) as + the don't help evaluating the correctness of the passed + arguments. + """ + for k, v in self._signature.parameters.items(): + if v.kind in (_VAR_POSITIONAL, _VAR_KEYWORD): + continue + yield k, v + + @property + def parameters(self) -> Tuple[str]: + """ + return the names of all parameters, i.e. positional + and keyword arguments without varargs + """ + return tuple(map(itemgetter(0), self._parameters)) + + @property + def optionals(self) -> Tuple[str]: + """ + return the names of all optional parameters without varargs + """ + return tuple(k for k, v in self._parameters if v.default is not Parameter.empty) + + def _getPositionals(self): + """ + return the names of all positional parameters without varargs + """ + return tuple(k for k, v in self._parameters if v.default is Parameter.empty) + + positionals = property(_getPositionals) + + def addGlobals(self, globs: Dict[str, Any]): + """ + Add the given key-value pairs to the function's global + scope. We abuse the __globals__ mechanism mainly to + make certain other functions (dis-)available within the + 'Func' body. + """ + self.func.__globals__.update(globs) + return self + + def getUnbounds(self) -> Set[str]: + """ + returns all the names of all unbound variables, + i.e. not yet `partialed` parameters + """ + return set(self.positionals[len(self.args):]) - set(self.kwargs.keys()) + + +class RegisterFunc(Func): + + """ + This class acts as a simple wrapper around all registered + functions. Currently its sole purpose is to inject additional + call arguments + """ + def __call__(self, *args, **kwargs): + # NOTE: + # injecting the function name into the + # keywords is sort of hacky + kwargs = {"func_name": self.__name__, **kwargs} + return super().__call__(*args, **kwargs) + + +class SaQCFunc(Func): + + """ + This class represents all test-, process and horminzation functions + provided through `SaQC`. Every call to an `SaQC` object will be wrapped + with all its non-dynamic arguments. 
+ + `SaQCFunc`s are callable and expose the signature `data`, `field` and + `flagger` + """ + + # NOTE: + # we should formalize the function interface somehow, somewhere + _injectables = ("data", "field", "flagger") + + def __init__(self, *args, plot=False, lineno=None, expression=None, **kwargs): + super().__init__(*args, **kwargs) + + unbound = self.getUnbounds() + if unbound: + raise TypeError(f"missing required arguments: {', '.join(unbound)}") + + self.plot = plot + self.lineno = lineno + self.expr = expression + + def _getPositionals(self) -> Tuple[int]: + """ + Returns all positional (i.e. non-optional arguments) + without the `data`, `field` and `flagger` + """ + positionals = super()._getPositionals() + return tuple(k for k in positionals if k not in self._injectables) + + positionals = property(_getPositionals) + + def __call__(self, data, field, flagger): + # NOTE: + # when assigning new variables to `data`, the respective + # field is missing in `flags`, so we add it if necessary in + # order to keep the columns from `data` and `flags` in sync + if field not in flagger.getFlags(): + flagger = flagger.merge(flagger.initFlags(data=pd.Series(name=field))) + + # NOTE: replace flagged values by nan + mask = flagger.isFlagged() + data_in = data.copy() + data_in[mask] = np.nan + + data_result, flagger_result = self.func(data_in, field, flagger, *self.args, **self.kwargs) + + # NOTE: reinject the masked values + data_result.aloc[mask] = data[mask] + + return data_result, flagger_result + + +# NOTE: +# the global SaQC function store, +# will be filled by calls to register +FUNC_MAP : Dict[str, RegisterFunc] = {} + + +def register(func, name=None): + if name is None: + name = func.__name__ + else: + func.__name__ = name + func = RegisterFunc(func) + FUNC_MAP[name] = func + return func diff --git a/saqc/core/visitor.py b/saqc/core/visitor.py new file mode 100644 index 0000000000000000000000000000000000000000..219a7054c046d3f07ce0335f56814d4859dcaef0 --- /dev/null +++ b/saqc/core/visitor.py @@ -0,0 +1,209 @@ +#! 
/usr/bin/env python +# -*- coding: utf-8 -*- + +import ast + +import numpy as np +import pandas as pd + +from saqc.core.register import FUNC_MAP +import saqc.lib.ts_operators as ts_ops + + +ENVIRONMENT = { + "NAN": np.nan, + "abs": np.abs, + "max": np.nanmax, + "min": np.nanmin, + "mean": np.nanmean, + "sum": np.nansum, + "std": np.nanstd, + "len": len, + "exp": np.exp, + "log": np.log, + "var": np.var, + "median": np.median, + "first": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").first, + "last": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").last, + "deltaT": ts_ops.deltaT, + "id": ts_ops.identity, + "diff": ts_ops.difference, + "relDiff": ts_ops.relativeDifference, + "deriv": ts_ops.derivative, + "rateOfChange": ts_ops.rateOfChange, + "scale": ts_ops.scale, + "normScale": ts_ops.normScale, + "meanStandardize": ts_ops.standardizeByMean, + "medianStandardize": ts_ops.standardizeByMedian, + "zLog": ts_ops.zeroLog +} + +RESERVED = {"GOOD", "BAD", "UNFLAGGED", "NODATA"} + + +class ConfigExpressionParser(ast.NodeVisitor): + + """ + Generic configuration functions will be rewritten as lambda functions + and variables that need a look up in `data` will act as arguments, e.g.: + `flagGeneric(func=(x != NODATA) & (y < 3))` + will be rewritten to + `lambda x, y: (x != NODATA) & (y < 3)` + + The main purpose of this class is to identify all such lambda arguments + and check the given expression for accordance with the restrictions + imposed onto generic functions. + """ + + SUPPORTED = ( + ast.Str, + ast.Expression, + ast.UnaryOp, + ast.BinOp, + ast.BitOr, + ast.BitAnd, + ast.Num, + ast.Compare, + ast.Add, + ast.Sub, + ast.Mult, + ast.Div, + ast.Pow, + ast.Mod, + ast.USub, + ast.Eq, + ast.NotEq, + ast.Gt, + ast.Lt, + ast.GtE, + ast.LtE, + ast.Invert, + ast.Name, + ast.Load, + ast.Call, + ) + + def __init__(self, node): + self._args = [] + self.visit(node) + if not self._args: + # NOTE: + # we assume, that we are not dealing with an + # expression as we couldn't find any arguments + raise TypeError("not a valid expression") + + @property + def args(self): + return tuple(dict.fromkeys(self._args)) + + def visit_Call(self, node): + # only non-keyword arguments allowed + # in generic functions + for n in node.args: + self.visit(n) + + def visit_Name(self, node): + # NOTE: + # the assumption is, that anything not in + # ENVIRONMENT + RESERVED needs a lookup in `data` + name = node.id + if name not in ENVIRONMENT and name not in RESERVED: + self._args.append(name) + self.generic_visit(node) + + def generic_visit(self, node): + if not isinstance(node, self.SUPPORTED): + raise TypeError(f"invalid expression: '{node}'") + return super().generic_visit(node) + + +class ConfigFunctionParser(ast.NodeVisitor): + + SUPPORTED_NODES = ( + ast.Call, + ast.Num, + ast.Str, + ast.keyword, + ast.NameConstant, + ast.UnaryOp, + ast.Name, + ast.Load, + ast.Expression, + ast.Subscript, + ast.Index, + ast.USub, + ast.List, + ) + + def __init__(self, node): + + self.kwargs = {} + self.func = self.visit_Call(node) + + def visit_Call(self, node): + if not isinstance(node, ast.Call): + raise TypeError("expected function call") + + if node.args: + raise TypeError("only keyword arguments are supported") + + func_name = node.func.id + if func_name not in FUNC_MAP: + raise NameError(f"unknown function '{func_name}'") + + self.generic_visit(node) + return FUNC_MAP[func_name] + + def visit_keyword(self, node): + + k, v = node.arg, node.value + check_tree = True + + # NOTE: not a constant or variable, 
should be function call + try: + visitor = ConfigExpressionParser(v) + args = ast.arguments( + posonlyargs=[], + kwonlyargs=[], + kw_defaults=[], + defaults=[], + args=[ast.arg(arg=a, annotation=None) for a in visitor.args], + kwarg=None, + vararg=None, + ) + v = ast.Lambda(args=args, body=v) + # NOTE: + # don't pass the generated functions down + # to the checks implemented in this class... + check_tree = False + except TypeError: + pass + + vnode = ast.Assign( + targets=[ast.Name(id=k, ctx=ast.Store())], value=v + ) + + # NOTE: + # in order to get concrete values out of the AST + # we compile and evaluate the keyword (e.g. max=100) + # into the dictionary `self.kwargs` + # -> after all keywords where visited we end up with + # a dictionary holding all the passed arguments as + # real python objects + co = compile( + ast.fix_missing_locations(ast.Interactive(body=[vnode])), + "<ast>", + mode="single" + ) + # NOTE: only pass a copy to not clutter the ENVIRONMENT + exec(co, {**ENVIRONMENT}, self.kwargs) + + # let's do some more validity checks + if check_tree: + self.generic_visit(v) + + def generic_visit(self, node): + if not isinstance(node, self.SUPPORTED_NODES): + raise TypeError(f"invalid node: '{node}'") + return super().generic_visit(node) + diff --git a/saqc/flagger/baseflagger.py b/saqc/flagger/baseflagger.py index d4229502658a9d7e8207aed3967d7d7ce1d2d22d..17dfff7973d47c5d2b487f7e94a568c20267b759 100644 --- a/saqc/flagger/baseflagger.py +++ b/saqc/flagger/baseflagger.py @@ -4,12 +4,13 @@ import operator as op from copy import deepcopy from abc import ABC, abstractmethod -from typing import TypeVar, Union, Any, List +from typing import TypeVar, Union, Any, List, Optional +from functools import reduce import pandas as pd import dios.dios as dios -from saqc.lib.tools import assertScalar, mergeDios +from saqc.lib.tools import assertScalar, mergeDios, toSequence COMPARATOR_MAP = { "!=": op.ne, @@ -37,7 +38,11 @@ class BaseFlagger(ABC): # NOTE: the arggumens of setFlags supported from # the configuration functions self.signature = ("flag",) - self._flags: diosT = dios.DictOfSeries() + self._flags: Optional[diosT] = None + + @property + def initialized(self): + return self._flags is not None @property def flags(self): @@ -66,7 +71,7 @@ class BaseFlagger(ABC): newflagger._flags = flags.astype(self.dtype) return newflagger - def setFlagger(self, other: BaseFlaggerT, join: str = "merge"): + def merge(self, other: BaseFlaggerT, join: str = "merge"): """ Merge the given flagger 'other' into self """ @@ -79,7 +84,7 @@ class BaseFlagger(ABC): ) return newflagger - def getFlagger(self, field: FieldsT = None, loc: LocT = None, drop: FieldsT = None) -> BaseFlaggerT: + def slice(self, field: FieldsT = None, loc: LocT = None, drop: FieldsT = None) -> BaseFlaggerT: """ Return a potentially trimmed down copy of self. """ if drop is not None: if field is not None: @@ -148,14 +153,48 @@ class BaseFlagger(ABC): return self.setFlags(field=field, loc=loc, flag=self.UNFLAGGED, force=True, **kwargs) def isFlagged(self, field=None, loc: LocT = None, flag: FlagT = None, comparator: str = ">") -> PandasT: - assertScalar("flag", flag, optional=True) - flag = self.GOOD if flag is None else flag + """ + Returns boolean data that indicate where data has been flagged. + + Parameters + ---------- + field : str, list-like, default None + The field(s)/column(s) of the data to be tested if flagged. + If None all columns are used. 
+ + loc : mask, slice, pd.Index, etc., default None + The location/rows of the data to be tested if flagged. + If None all rows are used. + + flag : str, category, list-like, default None + The flag(s) that define data as flagged. If None, `flagger.GOOD` + is used. + + comparator : {'<', '<=', '==', '!=', '>=', '>'}, default '>' + Defines how the comparison is done. The `flags` are always on the + left-hand-side, thus, the flag to compare is always on the right- + hand-side. Eg. a call with all defaults, return the equivalent + of `flagger.getFlags() > flagger.GOOD` + + Returns + ------- + pandas.Series or dios.DictOfSeries : Return Series if field is a scalar, + otherwise DictOfSeries. + """ + if isinstance(flag, pd.Series): + raise TypeError("flag: pd.Series is not allowed") + checkflags = set(toSequence(flag, self.GOOD)) + flags = self.getFlags(field, loc) cp = COMPARATOR_MAP[comparator] - # use notna() to prevent nans to become True, - # like in: np.nan != 0 -> True - flagged = flags.notna() & cp(flags, flag) + # notna() to prevent nans to become True, eg.: `np.nan != 0 -> True` + flagged = flags.notna() + for f in checkflags: + if not self.isValidFlag(f): + raise ValueError(f"invalid flag: {f}") + flagged &= cp(flags, f) + return flagged def copy(self, flags=None) -> BaseFlaggerT: @@ -164,6 +203,24 @@ class BaseFlagger(ABC): out._flags = flags return out + def isValidFlag(self, flag: FlagT) -> bool: + """ + Check if given flag is known to this flagger. + + Parameters + ---------- + flag: str + The flag to be checked. + + Returns + ------- + bool + """ + # This is a very rudimentary fallback for the check + # and the child flagger may should implement a better + # version of it. + return flag == self.BAD or flag == self.GOOD or flag == self.UNFLAGGED or self.isSUSPICIOUS(flag) + def _check_field(self, field): """ Check if (all) field(s) in self._flags. """ diff --git a/saqc/flagger/dmpflagger.py b/saqc/flagger/dmpflagger.py index 832a9684d1cdfedfc3fd8789aa00ca9b3412fd08..a01ba9661e2ca663b0590c2fea6c4fe9b55aa751 100644 --- a/saqc/flagger/dmpflagger.py +++ b/saqc/flagger/dmpflagger.py @@ -6,6 +6,8 @@ import json from copy import deepcopy from typing import TypeVar +import pandas as pd + import dios.dios as dios from saqc.flagger.categoricalflagger import CategoricalFlagger @@ -54,6 +56,16 @@ class DmpFlagger(CategoricalFlagger): def comments(self): return self._comments + def getFlagsAll(self): + out = pd.concat( + [self._flags.to_df(), self._causes.to_df(), self._comments.to_df()], + axis=1, + keys=[FlagFields.FLAG, FlagFields.CAUSE, FlagFields.COMMENT] + ) + out = (out.reorder_levels(order=[1, 0], axis=1) + .sort_index(axis=1, level=0, sort_remaining=False)) + return out + def initFlags(self, data: dios.DictOfSeries = None, flags: dios.DictOfSeries = None): """ initialize a flagger based on the given 'data' or 'flags' @@ -68,16 +80,16 @@ class DmpFlagger(CategoricalFlagger): newflagger._causes[:], newflagger._comments[:] = "", "" return newflagger - def getFlagger(self, field=None, loc=None, drop=None): - newflagger = super().getFlagger(field=field, loc=loc, drop=drop) + def slice(self, field=None, loc=None, drop=None): + newflagger = super().slice(field=field, loc=loc, drop=drop) flags = newflagger.flags newflagger._causes = self._causes.aloc[flags, ...] newflagger._comments = self._comments.aloc[flags, ...] 
return newflagger - def setFlagger(self, other: DmpFlaggerT, join: str="merge"): + def merge(self, other: DmpFlaggerT, join: str= "merge"): assert isinstance(other, DmpFlagger) - out = super().setFlagger(other, join) + out = super().merge(other, join) out._causes = mergeDios(out._causes, other._causes, join=join) out._comments = mergeDios(out._comments, other._comments, join=join) return out diff --git a/saqc/funcs/__init__.py b/saqc/funcs/__init__.py index dd99e37239d85533f38cd5a0780087f96c8335db..62d7f1c01f24b520af566dd3e84656a77cba5e6e 100644 --- a/saqc/funcs/__init__.py +++ b/saqc/funcs/__init__.py @@ -2,10 +2,10 @@ # -*- coding: utf-8 -*- # imports needed to make the functions register themself -from .register import register -from .functions import * -from .breaks_detection import * -from .constants_detection import * -from .soil_moisture_tests import * -from .spikes_detection import * -from .harm_functions import * +from saqc.core.register import register +from saqc.funcs.functions import * +from saqc.funcs.breaks_detection import * +from saqc.funcs.constants_detection import * +from saqc.funcs.soil_moisture_tests import * +from saqc.funcs.spikes_detection import * +from saqc.funcs.harm_functions import * diff --git a/saqc/funcs/breaks_detection.py b/saqc/funcs/breaks_detection.py index 7bc7823da12950c55262b5f4376cb51b85612292..ec507f5dd4a24687c8693ef1bff1a30a8fcef312 100644 --- a/saqc/funcs/breaks_detection.py +++ b/saqc/funcs/breaks_detection.py @@ -6,11 +6,11 @@ import pandas as pd from scipy.signal import savgol_filter -from saqc.funcs.register import register +from saqc.core.register import register from saqc.lib.tools import retrieveTrustworthyOriginal -@register() +@register def breaks_flagSpektrumBased( data, field, diff --git a/saqc/funcs/constants_detection.py b/saqc/funcs/constants_detection.py index d6577d61a552b660f2d787f4ef40a8f2af89ff8f..d5c9da71f0fce9b12792157a1bfa683dedb7465b 100644 --- a/saqc/funcs/constants_detection.py +++ b/saqc/funcs/constants_detection.py @@ -4,12 +4,12 @@ import numpy as np import pandas as pd -from saqc.funcs.register import register +from saqc.core.register import register from saqc.lib.ts_operators import varQC from saqc.lib.tools import retrieveTrustworthyOriginal -@register() +@register def constants_flagBasic(data, field, flagger, thresh, window, **kwargs): """ Flag values are (semi-)constant. @@ -39,7 +39,7 @@ def constants_flagBasic(data, field, flagger, thresh, window, **kwargs): return data, flagger -@register() +@register def constants_flagVarianceBased( data, field, flagger, window="12h", thresh=0.0005, max_missing=None, max_consec_missing=None, **kwargs ): diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py index 2a33e2d592772ad0112f4c9ecc7a97afab3d8d2d..04381c7442cd834725773f35038059ec48f2a014 100644 --- a/saqc/funcs/functions.py +++ b/saqc/funcs/functions.py @@ -1,6 +1,8 @@ #! 
/usr/bin/env python # -*- coding: utf-8 -*- +from functools import partial + import numpy as np import pandas as pd import dtw @@ -10,43 +12,93 @@ import datetime from saqc.lib.tools import groupConsecutives, sesonalMask -from saqc.funcs.register import register - - -@register() -def procGeneric(data, field, flagger, func, **kwargs): - data[field] = func.squeeze() +from saqc.core.register import register, Func +from saqc.core.visitor import ENVIRONMENT +from dios import DictOfSeries +from typing import Any + + +def _dslIsFlagged(flagger, var, flag=None, comparator=None): + """ + helper function for `flagGeneric` + """ + if comparator is None: + return flagger.isFlagged(var.name, flag=flag) + return flagger.isFlagged(var.name, flag=flag, comparator=comparator) + + +def _execGeneric(flagger, data, func, field, nodata): + # TODO: + # - check series.index compatibility + # - field is only needed to translate 'this' parameters + # -> maybe we could do the translation on the tree instead + + func = Func(func) + for k in func.parameters: + k = field if k == "this" else k + if k not in data: + raise NameError(f"variable '{k}' not found") + func = Func(func, data[k]) + + globs = { + "isflagged": partial(_dslIsFlagged, flagger), + "ismissing": lambda var: ((var == nodata) | pd.isnull(var)), + "this": field, + "NODATA": nodata, + "GOOD": flagger.GOOD, + "BAD": flagger.BAD, + "UNFLAGGED": flagger.UNFLAGGED, + **ENVIRONMENT + } + func = func.addGlobals(globs) + return func() + + +@register +def procGeneric(data, field, flagger, func, nodata=np.nan, **kwargs): + """ + Execute generic functions. + The **kwargs are needed to satisfy the test-function interface, + although they are of no use here. Usually they are abused to + transport the name of the test function (here: `procGeneric`) + into the flagger, but as we don't set flags here, we simply + ignore them + """ + data[field] = _execGeneric(flagger, data, func, field, nodata).squeeze() # NOTE: # The flags to `field` will be (re-)set to UNFLAGGED - - # PROBLEM: - # flagger.setFlagger merges the given flaggers, if + # That leads to the following problem: + # flagger.merge merges the given flaggers, if # `field` did already exist before the call to `procGeneric` # but with a differing index, we end up with: # len(data[field]) != len(flagger.getFlags(field)) # see: test/funcs/test_generic_functions.py::test_procGenericMultiple - flagger = flagger.setFlagger(flagger.initFlags(data[field])) + + # TODO: + # We need a way to simply overwrite a given flagger column, maybe + # an optional keyword to merge ? 
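+    # until then: re-initialize the column (all UNFLAGGED, indexed like the
+    # new `data[field]`) and merge it back, so flags and data align again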
+ flagger = flagger.merge(flagger.initFlags(data[field])) return data, flagger -@register() -def flagGeneric(data, field, flagger, func, **kwargs): +@register +def flagGeneric(data, field, flagger, func, nodata=np.nan, **kwargs): # NOTE: # The naming of the func parameter is pretty confusing # as it actually holds the result of a generic expression - mask = func.squeeze() + mask = _execGeneric(flagger, data, func, field, nodata).squeeze() if np.isscalar(mask): raise TypeError(f"generic expression does not return an array") if not np.issubdtype(mask.dtype, np.bool_): raise TypeError(f"generic expression does not return a boolean array") if flagger.getFlags(field).empty: - flagger = flagger.setFlagger(flagger.initFlags(data=pd.Series(name=field, index=mask.index))) + flagger = flagger.merge(flagger.initFlags(data=pd.Series(name=field, index=mask.index))) flagger = flagger.setFlags(field, mask, **kwargs) return data, flagger -@register() +@register def flagRange(data, field, flagger, min, max, **kwargs): # using .values is very much faster datacol = data[field].values @@ -56,7 +108,6 @@ def flagRange(data, field, flagger, min, max, **kwargs): - @register() def flagPattern(data, field, flagger, reference_field, method = 'dtw', partition_freq = "days", partition_offset = 0, max_distance = 0.03, normalized_distance = True, widths = [1,2,4,8], waveform = 'mexh', **kwargs): @@ -140,7 +191,7 @@ def flagPattern(data, field, flagger, reference_field, method = 'dtw', partition -@register() +@register def flagMissing(data, field, flagger, nodata=np.nan, **kwargs): datacol = data[field] if np.isnan(nodata): @@ -152,9 +203,9 @@ def flagMissing(data, field, flagger, nodata=np.nan, **kwargs): return data, flagger -@register() +@register def flagSesonalRange( - data, field, flagger, min, max, startmonth=1, endmonth=12, startday=1, endday=31, **kwargs, + data, field, flagger, min, max, startmonth=1, endmonth=12, startday=1, endday=31, **kwargs, ): smask = sesonalMask(data[field].index, startmonth, startday, endmonth, endday) @@ -162,32 +213,31 @@ def flagSesonalRange( if d.empty: return data, flagger - _, flagger_range = flagRange(d, field, flagger.getFlagger(loc=d[field].index), min=min, max=max, **kwargs) + _, flagger_range = flagRange(d, field, flagger.slice(loc=d[field].index), min=min, max=max, **kwargs) if not flagger_range.isFlagged(field).any(): return data, flagger - flagger = flagger.setFlagger(flagger_range) + flagger = flagger.merge(flagger_range) return data, flagger -@register() +@register def clearFlags(data, field, flagger, **kwargs): flagger = flagger.clearFlags(field, **kwargs) return data, flagger -@register() +@register def forceFlags(data, field, flagger, flag, **kwargs): flagger = flagger.clearFlags(field).setFlags(field, flag=flag, **kwargs) return data, flagger -@register() +@register def flagIsolated( - data, field, flagger, gap_window, group_window, **kwargs, + data, field, flagger, gap_window, group_window, **kwargs, ): - gap_window = pd.tseries.frequencies.to_offset(gap_window) group_window = pd.tseries.frequencies.to_offset(group_window) @@ -200,9 +250,9 @@ def flagIsolated( start = srs.index[0] stop = srs.index[-1] if stop - start <= group_window: - left = mask[start - gap_window : start].iloc[:-1] + left = mask[start - gap_window: start].iloc[:-1] if left.all(): - right = mask[stop : stop + gap_window].iloc[1:] + right = mask[stop: stop + gap_window].iloc[1:] if right.all(): flags[start:stop] = True @@ -211,6 +261,151 @@ def flagIsolated( return data, flagger -@register() 
+@register
 def flagDummy(data, field, flagger, **kwargs):
+    """ Do nothing """
+    return data, flagger
+
+
+@register
+def flagManual(data, field, flagger, mdata, mflag: Any = 1, method='plain', **kwargs):
+    """ Flag data by given manual data.
+
+    The data is flagged at locations where `mdata` is equal to a provided flag (`mflag`).
+    The format of mdata can be an indexed object, like pd.Series, pd.DataFrame or dios.DictOfSeries,
+    but it can also be a plain list- or array-like.
+    How indexed mdata is aligned to data is specified via the `method` argument.
+
+    Parameters
+    ----------
+    data : DictOfSeries
+    field : str
+        The field chooses the column in flags and data in question.
+        It also determines the column in mdata, if mdata is of type pd.DataFrame or dios.DictOfSeries.
+
+    flagger : flagger
+
+    mdata : {pd.Series, pd.DataFrame, DictOfSeries, str}
+        The manual data
+
+    mflag : scalar
+        The flag that indicates data points in `mdata` that should be flagged.
+
+    method : {'plain', 'ontime', 'left-open', 'right-open'}, default plain
+        Defines how mdata is applied to data. Except for 'plain', mdata must have an index.
+        * 'plain': mdata must have the same length as data and is applied one-to-one on data.
+        * 'ontime': works only with indexed mdata; it is applied where the timestamps match.
+        * 'right-open': mdata defines periods bounded by two consecutive timestamps; the
+            value of the first (left) timestamp is applied to the whole period.
+        * 'left-open': like 'right-open', but the value is taken from the latter (right) timestamp.
+
+    kwargs : Any
+        passed to flagger
+
+    Returns
+    -------
+    data, flagger: original data, modified flagger
+
+    Examples
+    --------
+    An example for mdata
+    >>> mdata = pd.Series([1,0,1], index=pd.to_datetime(['2000-02', '2000-03', '2001-05']))
+    >>> mdata
+    2000-02-01    1
+    2000-03-01    0
+    2001-05-01    1
+    dtype: int64
+
+    On *daily* data, with the 'ontime' method, only the provided timestamps are used.
+    Bear in mind that only exact timestamps apply; any offset will result in ignoring
+    the timestamp.
+    >>> _, fl = flagManual(data, field, flagger, mdata, mflag=1, method='ontime')
+    >>> fl.isFlagged(field)
+    2000-01-31    False
+    2000-02-01    True
+    2000-02-02    False
+    2000-02-03    False
+    ..            ..
+    2000-02-29    False
+    2000-03-01    True
+    2000-03-02    False
+    Freq: D, dtype: bool
+
+    With the 'right-open' method, the mdata is forward filled:
+    >>> _, fl = flagManual(data, field, flagger, mdata, mflag=1, method='right-open')
+    >>> fl.isFlagged(field)
+    2000-01-31    False
+    2000-02-01    True
+    2000-02-02    True
+    ..            ..
+    2000-02-29    True
+    2000-03-01    False
+    2000-03-02    False
+    Freq: D, dtype: bool
+
+    With the 'left-open' method, backward filling is used:
+    >>> _, fl = flagManual(data, field, flagger, mdata, mflag=1, method='left-open')
+    >>> fl.isFlagged(field)
+    2000-01-31    False
+    2000-02-01    False
+    2000-02-02    True
+    ..            ..
+ 2000-02-29 True + 2000-03-01 True + 2000-03-02 False + Freq: D, dtype: bool + """ + dat = data[field] + if isinstance(mdata, str): + # todo import path type in mdata, use + # s = pd.read_csv(mdata, index_col=N, usecol=[N,N,..]) <- use positional + # use a list-arg in config to get the columns + # at last, fall throug to next checks + raise NotImplementedError("giving a path is currently not supported") + + if isinstance(mdata, (pd.DataFrame, DictOfSeries)): + mdata = mdata[field] + + hasindex = isinstance(mdata, (pd.Series, pd.DataFrame, DictOfSeries)) + if not hasindex and method != 'plain': + raise ValueError("mdata has no index") + + if method == 'plain': + if hasindex: + mdata = mdata.to_numpy() + if len(mdata) != len(dat): + raise ValueError('mdata must have same length then data') + mdata = pd.Series(mdata, index=dat.index) + elif method == 'ontime': + pass # reindex will do the job later + elif method in ['left-open', 'right-open']: + mdata = mdata.reindex(dat.index.union(mdata.index)) + + # -->)[t0-->)[t1--> (ffill) + if method == 'right-open': + mdata = mdata.ffill() + + # <--t0](<--t1](<-- (bfill) + if method == 'left-open': + mdata = mdata.bfill() + else: + raise ValueError(method) + + mask = mdata == mflag + mask = mask.reindex(dat.index).fillna(False) + flagger = flagger.setFlags(field=field, loc=mask, **kwargs) + return data, flagger + + +@register +def flagCrossScoring(data, field, flagger, fields, thresh, cross_stat=np.median, **kwargs): + val_frame = data.loc[data.index_of('shared')].to_df() + try: + stat = getattr(val_frame, cross_stat.__name__)(axis=1) + except AttributeError: + stat = val_frame.aggregate(cross_stat, axis=1) + diff_scores = val_frame.subtract(stat, axis=0).abs() + diff_scores = diff_scores > thresh + for var in fields: + flagger = flagger.setFlags(var, diff_scores[var].values, **kwargs) return data, flagger diff --git a/saqc/funcs/harm_functions.py b/saqc/funcs/harm_functions.py index 42ef0200c3210e4f7e34b83857876583a2f19e8a..32f0f696b1bd785614c68f434fd75fec1056accc 100644 --- a/saqc/funcs/harm_functions.py +++ b/saqc/funcs/harm_functions.py @@ -6,9 +6,8 @@ import numpy as np import logging import dios -from saqc.funcs.functions import flagMissing -from saqc.funcs.register import register -from saqc.lib.tools import toSequence, getFuncFromInput +from saqc.core.register import register +from saqc.lib.tools import toSequence logger = logging.getLogger("SaQC") @@ -47,10 +46,10 @@ def harmWrapper(heap={}): freq, inter_method, reshape_method, - inter_agg="mean", + inter_agg=np.nanmean, inter_order=1, inter_downcast=False, - reshape_agg="max", + reshape_agg=np.nanmax, reshape_missing_flag=None, reshape_shift_comment=False, drop_flags=None, @@ -58,12 +57,9 @@ def harmWrapper(heap={}): **kwargs, ): data = data.copy() - # get funcs from strings: - inter_agg = getFuncFromInput(inter_agg) - reshape_agg = getFuncFromInput(reshape_agg) # get data of variable - flagger_merged = flagger.getFlagger(field=field) + flagger_merged = flagger.slice(field=field) dat_col = data[field] # now we send the flags frame in its current shape to the future: @@ -78,6 +74,9 @@ def harmWrapper(heap={}): # now we can manipulate it without loosing information gathered before harmonization dat_col, flagger_merged_clean, _ = _outsortCrap(dat_col, field, flagger_merged, drop_flags=drop_flags) + if dat_col.empty: + return data, flagger + # interpolation! 
(yeah) dat_col, chunk_bounds = _interpolateGrid( dat_col, @@ -128,14 +127,13 @@ def harmWrapper(heap={}): resolve_method = HARM_2_DEHARM[harm_info[Heap.METHOD]] # retrieve data and flags from the merged saqc-conform data frame (and by that get rid of blow-up entries). - flagger_harmony = flagger.getFlagger(field=field) + flagger_harmony = flagger.slice(field=field) dat_col = data[field] # reconstruct the drops that were performed before harmonization _, flagger_original_clean, drop_mask = _outsortCrap( dat_col, field, harm_info[Heap.FLAGGER], drop_flags=harm_info[Heap.DROP] ) - drops = flagger.getFlags(field=field, loc=drop_mask) # with reconstructed pre-harmonization flags-frame -> perform the projection of the flags calculated for # the harmonized timeseries, onto the original timestamps @@ -165,8 +163,9 @@ def harmWrapper(heap={}): harm_harmonize, harm_deharmonize = harmWrapper() -register()(harm_harmonize) -register()(harm_deharmonize) +register(harm_harmonize, name='harm_harmonize') +register(harm_deharmonize, name='harm_deharmonize') + # (de-)harmonize helper @@ -195,7 +194,7 @@ def _outsortCrap( for drop_flag in drop_flags: drop_mask = drop_mask | flagger.isFlagged(field, flag=drop_flag, comparator="==") - flagger_out = flagger.getFlagger(loc=~drop_mask) + flagger_out = flagger.slice(loc=~drop_mask) return data[~drop_mask], flagger_out, drop_mask @@ -687,85 +686,17 @@ def _backtrackFlags(flagger_harmony, flagger_original_clean, flagger_original, f return flagger_original.initFlags(flags=res) -def _fromMerged(data, flagger, fieldname): - # we need a not-na mask for the flags data to be retrieved: - mask = flagger.getFlags(fieldname).notna() - return data.loc[mask[mask].index, fieldname], flagger.getFlagger(field=fieldname, loc=mask) - -def _toMerged(data, flagger, fieldname, data_to_insert, flagger_to_insert, target_index=None, **kwargs): - - data = data.copy() - flags = flagger._flags - flags_to_insert = flagger_to_insert._flags - - # this should never happen, but if this could happen in general, - # the caller have to ensure, that we get a dios - assert not isinstance(data, pd.Series) - - newcols = data.columns.difference([fieldname]) - data = data[newcols] - flags = flags[newcols] - - # first case: there is no data, the data-to-insert would have - # to be merged with, and also are we not deharmonizing: - if data.empty and target_index is None: - return data, flagger_to_insert - - - # if thats not the case: generate the drop mask for the remaining data: - - # the following is not implemented in dios, but as soon as it is done, - # we should use it. 
wait for #21 see: https://git.ufz.de/rdm/dios/issues/21 - # mask = data.isna().all(axis=1) - # workaround: - nans = data.isna() - common_nans_index = nans[nans].index_of('shared') - - # we only want to drop lines, that do not have to be re-inserted in the merge: - drops = common_nans_index.difference(data_to_insert.index) - # clear mask, but keep index - mask = data.copy() - mask[:] = True - # final mask: - mask[drops] = False - - # if we are not "de-harmonizing": - if target_index is None: - # erase nan rows in the data, that became redundant because of harmonization and merge with data-to-insert: - data = pd.merge(data[mask], data_to_insert, how="outer", left_index=True, right_index=True) - flags = pd.merge(flags[mask], flags_to_insert, how="outer", left_index=True, right_index=True) - return data, flagger.initFlags(flags=flags) - - else: - # trivial case: there is only one variable ("reindexing to make sure shape matches pre-harm shape"): - if data.empty: - data = data_to_insert.reindex(target_index).to_frame(name=fieldname) - flags = flags_to_insert.reindex(target_index, fill_value=flagger.UNFLAGGED) - return data, flagger.initFlags(flags=flags) - # annoying case: more than one variables: - # erase nan rows resulting from harmonization but keep/regain those, that were initially present in the data: - new_index = data[mask].index.join(target_index, how="outer") - data = data.reindex(new_index) - flags = flags.reindex(new_index, fill_value=flagger.UNFLAGGED) - data = pd.merge(data, data_to_insert, how="outer", left_index=True, right_index=True) - flags = pd.merge(flags, flags_to_insert, how="outer", left_index=True, right_index=True) - - # internally harmonization memorizes its own manipulation by inserting nan flags - - # those we will now assign the flagger.bad flag by the "missingTest": - return flagMissing(data, fieldname, flagger.initFlags(flags=flags), nodata=np.nan, **kwargs) - - -@register() +@register def harm_shift2Grid(data, field, flagger, freq, method="nshift", drop_flags=None, **kwargs): return harm_harmonize( data, field, flagger, freq, inter_method=method, reshape_method=method, drop_flags=drop_flags, **kwargs, ) -@register() +@register def harm_aggregate2Grid( - data, field, flagger, freq, value_func, flag_func="max", method="nagg", drop_flags=None, **kwargs, + data, field, flagger, freq, value_func, flag_func=np.nanmax, method="nagg", drop_flags=None, **kwargs, ): return harm_harmonize( data, @@ -781,8 +712,8 @@ def harm_aggregate2Grid( ) -@register() -def harm_linear2Grid(data, field, flagger, freq, method="nagg", func="max", drop_flags=None, **kwargs): +@register +def harm_linear2Grid(data, field, flagger, freq, method="nagg", func=np.nanmax, drop_flags=None, **kwargs): return harm_harmonize( data, field, @@ -796,9 +727,9 @@ def harm_linear2Grid(data, field, flagger, freq, method="nagg", func="max", drop ) -@register() +@register def harm_interpolate2Grid( - data, field, flagger, freq, method, order=1, flag_method="nagg", flag_func="max", drop_flags=None, **kwargs, + data, field, flagger, freq, method, order=1, flag_method="nagg", flag_func=np.nanmax, drop_flags=None, **kwargs, ): return harm_harmonize( data, @@ -814,45 +745,38 @@ def harm_interpolate2Grid( ) -@register() +@register def harm_downsample( data, field, flagger, sample_freq, agg_freq, - sample_func="mean", - agg_func="mean", + sample_func=np.nanmean, + agg_func=np.nanmean, invalid_flags=None, max_invalid=None, **kwargs, ): - agg_func = getFuncFromInput(agg_func) if max_invalid is None: max_invalid = 
np.inf - if sample_func is not None: - sample_func = getFuncFromInput(sample_func) # define the "fastest possible" aggregator if sample_func is None: if max_invalid < np.inf: - def aggregator(x): if x.isna().sum() < max_invalid: return agg_func(x) else: return np.nan - else: - def aggregator(x): return agg_func(x) else: - dummy_resampler = pd.Series(np.nan, index=[pd.Timedelta("1min")]).resample("1min") if hasattr(dummy_resampler, sample_func.__name__): @@ -870,7 +794,6 @@ def harm_downsample( def aggregator(x): return agg_func(getattr(x.resample(sample_freq), sample_func_name)()) - else: if max_invalid < np.inf: @@ -880,9 +803,7 @@ def harm_downsample( return agg_func(y) else: return np.nan - else: - def aggregator(x): return agg_func(x.resample(sample_freq).apply(sample_func)) @@ -894,7 +815,7 @@ def harm_downsample( inter_method="bagg", reshape_method="bagg_no_deharm", inter_agg=aggregator, - reshape_agg="max", + reshape_agg=np.nanmax, drop_flags=invalid_flags, **kwargs, ) diff --git a/saqc/funcs/register.py b/saqc/funcs/register.py deleted file mode 100644 index b973b71b6952794e4adacdb55f1b6bb33572fd66..0000000000000000000000000000000000000000 --- a/saqc/funcs/register.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python - -from functools import partial -from inspect import signature, _VAR_KEYWORD - - -class Partial(partial): - def __init__(self, func, *args, **kwargs): - self._signature = signature(func) - - @property - def signature(self): - out = [] - for k, v in self._signature.parameters.items(): - if v.kind != _VAR_KEYWORD: - out.append(k) - return tuple(out) - - -# NOTE: will be filled by calls to register -FUNC_MAP = {} - - -def register(): - def outer(func): - name = func.__name__ - func = Partial(func, func_name=name) - FUNC_MAP[name] = func - - def inner(*args, **kwargs): - return func(*args, **kwargs) - - return inner - - return outer diff --git a/saqc/funcs/soil_moisture_tests.py b/saqc/funcs/soil_moisture_tests.py index 87e0afde36aee3b529956f7aad7e49220046ce62..28793b59a02997c778908bf299f6d65361d4a970 100644 --- a/saqc/funcs/soil_moisture_tests.py +++ b/saqc/funcs/soil_moisture_tests.py @@ -10,11 +10,11 @@ from scipy.signal import savgol_filter from saqc.funcs.breaks_detection import breaks_flagSpektrumBased from saqc.funcs.spikes_detection import spikes_flagSpektrumBased from saqc.funcs.constants_detection import constants_flagVarianceBased -from saqc.funcs.register import register +from saqc.core.register import register from saqc.lib.tools import retrieveTrustworthyOriginal -@register() +@register def sm_flagSpikes( data, field, @@ -51,7 +51,7 @@ def sm_flagSpikes( ) -@register() +@register def sm_flagBreaks( data, field, @@ -92,7 +92,7 @@ def sm_flagBreaks( ) -@register() +@register def sm_flagFrost(data, field, flagger, soil_temp_variable, window="1h", frost_thresh=0, **kwargs): """This Function is an implementation of the soil temperature based Soil Moisture flagging, as presented in: @@ -139,7 +139,7 @@ def sm_flagFrost(data, field, flagger, soil_temp_variable, window="1h", frost_th return data, flagger -@register() +@register def sm_flagPrecipitation( data, field, @@ -246,7 +246,7 @@ def sm_flagPrecipitation( return data, flagger -@register() +@register def sm_flagConstants( data, field, @@ -340,7 +340,7 @@ def sm_flagConstants( return data, flagger -@register() +@register def sm_flagRandomForest(data, field, flagger, references, window_values: int, window_flags: int, path: str, **kwargs): """ This Function uses pre-trained machine-learning model objects for 
flagging of a specific variable. The model is diff --git a/saqc/funcs/spikes_detection.py b/saqc/funcs/spikes_detection.py index 3cf93fdae8f924a272a04a9f531d3a1092c8f029..80115b4a026d9219f08d5f5b7d7194e8abe74dcd 100644 --- a/saqc/funcs/spikes_detection.py +++ b/saqc/funcs/spikes_detection.py @@ -7,7 +7,7 @@ import pandas as pd from scipy.signal import savgol_filter from scipy.stats import zscore from scipy.optimize import curve_fit -from saqc.funcs.register import register +from saqc.core.register import register import numpy.polynomial.polynomial as poly import numba import saqc.lib.ts_operators as ts_ops @@ -16,7 +16,6 @@ from saqc.lib.tools import ( offset2seconds, slidingWindowIndices, findIndex, - composeFunction ) @@ -135,13 +134,12 @@ def _expFit(val_frame, scoring_method='kNNMaxGap', n_neighbors=10, iter_start=0. return val_frame.index[sorted_i[iter_index:]] -@register() +@register def spikes_flagMultivarScores(data, field, flagger, fields, trafo='normScale', alpha=0.05, n_neighbors=10, scoring_method='kNNMaxGap', iter_start=0.5, threshing='stray', expfit_binning='auto', stray_partition=None, stray_partition_min=0, **kwargs): - trafo = composeFunction(trafo.split(',')) # data fransformation/extraction val_frame = data[fields[0]] @@ -178,7 +176,7 @@ def spikes_flagMultivarScores(data, field, flagger, fields, trafo='normScale', a -@register() +@register def spikes_flagRaise( data, field, flagger, thresh, raise_window, intended_freq, average_window=None, mean_raise_factor=2, min_slope=None, min_slope_weight=0.8, numba_boost=True, **kwargs): @@ -273,7 +271,7 @@ def spikes_flagRaise( return data, flagger -@register() +@register def spikes_flagSlidingZscore( data, field, flagger, window, offset, count=1, polydeg=1, z=3.5, method="modZ", **kwargs, ): @@ -397,7 +395,7 @@ def spikes_flagSlidingZscore( return data, flagger -@register() +@register def spikes_flagMad(data, field, flagger, window, z=3.5, **kwargs): """ The function represents an implementation of the modyfied Z-score outlier detection method, as introduced here: @@ -435,7 +433,7 @@ def spikes_flagMad(data, field, flagger, window, z=3.5, **kwargs): return data, flagger -@register() +@register def spikes_flagBasic(data, field, flagger, thresh=7, tolerance=0, window="15min", **kwargs): """ A basic outlier test that is designed to work for harmonized and not harmonized data. @@ -521,7 +519,7 @@ def spikes_flagBasic(data, field, flagger, thresh=7, tolerance=0, window="15min" return data, flagger -@register() +@register def spikes_flagSpektrumBased( data, field, diff --git a/saqc/lib/plotting.py b/saqc/lib/plotting.py index 92a0667331ceec4064b4466305a5199808e5c8c1..bbe898b98850d55951d177cf3a43171497f09d04 100644 --- a/saqc/lib/plotting.py +++ b/saqc/lib/plotting.py @@ -156,9 +156,11 @@ def _plotMultipleVariables( tlen = len(targets) tgen = (t for t in targets) - nfig, ncols_rest = divmod(tlen, 5) - ncols = [4] * nfig + [ncols_rest] - nfig += 1 + nfig, ncols_rest = divmod(tlen, 4) + ncols = [4] * nfig + if ncols_rest: + nfig += 1 + ncols += [ncols_rest] gs_kw = dict(width_ratios=_layout_data_to_table_ratio) layout = dict( diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index d6185de75ce930ba431da285ed72f1673c5fa830..ec438b598fae98e474b59b81c19f9d4ab94bd7ce 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -1,92 +1,92 @@ #! 
/usr/bin/env python # -*- coding: utf-8 -*- +import re from typing import Sequence, Union, Any, Iterator import numpy as np import numba as nb -import saqc.lib.ts_operators as ts_ops -import scipy -import sklearn -from functools import reduce, partial import pandas as pd + import dios # from saqc.flagger import BaseFlagger from saqc.lib.types import T -SAQC_OPERATORS = { - "exp": np.exp, - "log": np.log, - "sum": np.sum, - "var": np.var, - "std": np.std, - "mean": np.mean, - "median": np.median, - "min": np.min, - "max": np.max, - "first": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").first, - "last": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").last, - "deltaT": ts_ops.deltaT, - "id": ts_ops.identity, - "diff": ts_ops.difference, - "relDiff": ts_ops.relativeDifference, - "deriv": ts_ops.derivative, - "rateOfChange": ts_ops.rateOfChange, - "scale": ts_ops.scale, - "normScale": ts_ops.normScale, - "meanStandardize": ts_ops.standardizeByMean, - "medianStandardize": ts_ops.standardizeByMedian, - "zLog": ts_ops.zeroLog -} - - -OP_MODULES = {'pd': pd, - 'np': np, - 'scipy': scipy, - 'sklearn': sklearn - } - - -def evalFuncString(func_string): - if not isinstance(func_string, str): - return func_string - module_dot = func_string.find(".") - first, *rest = func_string.split(".") - if rest: - module = func_string[:module_dot] - try: - return reduce(lambda m, f: getattr(m, f), rest, OP_MODULES[first]) - except KeyError: - availability_list = [f"'{k}' (= {s.__name__})" for k, s in OP_MODULES.items()] - availability_list = " \n".join(availability_list) - raise ValueError( - f'The external-module alias "{module}" is not known to the internal operators dispatcher. ' - f"\n Please select from: \n{availability_list}" - ) - - else: - if func_string in SAQC_OPERATORS: - return SAQC_OPERATORS[func_string] - else: - availability_list = [f"'{k}' (= {s.__name__})" for k, s in SAQC_OPERATORS.items()] - availability_list = " \n".join(availability_list) - raise ValueError( - f'The external-module alias "{func_string}" is not known to the internal operators ' - f"dispatcher. 
\n Please select from: \n{availability_list}" - ) - - -def composeFunction(functions): - if callable(functions): - return functions - functions = toSequence(functions) - functions = [evalFuncString(f) for f in functions] - - def composed(ts, funcs=functions): - return reduce(lambda x, f: f(x), reversed(funcs), ts) - - return partial(composed, funcs=functions) +# SAQC_OPERATORS = { +# "exp": np.exp, +# "log": np.log, +# "sum": np.sum, +# "var": np.var, +# "std": np.std, +# "mean": np.mean, +# "median": np.median, +# "min": np.min, +# "max": np.max, +# "first": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").first, +# "last": pd.Series(np.nan, index=pd.DatetimeIndex([])).resample("0min").last, +# "deltaT": ts_ops.deltaT, +# "id": ts_ops.identity, +# "diff": ts_ops.difference, +# "relDiff": ts_ops.relativeDifference, +# "deriv": ts_ops.derivative, +# "rateOfChange": ts_ops.rateOfChange, +# "scale": ts_ops.scale, +# "normScale": ts_ops.normScale, +# "meanStandardize": ts_ops.standardizeByMean, +# "medianStandardize": ts_ops.standardizeByMedian, +# "zLog": ts_ops.zeroLog +# } + + +# OP_MODULES = {'pd': pd, +# 'np': np, +# 'scipy': scipy, +# 'sklearn': sklearn +# } + + +# def evalFuncString(func_string): +# # TODO: check if necessary when the API is available +# if not isinstance(func_string, str): +# return func_string +# module_dot = func_string.find(".") +# first, *rest = func_string.split(".") +# if rest: +# module = func_string[:module_dot] +# try: +# return reduce(lambda m, f: getattr(m, f), rest, OP_MODULES[first]) +# except KeyError: +# availability_list = [f"'{k}' (= {s.__name__})" for k, s in OP_MODULES.items()] +# availability_list = " \n".join(availability_list) +# raise ValueError( +# f'The external-module alias "{module}" is not known to the internal operators dispatcher. ' +# f"\n Please select from: \n{availability_list}" +# ) + +# else: +# if func_string in SAQC_OPERATORS: +# return SAQC_OPERATORS[func_string] +# else: +# availability_list = [f"'{k}' (= {s.__name__})" for k, s in SAQC_OPERATORS.items()] +# availability_list = " \n".join(availability_list) +# raise ValueError( +# f'The external-module alias "{func_string}" is not known to the internal operators ' +# f"dispatcher. \n Please select from: \n{availability_list}" +# ) + + +# def composeFunction(functions): +# # TODO: check if necessary when the API is available +# if callable(functions): +# return functions +# functions = toSequence(functions) +# functions = [evalFuncString(f) for f in functions] + +# def composed(ts, funcs=functions): +# return reduce(lambda x, f: f(x), reversed(funcs), ts) + +# return partial(composed, funcs=functions) def assertScalar(name, value, optional=False): @@ -243,6 +243,7 @@ def offset2seconds(offset): def flagWindow(flagger_old, flagger_new, field, direction="fw", window=0, **kwargs) -> pd.Series: + # NOTE: unused -> remove? 
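+    # a window of 0 (or an empty offset string) means there is nothing to
+    # propagate, so the freshly set flags are returned unchanged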
if window == 0 or window == "": return flagger_new @@ -354,6 +355,7 @@ def getFuncFromInput(func): :param func: A key to the STRING_2_FUNC dict, or an actual function """ + # TODO: check if necessary when the API is available # if input is a callable - than just pass it: if hasattr(func, "__call__"): if (func.__name__ == "aggregator") & (func.__module__ == "saqc.funcs.harm_functions"): @@ -392,6 +394,7 @@ def groupConsecutives(series: pd.Series) -> Iterator[pd.Series]: yield pd.Series(data=values[start:stop], index=index[start:stop]) start = stop + def mergeDios(left, right, join="merge"): # use dios.merge() as soon as it implemented # see https://git.ufz.de/rdm/dios/issues/15 @@ -409,7 +412,7 @@ def mergeDios(left, right, join="merge"): # hold the values from the left join argument r, l = l.align(r, join="outer") else: - l, r= l.align(r, join=join) + l, r = l.align(r, join=join) merged[c] = l.combine_first(r) newcols = right.columns.difference(merged.columns) @@ -418,3 +421,6 @@ def mergeDios(left, right, join="merge"): return merged + +def isQuoted(string): + return bool(re.search(r"'.*'|\".*\"", string)) diff --git a/saqc/lib/ts_operators.py b/saqc/lib/ts_operators.py index 2683f39b2d1d11c5a7798673460b40657e5e8862..dd16768876ad3283698a340c1068a05592c1f63b 100644 --- a/saqc/lib/ts_operators.py +++ b/saqc/lib/ts_operators.py @@ -34,6 +34,7 @@ def zeroLog(ts): log_ts[log_ts == -np.inf] = np.nan return log_ts + def difference(ts): return pd.Series.diff(ts) @@ -90,7 +91,6 @@ def kNN(in_arr, n_neighbors, algorithm="ball_tree"): return nbrs.kneighbors() - def standardizeByMean(ts): return (ts - ts.mean())/ts.std() diff --git a/test/common.py b/test/common.py index c0da2a204f50149e8f6c494b239fdabf01928bf8..ed603c936ce539d86332c07ca76778a35e14a4b1 100644 --- a/test/common.py +++ b/test/common.py @@ -2,15 +2,12 @@ # -*- coding: utf-8 -*- import io -import re import numpy as np import pandas as pd import dios -from saqc.core.core import readConfig from saqc.flagger import ( - ContinuousFlagger, CategoricalFlagger, SimpleFlagger, DmpFlagger, @@ -24,7 +21,6 @@ TESTFLAGGER = ( CategoricalFlagger(["NIL", "GOOD", "BAD"]), SimpleFlagger(), DmpFlagger(), - # ContinuousFlagger(), ) @@ -42,35 +38,8 @@ def initData(cols=2, start_date="2017-01-01", end_date="2017-12-31", freq=None, return di -def initMetaString(metastring, data): - cleaned = re.sub(r"\s*,\s*", r",", re.sub(r"\|", r";", re.sub(r"\n[ \t]+", r"\n", metastring))) - fobj = io.StringIO(cleaned.strip()) - config = readConfig(fobj, data) - fobj.seek(0) - return fobj, config - - -def _getKeys(metadict): - keys = list(metadict[0].keys()) - for row in metadict[1:]: - for k in row.keys(): - if k not in keys: - keys.append(k) - return keys - - def writeIO(content): f = io.StringIO() f.write(content) f.seek(0) return f - - -def initMetaDict(config_dict, data): - df = pd.DataFrame(config_dict)[_getKeys(config_dict)] - fobj = io.StringIO() - df.fillna("").to_csv(fobj, index=False, sep=";") - fobj.seek(0) - config = readConfig(fobj, data) - fobj.seek(0) - return fobj, config diff --git a/test/core/__init__.py b/test/core/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e1f7e6e8bf04d3d59cff68b83d91fbf791f0faa2 100644 --- a/test/core/__init__.py +++ b/test/core/__init__.py @@ -0,0 +1,2 @@ +#! 
/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/test/core/test_core.py b/test/core/test_core.py index a27abb5d9aabac6d9cd427e8f232b1227e89020f..7f18648e91edea992473d4eeff14d89647e99265 100644 --- a/test/core/test_core.py +++ b/test/core/test_core.py @@ -7,11 +7,10 @@ import pytest import numpy as np import pandas as pd -from saqc.funcs import register, flagRange -from saqc.core.core import run -from saqc.core.config import Fields as F -import saqc.lib.plotting as splot -from test.common import initData, initMetaDict, TESTFLAGGER +from saqc import SaQC, register +from saqc.funcs import flagRange +from saqc.lib import plotting as splot +from test.common import initData, TESTFLAGGER # no logging output needed here @@ -22,7 +21,7 @@ logging.disable(logging.CRITICAL) OPTIONAL = [False, True] -@register() +@register def flagAll(data, field, flagger, **kwargs): # NOTE: remember to rename flag -> flag_values return data, flagger.setFlags(field=field, flag=flagger.BAD) @@ -45,96 +44,30 @@ def flags(flagger, data, optional): return flagger.initFlags(data[data.columns[::2]])._flags -# NOTE: there is a lot of pytest magic involved: -# the parametrize parameters are implicitly available -# within the used fixtures, that is why we need the optional -# parametrization without actually using it in the -# function -@pytest.mark.skip(reason="test slicing support is currently disabled") -@pytest.mark.parametrize("flagger", TESTFLAGGER) -@pytest.mark.parametrize("optional", OPTIONAL) -def test_temporalPartitioning(data, flagger, flags): - """ - Check if the time span in meta is respected - """ - var1, var2, var3, *_ = data.columns - split_date = data.index[len(data.index) // 2] - - metadict = [ - {F.VARNAME: var1, F.TESTS: "flagAll()"}, - {F.VARNAME: var2, F.TESTS: "flagAll()", F.END: split_date}, - {F.VARNAME: var3, F.TESTS: "flagAll()", F.START: split_date}, - ] - meta_file, meta_frame = initMetaDict(metadict, data) - pdata, pflagger = run(meta_file, flagger, data, flags=flags) - - fields = [F.VARNAME, F.START, F.END] - for _, row in meta_frame.iterrows(): - vname, start_date, end_date = row[fields] - fchunk = pflagger.getFlags(field=vname, loc=pflagger.isFlagged(vname)) - assert fchunk.index.min() == start_date, "different start dates" - assert fchunk.index.max() == end_date, "different end dates" - - -@pytest.mark.skip(reason="test slicing support is currently disabled") -@pytest.mark.parametrize("flagger", TESTFLAGGER) -@pytest.mark.parametrize("optional", OPTIONAL) -def test_positionalPartitioning(data, flagger, flags): - data = data.reset_index(drop=True) - if flags is not None: - flags = flags.reset_index(drop=True) - var1, var2, var3, *_ = data.columns - split_index = int(len(data.index) // 2) - - metadict = [ - {F.VARNAME: var1, F.TESTS: "flagAll()"}, - {F.VARNAME: var2, F.TESTS: "flagAll()", F.END: split_index}, - {F.VARNAME: var3, F.TESTS: "flagAll()", F.START: split_index}, - ] - meta_file, meta_frame = initMetaDict(metadict, data) - - pdata, pflagger = run(meta_file, flagger, data, flags=flags) - - fields = [F.VARNAME, F.START, F.END] - for _, row in meta_frame.iterrows(): - vname, start_index, end_index = row[fields] - fchunk = pflagger.getFlags(field=vname, loc=pflagger.isFlagged(vname)) - assert fchunk.index.min() == start_index, "different start indices" - assert fchunk.index.max() == end_index, f"different end indices: {fchunk.index.max()} vs. 
{end_index}" - - +@pytest.mark.skip(reason="does not make sense anymore") @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_errorHandling(data, flagger): - @register() - def raisingFunc(data, fielf, flagger, **kwargs): - raise TypeError - var1, *_ = data.columns - - metadict = [ - {F.VARNAME: var1, F.TESTS: "raisingFunc()"}, - ] + @register + def raisingFunc(data, field, flagger, **kwargs): + raise TypeError - tests = ["ignore", "warn"] + var1 = data.columns[0] - for policy in tests: + for policy in ["ignore", "warn"]: # NOTE: should not fail, that's all we are testing here - metafobj, _ = initMetaDict(metadict, data) - run(metafobj, flagger, data, error_policy=policy) + SaQC(flagger, data, error_policy=policy).raisingFunc(var1).getResult() @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_duplicatedVariable(flagger): data = initData(1) - var1, *_ = data.columns + var1 = data.columns[0] - metadict = [ - {F.VARNAME: var1, F.TESTS: "flagAll()"}, - {F.VARNAME: var1, F.TESTS: "flagAll()"}, - ] - metafobj, meta = initMetaDict(metadict, data) - - pdata, pflagger = run(metafobj, flagger, data) + pdata, pflagger = (SaQC(flagger, data) + .flagDummy(var1) + .flagDummy(var1) + .getResult()) pflags = pflagger.getFlags() if isinstance(pflags.columns, pd.MultiIndex): @@ -150,16 +83,13 @@ def test_assignVariable(flagger): test implicit assignments """ data = initData(1) - var1, *_ = data.columns + var1 = data.columns[0] var2 = "empty" - metadict = [ - {F.VARNAME: var1, F.TESTS: "flagAll()"}, - {F.VARNAME: var2, F.TESTS: "flagAll()"}, - ] - metafobj, meta = initMetaDict(metadict, data) - - pdata, pflagger = run(metafobj, flagger, data) + pdata, pflagger = (SaQC(flagger, data) + .flagAll(var1) + .flagAll(var2) + .getResult()) pflags = pflagger.getFlags() assert (pflags.columns == [var1, var2]).all() @@ -174,18 +104,36 @@ def test_dtypes(data, flagger, flags): """ flagger = flagger.initFlags(data) flags = flagger.getFlags() - var1, var2, *_ = data.columns - - metadict = [ - {F.VARNAME: var1, F.TESTS: "flagAll()"}, - {F.VARNAME: var2, F.TESTS: "flagAll()"}, - ] - metafobj, meta = initMetaDict(metadict, data) - pdata, pflagger = run(metafobj, flagger, data, flags=flags) + var1, var2 = data.columns[:2] + + pdata, pflagger = (SaQC(flagger, data, flags=flags) + .flagAll(var1) + .flagAll(var2) + .getResult()) + pflags = pflagger.getFlags() assert dict(flags.dtypes) == dict(pflags.dtypes) +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_nanInjections(data, flagger): + """ + test if flagged values are exluded during the preceding tests + """ + flagger = flagger.initFlags(data) + flags = flagger.getFlags() + var = data.columns[0] + mn = min(data[var]) + mx = max(data[var])/2 + + pdata, pflagger = (SaQC(flagger, data, flags=flags) + .flagRange(var, mn, mx) + .procGeneric("dummy", func=lambda var1: var1 >= mn) + .getResult()) + assert not pdata.loc[pflagger.isFlagged(var), "dummy"].any() + assert pdata.loc[~pflagger.isFlagged(var), "dummy"].all() + + @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_plotting(data, flagger): """ diff --git a/test/core/test_evaluator.py b/test/core/test_evaluator.py deleted file mode 100644 index 1e9a63419916e19a7869622ad87e75710d7b5250..0000000000000000000000000000000000000000 --- a/test/core/test_evaluator.py +++ /dev/null @@ -1,74 +0,0 @@ -#! 
/usr/bin/env python -# -*- coding: utf-8 -*- - -import pytest -import numpy as np - -from saqc.funcs import register -from saqc.core.evaluator import ( - compileTree, - parseExpression, - initLocalEnv, - ConfigChecker, - ConfigTransformer, -) - -from test.common import TESTFLAGGER, initData - - -def compileExpression(expr, flagger, nodata=np.nan): - data = initData() - field = data.columns[0] - tree = parseExpression(expr) - env = initLocalEnv(data, field, flagger.initFlags(data), nodata) - ConfigChecker(env, flagger.signature).visit(tree) - transformed_tree = ConfigTransformer(env).visit(tree) - code = compileTree(transformed_tree) - return code - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_syntaxError(flagger): - exprs = [ - "range(x=5", - "rangex=5)", - "range[x=5]" "range{x=5}" "int->float(x=4)" "int*float(x=4)", - ] - - for expr in exprs: - with pytest.raises(SyntaxError): - compileExpression(expr, flagger) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_typeError(flagger): - - exprs = [ - # "func", - "flagDummy(kwarg=[1, 2, 3])", - "flagDummy(x=5)", - "flagDummy(dummy())", - "flagDummy(kwarg=dummy(this))", - ] - - for expr in exprs: - with pytest.raises(TypeError): - compileExpression(expr, flagger) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_supportedArguments(flagger): - @register() - def func(data, field, flagger, kwarg, **kwargs): - return data, flagger - - exprs = [ - "func(kwarg='str')", - "func(kwarg=5)", - "func(kwarg=5.5)", - "func(kwarg=-5)", - "func(kwarg=True)", - "func(kwarg=func())", - ] - for expr in exprs: - compileExpression(expr, flagger) diff --git a/test/core/test_reader.py b/test/core/test_reader.py index 5b0f220142f73d3f4112c4a0a77bb71d9c43c816..035f9d0a1cec30d33b0fb3e78784920f790962aa 100644 --- a/test/core/test_reader.py +++ b/test/core/test_reader.py @@ -1,14 +1,21 @@ #! 
/usr/bin/env python # -*- coding: utf-8 -*- +from pathlib import Path + import pytest import numpy as np +import pandas as pd -import saqc -from saqc.core.reader import checkConfig +from dios.dios import DictOfSeries from saqc.core.config import Fields as F +from test.common import initData, writeIO + +from saqc.core.core import SaQC +from saqc.flagger import SimpleFlagger +from saqc.funcs.functions import flagRange, flagDummy +from saqc.core.register import FUNC_MAP, register, SaQCFunc import dios -from test.common import initData, initMetaDict, initMetaString, TESTFLAGGER, TESTNODATA, writeIO @pytest.fixture @@ -16,52 +23,53 @@ def data() -> dios.DictOfSeries: return initData(3) -def test_configPreparation(data): - var1, var2, var3, *_ = data.columns - date = data.indexes[0][data.lengths[0] // 2] +def test_packagedConfig(): - # NOTE: - # time slicing support is currently disabled - tests = [ - # {F.VARNAME: var1, F.START: date, F.TESTS: "flagAll()", F.PLOT: True}, - # {F.VARNAME: var3, F.END: date, F.TESTS: "flagAll()"}, + path = Path(__file__).parents[2] / "ressources/data" - {F.VARNAME: var2, F.TESTS: "flagAll()", F.PLOT: False}, - {F.VARNAME: var3, F.TESTS: "flagAll()",}, - ] + config_path = path / "config_ci.csv" + data_path = path / "data.csv" + data = pd.read_csv(data_path, index_col=0, parse_dates=True,) + saqc = SaQC(SimpleFlagger(), DictOfSeries(data)).readConfig(config_path) + data, flagger = saqc.getResult() - for i, test in enumerate(tests): - index = data[test[F.VARNAME]].index - start_date, end_date = index.min(), index.max() +def test_configDefaults(data): + var1, var2, var3, *_ = data.columns - defaults = { - F.START: start_date, - F.END: end_date, - F.PLOT: False, - F.LINENUMBER: 2, - } + header = f"{F.VARNAME};{F.TEST};{F.PLOT}" + tests = [ + (f"{var2};flagRange(min=3, max=6);True", SaQCFunc(flagRange, min=3, max=6, plot=True, lineno=2)), + (f"{var3};flagDummy()", SaQCFunc(flagDummy, plot=False, lineno=2)) + ] - _, meta_frame = initMetaDict([test], data) - result = dict(zip(meta_frame.columns, meta_frame.iloc[0])) - expected = {**defaults, **test} - assert result == expected + for config, expected in tests: + fobj = writeIO(header + "\n" + config) + saqc = SaQC(SimpleFlagger(), data).readConfig(fobj) + result = [func for _, func in saqc._to_call][0] + assert result.kwargs == expected.kwargs + assert result.lineno == expected.lineno + assert result.plot == expected.plot def test_variableRegex(data): + header = f"{F.VARNAME};{F.TEST};{F.PLOT}" tests = [ ("'.*'", data.columns), ("'var(1|2)'", [c for c in data.columns if c[-1] in ("1", "2")]), ("'var[12]'", [c for c in data.columns if c[-1] in ("1", "2")]), ("var[12]", ["var[12]"]), # not quoted -> not a regex - ('"(.*3)"', [c for c in data.columns if c[-1] == "3"]), + ('".*3"', [c for c in data.columns if c[-1] == "3"]), ] - for config_wc, expected in tests: - _, config = initMetaDict([{F.VARNAME: config_wc, F.TESTS: "flagAll()"}], data) - assert np.all(config[F.VARNAME] == expected) + + for regex, expected in tests: + fobj = writeIO(header + "\n" + f"{regex} ; flagDummy()") + saqc = SaQC(SimpleFlagger(), data).readConfig(fobj) + result = [field for field, _ in saqc._to_call] + assert np.all(result == expected) def test_inlineComments(data): @@ -69,65 +77,40 @@ def test_inlineComments(data): adresses issue #3 """ config = f""" - {F.VARNAME}|{F.TESTS}|{F.PLOT} - pre2|flagAll() # test|False # test + {F.VARNAME} ; {F.TEST} ; {F.PLOT} + pre2 ; flagDummy() # test ; False # test """ - _, meta_frame = initMetaString(config, data) 
- assert meta_frame.loc[0, F.PLOT] == False - assert meta_frame.loc[0, F.TESTS] == "flagAll()" + saqc = SaQC(SimpleFlagger(), data).readConfig(writeIO(config)) + result = [func for _, func in saqc._to_call][0] + assert result.plot == False + assert result.func == FUNC_MAP["flagDummy"].func def test_configReaderLineNumbers(data): config = f""" - {F.VARNAME}|{F.TESTS} - #temp1|dummy() - pre1|dummy() - pre2|dummy() - SM|dummy() - #SM|dummy() - # SM1|dummy() - - SM1|dummy() + {F.VARNAME} ; {F.TEST} + #temp1 ; flagDummy() + pre1 ; flagDummy() + pre2 ; flagDummy() + SM ; flagDummy() + #SM ; flagDummy() + # SM1 ; flagDummy() + + SM1 ; flagDummy() """ - meta_fname, meta_frame = initMetaString(config, data) - result = meta_frame[F.LINENUMBER].tolist() + saqc = SaQC(SimpleFlagger(), data).readConfig(writeIO(config)) + result = [func.lineno for _, func in saqc._to_call] expected = [3, 4, 5, 9] assert result == expected -def test_configMultipleTests(data): - - var = data.columns[0] - - config = f""" - {F.VARNAME} ; test_1 ; test_2 - #-----------;---------------;-------------------------- - {var} ; flagMissing() ; flagRange(min=10, max=60) - """ - - from saqc.flagger import SimpleFlagger - from saqc.core.core import run - from saqc.core.reader import readConfig, checkConfig - from saqc.funcs.functions import flagMissing, flagRange - - flagger = SimpleFlagger().initFlags(data) - df = checkConfig(readConfig(writeIO(config), data), data, flagger, np.nan) - assert {"test_1", "test_2"} - set(df.columns) == set([]) - - flagger_expected = SimpleFlagger().initFlags(data) - for func, kwargs in [(flagMissing, {}), (flagRange, {"min": 10, "max": 60})]: - data, flagger_expected = func(data, var, flagger_expected, **kwargs) - _, flagger_result = run(writeIO(config), SimpleFlagger(), data) - - assert (flagger_result.getFlags() == flagger_expected.getFlags()).all(None) - - def test_configFile(data): # check that the reader accepts different whitespace patterns config = f""" - {F.VARNAME} ; {F.TESTS} + {F.VARNAME} ; {F.TEST} + #temp1 ; flagDummy() pre1; flagDummy() pre2 ;flagDummy() @@ -137,25 +120,54 @@ def test_configFile(data): SM1;flagDummy() """ - saqc.run(writeIO(config), TESTFLAGGER[0], data) + SaQC(SimpleFlagger(), data).readConfig(writeIO(config)) -@pytest.mark.parametrize("flagger", TESTFLAGGER) -@pytest.mark.parametrize("nodata", TESTNODATA) -def test_configChecks(data, flagger, nodata, caplog): +def test_configChecks(data): - flagger = flagger.initFlags(data) var1, var2, var3, *_ = data.columns + header = f"{F.VARNAME};{F.TEST}" tests = [ - ({F.VARNAME: var1, F.TESTS: "flagRange(mn=0)"}, TypeError), - ({F.VARNAME: var3, F.TESTS: "flagNothing()"}, NameError), - ({F.VARNAME: "", F.TESTS: "flagRange(min=3)"}, SyntaxError), - ({F.VARNAME: var1, F.TESTS: ""}, SyntaxError), - ({F.TESTS: "flagRange(min=3)"}, SyntaxError), + (f"{var1};flagRange(mn=0)", TypeError), # bad argument name + (f"{var1};flagRange(min=0)", TypeError), # not enough arguments + (f"{var3};flagNothing()", NameError), # unknown function + (";flagRange(min=3)", SyntaxError), # missing variable + (f"{var1};", SyntaxError), # missing test + (f"{var1}; min", TypeError), # not a function call ] - for config_dict, expected in tests: - _, config_df = initMetaDict([config_dict], data) + for test, expected in tests: + fobj = writeIO(header + "\n" + test) with pytest.raises(expected): - checkConfig(config_df, data, flagger, nodata) + SaQC(SimpleFlagger(), data).readConfig(fobj) + + +def test_supportedArguments(data): + + # test if the following function 
arguments + # are supported (i.e. parsing does not fail) + + # TODO: necessary? + + @register + def func(data, field, flagger, kwarg, **kwargs): + return data, flagger + + var1 = data.columns[0] + + header = f"{F.VARNAME};{F.TEST}" + tests = [ + f"{var1};func(kwarg=NAN)", + f"{var1};func(kwarg='str')", + f"{var1};func(kwarg=5)", + f"{var1};func(kwarg=5.5)", + f"{var1};func(kwarg=-5)", + f"{var1};func(kwarg=True)", + f"{var1};func(kwarg=sum([1, 2, 3]))", + ] + + for test in tests: + fobj = writeIO(header + "\n" + test) + SaQC(SimpleFlagger(), data).readConfig(fobj) + diff --git a/test/flagger/test_dmpflagger.py b/test/flagger/test_dmpflagger.py index fb6c612867548b07885321a759d63011bd9231ab..c4be9cb21253309252f92b1898da7aba9f37a4d2 100644 --- a/test/flagger/test_dmpflagger.py +++ b/test/flagger/test_dmpflagger.py @@ -10,6 +10,7 @@ import pytest from test.common import initData from saqc.flagger import DmpFlagger + @pytest.fixture def data(): return initData(cols=1) @@ -26,7 +27,7 @@ def test_initFlags(data): assert (flagger._comments == "").all(axis=None) -def test_setFlaggerOuter(data): +def test_mergeFlaggerOuter(data): flagger = DmpFlagger() @@ -49,7 +50,7 @@ def test_setFlaggerOuter(data): .initFlags(data=data_right) .setFlags(field=field, flag=flagger.GOOD, cause="SaQCRight", comment="testRight")) - merged = left.setFlagger(right, join="outer") + merged = left.merge(right, join="outer") right_index = data_right[field].index.difference(data_left[field].index) assert (merged._flags.loc[right_index] == flagger.GOOD).all(axis=None) @@ -61,7 +62,7 @@ def test_setFlaggerOuter(data): assert (merged._causes.loc[left_index] == "SaQCLeft").all(axis=None) assert np.all(parseComments(merged._comments.loc[left_index]) == "testLeft") -def test_setFlaggerInner(data): +def test_mergeFlaggerInner(data): flagger = DmpFlagger() @@ -79,7 +80,7 @@ def test_setFlaggerInner(data): .initFlags(data=data_right) .setFlags(field=field, flag=flagger.GOOD, cause="SaQCRight", comment="testRight")) - merged = left.setFlagger(right, join="inner") + merged = left.merge(right, join="inner") assert (merged._flags[field].index == data_right[field].index).all() assert (merged._causes[field].index == data_right[field].index).all() @@ -90,7 +91,7 @@ def test_setFlaggerInner(data): assert np.all(parseComments(merged._comments) == "testLeft") -def test_getFlaggerDrop(data): +def test_sliceFlaggerDrop(data): flagger = DmpFlagger().initFlags(data) with pytest.raises(TypeError): flagger.getFlags(field=data.columns, drop="var") @@ -98,7 +99,7 @@ def test_getFlaggerDrop(data): field = data.columns[0] expected = data[data.columns.drop(field)].to_df() - filtered = flagger.getFlagger(drop=field) + filtered = flagger.slice(drop=field) assert (filtered._flags.columns == expected.columns).all(axis=None) assert (filtered._comments.columns == expected.columns).all(axis=None) diff --git a/test/flagger/test_flagger.py b/test/flagger/test_flagger.py index 8bbb92949d8dfb4db885df788e1445894683b829..6809864df30139ff242e57e035a18367338466b7 100644 --- a/test/flagger/test_flagger.py +++ b/test/flagger/test_flagger.py @@ -1,9 +1,5 @@ #!/usr/bin/env python -__author__ = "Bert Palm" -__email__ = "bert.palm@ufz.de" -__copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ" - import pytest import numpy as np import pandas as pd @@ -144,16 +140,16 @@ def test_setFlags(data, flagger): @pytest.mark.parametrize("data", DATASETS) @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_getFlagger(data, flagger): +def 
test_sliceFlagger(data, flagger): """ test before: - initFlags() - - getFlags() inside getFlagger() + - getFlags() inside slice() """ sl = slice(None, None, 3) flagger = flagger.initFlags(data) - newflagger = flagger.getFlagger(loc=sl) + newflagger = flagger.slice(loc=sl) assert isinstance(newflagger, type(flagger)) newflags = newflagger.getFlags() @@ -163,7 +159,7 @@ def test_getFlagger(data, flagger): @pytest.mark.parametrize("data", DATASETS) @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_getFlaggerDrop(data, flagger): +def test_sliceFlaggerDrop(data, flagger): flagger = flagger.initFlags(data) with pytest.raises(TypeError): flagger.getFlags(field=data.columns, drop="var") @@ -171,27 +167,27 @@ def test_getFlaggerDrop(data, flagger): field = data.columns[0] expected = data.columns.drop(field) - filtered = flagger.getFlagger(drop=field) + filtered = flagger.slice(drop=field) assert (filtered.getFlags().columns == expected).all(axis=None) assert (filtered.getFlags().to_df().index== data[expected].to_df().index).all(axis=None) @pytest.mark.parametrize("data", DATASETS) @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_setFlagger(data, flagger): +def test_mergeFlagger(data, flagger): """ test before: - initFlags() - getFlags() - setFlags() - - getFlagger() + - slice() """ field, *_ = data.columns sl = slice(None, None, 3) this_flagger = flagger.initFlags(data) - other_flagger = this_flagger.getFlagger(loc=sl).setFlags(field) - result_flagger = this_flagger.setFlagger(other_flagger) + other_flagger = this_flagger.slice(loc=sl).setFlags(field) + result_flagger = this_flagger.merge(other_flagger) result_flags = result_flagger.getFlags() other_flags = other_flagger.getFlags() @@ -213,14 +209,14 @@ def test_setFlagger(data, flagger): @pytest.mark.parametrize("data", DATASETS) @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_setFlaggerColumnsDiff(data, flagger): +def test_mergeFlaggerColumnsDiff(data, flagger): """ test before: - initFlags() - getFlags() - setFlags() - - getFlagger() - - setFlagger() + - slice() + - merge() """ field, *_ = data.columns new_field = field + "_new" @@ -231,7 +227,7 @@ def test_setFlaggerColumnsDiff(data, flagger): other_flagger = flagger.initFlags(other_data) this_flagger = flagger.initFlags(data).setFlags(field, flag=flagger.BAD) - result_flagger = this_flagger.setFlagger(other_flagger) + result_flagger = this_flagger.merge(other_flagger) result_flags = result_flagger.getFlags() other_flags = other_flagger.getFlags() @@ -259,14 +255,14 @@ def test_setFlaggerColumnsDiff(data, flagger): @pytest.mark.parametrize("data", DATASETS) @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_setFlaggerIndexDiff(data, flagger): +def test_mergeFlaggerIndexDiff(data, flagger): """ test before: - initFlags() - getFlags() - setFlags() - - getFlagger() - - setFlagger() + - slice() + - merge() we need to check: - index is union of this and other's index @@ -288,7 +284,7 @@ def test_setFlaggerIndexDiff(data, flagger): this_flagger = flagger.initFlags(data).setFlags(field, flag=flagger.BAD) other_flagger = flagger.initFlags(other_data) - result_flagger = this_flagger.setFlagger(other_flagger) + result_flagger = this_flagger.merge(other_flagger) result_flags = result_flagger.getFlags() this_flags = this_flagger.getFlags() @@ -312,7 +308,7 @@ def test_setFlaggerIndexDiff(data, flagger): @pytest.mark.parametrize("data", DATASETS) @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_setFlaggerOuter(data, flagger): +def test_mergeFlaggerOuter(data, 
flagger): field = data.columns[0] @@ -328,7 +324,7 @@ def test_setFlaggerOuter(data, flagger): .initFlags(data=data_right) .setFlags(field, flag=flagger.GOOD)) - merged = left.setFlagger(right, join="outer") + merged = left.merge(right, join="outer") loc = data_right[field].index.difference(data_left[field].index) assert (merged.getFlags(field, loc=loc) == flagger.GOOD).all(axis=None) @@ -337,7 +333,7 @@ def test_setFlaggerOuter(data, flagger): @pytest.mark.parametrize("data", DATASETS) @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_setFlaggerInner(data, flagger): +def test_mergeFlaggerInner(data, flagger): field = data.columns[0] @@ -353,7 +349,7 @@ def test_setFlaggerInner(data, flagger): .initFlags(data=data_right) .setFlags(field, flag=flagger.GOOD)) - merged = left.setFlagger(right, join="inner") + merged = left.merge(right, join="inner") assert (merged.getFlags(field).index == data_right[field].index).all() assert (merged.getFlags(field) == flagger.BAD).all() @@ -361,7 +357,7 @@ def test_setFlaggerInner(data, flagger): @pytest.mark.parametrize("data", DATASETS) @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_setFlaggerMerge(data, flagger): +def test_mergeFlaggerMerge(data, flagger): field = data.columns[0] data_left = data @@ -375,7 +371,7 @@ def test_setFlaggerMerge(data, flagger): .initFlags(data=data_right) .setFlags(field, flag=flagger.GOOD)) - merged = left.setFlagger(right, join="merge") + merged = left.merge(right, join="merge") loc = data_left[field].index.difference(data_right[field].index) assert (merged.getFlags(field, loc=data_right[field].index) == flagger.GOOD).all(axis=None) @@ -456,7 +452,7 @@ def test_isFlaggedSeries_fail(data, flagger): # {"field": ["var1", "var2"]}, ] for args in fail_tests: - with pytest.raises(ValueError): + with pytest.raises(TypeError): flagger.isFlagged(**args) diff --git a/test/funcs/test_functions.py b/test/funcs/test_functions.py index c34d7ca6ded14bbf020201337dc1f227a67132b5..2f509cc52f756639373755bd8ff5730849725e60 100644 --- a/test/funcs/test_functions.py +++ b/test/funcs/test_functions.py @@ -3,17 +3,13 @@ import pytest import numpy as np +import pandas as pd +import dios -from saqc.funcs.functions import ( - flagRange, - flagSesonalRange, - forceFlags, - clearFlags, - flagIsolated, - flagPattern -) + +from saqc.funcs.functions import * from test.common import initData, TESTFLAGGER import pandas as pd @@ -130,3 +126,105 @@ def test_flagIsolated(data, flagger): data, field, flagger_result, group_window="2D", gap_window="2.1D", continuation_range="1.1D", ) assert flagger_result.isFlagged(field)[[3, 5, 13, 14]].all() + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +@pytest.mark.parametrize("dat", [pytest.lazy_fixture("course_2")]) +def test_flagCrossScoring(dat, flagger): + data1, characteristics = dat(initial_level=0, final_level=0, out_val=0) + data2, characteristics = dat(initial_level=0, final_level=0, out_val=10) + field = "dummy" + fields = ["data1", "data2"] + s1, s2 = data1.squeeze(), data2.squeeze() + s1 = pd.Series(data=s1.values, index=s1.index) + s2 = pd.Series(data=s2.values, index=s1.index) + data = dios.DictOfSeries([s1, s2], columns=["data1", "data2"]) + flagger = flagger.initFlags(data) + _, flagger_result = flagCrossScoring( + data, field, flagger, fields=fields, thresh=3, cross_stat=np.mean + ) + for field in fields: + isflagged = flagger_result.isFlagged(field) + assert isflagged[characteristics['raise']].all() + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_flagManual(data, 
flagger): + field = data.columns[0] + flagger = flagger.initFlags(data) + args = data, field, flagger + dat = data[field] + + mdata = pd.Series('lala', index=dat.index) + index_exp = mdata.iloc[[10, 33, 200, 500]].index + mdata.iloc[[101, 133, 220, 506]] = 'b' + mdata.loc[index_exp] = 'a' + shrinked = mdata.loc[index_exp.union(mdata.iloc[[1, 2, 3, 4, 600, 601]].index)] + + kwargs_list = [ + dict(mdata=mdata, mflag='a', method='plain'), + dict(mdata=mdata.to_list(), mflag='a', method='plain'), + dict(mdata=mdata, mflag='a', method='ontime'), + dict(mdata=shrinked, mflag='a', method='ontime'), + ] + + for kw in kwargs_list: + _, fl = flagManual(*args, **kw) + isflagged = fl.isFlagged(field) + assert isflagged[isflagged].index.equals(index_exp) + + # flag not exist in mdata + _, fl = flagManual(*args, mdata=mdata, mflag="i do not exist", method='ontime') + isflagged = fl.isFlagged(field) + assert isflagged[isflagged].index.equals(pd.DatetimeIndex([])) + + # check right-open / ffill + index = pd.date_range(start="2016-01-01", end="2018-12-31", periods=11) + mdata = pd.Series(0, index=index) + mdata.loc[index[[1, 5, 6, 7, 9, 10]]] = 1 + # >>> mdata + # 2016-01-01 00:00:00 0 + # 2016-04-19 12:00:00 1 + # 2016-08-07 00:00:00 0 + # 2016-11-24 12:00:00 0 + # 2017-03-14 00:00:00 0 + # 2017-07-01 12:00:00 1 + # 2017-10-19 00:00:00 1 + # 2018-02-05 12:00:00 1 + # 2018-05-26 00:00:00 0 + # 2018-09-12 12:00:00 1 + # 2018-12-31 00:00:00 1 + # dtype: int64 + + # add first and last index from data + expected = mdata.copy() + expected.loc[dat.index[0]] = 0 + expected.loc[dat.index[-1]] = 1 + expected = expected.astype(bool) + + _, fl = flagManual(*args, mdata=mdata, mflag=1, method='right-open') + isflagged = fl.isFlagged(field) + last = expected.index[0] + for curr in expected.index[1:]: + expected_value = mdata[last] + # datetime slicing is inclusive ! + i = isflagged[last:curr].index[:-1] + chunk = isflagged.loc[i] + assert (chunk == expected_value).all() + last = curr + # check last value + assert isflagged[curr] == expected[curr] + + # check left-open / bfill + expected.loc[dat.index[-1]] = 0 # this time the last is False + _, fl = flagManual(*args, mdata=mdata, mflag=1, method='left-open') + isflagged = fl.isFlagged(field) + last = expected.index[0] + assert isflagged[last] == expected[last] + for curr in expected.index[1:]: + expected_value = mdata[curr] + # datetime slicing is inclusive ! + i = isflagged[last:curr].index[1:] + chunk = isflagged.loc[i] + assert (chunk == expected_value).all() + last = curr diff --git a/test/funcs/test_generic_config_functions.py b/test/funcs/test_generic_config_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..7e9621d62c6053d1700e08395dd7570aac2f23ec --- /dev/null +++ b/test/funcs/test_generic_config_functions.py @@ -0,0 +1,335 @@ +#! 
/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import ast
+
+import pytest
+import numpy as np
+import pandas as pd
+
+from dios.dios import DictOfSeries
+
+from test.common import TESTFLAGGER, TESTNODATA, initData, writeIO
+from saqc.core.visitor import ConfigFunctionParser
+from saqc.core.config import Fields as F
+from saqc.core.register import register
+from saqc import SaQC, SimpleFlagger
+from saqc.funcs.functions import _execGeneric
+
+
+@pytest.fixture
+def data():
+    return initData()
+
+
+@pytest.fixture
+def data_diff():
+    data = initData(cols=3)
+    col0 = data[data.columns[0]]
+    col1 = data[data.columns[1]]
+    mid = len(col0) // 2
+    offset = len(col0) // 8
+    return DictOfSeries(
+        data={
+            col0.name: col0.iloc[:mid + offset],
+            col1.name: col1.iloc[mid - offset:],
+        }
+    )
+
+
+def _compileGeneric(expr):
+    tree = ast.parse(expr, mode="eval")
+    cp = ConfigFunctionParser(tree.body)
+    return cp.kwargs["func"]
+
+
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_missingIdentifier(data, flagger):
+
+    # NOTE:
+    # - the error is only raised at runtime; raising it already during parsing would be better
+    tests = [
+        "fff(var2) < 5",
+        "var3 != NODATA",
+    ]
+
+    for test in tests:
+        func = _compileGeneric(f"flagGeneric(func={test})")
+        with pytest.raises(NameError):
+            _execGeneric(flagger, data, func, field="", nodata=np.nan)
+
+
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_syntaxError(flagger):
+
+    tests = [
+        "range(x=5",
+        "rangex=5)",
+        "range[x=5]",
+        "range{x=5}",
+        "int->float(x=4)",
+        "int*float(x=4)",
+    ]
+
+    for test in tests:
+        with pytest.raises(SyntaxError):
+            _compileGeneric(f"flagGeneric(func={test})")
+
+
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_typeError(flagger):
+
+    """
+    test that forbidden constructs actually throw an error
+    TODO: find a few more cases or get rid of the test
+    """
+
+    # TODO: think about cases that should be forbidden
+    tests = (
+        "lambda x: x * 2",
+    )
+
+    for test in tests:
+        with pytest.raises(TypeError):
+            _compileGeneric(f"flagGeneric(func={test})")
+
+
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_comparisonOperators(data, flagger):
+    flagger = flagger.initFlags(data)
+    var1, var2, *_ = data.columns
+    this = var1
+
+    tests = [
+        ("this > 100", data[this] > 100),
+        (f"10 >= {var2}", 10 >= data[var2]),
+        (f"{var2} < 100", data[var2] < 100),
+        (f"this <= {var2}", data[this] <= data[var2]),
+        (f"{var1} == {var2}", data[this] == data[var2]),
+        (f"{var1} != {var2}", data[this] != data[var2]),
+    ]
+
+    for test, expected in tests:
+        func = _compileGeneric(f"flagGeneric(func={test})")
+        result = _execGeneric(flagger, data, func, field=var1, nodata=np.nan)
+        assert np.all(result == expected)
+
+
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_arithmeticOperators(data, flagger):
+    flagger = flagger.initFlags(data)
+    var1, *_ = data.columns
+    this = data[var1]
+
+    tests = [
+        ("var1 + 100 > 110", this + 100 > 110),
+        ("var1 - 100 > 0", this - 100 > 0),
+        ("var1 * 100 > 200", this * 100 > 200),
+        ("var1 / 100 > .1", this / 100 > .1),
+        ("var1 % 2 == 1", this % 2 == 1),
+        ("var1 ** 2 == 0", this ** 2 == 0),
+    ]
+
+    for test, expected in tests:
+        func = _compileGeneric(f"procGeneric(func={test})")
+        result = _execGeneric(flagger, data, func, field=var1, nodata=np.nan)
+        assert np.all(result == expected)
+
+
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_nonReduncingBuiltins(data, flagger):
+    flagger = flagger.initFlags(data)
+    var1, *_ = data.columns
+    this = var1
+
+    tests = [
+        (f"abs({this})", 
np.abs(data[this])), + (f"log({this})", np.log(data[this])), + (f"exp({this})", np.exp(data[this])), + ] + + for test, expected in tests: + func = _compileGeneric(f"procGeneric(func={test})") + result = _execGeneric(flagger, data, func, field=this, nodata=np.nan) + assert (result == expected).all() + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +@pytest.mark.parametrize("nodata", TESTNODATA) +def test_reduncingBuiltins(data, flagger, nodata): + + data.loc[::4] = nodata + flagger = flagger.initFlags(data) + var1 = data.columns[0] + this = data.iloc[:, 0] + + tests = [ + ("min(this)", np.nanmin(this)), + (f"max({var1})", np.nanmax(this)), + (f"sum({var1})", np.nansum(this)), + ("mean(this)", np.nanmean(this)), + (f"std({this.name})", np.std(this)), + (f"len({this.name})", len(this)), + ] + + for test, expected in tests: + func = _compileGeneric(f"procGeneric(func={test})") + result = _execGeneric(flagger, data, func, field=this.name, nodata=nodata) + assert result == expected + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +@pytest.mark.parametrize("nodata", TESTNODATA) +def test_ismissing(data, flagger, nodata): + + data.iloc[: len(data) // 2, 0] = np.nan + data.iloc[(len(data) // 2) + 1 :, 0] = -9999 + this = data.iloc[:, 0] + + tests = [ + (f"ismissing({this.name})", (pd.isnull(this) | (this == nodata))), + (f"~ismissing({this.name})", (pd.notnull(this) & (this != nodata))), + ] + + for test, expected in tests: + func = _compileGeneric(f"flagGeneric(func={test})") + result = _execGeneric(flagger, data, func, this.name, nodata) + assert np.all(result == expected) + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +@pytest.mark.parametrize("nodata", TESTNODATA) +def test_bitOps(data, flagger, nodata): + var1, var2, *_ = data.columns + this = var1 + + flagger = flagger.initFlags(data) + + tests = [ + ("~(this > mean(this))", ~(data[this] > np.nanmean(data[this]))), + (f"(this <= 0) | (0 < {var1})", (data[this] <= 0) | (0 < data[var1])), + (f"({var2} >= 0) & (0 > this)", (data[var2] >= 0) & (0 > data[this])), + ] + + for test, expected in tests: + func = _compileGeneric(f"flagGeneric(func={test})") + result = _execGeneric(flagger, data, func, this, nodata) + assert np.all(result == expected) + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_isflagged(data, flagger): + + var1, var2, *_ = data.columns + + flagger = flagger.initFlags(data).setFlags( + var1, loc=data[var1].index[::2], flag=flagger.BAD + ) + + tests = [ + (f"isflagged({var1})", flagger.isFlagged(var1)), + (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD)), + (f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")), + (f"~isflagged({var2})", ~flagger.isFlagged(var2)), + (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))) + ] + + for test, expected in tests: + func = _compileGeneric(f"flagGeneric(func={test})") + result = _execGeneric(flagger, data, func, field=None, nodata=np.nan) + assert np.all(result == expected) + + +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_variableAssignments(data, flagger): + var1, var2, *_ = data.columns + + config = f""" + {F.VARNAME} ; {F.TEST} + dummy1 ; procGeneric(func=var1 + var2) + dummy2 ; flagGeneric(func=var1 + var2 > 0) + """ + + fobj = writeIO(config) + saqc = SaQC(flagger, data).readConfig(fobj) + result_data, result_flagger = saqc.getResult() + + assert set(result_data.columns) == set(data.columns) | { + "dummy1", + } + assert 
set(result_flagger.getFlags().columns) == set(data.columns) | {"dummy1", "dummy2"}
+
+
+@pytest.mark.xfail(strict=True)
+@pytest.mark.parametrize("flagger", TESTFLAGGER)
+def test_procGenericMultiple(data_diff, flagger):
+    var1, var2, *_ = data_diff.columns
+
+    config = f"""
+    {F.VARNAME} ; {F.TEST}
+    dummy ; procGeneric(func=var1 + 1)
+    dummy ; procGeneric(func=var2 - 1)
+    """
+
+    fobj = writeIO(config)
+    saqc = SaQC(flagger, data_diff).readConfig(fobj)
+    result_data, result_flagger = saqc.getResult()
+    assert len(result_data["dummy"]) == len(result_flagger.getFlags("dummy"))
+
+
+def test_callableArgumentsUnary(data):
+
+    window = 5
+
+    @register
+    def testFuncUnary(data, field, flagger, func, **kwargs):
+        data[field] = data[field].rolling(window=window).apply(func)
+        return data, flagger.initFlags(data=data)
+
+    flagger = SimpleFlagger()
+    var = data.columns[0]
+
+    config = f"""
+    {F.VARNAME} ; {F.TEST}
+    {var} ; testFuncUnary(func={{0}})
+    """
+
+    tests = [
+        ("sum", np.sum),
+        ("std(exp(x))", lambda x: np.std(np.exp(x))),
+    ]
+
+    for (name, func) in tests:
+        fobj = writeIO(config.format(name))
+        result_config, _ = SaQC(flagger, data).readConfig(fobj).getResult()
+        result_api, _ = SaQC(flagger, data).testFuncUnary(var, func=func).getResult()
+        expected = data[var].rolling(window=window).apply(func)
+        assert (result_config[var].dropna() == expected.dropna()).all(axis=None)
+        assert (result_api[var].dropna() == expected.dropna()).all(axis=None)
+
+
+def test_callableArgumentsBinary(data):
+
+    flagger = SimpleFlagger()
+    var1, var2 = data.columns[:2]
+
+    @register
+    def testFuncBinary(data, field, flagger, func, **kwargs):
+        data[field] = func(data[var1], data[var2])
+        return data, flagger.initFlags(data=data)
+
+
+    config = f"""
+    {F.VARNAME} ; {F.TEST}
+    {var1} ; testFuncBinary(func={{0}})
+    """
+
+    tests = [
+        ("x + y", lambda x, y: x + y),
+        ("y - (x * 2)", lambda y, x: y - (x * 2)),
+    ]
+
+    for (name, func) in tests:
+        fobj = writeIO(config.format(name))
+        result_config, _ = SaQC(flagger, data).readConfig(fobj).getResult()
+        result_api, _ = SaQC(flagger, data).testFuncBinary(var1, func=func).getResult()
+        expected = func(data[var1], data[var2])
+        assert (result_config[var1].dropna() == expected.dropna()).all(axis=None)
+        assert (result_api[var1].dropna() == expected.dropna()).all(axis=None)
diff --git a/test/funcs/test_generic_functions.py b/test/funcs/test_generic_functions.py
deleted file mode 100644
index 873c96116c1175728c72efc5394dce0197f973e0..0000000000000000000000000000000000000000
--- a/test/funcs/test_generic_functions.py
+++ /dev/null
@@ -1,266 +0,0 @@
-#! 
/usr/bin/env python -# -*- coding: utf-8 -*- - -import pytest -import numpy as np - -from dios.dios import DictOfSeries - -from saqc.core.core import run -from saqc.core.config import Fields as F - -from test.common import TESTFLAGGER, TESTNODATA, initData, writeIO - - -from saqc.core.evaluator import ( - DslTransformer, - initLocalEnv, - parseExpression, - evalExpression, - compileTree, - evalCode, -) - - -def _evalDslExpression(expr, data, field, flagger, nodata=np.nan): - env = initLocalEnv(data, field, flagger, nodata) - tree = parseExpression(expr) - transformed_tree = DslTransformer(env).visit(tree) - code = compileTree(transformed_tree) - return evalCode(code, local_env=env) - - -@pytest.fixture -def data(): - return initData() - - -@pytest.fixture -def data_diff(): - data = initData(cols=3) - col0 = data[data.columns[0]] - col1 = data[data.columns[1]] - mid = len(col0) // 2 - offset = len(col0) // 8 - return DictOfSeries( - data={ - col0.name: col0.iloc[:mid + offset], - col1.name: col1.iloc[mid - offset:], - } - ) - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_missingIdentifier(data, flagger): - - flagger = flagger.initFlags(data) - tests = ["flagGeneric(func=fff(var2) < 5)", "flagGeneric(func=var3 != NODATA)"] - for expr in tests: - with pytest.raises(NameError): - evalExpression(expr, data, data.columns[0], flagger, np.nan) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_comparisonOperators(data, flagger): - flagger = flagger.initFlags(data) - var1, var2, *_ = data.columns - this = var1 - - tests = [ - ("this > 100", data[this] > 100), - (f"10 >= {var2}", 10 >= data[var2]), - (f"{var2} < 100", data[var2] < 100), - (f"this <= {var2}", data[this] <= data[var2]), - (f"{var1} == {var2}", data[this] == data[var2]), - (f"{var1} != {var2}", data[this] != data[var2]), - ] - - # check within the usually enclosing scope - for expr, mask in tests: - _, result_flagger = evalExpression(f"flagGeneric(func={expr})", data, this, flagger, np.nan) - expected_flagger = flagger.setFlags(this, loc=mask, test="generic") - assert (result_flagger.isFlagged() == expected_flagger.isFlagged()).all(None) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_arithmeticOperators(data, flagger): - flagger = flagger.initFlags(data) - var1, *_ = data.columns - this = data[var1] - - tests = [ - ("this + 100", this + 100), - ("this - 1000", this - 1000), - ("this * 2", this * 2), - ("this / 100", this / 100), - ("this % 2", this % 2), - ("this ** 2", this ** 2), - ] - - # check within the usually enclosing scope - for expr, expected in tests: - result_data, _ = evalExpression(f"procGeneric(func={expr})", data, var1, flagger, np.nan) - assert np.all(result_data[expected.name] == expected) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_nonReduncingBuiltins(data, flagger): - flagger = flagger.initFlags(data) - var1, *_ = data.columns - this = data[var1] - - tests = [ - ("abs(this)", np.abs(this)), - ("sqrt(this)", np.sqrt(this)), - ("exp(this)", np.exp(this)), - ("log(this)", np.log(this)), - ] - - for expr, expected in tests: - result_data, _ = evalExpression(f"procGeneric(func={expr})", data, var1, flagger, np.nan) - assert np.all(result_data[expected.name] == expected) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -@pytest.mark.parametrize("nodata", TESTNODATA) -def test_reduncingBuiltins(data, flagger, nodata): - data.loc[::4] = nodata - flagger = flagger.initFlags(data) - var1, *_ = data.columns - this = data[var1] - - tests = [ - ("min(this)", 
np.min(this)), - (f"max(this)", np.max(this)), - (f"sum(this)", np.nansum(this)), - ("mean(this)", np.nanmean(this)), - (f"std(this)", np.std(this)), - (f"len(this)", len(this)), - ] - for expr, expected in tests: - result_data, _ = evalExpression(f"procGeneric(func={expr})", data, var1, flagger, np.nan) - assert np.all(result_data[var1] == expected) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -@pytest.mark.parametrize("nodata", TESTNODATA) -def test_ismissing(data, flagger, nodata): - - data.iloc[: data.lengths[0] // 2, 0] = np.nan - data.iloc[(data.lengths[0] // 2) + 1 :, 0] = -9999 - var1, *_ = data.columns - - flagger = flagger.initFlags(data) - - tests = [ - (f"ismissing({var1})", lambda data: (data.isna() | (data == nodata)).all()), - (f"~ismissing({var1})", lambda data: ~(data.isna() | (data == nodata)).all(),), - ] - - for expr, checkFunc in tests: - idx = _evalDslExpression(expr, data, var1, flagger, nodata) - assert checkFunc(data.loc[idx, var1]) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -@pytest.mark.parametrize("nodata", TESTNODATA) -def test_bitOps(data, flagger, nodata): - var1, var2, *_ = data.columns - this = data[var1] - - flagger = flagger.initFlags(data) - - tests = [ - (f"~(this > mean(this))", ~(this > np.nanmean(this))), - (f"(this <= 0) | (0 < {var1})", (this <= 0) | (0 < data[var1])), - (f"({var2} >= 0) & (0 > this)", (data[var2] >= 0) & (0 > this)), - ] - - for expr, expected in tests: - _, flagger_result = evalExpression(f"flagGeneric(func={expr})", data, this.name, flagger, nodata) - assert (flagger_result.isFlagged(this.name) == expected).all() - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_isflagged(data, flagger): - - flagger = flagger.initFlags(data) - var1, var2, *_ = data.columns - - flagger = flagger.setFlags(var1, loc=slice(None, None, 2)) - flagger = flagger.setFlags(var2, loc=slice(None, None, 2)) - - idx = _evalDslExpression(f"isflagged({var1})", data, var2, flagger) - - flagged = flagger.isFlagged(var1) - assert (flagged == idx).all - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_invertIsFlagged(data, flagger): - - flagger = flagger.initFlags(data) - var1, var2, *_ = data.columns - - flagger = flagger.setFlags(var2, loc=slice(None, None, 2)) - - tests = [ - (f"~isflagged({var2})", ~flagger.isFlagged(var2)), - (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))), - ] - - for expr, flags_expected in tests: - _, flagger_result = evalExpression(f"flagGeneric(func={expr})", data, var1, flagger, np.nan) - flags_result = flagger_result.isFlagged(var1) - assert (flags_result == flags_expected).all(None) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_isflaggedArgument(data, flagger): - - var1, var2, *_ = data.columns - - flagger = flagger.initFlags(data).setFlags(var1, loc=slice(None, None, 2), flag=flagger.BAD) - - tests = [ - (_evalDslExpression(f"isflagged({var1}, BAD)", data, var2, flagger), flagger.isFlagged(var1, flag=flagger.BAD)), - ( - _evalDslExpression(f"isflagged({var1}, UNFLAGGED, '==')", data, var2, flagger), - flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="=="), - ), - ] - - for result, expected in tests: - assert (result == expected).all(None) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_variableAssignments(data, flagger): - var1, var2, *_ = data.columns - - config = f""" - {F.VARNAME} ; {F.TESTS} - dummy1 ; procGeneric(func=var1 + var2) - dummy2 ; flagGeneric(func=var1 + var2 > 0) - """ - - result_data, 
result_flagger = run(writeIO(config), flagger, data) - - assert set(result_data.columns) == set(data.columns) | { - "dummy1", - } - assert set(result_flagger.getFlags().columns) == set(data.columns) | {"dummy1", "dummy2"} - - -@pytest.mark.xfail(strict=True) -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_procGenericMultiple(data_diff, flagger): - var1, var2, *_ = data_diff.columns - - config = f""" - {F.VARNAME} ; {F.TESTS} - dummy ; procGeneric(func=var1 + 1) - dummy ; procGeneric(func=var2 - 1) - """ - - result_data, result_flagger = run(writeIO(config), flagger, data_diff) - assert len(result_data["dummy"]) == len(result_flagger.getFlags("dummy")) diff --git a/test/funcs/test_harm_funcs.py b/test/funcs/test_harm_funcs.py index d2e6783dbacef62876b8759d3379af0b7a516a44..e5e0447b6ea03f031d66367014a0821cc4027463 100644 --- a/test/funcs/test_harm_funcs.py +++ b/test/funcs/test_harm_funcs.py @@ -29,12 +29,8 @@ COFLAGGING = [False, True] SETSHIFTCOMMENT = [False, True] -INTERPOLATIONS = ["fshift", "bshift", "nshift", "nagg", "bagg"] - INTERPOLATIONS2 = ["fagg", "time", "polynomial"] -FREQS = ["15min", "30min"] - @pytest.fixture def data(): @@ -79,40 +75,6 @@ def multi_data(): return dios.DictOfSeries(data) -@pytest.mark.parametrize("method", INTERPOLATIONS2) -def test_gridInterpolation(data, method): - freq = "15min" - data = data.squeeze() - data = (data * np.sin(data)).append(data.shift(1, "2h")).shift(1, "3s") - kwds = dict(agg_method="sum", downcast_interpolation=True) - - # we are just testing if the interpolation gets passed to the series without causing an error: - _interpolateGrid(data, freq, method, order=1, **kwds) - if method == "polynomial": - _interpolateGrid(data, freq, method, order=2, **kwds) - _interpolateGrid(data, freq, method, order=10, **kwds) - data = _insertGrid(data, freq) - _interpolate(data, method, inter_limit=3) - - -@pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_outsortCrap(data, flagger): - field = data.columns[0] - s = data[field] - flagger = flagger.initFlags(data) - - drop_index = s.index[5:7] - flagger = flagger.setFlags(field, loc=drop_index) - res, *_ = _outsortCrap(s, field, flagger, drop_flags=flagger.BAD) - assert drop_index.difference(res.index).equals(drop_index) - - flagger = flagger.setFlags(field, loc=s.iloc[0:1], flag=flagger.GOOD) - drop_index = drop_index.insert(-1, s.index[0]) - to_drop = [flagger.BAD, flagger.GOOD] - res, *_ = _outsortCrap(s, field, flagger, drop_flags=to_drop) - assert drop_index.sort_values().difference(res.index).equals(drop_index.sort_values()) - - @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_heapConsistency(data, flagger): @@ -187,69 +149,43 @@ def test_harmSingleVarIntermediateFlagging(data, flagger, reshaper, co_flagging) @pytest.mark.parametrize("flagger", TESTFLAGGER) -@pytest.mark.parametrize("interpolation", INTERPOLATIONS) -@pytest.mark.parametrize("freq", FREQS) -def test_harmSingleVarInterpolations(data, flagger, interpolation, freq): +def test_harmSingleVarInterpolations(data, flagger): flagger = flagger.initFlags(data) - flags = flagger.getFlags() - # make pre harm copies: - pre_data = data.copy() - pre_flags = flags.copy() - assert len(data.columns) == 1 field = data.columns[0] - harm_start = data[field].index[0].floor(freq=freq) - harm_end = data[field].index[-1].ceil(freq=freq) - test_index = pd.date_range(start=harm_start, end=harm_end, freq=freq) - data, flagger = harm_harmonize( - data, "data", flagger, freq, interpolation, "fshift", reshape_shift_comment=False, 
inter_agg="sum", - ) + tests = [ + ("fshift", "15Min", [np.nan, -37.5, -25.0, 0.0, 37.5, 50.0]), + ("fshift", "30Min", [np.nan, -37.5, 0.0, 50.0]), + ("bshift", "15Min", [-50.0, -37.5, -25.0, 12.5, 37.5, 50.0]), + ("bshift", "30Min", [-50.0, -37.5, 12.5, 50.0]), + ("nshift", "15min", [np.nan, -37.5, -25.0, 12.5, 37.5, 50.0]), + ("nshift", "30min", [np.nan, -37.5, 12.5, 50.0]), + ("nagg", "15Min", [np.nan, -87.5, -25.0, 0.0, 37.5, 50.0]), + ("nagg", "30Min", [np.nan, -87.5, -25.0, 87.5]), + ("bagg", "15Min", [-50.0, -37.5, -37.5, 12.5, 37.5, 50.0]), + ("bagg", "30Min", [-50.0, -75.0, 50.0, 50.0]), + ] + + for interpolation, freq, expected in tests: + data_harm, _ = harm_harmonize( + data, "data", flagger, freq, interpolation, "fshift", reshape_shift_comment=False, inter_agg="sum", + ) + + harm_start = data[field].index[0].floor(freq=freq) + harm_end = data[field].index[-1].ceil(freq=freq) + test_index = pd.date_range(start=harm_start, end=harm_end, freq=freq) + expected = pd.Series(expected, index=test_index) + assert data_harm[field].equals(expected) + + data_deharm, flagger_deharm = harm_deharmonize(data, "data", flagger, co_flagging=True) - if interpolation == "fshift": - if freq == "15min": - exp = pd.Series([np.nan, -37.5, -25.0, 0.0, 37.5, 50.0], index=test_index) - assert data[field].equals(exp) - if freq == "30min": - exp = pd.Series([np.nan, -37.5, 0.0, 50.0], index=test_index) - assert data[field].equals(exp) - if interpolation == "bshift": - if freq == "15min": - exp = pd.Series([-50.0, -37.5, -25.0, 12.5, 37.5, 50.0], index=test_index) - assert data[field].equals(exp) - if freq == "30min": - exp = pd.Series([-50.0, -37.5, 12.5, 50.0], index=test_index) - assert data[field].equals(exp) - if interpolation == "nshift": - if freq == "15min": - exp = pd.Series([np.nan, -37.5, -25.0, 12.5, 37.5, 50.0], index=test_index) - assert data[field].equals(exp) - if freq == "30min": - exp = pd.Series([np.nan, -37.5, 12.5, 50.0], index=test_index) - assert data[field].equals(exp) - if interpolation == "nagg": - if freq == "15min": - exp = pd.Series([np.nan, -87.5, -25.0, 0.0, 37.5, 50.0], index=test_index) - assert data[field].equals(exp) - if freq == "30min": - exp = pd.Series([np.nan, -87.5, -25.0, 87.5], index=test_index) - assert data[field].equals(exp) - if interpolation == "bagg": - if freq == "15min": - exp = pd.Series([-50.0, -37.5, -37.5, 12.5, 37.5, 50.0], index=test_index) - assert data[field].equals(exp) - if freq == "30min": - exp = pd.Series([-50.0, -75.0, 50.0, 50.0], index=test_index) - assert data[field].equals(exp) - - data, flagger = harm_deharmonize(data, "data", flagger, co_flagging=True) - - # data, flagger = harm_deharmonize(data, "data", flagger, co_flagging=True) flags = flagger.getFlags() + flags_deharm = flagger_deharm.getFlags() - assert pre_data[field].equals(data[field]) - assert len(data[field]) == len(flags[field]) - assert (pre_flags[field].index == flags[field].index).all() + assert data[field].equals(data[field]) + assert len(data_deharm[field]) == len(flags[field]) + assert (flags[field].index == flags_deharm[field].index).all() @pytest.mark.parametrize("flagger", TESTFLAGGER) @@ -334,15 +270,13 @@ def test_outsortCrap(data, flagger): @pytest.mark.parametrize("flagger", TESTFLAGGER) - - def test_wrapper(data, flagger): # we are only testing, whether the wrappers do pass processing: field = data.columns[0] freq = "15min" flagger = flagger.initFlags(data) - harm_downsample(data, field, flagger, "15min", "30min", agg_func="sum", sample_func="mean") - 
harm_linear2Grid(data, field, flagger, freq, method="nagg", func="max", drop_flags=None) - harm_aggregate2Grid(data, field, flagger, freq, value_func="sum", flag_func="max", method="nagg", drop_flags=None) + harm_downsample(data, field, flagger, "15min", "30min", agg_func=np.nansum, sample_func=np.nanmean) + harm_linear2Grid(data, field, flagger, freq, method="nagg", func=np.nanmax, drop_flags=None) + harm_aggregate2Grid(data, field, flagger, freq, value_func=np.nansum, flag_func=np.nanmax, method="nagg", drop_flags=None) harm_shift2Grid(data, field, flagger, freq, method="nshift", drop_flags=None) harm_interpolate2Grid(data, field, flagger, freq, method="spline") diff --git a/test/funcs/test_spikes_detection.py b/test/funcs/test_spikes_detection.py index daac97d2c11390e9b5c65af3a8a3fdfb8a3aa7ee..d3b48a52c88d5c0ed95e17b9a5cabb86a44746c5 100644 --- a/test/funcs/test_spikes_detection.py +++ b/test/funcs/test_spikes_detection.py @@ -113,7 +113,7 @@ def test_flagSpikesOddWater(dat, flagger): data = dios.DictOfSeries([s1, s2], columns=["data1", "data2"]) flagger = flagger.initFlags(data) _, flagger_result = spikes_flagMultivarScores( - data, field, flagger, fields=fields, binning=50, trafo='np.log', + data, field, flagger, fields=fields, binning=50, trafo=np.log, iter_start=0.95, n_neighbors=10 ) for field in fields: