diff --git a/README.md b/README.md index 2e29d34472b3c151150132e7357c8a66cb702d48..02262fd5cad0080554a16c15d59c9d418ca7216c 100644 --- a/README.md +++ b/README.md @@ -58,11 +58,11 @@ dataset and the routines to inspect, quality control and/or process them. The content of such a configuration could look like this: ``` -varname ; test +varname ; test #----------;------------------------------------ -SM2 ; shiftToFreq(freq="15Min") -SM2 ; flagMissing(nodata=NAN) -'SM(1|2)+' ; flagRange(min=10, max=60) +SM2 ; shiftToFreq(freq="15Min") +SM2 ; flagMissing() +'SM(1|2)+' ; flagRange(min=10, max=60) SM2 ; flagMad(window="30d", z=3.5) ``` @@ -86,7 +86,7 @@ from saqc import SaQC saqc = (SaQC(data) .shiftToFreq("SM2", freq="15Min") - .flagMissing("SM2", nodata=np.nan) + .flagMissing("SM2") .flagRange("SM(1|2)+", regex=True, min=10, max=60) .flagMad("SM2", window="30d", z=3.5)) diff --git a/ressources/data/config.csv b/ressources/data/config.csv index 9a7e14c7585470846507969cde3c1f2767414b73..7bbdbbfbc1765178ac249663c80653fd191a89db 100644 --- a/ressources/data/config.csv +++ b/ressources/data/config.csv @@ -1,6 +1,6 @@ varname ; test #----------;--------------------------------------- SM2 ; resampling.shift(freq="15Min") -SM2 ; breaks.flagMissing(nodata=NAN) +SM2 ; breaks.flagMissing() 'SM(1|2)+' ; outliers.flagRange(min=10, max=60) SM2 ; outliers.flagMAD(window="30d", z=3.5) diff --git a/ressources/data/config_ci.csv b/ressources/data/config_ci.csv index 3702e0c78dac87e75488ce5ad545b453c14edcf6..ecbe227e8562bdaa967785e1ff538c224b5f4fa4 100644 --- a/ressources/data/config_ci.csv +++ b/ressources/data/config_ci.csv @@ -2,7 +2,7 @@ varname ; test #-------; ----------------------------------------------------- SM2 ; resampling.shift(freq="15Min") '.*' ; outliers.flagRange(min=10, max=60) -SM2 ; breaks.flagMissing(nodata=NAN) +SM2 ; breaks.flagMissing() SM2 ; outliers.flagRange(min=10, max=60) SM2 ; outliers.flagMAD(window="30d", z=3.5) Dummy ; generic.flag(func=(isflagged(SM1) | isflagged(SM2))) diff --git a/saqc/__main__.py b/saqc/__main__.py index 14b6386d39537fa93b05191f3c83deb04c504f3d..087809007d0596472624cfd665c72fe69af763bc 100644 --- a/saqc/__main__.py +++ b/saqc/__main__.py @@ -110,7 +110,6 @@ def main(config, data, scheme, outfile, nodata, log_level, fail): saqc = SaQC( data=data, - nodata=nodata, scheme=SCHEMES[scheme or "float"](), error_policy="raise" if fail else "warn", ) diff --git a/saqc/core/core.py b/saqc/core/core.py index bbed1b54ed7269796e661cef9ede39dd23a613a8..0cabed2025ae52296fad7d3c45828ee75276752e 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -142,14 +142,12 @@ class SaQC(FuncModules): data, flags=None, scheme: Translator = None, - nodata=np.nan, error_policy="raise", lazy=False, ): super().__init__(self) data, flags = _prepInput(data, flags) self._data = data - self._nodata = nodata self._flags = self._initFlags(data, flags) self._error_policy = error_policy self._lazy = lazy @@ -197,7 +195,6 @@ class SaQC(FuncModules): out = SaQC( data=DictOfSeries(), flags=Flags(), - nodata=self._nodata, error_policy=self._error_policy, scheme=self._translator, ) @@ -219,9 +216,7 @@ class SaQC(FuncModules): from saqc.core.reader import readConfig out = stdcopy.deepcopy(self) - out._planned.extend( - readConfig(fname, self._translator, self._data, self._nodata) - ) + out._planned.extend(readConfig(fname, self._translator, self._data)) if self._lazy: return out return out.evaluate() @@ -305,7 +300,6 @@ class SaQC(FuncModules): partial = func.bind( *fargs, **{ - "nodata": self._nodata, "flag": self._translator(flag), "to_mask": self._translator.TO_MASK, **fkwargs, @@ -378,7 +372,7 @@ def _warnForUnusedKwargs(func, translator: Translator): sig_kws = inspect.signature(func.func).parameters # we need to ignore kws that are injected or by default hidden in ``**kwargs`` - ignore = ("nodata", "to_mask") + ignore = ("to_mask",) missing = [] for kw in func.keywords: diff --git a/saqc/core/modules/breaks.py b/saqc/core/modules/breaks.py index 1edfad5197ca94000849d5c0249ef6ccd61c1fd0..fa3ffb08e33af9f177f25b2ea19aca19241fc252 100644 --- a/saqc/core/modules/breaks.py +++ b/saqc/core/modules/breaks.py @@ -12,9 +12,7 @@ from saqc.lib.types import FreqString, IntegerWindow, ColumnName class Breaks(ModuleBase): - def flagMissing( - self, field: ColumnName, nodata: float = np.nan, flag: float = BAD, **kwargs - ) -> saqc.SaQC: + def flagMissing(self, field: ColumnName, flag: float = BAD, **kwargs) -> saqc.SaQC: return self.defer("flagMissing", locals()) def flagIsolated( diff --git a/saqc/core/modules/generic.py b/saqc/core/modules/generic.py index 990d198047333fcc2898a93952853043e831f976..74f6ee89d111ffee1c67a3120f07b854b3eed44e 100644 --- a/saqc/core/modules/generic.py +++ b/saqc/core/modules/generic.py @@ -17,7 +17,6 @@ class Generic(ModuleBase): self, field: str, func: Callable[[pd.Series], pd.Series], - nodata: float = np.nan, to_mask: float = UNFLAGGED, **kwargs, ) -> saqc.SaQC: @@ -27,7 +26,6 @@ class Generic(ModuleBase): self, field: str, func: Callable[[pd.Series], pd.Series], - nodata: float = np.nan, flag: float = BAD, to_mask: float = UNFLAGGED, **kwargs, diff --git a/saqc/core/reader.py b/saqc/core/reader.py index 21bfc1a5e47c7d440531d4fa9eefee90fed006af..1221fde3e9cb6a63aef9f00d66faf21c4293ec33 100644 --- a/saqc/core/reader.py +++ b/saqc/core/reader.py @@ -47,16 +47,7 @@ def _handleComments(df: pd.DataFrame) -> pd.DataFrame: return df -def _injectOptionalColumns(df): - # inject optional columns - if F.PLOT not in df: - empty = (df == EMPTY).all(axis=1) - df[F.PLOT] = "False" - df[empty] = EMPTY - return df - - -def _parseConfig(df, translator, data, nodata): +def _parseConfig(df, translator, data): funcs = [] for lineno, (_, target, expr) in enumerate(df.itertuples()): if target == "None" or pd.isnull(target) or pd.isnull(expr): @@ -75,9 +66,7 @@ def _parseConfig(df, translator, data, nodata): if "flag" in kwargs: kwargs["flag"] = translator(kwargs["flag"]) - partial = func.bind( - **{"nodata": nodata, "to_mask": translator.TO_MASK, **kwargs} - ) + partial = func.bind(**{"to_mask": translator.TO_MASK, **kwargs}) targets = toSequence(target) @@ -91,7 +80,7 @@ def _parseConfig(df, translator, data, nodata): return funcs -def readConfig(fname, translator, data, nodata): +def readConfig(fname, translator, data): df = pd.read_csv( fname, sep=r"\s*;\s*", @@ -107,4 +96,4 @@ def readConfig(fname, translator, data, nodata): df[F.VARNAME] = df[F.VARNAME].replace(r"^\s*$", np.nan, regex=True) df[F.TEST] = df[F.TEST].replace(r"^\s*$", np.nan, regex=True) - return _parseConfig(df, translator, data, nodata) + return _parseConfig(df, translator, data) diff --git a/saqc/core/visitor.py b/saqc/core/visitor.py index 8e07ddc6d4fa2a88693d6589e632b71f722d0eaf..e4b768f4d0b427e55b03831a7fafddffa4da4e18 100644 --- a/saqc/core/visitor.py +++ b/saqc/core/visitor.py @@ -39,17 +39,18 @@ ENVIRONMENT = { "zLog": ts_ops.zeroLog, } -# TODO: how does a user pass flags now -RESERVED = {"GOOD", "BAD", "UNFLAGGED", "NODATA"} +# TODO: +# get from saqc.constants +RESERVED = {"GOOD", "BAD", "UNFLAGGED"} class ConfigExpressionParser(ast.NodeVisitor): """ Generic configuration functions will be rewritten as lambda functions and variables that need a look up in `data` will act as arguments, e.g.: - `flagGeneric(func=(x != NODATA) & (y < 3))` + `flagGeneric(func=(x != 4) & (y < 3))` will be rewritten to - `lambda x, y: (x != NODATA) & (y < 3)` + `lambda x, y: (x != 4) & (y < 3)` The main purpose of this class is to identify all such lambda arguments and check the given expression for accordance with the restrictions diff --git a/saqc/funcs/breaks.py b/saqc/funcs/breaks.py index 4d34a550a55167fe594af0c312d4bc0e96f091b8..7a921c927f4480d65131cf3ff9cdf9ab8eed8d4f 100644 --- a/saqc/funcs/breaks.py +++ b/saqc/funcs/breaks.py @@ -33,7 +33,6 @@ def flagMissing( data: DictOfSeries, field: ColumnName, flags: Flags, - nodata: float = np.nan, flag: float = BAD, to_mask: float = UNFLAGGED, **kwargs @@ -49,8 +48,6 @@ def flagMissing( The fieldname of the column, holding the data-to-be-flagged. flags : saqc.Flags Container to store quality flags to data. - nodata : any, default np.nan - A value that defines missing data. flag : float, default BAD flag to set. @@ -63,11 +60,8 @@ def flagMissing( """ datacol = data[field] + mask = datacol.isna() - if np.isnan(nodata): - mask = datacol.isna() - else: - mask = datacol == nodata mask = ~_isflagged(flags[field], to_mask) & mask flags[mask, field] = flag diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index 19738b6715cea56f4b13ab5bea4961e891cc2d76..cc2142cb562e4c247d1c08828518f72ba0777c3b 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -55,7 +55,6 @@ def _execGeneric( data: DictOfSeries, func: Callable[[pd.Series], pd.Series], field: str, - nodata: float, ) -> pd.Series: # TODO: # - check series.index compatibility @@ -72,10 +71,9 @@ def _execGeneric( globs = { "isflagged": partial(_dslIsFlagged, flags), - "ismissing": lambda var: ((var == nodata) | pd.isnull(var)), + "ismissing": lambda var: pd.isnull(var), "mask": lambda cond: data[cond.name].mask(cond), "this": field, - "NODATA": nodata, "GOOD": GOOD, "BAD": BAD, "UNFLAGGED": UNFLAGGED, @@ -91,7 +89,6 @@ def process( field: str, flags: Flags, func: Callable[[pd.Series], pd.Series], - nodata: float = np.nan, to_mask: float = UNFLAGGED, **kwargs, ) -> Tuple[DictOfSeries, Flags]: @@ -106,7 +103,7 @@ def process( Than, for every timestamp t_i that occurs in at least one of the timeseries data[f_j] (outer join), The value v_i is computed via: v_i = data([f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]), if all data[f_j][t_i] do exist - v_i = `nodata`, if at least one of the data[f_j][t_i] is missing. + v_i = `np.nan`, if at least one of the data[f_j][t_i] is missing. 2. The result is stored to data[field] (gets generated if not present) Parameters @@ -121,8 +118,6 @@ def process( The data processing function with parameter names that will be interpreted as data column entries. See the examples section to learn more. - nodata : any, default np.nan - The value that indicates missing/invalid data Returns ------- @@ -147,7 +142,7 @@ def process( # we get the data unmasked in order to also receive flags, # so let's do to the masking manually data_masked, _ = _maskData(data, flags, data.columns, to_mask) - data[field] = _execGeneric(flags, data_masked, func, field, nodata).squeeze() + data[field] = _execGeneric(flags, data_masked, func, field).squeeze() if field in flags: flags.drop(field) @@ -163,7 +158,6 @@ def flag( field: str, flags: Flags, func: Callable[[pd.Series], pd.Series], - nodata: float = np.nan, flag: float = BAD, to_mask: float = UNFLAGGED, **kwargs, @@ -200,8 +194,6 @@ def flag( The expression that is to be evaluated is passed in form of a callable, with parameter names that will be interpreted as data column entries. The Callable must return an boolen array like. See the examples section to learn more. - nodata : any, default np.nan - The value that indicates missing/invalid data flag : float, default BAD flag to set. @@ -249,7 +241,7 @@ def flag( # so let's do to the masking manually data_masked, _ = _maskData(data, flags, data.columns, to_mask) - mask = _execGeneric(flags, data_masked, func, field, nodata).squeeze() + mask = _execGeneric(flags, data_masked, func, field).squeeze() if np.isscalar(mask): raise TypeError(f"generic expression does not return an array") if not np.issubdtype(mask.dtype, np.bool_): diff --git a/sphinx-doc/getting_started_md/ParameterDescriptions.md b/sphinx-doc/getting_started_md/ParameterDescriptions.md index 8fcfa0511100177240701bb9338174bf4dfde27a..0581f55c189f0c1b80c22296a188104d9b6a6716 100644 --- a/sphinx-doc/getting_started_md/ParameterDescriptions.md +++ b/sphinx-doc/getting_started_md/ParameterDescriptions.md @@ -32,4 +32,3 @@ and might range from numerical values to string constants. | Alias | Description | | ---- | ---- | | `NAN` | Not a number | -| `NODATA` | Missing data | diff --git a/sphinx-doc/ressources/data/config.csv b/sphinx-doc/ressources/data/config.csv index c8f7f803de8e32aafba102654ee9112aaa3e659d..6f31389afc600155fb18cd9710738d3fc809d065 100644 --- a/sphinx-doc/ressources/data/config.csv +++ b/sphinx-doc/ressources/data/config.csv @@ -1,6 +1,6 @@ varname ; test ; plot #----------;-------------------------------------;------ -SM2 ; shift(freq="15Min") ; False -SM2 ; flagMissing(nodata=NAN) ; False +SM2 ; shift(freq="15Min") ; False +SM2 ; flagMissing() ; False 'SM(1|2)+' ; flagRange(min=10, max=60) ; False SM2 ; flagMAD(window="30d", z=3.5) ; True diff --git a/sphinx-doc/ressources/data/config_ci.csv b/sphinx-doc/ressources/data/config_ci.csv index 74ddfbae40943f3bbcc75e58fbab4379e39815c4..05e22c807bd26768c2de338c35970720b60fdb3c 100644 --- a/sphinx-doc/ressources/data/config_ci.csv +++ b/sphinx-doc/ressources/data/config_ci.csv @@ -1,7 +1,7 @@ varname;test;plot SM2;shift(freq="15Min");False '.*';flagRange(min=10, max=60);False -SM2;flagMissing(nodata=NAN);False +SM2;flagMissing();False SM2;flagRange(min=10, max=60);False SM2;flagMAD(window="30d", z=3.5);False Dummy;flag(func=(isflagged(SM1) | isflagged(SM2))) diff --git a/tests/common.py b/tests/common.py index cab0892a247b38bfb0e3ce98850fa7a5602d7497..7072a3173c1c682de986c2c7a076cad1d244179c 100644 --- a/tests/common.py +++ b/tests/common.py @@ -7,11 +7,9 @@ import pandas as pd import dios from saqc.constants import * -from saqc.core import initFlagsLike, Flags +from saqc.core import Flags from saqc.core.history import History, createHistoryFromData -TESTNODATA = (np.nan, -9999) - def flagAll(data, field, flags, **kwargs): # NOTE: remember to rename flag -> flag_values diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py index b5f5aa0e18db842d44e28895f1d2f7a0dd7bb6b2..781205bbaadea0d0f4068000e45326547223b633 100644 --- a/tests/funcs/test_generic_config_functions.py +++ b/tests/funcs/test_generic_config_functions.py @@ -15,7 +15,7 @@ from saqc.core.register import flagging from saqc.funcs.generic import _execGeneric from saqc import SaQC -from tests.common import TESTNODATA, initData, writeIO +from tests.common import initData, writeIO @pytest.fixture @@ -51,13 +51,13 @@ def test_missingIdentifier(data): # - the error is only raised at runtime during parsing would be better tests = [ "fff(var2) < 5", - "var3 != NODATA", + "var3 != 42", ] for test in tests: func = _compileGeneric(f"generic.flag(func={test})", flags) with pytest.raises(NameError): - _execGeneric(flags, data, func, field="", nodata=np.nan) + _execGeneric(flags, data, func, field="") def test_syntaxError(): @@ -104,7 +104,7 @@ def test_comparisonOperators(data): for test, expected in tests: func = _compileGeneric(f"generic.flag(func={test})", flags) - result = _execGeneric(flags, data, func, field=var1, nodata=np.nan) + result = _execGeneric(flags, data, func, field=var1) assert np.all(result == expected) @@ -124,7 +124,7 @@ def test_arithmeticOperators(data): for test, expected in tests: func = _compileGeneric(f"generic.process(func={test})", flags) - result = _execGeneric(flags, data, func, field=var1, nodata=np.nan) + result = _execGeneric(flags, data, func, field=var1) assert np.all(result == expected) @@ -146,13 +146,12 @@ def test_nonReduncingBuiltins(data): for test, expected in tests: func = _compileGeneric(f"generic.process(func={test})", flags) - result = _execGeneric(flags, data, func, field=this, nodata=np.nan) + result = _execGeneric(flags, data, func, field=this) assert (result == expected).all() -@pytest.mark.parametrize("nodata", TESTNODATA) -def test_reduncingBuiltins(data, nodata): - data.loc[::4] = nodata +def test_reduncingBuiltins(data): + data.loc[::4] = np.nan flags = initFlagsLike(data) var1 = data.columns[0] this = data.iloc[:, 0] @@ -168,12 +167,11 @@ def test_reduncingBuiltins(data, nodata): for test, expected in tests: func = _compileGeneric(f"generic.process(func={test})", flags) - result = _execGeneric(flags, data, func, field=this.name, nodata=nodata) + result = _execGeneric(flags, data, func, field=this.name) assert result == expected -@pytest.mark.parametrize("nodata", TESTNODATA) -def test_ismissing(data, nodata): +def test_ismissing(data): flags = initFlagsLike(data) data.iloc[: len(data) // 2, 0] = np.nan @@ -181,18 +179,17 @@ def test_ismissing(data, nodata): this = data.iloc[:, 0] tests = [ - (f"ismissing({this.name})", (pd.isnull(this) | (this == nodata))), - (f"~ismissing({this.name})", (pd.notnull(this) & (this != nodata))), + (f"ismissing({this.name})", pd.isnull(this)), + (f"~ismissing({this.name})", pd.notnull(this)), ] for test, expected in tests: func = _compileGeneric(f"generic.flag(func={test})", flags) - result = _execGeneric(flags, data, func, this.name, nodata) + result = _execGeneric(flags, data, func, this.name) assert np.all(result == expected) -@pytest.mark.parametrize("nodata", TESTNODATA) -def test_bitOps(data, nodata): +def test_bitOps(data): var1, var2, *_ = data.columns this = var1 @@ -206,7 +203,7 @@ def test_bitOps(data, nodata): for test, expected in tests: func = _compileGeneric(f"generic.flag(func={test})", flags) - result = _execGeneric(flags, data, func, this, nodata) + result = _execGeneric(flags, data, func, this) assert np.all(result == expected) @@ -230,7 +227,7 @@ def test_isflagged(data): for i, (test, expected) in enumerate(tests): try: func = _compileGeneric(f"generic.flag(func={test}, flag=BAD)", flags) - result = _execGeneric(flags, data, func, field=None, nodata=np.nan) + result = _execGeneric(flags, data, func, field=None) assert np.all(result == expected) except Exception: print(i, test) @@ -242,7 +239,7 @@ def test_isflagged(data): func = _compileGeneric(f"generic.flag(func={fails}, flag=BAD)", flags) with pytest.raises(ValueError): - _execGeneric(flags, data, func, field=None, nodata=np.nan) + _execGeneric(flags, data, func, field=None) def test_variableAssignments(data):