diff --git a/environment.yml b/environment.yml
index c08ddee3afdb51bd620f7b7c36e338dac72d42df..aabe58efcf1dab30195217d3e75e53abe969cf63 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,6 +10,10 @@ dependencies:
   - click
   - scikit-learn
   - pyarrow
+  - PyWavelets
   - pip
   - pip:
     - python-intervals
+    - dtw
+    - mlxtend
+    - outlier-utils
diff --git a/requirements.txt b/requirements.txt
index db2e71d721c465a108648e0725c8c309e88689ae..45132e2397cf607c5b17e106d94eed3a0b0ce753 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-attrs==20.1.0
+attrs==20.2.0
 Click==7.1.2
 cycler==0.10.0
 dios==0.6.0
@@ -10,13 +10,13 @@ llvmlite==0.34.0
 mlxtend==0.17.3
 matplotlib==3.3.1
 more-itertools==8.5.0
-numba==0.51.1
-numpy==1.19.1
+numba==0.51.2
+numpy==1.19.2
 outlier==0.2
 utils==1.0.1
 outlier-utils==0.0.3
 packaging==20.4
-pandas==1.1.1
+pandas==1.1.2
 pluggy==0.13.1
 pyparsing==2.4.7
 py==1.9.0
diff --git a/saqc/core/core.py b/saqc/core/core.py
index 0350f2e6b9faf1ad8d0792b52fd60a93d66e3674..d00752a8edd058b8b44a955bdc41153f448d78de 100644
--- a/saqc/core/core.py
+++ b/saqc/core/core.py
@@ -105,10 +105,11 @@ _setup()
 
 
 class SaQC:
-    def __init__(self, flagger, data, flags=None, nodata=np.nan, error_policy="raise"):
+    def __init__(self, flagger, data, flags=None, nodata=np.nan, to_mask=None, error_policy="raise"):
         data, flags = _prepInput(flagger, data, flags)
         self._data = data
         self._nodata = nodata
+        self._to_mask = to_mask
         self._flagger = self._initFlagger(data, flagger, flags)
         self._error_policy = error_policy
         # NOTE: will be filled by calls to `_wrap`
@@ -131,7 +132,7 @@ class SaQC:
 
     def readConfig(self, fname):
 
-        config = readConfig(fname)
+        config = readConfig(fname, self._flagger)
 
         out = deepcopy(self)
         for func, field, kwargs, plot, lineno, expr in config:
@@ -218,19 +219,12 @@ class SaQC:
         def inner(field: str, *args, regex: bool = False, to_mask=None, plot=False, inplace=False, **kwargs):
             fields = [field] if not regex else self._data.columns[self._data.columns.str.match(field)]
 
-            if func_name in ("flagGeneric", "procGeneric"):
-                # NOTE:
-                # We need to pass `nodata` to the generic functions
-                # (to implement stuff like `ismissing`). As we
-                # should not interfere with proper nodata attributes
-                # of other test functions (e.g. `flagMissing`) we
-                # special case the injection
-                kwargs.setdefault('nodata', self._nodata)
+            kwargs.setdefault('nodata', self._nodata)
 
             # to_mask is a control keyword
             ctrl_kws = {
                 **(FUNC_MAP[func_name]["ctrl_kws"]),
-                'to_mask': to_mask,
+                'to_mask': to_mask or self._to_mask,
                 'plot': plot,
                 'inplace': inplace,
                 'lineno': lineno,
@@ -246,10 +240,7 @@ class SaQC:
                 "ctrl_kws": ctrl_kws,
             }
 
-            if inplace:
-                out = self
-            else:
-                out = self.copy()
+            out = self if inplace else self.copy()
 
             for field in fields:
                 dump_copy = {**func_dump, "field": field}
@@ -330,12 +321,12 @@ def _unmaskData(data_old, mask_old, data_new, flagger_new, to_mask):
     # TODO: this is heavily undertested
 
     # NOTE:
-    # we only need to respect columns, that was masked,
-    # and also are still present in new data.
-    # this throw out:
+    # we only need to respect columns, that were masked,
+    # and are also still present in new data.
+    # this throws out:
     # - any newly assigned columns
-    # - columns that wasn't masked, due to masking-kw
-    columns = mask_old.columns.intersection(data_new.columns)
+    # - columns that were excluded from masking
+    columns = mask_old.dropempty().columns.intersection(data_new.dropempty().columns)
     mask_new = flagger_new.isFlagged(field=columns, flag=to_mask, comparator="==")
 
     for col in columns:
@@ -350,7 +341,7 @@ def _unmaskData(data_old, mask_old, data_new, flagger_new, to_mask):
 
         # reapplying old values on masked positions
         if np.any(mask):
-            data = np.where(mask, data_new[col].values, data_old[col].values)
+            data = np.where(mask, data_old[col].values, data_new[col].values)
             data_new[col] = pd.Series(data=data, index=is_masked.index)
 
     return data_new
diff --git a/saqc/core/reader.py b/saqc/core/reader.py
index 512f7f592f807c8ae1562a027e889781fe955a3c..12e8728fb658d431eac407de748f6301459db1b6 100644
--- a/saqc/core/reader.py
+++ b/saqc/core/reader.py
@@ -53,7 +53,7 @@ def _injectOptionalColumns(df):
     return df
 
 
-def _parseConfig(df):
+def _parseConfig(df, flagger):
     to_call = []
     for lineno, (_, field, expr, plot) in enumerate(df.itertuples()):
         if field == "None":
@@ -63,12 +63,12 @@ def _parseConfig(df):
         if pd.isnull(expr):
             raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing")
         tree = ast.parse(expr, mode="eval")
-        cp = ConfigFunctionParser(tree.body)
+        cp = ConfigFunctionParser(tree.body, flagger)
         to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr))
     return to_call
 
 
-def readConfig(fname):
+def readConfig(fname, flagger):
     df = pd.read_csv(
         fname,
         sep=r"\s*;\s*",
@@ -87,6 +87,6 @@ def readConfig(fname):
     df[F.TEST] = df[F.TEST].replace(r"^\s*$", np.nan, regex=True)
     df[F.PLOT] = df[F.PLOT].replace({"False": "", EMPTY: "", np.nan: ""})
     df = df.astype({F.PLOT: bool})
 
-    df = _parseConfig(df)
+    df = _parseConfig(df, flagger)
     return df
diff --git a/saqc/core/visitor.py b/saqc/core/visitor.py
index f65d6e0e1f9de28c863391484a05535e516d0658..67a4f0d745ed9f67e159a3a335b6c41f2c1fe4e6 100644
--- a/saqc/core/visitor.py
+++ b/saqc/core/visitor.py
@@ -136,9 +136,15 @@ class ConfigFunctionParser(ast.NodeVisitor):
         ast.List,
     )
 
-    def __init__(self, node):
+    def __init__(self, node, flagger):
 
         self.kwargs = {}
+        self.environment = {
+            "GOOD": flagger.GOOD,
+            "BAD": flagger.BAD,
+            "UNFLAGGED": flagger.UNFLAGGED,
+            **ENVIRONMENT,
+        }
         self.func = self.visit_Call(node)
 
     def visit_Call(self, node):
@@ -160,7 +166,8 @@ class ConfigFunctionParser(ast.NodeVisitor):
         k, v = node.arg, node.value
         check_tree = True
 
-        # NOTE: not a constant or variable, should be function call
+        # NOTE: `node` is not a constant or a variable,
+        # so it should be a function call
         try:
             visitor = ConfigExpressionParser(v)
             args = ast.arguments(
@@ -189,9 +196,14 @@ class ConfigFunctionParser(ast.NodeVisitor):
 
         # -> after all keywords where visited we end up with
         # a dictionary holding all the passed arguments as
        # real python objects
-        co = compile(ast.fix_missing_locations(ast.Interactive(body=[vnode])), "<ast>", mode="single")
-        # NOTE: only pass a copy to not clutter the ENVIRONMENT
-        exec(co, {**ENVIRONMENT}, self.kwargs)
+        co = compile(
+            ast.fix_missing_locations(ast.Interactive(body=[vnode])),
+            "<ast>",
+            mode="single"
+        )
+        # NOTE: only pass a copy to not clutter the self.environment
+        exec(co, {**self.environment}, self.kwargs)
+
         # let's do some more validity checks
         if check_tree:
diff --git a/saqc/funcs/functions.py b/saqc/funcs/functions.py
index b48ecbb57d78ad822e64cc52d6ed7ca46ec2c840..e13e6e38e834d5616c06ec600bf8c651c58a8d7b 100644
--- a/saqc/funcs/functions.py
+++ b/saqc/funcs/functions.py
@@ -221,7 +221,7 @@ def flagGeneric(data, field, flagger, func, nodata=np.nan, **kwargs):
 
 
 @register(masking='field')
-def flagRange(data, field, flagger, min, max, **kwargs):
+def flagRange(data, field, flagger, min=-np.inf, max=np.inf, **kwargs):
     """
     Function flags values not covered by the closed interval [`min`, `max`].
 
@@ -428,7 +428,7 @@ def flagMissing(data, field, flagger, nodata=np.nan, **kwargs):
     if np.isnan(nodata):
         mask = datacol.isna()
     else:
-        mask = datacol[datacol == nodata]
+        mask = datacol == nodata
 
     flagger = flagger.setFlags(field, loc=mask, **kwargs)
     return data, flagger
diff --git a/sphinx-doc/requirements_sphinx.txt b/sphinx-doc/requirements_sphinx.txt
index f00da706d00d72a8d9a6c0e73595311889863c1d..8c284aedaad5c1ca6bc481b58ff649dbae265ec0 100644
--- a/sphinx-doc/requirements_sphinx.txt
+++ b/sphinx-doc/requirements_sphinx.txt
@@ -1,7 +1,7 @@
 alabaster==0.7.12
 appdirs==1.4.4
 astor==0.8.1
-attrs==20.1.0
+attrs==20.2.0
 Babel==2.8.0
 black==20.8b1
 certifi==2020.6.20
@@ -25,12 +25,12 @@ MarkupSafe==1.1.1
 matplotlib==3.3.1
 mlxtend==0.17.2
 more-itertools==8.5.0
-numba==0.51.1
-numpy==1.19.1
+numba==0.51.2
+numpy==1.19.2
 outlier==0.2
 outlier-utils==0.0.3
 packaging==20.1
-pandas==1.1.1
+pandas==1.1.2
 pathspec==0.8.0
 pluggy==0.13.1
 py==1.8.1
diff --git a/test/funcs/test_generic_config_functions.py b/test/funcs/test_generic_config_functions.py
index 0b36b0f531f396e1f170eff08fa74e27c29a631b..b9b8e9531235707479a9a67c54515668653d650c 100644
--- a/test/funcs/test_generic_config_functions.py
+++ b/test/funcs/test_generic_config_functions.py
@@ -32,9 +32,9 @@ def data_diff():
     return DictOfSeries(data={col0.name: col0.iloc[: mid + offset], col1.name: col1.iloc[mid - offset :],})
 
 
-def _compileGeneric(expr):
+def _compileGeneric(expr, flagger):
     tree = ast.parse(expr, mode="eval")
-    cp = ConfigFunctionParser(tree.body)
+    cp = ConfigFunctionParser(tree.body, flagger)
     return cp.kwargs["func"]
 
 
@@ -49,7 +49,7 @@ def test_missingIdentifier(data, flagger):
     ]
 
     for test in tests:
-        func = _compileGeneric(f"flagGeneric(func={test})")
+        func = _compileGeneric(f"flagGeneric(func={test})", flagger)
         with pytest.raises(NameError):
             _execGeneric(flagger, data, func, field="", nodata=np.nan)
 
@@ -65,7 +65,7 @@ def test_syntaxError(flagger):
 
     for test in tests:
         with pytest.raises(SyntaxError):
-            _compileGeneric(f"flagGeneric(func={test})")
+            _compileGeneric(f"flagGeneric(func={test})", flagger)
 
 
 @pytest.mark.parametrize("flagger", TESTFLAGGER)
@@ -81,7 +81,7 @@ def test_typeError(flagger):
 
     for test in tests:
         with pytest.raises(TypeError):
-            _compileGeneric(f"flagGeneric(func={test})")
+            _compileGeneric(f"flagGeneric(func={test})", flagger)
 
 
 @pytest.mark.parametrize("flagger", TESTFLAGGER)
@@ -100,7 +100,7 @@ def test_comparisonOperators(data, flagger):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"flagGeneric(func={test})")
+        func = _compileGeneric(f"flagGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, field=var1, nodata=np.nan)
         assert np.all(result == expected)
 
@@ -121,7 +121,7 @@ def test_arithmeticOperators(data, flagger):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"procGeneric(func={test})")
+        func = _compileGeneric(f"procGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, field=var1, nodata=np.nan)
         assert np.all(result == expected)
 
@@ -139,7 +139,7 @@ def test_nonReduncingBuiltins(data, flagger):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"procGeneric(func={test})")
+        func = _compileGeneric(f"procGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, field=this, nodata=np.nan)
         assert (result == expected).all()
 
@@ -163,7 +163,7 @@ def test_reduncingBuiltins(data, flagger, nodata):
     ]
 
    for test, expected in tests:
-        func = _compileGeneric(f"procGeneric(func={test})")
+        func = _compileGeneric(f"procGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, field=this.name, nodata=nodata)
         assert result == expected
 
@@ -182,7 +182,7 @@ def test_ismissing(data, flagger, nodata):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"flagGeneric(func={test})")
+        func = _compileGeneric(f"flagGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, this.name, nodata)
         assert np.all(result == expected)
 
@@ -202,7 +202,7 @@ def test_bitOps(data, flagger, nodata):
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"flagGeneric(func={test})")
+        func = _compileGeneric(f"flagGeneric(func={test})", flagger)
         result = _execGeneric(flagger, data, func, this, nodata)
         assert np.all(result == expected)
 
@@ -216,14 +216,14 @@ def test_isflagged(data, flagger):
 
     tests = [
         (f"isflagged({var1})", flagger.isFlagged(var1)),
-        (f"isflagged({var1}, BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")),
+        (f"isflagged({var1}, flag=BAD)", flagger.isFlagged(var1, flag=flagger.BAD, comparator=">=")),
         (f"isflagged({var1}, UNFLAGGED, '==')", flagger.isFlagged(var1, flag=flagger.UNFLAGGED, comparator="==")),
         (f"~isflagged({var2})", ~flagger.isFlagged(var2)),
         (f"~({var2}>999) & (~isflagged({var2}))", ~(data[var2] > 999) & (~flagger.isFlagged(var2))),
     ]
 
     for test, expected in tests:
-        func = _compileGeneric(f"flagGeneric(func={test})")
+        func = _compileGeneric(f"flagGeneric(func={test}, flag=BAD)", flagger)
         result = _execGeneric(flagger, data, func, field=None, nodata=np.nan)
         assert np.all(result == expected)
 
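Note on the np.where swap in _unmaskData above: np.where(cond, a, b) takes values from a where the condition holds and from b elsewhere, so the previous argument order kept the newly computed values on masked positions instead of restoring the originals. A minimal sketch of the intended behaviour, using plain numpy only (variable names are illustrative, not saqc API):

    import numpy as np

    # values as they were before masking
    data_old = np.array([1.0, 2.0, 3.0, 4.0])
    # values produced by the test function on the masked data
    data_new = np.array([9.0, 9.0, 9.0, 9.0])
    # positions that were masked and must be restored afterwards
    mask = np.array([True, False, True, False])

    # fixed order: old values where the mask holds, new values elsewhere
    restored = np.where(mask, data_old, data_new)
    print(restored)  # [1. 9. 3. 9.]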
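Note on the new to_mask handling: SaQC now accepts a to_mask default in its constructor, and each call resolves the effective value via the expression "to_mask or self._to_mask", i.e. a falsy per-call value (including the default None) falls back to the instance-wide setting, while an explicit truthy keyword wins. A small sketch of that resolution rule (the helper name and the flag strings are made up for the example, not saqc API):

    # mirrors the expression "to_mask or self._to_mask" used in core.py
    def resolve_to_mask(call_kw, instance_default):
        return call_kw or instance_default

    print(resolve_to_mask(None, "BAD"))    # BAD  -> constructor default applies
    print(resolve_to_mask("GOOD", "BAD"))  # GOOD -> per-call keyword wins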