diff --git a/saqc/core/core.py b/saqc/core/core.py index a01d880970ef77af0efbfc329b67c2a689d94212..0cd5759d06a3fe899f21b60a72cd1f84548dd3d5 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -73,14 +73,14 @@ def runner(metafname, flagger, data, flags=None, nodata=np.nan, error_policy="ra meta = config[config.columns.difference(tests.columns)] # # prepapre the flags - varnames = _collectVariables(meta, data) - fresh = flagger.initFlags(pd.DataFrame(index=data.index, columns=varnames)) - flagger = fresh if flags is None else flags._flags.join(fresh._flags) - # if flags is None: - # flag_cols = _collectVariables(meta, data) - # flagger = flagger.initFlags(pd.DataFrame(index=data.index, columns=flag_cols)) - # else: - # flagger = flagger.initFlags(flags=flags) + # varnames = _collectVariables(meta, data) + # fresh = flagger.initFlags(pd.DataFrame(index=data.index, columns=varnames)) + # flagger = fresh if flags is None else flags.join(fresh._flags) + + flag_cols = _collectVariables(meta, data) + flagger = flagger.initFlags(data=pd.DataFrame(index=data.index, columns=flag_cols)) + if flags is not None: + flagger = flagger.setFlagger(flagger.initFlags(flags=flags)) # this checks comes late, but the compiling of the user-test need fully prepared flags checkConfig(config, data, flagger, nodata) @@ -130,7 +130,14 @@ def runner(metafname, flagger, data, flags=None, nodata=np.nan, error_policy="ra flagger = flagger.setFlagger(flagger_chunk_result) - plotHook(dchunk, flagger_chunk, flagger_chunk_result, varname, configrow[Fields.PLOT], flag_test) + plotHook( + dchunk, + flagger_chunk, + flagger_chunk_result, + varname, + configrow[Fields.PLOT], + flag_test, + ) plotAllHook(data, flagger) diff --git a/saqc/flagger/baseflagger.py b/saqc/flagger/baseflagger.py index a119d05e3da2d4c9f7941a6eb8753412b4d283cf..898df1ddaac836fe4c345eb37aa482273cf40790 100644 --- a/saqc/flagger/baseflagger.py +++ b/saqc/flagger/baseflagger.py @@ -49,15 +49,13 @@ class BaseFlagger(ABC): if 'data' is not None: return a flagger with flagger.UNFALGGED values if 'flags' is not None: return a flagger with the given flags """ + + if data is None and flags is None: + raise TypeError("either 'data' or 'flags' are required") if data is not None: - assertDataFrame(data, "data", allow_multiindex=False) flags = pd.DataFrame( data=self.UNFLAGGED, index=data.index, columns=data.columns ) - elif flags is not None: - assertDataFrame(flags, "flags", allow_multiindex=False) - else: - raise TypeError("either 'data' or 'flags' are required") return self._copy(self._assureDtype(flags)) def setFlagger(self, other: BaseFlaggerT): @@ -80,7 +78,7 @@ class BaseFlagger(ABC): for key, values in other.iteritems(): flags.loc[other.index, key] = values - return self._copy(flags) + return self._copy(self._assureDtype(flags)) def getFlagger( self, field: str = None, loc: LocT = None, iloc: IlocT = None diff --git a/saqc/flagger/dmpflagger.py b/saqc/flagger/dmpflagger.py index 8038575be582cc260f15e13500bb6d60224f1b5f..2c0d62d6bc0cc28a1de833441701c799aafc473a 100644 --- a/saqc/flagger/dmpflagger.py +++ b/saqc/flagger/dmpflagger.py @@ -50,15 +50,14 @@ class DmpFlagger(CategoricalBaseFlagger): if 'data' is not None: return a flagger with flagger.UNFALGGED values if 'flags' is not None: return a flagger with the given flags """ + if data is not None: - assertDataFrame(data, "data", allow_multiindex=False) flags = pd.DataFrame( data=self.UNFLAGGED, columns=self._getColumnIndex(data.columns), index=data.index, ) elif flags is not None: - assertDataFrame(flags, "flags", allow_multiindex=False) if not isinstance(flags.columns, pd.MultiIndex): flags = flags.T.set_index( keys=self._getColumnIndex(flags.columns, [FlagFields.FLAG]) diff --git a/test/core/test_core.py b/test/core/test_core.py index b3b76eb05ce9ee40c424f4b5c7cbdd78e29ec554..c8d1d2d06e41eea8b034f9cdfcb674c6df3416d4 100644 --- a/test/core/test_core.py +++ b/test/core/test_core.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import pytest +import numpy as np import pandas as pd from saqc.funcs import register, flagRange @@ -11,9 +12,7 @@ from saqc.lib.plotting import _plot from test.common import initData, initMetaDict, TESTFLAGGER -@pytest.fixture -def data(): - return initData(3) +OPTIONAL = [False, True] @register("flagAll") @@ -22,8 +21,31 @@ def flagAll(data, field, flagger, **kwargs): return data, flagger.setFlags(field=field, flag=flagger.BAD) +@pytest.fixture +def data(): + return initData(3) + + +def _initFlags(flagger, data, optional): + return None + if optional: + return flagger.initFlags(data[data.columns[::2]])._flags + + +@pytest.fixture +def flags(flagger, data, optional): + if not optional: + return flagger.initFlags(data[data.columns[::2]])._flags + + +# NOTE: there is a lot of pytest magic involved: +# the parametrize parameters are implicitly available +# within the used fixtures, that is why we need the optional +# parametrization without actually using it in the +# function @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_temporalPartitioning(data, flagger): +@pytest.mark.parametrize("optional", OPTIONAL) +def test_temporalPartitioning(data, flagger, flags): """ Check if the time span in meta is respected """ @@ -36,7 +58,7 @@ def test_temporalPartitioning(data, flagger): {F.VARNAME: var3, F.TESTS: "flagAll()", F.START: split_date}, ] meta_file, meta_frame = initMetaDict(metadict, data) - pdata, pflagger = runner(meta_file, flagger, data) + pdata, pflagger = runner(meta_file, flagger, data, flags=flags) fields = [F.VARNAME, F.START, F.END] for _, row in meta_frame.iterrows(): @@ -47,8 +69,11 @@ def test_temporalPartitioning(data, flagger): @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_positionalPartitioning(data, flagger): +@pytest.mark.parametrize("optional", OPTIONAL) +def test_positionalPartitioning(data, flagger, flags): data = data.reset_index(drop=True) + if flags is not None: + flags = flags.reset_index(drop=True) var1, var2, var3, *_ = data.columns split_index = int(len(data.index) // 2) @@ -59,7 +84,7 @@ def test_positionalPartitioning(data, flagger): ] meta_file, meta_frame = initMetaDict(metadict, data) - pdata, pflagger = runner(meta_file, flagger, data) + pdata, pflagger = runner(meta_file, flagger, data, flags=flags) fields = [F.VARNAME, F.START, F.END] for _, row in meta_frame.iterrows(): @@ -72,7 +97,8 @@ def test_positionalPartitioning(data, flagger): @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_missingConfig(data, flagger): +@pytest.mark.parametrize("optional", OPTIONAL) +def test_missingConfig(data, flagger, flags): """ Test if variables available in the dataset but not the config are handled correctly, i.e. are ignored @@ -82,7 +108,7 @@ def test_missingConfig(data, flagger): metadict = [{F.VARNAME: var1, F.TESTS: "flagAll()"}] metafobj, meta = initMetaDict(metadict, data) - pdata, pflagger = runner(metafobj, flagger, data) + pdata, pflagger = runner(metafobj, flagger, data, flags=flags) assert var1 in pdata and var2 not in pflagger.getFlags() @@ -105,6 +131,28 @@ def test_missingVariable(flagger): runner(metafobj, flagger, data) +@pytest.mark.parametrize("flagger", TESTFLAGGER) +def test_duplicatedVariable(flagger): + data = initData(1) + var1, *_ = data.columns + + metadict = [ + {F.VARNAME: var1, F.ASSIGN: False, F.TESTS: "flagAll()"}, + {F.VARNAME: var1, F.ASSIGN: True, F.TESTS: "flagAll()"}, + ] + metafobj, meta = initMetaDict(metadict, data) + + pdata, pflagger = runner(metafobj, flagger, data) + pflags = pflagger.getFlags() + + if isinstance(pflags.columns, pd.MultiIndex): + cols = pflags.columns.get_level_values(0).drop_duplicates() + assert np.all(cols == [var1]) + else: + assert (pflags.columns == [var1]).all() + + + @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_assignVariable(flagger): """ @@ -134,7 +182,8 @@ def test_assignVariable(flagger): @pytest.mark.parametrize("flagger", TESTFLAGGER) -def test_dtypes(data, flagger): +@pytest.mark.parametrize("optional", OPTIONAL) +def test_dtypes(data, flagger, flags): """ Test if the categorical dtype is preserved through the core functionality """ @@ -147,12 +196,11 @@ def test_dtypes(data, flagger): {F.VARNAME: var2, F.TESTS: "flagAll()"}, ] metafobj, meta = initMetaDict(metadict, data) - pdata, pflagger = runner(metafobj, flagger, data, flags) + pdata, pflagger = runner(metafobj, flagger, data, flags=flags) pflags = pflagger.getFlags() assert dict(flags.dtypes) == dict(pflags.dtypes) -@pytest.mark.skip(reason="not ported yet") @pytest.mark.parametrize("flagger", TESTFLAGGER) def test_plotting(data, flagger): """ @@ -169,4 +217,4 @@ def test_plotting(data, flagger): data, field, flagger_range, min=40, max=60, flag=flagger.GOOD ) mask = flagger.getFlags(field) != flagger_range.getFlags(field) - plot(data, mask, field, flagger, interactive_backend=False) + _plot(data, mask, field, flagger, interactive_backend=False)