diff --git a/saqc/core/reader.py b/saqc/core/reader.py index 12e8728fb658d431eac407de748f6301459db1b6..4339b24f694258d618a4d3bc1c67effa95967155 100644 --- a/saqc/core/reader.py +++ b/saqc/core/reader.py @@ -56,12 +56,14 @@ def _injectOptionalColumns(df): def _parseConfig(df, flagger): to_call = [] for lineno, (_, field, expr, plot) in enumerate(df.itertuples()): - if field == "None": + if field == "None" or pd.isnull(field) or pd.isnull(expr): continue - if pd.isnull(field): - raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing") - if pd.isnull(expr): - raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing") + # if field == "None": + # continue + # if pd.isnull(field): + # raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing") + # if pd.isnull(expr): + # raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing") tree = ast.parse(expr, mode="eval") cp = ConfigFunctionParser(tree.body, flagger) to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr)) diff --git a/saqc/flagger/__init__.py b/saqc/flagger/__init__.py index 3c942296fd455c1d1632a5880afa5759f394c787..dd5b607158f13f3922cdf734f21fa98be19a96cb 100644 --- a/saqc/flagger/__init__.py +++ b/saqc/flagger/__init__.py @@ -6,3 +6,4 @@ from saqc.flagger.categoricalflagger import CategoricalFlagger from saqc.flagger.simpleflagger import SimpleFlagger from saqc.flagger.dmpflagger import DmpFlagger from saqc.flagger.continuousflagger import ContinuousFlagger +from saqc.flagger.positionalflagger import PositionalFlagger diff --git a/saqc/flagger/baseflagger.py b/saqc/flagger/baseflagger.py index 113e973db6d966b6630f36358aa0bbd7eba1e166..ec1f14b236b748ac8d14fc784bddcf591f9930ed 100644 --- a/saqc/flagger/baseflagger.py +++ b/saqc/flagger/baseflagger.py @@ -267,6 +267,8 @@ class BaseFlagger(ABC): else: # if flags is given and self.flags is big, # this hack will bring some speed improvement + # NOTE: there should be a nicer way to 
do this, + # why not through a constructor method? saved = self._flags self._flags = None out = deepcopy(self) diff --git a/saqc/flagger/positionalflagger.py b/saqc/flagger/positionalflagger.py new file mode 100644 index 0000000000000000000000000000000000000000..6cd13758cf7cf6aaaa2c0c7a10e2109dc52bf0d1 --- /dev/null +++ b/saqc/flagger/positionalflagger.py @@ -0,0 +1,102 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +from copy import deepcopy + +import pandas as pd + +from dios import DictOfSeries +from saqc.flagger.baseflagger import BaseFlagger, COMPARATOR_MAP +from saqc.lib.tools import assertScalar, toSequence + + +FLAGS = ("-1", "0", "1", "2") + + +class PositionalFlagger(BaseFlagger): + def __init__(self): + super().__init__(dtype=str) + + def setFlags(self, field, loc, position=-1, flag=None, force=False, inplace=False, **kwargs): + assertScalar("field", field, optional=False) + + # prepping + flag = str(self.BAD if flag is None else flag) + self.isValidFlag(flag, fail=True) + out = self if inplace else deepcopy(self) + out_flags = out._flags[field] + + # replace unflagged with the magic starter '9' + out_flags = out_flags.str.replace(f"^{self.UNFLAGGED}", "9", regex=True) + + # bring all flags to the desired length + # length = position # if position > 0 else out_flags.str.len().max() + if position == -1: + length = position = out_flags.str.len().max() + else: + length = position = position + 1 + out_flags = out_flags.str.pad(length + 1, fillchar=self.GOOD, side="right") + + # we rigorously overwrite existing flags + new_flags = out_flags.str[position] + new_flags[loc] = flag + + out._flags[field] = out_flags.str[:position] + new_flags + out_flags.str[position+1:] + return out + + def isFlagged(self, field=None, loc=None, flag=None, comparator=">"): + + flags = self._getMaxFlag(field, loc).astype(int) + + # notna() to prevent NaNs from becoming True, + # e.g.: `np.nan != 0 -> True` + flagged = flags.notna() + flags_to_compare = set(toSequence(flag, 
self.GOOD)) + if not flags_to_compare: + flagged[:] = False + return flagged + + cp = COMPARATOR_MAP[comparator] + for f in flags_to_compare: + self.isValidFlag(f, fail=True) + flagged &= cp(flags, int(f)) + return flagged + + def isValidFlag(self, flag, fail=False): + check = flag in FLAGS + if check is False and fail is True: + raise ValueError(f"invalid flag {flag}, given values should be in '{FLAGS}'") + return check + + def _getMaxFlag(self, field, loc): + + data = {} + flags = self.getFlags(field, loc) + if isinstance(flags, pd.Series): + flags = flags.to_frame() + for col_name, col in flags.iteritems(): + mask = col != self.UNFLAGGED + col = col.str.replace("^9", "0", regex=True) + col[mask] = col[mask].apply(lambda x: max(list(x))) + data[col_name] = col + return DictOfSeries(data) + + @property + def UNFLAGGED(self): + return FLAGS[0] + + @property + def GOOD(self): + return FLAGS[1] + + @property + def SUSPICIOUS(self): + return FLAGS[2] + + @property + def BAD(self): + return FLAGS[3] + + def isSUSPICIOUS(self, flag): + return flag == self.SUSPICIOUS + diff --git a/test/common.py b/test/common.py index 9e2571e6e803a954cfad86bdbb9c1d0e38ed0b1a..500ed5adcd11dfe112196831678ece8bd881dabc 100644 --- a/test/common.py +++ b/test/common.py @@ -8,6 +8,7 @@ import pandas as pd import dios from saqc.flagger import ( + PositionalFlagger, CategoricalFlagger, SimpleFlagger, DmpFlagger, diff --git a/test/core/test_reader.py b/test/core/test_reader.py index e86e885c99788179b2ebf3288400f0e09e5222c9..d3733a64ca4420dc378bb5c63e272d18171df7b7 100644 --- a/test/core/test_reader.py +++ b/test/core/test_reader.py @@ -116,8 +116,6 @@ def test_configChecks(data): (f"{var1};flagFunc(mn=0)", TypeError), # bad argument name (f"{var1};flagFunc()", TypeError), # not enough arguments (f"{var3};flagNothing()", NameError), # unknown function - (";flagFunc(min=3)", SyntaxError), # missing variable - (f"{var1};", SyntaxError), # missing test (f"{var1}; min", TypeError), # not a function 
call ] diff --git a/test/flagger/test_positionalflagger.py b/test/flagger/test_positionalflagger.py new file mode 100644 index 0000000000000000000000000000000000000000..9012a18e9f0dd57ef6ee4c567e629aaded6e51ba --- /dev/null +++ b/test/flagger/test_positionalflagger.py @@ -0,0 +1,54 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +import pytest + +import numpy as np + +from test.common import initData +from saqc.flagger import PositionalFlagger + + +@pytest.fixture +def data(): + return initData(cols=2) + + +def test_initFlags(data): + flagger = PositionalFlagger().initFlags(data=data) + assert (flagger.isFlagged() == False).all(axis=None) + assert (flagger.flags == flagger.UNFLAGGED).all(axis=None) + + +def test_setFlags(data): + flagger = PositionalFlagger().initFlags(data=data) + + field = data.columns[0] + mask = np.zeros(len(data[field]), dtype=bool) + mask[1:10:2] = True + + flagger = flagger.setFlags(field=field, loc=mask, flag=flagger.SUSPICIOUS) + assert (flagger.flags.loc[mask, field] == "91").all(axis=None) + assert (flagger.flags.loc[~mask, field] == "90").all(axis=None) + + flagger = flagger.setFlags(field=field, loc=~mask, flag=flagger.BAD) + assert (flagger.flags.loc[~mask, field] == "902").all(axis=None) + assert (flagger.flags.loc[mask, field] == "910").all(axis=None) + + assert (flagger.flags[data.columns[1]] == "-1").all(axis=None) + + +def test_isFlagged(data): + flagger = PositionalFlagger().initFlags(data=data) + field = data.columns[0] + + loc_sus = slice(1, 20, 2) + flagger = flagger.setFlags(field=field, loc=loc_sus, flag=flagger.SUSPICIOUS) + assert (flagger.isFlagged(field=field, comparator=">=", flag=flagger.SUSPICIOUS)[loc_sus] == True).all(axis=None) + assert (flagger.isFlagged(field=field, comparator=">", flag=flagger.SUSPICIOUS) == False).all(axis=None) + + loc_bad = slice(1, 10, 2) + flagger = flagger.setFlags(field=field, loc=loc_bad, flag=flagger.BAD) + assert (flagger.isFlagged(field=field, comparator=">")[loc_sus] == 
True).all(axis=None) + assert (flagger.isFlagged(field=field, comparator=">=", flag=flagger.BAD)[loc_bad] == True).all(axis=None) + assert (flagger.isFlagged(field=field, comparator=">", flag=flagger.BAD) == False).all(axis=None)