Skip to content
Snippets Groups Projects
Commit 6f6055df authored by David Schäfer's avatar David Schäfer
Browse files

bugfix: unmasking reinjected the new (i.e. nan) instead of the old values...

bugfix: unmasking reinjected the new (i.e. nan) instead of the old values (this should have been caught by a test)
parent 2ed81ab1
No related branches found
No related tags found
2 merge requests: !193 Release 1.4, !188 Release 1.4
......@@ -56,12 +56,14 @@ def _injectOptionalColumns(df):
def _parseConfig(df, flagger):
to_call = []
for lineno, (_, field, expr, plot) in enumerate(df.itertuples()):
if field == "None":
if field == "None" or pd.isnull(field) or pd.isnull(expr):
continue
if pd.isnull(field):
raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing")
if pd.isnull(expr):
raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing")
# if field == "None":
# continue
# if pd.isnull(field):
# raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing")
# if pd.isnull(expr):
# raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing")
tree = ast.parse(expr, mode="eval")
cp = ConfigFunctionParser(tree.body, flagger)
to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr))
......
......@@ -6,3 +6,4 @@ from saqc.flagger.categoricalflagger import CategoricalFlagger
from saqc.flagger.simpleflagger import SimpleFlagger
from saqc.flagger.dmpflagger import DmpFlagger
from saqc.flagger.continuousflagger import ContinuousFlagger
from saqc.flagger.positionalflagger import PositionalFlagger
......@@ -267,6 +267,8 @@ class BaseFlagger(ABC):
else:
# if flags is given and self.flags is big,
# this hack will bring some speed improvement
# NOTE: there should be a nicer way to do this,
# why not through a constructor method?
saved = self._flags
self._flags = None
out = deepcopy(self)
......
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from copy import deepcopy
import pandas as pd
from dios import DictOfSeries
from saqc.flagger.baseflagger import BaseFlagger, COMPARATOR_MAP
from saqc.lib.tools import assertScalar, toSequence
# Valid flag values, indexed in this order by the flagger's properties:
# UNFLAGGED, GOOD, SUSPICIOUS, BAD.
FLAGS = ("-1", "0", "1", "2")
class PositionalFlagger(BaseFlagger):
    """Flagger that keeps a string of flag characters for every value.

    Each call to `setFlags` writes a single flag character at one position
    of that string. When a field is flagged for the first time, its
    UNFLAGGED marker is replaced by the starter character "9", so a flagged
    value looks like "91" or "902".
    """

    def __init__(self):
        super().__init__(dtype=str)

    def setFlags(self, field, loc, position=-1, flag=None, force=False, inplace=False, **kwargs):
        """Write `flag` at one position of the flag strings of `field`.

        Parameters
        ----------
        field
            Name of the column to flag; checked with `assertScalar`.
        loc
            Indexer selecting the values that receive `flag`.
        position
            Slot to write to. With the default of -1 the flag is appended
            directly behind the longest flag string of the field; any other
            value is shifted by one to skip the leading starter character.
        flag
            Flag to set, `self.BAD` if None. It is converted to `str` and
            has to be one of `FLAGS`.
        force
            Not used by this implementation; characters already present at
            the target position are always overwritten.
        inplace
            If True modify this flagger, otherwise work on a deep copy.
        **kwargs
            Accepted and ignored.

        Returns
        -------
        The flagger holding the updated flags (`self` if `inplace`).

        Raises
        ------
        ValueError
            If `flag` is not a valid flag.
        """
        assertScalar("field", field, optional=False)

        if flag is None:
            flag = self.BAD
        flag = str(flag)
        self.isValidFlag(flag, fail=True)

        target = self if inplace else deepcopy(self)

        # swap the unflagged marker for the magic starter '9'
        history = target._flags[field].str.replace(f"^{self.UNFLAGGED}", "9", regex=True)

        # determine the slot to write to ...
        if position == -1:
            position = history.str.len().max()
        else:
            position = position + 1
        # ... and make sure every flag string is long enough to contain it,
        # filling the gaps with GOOD
        history = history.str.pad(position + 1, fillchar=self.GOOD, side="right")

        # split every string around the slot; existing flags at the slot
        # are overwritten without further checks
        head = history.str[:position]
        current = history.str[position]
        tail = history.str[position + 1:]
        current[loc] = flag

        target._flags[field] = head + current + tail
        return target

    def isFlagged(self, field=None, loc=None, flag=None, comparator=">"):
        """Compare the highest flag of every value against `flag`.

        Parameters
        ----------
        field, loc
            Passed on to `_getMaxFlag` to select the flags to inspect.
        flag
            One or several flags to compare against; `toSequence` is called
            with `self.GOOD` as its second argument.
        comparator
            Key into `COMPARATOR_MAP` selecting the comparison to apply.

        Returns
        -------
        Boolean mask, True where the comparison holds for every flag in
        `flag`. It is all False if there is no flag to compare against.

        Raises
        ------
        ValueError
            If one of the flags to compare against is not a valid flag.
        """
        max_flags = self._getMaxFlag(field, loc).astype(int)
        # start from notna() so that nans cannot become True,
        # e.g.: `np.nan != 0 -> True`
        result = max_flags.notna()

        thresholds = set(toSequence(flag, self.GOOD))
        if thresholds:
            compare = COMPARATOR_MAP[comparator]
            for threshold in thresholds:
                self.isValidFlag(threshold, fail=True)
                result &= compare(max_flags, int(threshold))
        else:
            result[:] = False
        return result

    def isValidFlag(self, flag, fail=False):
        """Return whether `flag` is one of `FLAGS`.

        Raises
        ------
        ValueError
            If `flag` is not in `FLAGS` and `fail` is True.
        """
        is_known = flag in FLAGS
        if fail is True and not is_known:
            raise ValueError(f"invalid flag {flag}, given values should be in '{FLAGS}'")
        return is_known

    def _getMaxFlag(self, field, loc):
        """Reduce every flag string to its largest character.

        Values still equal to UNFLAGGED are left as they are. For all other
        values the leading starter "9" is replaced by "0" before the
        maximum is taken, so it cannot win. The result is returned as a
        `DictOfSeries` with one entry per column.
        """
        flags = self.getFlags(field, loc)
        if isinstance(flags, pd.Series):
            flags = flags.to_frame()

        maxima = {}
        for name, column in flags.iteritems():
            # remember the flagged values before the strings are touched
            was_flagged = column != self.UNFLAGGED
            column = column.str.replace("^9", "0", regex=True)
            column[was_flagged] = column[was_flagged].apply(lambda chars: max(chars))
            maxima[name] = column
        return DictOfSeries(maxima)

    @property
    def UNFLAGGED(self):
        """Marker of values that were never flagged (first entry of `FLAGS`)."""
        return FLAGS[0]

    @property
    def GOOD(self):
        """Second entry of `FLAGS`."""
        return FLAGS[1]

    @property
    def SUSPICIOUS(self):
        """Third entry of `FLAGS`."""
        return FLAGS[2]

    @property
    def BAD(self):
        """Fourth entry of `FLAGS`."""
        return FLAGS[3]

    def isSUSPICIOUS(self, flag):
        """Return whether `flag` equals the SUSPICIOUS flag."""
        return flag == self.SUSPICIOUS
......@@ -8,6 +8,7 @@ import pandas as pd
import dios
from saqc.flagger import (
PositionalFlagger,
CategoricalFlagger,
SimpleFlagger,
DmpFlagger,
......
......@@ -116,8 +116,6 @@ def test_configChecks(data):
(f"{var1};flagFunc(mn=0)", TypeError), # bad argument name
(f"{var1};flagFunc()", TypeError), # not enough arguments
(f"{var3};flagNothing()", NameError), # unknown function
(";flagFunc(min=3)", SyntaxError), # missing variable
(f"{var1};", SyntaxError), # missing test
(f"{var1}; min", TypeError), # not a function call
]
......
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import pytest
import numpy as np
from test.common import initData
from saqc.flagger import PositionalFlagger
@pytest.fixture
def data():
    """Provide the two-column dataset built by `initData` to the tests."""
    dataset = initData(cols=2)
    return dataset
def test_initFlags(data):
    """A freshly initialised flagger reports nothing as flagged."""
    flagger = PositionalFlagger().initFlags(data=data)

    # element-wise comparisons, hence `== False` instead of `not`
    flagged = flagger.isFlagged()
    assert (flagged == False).all(axis=None)

    untouched = flagger.flags == flagger.UNFLAGGED
    assert untouched.all(axis=None)
def test_setFlags(data):
    """Consecutive `setFlags` calls append one character per call."""
    flagger = PositionalFlagger().initFlags(data=data)
    first_col, second_col = data.columns[0], data.columns[1]

    # select every second value among the first ten
    selected = np.zeros(len(data[first_col]), dtype=bool)
    selected[1:10:2] = True

    # first call: the selected values get SUSPICIOUS, the others are padded
    flagger = flagger.setFlags(field=first_col, loc=selected, flag=flagger.SUSPICIOUS)
    assert (flagger.flags.loc[selected, first_col] == "91").all(axis=None)
    assert (flagger.flags.loc[~selected, first_col] == "90").all(axis=None)

    # second call: the remaining values get BAD at the next position
    flagger = flagger.setFlags(field=first_col, loc=~selected, flag=flagger.BAD)
    assert (flagger.flags.loc[~selected, first_col] == "902").all(axis=None)
    assert (flagger.flags.loc[selected, first_col] == "910").all(axis=None)

    # the second column was never flagged
    assert (flagger.flags[second_col] == "-1").all(axis=None)
def test_isFlagged(data):
    """`isFlagged` honours both the comparator and the reference flag."""
    flagger = PositionalFlagger().initFlags(data=data)
    field = data.columns[0]

    # step 1: mark a range of values as SUSPICIOUS
    loc_sus = slice(1, 20, 2)
    flagger = flagger.setFlags(field=field, loc=loc_sus, flag=flagger.SUSPICIOUS)

    at_least_sus = flagger.isFlagged(field=field, comparator=">=", flag=flagger.SUSPICIOUS)
    assert (at_least_sus[loc_sus] == True).all(axis=None)
    above_sus = flagger.isFlagged(field=field, comparator=">", flag=flagger.SUSPICIOUS)
    assert (above_sus == False).all(axis=None)

    # step 2: mark a range of values as BAD
    loc_bad = slice(1, 10, 2)
    flagger = flagger.setFlags(field=field, loc=loc_bad, flag=flagger.BAD)

    above_default = flagger.isFlagged(field=field, comparator=">")
    assert (above_default[loc_sus] == True).all(axis=None)
    at_least_bad = flagger.isFlagged(field=field, comparator=">=", flag=flagger.BAD)
    assert (at_least_bad[loc_bad] == True).all(axis=None)
    above_bad = flagger.isFlagged(field=field, comparator=">", flag=flagger.BAD)
    assert (above_bad == False).all(axis=None)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment