Skip to content
Snippets Groups Projects
Commit 6f6055df authored by David Schäfer's avatar David Schäfer
Browse files

bugfix: unmasking reinjected the new (i.e. nan) instead of the old values...

bugfix: unmasking reinjected the new (i.e. nan) instead of the old values (this should have been caught by a test)
parent 2ed81ab1
No related branches found
No related tags found
2 merge requests!193Release 1.4,!188Release 1.4
...@@ -56,12 +56,14 @@ def _injectOptionalColumns(df): ...@@ -56,12 +56,14 @@ def _injectOptionalColumns(df):
def _parseConfig(df, flagger): def _parseConfig(df, flagger):
to_call = [] to_call = []
for lineno, (_, field, expr, plot) in enumerate(df.itertuples()): for lineno, (_, field, expr, plot) in enumerate(df.itertuples()):
if field == "None": if field == "None" or pd.isnull(field) or pd.isnull(expr):
continue continue
if pd.isnull(field): # if field == "None":
raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing") # continue
if pd.isnull(expr): # if pd.isnull(field):
raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing") # raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing")
# if pd.isnull(expr):
# raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing")
tree = ast.parse(expr, mode="eval") tree = ast.parse(expr, mode="eval")
cp = ConfigFunctionParser(tree.body, flagger) cp = ConfigFunctionParser(tree.body, flagger)
to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr)) to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr))
......
...@@ -6,3 +6,4 @@ from saqc.flagger.categoricalflagger import CategoricalFlagger ...@@ -6,3 +6,4 @@ from saqc.flagger.categoricalflagger import CategoricalFlagger
from saqc.flagger.simpleflagger import SimpleFlagger from saqc.flagger.simpleflagger import SimpleFlagger
from saqc.flagger.dmpflagger import DmpFlagger from saqc.flagger.dmpflagger import DmpFlagger
from saqc.flagger.continuousflagger import ContinuousFlagger from saqc.flagger.continuousflagger import ContinuousFlagger
from saqc.flagger.positionalflagger import PositionalFlagger
...@@ -267,6 +267,8 @@ class BaseFlagger(ABC): ...@@ -267,6 +267,8 @@ class BaseFlagger(ABC):
else: else:
# if flags is given and self.flags is big, # if flags is given and self.flags is big,
# this hack will bring some speed improvement # this hack will bring some speed improvement
# NOTE: there should be a nicer way to do this,
# why not through a constructor method?
saved = self._flags saved = self._flags
self._flags = None self._flags = None
out = deepcopy(self) out = deepcopy(self)
......
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from copy import deepcopy
import pandas as pd
from dios import DictOfSeries
from saqc.flagger.baseflagger import BaseFlagger, COMPARATOR_MAP
from saqc.lib.tools import assertScalar, toSequence
# Valid flag values of the positional flagger. By index they are exposed as
# UNFLAGGED, GOOD, SUSPICIOUS and BAD through the PositionalFlagger properties.
FLAGS = ("-1", "0", "1", "2")
class PositionalFlagger(BaseFlagger):
    """Flagger that keeps flags as strings of single-character flag values.

    ``setFlags`` writes one flag character at a given string position and
    ``isFlagged`` evaluates the largest character found in each string.
    """

    def __init__(self):
        super().__init__(dtype=str)

    def setFlags(self, field, loc, position=-1, flag=None, force=False, inplace=False, **kwargs):
        """Write ``flag`` at one string position of the flags of ``field``.

        Parameters
        ----------
        field : scalar
            Name of the flags column to modify.
        loc : indexer
            Selects the rows that receive ``flag``.
        position : int
            With ``-1`` the character is written at the index equal to the
            current maximum string length; otherwise at index
            ``position + 1``.
        flag : optional
            Flag to set; ``self.BAD`` if ``None``. Must be a valid flag.
        force, **kwargs
            Accepted but not used by this method.
        inplace : bool
            Modify this flagger instead of a deep copy.

        Returns
        -------
        The modified flagger (``self`` if ``inplace``, else a copy).
        """
        assertScalar("field", field, optional=False)

        if flag is None:
            flag = self.BAD
        flag = str(flag)
        self.isValidFlag(flag, fail=True)

        target = self if inplace else deepcopy(self)

        # a leading unflagged marker is swapped for the magic starter '9'
        history = target._flags[field].str.replace(f"^{self.UNFLAGGED}", "9", regex=True)

        # determine the index to write to and right-pad every string with
        # GOOD so that this index exists everywhere
        if position == -1:
            position = history.str.len().max()
        else:
            position = position + 1
        history = history.str.pad(position + 1, fillchar=self.GOOD, side="right")

        # existing characters at `position` are rigorously overwritten
        current = history.str[position]
        current[loc] = flag
        head = history.str[:position]
        tail = history.str[position + 1:]
        target._flags[field] = head + current + tail
        return target

    def isFlagged(self, field=None, loc=None, flag=None, comparator=">"):
        """Compare the per-value maximum flag against ``flag``.

        The result is True only where the maximum flag is not NaN and
        ``comparator`` holds against every flag obtained from
        ``toSequence(flag, self.GOOD)``; it is all False if that set is
        empty. Invalid flags raise ``ValueError``.
        """
        max_flags = self._getMaxFlag(field, loc).astype(int)
        # start from notna() so NaNs never turn into True,
        # e.g. `np.nan != 0 -> True`
        result = max_flags.notna()

        thresholds = set(toSequence(flag, self.GOOD))
        if thresholds:
            compare = COMPARATOR_MAP[comparator]
            for threshold in thresholds:
                self.isValidFlag(threshold, fail=True)
                result &= compare(max_flags, int(threshold))
        else:
            result[:] = False
        return result

    def isValidFlag(self, flag, fail=False):
        """Return whether ``flag`` is in ``FLAGS``; raise if ``fail`` is True and it is not."""
        valid = flag in FLAGS
        if fail is True and not valid:
            raise ValueError(f"invalid flag {flag}, given values should be in '{FLAGS}'")
        return valid

    def _getMaxFlag(self, field, loc):
        """Reduce every flag string to its largest character, column by column.

        Unflagged entries are left as they are; for all others a leading
        '9' is replaced by '0' before the maximum character is taken.
        """
        flags = self.getFlags(field, loc)
        if isinstance(flags, pd.Series):
            flags = flags.to_frame()

        maxima = {}
        for name, column in flags.iteritems():
            flagged = column != self.UNFLAGGED
            column = column.str.replace("^9", "0", regex=True)
            column[flagged] = column[flagged].apply(lambda chars: max(list(chars)))
            maxima[name] = column
        return DictOfSeries(maxima)

    @property
    def UNFLAGGED(self):
        """First entry of ``FLAGS``."""
        return FLAGS[0]

    @property
    def GOOD(self):
        """Second entry of ``FLAGS``."""
        return FLAGS[1]

    @property
    def SUSPICIOUS(self):
        """Third entry of ``FLAGS``."""
        return FLAGS[2]

    @property
    def BAD(self):
        """Fourth entry of ``FLAGS``."""
        return FLAGS[3]

    def isSUSPICIOUS(self, flag):
        """Return whether ``flag`` equals the SUSPICIOUS flag."""
        return flag == self.SUSPICIOUS
...@@ -8,6 +8,7 @@ import pandas as pd ...@@ -8,6 +8,7 @@ import pandas as pd
import dios import dios
from saqc.flagger import ( from saqc.flagger import (
PositionalFlagger,
CategoricalFlagger, CategoricalFlagger,
SimpleFlagger, SimpleFlagger,
DmpFlagger, DmpFlagger,
......
...@@ -116,8 +116,6 @@ def test_configChecks(data): ...@@ -116,8 +116,6 @@ def test_configChecks(data):
(f"{var1};flagFunc(mn=0)", TypeError), # bad argument name (f"{var1};flagFunc(mn=0)", TypeError), # bad argument name
(f"{var1};flagFunc()", TypeError), # not enough arguments (f"{var1};flagFunc()", TypeError), # not enough arguments
(f"{var3};flagNothing()", NameError), # unknown function (f"{var3};flagNothing()", NameError), # unknown function
(";flagFunc(min=3)", SyntaxError), # missing variable
(f"{var1};", SyntaxError), # missing test
(f"{var1}; min", TypeError), # not a function call (f"{var1}; min", TypeError), # not a function call
] ]
......
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import pytest
import numpy as np
from test.common import initData
from saqc.flagger import PositionalFlagger
@pytest.fixture
def data():
    """Provide the test data set built by ``initData`` with ``cols=2``."""
    dataset = initData(cols=2)
    return dataset
def test_initFlags(data):
    """A freshly initialised flagger flags nothing and holds only UNFLAGGED."""
    flagger = PositionalFlagger().initFlags(data=data)

    nothing_flagged = flagger.isFlagged() == False
    assert nothing_flagged.all(axis=None)

    all_unflagged = flagger.flags == flagger.UNFLAGGED
    assert all_unflagged.all(axis=None)
def test_setFlags(data):
    """Each setFlags call appends one position to the flag strings of the field."""
    flagger = PositionalFlagger().initFlags(data=data)
    field = data.columns[0]

    # select every second row among the first ten
    mask = np.zeros(len(data[field]), dtype=bool)
    mask[1:10:2] = True

    # first call: selected rows get SUSPICIOUS, the rest is padded
    flagger = flagger.setFlags(field=field, loc=mask, flag=flagger.SUSPICIOUS)
    selected = flagger.flags.loc[mask, field]
    remaining = flagger.flags.loc[~mask, field]
    assert (selected == "91").all(axis=None)
    assert (remaining == "90").all(axis=None)

    # second call: the complement gets BAD at the next position
    flagger = flagger.setFlags(field=field, loc=~mask, flag=flagger.BAD)
    remaining = flagger.flags.loc[~mask, field]
    selected = flagger.flags.loc[mask, field]
    assert (remaining == "902").all(axis=None)
    assert (selected == "910").all(axis=None)

    # the second column must not have been touched
    other = flagger.flags[data.columns[1]]
    assert (other == "-1").all(axis=None)
def test_isFlagged(data):
    """isFlagged honours the comparator for SUSPICIOUS and BAD flags."""
    flagger = PositionalFlagger().initFlags(data=data)
    field = data.columns[0]

    loc_sus = slice(1, 20, 2)
    flagger = flagger.setFlags(field=field, loc=loc_sus, flag=flagger.SUSPICIOUS)

    at_least_sus = flagger.isFlagged(field=field, comparator=">=", flag=flagger.SUSPICIOUS)
    assert (at_least_sus[loc_sus] == True).all(axis=None)
    above_sus = flagger.isFlagged(field=field, comparator=">", flag=flagger.SUSPICIOUS)
    assert (above_sus == False).all(axis=None)

    loc_bad = slice(1, 10, 2)
    flagger = flagger.setFlags(field=field, loc=loc_bad, flag=flagger.BAD)

    above_default = flagger.isFlagged(field=field, comparator=">")
    assert (above_default[loc_sus] == True).all(axis=None)
    at_least_bad = flagger.isFlagged(field=field, comparator=">=", flag=flagger.BAD)
    assert (at_least_bad[loc_bad] == True).all(axis=None)
    above_bad = flagger.isFlagged(field=field, comparator=">", flag=flagger.BAD)
    assert (above_bad == False).all(axis=None)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment