Skip to content
Snippets Groups Projects
Commit 0059e785 authored by David Schäfer
Browse files

Merge branch 'posflagger' into 'develop'

New flagger implementation

See merge request !116
parents 2ed81ab1 6f6055df
No related branches found
No related tags found
3 merge requests: !193 Release 1.4, !188 Release 1.4, !116 New flagger implementation
Pipeline #8757 passed with warnings with stages
in 10 minutes and 19 seconds
......@@ -56,12 +56,14 @@ def _injectOptionalColumns(df):
def _parseConfig(df, flagger):
to_call = []
for lineno, (_, field, expr, plot) in enumerate(df.itertuples()):
if field == "None":
if field == "None" or pd.isnull(field) or pd.isnull(expr):
continue
if pd.isnull(field):
raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing")
if pd.isnull(expr):
raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing")
# if field == "None":
# continue
# if pd.isnull(field):
# raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing")
# if pd.isnull(expr):
# raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing")
tree = ast.parse(expr, mode="eval")
cp = ConfigFunctionParser(tree.body, flagger)
to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr))
......
......@@ -6,3 +6,4 @@ from saqc.flagger.categoricalflagger import CategoricalFlagger
from saqc.flagger.simpleflagger import SimpleFlagger
from saqc.flagger.dmpflagger import DmpFlagger
from saqc.flagger.continuousflagger import ContinuousFlagger
from saqc.flagger.positionalflagger import PositionalFlagger
......@@ -267,6 +267,8 @@ class BaseFlagger(ABC):
else:
# if flags is given and self.flags is big,
# this hack will bring some speed improvement
# NOTE: there should be a nicer way to do this,
# why not through a constructor method?
saved = self._flags
self._flags = None
out = deepcopy(self)
......
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from copy import deepcopy
import pandas as pd
from dios import DictOfSeries
from saqc.flagger.baseflagger import BaseFlagger, COMPARATOR_MAP
from saqc.lib.tools import assertScalar, toSequence
# Flag values accepted by PositionalFlagger.isValidFlag. The positions are
# read by the PositionalFlagger properties as:
# FLAGS[0] -> UNFLAGGED, FLAGS[1] -> GOOD, FLAGS[2] -> SUSPICIOUS, FLAGS[3] -> BAD.
FLAGS = ("-1", "0", "1", "2")
class PositionalFlagger(BaseFlagger):
    """Flagger that stores the flags of every value as a string of characters.

    Once a column has been flagged, each of its flag strings starts with the
    marker character ``"9"`` followed by one flag character per position.
    Untouched columns keep the plain ``UNFLAGGED`` value.
    """

    def __init__(self):
        """Initialise the base flagger with ``str`` as flag dtype."""
        super().__init__(dtype=str)

    def setFlags(self, field, loc, position=-1, flag=None, force=False, inplace=False, **kwargs):
        """Write ``flag`` at one position of the flag strings of ``field``.

        Parameters
        ----------
        field:
            Name of the column to flag; checked with ``assertScalar``.
        loc:
            Indexer selecting the rows that receive ``flag``.
        position:
            ``-1`` (default) appends a new position behind the longest
            existing flag string of the column. Any other value is shifted
            by one, which skips the leading ``"9"`` marker.
        flag:
            Flag to set, converted with ``str``; ``None`` means ``self.BAD``.
        force:
            Accepted but not used by this implementation.
        inplace:
            If true, modify and return ``self``; otherwise work on a deep copy.
        **kwargs:
            Accepted but not used by this implementation.

        Returns
        -------
        The flagger holding the updated flags.

        Raises
        ------
        ValueError
            If ``flag`` is not one of ``FLAGS``.
        """
        assertScalar("field", field, optional=False)

        # normalise and validate the requested flag
        if flag is None:
            flag = self.BAD
        flag = str(flag)
        self.isValidFlag(flag, fail=True)

        if inplace:
            out = self
        else:
            out = deepcopy(self)

        # unflagged entries get the magic starter '9'
        column = out._flags[field].str.replace(f"^{self.UNFLAGGED}", "9", regex=True)

        # work out the target position and right-pad every flag string
        # with GOOD, so that the position exists in all of them
        if position == -1:
            position = column.str.len().max()
        else:
            position = position + 1
        column = column.str.pad(position + 1, fillchar=self.GOOD, side="right")

        # split every string around the target position ...
        head = column.str[:position]
        slot = column.str[position]
        tail = column.str[position + 1:]

        # ... and rigorously overwrite whatever was stored there
        slot[loc] = flag

        out._flags[field] = head + slot + tail
        return out

    def isFlagged(self, field=None, loc=None, flag=None, comparator=">"):
        """Compare the maximum flag of every value against ``flag``.

        Parameters
        ----------
        field, loc:
            Passed on to ``_getMaxFlag`` to select the flags to inspect.
        flag:
            One flag or several flags to compare against. It is passed
            through ``toSequence(flag, self.GOOD)``, which presumably makes
            ``self.GOOD`` the default -- confirm against ``toSequence``.
        comparator:
            Key into ``COMPARATOR_MAP`` selecting the comparison.

        Returns
        -------
        Boolean mask that is true where the comparison holds for all
        given flags.

        Raises
        ------
        ValueError
            If one of the flags to compare against is not in ``FLAGS``.
        """
        max_flags = self._getMaxFlag(field, loc).astype(int)

        # start from notna() to prevent nans from becoming True,
        # e.g.: `np.nan != 0 -> True`
        result = max_flags.notna()

        targets = set(toSequence(flag, self.GOOD))
        if not targets:
            result[:] = False
            return result

        compare = COMPARATOR_MAP[comparator]
        for target in targets:
            self.isValidFlag(target, fail=True)
            result &= compare(max_flags, int(target))
        return result

    def isValidFlag(self, flag, fail=False):
        """Tell whether ``flag`` is one of ``FLAGS``.

        Returns ``True`` for a known flag and ``False`` otherwise. With
        ``fail=True`` an unknown flag raises ``ValueError`` instead.
        """
        if flag in FLAGS:
            return True
        if fail is True:
            raise ValueError(f"invalid flag {flag}, given values should be in '{FLAGS}'")
        return False

    def _getMaxFlag(self, field, loc):
        """Collapse every flag string to its largest character.

        The flags are fetched with ``self.getFlags(field, loc)``. In each
        column the leading ``"9"`` marker is replaced by ``"0"`` and every
        entry that differs from ``UNFLAGGED`` is reduced to the maximum of
        its characters. The columns are returned as a ``DictOfSeries``.
        """

        def strongest(flag_string):
            return max(flag_string)

        flags = self.getFlags(field, loc)
        if isinstance(flags, pd.Series):
            flags = flags.to_frame()

        collapsed = {}
        for name, column in flags.iteritems():
            # the mask is taken before the marker gets replaced
            is_set = column != self.UNFLAGGED
            column = column.str.replace("^9", "0", regex=True)
            column[is_set] = column[is_set].apply(strongest)
            collapsed[name] = column
        return DictOfSeries(collapsed)

    @property
    def UNFLAGGED(self):
        """Flag value stored at ``FLAGS[0]``."""
        return FLAGS[0]

    @property
    def GOOD(self):
        """Flag value stored at ``FLAGS[1]``."""
        return FLAGS[1]

    @property
    def SUSPICIOUS(self):
        """Flag value stored at ``FLAGS[2]``."""
        return FLAGS[2]

    @property
    def BAD(self):
        """Flag value stored at ``FLAGS[3]``."""
        return FLAGS[3]

    def isSUSPICIOUS(self, flag):
        """Return whether ``flag`` equals ``self.SUSPICIOUS``."""
        return flag == self.SUSPICIOUS
......@@ -8,6 +8,7 @@ import pandas as pd
import dios
from saqc.flagger import (
PositionalFlagger,
CategoricalFlagger,
SimpleFlagger,
DmpFlagger,
......
......@@ -116,8 +116,6 @@ def test_configChecks(data):
(f"{var1};flagFunc(mn=0)", TypeError), # bad argument name
(f"{var1};flagFunc()", TypeError), # not enough arguments
(f"{var3};flagNothing()", NameError), # unknown function
(";flagFunc(min=3)", SyntaxError), # missing variable
(f"{var1};", SyntaxError), # missing test
(f"{var1}; min", TypeError), # not a function call
]
......
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import pytest
import numpy as np
from test.common import initData
from saqc.flagger import PositionalFlagger
@pytest.fixture
def data():
    """Provide the data set built by ``initData(cols=2)``."""
    dataset = initData(cols=2)
    return dataset
def test_initFlags(data):
    """After initFlags nothing is flagged and every flag equals UNFLAGGED."""
    fresh = PositionalFlagger().initFlags(data=data)

    flagged = fresh.isFlagged()
    assert (flagged == False).all(axis=None)

    stored = fresh.flags
    assert (stored == fresh.UNFLAGGED).all(axis=None)
def test_setFlags(data):
    """Successive setFlags calls append one flag character per call."""
    flagger = PositionalFlagger().initFlags(data=data)
    field = data.columns[0]

    # select every second row among the first ten
    selected = np.zeros(len(data[field]), dtype=bool)
    selected[1:10:2] = True

    # first call: selected rows carry the flag, all others are padded
    flagger = flagger.setFlags(field=field, loc=selected, flag=flagger.SUSPICIOUS)
    on_selected = flagger.flags.loc[selected, field]
    assert (on_selected == "91").all(axis=None)
    on_rest = flagger.flags.loc[~selected, field]
    assert (on_rest == "90").all(axis=None)

    # second call: flag the complement, a new position gets appended
    flagger = flagger.setFlags(field=field, loc=~selected, flag=flagger.BAD)
    on_rest = flagger.flags.loc[~selected, field]
    assert (on_rest == "902").all(axis=None)
    on_selected = flagger.flags.loc[selected, field]
    assert (on_selected == "910").all(axis=None)

    # the column that was never flagged stays untouched
    untouched = flagger.flags[data.columns[1]]
    assert (untouched == "-1").all(axis=None)
def test_isFlagged(data):
    """isFlagged honours the comparator for SUSPICIOUS and BAD flags."""
    flagger = PositionalFlagger().initFlags(data=data)
    field = data.columns[0]

    # flag a range of rows as SUSPICIOUS
    loc_sus = slice(1, 20, 2)
    flagger = flagger.setFlags(field=field, loc=loc_sus, flag=flagger.SUSPICIOUS)

    at_least_sus = flagger.isFlagged(field=field, comparator=">=", flag=flagger.SUSPICIOUS)
    assert (at_least_sus[loc_sus] == True).all(axis=None)
    above_sus = flagger.isFlagged(field=field, comparator=">", flag=flagger.SUSPICIOUS)
    assert (above_sus == False).all(axis=None)

    # additionally flag a sub-range of these rows as BAD
    loc_bad = slice(1, 10, 2)
    flagger = flagger.setFlags(field=field, loc=loc_bad, flag=flagger.BAD)

    above_default = flagger.isFlagged(field=field, comparator=">")
    assert (above_default[loc_sus] == True).all(axis=None)
    at_least_bad = flagger.isFlagged(field=field, comparator=">=", flag=flagger.BAD)
    assert (at_least_bad[loc_bad] == True).all(axis=None)
    above_bad = flagger.isFlagged(field=field, comparator=">", flag=flagger.BAD)
    assert (above_bad == False).all(axis=None)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment