From 6f6055df9cd752ea449fee55992cc3c500e1c4ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Sch=C3=A4fer?= <david.schaefer@ufz.de>
Date: Fri, 2 Oct 2020 11:20:23 +0200
Subject: [PATCH] bugfix: unmasking reinjected the new (i.e. nan) instead of
 the old values (this should have been caught by a test)

---
 saqc/core/reader.py                    |  12 +--
 saqc/flagger/__init__.py               |   1 +
 saqc/flagger/baseflagger.py            |   2 +
 saqc/flagger/positionalflagger.py      | 102 +++++++++++++++++++++++++
 test/common.py                         |   1 +
 test/core/test_reader.py               |   2 -
 test/flagger/test_positionalflagger.py |  54 +++++++++++++
 7 files changed, 167 insertions(+), 7 deletions(-)
 create mode 100644 saqc/flagger/positionalflagger.py
 create mode 100644 test/flagger/test_positionalflagger.py

diff --git a/saqc/core/reader.py b/saqc/core/reader.py
index 12e8728fb..4339b24f6 100644
--- a/saqc/core/reader.py
+++ b/saqc/core/reader.py
@@ -56,12 +56,14 @@ def _injectOptionalColumns(df):
 def _parseConfig(df, flagger):
     to_call = []
     for lineno, (_, field, expr, plot) in enumerate(df.itertuples()):
-        if field == "None":
+        if field == "None" or pd.isnull(field) or pd.isnull(expr):
             continue
-        if pd.isnull(field):
-            raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing")
-        if pd.isnull(expr):
-            raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing")
+        # if field == "None":
+        #     continue
+        # if pd.isnull(field):
+        #     raise SyntaxError(f"line {lineno}: non-optional column '{F.VARNAME}' missing")
+        # if pd.isnull(expr):
+        #     raise SyntaxError(f"line {lineno}: non-optional column '{F.TEST}' missing")
         tree = ast.parse(expr, mode="eval")
         cp = ConfigFunctionParser(tree.body, flagger)
         to_call.append((cp.func, field, cp.kwargs, plot, lineno + 2, expr))
diff --git a/saqc/flagger/__init__.py b/saqc/flagger/__init__.py
index 3c942296f..dd5b60715 100644
--- a/saqc/flagger/__init__.py
+++ b/saqc/flagger/__init__.py
@@ -6,3 +6,4 @@ from saqc.flagger.categoricalflagger import CategoricalFlagger
 from saqc.flagger.simpleflagger import SimpleFlagger
 from saqc.flagger.dmpflagger import DmpFlagger
 from saqc.flagger.continuousflagger import ContinuousFlagger
+from saqc.flagger.positionalflagger import PositionalFlagger
diff --git a/saqc/flagger/baseflagger.py b/saqc/flagger/baseflagger.py
index 113e973db..ec1f14b23 100644
--- a/saqc/flagger/baseflagger.py
+++ b/saqc/flagger/baseflagger.py
@@ -267,6 +267,8 @@ class BaseFlagger(ABC):
         else:
             # if flags is given and self.flags is big,
             # this hack will bring some speed improvement
+            # NOTE: there should be a nicer way to do this,
+            #       why not through a constructor method?
             saved = self._flags
             self._flags = None
             out = deepcopy(self)
diff --git a/saqc/flagger/positionalflagger.py b/saqc/flagger/positionalflagger.py
new file mode 100644
index 000000000..6cd13758c
--- /dev/null
+++ b/saqc/flagger/positionalflagger.py
@@ -0,0 +1,102 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from copy import deepcopy
+
+import pandas as pd
+
+from dios import DictOfSeries
+from saqc.flagger.baseflagger import BaseFlagger, COMPARATOR_MAP
+from saqc.lib.tools import assertScalar, toSequence
+
+
+FLAGS = ("-1", "0", "1", "2")
+
+
+class PositionalFlagger(BaseFlagger):
+    def __init__(self):
+        super().__init__(dtype=str)
+
+    def setFlags(self, field, loc, position=-1, flag=None, force=False, inplace=False, **kwargs):
+        assertScalar("field", field, optional=False)
+
+        # prepping
+        flag = str(self.BAD if flag is None else flag)
+        self.isValidFlag(flag, fail=True)
+        out = self if inplace else deepcopy(self)
+        out_flags = out._flags[field]
+
+        # replace unflagged with the magic starter '9'
+        out_flags = out_flags.str.replace(f"^{self.UNFLAGGED}", "9", regex=True)
+
+        # bring all flags to the desired length
+        # length = position # if position > 0 else out_flags.str.len().max()
+        if position == -1:
+            length = position = out_flags.str.len().max()
+        else:
+            length = position = position + 1
+        out_flags = out_flags.str.pad(length + 1, fillchar=self.GOOD, side="right")
+
+        # we rigorously overwrite existing flags
+        new_flags = out_flags.str[position]
+        new_flags[loc] = flag
+
+        out._flags[field] = out_flags.str[:position] + new_flags + out_flags.str[position+1:]
+        return out
+
+    def isFlagged(self, field=None, loc=None, flag=None, comparator=">"):
+
+        flags = self._getMaxFlag(field, loc).astype(int)
+
+        # notna() prevents NaNs from becoming True,
+        # e.g.: `np.nan != 0 -> True`
+        flagged = flags.notna()
+        flags_to_compare = set(toSequence(flag, self.GOOD))
+        if not flags_to_compare:
+            flagged[:] = False
+            return flagged
+
+        cp = COMPARATOR_MAP[comparator]
+        for f in flags_to_compare:
+            self.isValidFlag(f, fail=True)
+            flagged &= cp(flags, int(f))
+        return flagged
+
+    def isValidFlag(self, flag, fail=False):
+        check = flag in FLAGS
+        if check is False and fail is True:
+            raise ValueError(f"invalid flag {flag}, given values should be in '{FLAGS}'")
+        return check
+
+    def _getMaxFlag(self, field, loc):
+
+        data = {}
+        flags = self.getFlags(field, loc)
+        if isinstance(flags, pd.Series):
+            flags = flags.to_frame()
+        for col_name, col in flags.iteritems():
+            mask = col != self.UNFLAGGED
+            col = col.str.replace("^9", "0", regex=True)
+            col[mask] = col[mask].apply(lambda x: max(list(x)))
+            data[col_name] = col
+        return DictOfSeries(data)
+
+    @property
+    def UNFLAGGED(self):
+        return FLAGS[0]
+
+    @property
+    def GOOD(self):
+        return FLAGS[1]
+
+    @property
+    def SUSPICIOUS(self):
+        return FLAGS[2]
+
+    @property
+    def BAD(self):
+        return FLAGS[3]
+
+    def isSUSPICIOUS(self, flag):
+        return flag == self.SUSPICIOUS
+
diff --git a/test/common.py b/test/common.py
index 9e2571e6e..500ed5adc 100644
--- a/test/common.py
+++ b/test/common.py
@@ -8,6 +8,7 @@ import pandas as pd
 import dios
 
 from saqc.flagger import (
+    PositionalFlagger,
     CategoricalFlagger,
     SimpleFlagger,
     DmpFlagger,
diff --git a/test/core/test_reader.py b/test/core/test_reader.py
index e86e885c9..d3733a64c 100644
--- a/test/core/test_reader.py
+++ b/test/core/test_reader.py
@@ -116,8 +116,6 @@ def test_configChecks(data):
         (f"{var1};flagFunc(mn=0)", TypeError),  # bad argument name
         (f"{var1};flagFunc()", TypeError),  # not enough arguments
         (f"{var3};flagNothing()", NameError),  # unknown function
-        (";flagFunc(min=3)", SyntaxError),  # missing variable
-        (f"{var1};", SyntaxError),  # missing test
         (f"{var1}; min", TypeError),  # not a function call
     ]
 
diff --git a/test/flagger/test_positionalflagger.py b/test/flagger/test_positionalflagger.py
new file mode 100644
index 000000000..9012a18e9
--- /dev/null
+++ b/test/flagger/test_positionalflagger.py
@@ -0,0 +1,54 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import numpy as np
+
+from test.common import initData
+from saqc.flagger import PositionalFlagger
+
+
+@pytest.fixture
+def data():
+    return initData(cols=2)
+
+
+def test_initFlags(data):
+    flagger = PositionalFlagger().initFlags(data=data)
+    assert (flagger.isFlagged() == False).all(axis=None)
+    assert (flagger.flags == flagger.UNFLAGGED).all(axis=None)
+
+
+def test_setFlags(data):
+    flagger = PositionalFlagger().initFlags(data=data)
+
+    field = data.columns[0]
+    mask = np.zeros(len(data[field]), dtype=bool)
+    mask[1:10:2] = True
+
+    flagger = flagger.setFlags(field=field, loc=mask, flag=flagger.SUSPICIOUS)
+    assert (flagger.flags.loc[mask, field] == "91").all(axis=None)
+    assert (flagger.flags.loc[~mask, field] == "90").all(axis=None)
+
+    flagger = flagger.setFlags(field=field, loc=~mask, flag=flagger.BAD)
+    assert (flagger.flags.loc[~mask, field] == "902").all(axis=None)
+    assert (flagger.flags.loc[mask, field] == "910").all(axis=None)
+
+    assert (flagger.flags[data.columns[1]] == "-1").all(axis=None)
+
+
+def test_isFlagged(data):
+    flagger = PositionalFlagger().initFlags(data=data)
+    field = data.columns[0]
+
+    loc_sus = slice(1, 20, 2)
+    flagger = flagger.setFlags(field=field, loc=loc_sus, flag=flagger.SUSPICIOUS)
+    assert (flagger.isFlagged(field=field, comparator=">=", flag=flagger.SUSPICIOUS)[loc_sus] == True).all(axis=None)
+    assert (flagger.isFlagged(field=field, comparator=">", flag=flagger.SUSPICIOUS) == False).all(axis=None)
+
+    loc_bad = slice(1, 10, 2)
+    flagger = flagger.setFlags(field=field, loc=loc_bad, flag=flagger.BAD)
+    assert (flagger.isFlagged(field=field, comparator=">")[loc_sus] == True).all(axis=None)
+    assert (flagger.isFlagged(field=field, comparator=">=", flag=flagger.BAD)[loc_bad] == True).all(axis=None)
+    assert (flagger.isFlagged(field=field, comparator=">", flag=flagger.BAD) == False).all(axis=None)
-- 
GitLab