Skip to content
Snippets Groups Projects
Commit 0ade30ac authored by David Schäfer
Browse files

use pd.CategoricalDtype to represent flags

parent 08437754
No related branches found
No related tags found
No related merge requests found
...@@ -32,7 +32,7 @@ def flagNext(flagger: BaseFlagger, flags: pd.Series, n: int) -> pd.Series: ...@@ -32,7 +32,7 @@ def flagNext(flagger: BaseFlagger, flags: pd.Series, n: int) -> pd.Series:
for nn in range(1, n + 1): for nn in range(1, n + 1):
nn_idx = np.clip(idx + nn, a_min=None, a_max=len(flags) - 1) nn_idx = np.clip(idx + nn, a_min=None, a_max=len(flags) - 1)
nn_idx_unflagged = nn_idx[~flagger.isFlagged(flags.iloc[nn_idx])] nn_idx_unflagged = nn_idx[~flagger.isFlagged(flags.iloc[nn_idx])]
flags.values[nn_idx_unflagged] = flags.iloc[nn_idx_unflagged - nn] flags.loc[flags.index[nn_idx_unflagged]] = flags.iloc[nn_idx_unflagged - nn].values
return flags return flags
...@@ -120,7 +120,7 @@ def runner(meta, flagger, data, flags=None, nodata=np.nan): ...@@ -120,7 +120,7 @@ def runner(meta, flagger, data, flags=None, nodata=np.nan):
flag_values) flag_values)
data.loc[start_date:end_date] = dchunk data.loc[start_date:end_date] = dchunk
flags.loc[start_date:end_date] = fchunk flags[start_date:end_date] = fchunk.squeeze()
flagger.nextTest() flagger.nextTest()
return data, flags return data, flags
......
#! /usr/bin/env python #! /usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from numbers import Number
from typing import Any, Optional from typing import Any, Optional
import numpy as np import numpy as np
...@@ -10,10 +9,24 @@ import pandas as pd ...@@ -10,10 +9,24 @@ import pandas as pd
from lib.types import PandasLike, ArrayLike, T from lib.types import PandasLike, ArrayLike, T
class Flags(pd.CategoricalDtype):
    """Ordered categorical dtype holding the flag values of a flagger.

    The categories are kept in the order given, lowest first, so flag
    values can be compared with ``<`` / ``>``.

    The first two categories are the ones ``BaseFlagger`` uses as the
    initial value (``flags[0]``) and as the upper bound of "not flagged"
    (``flags[1]``); every later category counts as an actual flag.
    """

    def __init__(self, flags):
        """Build the dtype from ``flags``, a sized sequence of flag values.

        Raises:
            ValueError: if fewer than three values are given, because
                ``min()`` reads the category at position 2.
        """
        # Explicit check instead of ``assert``: assertions are removed
        # under ``python -O`` and the failure would only surface later
        # as an IndexError in ``min()``.
        if len(flags) <= 2:
            raise ValueError(
                f"at least 3 flag values are required, got {len(flags)}"
            )
        super().__init__(flags, ordered=True)

    def min(self):
        """Return the category at position 2 (the lowest actual flag)."""
        return self[2]

    def max(self):
        """Return the last, i.e. highest, category."""
        return self[-1]

    def __getitem__(self, idx):
        """Return the category (or categories) at position ``idx``."""
        return self.categories[idx]
class BaseFlagger: class BaseFlagger:
def __init__(self, no_flag: T, flag: T): def __init__(self, flags):
self.no_flag: T = no_flag self.flags = Flags(flags)
self.flag: T = flag
def setFlag(self, def setFlag(self,
flags: PandasLike, flags: PandasLike,
...@@ -26,20 +39,19 @@ class BaseFlagger: ...@@ -26,20 +39,19 @@ class BaseFlagger:
in assignments, especially if a multi column index is used in assignments, especially if a multi column index is used
""" """
if flag is None: if flag is None:
flag = self.flag flag = self.flags[-1]
flags[:] = flag flags[flags < flag] = flag
return flags.values return flags.values
def initFlags(self, def initFlags(self, data: pd.DataFrame) -> pd.DataFrame:
data: pd.DataFrame, # out = data.copy() # .astype(self)
value: Optional[Number] = np.nan) -> pd.DataFrame: out = data.copy().astype(self.flags)
out = data.copy() out.loc[:] = self.flags[0]
out[:] = value
return out return out
def isFlagged(self, flags: ArrayLike, flag: T = None) -> ArrayLike: def isFlagged(self, flags: ArrayLike, flag: T = None) -> ArrayLike:
if flag is None: if flag is None:
return (pd.notnull(flags) & (flags != self.no_flag)) return (pd.notnull(flags) & (flags > self.flags[1]))
return flags == flag return flags == flag
def nextTest(self): def nextTest(self):
......
...@@ -16,34 +16,39 @@ class ColumnLevels: ...@@ -16,34 +16,39 @@ class ColumnLevels:
FLAGS = "flags" FLAGS = "flags"
class Flags: FLAGS = ["NIL", "OK", "DOUBTFUL", "BAD"]
OK = "OK"
DOUBTFUL = "DOUBTFUL"
BAD = "BAD"
@staticmethod
def isValid(flag):
return flag in [Flags.OK, Flags.DOUBTFUL, Flags.BAD]
class DmpFlagger(BaseFlagger): class DmpFlagger(BaseFlagger):
def __init__(self, no_flag="NIL", flag="BAD"):
super().__init__(no_flag, flag) def __init__(self):
super().__init__(FLAGS)
self.flag_fields = [FlagFields.FLAG, FlagFields.CAUSE, FlagFields.COMMENT] self.flag_fields = [FlagFields.FLAG, FlagFields.CAUSE, FlagFields.COMMENT]
def initFlags(self, data, value="NIL", **kwargs): def initFlags(self, data, **kwargs):
columns = data.columns if isinstance(data, pd.DataFrame) else [data.name] columns = data.columns if isinstance(data, pd.DataFrame) else [data.name]
columns = pd.MultiIndex.from_product(
colindex = pd.MultiIndex.from_product(
[columns, self.flag_fields], [columns, self.flag_fields],
names=[ColumnLevels.VARIABLES, ColumnLevels.FLAGS]) names=[ColumnLevels.VARIABLES, ColumnLevels.FLAGS])
return pd.DataFrame(data=value, columns=columns, index=data.index)
def setFlag(self, flags, flag=Flags.BAD, out = pd.DataFrame(data=self.flags[0],
cause="NIL", comment="NIL", **kwargs): columns=colindex,
self._isFlag(flag) index=data.index)
return out.astype(
{c: self.flags for c in out.columns if FlagFields.FLAG in c})
def setFlag(self, flags, flag=None, cause="", comment="", **kwargs):
if flag is None:
flag = self.flags.max()
assert flag in self.flags
flags = self._reduceColumns(flags) flags = self._reduceColumns(flags)
for field, f in zip(self.flag_fields, [flag, cause, comment]): flags.loc[flags[FlagFields.FLAG] < flag, FlagFields.FLAG] = flag
for field, f in [(FlagFields.CAUSE, cause), (FlagFields.COMMENT, comment)]:
flags.loc[:, field] = f flags.loc[:, field] = f
return flags.values return flags.values
def isFlagged(self, flags, flag=None): def isFlagged(self, flags, flag=None):
...@@ -56,5 +61,5 @@ class DmpFlagger(BaseFlagger): ...@@ -56,5 +61,5 @@ class DmpFlagger(BaseFlagger):
flags.columns = flags.columns.get_level_values(ColumnLevels.FLAGS) flags.columns = flags.columns.get_level_values(ColumnLevels.FLAGS)
return flags return flags
def _isFlag(self, flag): # def _isFlag(self, flag):
assert Flags.isValid(flag) # assert Flags.isValid(flag)
#! /usr/bin/env python #! /usr/bin/env python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from .baseflagger import BaseFlagger from .baseflagger import BaseFlagger
FLAGS = [-1, 0, 1]
class SimpleFlagger(BaseFlagger): class SimpleFlagger(BaseFlagger):
def __init__(self): def __init__(self):
super().__init__(0, 1) super().__init__(FLAGS)
...@@ -135,6 +135,7 @@ if __name__ == "__main__": ...@@ -135,6 +135,7 @@ if __name__ == "__main__":
# NOTE: PositionalFlagger is currently broken, going to fix it when needed # NOTE: PositionalFlagger is currently broken, going to fix it when needed
# for flagger in [SimpleFlagger, PositionalFlagger, DmpFlagger]: # for flagger in [SimpleFlagger, PositionalFlagger, DmpFlagger]:
for flagger in [SimpleFlagger(), DmpFlagger()]: for flagger in [SimpleFlagger(), DmpFlagger()]:
# for flagger in [DmpFlagger()]:
test_temporalPartitioning(flagger) test_temporalPartitioning(flagger)
test_flagNext(flagger) test_flagNext(flagger)
test_missingConfig(flagger) test_missingConfig(flagger)
......
...@@ -12,7 +12,7 @@ from dsl import evalExpression ...@@ -12,7 +12,7 @@ from dsl import evalExpression
def test_evaluationBool(): def test_evaluationBool():
data = initData() data = initData()
flagger = SimpleFlagger() flagger = SimpleFlagger()
flags = flagger.initFlags(data, 0) flags = flagger.initFlags(data)
var1, var2, *_ = data.columns var1, var2, *_ = data.columns
tests = [ tests = [
...@@ -47,7 +47,7 @@ def test_missingIdentifier(): ...@@ -47,7 +47,7 @@ def test_missingIdentifier():
def test_flagPropagation(): def test_flagPropagation():
data = initData() data = initData()
flagger = SimpleFlagger() flagger = SimpleFlagger()
flags = flagger.initFlags(data, 0) flags = flagger.initFlags(data)
flags.iloc[::5] = flagger.setFlag(flags.iloc[::5]) flags.iloc[::5] = flagger.setFlag(flags.iloc[::5])
var1, var2, *_ = data.columns var1, var2, *_ = data.columns
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
import pandas as pd import pandas as pd
from test.common import initData from test.common import initData
from core import runner, prepareMeta from core import runner, prepareMeta
from flagger.dmpflagger import DmpFlagger, FlagFields, Flags from flagger.dmpflagger import DmpFlagger, FlagFields
def test_DmpFlagger(): def test_DmpFlagger():
...@@ -22,7 +22,8 @@ def test_DmpFlagger(): ...@@ -22,7 +22,8 @@ def test_DmpFlagger():
pd.DataFrame(meta, columns=["headerout", "Flag_1", "Flag_2"]), pd.DataFrame(meta, columns=["headerout", "Flag_1", "Flag_2"]),
data) data)
data, flags = runner(meta, DmpFlagger(), data) flagger = DmpFlagger()
data, flags = runner(meta, flagger, data)
col1 = data[var1] col1 = data[var1]
col2 = data[var2] col2 = data[var2]
...@@ -32,14 +33,39 @@ def test_DmpFlagger(): ...@@ -32,14 +33,39 @@ def test_DmpFlagger():
flags21 = flags.loc[col2 > var2mean, (var2, FlagFields.CAUSE)] flags21 = flags.loc[col2 > var2mean, (var2, FlagFields.CAUSE)]
assert (flags11 == Flags.BAD).all() assert (flags11 >= flagger.flags.min()).all()
assert (flags12 == "saqc").all() assert (flags12 == "saqc").all()
assert (flags21 == "error").all() assert (flags21 == "error").all()
def test_flagOrder():
    """Check that a lower flag does not overwrite a higher one.

    Two rules are applied to the same variable: the first assigns the
    highest flag to values above the mean, the second assigns the lowest
    flag to every value (``this >= min(this)`` always holds). Afterwards
    values above the mean must still carry the highest flag and all
    other values the lowest one.
    """
    data = initData()
    var, *_ = data.columns

    flagger = DmpFlagger()
    fmin = flagger.flags.min()
    fmax = flagger.flags.max()

    # Doubled braces are literal braces in the f-string; only the flag
    # value is interpolated into the rule text.
    rules = [
        [var, f"generic, {{func: this > mean(this), flag: {fmax}}}"],
        [var, f"generic, {{func: this >= min(this), flag: {fmin}}}"],
    ]

    meta = prepareMeta(
        pd.DataFrame(rules, columns=["headerout", "Flag_1"]),
        data)

    pdata, pflags = runner(meta, flagger, data)

    datacol = pdata[var]
    flagcol = pflags[(var, FlagFields.FLAG)]

    above_mean = datacol > datacol.mean()
    assert (flagcol[above_mean] == fmax).all()
    assert (flagcol[datacol <= datacol.mean()] == fmin).all()
if __name__ == "__main__": if __name__ == "__main__":
test_DmpFlagger() test_DmpFlagger()
test_flagOrder()
...@@ -39,7 +39,7 @@ def test_isflagged(): ...@@ -39,7 +39,7 @@ def test_isflagged():
flagger = SimpleFlagger() flagger = SimpleFlagger()
data = initData() data = initData()
flags = flagger.initFlags(data, 0) flags = flagger.initFlags(data)
var1, var2, *_ = data.columns var1, var2, *_ = data.columns
flags.iloc[::2, 0] = flagger.setFlag(flags.iloc[::2, 0]) flags.iloc[::2, 0] = flagger.setFlag(flags.iloc[::2, 0])
...@@ -57,7 +57,7 @@ def test_isflaggedArgument(): ...@@ -57,7 +57,7 @@ def test_isflaggedArgument():
flagger = SimpleFlagger() flagger = SimpleFlagger()
data = initData() data = initData()
flags = flagger.initFlags(data, 0) flags = flagger.initFlags(data)
var1, var2, *_ = data.columns var1, var2, *_ = data.columns
flags.iloc[::2, 0] = flagger.setFlag(flags.iloc[::2, 0], -9) flags.iloc[::2, 0] = flagger.setFlag(flags.iloc[::2, 0], -9)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment