Skip to content
Snippets Groups Projects
Commit 74cfc212 authored by David Schäfer's avatar David Schäfer
Browse files

Merge branch 'categorical'

parents 57e74979 0ade30ac
No related branches found
No related tags found
No related merge requests found
......@@ -32,7 +32,7 @@ def flagNext(flagger: BaseFlagger, flags: pd.Series, n: int) -> pd.Series:
for nn in range(1, n + 1):
nn_idx = np.clip(idx + nn, a_min=None, a_max=len(flags) - 1)
nn_idx_unflagged = nn_idx[~flagger.isFlagged(flags.iloc[nn_idx])]
flags.values[nn_idx_unflagged] = flags.iloc[nn_idx_unflagged - nn]
flags.loc[flags.index[nn_idx_unflagged]] = flags.iloc[nn_idx_unflagged - nn].values
return flags
......@@ -115,7 +115,7 @@ def runner(meta, flagger, data, flags=None, nodata=np.nan):
flag_values)
data.loc[start_date:end_date] = dchunk
flags.loc[start_date:end_date] = fchunk
flags[start_date:end_date] = fchunk.squeeze()
flagger.nextTest()
return data, flags
......
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from numbers import Number
from typing import Any, Optional
import numpy as np
......@@ -10,10 +9,24 @@ import pandas as pd
from lib.types import PandasLike, ArrayLike, T
class Flags(pd.CategoricalDtype):
    """Ordered categorical dtype whose categories are the flag values.

    The categories keep the order in which they are passed and the dtype
    is created with ``ordered=True``, so flag values can be compared
    with ``<`` / ``>``. Indexing the dtype (``flags[i]``) returns the
    category at position ``i``.
    """

    def __init__(self, flags):
        """Build the dtype from a sequence of flag values.

        Parameters
        ----------
        flags : sequence
            Flag values, lowest first. At least three are required
            because ``min()`` reads the category at position 2.

        Raises
        ------
        ValueError
            If fewer than three flag values are given.
        """
        # Explicit check instead of ``assert``: assertions are stripped
        # under ``python -O`` and a too-short list would then only fail
        # later, inside ``min()``, with an unrelated IndexError.
        if len(flags) < 3:
            raise ValueError(
                f"at least 3 flag values are required, got {len(flags)}")
        super().__init__(flags, ordered=True)

    def min(self):
        """Return the category at position 2.

        NOTE(review): positions 0 and 1 are presumably the "no flag" /
        "unflagged" levels, which would make this the lowest value that
        counts as flagged - confirm against the flagger implementations.
        """
        return self[2]

    def max(self):
        """Return the last (highest) category."""
        return self[-1]

    def __getitem__(self, idx):
        """Return the category (or categories) at position ``idx``."""
        return self.categories[idx]
class BaseFlagger:
def __init__(self, no_flag: T, flag: T):
self.no_flag: T = no_flag
self.flag: T = flag
def __init__(self, flags):
self.flags = Flags(flags)
def setFlag(self,
flags: PandasLike,
......@@ -26,20 +39,19 @@ class BaseFlagger:
in assignments, especially if a multi column index is used
"""
if flag is None:
flag = self.flag
flags[:] = flag
flag = self.flags[-1]
flags[flags < flag] = flag
return flags.values
def initFlags(self,
data: pd.DataFrame,
value: Optional[Number] = np.nan) -> pd.DataFrame:
out = data.copy()
out[:] = value
def initFlags(self, data: pd.DataFrame) -> pd.DataFrame:
# out = data.copy() # .astype(self)
out = data.copy().astype(self.flags)
out.loc[:] = self.flags[0]
return out
def isFlagged(self, flags: ArrayLike, flag: T = None) -> ArrayLike:
if flag is None:
return (pd.notnull(flags) & (flags != self.no_flag))
return (pd.notnull(flags) & (flags > self.flags[1]))
return flags == flag
def nextTest(self):
......
......@@ -16,34 +16,39 @@ class ColumnLevels:
FLAGS = "flags"
class Flags:
    """Namespace holding the three flag labels as string constants."""

    OK = "OK"
    DOUBTFUL = "DOUBTFUL"
    BAD = "BAD"

    @staticmethod
    def isValid(flag):
        """Return True if ``flag`` equals one of OK, DOUBTFUL or BAD."""
        return flag in (Flags.OK, Flags.DOUBTFUL, Flags.BAD)
FLAGS = ["NIL", "OK", "DOUBTFUL", "BAD"]
class DmpFlagger(BaseFlagger):
def __init__(self, no_flag="NIL", flag="BAD"):
super().__init__(no_flag, flag)
def __init__(self):
super().__init__(FLAGS)
self.flag_fields = [FlagFields.FLAG, FlagFields.CAUSE, FlagFields.COMMENT]
def initFlags(self, data, value="NIL", **kwargs):
def initFlags(self, data, **kwargs):
columns = data.columns if isinstance(data, pd.DataFrame) else [data.name]
columns = pd.MultiIndex.from_product(
colindex = pd.MultiIndex.from_product(
[columns, self.flag_fields],
names=[ColumnLevels.VARIABLES, ColumnLevels.FLAGS])
return pd.DataFrame(data=value, columns=columns, index=data.index)
def setFlag(self, flags, flag=Flags.BAD,
cause="NIL", comment="NIL", **kwargs):
self._isFlag(flag)
out = pd.DataFrame(data=self.flags[0],
columns=colindex,
index=data.index)
return out.astype(
{c: self.flags for c in out.columns if FlagFields.FLAG in c})
def setFlag(self, flags, flag=None, cause="", comment="", **kwargs):
if flag is None:
flag = self.flags.max()
assert flag in self.flags
flags = self._reduceColumns(flags)
for field, f in zip(self.flag_fields, [flag, cause, comment]):
flags.loc[flags[FlagFields.FLAG] < flag, FlagFields.FLAG] = flag
for field, f in [(FlagFields.CAUSE, cause), (FlagFields.COMMENT, comment)]:
flags.loc[:, field] = f
return flags.values
def isFlagged(self, flags, flag=None):
......@@ -56,5 +61,5 @@ class DmpFlagger(BaseFlagger):
flags.columns = flags.columns.get_level_values(ColumnLevels.FLAGS)
return flags
def _isFlag(self, flag):
assert Flags.isValid(flag)
# def _isFlag(self, flag):
# assert Flags.isValid(flag)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from .baseflagger import BaseFlagger
FLAGS = [-1, 0, 1]
class SimpleFlagger(BaseFlagger):
    """Flagger whose flag values are taken from the module-level FLAGS list."""

    def __init__(self):
        # The base class is initialised exactly once, with the list of
        # flag values. The former two-argument call (no_flag=0, flag=1)
        # is dropped: it no longer matches BaseFlagger.__init__(flags)
        # and would raise a TypeError before this line is reached.
        super().__init__(FLAGS)
......@@ -136,6 +136,7 @@ if __name__ == "__main__":
# NOTE: PositionalFlagger is currently broken, going to fix it when needed
# for flagger in [SimpleFlagger, PositionalFlagger, DmpFlagger]:
for flagger in [SimpleFlagger(), DmpFlagger()]:
# for flagger in [DmpFlagger()]:
test_temporalPartitioning(flagger)
test_flagNext(flagger)
test_missingConfig(flagger)
......
......@@ -12,7 +12,7 @@ from dsl import evalExpression
def test_evaluationBool():
data = initData()
flagger = SimpleFlagger()
flags = flagger.initFlags(data, 0)
flags = flagger.initFlags(data)
var1, var2, *_ = data.columns
tests = [
......@@ -47,7 +47,7 @@ def test_missingIdentifier():
def test_flagPropagation():
data = initData()
flagger = SimpleFlagger()
flags = flagger.initFlags(data, 0)
flags = flagger.initFlags(data)
flags.iloc[::5] = flagger.setFlag(flags.iloc[::5])
var1, var2, *_ = data.columns
......
......@@ -4,7 +4,7 @@
import pandas as pd
from test.common import initData
from core import runner, prepareMeta
from flagger.dmpflagger import DmpFlagger, FlagFields, Flags
from flagger.dmpflagger import DmpFlagger, FlagFields
def test_DmpFlagger():
......@@ -22,7 +22,8 @@ def test_DmpFlagger():
pd.DataFrame(meta, columns=["headerout", "Flag_1", "Flag_2"]),
data)
data, flags = runner(meta, DmpFlagger(), data)
flagger = DmpFlagger()
data, flags = runner(meta, flagger, data)
col1 = data[var1]
col2 = data[var2]
......@@ -32,14 +33,39 @@ def test_DmpFlagger():
flags21 = flags.loc[col2 > var2mean, (var2, FlagFields.CAUSE)]
assert (flags11 == Flags.BAD).all()
assert (flags11 >= flagger.flags.min()).all()
assert (flags12 == "saqc").all()
assert (flags21 == "error").all()
def test_flagOrder():
    """Check that a lower flag set later does not replace a higher one.

    Two generic tests are configured for the same variable: the first
    requests the flagger's highest flag for values above the mean, the
    second requests its lowest "flagged" level for values matching
    ``this >= min(this)`` (presumably every non-missing value). After
    the run, values above the mean must still carry the highest flag
    and the remaining ones the lower flag.
    """
    data = initData()
    var, *_ = data.columns

    flagger = DmpFlagger()
    fmin = flagger.flags.min()
    fmax = flagger.flags.max()

    # Order matters: the stronger flag is requested first, the weaker
    # one second.
    meta = [
        [var, f"generic, {{func: this > mean(this), flag: {fmax}}}"],
        [var, f"generic, {{func: this >= min(this), flag: {fmin}}}"],
    ]
    meta = prepareMeta(
        pd.DataFrame(meta, columns=["headerout", "Flag_1"]),
        data)

    pdata, pflags = runner(meta, flagger, data)

    datacol = pdata[var]
    flagcol = pflags[(var, FlagFields.FLAG)]

    assert (flagcol[datacol > datacol.mean()] == fmax).all()
    assert (flagcol[datacol <= datacol.mean()] == fmin).all()
if __name__ == "__main__":
    # Allow running this test module directly, without a test runner.
    test_DmpFlagger()
    test_flagOrder()
......@@ -39,7 +39,7 @@ def test_isflagged():
flagger = SimpleFlagger()
data = initData()
flags = flagger.initFlags(data, 0)
flags = flagger.initFlags(data)
var1, var2, *_ = data.columns
flags.iloc[::2, 0] = flagger.setFlag(flags.iloc[::2, 0])
......@@ -57,7 +57,7 @@ def test_isflaggedArgument():
flagger = SimpleFlagger()
data = initData()
flags = flagger.initFlags(data, 0)
flags = flagger.initFlags(data)
var1, var2, *_ = data.columns
flags.iloc[::2, 0] = flagger.setFlag(flags.iloc[::2, 0], -9)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment