From 0ade30ac4dcf4e17cf84fa8eceb943381f5083b8 Mon Sep 17 00:00:00 2001 From: David Schaefer <david.schaefer@ufz.de> Date: Thu, 25 Apr 2019 11:14:16 +0200 Subject: [PATCH] use pd.CategoricalDtype to represent flags --- core.py | 4 ++-- flagger/baseflagger.py | 36 ++++++++++++++++++++++----------- flagger/dmpflagger.py | 43 ++++++++++++++++++++++------------------ flagger/simpleflagger.py | 7 ++++++- test/test_core.py | 1 + test/test_evaluator.py | 4 ++-- test/test_flagger.py | 32 +++++++++++++++++++++++++++--- test/test_generic.py | 4 ++-- 8 files changed, 90 insertions(+), 41 deletions(-) diff --git a/core.py b/core.py index 84afc00b9..ef57b49eb 100644 --- a/core.py +++ b/core.py @@ -32,7 +32,7 @@ def flagNext(flagger: BaseFlagger, flags: pd.Series, n: int) -> pd.Series: for nn in range(1, n + 1): nn_idx = np.clip(idx + nn, a_min=None, a_max=len(flags) - 1) nn_idx_unflagged = nn_idx[~flagger.isFlagged(flags.iloc[nn_idx])] - flags.values[nn_idx_unflagged] = flags.iloc[nn_idx_unflagged - nn] + flags.loc[flags.index[nn_idx_unflagged]] = flags.iloc[nn_idx_unflagged - nn].values return flags @@ -120,7 +120,7 @@ def runner(meta, flagger, data, flags=None, nodata=np.nan): flag_values) data.loc[start_date:end_date] = dchunk - flags.loc[start_date:end_date] = fchunk + flags[start_date:end_date] = fchunk.squeeze() flagger.nextTest() return data, flags diff --git a/flagger/baseflagger.py b/flagger/baseflagger.py index 69d09a25c..37907eb50 100644 --- a/flagger/baseflagger.py +++ b/flagger/baseflagger.py @@ -1,7 +1,6 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- -from numbers import Number from typing import Any, Optional import numpy as np @@ -10,10 +9,24 @@ import pandas as pd from lib.types import PandasLike, ArrayLike, T +class Flags(pd.CategoricalDtype): + def __init__(self, flags): + assert len(flags) > 2 + super().__init__(flags, ordered=True) + + def min(self): + return self[2] + + def max(self): + return self[-1] + + def __getitem__(self, idx): + return self.categories[idx] + + class BaseFlagger: - def __init__(self, no_flag: T, flag: T): - self.no_flag: T = no_flag - self.flag: T = flag + def __init__(self, flags): + self.flags = Flags(flags) def setFlag(self, flags: PandasLike, @@ -26,20 +39,19 @@ class BaseFlagger: in assignments, especially if a multi column index is used """ if flag is None: - flag = self.flag - flags[:] = flag + flag = self.flags[-1] + flags[flags < flag] = flag return flags.values - def initFlags(self, - data: pd.DataFrame, - value: Optional[Number] = np.nan) -> pd.DataFrame: - out = data.copy() - out[:] = value + def initFlags(self, data: pd.DataFrame) -> pd.DataFrame: + # out = data.copy() # .astype(self) + out = data.copy().astype(self.flags) + out.loc[:] = self.flags[0] return out def isFlagged(self, flags: ArrayLike, flag: T = None) -> ArrayLike: if flag is None: - return (pd.notnull(flags) & (flags != self.no_flag)) + return (pd.notnull(flags) & (flags > self.flags[1])) return flags == flag def nextTest(self): diff --git a/flagger/dmpflagger.py b/flagger/dmpflagger.py index b5ef7eb21..1970c030f 100644 --- a/flagger/dmpflagger.py +++ b/flagger/dmpflagger.py @@ -16,34 +16,39 @@ class ColumnLevels: FLAGS = "flags" -class Flags: - OK = "OK" - DOUBTFUL = "DOUBTFUL" - BAD = "BAD" - - @staticmethod - def isValid(flag): - return flag in [Flags.OK, Flags.DOUBTFUL, Flags.BAD] +FLAGS = ["NIL", "OK", "DOUBTFUL", "BAD"] class DmpFlagger(BaseFlagger): - def __init__(self, no_flag="NIL", flag="BAD"): - super().__init__(no_flag, flag) + + def __init__(self): + super().__init__(FLAGS) self.flag_fields = [FlagFields.FLAG, FlagFields.CAUSE, FlagFields.COMMENT] - def initFlags(self, data, value="NIL", **kwargs): + def initFlags(self, data, **kwargs): columns = data.columns if isinstance(data, pd.DataFrame) else [data.name] - columns = pd.MultiIndex.from_product( + + colindex = pd.MultiIndex.from_product( [columns, self.flag_fields], names=[ColumnLevels.VARIABLES, ColumnLevels.FLAGS]) - return pd.DataFrame(data=value, columns=columns, index=data.index) - def setFlag(self, flags, flag=Flags.BAD, - cause="NIL", comment="NIL", **kwargs): - self._isFlag(flag) + out = pd.DataFrame(data=self.flags[0], + columns=colindex, + index=data.index) + return out.astype( + {c: self.flags for c in out.columns if FlagFields.FLAG in c}) + + def setFlag(self, flags, flag=None, cause="", comment="", **kwargs): + + if flag is None: + flag = self.flags.max() + assert flag in self.flags + flags = self._reduceColumns(flags) - for field, f in zip(self.flag_fields, [flag, cause, comment]): + flags.loc[flags[FlagFields.FLAG] < flag, FlagFields.FLAG] = flag + for field, f in [(FlagFields.CAUSE, cause), (FlagFields.COMMENT, comment)]: flags.loc[:, field] = f + return flags.values def isFlagged(self, flags, flag=None): @@ -56,5 +61,5 @@ class DmpFlagger(BaseFlagger): flags.columns = flags.columns.get_level_values(ColumnLevels.FLAGS) return flags - def _isFlag(self, flag): - assert Flags.isValid(flag) + # def _isFlag(self, flag): + # assert Flags.isValid(flag) diff --git a/flagger/simpleflagger.py b/flagger/simpleflagger.py index d10173703..7b2313793 100644 --- a/flagger/simpleflagger.py +++ b/flagger/simpleflagger.py @@ -1,9 +1,14 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- + from .baseflagger import BaseFlagger +FLAGS = [-1, 0, 1] + + class SimpleFlagger(BaseFlagger): + def __init__(self): - super().__init__(0, 1) + super().__init__(FLAGS) diff --git a/test/test_core.py b/test/test_core.py index b51c7b7d9..5dc1e3fd5 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -135,6 +135,7 @@ if __name__ == "__main__": # NOTE: PositionalFlagger is currently broken, going to fix it when needed # for flagger in [SimpleFlagger, PositionalFlagger, DmpFlagger]: for flagger in [SimpleFlagger(), DmpFlagger()]: + # for flagger in [DmpFlagger()]: test_temporalPartitioning(flagger) test_flagNext(flagger) test_missingConfig(flagger) diff --git a/test/test_evaluator.py b/test/test_evaluator.py index 3960ad6aa..0ff60652d 100644 --- a/test/test_evaluator.py +++ b/test/test_evaluator.py @@ -12,7 +12,7 @@ from dsl import evalExpression def test_evaluationBool(): data = initData() flagger = SimpleFlagger() - flags = flagger.initFlags(data, 0) + flags = flagger.initFlags(data) var1, var2, *_ = data.columns tests = [ @@ -47,7 +47,7 @@ def test_missingIdentifier(): def test_flagPropagation(): data = initData() flagger = SimpleFlagger() - flags = flagger.initFlags(data, 0) + flags = flagger.initFlags(data) flags.iloc[::5] = flagger.setFlag(flags.iloc[::5]) var1, var2, *_ = data.columns diff --git a/test/test_flagger.py b/test/test_flagger.py index 26b1643a6..7b3e283a3 100644 --- a/test/test_flagger.py +++ b/test/test_flagger.py @@ -4,7 +4,7 @@ import pandas as pd from test.common import initData from core import runner, prepareMeta -from flagger.dmpflagger import DmpFlagger, FlagFields, Flags +from flagger.dmpflagger import DmpFlagger, FlagFields def test_DmpFlagger(): @@ -22,7 +22,8 @@ def test_DmpFlagger(): pd.DataFrame(meta, columns=["headerout", "Flag_1", "Flag_2"]), data) - data, flags = runner(meta, DmpFlagger(), data) + flagger = DmpFlagger() + data, flags = runner(meta, flagger, data) col1 = data[var1] col2 = data[var2] @@ -32,14 +33,39 @@ def test_DmpFlagger(): flags21 = flags.loc[col2 > var2mean, (var2, FlagFields.CAUSE)] - assert (flags11 == Flags.BAD).all() + assert (flags11 >= flagger.flags.min()).all() assert (flags12 == "saqc").all() assert (flags21 == "error").all() +def test_flagOrder(): + data = initData() + var, *_ = data.columns + + flagger = DmpFlagger() + fmin = flagger.flags.min() + fmax = flagger.flags.max() + + meta = [ + [var, f"generic, {{func: this > mean(this), flag: {fmax}}}"], + [var, f"generic, {{func: this >= min(this), flag: {fmin}}}"], + ] + + meta = prepareMeta( + pd.DataFrame(meta, columns=["headerout", "Flag_1"]), + data) + + pdata, pflags = runner(meta, flagger, data) + + datacol = pdata[var] + flagcol = pflags[(var, FlagFields.FLAG)] + + assert (flagcol[datacol > datacol.mean()] == fmax).all() + assert (flagcol[datacol <= datacol.mean()] == fmin).all() if __name__ == "__main__": test_DmpFlagger() + test_flagOrder() diff --git a/test/test_generic.py b/test/test_generic.py index afed038ec..1736a7e05 100644 --- a/test/test_generic.py +++ b/test/test_generic.py @@ -39,7 +39,7 @@ def test_isflagged(): flagger = SimpleFlagger() data = initData() - flags = flagger.initFlags(data, 0) + flags = flagger.initFlags(data) var1, var2, *_ = data.columns flags.iloc[::2, 0] = flagger.setFlag(flags.iloc[::2, 0]) @@ -57,7 +57,7 @@ def test_isflaggedArgument(): flagger = SimpleFlagger() data = initData() - flags = flagger.initFlags(data, 0) + flags = flagger.initFlags(data) var1, var2, *_ = data.columns flags.iloc[::2, 0] = flagger.setFlag(flags.iloc[::2, 0], -9) -- GitLab