From 74ec7f5769fc4b0f436df83aeb45a9257e0b075c Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Mon, 8 Feb 2021 22:35:04 +0100 Subject: [PATCH] implemented Flags --- saqc/flagger/flags.py | 167 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 saqc/flagger/flags.py diff --git a/saqc/flagger/flags.py b/saqc/flagger/flags.py new file mode 100644 index 000000000..299289a59 --- /dev/null +++ b/saqc/flagger/flags.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python + +from __future__ import annotations + +import dios + +from saqc.flagger.history import History +import numpy as np +import pandas as pd +from typing import Union, Dict, DefaultDict, Iterable, Tuple, Optional + +UNTOUCHED = np.nan +UNFLAGGED = 0 +DOUBTFUL = 25 +BAD = 99 + +_KEY = str +_VAL = Union[pd.Series, History] +DictLike = Union[ + pd.DataFrame, + dios.DictOfSeries, + Dict[_KEY, _VAL], + DefaultDict[_KEY, _VAL], + Iterable[Tuple[_KEY, _VAL]] +] + + +class _HistAccess: + + def __init__(self, obj: Flags): + self.obj = obj + + def __getitem__(self, key: str) -> History: + return self.obj._data[key] + + def __setitem__(self, key: str, value: Union[History, pd.DataFrame]): + if not isinstance(value, History): + value = History(value) + self.obj._data[key] = value + self.obj._cache.clear() + + +class Flags: + + def __init__(self, raw_data: Optional[Union[DictLike, Flags]] = None, copy: bool = False): + + if raw_data is None: + raw_data = {} + + if isinstance(raw_data, Flags): + raw_data = raw_data._data + + self._data: Dict[str, History] + self._data = self._init_from_raw(raw_data, copy) + + # this is a simple cache that reduce the calculation of the flags + # from the entire history of a flag column. The _cache is filled + # with __getitem__ and cleared in __setitem__ or if the whole history + # is written in _HistAccess.__setitem__. There is no other access, so + # we don't have to much trouble. + self._cache = {} + + def _init_from_raw(self, data, copy) -> Dict[str, History]: + result = {} + + for obj in data: + if isinstance(obj, tuple): + k, item = obj + else: + k, item = obj, data[obj] + + if k in result: + raise ValueError('raw_data must not have duplicate keys') + + if isinstance(item, pd.Series): + item = item.to_frame(name=0) + + result[k] = History(item, copy=copy) + + return result + + def __getitem__(self, key: str) -> pd.Series: + if key not in self._cache: + self._cache[key] = self._data[key].max() + return self._cache[key] + + def __setitem__(self, key: str, value: pd.Series): + if key not in self._data: + hist = History() + else: + hist = self._data[key] + + hist.append(value) + self._cache.pop(key, None) + + @property + def columns(self) -> pd.Index: + return pd.Index(self._data.keys()) + + @property + def history(self) -> _HistAccess: + return _HistAccess(self) + + def to_dios(self) -> dios.DictOfSeries: + di = dios.DictOfSeries(columns=self.columns) + + for k, v in self._data.items(): + di[k] = self[k] # cached + + return di.copy() + + def to_frame(self) -> pd.DataFrame: + return self.to_dios().to_df() + + def __repr__(self) -> str: + return str(self.to_dios()) + + +def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags: + """ + Create empty Flags, from an reference data structure. + + Parameters + ---------- + reference : pd.DataFrame, pd.Series, dios.DictOfSeries, dict of pd.Series + The reference structure to initialize for. + + Returns + ------- + + """ + result = {} + + if isinstance(reference, Flags): + reference = reference._data + + if isinstance(reference, pd.Series): + reference = reference.to_frame('f0') + + for obj in reference: + + # unpack + if isinstance(obj, tuple): + k, item = obj + else: + k, item = obj, reference[obj] + + if not isinstance(k, str): + raise TypeError(f"cannot use {k} as key, currently only string keys are allowed") + + if k in result: + raise ValueError('reference must not have duplicate keys') + + if not isinstance(item, (pd.Series, History)): + raise TypeError('items in reference must be of type pd.Series') + + item = pd.DataFrame(UNFLAGGED, index=item.index, columns=[0], dtype=float) + + result[k] = History(item) + + return Flags(result) + + +if __name__ == '__main__': + from dios import example_DictOfSeries + f = Flags(example_DictOfSeries().astype(float)) + print(f) \ No newline at end of file -- GitLab