From df3dbe9ebe7bb2ea18a85c4d8eb4246c06b49191 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Tue, 9 Feb 2021 01:30:26 +0100 Subject: [PATCH] docu, test, minore improves --- saqc/flagger/flags.py | 104 +++++++++++++++++++++++++++++++++---- test/flagger/test_flags.py | 79 ++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 11 deletions(-) create mode 100644 test/flagger/test_flags.py diff --git a/saqc/flagger/flags.py b/saqc/flagger/flags.py index 299289a59..db2996c4f 100644 --- a/saqc/flagger/flags.py +++ b/saqc/flagger/flags.py @@ -31,16 +31,43 @@ class _HistAccess: self.obj = obj def __getitem__(self, key: str) -> History: + # we don't know, what the user wants. Although we're not + # encouraging inplace modification of the history, the + # user may do it, so we remove the cached column here. + self.obj._cache.pop(key, None) return self.obj._data[key] def __setitem__(self, key: str, value: Union[History, pd.DataFrame]): if not isinstance(value, History): value = History(value) self.obj._data[key] = value - self.obj._cache.clear() + self.obj._cache.pop(key, None) class Flags: + """ + flags manipulation + ------------------ + insert new -> flags['new'] = pd.Series(...) + set items -> flags['v1'] = pd.Series(...) + get items -> v0 = flags['v0'] + delete items -> del flags['v0'] / drop('v0') + + metadata + -------- + reading columns -> flags.columns + renaming column(s) -> flags.columns = pd.Index(['a', 'b', 'c']) + + history + ------- + get history -> flags.history['v0'] + set history -> flags.history['v0'] = History(...) 
+ + conversion + ---------- + make a dios -> flags.to_dios() + make a df -> flags.to_frame() + """ def __init__(self, raw_data: Optional[Union[DictLike, Flags]] = None, copy: bool = False): @@ -50,14 +77,15 @@ class Flags: if isinstance(raw_data, Flags): raw_data = raw_data._data + # with python 3.7 dicts are insertion-ordered by default self._data: Dict[str, History] self._data = self._init_from_raw(raw_data, copy) # this is a simple cache that reduce the calculation of the flags # from the entire history of a flag column. The _cache is filled - # with __getitem__ and cleared in __setitem__ or if the whole history - # is written in _HistAccess.__setitem__. There is no other access, so - # we don't have to much trouble. + # with __getitem__ and cleared on any write access to self._data. + # There are not too many write access possibilities here so we don't + # have too much trouble. self._cache = {} def _init_from_raw(self, data, copy) -> Dict[str, History]: @@ -80,23 +108,55 @@ class Flags: return result def __getitem__(self, key: str) -> pd.Series: + if key not in self._cache: self._cache[key] = self._data[key].max() - return self._cache[key] + + return self._cache[key].copy() def __setitem__(self, key: str, value: pd.Series): + if key not in self._data: hist = History() + else: hist = self._data[key] hist.append(value) self._cache.pop(key, None) + def __delitem__(self, key): + del self._data[key] + self._cache.pop(key, None) + + def drop(self, key): + self.__delitem__(key) + @property def columns(self) -> pd.Index: return pd.Index(self._data.keys()) + @columns.setter + def columns(self, value: pd.Index): + if not isinstance(value, pd.Index): + value = pd.Index(value) + if ( + not value.is_unique + or not pd.api.types.is_string_dtype(value) + ): + raise TypeError('value must be pd.Index, with unique indices of type str') + if not len(value) == len(self): + raise ValueError("index must match current index in length") + + _data, _cache = {}, {} + for old, new in 
zip(self.columns, value): + _data[new] = self._data.pop(old) + if old in self._cache: + _cache[new] = self._cache[old] + + self._data = _data + self._cache = _cache + @property def history(self) -> _HistAccess: return _HistAccess(self) @@ -105,18 +165,25 @@ class Flags: di = dios.DictOfSeries(columns=self.columns) for k, v in self._data.items(): - di[k] = self[k] # cached + di[k] = self[k] # use cache return di.copy() def to_frame(self) -> pd.DataFrame: return self.to_dios().to_df() + @property + def empty(self) -> bool: + return len(self._data) == 0 + + def __len__(self) -> int: + return len(self._data) + def __repr__(self) -> str: - return str(self.to_dios()) + return str(self.to_dios()).replace('DictOfSeries', type(self).__name__) -def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags: +def init_flags_like(reference: Union[pd.Series, DictLike, Flags], initial_value: float = UNFLAGGED) -> Flags: """ Create empty Flags, from an reference data structure. @@ -125,9 +192,22 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags: reference : pd.DataFrame, pd.Series, dios.DictOfSeries, dict of pd.Series The reference structure to initialize for. + initial_value : float, default 0 + value to initialize the columns with + + Notes + ----- + Implementation detail: + + The resulting Flags does not necessarily have the exact same (inner) dimensions as the reference. + This may happen if the passed structure already holds History objects. Those are + reduced to a 1D-DataFrame (1-column-History). Nevertheless the returned flags are perfectly suitable + to be used in Saqc as flags container along with the passed reference structure (data). 
+ Returns ------- - + flags: Flags + a flags object, """ result = {} @@ -163,5 +243,7 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags: if __name__ == '__main__': from dios import example_DictOfSeries - f = Flags(example_DictOfSeries().astype(float)) - print(f) \ No newline at end of file + + f = init_flags_like(example_DictOfSeries()) + print(f) + print(Flags()) diff --git a/test/flagger/test_flags.py b/test/flagger/test_flags.py new file mode 100644 index 000000000..7dcfc97dc --- /dev/null +++ b/test/flagger/test_flags.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +import dios +import pytest +import numpy as np +import pandas as pd +from pandas.api.types import is_bool_dtype +from test.common import TESTFLAGGER, initData +from saqc.flagger.flags import Flags + +_data = [ + + np.array([[]]), + np.zeros((1, 1)), + np.zeros((3, 4)), + np.ones((3, 4)), + np.ones((3, 4)) * np.nan, + + np.array([ + [0, 0, 0, 0], + [0, 1, 2, 3], + [0, 1, 2, 3], + ]), + + np.array([ + [0, 0, 0, 0], + [0, 1, np.nan, 3], + [0, 1, 2, 3], + ]), +] + +data = [] +for d in _data: + columns = list('abcdefgh')[:d.shape[1]] + df = pd.DataFrame(d, dtype=float, columns=columns) + dis = dios.DictOfSeries(df) + di = {} + di.update(df.items()) + data.append(df) + data.append(di) + data.append(dis) + + +@pytest.mark.parametrize('data', data) +def test_init(data: np.array): + flags = Flags(data) + assert isinstance(flags, Flags) + assert len(data.keys()) == len(flags) + + +def test_cache(): + arr = np.array([ + [0, 0, 0, 0], + [0, 1, 2, 3], + [0, 1, 2, 3], + ]) + data = pd.DataFrame(arr, dtype=float, columns=list('abcd')) + flags = Flags(data) + + # cache empty + assert flags._cache == {} + + # invoke caching + flags['a'] + assert 'a' in flags._cache + + # clears cache + flags['a'] = pd.Series([0, 0, 0], dtype=float) + assert 'a' not in flags._cache + + # cache all + flags.to_dios() + for c in flags.columns: + assert c in flags._cache + + # cache survive renaming + flags.columns 
= list('xyzq') + for c in flags.columns: + assert c in flags._cache + -- GitLab