From df3dbe9ebe7bb2ea18a85c4d8eb4246c06b49191 Mon Sep 17 00:00:00 2001
From: Bert Palm <bert.palm@ufz.de>
Date: Tue, 9 Feb 2021 01:30:26 +0100
Subject: [PATCH] docs, tests, minor improvements

---
 saqc/flagger/flags.py      | 104 +++++++++++++++++++++++++++++++++----
 test/flagger/test_flags.py |  79 ++++++++++++++++++++++++++++
 2 files changed, 172 insertions(+), 11 deletions(-)
 create mode 100644 test/flagger/test_flags.py

diff --git a/saqc/flagger/flags.py b/saqc/flagger/flags.py
index 299289a59..db2996c4f 100644
--- a/saqc/flagger/flags.py
+++ b/saqc/flagger/flags.py
@@ -31,16 +31,43 @@ class _HistAccess:
         self.obj = obj
 
     def __getitem__(self, key: str) -> History:
+        # We cannot know what the caller does with the returned History. In-place
+        # modification is discouraged but possible, so we drop the cached flags
+        # of this column to make sure the cache never serves stale values.
+        self.obj._cache.pop(key, None)
         return self.obj._data[key]
 
     def __setitem__(self, key: str, value: Union[History, pd.DataFrame]):
         if not isinstance(value, History):
             value = History(value)
         self.obj._data[key] = value
-        self.obj._cache.clear()
+        self.obj._cache.pop(key, None)  # only the flags cached for `key` are outdated
 
 
 class Flags:
+    """
+    flags manipulation
+    ------------------
+    insert new    -> flags['new'] = pd.Series(...)
+    set items     -> flags['v1'] = pd.Series(...)
+    get items     -> v0 = flags['v0']
+    delete items  -> del flags['v0']  / drop('v0')
+
+    metadata
+    --------
+    reading columns     -> flags.columns
+    renaming column(s)  -> flags.columns = pd.Index(['a', 'b', 'c'])
+
+    history
+    -------
+    get history  -> flags.history['v0']
+    set history  -> flags.history['v0'] = History(...)
+
+    conversion
+    ----------
+    make a dios  -> flags.to_dios()
+    make a df    -> flags.to_frame()
+    """
 
     def __init__(self, raw_data: Optional[Union[DictLike, Flags]] = None, copy: bool = False):
 
@@ -50,14 +77,15 @@ class Flags:
         if isinstance(raw_data, Flags):
             raw_data = raw_data._data
 
+        # since Python 3.7 dicts are guaranteed to preserve insertion order
         self._data: Dict[str, History]
         self._data = self._init_from_raw(raw_data, copy)
 
         # this is a simple cache that reduce the calculation of the flags
         # from the entire history of a flag column. The _cache is filled
-        # with __getitem__ and cleared in __setitem__ or if the whole history
-        # is written in _HistAccess.__setitem__. There is no other access, so
-        # we don't have to much trouble.
+        # with __getitem__ and cleared on any write access to self._data.
+        # There are not too many possibilities for write access here, so we
+        # don't have too much trouble.
         self._cache = {}
 
     def _init_from_raw(self, data, copy) -> Dict[str, History]:
@@ -80,23 +108,55 @@ class Flags:
         return result
 
     def __getitem__(self, key: str) -> pd.Series:
+        """Return the flags of column `key`, i.e. the (cached) ``max()`` of its history."""
         if key not in self._cache:
             self._cache[key] = self._data[key].max()
-        return self._cache[key]
+        # hand out a copy, so that callers cannot modify the cached series
+        return self._cache[key].copy()
 
     def __setitem__(self, key: str, value: pd.Series):
+        """Append `value` to the history of column `key`; an unknown `key` inserts a new column."""
         if key not in self._data:
             hist = History()
+            self._data[key] = hist  # register the new history, otherwise the column is never inserted
         else:
             hist = self._data[key]
 
         hist.append(value)
         self._cache.pop(key, None)
 
+    def __delitem__(self, key):  # remove column `key`; raises KeyError if it is unknown
+        del self._data[key]
+        self._cache.pop(key, None)  # also forget the cached flags, if there are any
+
+    def drop(self, key):  # method-style equivalent of `del flags[key]`
+        self.__delitem__(key)
+
     @property
     def columns(self) -> pd.Index:
         return pd.Index(self._data.keys())
 
+    @columns.setter
+    def columns(self, value: pd.Index):
+        """Rename all columns at once; already cached flags are kept under the new names."""
+        new_names = value if isinstance(value, pd.Index) else pd.Index(value)
+        is_valid = new_names.is_unique and pd.api.types.is_string_dtype(new_names)
+        if not is_valid:
+            raise TypeError('value must be pd.Index, with unique indices of type str')
+        if len(new_names) != len(self):
+            raise ValueError("index must match current index in length")
+
+        # old and new names are paired by position; histories are moved, not copied
+        renamed_data, renamed_cache = {}, {}
+        for old_name, new_name in zip(self.columns, new_names):
+            renamed_data[new_name] = self._data.pop(old_name)
+            if old_name in self._cache:
+                renamed_cache[new_name] = self._cache[old_name]
+
+        # replace the old containers by the renamed ones
+        self._data = renamed_data
+        self._cache = renamed_cache
+
     @property
     def history(self) -> _HistAccess:
         return _HistAccess(self)
@@ -105,18 +165,25 @@ class Flags:
         di = dios.DictOfSeries(columns=self.columns)
 
         for k, v in self._data.items():
-            di[k] = self[k]  # cached
+            di[k] = self[k]  # use cache
 
         return di.copy()
 
     def to_frame(self) -> pd.DataFrame:
         return self.to_dios().to_df()
 
+    @property
+    def empty(self) -> bool:
+        return not self._data  # True as long as no column is present
+
+    def __len__(self) -> int:  # number of columns, not the number of rows
+        return len(self._data)
+
     def __repr__(self) -> str:
-        return str(self.to_dios())
+        return str(self.to_dios()).replace('DictOfSeries', type(self).__name__)  # dios repr, relabelled with this class' name
 
 
-def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
+def init_flags_like(reference: Union[pd.Series, DictLike, Flags], initial_value: float = UNFLAGGED) -> Flags:
     """
     Create empty Flags, from an reference data structure.
 
@@ -125,9 +192,22 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
     reference : pd.DataFrame, pd.Series, dios.DictOfSeries, dict of pd.Series
         The reference structure to initialize for.
 
+    initial_value : float, default UNFLAGGED
+        value to initialize the columns with
+
+    Notes
+    -----
+    Implementation detail:
+
+    The resulting Flags do not necessarily have the exact same (inner) dimensions as the reference.
+    This may happen if the passed structure already holds History objects. Those are reduced
+    to a 1D-DataFrame (1-column-History). Nevertheless, the returned flags are perfectly suitable
+    to be used in Saqc as flags container along with the passed reference structure (data).
+
     Returns
     -------
-
+    flags : Flags
+        the newly created flags object
     """
     result = {}
 
@@ -163,5 +243,7 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
 
 if __name__ == '__main__':
     from dios import example_DictOfSeries
-    f = Flags(example_DictOfSeries().astype(float))
-    print(f)
\ No newline at end of file
+    # quick manual demo: print flags built from the dios example data and an empty Flags
+    f = init_flags_like(example_DictOfSeries())
+    print(f)
+    print(Flags())
diff --git a/test/flagger/test_flags.py b/test/flagger/test_flags.py
new file mode 100644
index 000000000..7dcfc97dc
--- /dev/null
+++ b/test/flagger/test_flags.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+import dios
+import pytest
+import numpy as np
+import pandas as pd
+from pandas.api.types import is_bool_dtype
+from test.common import TESTFLAGGER, initData
+from saqc.flagger.flags import Flags
+
+_data = [
+    # no columns at all, a single cell, and constant arrays (zeros, ones, NaNs)
+    np.array([[]]),
+    np.zeros((1, 1)),
+    np.zeros((3, 4)),
+    np.ones((3, 4)),
+    np.ones((3, 4)) * np.nan,
+    # rows that differ from each other
+    np.array([
+        [0, 0, 0, 0],
+        [0, 1, 2, 3],
+        [0, 1, 2, 3],
+    ]),
+    # the same, with one missing value
+    np.array([
+        [0, 0, 0, 0],
+        [0, 1, np.nan, 3],
+        [0, 1, 2, 3],
+    ]),
+]
+
+# Offer every array to `Flags` in three containers:
+# a pd.DataFrame, a dict of pd.Series and a dios.DictOfSeries.
+data = []
+for d in _data:
+    # one letter per array column; the empty array gets no columns at all
+    columns = list('abcdefgh')[:d.shape[1]]
+    df = pd.DataFrame(d, dtype=float, columns=columns)
+    dis = dios.DictOfSeries(df)
+    di = dict(df.items())
+    data.extend([df, di, dis])
+
+
+@pytest.mark.parametrize('data', data)
+def test_init(data):  # data: pd.DataFrame, dict of pd.Series or dios.DictOfSeries
+    flags = Flags(data)
+    assert isinstance(flags, Flags)
+    assert len(data.keys()) == len(flags)
+
+
+def test_cache():
+    arr = np.array([
+        [0, 0, 0, 0],
+        [0, 1, 2, 3],
+        [0, 1, 2, 3],
+    ])
+    data = pd.DataFrame(arr, dtype=float, columns=list('abcd'))
+    flags = Flags(data)
+
+    # a fresh Flags object starts with an empty cache
+    assert flags._cache == {}
+
+    # reading a column caches its flags
+    flags['a']
+    assert 'a' in flags._cache
+
+    # writing to a column drops its cache entry
+    flags['a'] = pd.Series([0, 0, 0], dtype=float)
+    assert 'a' not in flags._cache
+
+    # the conversion reads every column, so afterwards all of them are cached
+    flags.to_dios()
+    for c in flags.columns:
+        assert c in flags._cache
+
+    # cache entries survive a renaming of the columns
+    flags.columns = list('xyzq')
+    for c in flags.columns:
+        assert c in flags._cache
+
-- 
GitLab