Skip to content
Snippets Groups Projects
Commit df3dbe9e authored by Bert Palm's avatar Bert Palm 🎇
Browse files

docs, tests, minor improvements

parent 74ec7f57
No related branches found
No related tags found
1 merge request!218Flags
......@@ -31,16 +31,43 @@ class _HistAccess:
self.obj = obj
def __getitem__(self, key: str) -> History:
    """
    Return the stored History of column `key`.

    The caller receives the History object itself and may modify it
    in place (although that is not encouraged), so the cached flags of
    that column are dropped before the history is handed out.
    """
    owner = self.obj
    # invalidate first: we cannot know what the user does with the history
    owner._cache.pop(key, None)
    return owner._data[key]
def __setitem__(self, key: str, value: Union[History, pd.DataFrame]):
    """
    Replace the whole history of column `key`.

    Parameters
    ----------
    key : str
        Name of the column whose history is written.
    value : History or pd.DataFrame
        The new history. Anything that is not already a History is
        passed to the History constructor.
    """
    if not isinstance(value, History):
        value = History(value)
    self.obj._data[key] = value
    # Only column `key` was written, so only its cached flags are stale;
    # the cached flags of all other columns remain valid.
    self.obj._cache.pop(key, None)
class Flags:
"""
flags manipulation
------------------
insert new -> flags['new'] = pd.Series(...)
set items -> flags['v1'] = pd.Series(...)
get items -> v0 = flags['v0']
delete items -> del flags['v0'] / drop('v0')
metadata
--------
reading columns -> flags.columns
renaming column(s) -> flags.columns = pd.Index(['a', 'b', 'c'])
history
-------
get history -> flags.history['v0']
set history -> flags.history['v0'] = History(...)
conversion
----------
make a dios -> flags.to_dios()
make a df -> flags.to_frame()
"""
def __init__(self, raw_data: Optional[Union[DictLike, Flags]] = None, copy: bool = False):
......@@ -50,14 +77,15 @@ class Flags:
if isinstance(raw_data, Flags):
raw_data = raw_data._data
# with python 3.7 dicts are insertion-ordered by default
self._data: Dict[str, History]
self._data = self._init_from_raw(raw_data, copy)
# this is a simple cache that reduce the calculation of the flags
# from the entire history of a flag column. The _cache is filled
# with __getitem__ and cleared in __setitem__ or if the whole history
# is written in _HistAccess.__setitem__. There is no other access, so
# we don't have to much trouble.
# with __getitem__ and cleared on any write access to self._data.
# There are not too many write access possibilities here, so we don't
# have too much trouble.
self._cache = {}
def _init_from_raw(self, data, copy) -> Dict[str, History]:
......@@ -80,23 +108,55 @@ class Flags:
return result
def __getitem__(self, key: str) -> pd.Series:
if key not in self._cache:
self._cache[key] = self._data[key].max()
return self._cache[key]
return self._cache[key].copy()
def __setitem__(self, key: str, value: pd.Series):
    """
    Append `value` as new flags to column `key`.

    If the column does not exist yet, a new History is created for it
    and registered under `key`, so that ``flags['new'] = series``
    actually inserts a column.

    Parameters
    ----------
    key : str
        Name of the column to write.
    value : pd.Series
        The flags to append to the column's history.
    """
    hist = self._data[key] if key in self._data else History()
    # NOTE: append is relied upon to modify the History in place, exactly
    # as the existing-column path always did.
    hist.append(value)
    # Store the history only after a successful append; for a new column
    # this is what makes it visible, for an existing one it is a no-op.
    self._data[key] = hist
    self._cache.pop(key, None)
def __delitem__(self, key):
del self._data[key]
self._cache.pop(key, None)
def drop(self, key):
self.__delitem__(key)
@property
def columns(self) -> pd.Index:
return pd.Index(self._data.keys())
@columns.setter
def columns(self, value: pd.Index):
if not isinstance(value, pd.Index):
value = pd.Index(value)
if (
not value.is_unique
or not pd.api.types.is_string_dtype(value)
):
raise TypeError('value must be pd.Index, with unique indices of type str')
if not len(value) == len(self):
raise ValueError("index must match current index in length")
_data, _cache = {}, {}
for old, new in zip(self.columns, value):
_data[new] = self._data.pop(old)
if old in self._cache:
_cache[new] = self._cache[old]
self._data = _data
self._cache = _cache
@property
def history(self) -> _HistAccess:
    """Accessor for reading and writing whole histories, e.g. ``flags.history['v0']``."""
    accessor = _HistAccess(self)
    return accessor
......@@ -105,18 +165,25 @@ class Flags:
di = dios.DictOfSeries(columns=self.columns)
for k, v in self._data.items():
di[k] = self[k] # cached
di[k] = self[k] # use cache
return di.copy()
def to_frame(self) -> pd.DataFrame:
return self.to_dios().to_df()
@property
def empty(self) -> bool:
return len(self._data) == 0
def __len__(self) -> int:
return len(self._data)
def __repr__(self) -> str:
return str(self.to_dios())
return str(self.to_dios()).replace('DictOfSeries', type(self).__name__)
def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
def init_flags_like(reference: Union[pd.Series, DictLike, Flags], initial_value: float = UNFLAGGED) -> Flags:
"""
Create empty Flags from a reference data structure.
......@@ -125,9 +192,22 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
reference : pd.DataFrame, pd.Series, dios.DictOfSeries, dict of pd.Series
The reference structure to initialize for.
initial_value : float, default UNFLAGGED
Value to initialize the columns with.
Notes
-----
Implementation detail:
The resulting Flags do not necessarily have the exact same (inner) dimensions as the reference.
This may happen if the passed structure already holds History objects. Those are
reduced to 1D-DataFrames (1-column-History). Nevertheless, the returned flags are perfectly suitable
to be used in Saqc as flags container along with the passed reference structure (data).
Returns
-------
flags: Flags
a flags object,
"""
result = {}
......@@ -163,5 +243,7 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
if __name__ == '__main__':
    # Small manual demo; only runs when the module is executed directly.
    from dios import example_DictOfSeries

    # flags built directly from (float) example data
    f = Flags(example_DictOfSeries().astype(float))
    print(f)

    # flags initialized to match the shape of the example data
    f = init_flags_like(example_DictOfSeries())
    print(f)

    # empty flags
    print(Flags())
#!/usr/bin/env python
import dios
import pytest
import numpy as np
import pandas as pd
from pandas.api.types import is_bool_dtype
from test.common import TESTFLAGGER, initData
from saqc.flagger.flags import Flags
# Raw 2D arrays the test inputs are derived from: an array without columns,
# a single cell, constant blocks, an all-NaN block and two blocks with
# differing values (one of them containing a NaN).
_data = [
    np.array([[]]),
    np.zeros((1, 1)),
    np.zeros((3, 4)),
    np.ones((3, 4)),
    np.ones((3, 4)) * np.nan,
    np.array([
        [0, 0, 0, 0],
        [0, 1, 2, 3],
        [0, 1, 2, 3],
    ]),
    np.array([
        [0, 0, 0, 0],
        [0, 1, np.nan, 3],
        [0, 1, 2, 3],
    ]),
]

# Every raw array is offered in three containers:
# a DataFrame, a plain dict of Series and a DictOfSeries (in that order).
data = []
for d in _data:
    columns = list('abcdefgh'[:d.shape[1]])
    df = pd.DataFrame(d, dtype=float, columns=columns)
    di = dict(df.items())
    dis = dios.DictOfSeries(df)
    data.extend([df, di, dis])
@pytest.mark.parametrize('data', data)
def test_init(data: np.array):
    """Flags can be built from every test input and get one column per input key."""
    flags = Flags(data)
    expected_columns = len(data.keys())
    assert isinstance(flags, Flags)
    assert len(flags) == expected_columns
def test_cache():
    """Walk the flags cache through read, write, bulk read and column renaming."""
    values = np.array([
        [0, 0, 0, 0],
        [0, 1, 2, 3],
        [0, 1, 2, 3],
    ])
    frame = pd.DataFrame(values, dtype=float, columns=list('abcd'))
    flags = Flags(frame)

    # nothing has been read yet, so nothing is cached
    assert flags._cache == {}

    # reading a column puts it into the cache
    _ = flags['a']
    assert 'a' in flags._cache

    # writing a column removes its cache entry
    flags['a'] = pd.Series([0, 0, 0], dtype=float)
    assert 'a' not in flags._cache

    # the conversion reads every column, so all of them end up cached
    flags.to_dios()
    assert all(name in flags._cache for name in flags.columns)

    # renaming the columns carries the cache over to the new names
    flags.columns = list('xyzq')
    assert all(name in flags._cache for name in flags.columns)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment