Skip to content
Snippets Groups Projects
Commit 17bcf742 authored by Bert Palm's avatar Bert Palm 🎇
Browse files

refactored Backtrack -> (Flag-)History

parent f7be0d54
No related branches found
No related tags found
1 merge request: !218 Flags
...@@ -6,19 +6,19 @@ import pandas as pd ...@@ -6,19 +6,19 @@ import pandas as pd
import numpy as np import numpy as np
class Backtrack: class History:
""" """
Saqc internal storage for the history of a (single) flags column. Saqc internal storage for the history of a (single) flags column.
The backtrack (BT) stores the history of a flags column. Each time The flag-history (FH) stores the history of a flags column. Each time
``append`` is called a new column is appended to the BT. The column ``append`` is called a new column is appended to the FH. The column
names are increasing integers starting with 0. After initialisation names are increasing integers starting with 0. After initialisation
the BT is empty and has no columns at all. If an initial `UNFLAGGED`- the FH is empty and has no columns at all. If an initial `UNFLAGGED`-
column is desired, it must be created manually, or passed via the ``bt`` column is desired, it must be created manually, or passed via the ``hist``
parameter. The same way a new BT can be created from an existing one. parameter. The same way a new FH can be created from an existing one.
To get the worst flags (highest value) that are currently stored in To get the worst flags (highest value) that are currently stored in
the BT, we provide a ``max()`` method. It returns a pd.Series indicating the FH, we provide a ``max()`` method. It returns a pd.Series indicating
the worst flag per row. the worst flag per row.
To counteract the problem, that one may want to force a better flag To counteract the problem, that one may want to force a better flag
...@@ -32,62 +32,62 @@ class Backtrack: ...@@ -32,62 +32,62 @@ class Backtrack:
Parameters Parameters
---------- ----------
bt : pd.Dataframe, default None hist : pd.Dataframe, default None
if None, an empty BT is created, otherwise the existing dataframe if None, an empty FH is created, otherwise the existing dataframe
is taken as the initial backtrack. is taken as the initial history.
mask : pd.Dataframe, default None mask : pd.Dataframe, default None
a mask holding the boolean force values. It must match the passed a mask holding the boolean force values. It must match the passed
``bt``. If None, a matching mask is created, assuming force never ``hist``. If None, a matching mask is created, assuming force never
was passed to any test. was passed to any test.
copy : bool, default False copy : bool, default False
If True, the input data is copied, otherwise not. If True, the input data is copied, otherwise not.
""" """
def __init__(self, bt: pd.DataFrame = None, mask: pd.DataFrame = None, copy: bool = False): def __init__(self, hist: pd.DataFrame = None, mask: pd.DataFrame = None, copy: bool = False):
# this is a hidden _feature_ and not exposed by the type # this is a hidden _feature_ and not exposed by the type
# of the bt parameter and serves as a fastpath for internal # of the hist parameter and serves as a fastpath for internal
# fast creation of a new BT, where no checks are needed. # fast creation of a new FH, where no checks are needed.
if isinstance(bt, Backtrack): if isinstance(hist, History):
# keep this order, otherwise bt.mask # keep this order, otherwise hist.mask
# will refer to pd.Dataframe.mask # will refer to pd.Dataframe.mask
mask = bt.mask mask = hist.mask
bt = bt.bt hist = hist.hist
elif bt is None and mask is None: elif hist is None and mask is None:
bt = pd.DataFrame() hist = pd.DataFrame()
mask = pd.DataFrame() mask = pd.DataFrame()
elif bt is None and mask is not None: elif hist is None and mask is not None:
raise ValueError("Cannot take 'mask' with no 'bt'") raise ValueError("Cannot take 'mask' with no 'hist'")
elif bt is not None and mask is None: elif hist is not None and mask is None:
bt = self._validate_bt(bt) hist = self._validate_hist(hist)
mask = pd.DataFrame(True, index=bt.index, columns=bt.columns) mask = pd.DataFrame(True, index=hist.index, columns=hist.columns)
else: else:
bt, mask = self._validate_bt_with_mask(bt, mask) hist, mask = self._validate_hist_with_mask(hist, mask)
if copy: if copy:
bt = bt.copy() hist = hist.copy()
mask = mask.copy() mask = mask.copy()
self.bt = bt self.hist = hist
self.mask = mask self.mask = mask
@property @property
def index(self) -> pd.Index: def index(self) -> pd.Index:
""" """
The index of BT. The index of FH.
The index is the same for all columns. The index is the same for all columns.
Notes Notes
----- -----
The index should always be equal to the flags series, The index should always be equal to the flags series,
this BT is associated with. If this is messed up this FH is associated with. If this is messed up
something went wrong in saqc internals or in a user- something went wrong in saqc internals or in a user-
defined test. defined test.
...@@ -95,12 +95,12 @@ class Backtrack: ...@@ -95,12 +95,12 @@ class Backtrack:
------- -------
index : pd.Index index : pd.Index
""" """
return self.bt.index return self.hist.index
@property @property
def columns(self) -> pd.Index: def columns(self) -> pd.Index:
""" """
Columns of the BT. Columns of the FH.
The columns are always continuously The columns are always continuously
increasing integers, starting from 0. increasing integers, starting from 0.
...@@ -109,27 +109,27 @@ class Backtrack: ...@@ -109,27 +109,27 @@ class Backtrack:
------- -------
columns : pd.Index columns : pd.Index
""" """
return self.bt.columns return self.hist.columns
@property @property
def empty(self) -> bool: def empty(self) -> bool:
""" """
Indicator whether Backtrack is empty. Indicator whether History is empty.
True if Backtrack is entirely empty (no items). True if History is entirely empty (no items).
Returns Returns
------- -------
bool bool
If Backtrack is empty, return True, if not return False. If History is empty, return True, if not return False.
""" """
# we take self.mask here, because it cannot have NaN's, # we take self.mask here, because it cannot have NaN's,
# but self.bt could have -> see pd.DataFrame.empty # but self.hist could have -> see pd.DataFrame.empty
return self.mask.empty return self.mask.empty
def _insert(self, s: pd.Series, nr: int, force=False) -> Backtrack: def _insert(self, s: pd.Series, nr: int, force=False) -> History:
""" """
Insert data at an arbitrary position in the BT. Insert data at an arbitrary position in the FH.
No validation of series is done here. No validation of series is done here.
...@@ -146,7 +146,7 @@ class Backtrack: ...@@ -146,7 +146,7 @@ class Backtrack:
Returns Returns
------- -------
Backtrack History
""" """
# internal detail: # internal detail:
# ensure continuous increasing columns # ensure continuous increasing columns
...@@ -157,7 +157,7 @@ class Backtrack: ...@@ -157,7 +157,7 @@ class Backtrack:
assert nr == 0 assert nr == 0
self.mask[nr] = pd.Series(True, index=s.index, dtype=bool) self.mask[nr] = pd.Series(True, index=s.index, dtype=bool)
self.bt[nr] = s self.hist[nr] = s
return self return self
if force: if force:
...@@ -168,23 +168,23 @@ class Backtrack: ...@@ -168,23 +168,23 @@ class Backtrack:
if nr == len(self): if nr == len(self):
self.mask[nr] = True self.mask[nr] = True
self.bt[nr] = s self.hist[nr] = s
return self return self
def append(self, value: pd.Series, force=False) -> Backtrack: def append(self, value: pd.Series, force=False) -> History:
""" """
Create a new BT column and insert given pd.Series to it. Create a new FH column and insert given pd.Series to it.
Parameters Parameters
---------- ----------
value : pd.Series value : pd.Series
the data to append. Must have dtype float and the index must the data to append. Must have dtype float and the index must
match the index of the BT. match the index of the FH.
force : bool, default False force : bool, default False
if True the internal mask is updated in a way that the currently if True the internal mask is updated in a way that the currently
set value (series values) will be returned if ``Backtrack.max()`` set value (series values) will be returned if ``History.max()``
is called. This applies to all valid values (not ``np.nan`` and is called. This applies to all valid values (not ``np.nan`` and
not ``-np.inf``). not ``-np.inf``).
...@@ -195,7 +195,7 @@ class Backtrack: ...@@ -195,7 +195,7 @@ class Backtrack:
Returns Returns
------- -------
Backtrack: BT with appended series History: FH with appended series
""" """
s = self._validate_value(value) s = self._validate_value(value)
...@@ -203,16 +203,16 @@ class Backtrack: ...@@ -203,16 +203,16 @@ class Backtrack:
raise ValueError('Cannot append empty pd.Series') raise ValueError('Cannot append empty pd.Series')
if not self.empty and not s.index.equals(self.index): if not self.empty and not s.index.equals(self.index):
raise ValueError("Index must be equal to BT's index") raise ValueError("Index must be equal to FH's index")
self._insert(value, nr=len(self), force=force) self._insert(value, nr=len(self), force=force)
return self return self
def squeeze(self, n: int) -> Backtrack: def squeeze(self, n: int) -> History:
""" """
Squeeze last `n` columns to a single column. Squeeze last `n` columns to a single column.
This does **not** change the result of ``Backtrack.max()``. This does **not** change the result of ``History.max()``.
Parameters Parameters
---------- ----------
...@@ -229,18 +229,18 @@ class Backtrack: ...@@ -229,18 +229,18 @@ class Backtrack:
Returns Returns
------- -------
Backtrack History
squeezed backtrack squeezed history
""" """
if n <= 1: if n <= 1:
return self return self
if n > len(self): if n > len(self):
raise ValueError(f"'n={n}' cannot be greater than columns in the BT") raise ValueError(f"'n={n}' cannot be greater than columns in the FH")
# shortcut # shortcut
if len(self) == n: if len(self) == n:
self.bt = pd.DataFrame() self.hist = pd.DataFrame()
self.mask = pd.DataFrame() self.mask = pd.DataFrame()
s = self.max() s = self.max()
...@@ -248,16 +248,16 @@ class Backtrack: ...@@ -248,16 +248,16 @@ class Backtrack:
# calc the squeezed series. # calc the squeezed series.
# we dont have to care about any forced series # we dont have to care about any forced series
# because anytime force was given, the False's in # because anytime force was given, the False's in
# the mask were propagated back over the whole BT # the mask were propagated back over the whole FH
mask = self.mask.iloc[:, -n:] mask = self.mask.iloc[:, -n:]
bt = self.bt.iloc[:, -n:] hist = self.hist.iloc[:, -n:]
s = bt[mask].max(axis=1) s = hist[mask].max(axis=1)
# slice self down # slice self down
# this may leave us in an unstable state, because # this may leave us in an unstable state, because
# the last column may not be entirely True, but # the last column may not be entirely True, but
# the following append, will fix this # the following append, will fix this
self.bt = self.bt.iloc[:, :-n] self.hist = self.hist.iloc[:, :-n]
self.mask = self.mask.iloc[:, :-n] self.mask = self.mask.iloc[:, :-n]
self.append(s) self.append(s)
...@@ -265,21 +265,21 @@ class Backtrack: ...@@ -265,21 +265,21 @@ class Backtrack:
def max(self) -> pd.Series: def max(self) -> pd.Series:
""" """
Get the maximum value per row of the BT. Get the maximum value per row of the FH.
Returns Returns
------- -------
pd.Series: maximum values pd.Series: maximum values
""" """
return self.bt[self.mask].max(axis=1) return self.hist[self.mask].max(axis=1)
@property @property
def _constructor(self) -> Type['Backtrack']: def _constructor(self) -> Type['History']:
return Backtrack return History
def copy(self, deep=True) -> Backtrack: def copy(self, deep=True) -> History:
""" """
Make a copy of the BT. Make a copy of the FH.
Parameters Parameters
---------- ----------
...@@ -289,20 +289,20 @@ class Backtrack: ...@@ -289,20 +289,20 @@ class Backtrack:
Returns Returns
------- -------
copy : Backtrack copy : History
the copied BT the copied FH
""" """
return self._constructor(bt=self, copy=deep) return self._constructor(hist=self, copy=deep)
def __len__(self) -> int: def __len__(self) -> int:
return len(self.bt.columns) return len(self.hist.columns)
def __repr__(self): def __repr__(self):
if self.empty: if self.empty:
return str(self.bt).replace('DataFrame', 'Backtrack') return str(self.hist).replace('DataFrame', 'History')
repr = self.bt.astype(str) repr = self.hist.astype(str)
m = self.mask m = self.mask
repr[m] = ' ' + repr[m] + ' ' repr[m] = ' ' + repr[m] + ' '
...@@ -314,13 +314,13 @@ class Backtrack: ...@@ -314,13 +314,13 @@ class Backtrack:
# validation # validation
# #
def _validate_bt_with_mask(self, obj: pd.DataFrame, mask: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: def _validate_hist_with_mask(self, obj: pd.DataFrame, mask: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
""" """
check type, columns, index, dtype and if the mask fits the obj. check type, columns, index, dtype and if the mask fits the obj.
""" """
# check bt # check hist
self._validate_bt(obj) self._validate_hist(obj)
# check mask # check mask
if not isinstance(mask, pd.DataFrame): if not isinstance(mask, pd.DataFrame):
...@@ -332,25 +332,25 @@ class Backtrack: ...@@ -332,25 +332,25 @@ class Backtrack:
if not mask.empty and not mask.iloc[:, -1].all(): if not mask.empty and not mask.iloc[:, -1].all():
raise ValueError("the values in the last column in mask must be 'True' everywhere.") raise ValueError("the values in the last column in mask must be 'True' everywhere.")
# check combination of bt and mask # check combination of hist and mask
if not obj.columns.equals(mask.columns): if not obj.columns.equals(mask.columns):
raise ValueError("'bt' and 'mask' must have same columns") raise ValueError("'hist' and 'mask' must have same columns")
if not obj.index.equals(mask.index): if not obj.index.equals(mask.index):
raise ValueError("'bt' and 'mask' must have same index") raise ValueError("'hist' and 'mask' must have same index")
return obj, mask return obj, mask
def _validate_bt(self, obj: pd.DataFrame) -> pd.DataFrame: def _validate_hist(self, obj: pd.DataFrame) -> pd.DataFrame:
""" """
check type, columns, dtype of obj. check type, columns, dtype of obj.
""" """
if not isinstance(obj, pd.DataFrame): if not isinstance(obj, pd.DataFrame):
raise TypeError(f"'bt' must be of type pd.DataFrame, but {type(obj).__name__} was given") raise TypeError(f"'hist' must be of type pd.DataFrame, but {type(obj).__name__} was given")
if any(obj.dtypes != float): if any(obj.dtypes != float):
raise ValueError('dtype of all columns in bt must be float') raise ValueError('dtype of all columns in hist must be float')
if not obj.empty and ( if not obj.empty and (
not obj.columns.equals(pd.Index(range(len(obj.columns)))) not obj.columns.equals(pd.Index(range(len(obj.columns))))
......
...@@ -5,9 +5,9 @@ import numpy as np ...@@ -5,9 +5,9 @@ import numpy as np
import pandas as pd import pandas as pd
from pandas.api.types import is_bool_dtype from pandas.api.types import is_bool_dtype
from test.common import TESTFLAGGER, initData from test.common import TESTFLAGGER, initData
from saqc.flagger.backtrack import Backtrack from saqc.flagger.history import History
# see #GH143 combined bt # see #GH143 combined backtrack
# (adjusted to current implementation) # (adjusted to current implementation)
example1 = ( example1 = (
...@@ -76,66 +76,66 @@ data = [ ...@@ -76,66 +76,66 @@ data = [
] ]
def check_invariants(bt): def check_invariants(hist):
""" """
This can be called for **any** BT. This can be called for **any** FH.
The assertions must hold in any case. The assertions must hold in any case.
""" """
# basics # basics
assert isinstance(bt, Backtrack) assert isinstance(hist, History)
assert isinstance(bt.bt, pd.DataFrame) assert isinstance(hist.hist, pd.DataFrame)
assert isinstance(bt.mask, pd.DataFrame) assert isinstance(hist.mask, pd.DataFrame)
assert all(bt.bt.dtypes == float) assert all(hist.hist.dtypes == float)
assert all(bt.mask.dtypes == bool) assert all(hist.mask.dtypes == bool)
assert bt.bt.columns.equals(bt.mask.columns) assert hist.hist.columns.equals(hist.mask.columns)
assert bt.columns is bt.bt.columns assert hist.columns is hist.hist.columns
assert bt.index is bt.bt.index assert hist.index is hist.hist.index
assert len(bt) == len(bt.columns) assert len(hist) == len(hist.columns)
# advanced # advanced
assert bt.columns.equals(pd.Index(range(len(bt)))) assert hist.columns.equals(pd.Index(range(len(hist))))
assert isinstance(bt.max(), pd.Series) assert isinstance(hist.max(), pd.Series)
assert bt.mask.empty or bt.mask.iloc[:, -1].all() assert hist.mask.empty or hist.mask.iloc[:, -1].all()
# False propagation # False propagation
# for each row this must hold: # for each row this must hold:
# either the row has one change (False->True) # either the row has one change (False->True)
# or the entire row is True # or the entire row is True
if not bt.empty: if not hist.empty:
idxmax = bt.mask.idxmax(axis=1) idxmax = hist.mask.idxmax(axis=1)
for row, col in idxmax.items(): for row, col in idxmax.items():
assert all(bt.mask.iloc[row, :col] == False) assert all(hist.mask.iloc[row, :col] == False)
assert all(bt.mask.iloc[row, col:] == True) assert all(hist.mask.iloc[row, col:] == True)
def is_equal(bt1: Backtrack, bt2: Backtrack): def is_equal(hist1: History, hist2: History):
""" """
Check if two BT are (considered) equal, namely Check if two FH are (considered) equal, namely
have equal 'bt' and equal 'mask'. have equal 'hist' and equal 'mask'.
""" """
return bt1.bt.equals(bt2.bt) and bt1.mask.equals(bt2.mask) return hist1.hist.equals(hist2.hist) and hist1.mask.equals(hist2.mask)
@pytest.mark.parametrize('data', data + [None]) @pytest.mark.parametrize('data', data + [None])
def test_init(data: np.array): def test_init(data: np.array):
# init # init
df = pd.DataFrame(data, dtype=float) df = pd.DataFrame(data, dtype=float)
bt = Backtrack(bt=df) hist = History(hist=df)
check_invariants(bt) check_invariants(hist)
# shape would fail # shape would fail
if data is not None: if data is not None:
assert len(bt.index) == data.shape[0] assert len(hist.index) == data.shape[0]
assert len(bt.columns) == data.shape[1] assert len(hist.columns) == data.shape[1]
assert bt.mask.all(axis=None) assert hist.mask.all(axis=None)
# check fastpath # check fastpath
fast = Backtrack(bt=bt) fast = History(hist=hist)
check_invariants(fast) check_invariants(fast)
assert is_equal(bt, fast) assert is_equal(hist, fast)
@pytest.mark.parametrize('data', data + [None]) @pytest.mark.parametrize('data', data + [None])
...@@ -145,52 +145,52 @@ def test_init_with_mask(data: np.array): ...@@ -145,52 +145,52 @@ def test_init_with_mask(data: np.array):
mask = pd.DataFrame(data, dtype=bool) mask = pd.DataFrame(data, dtype=bool)
if not mask.empty: if not mask.empty:
mask.iloc[:, -1] = True mask.iloc[:, -1] = True
bt = Backtrack(bt=df, mask=mask) hist = History(hist=df, mask=mask)
check_invariants(bt) check_invariants(hist)
# shape would fail # shape would fail
if data is not None: if data is not None:
assert len(bt.index) == data.shape[0] assert len(hist.index) == data.shape[0]
assert len(bt.columns) == data.shape[1] assert len(hist.columns) == data.shape[1]
# check fastpath # check fastpath
fast = Backtrack(bt=bt) fast = History(hist=hist)
check_invariants(fast) check_invariants(fast)
assert is_equal(bt, fast) assert is_equal(hist, fast)
@pytest.mark.parametrize('data', data + [None]) @pytest.mark.parametrize('data', data + [None])
def test_copy(data): def test_copy(data):
# init # init
df = pd.DataFrame(data, dtype=float) df = pd.DataFrame(data, dtype=float)
bt = Backtrack(bt=df) hist = History(hist=df)
shallow = bt.copy(deep=False) shallow = hist.copy(deep=False)
deep = bt.copy(deep=True) deep = hist.copy(deep=True)
# checks # checks
for copy in [deep, shallow]: for copy in [deep, shallow]:
check_invariants(copy) check_invariants(copy)
assert copy is not bt assert copy is not hist
assert is_equal(copy, bt) assert is_equal(copy, hist)
assert deep is not shallow assert deep is not shallow
assert is_equal(deep, shallow) assert is_equal(deep, shallow)
assert deep.bt is not bt.bt assert deep.hist is not hist.hist
assert deep.mask is not bt.mask assert deep.mask is not hist.mask
assert shallow.bt is bt.bt assert shallow.hist is hist.hist
assert shallow.mask is bt.mask assert shallow.mask is hist.mask
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def __bt(): def __hist():
# this BT is filled by # this FH is filled by
# - test_append # - test_append
# - test_append_force # - test_append_force
return Backtrack() return History()
@pytest.mark.parametrize('s, max_val', [ @pytest.mark.parametrize('s, max_val', [
...@@ -201,15 +201,15 @@ def __bt(): ...@@ -201,15 +201,15 @@ def __bt():
[0, 1, 1, 1, 1] # expected max-val [0, 1, 1, 1, 1] # expected max-val
) )
]) ])
def test_append(__bt, s, max_val): def test_append(__hist, s, max_val):
bt = __bt hist = __hist
bt.append(s, force=False) hist.append(s, force=False)
check_invariants(bt) check_invariants(hist)
assert all(bt.max() == max_val) assert all(hist.max() == max_val)
# this test appends more columns to the resulting # this test appends more columns to the resulting
# BT from the former test # FH from the former test
@pytest.mark.parametrize('s, max_val', [ @pytest.mark.parametrize('s, max_val', [
(pd.Series(val, index=range(6), dtype=float), max_val) (pd.Series(val, index=range(6), dtype=float), max_val)
for val, max_val for val, max_val
...@@ -218,11 +218,11 @@ def test_append(__bt, s, max_val): ...@@ -218,11 +218,11 @@ def test_append(__bt, s, max_val):
[0, 1, 1, 0], # expected max-val [0, 1, 1, 0], # expected max-val
) )
]) ])
def test_append_force(__bt, s, max_val): def test_append_force(__hist, s, max_val):
bt = __bt hist = __hist
bt.append(s, force=True) hist.append(s, force=True)
check_invariants(bt) check_invariants(hist)
assert all(bt.max() == max_val) assert all(hist.max() == max_val)
def test_squeeze(): def test_squeeze():
...@@ -230,7 +230,7 @@ def test_squeeze(): ...@@ -230,7 +230,7 @@ def test_squeeze():
d, m, exp = example2 d, m, exp = example2
d = pd.DataFrame(d, dtype=float) d = pd.DataFrame(d, dtype=float)
m = pd.DataFrame(m, dtype=bool) m = pd.DataFrame(m, dtype=bool)
orig = Backtrack(bt=d, mask=m) orig = History(hist=d, mask=m)
check_invariants(orig) check_invariants(orig)
assert all(orig.max() == exp) assert all(orig.max() == exp)
...@@ -238,17 +238,17 @@ def test_squeeze(): ...@@ -238,17 +238,17 @@ def test_squeeze():
# checks # checks
for n in range(len(orig)): for n in range(len(orig)):
bt = orig.copy() hist = orig.copy()
bt.squeeze(n) hist.squeeze(n)
check_invariants(bt) check_invariants(hist)
# squeezing fewer than 2 columns does nothing # squeezing fewer than 2 columns does nothing
if n < 2: if n < 2:
assert is_equal(bt, orig) assert is_equal(hist, orig)
else: else:
assert len(bt) == len(orig) - n + 1 assert len(hist) == len(orig) - n + 1
# result does not change # result does not change
assert all(bt.max() == exp) assert all(hist.max() == exp)
print(bt) print(hist)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment