Skip to content
Snippets Groups Projects
Commit 17bcf742 authored by Bert Palm's avatar Bert Palm 🎇
Browse files

refactored Backtrack -> (Flag-)History

parent f7be0d54
No related branches found
No related tags found
1 merge request!218Flags
......@@ -6,19 +6,19 @@ import pandas as pd
import numpy as np
class Backtrack:
class History:
"""
Saqc internal storage for the history of a (single) flags column.
The backtrack (BT) stores the history of a flags column. Each time
``append`` is called a new column is appended to the BT. The column
The flag-history (FH) stores the history of a flags column. Each time
``append`` is called a new column is appended to the FH. The column
names are increasing integers starting with 0. After initialisation
the BT is empty and has no columns at all. If an initial `UNFLAGGED`-
column is desired, it must created manually, or passed via the ``bt``
parameter. The same way a new BT can be created from an existing one.
the FH is empty and has no columns at all. If an initial `UNFLAGGED`-
column is desired, it must be created manually, or passed via the ``hist``
parameter. The same way a new FH can be created from an existing one.
To get the worst flags (highest value) that are currently stored in
the BT, we provide a ``max()`` method. It returns a pd.Series indicating
the FH, we provide a ``max()`` method. It returns a pd.Series indicating
the worst flag per row.
To counteract the problem, that one may want to force a better flag
......@@ -32,62 +32,62 @@ class Backtrack:
Parameters
----------
bt : pd.Dataframe, default None
if None a empty BT is created, otherwise the existing dataframe
is taken as the initial backtrack.
hist : pd.Dataframe, default None
if None, an empty FH is created; otherwise the existing dataframe
is taken as the initial history.
mask : pd.Dataframe, default None
a mask holding the boolean force values. It must match the passed
``bt``. If None an matching mask is created, assuming force never
``hist``. If None, a matching mask is created, assuming force never
was passed to any test.
copy : bool, default False
If True, the input data is copied, otherwise not.
"""
def __init__(self, bt: pd.DataFrame = None, mask: pd.DataFrame = None, copy: bool = False):
def __init__(self, hist: pd.DataFrame = None, mask: pd.DataFrame = None, copy: bool = False):
# this is a hidden _feature_ and not exposed by the type
# of the bt parameter and serve as a fastpath for internal
# fast creation of a new BT, where no checks are needed.
if isinstance(bt, Backtrack):
# keep this order, otherwise bt.mask
# of the hist parameter and serves as a fastpath for internal
# fast creation of a new FH, where no checks are needed.
if isinstance(hist, History):
# keep this order, otherwise hist.mask
# will refer to pd.Dataframe.mask
mask = bt.mask
bt = bt.bt
mask = hist.mask
hist = hist.hist
elif bt is None and mask is None:
bt = pd.DataFrame()
elif hist is None and mask is None:
hist = pd.DataFrame()
mask = pd.DataFrame()
elif bt is None and mask is not None:
raise ValueError("Cannot take 'mask' with no 'bt'")
elif hist is None and mask is not None:
raise ValueError("Cannot take 'mask' with no 'hist'")
elif bt is not None and mask is None:
bt = self._validate_bt(bt)
mask = pd.DataFrame(True, index=bt.index, columns=bt.columns)
elif hist is not None and mask is None:
hist = self._validate_hist(hist)
mask = pd.DataFrame(True, index=hist.index, columns=hist.columns)
else:
bt, mask = self._validate_bt_with_mask(bt, mask)
hist, mask = self._validate_hist_with_mask(hist, mask)
if copy:
bt = bt.copy()
hist = hist.copy()
mask = mask.copy()
self.bt = bt
self.hist = hist
self.mask = mask
@property
def index(self) -> pd.Index:
"""
The index of BT.
The index of FH.
The index is the same for all columns.
Notes
-----
The index should always be equal to the flags series,
this is BT is associated with. If this is messed up
this FH is associated with. If this is messed up
something went wrong in saqc internals or in a user-
defined test.
......@@ -95,12 +95,12 @@ class Backtrack:
-------
index : pd.Index
"""
return self.bt.index
return self.hist.index
@property
def columns(self) -> pd.Index:
"""
Columns of the BT.
Columns of the FH.
The columns are always continuously
increasing integers, starting from 0.
......@@ -109,27 +109,27 @@ class Backtrack:
-------
columns : pd.Index
"""
return self.bt.columns
return self.hist.columns
@property
def empty(self) -> bool:
"""
Indicator whether Backtrack is empty.
Indicator whether History is empty.
True if Backtrack is entirely empty (no items).
True if History is entirely empty (no items).
Returns
-------
bool
If Backtrack is empty, return True, if not return False.
If History is empty, return True, if not return False.
"""
# we take self.mask here, because it cannot have NaN's,
# but self.bt could have -> see pd.DataFrame.empty
# but self.hist could have -> see pd.DataFrame.empty
return self.mask.empty
def _insert(self, s: pd.Series, nr: int, force=False) -> Backtrack:
def _insert(self, s: pd.Series, nr: int, force=False) -> History:
"""
Insert data at an arbitrary position in the BT.
Insert data at an arbitrary position in the FH.
No validation of series is done here.
......@@ -146,7 +146,7 @@ class Backtrack:
Returns
-------
Backtrack
History
"""
# internal detail:
# ensure continuous increasing columns
......@@ -157,7 +157,7 @@ class Backtrack:
assert nr == 0
self.mask[nr] = pd.Series(True, index=s.index, dtype=bool)
self.bt[nr] = s
self.hist[nr] = s
return self
if force:
......@@ -168,23 +168,23 @@ class Backtrack:
if nr == len(self):
self.mask[nr] = True
self.bt[nr] = s
self.hist[nr] = s
return self
def append(self, value: pd.Series, force=False) -> Backtrack:
def append(self, value: pd.Series, force=False) -> History:
"""
Create a new BT column and insert given pd.Series to it.
Create a new FH column and insert given pd.Series to it.
Parameters
----------
value : pd.Series
the data to append. Must have dtype float and the index must
match the index of the BT.
match the index of the FH.
force : bool, default False
if True the internal mask is updated in a way that the currently
set value (series values) will be returned if ``Backtrack.max()``
set value (series values) will be returned if ``History.max()``
is called. This applies to all valid values (not ``np.nan`` and
not ``-np.inf``).
......@@ -195,7 +195,7 @@ class Backtrack:
Returns
-------
Backtrack: BT with appended series
History: FH with appended series
"""
s = self._validate_value(value)
......@@ -203,16 +203,16 @@ class Backtrack:
raise ValueError('Cannot append empty pd.Series')
if not self.empty and not s.index.equals(self.index):
raise ValueError("Index must be equal to BT's index")
raise ValueError("Index must be equal to FH's index")
self._insert(value, nr=len(self), force=force)
return self
def squeeze(self, n: int) -> Backtrack:
def squeeze(self, n: int) -> History:
"""
Squeeze last `n` columns to a single column.
This **not** changes the result of ``Backtrack.max()``.
This does **not** change the result of ``History.max()``.
Parameters
----------
......@@ -229,18 +229,18 @@ class Backtrack:
Returns
-------
Backtrack
squeezed backtrack
History
squeezed history
"""
if n <= 1:
return self
if n > len(self):
raise ValueError(f"'n={n}' cannot be greater than columns in the BT")
raise ValueError(f"'n={n}' cannot be greater than columns in the FH")
# shortcut
if len(self) == n:
self.bt = pd.DataFrame()
self.hist = pd.DataFrame()
self.mask = pd.DataFrame()
s = self.max()
......@@ -248,16 +248,16 @@ class Backtrack:
# calc the squeezed series.
# we don't have to care about any forced series
# because anytime force was given, the False's in
# the mask were propagated back over the whole BT
# the mask were propagated back over the whole FH
mask = self.mask.iloc[:, -n:]
bt = self.bt.iloc[:, -n:]
s = bt[mask].max(axis=1)
hist = self.hist.iloc[:, -n:]
s = hist[mask].max(axis=1)
# slice self down
# this may leave us in an unstable state, because
# the last column may not be entirely True, but
# the following append, will fix this
self.bt = self.bt.iloc[:, :-n]
self.hist = self.hist.iloc[:, :-n]
self.mask = self.mask.iloc[:, :-n]
self.append(s)
......@@ -265,21 +265,21 @@ class Backtrack:
def max(self) -> pd.Series:
"""
Get the maximum value per row of the BT.
Get the maximum value per row of the FH.
Returns
-------
pd.Series: maximum values
"""
return self.bt[self.mask].max(axis=1)
return self.hist[self.mask].max(axis=1)
@property
def _constructor(self) -> Type['Backtrack']:
return Backtrack
def _constructor(self) -> Type['History']:
return History
def copy(self, deep=True) -> Backtrack:
def copy(self, deep=True) -> History:
"""
Make a copy of the BT.
Make a copy of the FH.
Parameters
----------
......@@ -289,20 +289,20 @@ class Backtrack:
Returns
-------
copy : Backtrack
the copied BT
copy : History
the copied FH
"""
return self._constructor(bt=self, copy=deep)
return self._constructor(hist=self, copy=deep)
def __len__(self) -> int:
return len(self.bt.columns)
return len(self.hist.columns)
def __repr__(self):
if self.empty:
return str(self.bt).replace('DataFrame', 'Backtrack')
return str(self.hist).replace('DataFrame', 'History')
repr = self.bt.astype(str)
repr = self.hist.astype(str)
m = self.mask
repr[m] = ' ' + repr[m] + ' '
......@@ -314,13 +314,13 @@ class Backtrack:
# validation
#
def _validate_bt_with_mask(self, obj: pd.DataFrame, mask: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
def _validate_hist_with_mask(self, obj: pd.DataFrame, mask: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
check type, columns, index, dtype and if the mask fits the obj.
"""
# check bt
self._validate_bt(obj)
# check hist
self._validate_hist(obj)
# check mask
if not isinstance(mask, pd.DataFrame):
......@@ -332,25 +332,25 @@ class Backtrack:
if not mask.empty and not mask.iloc[:, -1].all():
raise ValueError("the values in the last column in mask must be 'True' everywhere.")
# check combination of bt and mask
# check combination of hist and mask
if not obj.columns.equals(mask.columns):
raise ValueError("'bt' and 'mask' must have same columns")
raise ValueError("'hist' and 'mask' must have same columns")
if not obj.index.equals(mask.index):
raise ValueError("'bt' and 'mask' must have same index")
raise ValueError("'hist' and 'mask' must have same index")
return obj, mask
def _validate_bt(self, obj: pd.DataFrame) -> pd.DataFrame:
def _validate_hist(self, obj: pd.DataFrame) -> pd.DataFrame:
"""
check type, columns, dtype of obj.
"""
if not isinstance(obj, pd.DataFrame):
raise TypeError(f"'bt' must be of type pd.DataFrame, but {type(obj).__name__} was given")
raise TypeError(f"'hist' must be of type pd.DataFrame, but {type(obj).__name__} was given")
if any(obj.dtypes != float):
raise ValueError('dtype of all columns in bt must be float')
raise ValueError('dtype of all columns in hist must be float')
if not obj.empty and (
not obj.columns.equals(pd.Index(range(len(obj.columns))))
......
......@@ -5,9 +5,9 @@ import numpy as np
import pandas as pd
from pandas.api.types import is_bool_dtype
from test.common import TESTFLAGGER, initData
from saqc.flagger.backtrack import Backtrack
from saqc.flagger.history import History
# see #GH143 combined bt
# see #GH143 combined backtrack
# (adjusted to current implementation)
example1 = (
......@@ -76,66 +76,66 @@ data = [
]
def check_invariants(bt):
def check_invariants(hist):
"""
This can be called for **any** BT.
This can be called for **any** FH.
The assertions must hold in any case.
"""
# basics
assert isinstance(bt, Backtrack)
assert isinstance(bt.bt, pd.DataFrame)
assert isinstance(bt.mask, pd.DataFrame)
assert all(bt.bt.dtypes == float)
assert all(bt.mask.dtypes == bool)
assert bt.bt.columns.equals(bt.mask.columns)
assert bt.columns is bt.bt.columns
assert bt.index is bt.bt.index
assert len(bt) == len(bt.columns)
assert isinstance(hist, History)
assert isinstance(hist.hist, pd.DataFrame)
assert isinstance(hist.mask, pd.DataFrame)
assert all(hist.hist.dtypes == float)
assert all(hist.mask.dtypes == bool)
assert hist.hist.columns.equals(hist.mask.columns)
assert hist.columns is hist.hist.columns
assert hist.index is hist.hist.index
assert len(hist) == len(hist.columns)
# advanced
assert bt.columns.equals(pd.Index(range(len(bt))))
assert isinstance(bt.max(), pd.Series)
assert bt.mask.empty or bt.mask.iloc[:, -1].all()
assert hist.columns.equals(pd.Index(range(len(hist))))
assert isinstance(hist.max(), pd.Series)
assert hist.mask.empty or hist.mask.iloc[:, -1].all()
# False propagation
# for each row this must hold:
# either the row has one change (False->True)
# or the entire row is True
if not bt.empty:
idxmax = bt.mask.idxmax(axis=1)
if not hist.empty:
idxmax = hist.mask.idxmax(axis=1)
for row, col in idxmax.items():
assert all(bt.mask.iloc[row, :col] == False)
assert all(bt.mask.iloc[row, col:] == True)
assert all(hist.mask.iloc[row, :col] == False)
assert all(hist.mask.iloc[row, col:] == True)
def is_equal(bt1: Backtrack, bt2: Backtrack):
def is_equal(hist1: History, hist2: History):
"""
Check if two BT are (considered) equal, namely
have equal 'bt' and equal 'mask'.
Check if two FH are (considered) equal, namely
have equal 'hist' and equal 'mask'.
"""
return bt1.bt.equals(bt2.bt) and bt1.mask.equals(bt2.mask)
return hist1.hist.equals(hist2.hist) and hist1.mask.equals(hist2.mask)
@pytest.mark.parametrize('data', data + [None])
def test_init(data: np.array):
# init
df = pd.DataFrame(data, dtype=float)
bt = Backtrack(bt=df)
hist = History(hist=df)
check_invariants(bt)
check_invariants(hist)
# shape would fail
if data is not None:
assert len(bt.index) == data.shape[0]
assert len(bt.columns) == data.shape[1]
assert bt.mask.all(axis=None)
assert len(hist.index) == data.shape[0]
assert len(hist.columns) == data.shape[1]
assert hist.mask.all(axis=None)
# check fastpath
fast = Backtrack(bt=bt)
fast = History(hist=hist)
check_invariants(fast)
assert is_equal(bt, fast)
assert is_equal(hist, fast)
@pytest.mark.parametrize('data', data + [None])
......@@ -145,52 +145,52 @@ def test_init_with_mask(data: np.array):
mask = pd.DataFrame(data, dtype=bool)
if not mask.empty:
mask.iloc[:, -1] = True
bt = Backtrack(bt=df, mask=mask)
hist = History(hist=df, mask=mask)
check_invariants(bt)
check_invariants(hist)
# shape would fail
if data is not None:
assert len(bt.index) == data.shape[0]
assert len(bt.columns) == data.shape[1]
assert len(hist.index) == data.shape[0]
assert len(hist.columns) == data.shape[1]
# check fastpath
fast = Backtrack(bt=bt)
fast = History(hist=hist)
check_invariants(fast)
assert is_equal(bt, fast)
assert is_equal(hist, fast)
@pytest.mark.parametrize('data', data + [None])
def test_copy(data):
# init
df = pd.DataFrame(data, dtype=float)
bt = Backtrack(bt=df)
shallow = bt.copy(deep=False)
deep = bt.copy(deep=True)
hist = History(hist=df)
shallow = hist.copy(deep=False)
deep = hist.copy(deep=True)
# checks
for copy in [deep, shallow]:
check_invariants(copy)
assert copy is not bt
assert is_equal(copy, bt)
assert copy is not hist
assert is_equal(copy, hist)
assert deep is not shallow
assert is_equal(deep, shallow)
assert deep.bt is not bt.bt
assert deep.mask is not bt.mask
assert shallow.bt is bt.bt
assert shallow.mask is bt.mask
assert deep.hist is not hist.hist
assert deep.mask is not hist.mask
assert shallow.hist is hist.hist
assert shallow.mask is hist.mask
@pytest.fixture(scope='module')
def __bt():
# this BT is filled by
def __hist():
# this FH is filled by
# - test_append
# - test_append_force
return Backtrack()
return History()
@pytest.mark.parametrize('s, max_val', [
......@@ -201,15 +201,15 @@ def __bt():
[0, 1, 1, 1, 1] # expected max-val
)
])
def test_append(__bt, s, max_val):
bt = __bt
bt.append(s, force=False)
check_invariants(bt)
assert all(bt.max() == max_val)
def test_append(__hist, s, max_val):
hist = __hist
hist.append(s, force=False)
check_invariants(hist)
assert all(hist.max() == max_val)
# this test appends more rows to the resulting
# BT from the former test
# FH from the former test
@pytest.mark.parametrize('s, max_val', [
(pd.Series(val, index=range(6), dtype=float), max_val)
for val, max_val
......@@ -218,11 +218,11 @@ def test_append(__bt, s, max_val):
[0, 1, 1, 0], # expected max-val
)
])
def test_append_force(__bt, s, max_val):
bt = __bt
bt.append(s, force=True)
check_invariants(bt)
assert all(bt.max() == max_val)
def test_append_force(__hist, s, max_val):
hist = __hist
hist.append(s, force=True)
check_invariants(hist)
assert all(hist.max() == max_val)
def test_squeeze():
......@@ -230,7 +230,7 @@ def test_squeeze():
d, m, exp = example2
d = pd.DataFrame(d, dtype=float)
m = pd.DataFrame(m, dtype=bool)
orig = Backtrack(bt=d, mask=m)
orig = History(hist=d, mask=m)
check_invariants(orig)
assert all(orig.max() == exp)
......@@ -238,17 +238,17 @@ def test_squeeze():
# checks
for n in range(len(orig)):
bt = orig.copy()
bt.squeeze(n)
hist = orig.copy()
hist.squeeze(n)
check_invariants(bt)
check_invariants(hist)
# squeeze for less than 2 rows does nothing
if n < 2:
assert is_equal(bt, orig)
assert is_equal(hist, orig)
else:
assert len(bt) == len(orig) - n + 1
assert len(hist) == len(orig) - n + 1
# result does not change
assert all(bt.max() == exp)
print(bt)
assert all(hist.max() == exp)
print(hist)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment