diff --git a/saqc/flagger/backtrack.py b/saqc/flagger/history.py similarity index 65% rename from saqc/flagger/backtrack.py rename to saqc/flagger/history.py index 01b812ad6e25bf33520a9f452f1732a11f084a7d..8a1967ac58315caa794be79d0037780ac5d168da 100644 --- a/saqc/flagger/backtrack.py +++ b/saqc/flagger/history.py @@ -6,19 +6,19 @@ import pandas as pd import numpy as np -class Backtrack: +class History: """ Saqc internal storage for the history of a (single) flags column. - The backtrack (BT) stores the history of a flags column. Each time - ``append`` is called a new column is appended to the BT. The column + The flag-history (FH) stores the history of a flags column. Each time + ``append`` is called a new column is appended to the FH. The column names are increasing integers starting with 0. After initialisation - the BT is empty and has no columns at all. If an initial `UNFLAGGED`- - column is desired, it must created manually, or passed via the ``bt`` - parameter. The same way a new BT can be created from an existing one. + the FH is empty and has no columns at all. If an initial `UNFLAGGED`- + column is desired, it must be created manually, or passed via the ``hist`` + parameter. The same way a new FH can be created from an existing one. To get the worst flags (highest value) that are currently stored in - the BT, we provide a ``max()`` method. It returns a pd.Series indicating + the FH, we provide a ``max()`` method. It returns a pd.Series indicating the worst flag per row. To counteract the problem, that one may want to force a better flag @@ -32,62 +32,62 @@ class Backtrack: Parameters ---------- - bt : pd.Dataframe, default None - if None a empty BT is created, otherwise the existing dataframe - is taken as the initial backtrack. + hist : pd.Dataframe, default None + if None an empty FH is created, otherwise the existing dataframe + is taken as the initial history. mask : pd.Dataframe, default None a mask holding the boolean force values. 
It must match the passed - ``bt``. If None an matching mask is created, assuming force never + ``hist``. If None a matching mask is created, assuming force never was passed to any test. copy : bool, default False If True, the input data is copied, otherwise not. """ - def __init__(self, bt: pd.DataFrame = None, mask: pd.DataFrame = None, copy: bool = False): + def __init__(self, hist: pd.DataFrame = None, mask: pd.DataFrame = None, copy: bool = False): # this is a hidden _feature_ and not exposed by the type - # of the bt parameter and serve as a fastpath for internal - # fast creation of a new BT, where no checks are needed. - if isinstance(bt, Backtrack): - # keep this order, otherwise bt.mask + # of the hist parameter and serves as a fastpath for internal + # fast creation of a new FH, where no checks are needed. + if isinstance(hist, History): + # keep this order, otherwise hist.mask # will refer to pd.Dataframe.mask - mask = bt.mask - bt = bt.bt + mask = hist.mask + hist = hist.hist - elif bt is None and mask is None: - bt = pd.DataFrame() + elif hist is None and mask is None: + hist = pd.DataFrame() mask = pd.DataFrame() - elif bt is None and mask is not None: - raise ValueError("Cannot take 'mask' with no 'bt'") + elif hist is None and mask is not None: + raise ValueError("Cannot take 'mask' with no 'hist'") - elif bt is not None and mask is None: - bt = self._validate_bt(bt) - mask = pd.DataFrame(True, index=bt.index, columns=bt.columns) + elif hist is not None and mask is None: + hist = self._validate_hist(hist) + mask = pd.DataFrame(True, index=hist.index, columns=hist.columns) else: - bt, mask = self._validate_bt_with_mask(bt, mask) + hist, mask = self._validate_hist_with_mask(hist, mask) if copy: - bt = bt.copy() + hist = hist.copy() mask = mask.copy() - self.bt = bt + self.hist = hist self.mask = mask @property def index(self) -> pd.Index: """ - The index of BT. + The index of FH. The index is the same for all columns. 
Notes ----- The index should always be equal to the flags series, - this is BT is associated with. If this is messed up + this FH is associated with. If this is messed up something went wrong in saqc internals or in a user- defined test. @@ -95,12 +95,12 @@ class Backtrack: ------- index : pd.Index """ - return self.bt.index + return self.hist.index @property def columns(self) -> pd.Index: """ - Columns of the BT. + Columns of the FH. The columns are always continuously increasing integers, starting from 0. @@ -109,27 +109,27 @@ class Backtrack: ------- columns : pd.Index """ - return self.bt.columns + return self.hist.columns @property def empty(self) -> bool: """ - Indicator whether Backtrack is empty. + Indicator whether History is empty. - True if Backtrack is entirely empty (no items). + True if History is entirely empty (no items). Returns ------- bool - If Backtrack is empty, return True, if not return False. + If History is empty, return True, if not return False. """ # we take self.mask here, because it cannot have NaN's, - # but self.bt could have -> see pd.DataFrame.empty + # but self.hist could have -> see pd.DataFrame.empty return self.mask.empty - def _insert(self, s: pd.Series, nr: int, force=False) -> Backtrack: + def _insert(self, s: pd.Series, nr: int, force=False) -> History: """ - Insert data at an arbitrary position in the BT. + Insert data at an arbitrary position in the FH. No validation of series is done here. 
@@ -146,7 +146,7 @@ class Backtrack: Returns ------- - Backtrack + History """ # internal detail: # ensure continuous increasing columns @@ -157,7 +157,7 @@ class Backtrack: assert nr == 0 self.mask[nr] = pd.Series(True, index=s.index, dtype=bool) - self.bt[nr] = s + self.hist[nr] = s return self if force: @@ -168,23 +168,23 @@ class Backtrack: if nr == len(self): self.mask[nr] = True - self.bt[nr] = s + self.hist[nr] = s return self - def append(self, value: pd.Series, force=False) -> Backtrack: + def append(self, value: pd.Series, force=False) -> History: """ - Create a new BT column and insert given pd.Series to it. + Create a new FH column and insert given pd.Series to it. Parameters ---------- value : pd.Series the data to append. Must have dtype float and the index must - match the index of the BT. + match the index of the FH. force : bool, default False if True the internal mask is updated in a way that the currently - set value (series values) will be returned if ``Backtrack.max()`` + set value (series values) will be returned if ``History.max()`` is called. This apply for all valid values (not ``np.Nan`` and not ``-np.inf``). @@ -195,7 +195,7 @@ class Backtrack: Returns ------- - Backtrack: BT with appended series + History: FH with appended series """ s = self._validate_value(value) @@ -203,16 +203,16 @@ class Backtrack: raise ValueError('Cannot append empty pd.Series') if not self.empty and not s.index.equals(self.index): - raise ValueError("Index must be equal to BT's index") + raise ValueError("Index must be equal to FH's index") self._insert(value, nr=len(self), force=force) return self - def squeeze(self, n: int) -> Backtrack: + def squeeze(self, n: int) -> History: """ Squeeze last `n` columns to a single column. - This **not** changes the result of ``Backtrack.max()``. + This does **not** change the result of ``History.max()``. 
Parameters ---------- @@ -229,18 +229,18 @@ class Backtrack: Returns ------- - Backtrack - squeezed backtrack + History + squeezed history """ if n <= 1: return self if n > len(self): - raise ValueError(f"'n={n}' cannot be greater than columns in the BT") + raise ValueError(f"'n={n}' cannot be greater than columns in the FH") # shortcut if len(self) == n: - self.bt = pd.DataFrame() + self.hist = pd.DataFrame() self.mask = pd.DataFrame() s = self.max() @@ -248,16 +248,16 @@ class Backtrack: # calc the squeezed series. # we dont have to care about any forced series # because anytime force was given, the False's in - # the mask were propagated back over the whole BT + # the mask were propagated back over the whole FH mask = self.mask.iloc[:, -n:] - bt = self.bt.iloc[:, -n:] - s = bt[mask].max(axis=1) + hist = self.hist.iloc[:, -n:] + s = hist[mask].max(axis=1) # slice self down # this may leave us in an unstable state, because # the last column may not is entirely True, but # the following append, will fix this - self.bt = self.bt.iloc[:, :-n] + self.hist = self.hist.iloc[:, :-n] self.mask = self.mask.iloc[:, :-n] self.append(s) @@ -265,21 +265,21 @@ class Backtrack: def max(self) -> pd.Series: """ - Get the maximum value per row of the BT. + Get the maximum value per row of the FH. Returns ------- pd.Series: maximum values """ - return self.bt[self.mask].max(axis=1) + return self.hist[self.mask].max(axis=1) @property - def _constructor(self) -> Type['Backtrack']: - return Backtrack + def _constructor(self) -> Type['History']: + return History - def copy(self, deep=True) -> Backtrack: + def copy(self, deep=True) -> History: """ - Make a copy of the BT. + Make a copy of the FH. 
Parameters ---------- @@ -289,20 +289,20 @@ class Backtrack: Returns ------- - copy : Backtrack - the copied BT + copy : History + the copied FH """ - return self._constructor(bt=self, copy=deep) + return self._constructor(hist=self, copy=deep) def __len__(self) -> int: - return len(self.bt.columns) + return len(self.hist.columns) def __repr__(self): if self.empty: - return str(self.bt).replace('DataFrame', 'Backtrack') + return str(self.hist).replace('DataFrame', 'History') - repr = self.bt.astype(str) + repr = self.hist.astype(str) m = self.mask repr[m] = ' ' + repr[m] + ' ' @@ -314,13 +314,13 @@ class Backtrack: # validation # - def _validate_bt_with_mask(self, obj: pd.DataFrame, mask: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + def _validate_hist_with_mask(self, obj: pd.DataFrame, mask: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: """ check type, columns, index, dtype and if the mask fits the obj. """ - # check bt - self._validate_bt(obj) + # check hist + self._validate_hist(obj) # check mask if not isinstance(mask, pd.DataFrame): @@ -332,25 +332,25 @@ class Backtrack: if not mask.empty and not mask.iloc[:, -1].all(): raise ValueError("the values in the last column in mask must be 'True' everywhere.") - # check combination of bt and mask + # check combination of hist and mask if not obj.columns.equals(mask.columns): - raise ValueError("'bt' and 'mask' must have same columns") + raise ValueError("'hist' and 'mask' must have same columns") if not obj.index.equals(mask.index): - raise ValueError("'bt' and 'mask' must have same index") + raise ValueError("'hist' and 'mask' must have same index") return obj, mask - def _validate_bt(self, obj: pd.DataFrame) -> pd.DataFrame: + def _validate_hist(self, obj: pd.DataFrame) -> pd.DataFrame: """ check type, columns, dtype of obj. 
""" if not isinstance(obj, pd.DataFrame): - raise TypeError(f"'bt' must be of type pd.DataFrame, but {type(obj).__name__} was given") + raise TypeError(f"'hist' must be of type pd.DataFrame, but {type(obj).__name__} was given") if any(obj.dtypes != float): - raise ValueError('dtype of all columns in bt must be float') + raise ValueError('dtype of all columns in hist must be float') if not obj.empty and ( not obj.columns.equals(pd.Index(range(len(obj.columns)))) diff --git a/test/flagger/test_backtrack.py b/test/flagger/test_history.py similarity index 56% rename from test/flagger/test_backtrack.py rename to test/flagger/test_history.py index 51b86e5e8cfe7f3d72c2556a2af6382097cf9a7a..47c1306510752d02af0a62d2f80e62ad9e9bf695 100644 --- a/test/flagger/test_backtrack.py +++ b/test/flagger/test_history.py @@ -5,9 +5,9 @@ import numpy as np import pandas as pd from pandas.api.types import is_bool_dtype from test.common import TESTFLAGGER, initData -from saqc.flagger.backtrack import Backtrack +from saqc.flagger.history import History -# see #GH143 combined bt +# see #GH143 combined backtrack # (adjusted to current implementation) example1 = ( @@ -76,66 +76,66 @@ data = [ ] -def check_invariants(bt): +def check_invariants(hist): """ - This can be called for **any** BT. + This can be called for **any** FH. The assertions must hold in any case. 
""" # basics - assert isinstance(bt, Backtrack) - assert isinstance(bt.bt, pd.DataFrame) - assert isinstance(bt.mask, pd.DataFrame) - assert all(bt.bt.dtypes == float) - assert all(bt.mask.dtypes == bool) - assert bt.bt.columns.equals(bt.mask.columns) - assert bt.columns is bt.bt.columns - assert bt.index is bt.bt.index - assert len(bt) == len(bt.columns) + assert isinstance(hist, History) + assert isinstance(hist.hist, pd.DataFrame) + assert isinstance(hist.mask, pd.DataFrame) + assert all(hist.hist.dtypes == float) + assert all(hist.mask.dtypes == bool) + assert hist.hist.columns.equals(hist.mask.columns) + assert hist.columns is hist.hist.columns + assert hist.index is hist.hist.index + assert len(hist) == len(hist.columns) # advanced - assert bt.columns.equals(pd.Index(range(len(bt)))) - assert isinstance(bt.max(), pd.Series) - assert bt.mask.empty or bt.mask.iloc[:, -1].all() + assert hist.columns.equals(pd.Index(range(len(hist)))) + assert isinstance(hist.max(), pd.Series) + assert hist.mask.empty or hist.mask.iloc[:, -1].all() # False propagation # for each row this must hold: # either the row has one change (False->True) # or the entire row is True - if not bt.empty: - idxmax = bt.mask.idxmax(axis=1) + if not hist.empty: + idxmax = hist.mask.idxmax(axis=1) for row, col in idxmax.items(): - assert all(bt.mask.iloc[row, :col] == False) - assert all(bt.mask.iloc[row, col:] == True) + assert all(hist.mask.iloc[row, :col] == False) + assert all(hist.mask.iloc[row, col:] == True) -def is_equal(bt1: Backtrack, bt2: Backtrack): +def is_equal(hist1: History, hist2: History): """ - Check if two BT are (considered) equal, namely - have equal 'bt' and equal 'mask'. + Check if two FH are (considered) equal, namely + have equal 'hist' and equal 'mask'. 
""" - return bt1.bt.equals(bt2.bt) and bt1.mask.equals(bt2.mask) + return hist1.hist.equals(hist2.hist) and hist1.mask.equals(hist2.mask) @pytest.mark.parametrize('data', data + [None]) def test_init(data: np.array): # init df = pd.DataFrame(data, dtype=float) - bt = Backtrack(bt=df) + hist = History(hist=df) - check_invariants(bt) + check_invariants(hist) # shape would fail if data is not None: - assert len(bt.index) == data.shape[0] - assert len(bt.columns) == data.shape[1] - assert bt.mask.all(axis=None) + assert len(hist.index) == data.shape[0] + assert len(hist.columns) == data.shape[1] + assert hist.mask.all(axis=None) # check fastpath - fast = Backtrack(bt=bt) + fast = History(hist=hist) check_invariants(fast) - assert is_equal(bt, fast) + assert is_equal(hist, fast) @pytest.mark.parametrize('data', data + [None]) @@ -145,52 +145,52 @@ def test_init_with_mask(data: np.array): mask = pd.DataFrame(data, dtype=bool) if not mask.empty: mask.iloc[:, -1] = True - bt = Backtrack(bt=df, mask=mask) + hist = History(hist=df, mask=mask) - check_invariants(bt) + check_invariants(hist) # shape would fail if data is not None: - assert len(bt.index) == data.shape[0] - assert len(bt.columns) == data.shape[1] + assert len(hist.index) == data.shape[0] + assert len(hist.columns) == data.shape[1] # check fastpath - fast = Backtrack(bt=bt) + fast = History(hist=hist) check_invariants(fast) - assert is_equal(bt, fast) + assert is_equal(hist, fast) @pytest.mark.parametrize('data', data + [None]) def test_copy(data): # init df = pd.DataFrame(data, dtype=float) - bt = Backtrack(bt=df) - shallow = bt.copy(deep=False) - deep = bt.copy(deep=True) + hist = History(hist=df) + shallow = hist.copy(deep=False) + deep = hist.copy(deep=True) # checks for copy in [deep, shallow]: check_invariants(copy) - assert copy is not bt - assert is_equal(copy, bt) + assert copy is not hist + assert is_equal(copy, hist) assert deep is not shallow assert is_equal(deep, shallow) - assert deep.bt is not 
bt.bt - assert deep.mask is not bt.mask - assert shallow.bt is bt.bt - assert shallow.mask is bt.mask + assert deep.hist is not hist.hist + assert deep.mask is not hist.mask + assert shallow.hist is hist.hist + assert shallow.mask is hist.mask @pytest.fixture(scope='module') -def __bt(): - # this BT is filled by +def __hist(): + # this FH is filled by # - test_append # - test_append_force - return Backtrack() + return History() @pytest.mark.parametrize('s, max_val', [ @@ -201,15 +201,15 @@ def __bt(): [0, 1, 1, 1, 1] # expected max-val ) ]) -def test_append(__bt, s, max_val): - bt = __bt - bt.append(s, force=False) - check_invariants(bt) - assert all(bt.max() == max_val) +def test_append(__hist, s, max_val): + hist = __hist + hist.append(s, force=False) + check_invariants(hist) + assert all(hist.max() == max_val) # this test append more rows to the resulting -# BT from the former test +# FH from the former test @pytest.mark.parametrize('s, max_val', [ (pd.Series(val, index=range(6), dtype=float), max_val) for val, max_val @@ -218,11 +218,11 @@ def test_append(__bt, s, max_val): [0, 1, 1, 0], # expected max-val ) ]) -def test_append_force(__bt, s, max_val): - bt = __bt - bt.append(s, force=True) - check_invariants(bt) - assert all(bt.max() == max_val) +def test_append_force(__hist, s, max_val): + hist = __hist + hist.append(s, force=True) + check_invariants(hist) + assert all(hist.max() == max_val) def test_squeeze(): @@ -230,7 +230,7 @@ def test_squeeze(): d, m, exp = example2 d = pd.DataFrame(d, dtype=float) m = pd.DataFrame(m, dtype=bool) - orig = Backtrack(bt=d, mask=m) + orig = History(hist=d, mask=m) check_invariants(orig) assert all(orig.max() == exp) @@ -238,17 +238,17 @@ def test_squeeze(): # checks for n in range(len(orig)): - bt = orig.copy() - bt.squeeze(n) + hist = orig.copy() + hist.squeeze(n) - check_invariants(bt) + check_invariants(hist) # squeeze for less then 2 rows does nothing if n < 2: - assert is_equal(bt, orig) + assert is_equal(hist, orig) 
else: - assert len(bt) == len(orig) - n + 1 + assert len(hist) == len(orig) - n + 1 # result does not change - assert all(bt.max() == exp) - print(bt) + assert all(hist.max() == exp) + print(hist)