refactored Backtrack -> (Flag-)History

17bcf742 · Bert Palm · f7be0d54 · 17bcf742 · 17bcf742
Commit 17bcf742 authored 4 years ago by Bert Palm 🎇
--- a/saqc/flagger/backtrack.py
+++ b/saqc/flagger/backtrack.py
@@ -6,19 +6,19 @@ import pandas as pd
 import numpy as np


-class Backtrack:
+class History:
    """
    Saqc internal storage for the history of a (single) flags column.

-    The backtrack (BT) stores the history of a flags column. Each time
-    ``append`` is called a new column is appended to the BT. The column
+    The flag-history (FH) stores the history of a flags column. Each time
+    ``append`` is called a new column is appended to the FH. The column
    names are increasing integers starting with 0. After initialisation
-    the BT is empty and has no columns at all. If an initial `UNFLAGGED`-
-    column is desired, it must created manually, or passed via the ``bt``
-    parameter. The same way a new BT can be created from an existing one.
+    the FH is empty and has no columns at all. If an initial `UNFLAGGED`-
+    column is desired, it must be created manually, or passed via the ``hist``
+    parameter. The same way a new FH can be created from an existing one.

    To get the worst flags (highest value) that are currently stored in
-    the BT, we provide a ``max()`` method. It returns a pd.Series indicating
+    the FH, we provide a ``max()`` method. It returns a pd.Series indicating
    the worst flag per row.

    To counteract the problem, that one may want to force a better flag
@@ -32,62 +32,62 @@ class Backtrack:

    Parameters
    ----------
-    bt : pd.Dataframe, default None
-        if None a empty BT is created, otherwise the existing dataframe
-        is taken as the initial backtrack.
+    hist : pd.DataFrame, default None
+        if None, an empty FH is created, otherwise the existing dataframe
+        is taken as the initial history.

    mask : pd.Dataframe, default None
        a mask holding the boolean force values. It must match the passed
-        ``bt``. If None an matching mask is created, assuming force never
+        ``hist``. If None, a matching mask is created, assuming force never
        was passed to any test.

    copy : bool, default False
        If True, the input data is copied, otherwise not.
    """

-    def __init__(self, bt: pd.DataFrame = None, mask: pd.DataFrame = None, copy: bool = False):
+    def __init__(self, hist: pd.DataFrame = None, mask: pd.DataFrame = None, copy: bool = False):

        # this is a hidden _feature_ and not exposed by the type
-        # of the bt parameter and serve as a fastpath for internal
-        # fast creation of a new BT, where no checks are needed.
-        if isinstance(bt, Backtrack):
-            # keep this order, otherwise bt.mask
+        # of the hist parameter and serves as a fastpath for internal
+        # fast creation of a new FH, where no checks are needed.
+        if isinstance(hist, History):
+            # keep this order, otherwise hist.mask
            # will refer to pd.Dataframe.mask
-            mask = bt.mask
-            bt = bt.bt
+            mask = hist.mask
+            hist = hist.hist

-        elif bt is None and mask is None:
-            bt = pd.DataFrame()
+        elif hist is None and mask is None:
+            hist = pd.DataFrame()
            mask = pd.DataFrame()

-        elif bt is None and mask is not None:
-            raise ValueError("Cannot take 'mask' with no 'bt'")
+        elif hist is None and mask is not None:
+            raise ValueError("Cannot take 'mask' with no 'hist'")

-        elif bt is not None and mask is None:
-            bt = self._validate_bt(bt)
-            mask = pd.DataFrame(True, index=bt.index, columns=bt.columns)
+        elif hist is not None and mask is None:
+            hist = self._validate_hist(hist)
+            mask = pd.DataFrame(True, index=hist.index, columns=hist.columns)

        else:
-            bt, mask = self._validate_bt_with_mask(bt, mask)
+            hist, mask = self._validate_hist_with_mask(hist, mask)

        if copy:
-            bt = bt.copy()
+            hist = hist.copy()
            mask = mask.copy()

-        self.bt = bt
+        self.hist = hist
        self.mask = mask

    @property
    def index(self) -> pd.Index:
        """
-        The index of BT.
+        The index of FH.

        The index is the same for all columns.

        Notes
        -----
        The index should always be equal to the flags series,
-        this is BT is associated with. If this is messed up
+        this FH is associated with. If this is messed up
        something went wrong in saqc internals or in a user-
        defined test.

@@ -95,12 +95,12 @@ class Backtrack:
        -------
        index : pd.Index
        """
-        return self.bt.index
+        return self.hist.index

    @property
    def columns(self) -> pd.Index:
        """
-        Columns of the BT.
+        Columns of the FH.

        The columns are always continuously
        increasing integers, starting from 0.
@@ -109,27 +109,27 @@ class Backtrack:
        -------
        columns : pd.Index
        """
-        return self.bt.columns
+        return self.hist.columns

    @property
    def empty(self) -> bool:
        """
-        Indicator whether Backtrack is empty.
+        Indicator whether History is empty.

-        True if Backtrack is entirely empty (no items).
+        True if History is entirely empty (no items).

        Returns
        -------
        bool
-            If Backtrack is empty, return True, if not return False.
+            If History is empty, return True, if not return False.
        """
        # we take self.mask here, because it cannot have NaN's,
-        # but self.bt could have -> see pd.DataFrame.empty
+        # but self.hist could have -> see pd.DataFrame.empty
        return self.mask.empty

-    def _insert(self, s: pd.Series, nr: int, force=False) -> Backtrack:
+    def _insert(self, s: pd.Series, nr: int, force=False) -> History:
        """
-        Insert data at an arbitrary position in the BT.
+        Insert data at an arbitrary position in the FH.

        No validation of series is done here.

@@ -146,7 +146,7 @@ class Backtrack:

        Returns
        -------
-        Backtrack
+        History
        """
        # internal detail:
        # ensure continuous increasing columns
@@ -157,7 +157,7 @@ class Backtrack:
            assert nr == 0

            self.mask[nr] = pd.Series(True, index=s.index, dtype=bool)
-            self.bt[nr] = s
+            self.hist[nr] = s
            return self

        if force:
@@ -168,23 +168,23 @@ class Backtrack:
        if nr == len(self):
            self.mask[nr] = True

-        self.bt[nr] = s
+        self.hist[nr] = s

        return self

-    def append(self, value: pd.Series, force=False) -> Backtrack:
+    def append(self, value: pd.Series, force=False) -> History:
        """
-        Create a new BT column and insert given pd.Series to it.
+        Create a new FH column and insert given pd.Series to it.

        Parameters
        ----------
        value : pd.Series
            the data to append. Must have dtype float and the index must
-            match the index of the BT.
+            match the index of the FH.

        force : bool, default False
            if True the internal mask is updated in a way that the currently
-            set value (series values) will be returned if ``Backtrack.max()``
+            set value (series values) will be returned if ``History.max()``
            is called. This apply for all valid values (not ``np.Nan`` and
            not ``-np.inf``).

@@ -195,7 +195,7 @@ class Backtrack:

        Returns
        -------
-        Backtrack: BT with appended series
+        History: FH with appended series
        """
        s = self._validate_value(value)

@@ -203,16 +203,16 @@ class Backtrack:
            raise ValueError('Cannot append empty pd.Series')

        if not self.empty and not s.index.equals(self.index):
-            raise ValueError("Index must be equal to BT's index")
+            raise ValueError("Index must be equal to FH's index")

        self._insert(value, nr=len(self), force=force)
        return self

-    def squeeze(self, n: int) -> Backtrack:
+    def squeeze(self, n: int) -> History:
        """
        Squeeze last `n` columns to a single column.

-        This **not** changes the result of ``Backtrack.max()``.
+        This does **not** change the result of ``History.max()``.

        Parameters
        ----------
@@ -229,18 +229,18 @@ class Backtrack:

        Returns
        -------
-        Backtrack
-            squeezed backtrack
+        History
+            squeezed history
        """
        if n <= 1:
            return self

        if n > len(self):
-            raise ValueError(f"'n={n}' cannot be greater than columns in the BT")
+            raise ValueError(f"'n={n}' cannot be greater than columns in the FH")

        # shortcut
        if len(self) == n:
-            self.bt = pd.DataFrame()
+            self.hist = pd.DataFrame()
            self.mask = pd.DataFrame()
            s = self.max()

@@ -248,16 +248,16 @@ class Backtrack:
            # calc the squeezed series.
            # we dont have to care about any forced series
            # because anytime force was given, the False's in
-            # the mask were propagated back over the whole BT
+            # the mask were propagated back over the whole FH
            mask = self.mask.iloc[:, -n:]
-            bt = self.bt.iloc[:, -n:]
-            s = bt[mask].max(axis=1)
+            hist = self.hist.iloc[:, -n:]
+            s = hist[mask].max(axis=1)

            # slice self down
            # this may leave us in an unstable state, because
            # the last column may not is entirely True, but
            # the following append, will fix this
-            self.bt = self.bt.iloc[:, :-n]
+            self.hist = self.hist.iloc[:, :-n]
            self.mask = self.mask.iloc[:, :-n]

        self.append(s)
@@ -265,21 +265,21 @@ class Backtrack:

    def max(self) -> pd.Series:
        """
-        Get the maximum value per row of the BT.
+        Get the maximum value per row of the FH.

        Returns
        -------
        pd.Series: maximum values
        """
-        return self.bt[self.mask].max(axis=1)
+        return self.hist[self.mask].max(axis=1)

    @property
-    def _constructor(self) -> Type['Backtrack']:
-        return Backtrack
+    def _constructor(self) -> Type['History']:
+        return History

-    def copy(self, deep=True) -> Backtrack:
+    def copy(self, deep=True) -> History:
        """
-        Make a copy of the BT.
+        Make a copy of the FH.

        Parameters
        ----------
@@ -289,20 +289,20 @@ class Backtrack:

        Returns
        -------
-        copy : Backtrack
-            the copied BT
+        copy : History
+            the copied FH
        """
-        return self._constructor(bt=self, copy=deep)
+        return self._constructor(hist=self, copy=deep)

    def __len__(self) -> int:
-        return len(self.bt.columns)
+        return len(self.hist.columns)

    def __repr__(self):

        if self.empty:
-            return str(self.bt).replace('DataFrame', 'Backtrack')
+            return str(self.hist).replace('DataFrame', 'History')

-        repr = self.bt.astype(str)
+        repr = self.hist.astype(str)
        m = self.mask

        repr[m] = ' ' + repr[m] + ' '
@@ -314,13 +314,13 @@ class Backtrack:
    # validation
    #

-    def _validate_bt_with_mask(self, obj: pd.DataFrame, mask: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    def _validate_hist_with_mask(self, obj: pd.DataFrame, mask: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        check type, columns, index, dtype and if the mask fits the obj.
        """

-        # check bt
-        self._validate_bt(obj)
+        # check hist
+        self._validate_hist(obj)

        # check mask
        if not isinstance(mask, pd.DataFrame):
@@ -332,25 +332,25 @@ class Backtrack:
        if not mask.empty and not mask.iloc[:, -1].all():
            raise ValueError("the values in the last column in mask must be 'True' everywhere.")

-        # check combination of bt and mask
+        # check combination of hist and mask
        if not obj.columns.equals(mask.columns):
-            raise ValueError("'bt' and 'mask' must have same columns")
+            raise ValueError("'hist' and 'mask' must have same columns")

        if not obj.index.equals(mask.index):
-            raise ValueError("'bt' and 'mask' must have same index")
+            raise ValueError("'hist' and 'mask' must have same index")

        return obj, mask

-    def _validate_bt(self, obj: pd.DataFrame) -> pd.DataFrame:
+    def _validate_hist(self, obj: pd.DataFrame) -> pd.DataFrame:
        """
        check type, columns, dtype of obj.
        """

        if not isinstance(obj, pd.DataFrame):
-            raise TypeError(f"'bt' must be of type pd.DataFrame, but {type(obj).__name__} was given")
+            raise TypeError(f"'hist' must be of type pd.DataFrame, but {type(obj).__name__} was given")

        if any(obj.dtypes != float):
-            raise ValueError('dtype of all columns in bt must be float')
+            raise ValueError('dtype of all columns in hist must be float')

        if not obj.empty and (
                not obj.columns.equals(pd.Index(range(len(obj.columns))))

--- a/test/flagger/test_backtrack.py
+++ b/test/flagger/test_backtrack.py
@@ -5,9 +5,9 @@ import numpy as np
 import pandas as pd
 from pandas.api.types import is_bool_dtype
 from test.common import TESTFLAGGER, initData
-from saqc.flagger.backtrack import Backtrack
+from saqc.flagger.history import History

-# see #GH143 combined bt
+# see #GH143 combined backtrack
 # (adjusted to current implementation)
 example1 = (

@@ -76,66 +76,66 @@ data = [
 ]


-def check_invariants(bt):
+def check_invariants(hist):
    """
-    This can be called for **any** BT.
+    This can be called for **any** FH.
    The assertions must hold in any case.
    """
    # basics
-    assert isinstance(bt, Backtrack)
-    assert isinstance(bt.bt, pd.DataFrame)
-    assert isinstance(bt.mask, pd.DataFrame)
-    assert all(bt.bt.dtypes == float)
-    assert all(bt.mask.dtypes == bool)
-    assert bt.bt.columns.equals(bt.mask.columns)
-    assert bt.columns is bt.bt.columns
-    assert bt.index is bt.bt.index
-    assert len(bt) == len(bt.columns)
+    assert isinstance(hist, History)
+    assert isinstance(hist.hist, pd.DataFrame)
+    assert isinstance(hist.mask, pd.DataFrame)
+    assert all(hist.hist.dtypes == float)
+    assert all(hist.mask.dtypes == bool)
+    assert hist.hist.columns.equals(hist.mask.columns)
+    assert hist.columns is hist.hist.columns
+    assert hist.index is hist.hist.index
+    assert len(hist) == len(hist.columns)

    # advanced
-    assert bt.columns.equals(pd.Index(range(len(bt))))
-    assert isinstance(bt.max(), pd.Series)
-    assert bt.mask.empty or bt.mask.iloc[:, -1].all()
+    assert hist.columns.equals(pd.Index(range(len(hist))))
+    assert isinstance(hist.max(), pd.Series)
+    assert hist.mask.empty or hist.mask.iloc[:, -1].all()

    # False propagation

    # for each row this must hold:
    # either the row has one change (False->True)
    # or the entire row is True
-    if not bt.empty:
-        idxmax = bt.mask.idxmax(axis=1)
+    if not hist.empty:
+        idxmax = hist.mask.idxmax(axis=1)
        for row, col in idxmax.items():
-            assert all(bt.mask.iloc[row, :col] == False)
-            assert all(bt.mask.iloc[row, col:] == True)
+            assert all(hist.mask.iloc[row, :col] == False)
+            assert all(hist.mask.iloc[row, col:] == True)


-def is_equal(bt1: Backtrack, bt2: Backtrack):
+def is_equal(hist1: History, hist2: History):
    """
-    Check if two BT are (considered) equal, namely
-    have equal 'bt' and equal 'mask'.
+    Check if two FH are (considered) equal, namely
+    have equal 'hist' and equal 'mask'.
    """
-    return bt1.bt.equals(bt2.bt) and bt1.mask.equals(bt2.mask)
+    return hist1.hist.equals(hist2.hist) and hist1.mask.equals(hist2.mask)


 @pytest.mark.parametrize('data', data + [None])
 def test_init(data: np.array):
    # init
    df = pd.DataFrame(data, dtype=float)
-    bt = Backtrack(bt=df)
+    hist = History(hist=df)

-    check_invariants(bt)
+    check_invariants(hist)

    # shape would fail
    if data is not None:
-        assert len(bt.index) == data.shape[0]
-        assert len(bt.columns) == data.shape[1]
-        assert bt.mask.all(axis=None)
+        assert len(hist.index) == data.shape[0]
+        assert len(hist.columns) == data.shape[1]
+        assert hist.mask.all(axis=None)

    # check fastpath
-    fast = Backtrack(bt=bt)
+    fast = History(hist=hist)
    check_invariants(fast)

-    assert is_equal(bt, fast)
+    assert is_equal(hist, fast)


 @pytest.mark.parametrize('data', data + [None])
@@ -145,52 +145,52 @@ def test_init_with_mask(data: np.array):
    mask = pd.DataFrame(data, dtype=bool)
    if not mask.empty:
        mask.iloc[:, -1] = True
-    bt = Backtrack(bt=df, mask=mask)
+    hist = History(hist=df, mask=mask)

-    check_invariants(bt)
+    check_invariants(hist)

    # shape would fail
    if data is not None:
-        assert len(bt.index) == data.shape[0]
-        assert len(bt.columns) == data.shape[1]
+        assert len(hist.index) == data.shape[0]
+        assert len(hist.columns) == data.shape[1]

    # check fastpath
-    fast = Backtrack(bt=bt)
+    fast = History(hist=hist)
    check_invariants(fast)

-    assert is_equal(bt, fast)
+    assert is_equal(hist, fast)


 @pytest.mark.parametrize('data', data + [None])
 def test_copy(data):
    # init
    df = pd.DataFrame(data, dtype=float)
-    bt = Backtrack(bt=df)
-    shallow = bt.copy(deep=False)
-    deep = bt.copy(deep=True)
+    hist = History(hist=df)
+    shallow = hist.copy(deep=False)
+    deep = hist.copy(deep=True)

    # checks

    for copy in [deep, shallow]:
        check_invariants(copy)
-        assert copy is not bt
-        assert is_equal(copy, bt)
+        assert copy is not hist
+        assert is_equal(copy, hist)

    assert deep is not shallow
    assert is_equal(deep, shallow)

-    assert deep.bt is not bt.bt
-    assert deep.mask is not bt.mask
-    assert shallow.bt is bt.bt
-    assert shallow.mask is bt.mask
+    assert deep.hist is not hist.hist
+    assert deep.mask is not hist.mask
+    assert shallow.hist is hist.hist
+    assert shallow.mask is hist.mask


 @pytest.fixture(scope='module')
-def __bt():
-    # this BT is filled by
+def __hist():
+    # this FH is filled by
    #  - test_append
    #  - test_append_force
-    return Backtrack()
+    return History()


 @pytest.mark.parametrize('s, max_val', [
@@ -201,15 +201,15 @@ def __bt():
        [0, 1, 1, 1, 1]  # expected max-val
    )
 ])
-def test_append(__bt, s, max_val):
-    bt = __bt
-    bt.append(s, force=False)
-    check_invariants(bt)
-    assert all(bt.max() == max_val)
+def test_append(__hist, s, max_val):
+    hist = __hist
+    hist.append(s, force=False)
+    check_invariants(hist)
+    assert all(hist.max() == max_val)


 # this test append more rows to the resulting
-# BT from the former test
+# FH from the former test
 @pytest.mark.parametrize('s, max_val', [
    (pd.Series(val, index=range(6), dtype=float), max_val)
    for val, max_val
@@ -218,11 +218,11 @@ def test_append(__bt, s, max_val):
        [0, 1, 1, 0],  # expected max-val
    )
 ])
-def test_append_force(__bt, s, max_val):
-    bt = __bt
-    bt.append(s, force=True)
-    check_invariants(bt)
-    assert all(bt.max() == max_val)
+def test_append_force(__hist, s, max_val):
+    hist = __hist
+    hist.append(s, force=True)
+    check_invariants(hist)
+    assert all(hist.max() == max_val)


 def test_squeeze():
@@ -230,7 +230,7 @@ def test_squeeze():
    d, m, exp = example2
    d = pd.DataFrame(d, dtype=float)
    m = pd.DataFrame(m, dtype=bool)
-    orig = Backtrack(bt=d, mask=m)
+    orig = History(hist=d, mask=m)

    check_invariants(orig)
    assert all(orig.max() == exp)
@@ -238,17 +238,17 @@ def test_squeeze():
    # checks

    for n in range(len(orig)):
-        bt = orig.copy()
-        bt.squeeze(n)
+        hist = orig.copy()
+        hist.squeeze(n)

-        check_invariants(bt)
+        check_invariants(hist)

        # squeeze for less then 2 rows does nothing
        if n < 2:
-            assert is_equal(bt, orig)
+            assert is_equal(hist, orig)
        else:
-            assert len(bt) == len(orig) - n + 1
+            assert len(hist) == len(orig) - n + 1

        # result does not change
-        assert all(bt.max() == exp)
-        print(bt)
+        assert all(hist.max() == exp)
+        print(hist)