Skip to content
Snippets Groups Projects
Commit 058bded9 authored by Bert Palm 🎇
Browse files

wip

parent 8b0d2ca8
No related branches found
No related tags found
1 merge request!2Develop
......@@ -18,6 +18,7 @@ from pandas.core.dtypes.common import (
is_integer,
is_dict_like,
is_number,
is_hashable,
)
from pandas.core.dtypes.common import is_iterator as _is_iterator
......@@ -48,6 +49,16 @@ def is_iterator(obj):
return _is_iterator(obj)
def align(s1, s2, method='dropna'):
    """Align series ``s1`` to the index of series ``s2``.

    Parameters
    ----------
    s1 : pd.Series
        The series to align.
    s2 : pd.Series
        The series whose index is the alignment target.
    method : str, default 'dropna'
        ``'keepna'``: keep NaN rows introduced by the reindexing.
        ``'dropna'``: drop NaN rows from the result.

    Returns
    -------
    pd.Series
        ``s1`` reindexed to ``s2``'s index, with NaNs dropped if requested.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'keepna'`` nor ``'dropna'``.
    """
    if method == 'keepna':
        return s1.reindex_like(s2)
    if method == 'dropna':
        return s1.reindex_like(s2).dropna()
    # fail with a self-explanatory message instead of the bare `ValueError(method)`
    raise ValueError(f"unknown method {method!r}, expected one of: 'keepna', 'dropna'")
class DictOfSeries:
"""
DictionaryOfSeries is a collection of pd.Series's which aim to be as close as possible similar to
......@@ -82,80 +93,61 @@ class DictOfSeries:
def __init__(self, data=None, columns=None, itype=MixedItype, downcast_policy='lossless'):
self._data = OrderedDict()
def to_ser(item):
if not isinstance(item, pd.Series):
return pd.Series(item)
return item
# We need to keep track of the index-type (itype) of every new Series.
# If the itypes differ between different series, slicing will almost always fail
# (eg. a datetime-like slice cannot work on a numeric index and vice versa).
self._itype = None
self.itype = get_itype(itype)
self._data = pd.Series()
if data is not None:
if isinstance(data, dict):
for k in data:
self._data.loc[k] = to_ser(k)
if is_list_like(data):
data = data if is_nested_list_like(data) else [data]
for i, d in enumerate(data):
self._data.loc[i] = to_ser(d)
else:
self._data.loc[0] = pd.Series(data)
if columns is not None:
self.columns = columns
if downcast_policy not in CAST_POLICIES:
raise ValueError(f"downcast_policy must be one of {CAST_POLICIES}")
self._policy = downcast_policy
if data is not None:
self.__init_insert_data__(data)
# we use the columns.setter to make all necessary checks
if columns is not None:
self.columns = columns
# We need to keep track of the index-type (itype) of every new Series.
# If the itypes differ between different series, slicing will almost always fail
# (eg. a datetime-like slice cannot work on a numeric index and vice versa).
self._itype = get_itype(itype)
def __init_insert_data__(self, data):
    """Populate ``self`` from *data* during construction.

    Accepts another DictOfSeries, any dict-like, a nested list-like
    (one column per inner element, labelled '0', '1', ...) or a flat
    list-like (a single column labelled '0').  Each (label, value)
    pair is inserted via ``self[label] = value`` so all regular
    __setitem__ checks apply.

    Raises
    ------
    ValueError
        If *data* is of a type that cannot be interpreted as columns.
    """
    if isinstance(data, DictOfSeries):
        pairs = ((name, data[name]) for name in data)
    else:
        if is_iterator(data):
            # materialize once, so the type checks below don't consume it
            data = list(data)
        if is_dict_like(data):
            pairs = ((name, data[name]) for name in data)
        elif is_nested_list_like(data):
            pairs = ((str(pos), column) for pos, column in enumerate(data))
        elif is_list_like(data):
            pairs = [('0', data)]
        else:
            raise ValueError(f"init with data of type {type(data)} is not possible.")
    for name, value in pairs:
        self[name] = value
    return
for s in self._data:
cast_to_itype(s, self._itype, policy=self._policy, inplace=True)
@property
def columns(self):
return list(self._data.keys())
return self._data.index
@columns.setter
def columns(self, new):
if not isinstance(new, list):
raise TypeError("column names must be given as a list")
if len(set(new)) != len(new):
raise ValueError("column names must be unique")
if len(new) != len(self.columns):
raise ValueError(f"Length mismatch: Columns has {len(self.columns)} elements, "
f"new values have {len(new)} elements")
# to keep order, we iterate over self instead of new
_d = OrderedDict()
for i, k in enumerate(self.columns):
_d[new[i]] = self[k]
self._data = _d
def columns(self, newindex):
self._data.index = newindex
@property
def values(self):
# will make all series same length, inset nan's
return to_object_array(self._data.values()).transpose()
return np.array([c.values for c in self._data])
@property
def data(self):
return self._data.values()
return self._data.values
@property
def itype(self):
return self._itype
@itype.setter
def itype(self, itype_like):
itype = get_itype(itype_like)
def itype(self, newitype):
itype = get_itype(newitype)
if not itype_le(self._itype, itype):
self.__cast_all(itype)
......@@ -170,9 +162,8 @@ class DictOfSeries:
def __cast_all(self, itype):
k = '?'
try:
for k in self.columns:
casted = cast_to_itype(self._data[k], itype, policy=self._policy)
self._data[k] = casted
for k in self._data:
cast_to_itype(k, itype, policy=self._policy, inplace=True)
except Exception as e:
raise type(e)(f"Column {k}: " + str(e)) from e
......@@ -185,31 +176,34 @@ class DictOfSeries:
Notes:
- [1] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
"""
# special case single label
if isinstance(key, str):
if key in self.columns:
new = self._get_item(key)
else:
raise KeyError(key)
# all other cases
else:
keys, ixs, ixstype = self._unpack_key(key)
ixs = self._unpack_indexer(keys, ixs, ixstype)
if is_bool_indexer(key):
if not is_series_like(key):
raise ValueError("Only boolean series are allowed as boolean indexer.")
new = self.copy_empty()
for i, _ in enumerate(keys):
key, ix = keys[i], ixs[i]
new._data[key] = self._get_item(key, ix, True)
return new
for c in self.columns:
new._data[c] = align(self._data[c], key, method='dropna')
elif is_series_like(key):
raise ValueError("Only series with boolean values are allowed as indexer")
elif isinstance(key, slice):
new = self.copy_empty()
for c in self._data.index:
new._data[c] = self._data[c][key]
elif isinstance(key, self.__class__):
new = self.copy_empty()
cols = self.columns.intersection(key.columns)
for c in cols:
new._data[c] = align(key._data[c], self._data[c])
elif is_hashable(key):
new = self._data[key]
def _get_item(self, key, ix=None, insertna=False):
"""Extract a pd.Series from self"""
if ix is None:
return self._data[key]
elif insertna:
s = self._data[key]
return s[ix].reindex_like(s)
else:
return self._data[key][ix]
new = self.copy()
new._data = self._data[key]
return new
def __setitem__(self, key, value):
"""
......@@ -225,18 +219,19 @@ class DictOfSeries:
in the ``options`` dictionary.
- [3] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
"""
# special case single label
if isinstance(key, str):
doalign = False
if is_hashable(key):
if key not in self.columns:
self._insert(key, value)
return
else:
k, i, it = [key], [slice(None)], None
# all other cases
keys, ixs = [key], [slice(None)]
else:
k, i, it = self._unpack_key(key)
i = self._unpack_indexer(k, i, it)
gen = self._unpack_value(k, i, value)
keys, ixs, doalign = self._unpack_key(key)
assert len(keys) == len(ixs)
gen = self._unpack_value(keys, ixs, value)
for tup in gen:
self._set_item(*tup)
......@@ -262,7 +257,7 @@ class DictOfSeries:
raise ValueError(f"Only pd.Series can be inserted directly, given type {type(val)}")
val = cast_to_itype(val, self._itype, policy=self._policy)
self._data[key] = val.copy(deep=True)
self._data.loc[key] = val.copy(deep=True)
def _unpack_value(self, keys, ixs, val):
"""Return a generator that yield (key, indexer, value) for all keys"""
......@@ -287,73 +282,27 @@ class DictOfSeries:
def _unpack_key(self, key):
""" Determine keys and indexer by type of key. This does not deal
with single label-access, only higher dimension objects are handled..
Notes:
Which keys we get, may depend on the policy in dios_options
with single (hashable) label-access, only higher dimension objects
are handled..
"""
len_err_msg = "length of given column-indexer does not match length of columns"
keys = None
indexer, idxtype = None, None
# prevent consuming of a generator
key = list(key) if is_iterator(key) else key
if isinstance(key, slice):
if is_bool_indexer(key):
if not is_series_like(key):
raise ValueError("Only boolean series are allowed as boolean indexer.")
keys = self.columns
indexer, idxtype = [key], 'slice'
# list, np.arrays, ... of list, np.arrays..
elif is_nested_list_like(key):
# we only allow bool nlists
indexer, doalign = [key] * len(keys), True
elif isinstance(key, slice):
keys = self.columns
indexer, idxtype = key, 'nlist'
# ser, df, dios
elif is_pandas_like(key):
if is_series_like(key):
mask = key.to_numpy()
if is_bool_indexer(mask):
# bool series are column indexer not row-indexer!
keys = []
for k in self.columns:
try:
if key[k]:
keys.append(k)
except KeyError:
pass
else:
keys = key.to_list()
elif is_dataframe_like(key):
# we only allow bool df's
keys = key.columns.to_list()
indexer, idxtype = key, 'df'
elif is_dios_like(key):
# we only allow bool dios's
keys = key.columns
indexer, idxtype = key, 'dios'
# list, np.array, np.ndarray, ...
# Note: series considered list-like, so we handle lists at last
elif is_list_like(key):
arr = np.array(key)
if is_bool_array(arr):
keys = self.columns
if len(arr) != len(keys):
raise ValueError(len_err_msg)
keys = np.array(keys)[arr]
else:
keys = key
indexer = [key] * len(keys)
elif isinstance(key, self.__class__):
keys = self.columns.intersection(key.columns).to_list()
indexer, doalign = key[keys].to_list(), True
elif is_list_like(key) and not is_nested_list_like(key):
# policy is fix here. we only want to allow known keys or less; empty list is ok
keys = check_keys_by_policy(key, self.columns, Opts.none_up2_all)
indexer = [slice(None)] * len(keys)
else:
raise KeyError(f"{key}")
# check keys
method = dios_options[OptsFields.col_indexing_method]
keys = check_keys_by_policy(keys, self.columns, method)
return keys, indexer, idxtype
raise TypeError(f"Unknown indexer type: {type(key)}")
return keys, indexer, doalign
def _unpack_indexer(self, keys, indexer, idxtype):
err_bool = "only boolean values are allowed"
......@@ -453,8 +402,8 @@ class DictOfSeries:
# We use `_data` here, because all checks are already done.
# So this should be much faster, especially, because we use the underlying dict for
# getting and setting the values, instead of ``__setitem__`` and ``__getitem__``.
for k in self._data:
new._data[k] = self._data[k].copy(deep=deep)
for i in self._data.index:
new._data.loc[i] = self._data[i].copy(deep=deep)
return new
def copy_empty(self):
......
......@@ -31,9 +31,12 @@ class OptsFields:
class Opts:
    """Constants for column-indexing policies (consumed by
    ``check_keys_by_policy``) and for itype-mismatch handling."""
    # NOTE(review): this span is a diff rendering in which two generations of
    # policy names are superimposed -- the short names (none_plus,
    # at_least_one, all_present) and the ``*_up2_*``/``exactly_all``/
    # ``all_or_more`` names.  Confirm which set the current code uses.
    none_plus = 'none_plus'
    at_least_one = 'at_least_one'
    all_present = 'all_present'
    # "none up to all": any subset of the known keys; unknown keys raise
    none_up2_all = 'none_all'
    # "none up to more": any keys allowed; unknown keys silently ignored
    none_up2_more = 'none_more'
    # like none_up2_all, but at least one key must be shared
    one_up2_all = 'one_all'
    # like none_up2_more, but at least one key must be shared
    one_up2_more = 'one_more'
    # exactly the full set of known keys must be given
    exactly_all = 'all_all'
    # all known keys must be given; extra unknown keys are ignored
    all_or_more = 'all_more'
    # reaction on itype mismatches -- presumably warn vs. raise; confirm at use sites
    itype_warn = 'warn'
    itype_err = 'err'
......@@ -54,20 +57,37 @@ dios_options = {
}
def check_keys_by_policy(tocheck, keys, policy):
    """Validate the labels in *tocheck* against the known labels *keys*
    according to *policy* and return the shared labels.

    Parameters
    ----------
    tocheck : iterable of labels
        The labels the caller wants to use.
    keys : iterable of labels
        The known (existing) labels, e.g. the current columns.
    policy : str
        One of the ``Opts`` policy constants.

    Returns
    -------
    list
        The labels from *tocheck* that are present in *keys*, in order.

    Raises
    ------
    KeyError
        If *tocheck* violates the given policy.
    ValueError
        If *policy* is not a known policy.
    """
    # NOTE: the original rendered both the old and the new branch sets on top
    # of each other; this is the merged, coherent version.  Two old branches
    # (`elif Opts.all_present:` / `elif Opts.all_or_more:`) tested the
    # constant's truthiness instead of comparing against `policy` -- fixed
    # here by comparing explicitly in every branch.
    filtered = [k for k in tocheck if k in keys]
    unknown = [k for k in tocheck if k not in keys]

    if policy == Opts.none_up2_all:
        # any subset of the known keys; unknown keys are an error
        if unknown:
            raise KeyError(f"Policy says: keys must be known. Unknown: {unknown}")
    elif policy == Opts.none_up2_more:
        # anything goes; unknown keys are silently ignored
        pass
    elif policy == Opts.one_up2_all:
        if unknown:
            raise KeyError(f"Policy says: keys must be known and at least one must be shared. Unknown: {unknown}")
        if not filtered:
            raise KeyError("Policy says: keys must be known and at least one key must be shared. None was shared.")
    elif policy == Opts.one_up2_more:
        if not filtered:
            raise KeyError("Policy says: at least one key must be shared.")
    elif policy == Opts.exactly_all:
        # exactly the full set of known keys must be given
        fail = set(tocheck).symmetric_difference(set(keys))
        if fail:
            raise KeyError("Policy says: exactly all keys must be given.")
    elif policy == Opts.all_or_more:
        # all known keys must be present; extras are ignored
        # (filtered is a subset of keys, so this is keys - filtered)
        fail = set(filtered).symmetric_difference(set(keys))
        if fail:
            raise KeyError("Policy says: all known keys must be given, unknown are ignored.")
    else:
        raise ValueError(policy)
    # callers (e.g. __setitem__) use the filtered keys for further indexing
    return filtered
......
......@@ -8,6 +8,17 @@ if __name__ == '__main__':
# df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01'))
# df[[True, False]]
a = pd.Series([1, 12, 2])
b = pd.Series([2, 12, 2])
c = pd.Series([2, 12, 2])
d = pd.Series([3, 12, 2])
x = pd.Series([a, b, c])
y = pd.Series([a, b, d])
k = x == y
print(k)
exit(9384)
dios = DictOfSeries(data=[234.54, 5, 5, 4, np.nan, 5, 4, 5])
dios = abs(~dios)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment