create like df

ca31885e · Bert Palm · 6140bd50 · ca31885e · ca31885e · ca31885e
Commit ca31885e authored 5 years ago by Bert Palm 🎇
--- a/dios/dios.py
+++ b/dios/dios.py
 from .operators import OP_MAP as _OP_MAP

 from .lib import *
-from .lib import _CAST_POLICIES, _itype_le, _itype_lt, _throw_MixedItype_err_or_warn
+from .lib import (
+    _CAST_POLICIES,
+    _itype_le, _itype_lt,
+    _throw_MixedItype_err_or_warn,
+    _find_least_common_itype,
+)

 import pandas as pd
 import numpy as np
@@ -105,10 +110,8 @@ class DictOfSeries:
                self._data = pd.Series(dtype='O', index=columns)
        else:

-            self._data = pd.Series(dtype='O', index=pd.Index([]))
-
            # itype=None means infer the itype by the data, so we first set to the highest
-            # possible itype, then insert data, then find the best-fitting.
+            # possible itype, then insert data, then find the best-fitting itype.
            if itype is None:
                self._itype = MixedItype
            else:
@@ -118,53 +121,53 @@ class DictOfSeries:
                raise ValueError(f"downcast_policy must be one of {_CAST_POLICIES}")
            self._policy = cast_policy

-            if columns is not None and not _is_list_like_not_nested(columns):
-                raise TypeError("'columns' must be some kind of list-like collection.")
+            index = pd.Index([] if columns is None else columns)
+            self._data = pd.Series(dtype='O', index=index.unique())

            if data is not None:
                self._init_insert_data(data, columns)

+            # NOTE: self._data contains NaNs at locations
+            # where a column name was given, but no data
+            # was present
+
            if itype is None:
-                self._itype = self.__find_least_common_itype()
+                self._itype = _find_least_common_itype(self._data.dropna())
                if not self._itype.unique:
                    _throw_MixedItype_err_or_warn(self.itype)

-        # insert empty series for requested columns
-        if columns is not None:
+        # insert empty columns
+        if self._data.hasnans:
            e = pd.Series(dtype='O')
-            for c in columns:
-                if fastpath or c not in self.columns:
-                    self._insert(c, e.copy())
+            for c in self.columns[self._data.isna()]:
+                self._insert(c, e.copy())

    def _init_insert_data(self, data, columns):
-
-        def incols(c):
-            return c in columns if columns is not None else True
+        """Insert the items of an iterable into self."""

        data = list(data) if _is_iterator(data) else data

        if isinstance(data, dict) or _is_dios_like(data):
            for k in data:
-                if incols(k):
+                if columns is None or k in self.columns:
                    self._insert(k, data[k])

        elif _is_list_like(data):  # also Series !
            data = data if _is_nested_list_like(data) else [data]

-            if columns is None:
-                for i, d in enumerate(data):
-                    self._insert(i, d)
-            else:
-                if len(data) != len(columns):
-                    raise ValueError(f"length of passed values is {len(data)}, columns imply {len(columns)}")
+            if self.columns.empty:
+                self._data = pd.Series(dtype='O', index=pd.RangeIndex(len(data)))
+
+            elif len(data) != len(self.columns):
+                raise ValueError(f"length of passed values is {len(data)}, columns imply {len(self.columns)}")

-                for i, c in enumerate(columns):
-                    self._insert(c, data[i])
+            for i, c in enumerate(self.columns):
+                self._insert(c, data[i])
        else:
            raise ValueError(f"data must be some kind of iterable, type {type(data)} was given")

    def _insert(self, col, val):
-        """Insert a fresh new value into self"""
+        """Insert a fresh new value as pd.Series into self"""
        val = list(val) if _is_iterator(val) else val

        if _is_dios_like(val):
@@ -183,8 +186,8 @@ class DictOfSeries:
        return self._data.index

    @columns.setter
-    def columns(self, newindex):
-        self._data.index = newindex
+    def columns(self, cols):
+        self._data.index = cols

    @property
    def itype(self):
@@ -205,20 +208,6 @@ class DictOfSeries:
        except Exception as e:
            raise type(e)(f"Column {c}: " + str(e)) from e

-    def __find_least_common_itype(self):
-        itypes = [NumItype, FloatItype, IntItype, DtItype]
-        tlist = [get_itype(s.index) for s in self._data]
-        found = MixedItype
-        if tlist:
-            for itype in itypes:
-                for t in tlist:
-                    if _itype_le(t, itype):
-                        continue
-                    break
-                else:
-                    found = itype
-        return found
-
    def __getitem__(self, key):
        """ dios[key] -> dios/series """
        key = list(key) if _is_iterator(key) else key

--- a/dios/lib.py
+++ b/dios/lib.py
@@ -135,6 +135,21 @@ def _itype_le(a, b):
    return is_itype_like(a, b)


+def _find_least_common_itype(iterable_of_series):
+    itypes = [NumItype, FloatItype, IntItype, DtItype]
+    tlist = [get_itype(s.index) for s in iterable_of_series]
+    found = MixedItype
+    if tlist:
+        for itype in itypes:
+            for t in tlist:
+                if _itype_le(t, itype):
+                    continue
+                break
+            else:
+                found = itype
+    return found
+
+
 ################################################################################
 # Casting


--- a/test/test_dflike.py
+++ b/test/test_dflike.py
@@ -7,8 +7,6 @@ from pandas.core.dtypes.common import is_dict_like, is_nested_list_like
 import numpy as np
 from copy import deepcopy

-pytestmark = pytest.mark.skip
-
 __author__ = "Bert Palm"
 __email__ = "bert.palm@ufz.de"
 __copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ"
@@ -16,10 +14,13 @@ __copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - U

 arr = np.random.rand(8)
 TESTDATA = [
-    None,   # empty
-    arr.copy(),    # list
-    # np.array([arr.copy(), arr.copy(), arr.copy()]),  # nested list
-    dict(a=arr.copy(), b=arr.copy()),  # dict
+    None,   # empty  # 0
+    [1],  # 1
+    arr.copy(),  # 2
+    np.array([arr.copy(), arr.copy(), arr.copy()]),  # 3 - nested list
+    range(4),  # 4
+    dict(a=arr.copy(), b=arr.copy()),  # 5 dict
+    pd.DataFrame(dict(a=arr.copy(), b=arr.copy()))  # 6 df
 ]


@@ -27,12 +28,10 @@ TESTDATA = [
 @pytest.mark.parametrize("with_column_param", [False, True])
 def test_dios_create(data, with_column_param):

-    if is_dict_like(data) and with_column_param:
-        # giving column names in dict-keys and in columns-parameter is special in df
-        pytest.skip()
-
    data_copy0 = deepcopy(data)
    data_copy1 = deepcopy(data)
+
+    # create columns list
    if with_column_param:
        df = pd.DataFrame(data=data_copy0)
        col = [f"new_{c}" for c in df]
@@ -43,16 +42,11 @@ def test_dios_create(data, with_column_param):
        # giving nested lists, work different between df and dios
        data_copy1 = data_copy1.transpose()

-    df = pd.DataFrame(data=data_copy1, columns=col)
-    dios = DictOfSeries(data=data_copy0, columns=col)
-
-    assert len(dios.columns) == len(df.columns)
-    assert np.all(dios.values == df.values)
+    df = pd.DataFrame(data=data_copy0, columns=col)
+    dios = DictOfSeries(data=data_copy1, columns=col)

-    # df columns may not be strings, but dios'es are always
-    columns = [str(c) for c in df.columns]
-    assert list(dios.columns) == columns
+    assert dios.columns.equals(df.columns)

    for c in df.columns:
-        assert np.all(dios[str(c)] == df[c])
+        assert np.all(dios[c] == df[c].dropna())

--- a/test/test_methods.py
+++ b/test/test_methods.py
@@ -5,15 +5,24 @@ def test_copy_copy_empty(getDtDiosAligned):
    dios = getDtDiosAligned.copy()
    shallow = dios.copy(deep=False)
    deep = dios.copy(deep=True)
-    empty = dios.copy_empty()
+    empty_w_cols = dios.copy_empty(columns=True)
+    empty_no_cols = dios.copy_empty(columns=False)

    assert dios is not shallow
    assert dios is not deep
-    assert dios is not empty
+    assert dios is not empty_w_cols
+    assert dios is not empty_no_cols

-    assert dios.itype == shallow.itype
-    assert dios.itype == deep.itype
-    assert dios.itype == empty.itype
+    for attr in ['itype', '_itype', '_policy', ]:
+        dios_attr = getattr(dios, attr)
+        for cop in [shallow, deep, empty_w_cols, empty_no_cols]:
+            copy_attr = getattr(cop, attr)
+            assert dios_attr == copy_attr
+
+    assert dios.columns.equals(shallow.columns)
+    assert dios.columns.equals(deep.columns)
+    assert dios.columns.equals(empty_w_cols.columns)
+    assert not dios.columns.equals(empty_no_cols.columns)

    for i in dios:
        assert dios[i].index is shallow[i].index