From ca31885ee6045b37325a0f739eee611f76f0e38b Mon Sep 17 00:00:00 2001
From: Bert Palm <bert.palm@ufz.de>
Date: Tue, 10 Mar 2020 19:33:52 +0100
Subject: [PATCH] create like df

---
 dios/dios.py         | 71 +++++++++++++++++++-------------------------
 dios/lib.py          | 15 ++++++++++
 test/test_dflike.py  | 32 ++++++++------------
 test/test_methods.py | 19 ++++++++----
 4 files changed, 72 insertions(+), 65 deletions(-)

diff --git a/dios/dios.py b/dios/dios.py
index cd5e7e1..ba6fad5 100644
--- a/dios/dios.py
+++ b/dios/dios.py
@@ -1,7 +1,12 @@
 from .operators import OP_MAP as _OP_MAP
 
 from .lib import *
-from .lib import _CAST_POLICIES, _itype_le, _itype_lt, _throw_MixedItype_err_or_warn
+from .lib import (
+    _CAST_POLICIES,
+    _itype_le, _itype_lt,
+    _throw_MixedItype_err_or_warn,
+    _find_least_common_itype,
+)
 
 import pandas as pd
 import numpy as np
@@ -105,10 +110,8 @@ class DictOfSeries:
                 self._data = pd.Series(dtype='O', index=columns)
         else:
 
-            self._data = pd.Series(dtype='O', index=pd.Index([]))
-
             # itype=None means infer the itype by the data, so we first set to the highest
-            # possible itype, then insert data, then find the best-fitting.
+            # possible itype, then insert data, then find the best-fitting itype.
             if itype is None:
                 self._itype = MixedItype
             else:
@@ -118,53 +121,53 @@ class DictOfSeries:
                 raise ValueError(f"downcast_policy must be one of {_CAST_POLICIES}")
             self._policy = cast_policy
 
-            if columns is not None and not _is_list_like_not_nested(columns):
-                raise TypeError("'columns' must be some kind of list-like collection.")
+            index = pd.Index([] if columns is None else columns)
+            self._data = pd.Series(dtype='O', index=index.unique())
 
             if data is not None:
                 self._init_insert_data(data, columns)
 
+            # NOTE: self._data contains NaNs at locations
+            # where no data was present, but a column name
+            # was given.
+
             if itype is None:
-                self._itype = self.__find_least_common_itype()
+                self._itype = _find_least_common_itype(self._data.dropna())
                 if not self._itype.unique:
                     _throw_MixedItype_err_or_warn(self.itype)
 
-        # insert empty series for requested columns
-        if columns is not None:
+        # insert empty columns
+        if self._data.hasnans:
             e = pd.Series(dtype='O')
-            for c in columns:
-                if fastpath or c not in self.columns:
-                    self._insert(c, e.copy())
+            for c in self.columns[self._data.isna()]:
+                self._insert(c, e.copy())
 
     def _init_insert_data(self, data, columns):
-
-        def incols(c):
-            return c in columns if columns is not None else True
+        """Insert items of an iterable into self."""
 
         data = list(data) if _is_iterator(data) else data
 
         if isinstance(data, dict) or _is_dios_like(data):
             for k in data:
-                if incols(k):
+                if columns is None or k in self.columns:
                     self._insert(k, data[k])
 
         elif _is_list_like(data):  # also Series !
             data = data if _is_nested_list_like(data) else [data]
 
-            if columns is None:
-                for i, d in enumerate(data):
-                    self._insert(i, d)
-            else:
-                if len(data) != len(columns):
-                    raise ValueError(f"length of passed values is {len(data)}, columns imply {len(columns)}")
+            if self.columns.empty:
+                self._data = pd.Series(dtype='O', index=pd.RangeIndex(len(data)))
+
+            elif len(data) != len(self.columns):
+                raise ValueError(f"length of passed values is {len(data)}, columns imply {len(self.columns)}")
 
-                for i, c in enumerate(columns):
-                    self._insert(c, data[i])
+            for i, c in enumerate(self.columns):
+                self._insert(c, data[i])
         else:
             raise ValueError(f"data must be some kind of iterable, type {type(data)} was given")
 
     def _insert(self, col, val):
-        """Insert a fresh new value into self"""
+        """Insert a fresh new value as pd.Series into self"""
         val = list(val) if _is_iterator(val) else val
 
         if _is_dios_like(val):
@@ -183,8 +186,8 @@ class DictOfSeries:
         return self._data.index
 
     @columns.setter
-    def columns(self, newindex):
-        self._data.index = newindex
+    def columns(self, cols):
+        self._data.index = cols
 
     @property
     def itype(self):
@@ -205,20 +208,6 @@ class DictOfSeries:
         except Exception as e:
             raise type(e)(f"Column {c}: " + str(e)) from e
 
-    def __find_least_common_itype(self):
-        itypes = [NumItype, FloatItype, IntItype, DtItype]
-        tlist = [get_itype(s.index) for s in self._data]
-        found = MixedItype
-        if tlist:
-            for itype in itypes:
-                for t in tlist:
-                    if _itype_le(t, itype):
-                        continue
-                    break
-                else:
-                    found = itype
-        return found
-
     def __getitem__(self, key):
         """ dios[key] -> dios/series """
         key = list(key) if _is_iterator(key) else key
diff --git a/dios/lib.py b/dios/lib.py
index 665959c..df152ae 100644
--- a/dios/lib.py
+++ b/dios/lib.py
@@ -135,6 +135,21 @@ def _itype_le(a, b):
     return is_itype_like(a, b)
 
 
+def _find_least_common_itype(iterable_of_series):
+    itypes = [NumItype, FloatItype, IntItype, DtItype]
+    tlist = [get_itype(s.index) for s in iterable_of_series]
+    found = MixedItype
+    if tlist:
+        for itype in itypes:
+            for t in tlist:
+                if _itype_le(t, itype):
+                    continue
+                break
+            else:
+                found = itype
+    return found
+
+
 ################################################################################
 # Casting
 
diff --git a/test/test_dflike.py b/test/test_dflike.py
index a0b93a0..e0ec811 100644
--- a/test/test_dflike.py
+++ b/test/test_dflike.py
@@ -7,8 +7,6 @@ from pandas.core.dtypes.common import is_dict_like, is_nested_list_like
 import numpy as np
 from copy import deepcopy
 
-pytestmark = pytest.mark.skip
-
 __author__ = "Bert Palm"
 __email__ = "bert.palm@ufz.de"
 __copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ"
@@ -16,10 +14,13 @@ __copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - U
 
 arr = np.random.rand(8)
 TESTDATA = [
-    None,   # empty
-    arr.copy(),    # list
-    # np.array([arr.copy(), arr.copy(), arr.copy()]),  # nested list
-    dict(a=arr.copy(), b=arr.copy()),  # dict
+    None,   # empty  # 0
+    [1],  # 1
+    arr.copy(),  # 2
+    np.array([arr.copy(), arr.copy(), arr.copy()]),  # 3 - nested list
+    range(4),  # 4
+    dict(a=arr.copy(), b=arr.copy()),  # 5 dict
+    pd.DataFrame(dict(a=arr.copy(), b=arr.copy()))  # 6 df
 ]
 
 
@@ -27,12 +28,10 @@ TESTDATA = [
 @pytest.mark.parametrize("with_column_param", [False, True])
 def test_dios_create(data, with_column_param):
 
-    if is_dict_like(data) and with_column_param:
-        # giving column names in dict-keys and in columns-parameter is special in df
-        pytest.skip()
-
     data_copy0 = deepcopy(data)
     data_copy1 = deepcopy(data)
+
+    # create columns list
     if with_column_param:
         df = pd.DataFrame(data=data_copy0)
         col = [f"new_{c}" for c in df]
@@ -43,16 +42,11 @@ def test_dios_create(data, with_column_param):
         # giving nested lists, work different between df and dios
         data_copy1 = data_copy1.transpose()
 
-    df = pd.DataFrame(data=data_copy1, columns=col)
-    dios = DictOfSeries(data=data_copy0, columns=col)
-
-    assert len(dios.columns) == len(df.columns)
-    assert np.all(dios.values == df.values)
+    df = pd.DataFrame(data=data_copy0, columns=col)
+    dios = DictOfSeries(data=data_copy1, columns=col)
 
-    # df columns may not be strings, but dios'es are always
-    columns = [str(c) for c in df.columns]
-    assert list(dios.columns) == columns
+    assert dios.columns.equals(df.columns)
 
     for c in df.columns:
-        assert np.all(dios[str(c)] == df[c])
+        assert np.all(dios[c] == df[c].dropna())
 
diff --git a/test/test_methods.py b/test/test_methods.py
index 60a2b29..d8ab08a 100644
--- a/test/test_methods.py
+++ b/test/test_methods.py
@@ -5,15 +5,24 @@ def test_copy_copy_empty(getDtDiosAligned):
     dios = getDtDiosAligned.copy()
     shallow = dios.copy(deep=False)
     deep = dios.copy(deep=True)
-    empty = dios.copy_empty()
+    empty_w_cols = dios.copy_empty(columns=True)
+    empty_no_cols = dios.copy_empty(columns=False)
 
     assert dios is not shallow
     assert dios is not deep
-    assert dios is not empty
+    assert dios is not empty_w_cols
+    assert dios is not empty_no_cols
 
-    assert dios.itype == shallow.itype
-    assert dios.itype == deep.itype
-    assert dios.itype == empty.itype
+    for attr in ['itype', '_itype', '_policy', ]:
+        dios_attr = getattr(dios, attr)
+        for cop in [shallow, deep, empty_w_cols, empty_no_cols]:
+            copy_attr = getattr(cop, attr)
+            assert dios_attr == copy_attr
+
+    assert dios.columns.equals(shallow.columns)
+    assert dios.columns.equals(deep.columns)
+    assert dios.columns.equals(empty_w_cols.columns)
+    assert not dios.columns.equals(empty_no_cols.columns)
 
     for i in dios:
         assert dios[i].index is shallow[i].index
-- 
GitLab