From 1655868dce44a2310e5a886a5c7519b2df30785d Mon Sep 17 00:00:00 2001
From: Bert Palm <bert.palm@ufz.de>
Date: Wed, 15 Apr 2020 05:07:18 +0200
Subject: [PATCH] made itype lazy

---
 dios/dios.py | 50 ++++++++++++++++++++++++++++++++++----------------
 dios/lib.py  |  6 +++---
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/dios/dios.py b/dios/dios.py
index 032113b..b5a0863 100644
--- a/dios/dios.py
+++ b/dios/dios.py
@@ -67,26 +67,25 @@ class DictOfSeries:
     """
 
     def __init__(self, data=None, columns=None, index=None, itype=None, cast_policy='save', fastpath=False):
-
+        
         # we are called internally
         if fastpath:
-
             self._itype = itype or ObjItype
             self._policy = cast_policy
-
             if data is not None:
                 self._data = data
             else:
                 # it is significantly faster, to provide an index and fill it,
                 # than to successively build the index by adding data
                 self._data = pd.Series(dtype='O', index=columns)
+
         else:
 
             if index is not None and not isinstance(index, pd.Index):
                 index = pd.Index(index)
 
             # itype=None means infer the itype by the data, so we first set to the highest
-            # possible itype, then insert data, then find the best-fitting itype.
+            # possible itype, then insert data, then infer the best-fitting itype.
             if itype is None and index is None:
                 self._itype = ObjItype
             else:
@@ -107,16 +106,8 @@ class DictOfSeries:
             if data is not None:
                 self._init_insert_data(data, columns, index)
 
-            # Note: self._data still contain nans at all positions
-            # where no data was present, but a column-name was given
-
-            if itype is None:
-                self._itype = _find_least_common_itype(self._data.dropna())
-                if not self._itype.unique:
-                    _throw_MixedItype_err_or_warn(self.itype)
-
-        # finalise data: insert empty
-        # columns at nan positions
+        # self._data still contains NaNs at all positions where
+        # no data was present but a column name was given
         if self._data.hasnans:
             e = pd.Series(dtype='O', index=index)
             for c in self.columns[self._data.isna()]:
@@ -124,6 +115,17 @@ class DictOfSeries:
 
         self._data.index.name = 'columns'
 
+        # we try to infer the itype, but if we still have
+        # no data, we set the itype lazily, i.e. on the
+        # first non-empty _insert()
+        if itype is None:
+            if self.empty:
+                self._itype = 'INFER'
+            else:
+                self._itype = _find_least_common_itype(self._data)
+                if not self._itype.unique:
+                    _throw_MixedItype_err_or_warn(self.itype)
+
     def _init_insert_data(self, data, columns, index):
         """ Insert items of a iterable in self"""
 
@@ -160,6 +162,9 @@ class DictOfSeries:
         for k in data:
             self._insert(k, pd.Series(data[k], index=index))
 
+    # ----------------------------------------------------------------------
+    #
+
     def _insert(self, col, val):
         """Insert a fresh new value as pd.Series into self"""
         val = list(val) if _is_iterator(val) else val
@@ -175,9 +180,20 @@ class DictOfSeries:
         elif not isinstance(val, pd.Series):
             raise TypeError(f"Only data of type pandas.Series can be inserted, passed was {type(val)}")
 
-        val = cast_to_itype(val, self.itype, policy=self._policy).copy(deep=True)
+        # set the itype lazily, i.e. when the first non-empty
+        # column is inserted
+        if self._itype == 'INFER':
+            if not val.empty:
+                self._itype = get_itype(val.index)
+                # cast all pre-inserted empty series
+                self.__cast_all(self._itype, self._policy)
+                if not self._itype.unique:
+                    _throw_MixedItype_err_or_warn(self._itype)
+        else:
+            val = cast_to_itype(val, self.itype, policy=self._policy)
+
         val.name = col
-        self._data.at[col] = val
+        self._data.at[col] = val.copy(deep=True)
 
     @property
     def columns(self):
@@ -238,6 +254,8 @@ class DictOfSeries:
 
     @property
     def itype(self):
+        if self._itype == 'INFER':
+            return None
         return self._itype
 
     @itype.setter
diff --git a/dios/lib.py b/dios/lib.py
index 167f63c..10c87ae 100644
--- a/dios/lib.py
+++ b/dios/lib.py
@@ -195,13 +195,13 @@ def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False):
     if _itype_le(series.itype, itype):  # a <= b
         return series
 
-    e = f"A series index of type `{type(series.index)}` cannot be casted to Itype {itype.name}"
+    e = f"A series index of type '{type(series.index)}' cannot be casted to Itype '{itype.name}'"
 
     # cast any -> dt always fail.
     if is_itype(itype, DtItype):
         pass
     else:
-        e += f", as forbidden by the cast-policy `{policy}`."
+        e += f", as forbidden by the cast-policy '{policy}'."
 
     if policy == CastPolicy.never:
         pass
@@ -227,7 +227,7 @@ def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False):
             series.index = series.index.astype(int)
             if series.index.is_unique:
                 return series
-            e = f"The cast with policy {policy} from series index type `{type(series.index)}` to " \
+            e = f"The cast with policy {policy} from series index type '{type(series.index)}' to " \
                 f"itype {itype.name} resulted in a non-unique index."
         # cast mixed -> int/float always fail
 
-- 
GitLab