From 680ea4a8156e8c9bee14f8375e5df48c329eb938 Mon Sep 17 00:00:00 2001
From: Bert Palm <bert.palm@ufz.de>
Date: Thu, 13 Feb 2020 01:08:23 +0100
Subject: [PATCH] itype done

---
 dios/dios.py    | 121 +++++++++++++++++++++++++++---------------------
 dios/itypes.py  |  39 +++++++++++-----
 dios/lib.py     |  43 +++++++++++++++--
 dios/options.py |  73 ++---------------------------
 4 files changed, 141 insertions(+), 135 deletions(-)

diff --git a/dios/dios.py b/dios/dios.py
index c2528bf..79218f4 100644
--- a/dios/dios.py
+++ b/dios/dios.py
@@ -1,9 +1,7 @@
 from dios.lib import *
 from dios.options import *
 import pandas as pd
-import numpy as np
 import operator as op
-import datetime as dt
 
 from collections import OrderedDict
 from pandas.core.dtypes.common import (
@@ -13,7 +11,6 @@ from pandas.core.dtypes.common import (
     is_dict_like,
 )
 from pandas.core.dtypes.common import is_iterator as _is_iterator
-from pandas.core.indexing import need_slice
 
 
 def is_iterator(obj):
@@ -51,7 +48,7 @@ class DictOfSeries:
 
     Todos:
     -----
-    todo: allow any hashable obj as column identifier
+    todo: to_discuss!! allow any hashable obj as column identifier
          Currently we only allow strings as identifier, to be more df-like we should allow any
          hashable object (unlike df we may should exclude stuff like: ``None`` or ``np.nan`` ??)
 
@@ -64,20 +61,24 @@ class DictOfSeries:
         # We need to keep track of the index-type (itype) of every new Series.
         # If the itypes differ between different series, slicing will almost always fail
         # (eg. a datetime-like slice cannot work on a numeric index and vice versa).
+        #
+        # Data may have been given, so we first set the itype to MixedItype, then insert all data,
+        # and check/cast the itype afterwards. Otherwise __setitem_new() would set the itype,
+        # which may prevent inserting series with other (higher) itypes.
         self._itype = MixedItype
 
         self.__init_insert_data__(data)
 
-        # use property.setter to make necessary checks
+        # we use the columns.setter to make all necessary checks
         self.columns = columns
 
-        # 1. infer itype
-        # check with given -> fine
-        # check with given -> cast -> fine
-        # check with given -> cast -> err out
-        # given None:
-        # is unique -> fine
-        # not unique -> err out
+        # infer the itype by the data
+        inferred_itype = self.__find_least_common_itype()
+        itype = inferred_itype if itype is None else get_itype(itype)
+
+        # We use the itype.setter to make all checks. If the given itype was of a lower type
+        # than the inferred itype, a cast is tried on every series.
+        self.itype = itype
 
     def __init_insert_data__(self, data):
         if data is None:
@@ -98,6 +99,38 @@ class DictOfSeries:
         if is_list_like(data):
             self['0'] = data
 
+    def __find_least_common_itype(self):
+        """Infer the narrowest of the checked itypes that is greater than or equal
+        to the itype of the index of every stored series, and return it."""
+
+        def all_itypes_le(itypes, super_itype):
+            # True if every itype is `super_itype` itself or a subtype of it
+            return all(itype_le(itype, super_itype) for itype in itypes)
+
+        itypes = [get_itype(self._data[k].index) for k in self.columns]
+
+        found = None
+
+        # Walk the supertypes in list order and keep the last one that still
+        # covers every series; stop at the first one that does not.
+        super_itypes = [MixedItype, NumericItype]
+        for super_itype in super_itypes:
+            if all_itypes_le(itypes, super_itype):
+                found = super_itype
+                continue
+            break
+        assert found, "At least this should be MixedItype"
+
+        # Check the base types and take the first one that covers every series.
+        # The matching *type* must be stored here, not the list of candidates,
+        # because the result is handed on to the itype setter by __init__.
+        single_itypes = [DatetimeItype, IntegerItype, FloatItype]
+        for single_itype in single_itypes:
+            if all_itypes_le(itypes, single_itype):
+                found = single_itype
+                break
+        return found
+
     @property
     def columns(self):
         return list(self._data.keys())
@@ -126,9 +159,24 @@ class DictOfSeries:
 
     @itype.setter
     def itype(self, itype_like):
-        if is_itype_subtype(self._itype, itype_like):
-            self._itype = itype_like
-        raise NotImplementedError("futur throw `mixed` warning")
+        itype = get_itype(itype_like)  # normalize the user input to an itype
+
+        if not is_itype_subtype(self._itype, itype):
+            # the current itype is not a subtype of the new one: try to cast all series to the new itype
+            self.__cast_all(itype)
+
+        self._itype = itype  # only reached if no cast failed (__cast_all raises otherwise)
+
+        if not itype.unique:  # itypes not flagged as unique are accepted, but trigger a warning
+            throw(f"Using a {itype} as dios.itype is experimental. As soon as series with different index types "
+                  f"are inserted, slicing will almost always fail. You are hereby warned!", ItypeWarning)
+
+    def __cast_all(self, itype):  # cast every stored series to fit `itype`, or raise ItypeCastError
+        for k in self.columns:
+            casted = cast_to_fit_itype(self._data[k].copy(), itype)  # the cast works on a copy
+            if casted is None:  # None is treated as a failed cast
+                raise ItypeCastError(f"Cast series indicees to the given itype failed for series in column {k}.")
+            self._data[k] = casted  # NOTE(review): series stored before a later failure stay casted
 
     def _check_keys(self, keys):
         missing = [k for k in keys if k not in self.columns]
@@ -348,43 +396,6 @@ class DictOfSeries:
 
     def __delitem__(self, key):
         del self._data[key]
-        self.__set_mixed_itype_from_all_keys()
-
-    def __set_mixed_itype_from_all_keys(self):
-        """ If the itype of dios is ``mixed`` and the itype of any stored
-        Series change, we need to check the itype of all other Series, to
-        validate the dios-wide itype."""
-
-        if len(self) == 0:
-            self._itype = None
-            return
-
-        if len(self) == 1:
-            self._itype = get_itype(self.squeeze().index)
-            return
-
-        # ``mixed`` isn't allowed in general, so we're done
-        if not dios_options[Options.allow_mixed_itypes]:
-            return
-
-        # itype wasn't ``mixed``, so we're done
-        if self._itype != IdxTypes.mixed:
-            return
-
-        # check all types
-        types = set()
-        for k in self._data.keys():
-            idx = self._data[k].index
-            types.add(get_itype(idx))
-
-            # If we have at least two different
-            # itypes, ``mixed`` still apply.
-            if len(types) > 1:
-                return
-
-        # index is of a single new type
-        self._itype = types.pop()
-        return
 
     def __copy__(self):
         return self.copy(deep=True)
@@ -565,6 +576,8 @@ class _LocIndexer(_Indexer):
         #       list_like -> check length
         for c in cols:
             self._data[c].loc[rkey] = value
+        # todo loc.__setitem__(self, key, value):
+        raise NotImplementedError
 
     def _unpack_key(self, key):
         # if we have a tuple, we have a rows- and a column-indexer
@@ -618,6 +631,10 @@ class _iLocIndexer(_Indexer):
             new[c] = self._data[c].iloc[rkey]
         return new
 
+    def __setitem__(self, key, value):
+        # todo: implement iloc.__setitem__(self, key, value); until then any assignment via this indexer raises
+        raise NotImplementedError
+
     def _unpack_key(self, key):
         # if we have a tuple, we have a rows- and a column-indexer
         # if not, we only have a row-indexer and work on all columns
diff --git a/dios/itypes.py b/dios/itypes.py
index 1a135e6..58647bb 100644
--- a/dios/itypes.py
+++ b/dios/itypes.py
@@ -1,6 +1,18 @@
 import pandas as pd
 
 
+class ItypeWarning(RuntimeWarning):
+    """Base warning category for itype related warnings."""
+
+
+class ItypeCastWarning(ItypeWarning):
+    """Subcategory of ItypeWarning; by its name intended for cast related warnings."""
+
+
+class ItypeCastError(RuntimeError):
+    """Error category for a failed cast of a series index to an itype."""
+
+
 class __Itype:
     def __init__(self):
         raise RuntimeError("DatetimeItype does not allow instances of itself.")
@@ -10,21 +22,18 @@ class DatetimeItype(__Itype):
     name = 'datetime'
     unique = True
     subtypes = (pd.DatetimeIndex,)
-    cast_to = ...
 
 
 class IntegerItype(__Itype):
     name = 'integer'
     unique = True
-    subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index,)
-    cast_to = int
+    subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int)
 
 
 class FloatItype(__Itype):
     name = 'float'
-    subtypes = (pd.Float64Index,)
+    subtypes = (pd.Float64Index, float)
     unique = True
-    cast_to = float
 
 
 # class MultiItype(__Itype):
@@ -76,11 +85,6 @@ def is_itype_like(obj, itype):
     return is_itype(obj, itype) or is_itype_subtype(obj, itype)
 
 
-def get_minimal_itype(obj):
-    """ alias for get_itype(), see there for more info"""
-    return get_itype(obj)
-
-
 def get_itype(obj):
     """
     Return the according Itype, by any of any possible user input, like
@@ -95,7 +99,7 @@ def get_itype(obj):
         return obj
 
     # check if it is the actual type, not a subtype
-    types = [DatetimeItype, IntegerItype, FloatItype, OtherItype, NumericItype, MixedItype]
+    types = [DatetimeItype, IntegerItype, FloatItype, NumericItype, MixedItype]
     for t in types:
         if is_itype(obj, t):
             return t
@@ -111,6 +115,18 @@ def get_itype(obj):
     raise ValueError(f"{obj} is not a itype, nor any known subtype of a itype, nor a itype string alias")
 
 
+def itype_eq(a, b):  # "a == b" for itypes: thin alias for is_itype()
+    return is_itype(a, b)
+
+
+def itype_lt(a, b):  # "a < b" for itypes: thin alias for is_itype_subtype()
+    return is_itype_subtype(a, b)
+
+
+def itype_le(a, b):  # "a <= b" for itypes: thin alias for is_itype_like(), ie. is_itype() or is_itype_subtype()
+    return is_itype_like(a, b)
+
+
 def cast_to_fit_itype(series, itype):
     """ Cast a series (more explicit the type of the index) to fit the itype of a dios.
 
@@ -147,4 +163,3 @@ def cast_to_fit_itype(series, itype):
         return None
 
     return None
-
diff --git a/dios/lib.py b/dios/lib.py
index 71426c7..55475fc 100644
--- a/dios/lib.py
+++ b/dios/lib.py
@@ -1,5 +1,6 @@
-import pandas as pd
 from dios.itypes import *
+from dios.options import *
+import pandas as pd
 import warnings
 
 
@@ -7,5 +8,41 @@ def _get_storage_class_values(cls):
     return [getattr(cls, c) for c in cls.__dict__ if not c.startswith("_")]
 
 
-class CastWarning(RuntimeWarning):
-    pass
+def throw(msg, wtype):  # emit `msg` as a warning of category `wtype` (this warns, it does not raise)
+    warnings.warn(msg, wtype)
+
+
+# todo: make method a kwarg and remove dios_options access
+def get_dios_to_dios_keys(keys, other):  # filter/validate `keys` against other.columns, by the option set
+    # we can assume that all keys exist in self._data
+    method = dios_options[Options.dios_to_dios_method]
+    err_append = "consider changing dios.option['dios_to_dios_method']"
+
+    # method 0: assign where possible, otherwise ignore (keep only the keys that other holds, too)
+    if method == 0:
+        keys = [k for k in keys if k in other.columns]
+
+    # method 1: like method 0, but at least one key must also be in other
+    elif method == 1:
+        keys = [k for k in keys if k in other.columns]
+        if not keys:
+            raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append)
+
+    # method 2: all keys must be in other, but more keys could exist in other,
+    # eg. ``dios['a'] = dios[['a','b']]`` will update column-a but not column-b
+    # eg. ``dios[['a','b']] = dios['a']`` will fail
+    elif method == 2:
+        fail = [k for k in keys if k not in other.columns]
+        if fail:
+            raise KeyError(f"{fail} are missing in the destiny-dios, " + err_append)
+
+    # method 3: keys in both dios's must be equal
+    elif method == 3:
+        fail = set(keys).symmetric_difference(set(other.columns))
+        if fail:
+            raise KeyError(f"{fail} is not in both of src- and dest-dios, " + err_append)
+
+    else:  # any other option value is invalid
+        raise OptionsError(f"{method} is an invalid value for dios.option[dios_to_dios]")
+
+    return keys  # filtered for method 0 and 1, unchanged for method 2 and 3
diff --git a/dios/options.py b/dios/options.py
index 260e7fd..7acdb12 100644
--- a/dios/options.py
+++ b/dios/options.py
@@ -1,5 +1,3 @@
-import warnings
-
 
 class OptionsWarning(UserWarning):
     pass
@@ -25,71 +23,10 @@ class Options:
       otherwise its the same than creating a new dios)"""
     dios_to_dios_method = "dios_to_dios_method"
 
-    """
-    If we have different types of indexes in the dios, slicing will almost always fail.
-    It is because, eg. a numeric slice cannot work on a pd.DatetimeIndex and vice versa.
-    To set this to True is highly experimental, any arising issues or errors should be
-    handled by the user."""
-    allow_mixed_itypes = "allow_mixed_itypes"
-
-    allowed_indextypes = "allowed_indextypes"
-
-
-class __OptionsDict(dict):
-    """ Simple dict that throw a warning, if a special value is inserted at a special key"""
-    def __setitem__(self, key, value):
-        # throw a warning when user set ``mixed_indextyes = True``
-        if key == Options.allow_mixed_itypes and value:
-            warnings.warn(f"Using ``dios_option[{Options.allow_mixed_itypes}]=True`` is highly experimental, "
-                          f"please do not report any bugs!", OptionsWarning)
-        return super().__setitem__(key, value)
-
 
 # set default values
-dios_options = __OptionsDict()
-dios_options[Options.disp_max_rows] = 10
-dios_options[Options.disp_max_vars] = 4
-dios_options[Options.dios_to_dios_method] = 3
-dios_options[Options.allow_mixed_itypes] = False
-dios_options[Options.allowed_indextypes] = [IdxTypes.datetime, IdxTypes.nunmeric]
-
-
-def check_allowed_itypes(idxtype):
-    if idxtype not in dios_options[Options.allowed_indextypes]:
-        raise RuntimeError(f"The index type `{idxtype}` is not allowed by the "
-                           f"`dios_option[{Options.allowed_indextypes}] = {dios_options[Options.allowed_indextypes]}`")
-
-
-def get_dios_to_dios_keys(keys, other):
-    # we can assume that all keys are exist in self._data
-    method = dios_options[Options.dios_to_dios_method]
-    err_append = "consider changing dios.option['dios_to_dios_method']"
-
-    # assign where possible, otherwise ignore
-    if method == 0:
-        keys = [k for k in keys if k in other.columns]
-
-    # at least one key must be in self
-    elif method == 1:
-        keys = [k for k in keys if k in other.columns]
-        if not keys:
-            raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append)
-
-    # all keys must be in self, but more keys could exist in other,
-    # eg. ``dios['a'] = dios[['a','b']]`` will update column-a but not column-b
-    # eg. ``dios[['a','b']] = dios['a']`` will fail
-    elif method == 2:
-        fail = [k for k in keys if k not in other.columns]
-        if fail:
-            raise KeyError(f"{fail} are missing in the destiny-dios, " + err_append)
-
-    # keys in both dios's must be equal
-    elif method == 3:
-        fail = set(keys).symmetric_difference(set(other.columns))
-        if fail:
-            raise KeyError(f"{fail} is not in both of src- and dest-dios, " + err_append)
-
-    else:
-        raise OptionsError(f"{method} is an invalid value for dios.option[dios_to_dios]")
-
-    return keys
+dios_options = {
+    Options.disp_max_rows: 10,
+    Options.disp_max_vars: 4,
+    Options.dios_to_dios_method: 3,
+}
-- 
GitLab