From 4f62bda29d49b86f70a19c06c0d9c5905182c41c Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Wed, 12 Feb 2020 22:33:54 +0100 Subject: [PATCH] added cast itype --- dios/dios.py | 83 ++++++++++++++++++++++++++++++++----------------- dios/itypes.py | 71 +++++++++++++++++++++++++++++++++--------- dios/lib.py | 6 ++++ dios/options.py | 1 - 4 files changed, 116 insertions(+), 45 deletions(-) diff --git a/dios/dios.py b/dios/dios.py index 930f1f5..c2528bf 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -10,6 +10,7 @@ from pandas.core.dtypes.common import ( is_list_like, is_scalar, is_integer, + is_dict_like, ) from pandas.core.dtypes.common import is_iterator as _is_iterator from pandas.core.indexing import need_slice @@ -56,20 +57,46 @@ class DictOfSeries: """ - def __init__(self, itype=None, **kwargs): + def __init__(self, data=None, itype=None, columns=None): + self._data = OrderedDict() # We need to keep track of the index-type (itype) of every new Series. # If the itypes differ between different series, slicing will almost always fail # (eg. a datetime-like slice cannot work on a numeric index and vice versa). - if itype is not None: - itype = get_itype(itype) - check_allowed_itypes(itype) - self._itype = itype + self._itype = MixedItype + + self.__init_insert_data__(data) + + # use property.setter to make necessary checks + self.columns = columns + + # 1. infer itype + # check with given -> fine + # check with given -> cast -> fine + # check with given -> cast -> err out + # given None: + # is unique -> fine + # not unique -> err out + + def __init_insert_data__(self, data): + if data is None: + return + + if isinstance(data, DictOfSeries): + for k in data: + self[k] = data[k] + + if is_iterator(data): + data = list(data) - # fill initial given values in the dios - for kw in kwargs: - self[kw] = kwargs[kw] + if is_dict_like(data): + for k in data: + self[k] = data[k] + + # take care: dict's also list-like + if is_list_like(data): + self['0'] = data @property def columns(self): @@ -78,10 +105,10 @@ class DictOfSeries: @columns.setter def columns(self, new): if not isinstance(new, list): - raise NotImplementedError("Only lists supported so far") + raise TypeError("column names must be given as a list") if len(set(new)) != len(new): - raise ValueError("Names must be unique") + raise ValueError("column names must be unique") if len(new) != len(self.columns): raise ValueError(f"Length mismatch: Columns has {len(self.columns)} elements, " @@ -97,22 +124,11 @@ class DictOfSeries: def itype(self): return self._itype - def _set_itype(self, idx): - """ Set itype of dios. - - Note: If ``self._itype`` and ``idx`` are of the same type, - ``self._itype`` stays unchanged. - """ - idx = get_itype(idx) - check_allowed_itypes(idx) - - if self._itype is None: - self._itype = idx - elif self._itype != idx: - if dios_options[Options.allow_mixed_itypes]: - self._itype = IdxTypes.mixed - else: - raise ValueError(f"Only objects which have a index of type `{self._itype}` can be inserted.") + @itype.setter + def itype(self, itype_like): + if is_itype_subtype(self._itype, itype_like): + self._itype = itype_like + raise NotImplementedError("futur throw `mixed` warning") def _check_keys(self, keys): missing = [k for k in keys if k not in self.columns] @@ -227,7 +243,16 @@ class DictOfSeries: if not isinstance(v, pd.Series): raise ValueError(f"Only pd.Series and DictOfSeries (of length 1) can be assigned new") - self._set_itype(v.index) + if self._itype is None: + # if the user created a empty dios or + # the last emelent was deleted + self._itype = get_itype(v.index) + + v = cast_to_fit_itype(v, self._itype) + if v is None: + itype = get_itype(v.index) + raise ValueError(f"Itype mismach. Data of key `{key}`, with (infered) itype `{itype}` " + f"cannot be inserted in a dios with itype `{self.itype}`.") self._data[key] = v.copy(deep=True) def _setitem(self, key, val, sl=None): @@ -370,8 +395,8 @@ class DictOfSeries: def copy(self, deep=True): new = DictOfSeries() new._itype = self.itype - # We use `_data` here because all checks have already been done. - # So this should be much faster, especially because we use the underlying dict for + # We use `_data` here, because all checks are already done. + # So this should be much faster, especially, because we use the underlying dict for # getting and setting the values, instead of ``__setitem__`` and ``__getitem__``. # Note: don't use same approach elsewhere, unless you're very sure what you do. for k in self._data: diff --git a/dios/itypes.py b/dios/itypes.py index e31bd3a..1a135e6 100644 --- a/dios/itypes.py +++ b/dios/itypes.py @@ -10,24 +10,21 @@ class DatetimeItype(__Itype): name = 'datetime' unique = True subtypes = (pd.DatetimeIndex,) + cast_to = ... class IntegerItype(__Itype): name = 'integer' unique = True subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index,) + cast_to = int class FloatItype(__Itype): name = 'float' subtypes = (pd.Float64Index,) unique = True - - -class OtherItype(__Itype): - name = "other" - subtypes = (pd.CategoricalIndex, pd.IntervalIndex, pd.PeriodIndex,) - unique = True + cast_to = float # class MultiItype(__Itype): @@ -38,18 +35,17 @@ class OtherItype(__Itype): class NumericItype(__Itype): name = "numeric" - subtypes = (IntegerItype.subtypes + FloatItype.subtypes) + _subitypes = (IntegerItype, FloatItype) + subtypes = (_subitypes + IntegerItype.subtypes + FloatItype.subtypes) unique = False -class MixedItype (__Itype): +class MixedItype(__Itype): name = "mixed" unique = False - subtypes = (DatetimeItype.subtypes + - NumericItype.subtypes + - OtherItype.subtypes + - # pd.MultiIndex, not supported - ()) + _subitypes = (DatetimeItype, IntegerItype, FloatItype, NumericItype) + _otheritypes = (pd.CategoricalIndex, pd.IntervalIndex, pd.PeriodIndex, pd.TimedeltaIndex) + subtypes = (_subitypes + _otheritypes + DatetimeItype.subtypes + NumericItype.subtypes) def is_itype(obj, itype): @@ -57,6 +53,7 @@ def is_itype(obj, itype): # user gave a Itype, like ``DatetimeItype`` if issubclass(obj, itype): return True + # todo: iter through itype as it could be a tuple, if called like ``is_itype(o, (t1,t2))`` # user gave a string, like 'datetime' if isinstance(obj, str) and obj == itype.name: return True @@ -79,6 +76,11 @@ def is_itype_like(obj, itype): return is_itype(obj, itype) or is_itype_subtype(obj, itype) +def get_minimal_itype(obj): + """ alias for get_itype(), see there for more info""" + return get_itype(obj) + + def get_itype(obj): """ Return the according Itype, by any of any possible user input, like @@ -93,17 +95,56 @@ def get_itype(obj): return obj # check if it is the actual type, not a subtype - types = [DatetimeItype, NumericItype, IntegerItype, FloatItype, OtherItype, MixedItype] + types = [DatetimeItype, IntegerItype, FloatItype, OtherItype, NumericItype, MixedItype] for t in types: if is_itype(obj, t): return t # If the above failed, we try to infer the itype by its subtypes. # We just check the unique types, because the non-unique are just - # collections of unique subtypes. + # collections of unique subtypes, and would have be detected by any + # of the upper if-statements for t in types: if is_itype_subtype(obj, t) and t.unique: return t raise ValueError(f"{obj} is not a itype, nor any known subtype of a itype, nor a itype string alias") + +def cast_to_fit_itype(series, itype): + """ Cast a series (more explicit the type of the index) to fit the itype of a dios. + + Return the casted series if successful, None otherwise. + + Note: + This is very basic number-casting, so in most cases, information from + the old index will be lost after the cast. + """ + series.itype = get_itype(series.index) + + # up-cast issn't necessary because a dios with a higher + # itype always can take lower itypes + # dt -> dt -> mixed + # int -> int -> num -> mixed + # float -> float -> num -> mixed + # num -> num -> mixed + # mixed -> mixed + if is_itype_subtype(series.itype, itype): + return series + + # any (dt/float/num/mixed) -> int/num OK + if is_itype(itype, IntegerItype) or is_itype(itype, NumericItype): + series.index = pd.RangeIndex(len(series)) + return series + + # any (dt/int/num/mixed) -> float OK + if is_itype(itype, FloatItype): + series.index = pd.Float64Index(range(len(series))) + return series + + # any (int/float/num/mixed) -> dt FAIL + if is_itype(itype, DatetimeItype): + return None + + return None + diff --git a/dios/lib.py b/dios/lib.py index d2f4c7a..71426c7 100644 --- a/dios/lib.py +++ b/dios/lib.py @@ -1,5 +1,11 @@ import pandas as pd +from dios.itypes import * +import warnings def _get_storage_class_values(cls): return [getattr(cls, c) for c in cls.__dict__ if not c.startswith("_")] + + +class CastWarning(RuntimeWarning): + pass diff --git a/dios/options.py b/dios/options.py index f4cf7bd..260e7fd 100644 --- a/dios/options.py +++ b/dios/options.py @@ -1,4 +1,3 @@ -from dios.itypes import IdxTypes import warnings -- GitLab