From 680ea4a8156e8c9bee14f8375e5df48c329eb938 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Thu, 13 Feb 2020 01:08:23 +0100 Subject: [PATCH] itype done --- dios/dios.py | 121 +++++++++++++++++++++++++++--------------------- dios/itypes.py | 39 +++++++++++----- dios/lib.py | 43 +++++++++++++++-- dios/options.py | 73 ++--------------------------- 4 files changed, 141 insertions(+), 135 deletions(-) diff --git a/dios/dios.py b/dios/dios.py index c2528bf..79218f4 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -1,9 +1,7 @@ from dios.lib import * from dios.options import * import pandas as pd -import numpy as np import operator as op -import datetime as dt from collections import OrderedDict from pandas.core.dtypes.common import ( @@ -13,7 +11,6 @@ from pandas.core.dtypes.common import ( is_dict_like, ) from pandas.core.dtypes.common import is_iterator as _is_iterator -from pandas.core.indexing import need_slice def is_iterator(obj): @@ -51,7 +48,7 @@ class DictOfSeries: Todos: ----- - todo: allow any hashable obj as column identifier + todo: to_discuss!! allow any hashable obj as column identifier Currently we only allow strings as identifier, to be more df-like we should allow any hashable object (unlike df we may should exclude stuff like: ``None`` or ``np.nan`` ??) @@ -64,20 +61,24 @@ class DictOfSeries: # We need to keep track of the index-type (itype) of every new Series. # If the itypes differ between different series, slicing will almost always fail # (eg. a datetime-like slice cannot work on a numeric index and vice versa). + # + # May data was given, so we firstly set itype to MixedItype, then insert all data, + # and check/cast the itype afterwards, otherwise __setitem_new() will set the itype, + # which may prevent inserting series with other (higher) itypes. self._itype = MixedItype self.__init_insert_data__(data) - # use property.setter to make necessary checks + # we use the columns.setter to make all necessary checks self.columns = columns - # 1. infer itype - # check with given -> fine - # check with given -> cast -> fine - # check with given -> cast -> err out - # given None: - # is unique -> fine - # not unique -> err out + # infer the itype by the data + inferred_itype = self.__find_least_common_itype() + itype = inferred_itype if itype is None else get_itype(itype) + + # We use the itype.setter to make all checks. If the given itype was of a lower type + # than the inferred itype, a cast is tried on every series. + self.itype = itype def __init_insert_data__(self, data): if data is None: @@ -98,6 +99,38 @@ class DictOfSeries: if is_list_like(data): self['0'] = data + def __find_least_common_itype(self): + + def all_itypes_le(itypes, super_itype): + for itype in itypes: + if itype_le(itype, super_itype): + continue + return False + return True + + itypes = [] + for k in self.columns: + itypes.append(get_itype(self._data[k].index)) + + found = None + + # check supertypes + super_itypes = [MixedItype, NumericItype] + for super_itype in super_itypes: + if all_itypes_le(itypes, super_itype): + found = super_itype + continue + break + assert found, "At least this should be MixedItype" + + # check base types + single_itypes = [DatetimeItype, IntegerItype, FloatItype] + for single_itype in single_itypes: + if all_itypes_le(itypes, single_itype): + found = single_itypes + break + return found + @property def columns(self): return list(self._data.keys()) @@ -126,9 +159,24 @@ class DictOfSeries: @itype.setter def itype(self, itype_like): - if is_itype_subtype(self._itype, itype_like): - self._itype = itype_like - raise NotImplementedError("futur throw `mixed` warning") + itype = get_itype(itype_like) + + if not is_itype_subtype(self._itype, itype): + # try to cast all series to the new itype + self.__cast_all(itype) + + self._itype = itype + + if not itype.unique: + throw(f"Using a {itype} as dios.itype is experimental. As soon as series with different index types " + f"are inserted, slicing will almost always fail. You are hereby warned!", ItypeWarning) + + def __cast_all(self, itype): + for k in self.columns: + casted = cast_to_fit_itype(self._data[k].copy(), itype) + if casted is None: + raise ItypeCastError(f"Cast series indicees to the given itype failed for series in column {k}.") + self._data[k] = casted def _check_keys(self, keys): missing = [k for k in keys if k not in self.columns] @@ -348,43 +396,6 @@ class DictOfSeries: def __delitem__(self, key): del self._data[key] - self.__set_mixed_itype_from_all_keys() - - def __set_mixed_itype_from_all_keys(self): - """ If the itype of dios is ``mixed`` and the itype of any stored - Series change, we need to check the itype of all other Series, to - validate the dios-wide itype.""" - - if len(self) == 0: - self._itype = None - return - - if len(self) == 1: - self._itype = get_itype(self.squeeze().index) - return - - # ``mixed`` isn't allowed in general, so we're done - if not dios_options[Options.allow_mixed_itypes]: - return - - # itype wasn't ``mixed``, so we're done - if self._itype != IdxTypes.mixed: - return - - # check all types - types = set() - for k in self._data.keys(): - idx = self._data[k].index - types.add(get_itype(idx)) - - # If we have at least two different - # itypes, ``mixed`` still apply. - if len(types) > 1: - return - - # index is of a single new type - self._itype = types.pop() - return def __copy__(self): return self.copy(deep=True) @@ -565,6 +576,8 @@ class _LocIndexer(_Indexer): # list_like -> check length for c in cols: self._data[c].loc[rkey] = value + # todo loc.__setitem__(self, key, value): + raise NotImplementedError def _unpack_key(self, key): # if we have a tuple, we have a rows- and a column-indexer @@ -618,6 +631,10 @@ class _iLocIndexer(_Indexer): new[c] = self._data[c].iloc[rkey] return new + def __setitem__(self, key, value): + # todo iloc.__setitem__(self, key, value): + raise NotImplementedError + def _unpack_key(self, key): # if we have a tuple, we have a rows- and a column-indexer # if not, we only have a row-indexer and work on all columns diff --git a/dios/itypes.py b/dios/itypes.py index 1a135e6..58647bb 100644 --- a/dios/itypes.py +++ b/dios/itypes.py @@ -1,6 +1,18 @@ import pandas as pd +class ItypeWarning(RuntimeWarning): + pass + + +class ItypeCastWarning(ItypeWarning): + pass + + +class ItypeCastError(RuntimeError): + pass + + class __Itype: def __init__(self): raise RuntimeError("DatetimeItype does not allow instances of itself.") @@ -10,21 +22,18 @@ class DatetimeItype(__Itype): name = 'datetime' unique = True subtypes = (pd.DatetimeIndex,) - cast_to = ... class IntegerItype(__Itype): name = 'integer' unique = True - subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index,) - cast_to = int + subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int) class FloatItype(__Itype): name = 'float' - subtypes = (pd.Float64Index,) + subtypes = (pd.Float64Index, float) unique = True - cast_to = float # class MultiItype(__Itype): @@ -76,11 +85,6 @@ def is_itype_like(obj, itype): return is_itype(obj, itype) or is_itype_subtype(obj, itype) -def get_minimal_itype(obj): - """ alias for get_itype(), see there for more info""" - return get_itype(obj) - - def get_itype(obj): """ Return the according Itype, by any of any possible user input, like @@ -95,7 +99,7 @@ def get_itype(obj): return obj # check if it is the actual type, not a subtype - types = [DatetimeItype, IntegerItype, FloatItype, OtherItype, NumericItype, MixedItype] + types = [DatetimeItype, IntegerItype, FloatItype, NumericItype, MixedItype] for t in types: if is_itype(obj, t): return t @@ -111,6 +115,18 @@ def get_itype(obj): raise ValueError(f"{obj} is not a itype, nor any known subtype of a itype, nor a itype string alias") +def itype_eq(a, b): + return is_itype(a, b) + + +def itype_lt(a, b): + return is_itype_subtype(a, b) + + +def itype_le(a, b): + return is_itype_like(a, b) + + def cast_to_fit_itype(series, itype): """ Cast a series (more explicit the type of the index) to fit the itype of a dios. @@ -147,4 +163,3 @@ def cast_to_fit_itype(series, itype): return None return None - diff --git a/dios/lib.py b/dios/lib.py index 71426c7..55475fc 100644 --- a/dios/lib.py +++ b/dios/lib.py @@ -1,5 +1,6 @@ -import pandas as pd from dios.itypes import * +from dios.options import * +import pandas as pd import warnings @@ -7,5 +8,41 @@ def _get_storage_class_values(cls): return [getattr(cls, c) for c in cls.__dict__ if not c.startswith("_")] -class CastWarning(RuntimeWarning): - pass +def throw(msg, wtype): + warnings.warn(msg, wtype) + + +# todo: make method an kwarg and remove dios_options access +def get_dios_to_dios_keys(keys, other): + # we can assume that all keys are exist in self._data + method = dios_options[Options.dios_to_dios_method] + err_append = "consider changing dios.option['dios_to_dios_method']" + + # assign where possible, otherwise ignore + if method == 0: + keys = [k for k in keys if k in other.columns] + + # at least one key must be in self + elif method == 1: + keys = [k for k in keys if k in other.columns] + if not keys: + raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append) + + # all keys must be in self, but more keys could exist in other, + # eg. ``dios['a'] = dios[['a','b']]`` will update column-a but not column-b + # eg. ``dios[['a','b']] = dios['a']`` will fail + elif method == 2: + fail = [k for k in keys if k not in other.columns] + if fail: + raise KeyError(f"{fail} are missing in the destiny-dios, " + err_append) + + # keys in both dios's must be equal + elif method == 3: + fail = set(keys).symmetric_difference(set(other.columns)) + if fail: + raise KeyError(f"{fail} is not in both of src- and dest-dios, " + err_append) + + else: + raise OptionsError(f"{method} is an invalid value for dios.option[dios_to_dios]") + + return keys diff --git a/dios/options.py b/dios/options.py index 260e7fd..7acdb12 100644 --- a/dios/options.py +++ b/dios/options.py @@ -1,5 +1,3 @@ -import warnings - class OptionsWarning(UserWarning): pass @@ -25,71 +23,10 @@ class Options: otherwise its the same than creating a new dios)""" dios_to_dios_method = "dios_to_dios_method" - """ - If we have different types of indexes in the dios, slicing will almost always fail. - It is because, eg. a numeric slice cannot work on a pd.DatetimeIndex and vice versa. - To set this to True is highly experimental, any arising issues or errors should be - handled by the user.""" - allow_mixed_itypes = "allow_mixed_itypes" - - allowed_indextypes = "allowed_indextypes" - - -class __OptionsDict(dict): - """ Simple dict that throw a warning, if a special value is inserted at a special key""" - def __setitem__(self, key, value): - # throw a warning when user set ``mixed_indextyes = True`` - if key == Options.allow_mixed_itypes and value: - warnings.warn(f"Using ``dios_option[{Options.allow_mixed_itypes}]=True`` is highly experimental, " - f"please do not report any bugs!", OptionsWarning) - return super().__setitem__(key, value) - # set default values -dios_options = __OptionsDict() -dios_options[Options.disp_max_rows] = 10 -dios_options[Options.disp_max_vars] = 4 -dios_options[Options.dios_to_dios_method] = 3 -dios_options[Options.allow_mixed_itypes] = False -dios_options[Options.allowed_indextypes] = [IdxTypes.datetime, IdxTypes.nunmeric] - - -def check_allowed_itypes(idxtype): - if idxtype not in dios_options[Options.allowed_indextypes]: - raise RuntimeError(f"The index type `{idxtype}` is not allowed by the " - f"`dios_option[{Options.allowed_indextypes}] = {dios_options[Options.allowed_indextypes]}`") - - -def get_dios_to_dios_keys(keys, other): - # we can assume that all keys are exist in self._data - method = dios_options[Options.dios_to_dios_method] - err_append = "consider changing dios.option['dios_to_dios_method']" - - # assign where possible, otherwise ignore - if method == 0: - keys = [k for k in keys if k in other.columns] - - # at least one key must be in self - elif method == 1: - keys = [k for k in keys if k in other.columns] - if not keys: - raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append) - - # all keys must be in self, but more keys could exist in other, - # eg. ``dios['a'] = dios[['a','b']]`` will update column-a but not column-b - # eg. ``dios[['a','b']] = dios['a']`` will fail - elif method == 2: - fail = [k for k in keys if k not in other.columns] - if fail: - raise KeyError(f"{fail} are missing in the destiny-dios, " + err_append) - - # keys in both dios's must be equal - elif method == 3: - fail = set(keys).symmetric_difference(set(other.columns)) - if fail: - raise KeyError(f"{fail} is not in both of src- and dest-dios, " + err_append) - - else: - raise OptionsError(f"{method} is an invalid value for dios.option[dios_to_dios]") - - return keys +dios_options = { + Options.disp_max_rows: 10, + Options.disp_max_vars: 4, + Options.dios_to_dios_method: 3, +} -- GitLab