From c863d8a16220375c1a85c7a22b39c913148e2a40 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Fri, 14 Feb 2020 18:37:38 +0100 Subject: [PATCH] working --- dios/__init__.py | 5 ++ dios/dios.py | 191 ++++++++++++++++++++--------------------------- dios/errors.py | 24 ++++++ dios/itypes.py | 73 +++++++++++------- dios/lib.py | 52 +------------ dios/options.py | 45 +++++++++-- 6 files changed, 196 insertions(+), 194 deletions(-) create mode 100644 dios/errors.py diff --git a/dios/__init__.py b/dios/__init__.py index 34cb18b..4b13497 100644 --- a/dios/__init__.py +++ b/dios/__init__.py @@ -1,5 +1,10 @@ +# low level +from dios.errors import * from dios.lib import * from dios.options import * + +# high level +from dios.itypes import * from dios.dios import * diff --git a/dios/dios.py b/dios/dios.py index 2e3701a..b453add 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -1,5 +1,7 @@ from dios.lib import * from dios.options import * +from dios.itypes import * +from dios.errors import * import pandas as pd import numpy as np import operator as op @@ -57,24 +59,21 @@ class DictOfSeries: """ - def __init__(self, data=None, columns=None, itype=None, downcast_policy='lossless'): + def __init__(self, data=None, columns=None, itype=MixedItype, downcast_policy='lossless'): self._data = OrderedDict() # We need to keep track of the index-type (itype) of every new Series. # If the itypes differ between different series, slicing will almost always fail # (eg. a datetime-like slice cannot work on a numeric index and vice versa). - # - # May data was given, so we firstly set itype to MixedItype, then insert all data, - # and check/cast the itype afterwards, otherwise __setitem_new() will set the itype, - # which may prevent inserting series with other (higher) itypes. + self._itype = None + with reraise("param itype: "): - self._itype = get_itype(itype) + self.itype = get_itype(itype) - policies = ['force', 'lossless', 'never'] - if downcast_policy not in policies: - raise ValueError(f"downcast_policy must be one of {policies}") - self._downcast_policy = downcast_policy + if downcast_policy not in CAST_POLICIES: + raise ValueError(f"downcast_policy must be one of {CAST_POLICIES}") + self._policy = downcast_policy if data is not None: self.__init_insert_data__(data) @@ -83,76 +82,23 @@ class DictOfSeries: if columns is not None: self.columns = columns - # infer the itype by the data - inferred_itype = self.__find_least_common_itype() - itype = inferred_itype if itype is None else get_itype(itype) - - # We use the itype.setter to make all checks. If the given itype was of a lower type - # than the inferred itype, a cast is tried on every series. - if itype is not None: - self.itype = itype - - # user created a empty dios: data=None(->inferred=None), itype=None - else: - self._itype = None - def __init_insert_data__(self, data): if isinstance(data, DictOfSeries): for k in data: self[k] = data[k] - - if is_iterator(data): - data = list(data) - - if is_dict_like(data): - for k in data: - self[k] = data[k] - return - - # take care: dict's also list-like - if is_nested_list_like(data): - for i, d in enumerate(data): - self[str(i)] = d - return - - if is_list_like(data): - self['0'] = data - return - - def __find_least_common_itype(self): - - def all_itypes_le(itypes, super_itype): - for itype in itypes: - if itype_le(itype, super_itype): - continue - return False - return True - - itypes = [] - for k in self.columns: - itypes.append(get_itype(self._data[k].index)) - - if not itypes: - return None - - found = None - - # check supertypes - super_itypes = [MixedItype, NumericItype] - for super_itype in super_itypes: - if all_itypes_le(itypes, super_itype): - found = super_itype - continue - break - assert found, "At least this should be MixedItype" - - # check base types - single_itypes = [DatetimeItype, IntegerItype, FloatItype] - for single_itype in single_itypes: - if all_itypes_le(itypes, single_itype): - found = single_itype - break - return found + else: + if is_iterator(data): + data = list(data) + + if is_dict_like(data): + for k in data: + self[k] = data[k] + elif is_nested_list_like(data): + for i, d in enumerate(data): + self[str(i)] = d + elif is_list_like(data): + self['0'] = data + return @property def columns(self): @@ -178,6 +124,7 @@ class DictOfSeries: @property def values(self): + # will make all series same length, inset nan's return to_object_array(self._data.values()).transpose() @property @@ -192,22 +139,20 @@ class DictOfSeries: def itype(self, itype_like): itype = get_itype(itype_like) - if not is_itype_subtype(self._itype, itype): - # try to cast all series to the new itype + if not itype_le(self._itype, itype): self.__cast_all(itype) self._itype = itype if not itype.unique: - throwMixedItypeErrWarn(f"Using a {itype} as dios.itype is experimental. As soon as series with " - f"different index types are inserted, slicing will almost always fail. " - f"You are hereby warned!") + throw_MixedItype_err_or_warn(f"Using a {itype} as dios.itype is experimental. As soon as series with " + f"different index types are inserted, slicing will almost always fail. " + f"You are hereby warned!") def __cast_all(self, itype): for k in self.columns: - casted = cast_to_fit_itype(self._data[k].copy(), itype, policy=self._downcast_policy) - if casted is None: - raise ItypeCastError(f"Cast series indicees to the given itype failed for series in column {k}.") + with reraise(f"Column {k}: "): + casted = cast_to_itype(self._data[k], itype, policy=self._policy) self._data[k] = casted def _check_keys(self, keys): @@ -237,7 +182,7 @@ class DictOfSeries: if isinstance(key, slice): return self._slice(self.columns, key) - if is_list_like(key): + if is_list_like(key) and not is_nested_list_like(key): self._check_keys(key) return self._getitem_listlike(key) @@ -250,14 +195,14 @@ class DictOfSeries: def _getitem_listlike(self, keys): new = self.copy_empty() for k in keys: - new[k] = self._get_item(k) + new._data[k] = self._get_item(k) return new def _slice(self, keys, slicer): """ Return a slice of self""" new = self.copy_empty() for k in keys: - new[k] = self._get_item(k)[slicer] + new._data[k] = self._get_item(k)[slicer] return new def __setitem__(self, key, value): @@ -310,30 +255,15 @@ class DictOfSeries: def _setitem_new(self, key, value, bypass_checks=False): v = value + if isinstance(v, DictOfSeries): + v = v.squeeze() + elif is_list_like(v) and not is_nested_list_like(v): + v = pd.Series(v) - # if the checks was already done, we skip them here, - # also the Errormessage wouldn't fully apply. - if not bypass_checks: - if isinstance(v, DictOfSeries): - v = v.squeeze() - - elif is_list_like(v): - v = pd.Series(v) # upcast - - if not isinstance(v, pd.Series): - raise ValueError(f"Only pd.Series and DictOfSeries (of length 1) can be assigned new") - - itype = get_itype(v.index) - - if self._itype is None: - # if the user created a empty dios - self._itype = itype - - v = cast_to_fit_itype(v, self._itype, policy=self._downcast_policy) - if v is None: - raise ValueError(f"Itype mismach. Policy `{self._downcast_policy}` forbid to down-cast" - f"itype `{itype}` to itype `{self.itype}`. key: {key}") + if not isinstance(v, pd.Series): + raise ValueError(f"Only pd.Series can be inserted directly") + v = cast_to_itype(v, self._itype, policy=self._policy) self._data[key] = v.copy(deep=True) def _setitem(self, key, val, sl=None): @@ -341,7 +271,8 @@ class DictOfSeries: # series, dios['a'] = series if isinstance(val, pd.Series) and sl is None: - self._setitem_new(key, val, bypass_checks=True) + val = cast_to_itype(val, self._itype, policy=self._policy) + self._data[key] = val.copy(deep=True) return sl = sl or slice(None) @@ -365,7 +296,8 @@ class DictOfSeries: return def _setitem_dios(self, keys, slicer, other): - keys = get_dios_to_dios_keys(keys, other) + method = dios_options[Options.dios_to_dios_method] + keys = get_dios_to_dios_keys(keys, other, method) for k in keys: self._setitem(k, other[k], slicer) @@ -471,7 +403,8 @@ class DictOfSeries: def __op2__(self, other, op): new = self.copy_empty() if isinstance(other, DictOfSeries): - keys = get_dios_to_dios_keys(self.columns, other) + method = dios_options[Options.dios_to_dios_method] + keys = get_dios_to_dios_keys(self.columns, other, method) for k in keys: new[k] = op(self[k], other[k]) else: @@ -609,6 +542,40 @@ class DictOfSeries: return None return news.squeeze() + # def __find_least_common_itype(self): + # def all_itypes_le(itypes, super_itype): + # for itype in itypes: + # if itype_le(itype, super_itype): + # continue + # return False + # return True + # + # itypes = [] + # for k in self.columns: + # itypes.append(get_itype(self._data[k].index)) + # + # if not itypes: + # return None + # + # found = None + # + # # check supertypes + # super_itypes = [MixedItype, NumericItype] + # for super_itype in super_itypes: + # if all_itypes_le(itypes, super_itype): + # found = super_itype + # continue + # break + # assert found, "At least this should be MixedItype" + # + # # check base types + # single_itypes = [DatetimeItype, IntegerItype, FloatItype] + # for single_itype in single_itypes: + # if all_itypes_le(itypes, single_itype): + # found = single_itype + # break + # return found + # class _Indexer: def __init__(self, _dios): diff --git a/dios/errors.py b/dios/errors.py new file mode 100644 index 0000000..9df116f --- /dev/null +++ b/dios/errors.py @@ -0,0 +1,24 @@ +import warnings +# do not import dios-stuff here + + +class ItypeWarning(RuntimeWarning): + pass + + +class ItypeCastWarning(ItypeWarning): + pass + + +class ItypeCastError(RuntimeError): + pass + + +class OptionsWarning(UserWarning): + pass + + +class OptionsError(RuntimeError): + pass + + diff --git a/dios/itypes.py b/dios/itypes.py index 5ce9710..e74d19a 100644 --- a/dios/itypes.py +++ b/dios/itypes.py @@ -1,16 +1,16 @@ import pandas as pd +from dios.options import * +from dios.lib import * +from dios.errors import * -class ItypeWarning(RuntimeWarning): - pass +class CastPolicy: + force = 'force' + lossless = 'lossless' + never = 'never' -class ItypeCastWarning(ItypeWarning): - pass - - -class ItypeCastError(RuntimeError): - pass +CAST_POLICIES = get_storage_class_values(CastPolicy) class __Itype: @@ -133,7 +133,7 @@ def itype_le(a, b): return is_itype_like(a, b) -def cast_to_fit_itype(series, itype, policy='force'): +def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False): """ Cast a series (more explicit the type of the index) to fit the itype of a dios. Return the casted series if successful, None otherwise. @@ -142,6 +142,13 @@ def cast_to_fit_itype(series, itype, policy='force'): This is very basic number-casting, so in most cases, information from the old index will be lost after the cast. """ + + if policy not in CAST_POLICIES: + raise ValueError(f"policy={policy}") + if err not in ['raise', 'ignore']: + raise ValueError(f"err={err}") + if not inplace: + series = series.copy() series.itype = get_itype(series.index) # up-cast issn't necessary because a dios with a higher @@ -154,37 +161,53 @@ def cast_to_fit_itype(series, itype, policy='force'): if itype_le(series.itype, itype): # a <= b return series - if policy in ['forbid', 'no-downcast', 'no-cast', 'never']: - return None + e = f"A series index of type `{type(series.index)}` cannot be casted to Itype {itype.name}" - elif policy == 'force': - # any (dt/float/mixed) -> int, always OK - # any (dt/float/mixed) -> num(int), always OK - # any (dt/int/mixed) -> float, always OK - # any (int/float/mixed) -> dt, always FAIL + # cast any -> dt always fail. + if is_itype(itype, DatetimeItype): + pass + else: + e += f", as forbidden by the cast-policy `{policy}`." + + if policy == CAST_POLICIES[CastPolicy.never]: + pass + + elif policy == CAST_POLICIES[CastPolicy.force]: + # cast any (dt/float/mixed) -> int + # cast any (dt/float/mixed) -> num if is_itype(itype, IntegerItype) or is_itype(itype, NumericItype): # a == b or a == c series.index = pd.RangeIndex(len(series)) return series + # cast any (dt/int/mixed) -> float if is_itype(itype, FloatItype): # a == b series.index = pd.Float64Index(range(len(series))) return series - if is_itype(itype, DatetimeItype): # a == b - return None - return None - elif policy == 'lossless': - # int -> float, always OK - # float -> int, maybe if unique - # mixed -> any, always FAIL - # dt -> any, always FAIL + elif policy == CAST_POLICIES[CastPolicy.lossless]: + # cast int -> float if is_itype(itype, IntegerItype) and is_itype(series.itype, FloatItype): # a == b and c == d series.index = series.index.astype(float) return series + # cast float -> int, maybe if unique if is_itype(itype, FloatItype) and is_itype(series.itype, IntegerItype): # a == b and c == d series.index = series.index.astype(int) if series.index.is_unique: return series + e = f"The cast with policy {policy} from series index type `{type(series.index)}` to " \ + f"itype {itype.name} resulted in a non-unique index." + # cast mixed -> int/float always fail + + if err == 'raise': + raise ItypeCastError(e) + else: return None - raise ValueError(f"Unknown policy `{policy}`.") +def throw_MixedItype_err_or_warn(msg): + if dios_options[Options.mixed_itype_policy] in ['ignore', 'silent']: + pass + elif dios_options[Options.mixed_itype_policy] in ['error', 'err']: + raise ItypeCastError(msg) + else: + warnings.warn(msg, ItypeWarning) + return diff --git a/dios/lib.py b/dios/lib.py index 27df4f4..e52ffc3 100644 --- a/dios/lib.py +++ b/dios/lib.py @@ -1,9 +1,7 @@ -from dios.itypes import * -from dios.options import * - import pandas as pd -import warnings +import numpy as np import contextlib +# do not import dios-stuff here @contextlib.contextmanager @@ -14,51 +12,7 @@ def reraise(prefix="", postfix=""): raise type(e)(prefix + str(e) + postfix) from e -def _get_storage_class_values(cls): +def get_storage_class_values(cls): return [getattr(cls, c) for c in cls.__dict__ if not c.startswith("_")] -def throwMixedItypeErrWarn(msg): - if dios_options[Options.mixed_itype_policy] in ['ignore', 'silent']: - pass - elif dios_options[Options.mixed_itype_policy] in ['error', 'err']: - raise ItypeCastError(msg) - else: - warnings.warn(msg, ItypeWarning) - return - - -# todo: make method an kwarg and remove dios_options access -def get_dios_to_dios_keys(keys, other): - # we can assume that all keys are exist in self._data - method = dios_options[Options.dios_to_dios_method] - err_append = "consider changing dios.option['dios_to_dios_method']" - - # assign where possible, otherwise ignore - if method == 0: - keys = [k for k in keys if k in other.columns] - - # at least one key must be in self - elif method == 1: - keys = [k for k in keys if k in other.columns] - if not keys: - raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append) - - # all keys must be in self, but more keys could exist in other, - # eg. ``dios['a'] = dios[['a','b']]`` will update column-a but not column-b - # eg. ``dios[['a','b']] = dios['a']`` will fail - elif method == 2: - fail = [k for k in keys if k not in other.columns] - if fail: - raise KeyError(f"{fail} are missing in the destiny-dios, " + err_append) - - # keys in both dios's must be equal - elif method == 3: - fail = set(keys).symmetric_difference(set(other.columns)) - if fail: - raise KeyError(f"{fail} is not in both of src- and dest-dios, " + err_append) - - else: - raise OptionsError(f"{method} is an invalid value for dios.option[dios_to_dios]") - - return keys diff --git a/dios/options.py b/dios/options.py index 1fc8eb4..9350c8e 100644 --- a/dios/options.py +++ b/dios/options.py @@ -1,10 +1,4 @@ - -class OptionsWarning(UserWarning): - pass - - -class OptionsError(RuntimeError): - pass +# do not import dios-stuff here class Options: @@ -18,13 +12,19 @@ class Options: """ 0: accept all 1: accept if at least one keys is is in both DioS - 2: accept if all keys of the src-DioS in the dest-DioS 3: accept if both dios have the exact same keys (makes only sense for assignments with slicer, otherwise its the same than creating a new dios)""" dios_to_dios_method = "dios_to_dios_method" mixed_itype_policy = "mixed_itype_policy" + +class OptionsDiosToDios: + all_must_match = 'all' + at_least_one = 'one' + any_matching = 'any' + + # set default values dios_options = { Options.disp_max_rows: 10, @@ -32,3 +32,32 @@ dios_options = { Options.dios_to_dios_method: 3, Options.mixed_itype_policy: 'warn', } + + +def get_dios_to_dios_keys(keys, other, method): + + err_append = "consider changing dios.option['dios_to_dios_method']" + + if method == OptionsDiosToDios.any_matching: + keys = [k for k in keys if k in other.columns] + + elif method == OptionsDiosToDios.at_least_one: + keys = [k for k in keys if k in other.columns] + if not keys: + raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append) + + # elif method == 2: + # fail = [k for k in keys if k not in other.columns] + # if fail: + # raise KeyError(f"{fail} are missing in the destiny-dios, " + err_append) + + # keys in both dios's must be equal + elif OptionsDiosToDios.all_must_match: + fail = set(keys).symmetric_difference(set(other.columns)) + if fail: + raise KeyError(f"{fail} is not in both of src- and dest-dios, " + err_append) + + else: + raise ValueError(method) + + return keys -- GitLab