From 28c06f478065fa6f6bd4393728d7bf30f04135fd Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Tue, 11 Feb 2020 19:02:42 +0100 Subject: [PATCH] options, lib, keep track of index type --- dios/dios.py | 152 ++++++++++++++++++++++++------------------------ dios/lib.py | 96 ++++++++++++++++++++++++------ dios/options.py | 41 +++++++++++++ 3 files changed, 195 insertions(+), 94 deletions(-) create mode 100644 dios/options.py diff --git a/dios/dios.py b/dios/dios.py index a748a56..7482402 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -14,69 +14,6 @@ from pandas.core.dtypes.common import ( from pandas.core.indexing import need_slice -def item_from_zerodim(key): - # if isinstance(key, DictOfSeries) and len(key) == 1: - # todo what if squeeze return a 1-value-series? squeeze again? - # return key.squeeze() - return pdlib.item_from_zerodim(key) - - -class _LocIndexer: - - def __init__(self, _dios): - self._dios = _dios - # short handles - self._data = _dios._data - self._check_keys = _dios._check_keys - - def __getitem__(self, key): - # if we have a tuple, we have rows and columns - # if not we have only rows and work on all columns - if isinstance(key, tuple): - rkey, ckey, *fail = key - if fail: - raise KeyError("To many indexers") - - # prepare ckey - if is_iterator(ckey): - ckey = list(ckey) - - # determine columns - if isinstance(ckey, str): - self._check_keys([ckey]) - cols = [ckey] - elif isinstance(ckey, slice): - cols = self._col_slice_to_col_list(ckey) - elif is_list_like(ckey): - self._check_keys(ckey) - cols = ckey - else: - raise KeyError(f"Type {type(ckey)} is not supported to select columns.") - else: - cols = self._data.keys() - rkey = key - - # pass the row-key directly to pd.Series.loc[row-key] - new = DictOfSeries() - for c in cols: - new[c] = self._data[c].loc[rkey] - return new - - def _col_slice_to_col_list(self, rslice): - """ see here: - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-slicing-with-labels - """ - keys = list(self._data.keys) - try: - start = keys.index(rslice.start) if rslice.start is not None else None - stop = keys.index(rslice.stop) if rslice.stop is not None else None - except ValueError: - raise KeyError("The slice start label or the slice stop label is not present in the columns.") - if not is_integer(rslice) and rslice > 0: - raise TypeError("The step parameter of the slice must be positive integer.") - return keys[slice(start, stop + 1, rslice.step)] - - class DictOfSeries: """ DictionaryOfSeries is a collection of pd.Series's which aim to be as close as possible similar to @@ -112,10 +49,16 @@ class DictOfSeries: def __init__(self, indextype=None, **kwargs): self._data = OrderedDict() - # We need to keep track if the index type of every Series is the - # same, because if we have different types, it would make slicing - # impossible. - self._indextype = None + # We need to keep track of the type of the index of every new Series. + # If the types differ slicing will almost always fail, because a datetime-like + # slice cannont work on a numeric index and vice versa.. + if indextype is not None: + indextype = get_indextype(indextype) + check_mixed_indextype_option(indextype) + check_allowed_indextypes(indextype) + self._indextype = indextype + + # fill initial given values in the dios for kw in kwargs: self[kw] = kwargs[kw] @@ -146,17 +89,15 @@ class DictOfSeries: return self._indextype def _set_indextype(self, idx): - itype = 'other' - if is_dtIndex_like(idx): - itype = 'datetime' - elif is_numIndex_like(idx): - itype = 'numeric' + """ Set indextype of dios. + + Note: If ``self._indextype`` and ``idx`` are of the same type, + ``self._indextype`` stays unchanged. + """ if self._indextype is None: - self._indextype = itype - return - if self._indextype == itype: - return - self._indextype = 'mixed' + self._indextype = get_indextype(idx) + elif self._indextype != get_indextype(idx): + self._indextype = IdxTypes.mixed def _check_keys(self, keys): missing = [k for k in keys if k not in self.columns] @@ -535,3 +476,60 @@ class DictOfSeries: return None return news.squeeze() + +class _LocIndexer: + + def __init__(self, _dios): + self._dios = _dios + # short handles + self._data = _dios._data + self._check_keys = _dios._check_keys + + def __getitem__(self, key): + # if we have a tuple, we have rows and columns + # if not we have only rows and work on all columns + if isinstance(key, tuple): + rkey, ckey, *fail = key + if fail: + raise KeyError("To many indexers") + + # prepare ckey + if is_iterator(ckey): + ckey = list(ckey) + + # determine columns + if isinstance(ckey, str): + self._check_keys([ckey]) + cols = [ckey] + elif isinstance(ckey, slice): + cols = self._col_slice_to_col_list(ckey) + elif is_list_like(ckey): + self._check_keys(ckey) + cols = ckey + else: + raise KeyError(f"Type {type(ckey)} is not supported to select columns.") + else: + cols = self._data.keys() + rkey = key + + # pass the row-key directly to pd.Series.loc[row-key] + new = DictOfSeries() + for c in cols: + new[c] = self._data[c].loc[rkey] + return new + + def _col_slice_to_col_list(self, rslice): + """ see here: + https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-slicing-with-labels + """ + keys = list(self._data.keys) + try: + start = keys.index(rslice.start) if rslice.start is not None else None + stop = keys.index(rslice.stop) if rslice.stop is not None else None + except ValueError: + raise KeyError("The slice start label or the slice stop label is not present in the columns.") + if not is_integer(rslice) and rslice > 0: + raise TypeError("The step parameter of the slice must be positive integer.") + return keys[slice(start, stop + 1, rslice.step)] + + diff --git a/dios/lib.py b/dios/lib.py index a50950f..9febd21 100644 --- a/dios/lib.py +++ b/dios/lib.py @@ -1,29 +1,91 @@ - import pandas as pd -import pandas._libs.lib as pdlib +import warnings +from dios.options import * + + +def _get_storage_class_values(cls): + return [getattr(cls, c) for c in cls.__dict__ if not c.startswith("_")] -class IdxType: + +class IdxTypes: nunmeric = 'numeric' datetime = 'datetime' mixed = 'mixed' other = 'other' -def is_dtIndex_like(i): - return isinstance(i, pd.DatetimeIndex) +idxtypes = _get_storage_class_values(IdxTypes) + + +def check_mixed_indextype_option(idxtype): + if dios_options[Options.mixed_indextyes]: + warnings.warn(f"Using dios_option[{Options.mixed_indextyes}]=True is highly experimental, " + f"please do not report any bugs!", DiosOptionsWarning) + return + + +def check_allowed_indextypes(idxtype): + if idxtype not in [IdxTypes.nunmeric, IdxTypes.datetime]: + raise ValueError("The index of the given object is not of supported type") + + +def get_indextype(obj): + if _is_dtIndex_like(obj): + return IdxTypes.datetime + + if _is_numIndex_like(obj): + return IdxTypes.nunmeric + + if _is_pdIndex_like(obj): + return IdxTypes.other + + for itype in idxtypes: + if obj == itype: + return itype + + raise ValueError(f"{type(obj)} is not a indextype nor any known subtype of pd.Index") + + +def _is_dtIndex_like(i): + if isinstance(i, pd.DatetimeIndex): + return True + try: + if i == pd.DatetimeIndex: + return True + except TypeError: + return False + + +def _is_numIndex_like(i): + tup = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, pd.Float64Index) + if isinstance(i, tup): + return True + # was a pd.xxxIndex was given + for it in tup: + try: + if it == i: + return True + except TypeError: + pass + return False -def is_numIndex_like(i): - return isinstance(i, (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, pd.Float64Index)) -dios_options = dict( - disp_max_rows=10, - disp_max_vars=4, +def _is_pdIndex_like(i): + """See here: + https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Index.html#pandas.Index + """ + if isinstance(i, pd.Index): + return True + tup = (pd.RangeIndex, pd.CategoricalIndex, pd.MultiIndex, pd.IntervalIndex, + pd.DatetimeIndex, pd.TimedeltaIndex, + pd.PeriodIndex, pd.Int64Index, pd.UInt64Index, pd.Float64Index) + # was a pd.xxxIndex was given + for it in tup: + try: + if it == i: + return True + except TypeError: + pass + return False - # 0: accept all - # 1: accept if at least one keys is is in both DioS - # 2: accept if all keys of the src-DioS in the dest-DioS - # 3: accept if both dios have the exact same keys (makes only sense for assignments with slicer, - # otherwise its the same than creating a new dios) - dios_to_dios_method=3 -) diff --git a/dios/options.py b/dios/options.py new file mode 100644 index 0000000..8104558 --- /dev/null +++ b/dios/options.py @@ -0,0 +1,41 @@ +from dios.lib import IdxTypes + + +class DiosOptionsWarning(UserWarning): + pass + + +class Options: + """storage class for dios options dict keys""" + + """Set the number of rows and variables to display in a call that use + ``__repr__`` or ``__str__`` like e.g. ``print(dios)`` do.""" + disp_max_rows = "disp_max_rows " + disp_max_vars = "disp_max_vars" + + """ + 0: accept all + 1: accept if at least one keys is is in both DioS + 2: accept if all keys of the src-DioS in the dest-DioS + 3: accept if both dios have the exact same keys (makes only sense for assignments with slicer, + otherwise its the same than creating a new dios)""" + dios_to_dios_method = "dios_to_dios_method" + + """ + If we have different types of indexes in the dios, slicing will almost always fail. + It is because, eg. a numeric slice cannot work on a pd.DatetimeIndex and vice versa. + To set this to True is highly experimental, any arising issues or errors should be + handled by the user.""" + mixed_indextyes = "mixed_indextyes" + + allowed_indextypes = "allowed_indextypes" + + +dios_options = { + Options.disp_max_rows : 10, + Options.disp_max_vars: 4, + Options.dios_to_dios_method: 3, + Options.mixed_indextyes: False, + Options.allowed_indextypes: [IdxTypes.datetime, IdxTypes.nunmeric] +} + -- GitLab