diff --git a/dios/dios.py b/dios/dios.py index a88d1beaaf18e2e055003b2fa8ba2dc5189bc9e9..514ae2866d110c46dd5377d5076fa43df96cb06f 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -6,7 +6,7 @@ import pandas as pd import numpy as np import operator as op from functools import partialmethod -from itertools import zip_longest +from itertools import zip_longest, compress, takewhile from functools import wraps from collections import OrderedDict @@ -94,7 +94,6 @@ class DictOfSeries: if columns is not None: for c in columns: if c not in self.columns: - # todo: itype maybe problemetic because Index is BaseIndex/ObjIndex(in future) self._insert(c, pd.Series()) for s in self._data: @@ -124,16 +123,12 @@ class DictOfSeries: @property def columns(self): - return self._data.index.copy() + return self._data.index @columns.setter def columns(self, newindex): self._data.index = newindex - @property - def values(self): - return np.array([s.values for s in self._data]) - @property def itype(self): return self._itype @@ -170,57 +165,72 @@ class DictOfSeries: Notes: - [1] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised. """ - if is_hashable(key) and key is not None: - if key not in self.columns: - # bug-fix(?): integer defaults to iloc ?? - # `s = series([5,6], index=['a','b'])` - # s[1] returns 6 instead of raising a KeyError - raise KeyError(f"'{key}'") - new = self._data.loc[key] + if isinstance(key, tuple): + raise KeyError("tuples are not allowed") + elif is_hashable(key): + new = self._data.at[key] + elif is_dios_like(key): + # work on rows and columns + new = self._getitem_bool_dios(key) + elif isinstance(key, slice): + # work on rows + new = self._slice(key) else: - keys, ixs, ixalign = self._unpack_key(key) + # work on columns new = self.copy_empty() - for i, k in enumerate(keys): - new[k] = self._get_item(k, ixs[i], ixalign) + new._data = self._data.loc[key] return new - def _get_item(self, key, ix, ixalign): - ser = self._data.loc[key] - if ixalign: - ix = ser.index.intersection(ix.index) - return ser.loc[ix] + def _slice(self, key): + """slices self, return copy""" + if key == slice(None): + return self.copy() + + new = self.copy_empty() + for k in self.columns: + new._data.at[k] = self._data.at[k].loc[key] + + return new + + def _getitem_bool_dios(self, key): + # align columns + keys = self.columns.intersection(key.columns).to_list() + for k in keys: + if not is_bool_indexer(key[k]): + raise ValueError("Must pass DictOfSeries with boolean values only") + + new = self.copy_empty() + for k in keys: + ser = self._data.at[key] + boolser = key[k] + # align rows + idx = boolser[boolser].index.intersection(ser.index) + new._data.at[k] = ser.loc[idx] + return new def __setitem__(self, key, value): - """ dios[x] = y + """ dios[key] = value """ - Examples: - - ``dios['a'] = pd.Series()`` -> Set a new pd.Series to an existing column or add it as `new column`. - - ``dios['a'] = Any`` -> Pass ``any`` to the pd.Series in the corresponding existing(!) column. [1],[2] - - ``dios[iterable] = Any`` -> Pass ``any`` to the pd.Series's in the corresponding set of columns. [1],[2],[3] - - ``dios[slice] = Any`` -> Pass ``any`` to all(!) sliced pd.Series's from the current dios. [1],[2] + if isinstance(key, tuple): + raise KeyError(f"{key}. tuples are not allowed") - Notes: - - [1] The length of ``Any`` must be be equal to the length of every sliced pd.Series. - - [2] If ``Any`` is a ``DictOfSeries`` the behavior depends on the option ``dios_to_dios_method`` - in the ``options`` dictionary. - - [3] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised. - """ - ixalign = False + elif is_hashable(key) and key not in self.columns: + self._insert(key, value) + return - if is_hashable(key) and key is not None: - if key not in self.columns: - self._insert(key, value) - return - else: - keys, ixs = [key], [slice(None)] - else: - keys, ixs, ixalign = self._unpack_key(key) + data = self.__getitem__(key) - assert len(keys) == len(ixs) + if isinstance(data, pd.Series): + data.loc[:] = value + self._data.at[key] = data - gen = self._unpack_value(keys, ixs, value) - for tup in gen: - self._set_item(*tup, ixalign) + elif isinstance(data, self.__class__): + for k in data.columns: + s = data._data.at[k] + s.loc[:] = value + self._data.at[k] = s + else: + raise AssertionError(f"getitem returned data of type {type(data)}") def _set_item(self, key, ix, right, ixalign=False): """Set a value (scalar or list or series)""" @@ -237,8 +247,11 @@ class DictOfSeries: def _insert(self, col, val): """Insert a fresh new value into self""" if is_dios_like(val): + if len(val) > 1: + raise ValueError(f"Cannot insert DictOfSeries " + f"with more than one series") val = val.squeeze() - elif is_list_like(val) and not is_nested_list_like(val): + else: val = pd.Series(val) if not isinstance(val, pd.Series): @@ -252,7 +265,7 @@ class DictOfSeries: # prepare value val = list(val) if is_iterator(val) else val - val = val.squeeze() if is_dios_like(val) else val + val = val.squeeze(axis=1) if is_dios_like(val) else val dioslike, nlistlike = is_dios_like(val), is_nested_list_like(val) # check value @@ -276,51 +289,32 @@ class DictOfSeries: yield key, ix, val def _unpack_key(self, key): - """ Determine keys and indexer by type of key. This does not deal - with single (hashable) label-access, only higher dimension objects - are handled.. - """ - ixalign = False key = list(key) if is_iterator(key) else key - # bool indexer - # ------------ - # bool indexer always work on rows, so they need to have - # an index, to which we can align to. This is necessary - # because we could hold series of different lenght/indexes. - if is_bool_indexer(key): - if not isinstance(key, pd.Series): - raise ValueError("Must pass Series with boolean values only") + # on rows + if isinstance(key, slice): keys = self.columns - indexer, ixalign = [key.loc[key]] * len(keys), True + slice_ = key - elif is_dios_like(key): - keys = self.columns.intersection(key.columns).to_list() - for k in keys: - if not is_bool_indexer(key[k]): - raise ValueError("Must pass DictOfSeries with boolean values only") - indexer, ixalign = [key[k].loc[key[k]] for k in keys], True - # slice - # ----- - # slices always work rows too, but never fail and - # doesnt need alignment - elif isinstance(key, slice): - keys = self.columns - indexer = [key] * len(keys) + # on columns + elif is_bool_indexer(key): + if len(key) != len(self.columns): + raise ValueError("bool list must have same length than columns") + keys = list(compress(self.columns, key)) + slice_ = slice(None) - # list-like - # --------- - # list like stuff, just selects columns + # on columns elif is_list_like(key) and not is_nested_list_like(key): key = key.values if isinstance(key, pd.Series) else key keys = [k for k in key if k in self.columns] - fail = [k for k in key if k not in self.columns] - if fail: - raise KeyError(f"{fail} not in index") - indexer = [slice(None)] * len(keys) + slice_ = slice(None) + if len(keys) != len(key): + fail = [k for k in key if k not in self.columns] + if fail: + raise KeyError(f"{fail} not in index") else: - raise KeyError(f"{type(key)}") - return keys, indexer, ixalign + raise KeyError(key) + return keys, slice_ @property def loc(self): @@ -332,6 +326,11 @@ class DictOfSeries: from dios.locator import _iLocIndexer return _iLocIndexer(self) + @property + def align(self): + from dios.locator import _aLocIndexer + return _aLocIndexer(self) + def __str__(self): return self.__repr__() diff --git a/dios/itypes.py b/dios/itypes.py index bfbda70ef240e26c384c3ed8f6fd8bd841fc11f0..3005c2d8da6c570d597a1a7b90e8d8db02775224 100644 --- a/dios/itypes.py +++ b/dios/itypes.py @@ -23,6 +23,7 @@ class DatetimeItype(__Itype): repr = 'DatetimeItype' unique = True subtypes = (pd.DatetimeIndex,) + min_pditype = pd.DatetimeIndex([]) class IntegerItype(__Itype): @@ -30,6 +31,7 @@ class IntegerItype(__Itype): repr = 'IntegerItype' unique = True subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int) + min_pditype = pd.Int64Index([]) class FloatItype(__Itype): @@ -37,6 +39,7 @@ class FloatItype(__Itype): repr = 'FloatItype' subtypes = (pd.Float64Index, float) unique = True + min_pditype = pd.Float64Index([]) # class MultiItype(__Itype): @@ -52,6 +55,7 @@ class NumericItype(__Itype): _subitypes = (IntegerItype, FloatItype) subtypes = (_subitypes + IntegerItype.subtypes + FloatItype.subtypes) unique = False + min_pditype = pd.Float64Index([]) class MixedItype(__Itype): @@ -61,6 +65,7 @@ class MixedItype(__Itype): _subitypes = (DatetimeItype, IntegerItype, FloatItype, NumericItype) _otheritypes = (pd.CategoricalIndex, pd.IntervalIndex, pd.PeriodIndex, pd.TimedeltaIndex, pd.Index) subtypes = (_subitypes + _otheritypes + DatetimeItype.subtypes + NumericItype.subtypes) + min_pditype = pd.Index([]) def is_itype(obj, itype): @@ -151,8 +156,13 @@ def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False): raise ValueError(f"err={err}") if not inplace: series = series.copy() + itype = get_itype(itype) series.itype = get_itype(series.index) + if series.empty: + return pd.Series(index=itype.min_pditype) + + # up-cast issn't necessary because a dios with a higher # itype always can take lower itypes. # series can have dt/int/float/mixed diff --git a/dios/locator.py b/dios/locator.py index cb2f8b4e54fe4ba0762c56ed7f35545a6ba185ee..3f3d32e9d11d84c20622021cad696c26b6474160 100644 --- a/dios/locator.py +++ b/dios/locator.py @@ -6,97 +6,107 @@ class _Indexer: def __init__(self, _dios): self._dios = _dios self._data = _dios._data - self._columns = _dios.columns - def __setitem__(self, key, val): - keys, rkey, lowdim = self._unpack_key(key) - ix, ixalign = self._unpack_rowkey(rkey) - gen = _unpack_value(keys, ix, val) - for tup in gen: - self._set_item(*tup, ixalign=ixalign) +class _LocIndexer(_Indexer): - def __getitem__(self, key): - keys, rkey, lowdim = self._unpack_key(key) - ix, ixalign = self._unpack_rowkey(rkey) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) - if is_hashable(ix): - new = pd.Series() - new.name = ix + def __getitem__(self, key): + if isinstance(key, tuple): + if len(key) > 2: + raise KeyError("To many indexers") + rowkey, colkey = key else: - new = self._dios.copy_empty() + rowkey, colkey = key, slice(None) + + if isinstance(rowkey, tuple): + raise KeyError(f"{key}. tuples are not allowed.") + + data = self._dios[colkey] - # set series in new dios OR set values in - # new series if ix is hashable (see above) - for k in keys: - new[k] = self._get_item(k, ix, ixalign=ixalign) - maby_set_series_name(new[k], k) + # .loc[:, any] - simple low-cost optimization + if isinstance(rowkey, slice) and rowkey == slice(None): + new = data.copy() - # squeeze to series if a single label was given - # OR squeeze to val if additional ix is hashable - if lowdim: - new = new.squeeze() + # .loc[any, scalar] + elif isinstance(data, pd.Series): + new = data.loc[rowkey] + + elif isinstance(data, self._dios.__class__): + # .loc[scalar, any] + if is_hashable(rowkey): + new = pd.Series(index=type(data.columns)([])) + for k in data.columns: + s = data._data.at[k] + new.at[k] = s.loc[rowkey] + + # .loc[non-scalar, non-scalar] + else: + new = self._dios.copy_empty() + for k in data.columns: + s = data._data.at[k] + new._data.at[k] = s.loc[rowkey] + else: + raise AssertionError(f"getitem returned data of type {type(data)}") return new - def _unpack_rowkey(self, rkey): - align = False - if is_dios_like(rkey) or is_nested_list_like(rkey): - raise ValueError("Cannot index with multidimensional key") - if is_bool_indexer(rkey): - if not isinstance(rkey, pd.Series): - raise ValueError("Must pass Series with boolean values only") - rkey, align = rkey[rkey], True # kill `False` - return rkey, align - - @abstractmethod - def _unpack_key(self, key): - ... + def __setitem__(self, key, value): + if isinstance(key, tuple): + if len(key) > 2: + raise KeyError("To many indexers") - @abstractmethod - def _get_item(self, key, ix, ixalign=False): - ... + rowkey, colkey = key - @abstractmethod - def _set_item(self, ser, ix, val, ixalign=False): - ... + if isinstance(rowkey, tuple): + raise KeyError(f"{key}. tuples are not allowed.") + # .loc[-,new-scalar] = val + # if a unknown colkey was given, we insert it and ignore rowkey + if is_hashable(colkey) and colkey not in self._dios.columns: + self._dios._insert(colkey, value) + return + else: + rowkey, colkey = key, slice(None) -class _LocIndexer(_Indexer): + # get .loc[any,any] - we use key(!) here + data = self.__getitem__(key) - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # we can use set item here, as this - # also uses .loc for setting values + if is_dios_like(value) or is_nested_list_like(value): + raise TypeError(".loc[] cannot be used to set multi-dimensional values, use .aloc[] instead.") - def _set_item(self, *args, **kwargs): - # we can use DictionaryOfSeries._set_item() - # here because it also uses .loc - self._dios._set_item(*args, **kwargs) + # .loc[scalar, any] or .loc[any, scalar] + if isinstance(data, pd.Series): + # .loc[scalar, non-scalar] - column-labeled series + if is_hashable(rowkey): + data.loc[:] = value + for k in data.index: + s = data._data.at[k] + s.at[rowkey] = data.at[k] + self._data.at[k] = s - def _get_item(self, key, ix, ixalign=False): - ser = self._data.loc[key] - if ixalign: - ix = ser.index.intersection(ix.index) - return ser.loc[ix] + # .loc[non-scalar, scalar] - (normal) row-labeled series + elif is_hashable(colkey): + data.loc[rowkey] = value + self._data.at[colkey] = data - def _unpack_key(self, key): - lowdim = False - if isinstance(key, tuple): - key, ckey, *fail = key - if fail: - raise KeyError("To many indexers") - if is_dios_like(ckey): - raise ValueError("Cannot index with multidimensional key") - if is_bool_series(ckey): - keys = ckey.where(ckey).dropna().index.to_list() else: - if is_hashable(ckey): - ckey, lowdim = [ckey], True - keys = self._data.loc[ckey].index.to_list() + raise AssertionError(f"getitem returned data of type {type(data)}") + + # .loc[non-scalar, non-scalar] + elif isinstance(data, self._dios.__class__): + for k in data.columns: + s = data._data.at[k] + s.loc[rowkey] = value + self._data.at[k] = s + + # .loc[scalar, scalar] else: - keys = self._columns.to_list() - return keys, key, lowdim + s = self._data.at[colkey] + s.at[rowkey] = value + self._data.at[colkey] = s class _iLocIndexer(_Indexer): @@ -130,7 +140,7 @@ class _iLocIndexer(_Indexer): if fail: raise KeyError("To many indexers") if is_dios_like(ckey): - raise ValueError("Cannot index with multidimensional key") + raise ValueError("Cannot index with multi-dimensional key") if is_bool_series(ckey): keys = ckey.where(ckey).dropna().index.to_list() else: @@ -142,6 +152,20 @@ class _iLocIndexer(_Indexer): return keys, key, lowdim +class _aLocIndexer(_Indexer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __setitem__(self, key, value): + # fallback to loc + if not is_dios_like(value) and not is_nested_list_like(value): + self._dios.loc[key] = value + + def __getitem__(self, item): + return item + + def _unpack_value(keys, ix, val): """Return a generator that yield (column key, corresponding value, value-align(bool) ) for all columns.