Patch restored to one diff line per source line (the input had its newlines
collapsed). Blank context lines and Python indentation were inferred from the
hunk line counts and from syntax; verify against the repository before applying.

Reviewer amendments, confined to added ('+') lines in dios/indexer.py:
  * keys_from_nested_list: iterate with enumerate(colkey); iterating the
    column index directly yields labels only, not (position, label) pairs.
  * keys_from_nested_list: fix the typo in the ValueError message.

diff --git a/dios/dios.py b/dios/dios.py
index 3cdf86a3369798aaa5e6845de49eaad6ef455d6f..46a0f17760e9d96970bbd5a72031ea1811fabc2a 100644
--- a/dios/dios.py
+++ b/dios/dios.py
@@ -33,34 +33,7 @@ Unlike the example says, return lists False, not True
 
 """
 from pandas.core.dtypes.common import is_iterator as _is_iterator
-
-def _is_list_like_not_nested(obj):
-    return _is_list_like(obj) and not _is_nested_list_like(obj)
-
-
-def _is_dios_like(obj):
-    # must have columns
-    # columns is some kind of pd.Index
-    # iter will iter through columns
-    # a `in` obj check if obj is in columns
-    # obj[key] will give a pd.Series
-    # obj.squeeze() give pd.Series if len(obj) == 1
-    return isinstance(obj, DictOfSeries) or isinstance(obj, pd.DataFrame)
-
-
-def _is_bool_series(obj):
-    return isinstance(obj, pd.Series) and obj.dtype == bool
-
-
-def __monkey_patch_pandas():
-    def to_dios(self):
-        return DictOfSeries(data=self)
-
-    pd.Series.to_dios = to_dios
-    pd.DataFrame.to_dios = to_dios
-
-
-__monkey_patch_pandas()
+from typing import Union, Any
 
 
 class DictOfSeries:
@@ -153,7 +126,10 @@ class DictOfSeries:
                 if columns is None or k in self.columns:
                     self._insert(k, data[k])
 
-        elif _is_list_like(data):  # also Series !
+        elif isinstance(data, pd.Series):
+            self._insert(data.name or 0, data)
+
+        elif _is_list_like(data):
             data = data if _is_nested_list_like(data) else [data]
 
             if self.columns.empty:
@@ -301,17 +277,16 @@ class DictOfSeries:
 
     def _getitem_bool_dios(self, key):
         """ Select items by a boolean dios-like drop un-selected indices. """
-        new = self.copy_empty(columns=True)
+        if not _is_bool_dios_like(key):
+            raise ValueError("Must pass DictOfSeries with boolean values only")
 
+        new = self.copy_empty(columns=True)
         for k in self.columns.intersection(key.columns):
             dat = self._data.at[k]
             val = key[k]
-            if not _is_bool_indexer(val):
-                raise ValueError("Must pass DictOfSeries with boolean values only")
             # align rows
             idx = val[val].index.intersection(dat.index)
             new._data.at[k] = dat[idx]
-
         return new
 
     def _getitem_bool_listlike(self, key):
@@ -874,3 +849,48 @@ def _to_aligned_df(dios, no_value=' '):
         df.loc[nandict[c], c] = np.nan
 
     return df
+
+
+def _is_list_like_not_nested(obj):
+    return _is_list_like(obj) and not _is_nested_list_like(obj)
+
+
+def _is_dios_like(obj) -> bool:
+    # must have columns
+    # columns is some kind of pd.Index
+    # iter will iter through columns
+    # a `in` obj check if obj is in columns
+    # obj[key] will give a pd.Series
+    # obj.squeeze() give pd.Series if len(obj) == 1
+    return isinstance(obj, DictOfSeries) or isinstance(obj, pd.DataFrame)
+
+
+def _is_bool_series(obj) -> bool:
+    return isinstance(obj, pd.Series) and obj.dtype == bool
+
+
+def _is_bool_dios_like(obj) -> bool:
+    if not _is_dios_like(obj):
+        return False
+    dtypes = obj.dtypes
+    if (dtypes == bool).all():
+        return True
+    if (dtypes == 'O').any():
+        return obj.apply(_is_bool_indexer).all()
+    return False
+
+
+def to_dios(obj) -> DictOfSeries:
+    return DictOfSeries(data=obj)
+
+
+def __monkey_patch_pandas():
+    def to_dios(self):
+        return DictOfSeries(data=self)
+
+    pd.Series.to_dios = to_dios
+    pd.DataFrame.to_dios = to_dios
+
+
+__monkey_patch_pandas()
+
diff --git a/dios/indexer.py b/dios/indexer.py
index 716aa1e3589d5c8ba6a07e65b692ba06cead980b..28f8ef5a20df381f4d05253b36987394b9b40d92 100644
--- a/dios/indexer.py
+++ b/dios/indexer.py
@@ -3,11 +3,13 @@ from .dios import (
     _is_dios_like,
     _is_bool_series,
     _is_list_like_not_nested,
+    _is_bool_dios_like,
     _is_iterator)
 import pandas as pd
 import pandas.core.common as ccom
 import pandas.core.dtypes.common as dcom
 
+
 _is_list_like = dcom.is_list_like
 _is_nested_list_like = dcom.is_nested_list_like
 _is_scalar = dcom.is_scalar
@@ -19,7 +21,7 @@ _is_bool_indexer = ccom.is_bool_indexer
 
 
 class _Indexer:
-    def __init__(self, obj):
+    def __init__(self, obj: DictOfSeries):
         self.obj = obj
         self._data = obj._data
 
@@ -132,7 +134,7 @@ class _LocIndexer(_Indexer):
 
         except Exception as e:
             c = data.index[i] if i is not None else '?'
-            raise type(e)(f"failed for column {c}: " + str(e) ) from e
+            raise type(e)(f"failed for column {c}: " + str(e)) from e
 
 
 # #############################################################################
@@ -259,96 +261,121 @@ class _aLocIndexer(_Indexer):
     def __setitem__(self, key, value):
         rowkeys, colkeys, _ = self._unpack_key_aloc(key)
 
-        c = '?'
-        try:
-            # full-alignable: dios/df, align rows and columns of value to ourself
-            # NOTE: this may shrink columns a third time (1st & 2nd in unpack_key_aloc)
-            if _is_dios_like(value):
-                colkeys = value.columns.intersection(colkeys)
-                for i, c in enumerate(colkeys):
-                    l = self._data.at[c]
-                    r = value[c]
-                    idx = l.loc[rowkeys[i]].index.intersection(r.index)
-                    l[idx] = r[idx]
-
-            # row-alignable: given series, align rows of value to every
-            # (colkeys selected) series in ourself
-            elif isinstance(value, pd.Series):
-                r, rindex = value, value.index
-                for i, c in enumerate(colkeys):
-                    l = self._data.at[c]
-                    idx = l.loc[rowkeys[i]].index.intersection(rindex)
-                    l[idx] = r[idx]
-
-            elif _is_nested_list_like(value):
-                # todo: iterate + enumerate, check length, set
-                raise NotImplementedError
-
-            elif _is_list_like(value):
-                # todo: iterate columns, check length, set
-                raise NotImplementedError
+        def iter_self(colkeys, pos=True):
+            c = '?'
+            try:
 
-            # if no align is possible, fallback to .loc
-            else:
                 for i, c in enumerate(colkeys):
-                    self._data.at[c].loc[rowkeys[i]] = value
+                    dat = self._data.at[c]
+                    rk = rowkeys[i]
+                    if len(dat.loc[rk]) == 0:
+                        continue
+                    yield dat, rk, i if pos else c
 
-        except Exception as e:
-            raise type(e)(f"failed for column {c}: " + str(e)) from e
+            except Exception as e:
+                raise type(e)(f"failed for column {c}: " + str(e)) from e
+
+        # align columns, for rows use series.loc to align
+        if _is_dios_like(value):
+            colkeys = value.columns.intersection(colkeys)
+            for dat, rk, c in iter_self(colkeys, pos=False):
+                dat.loc[rk] = value[c]
+
+        # align rows by using series.loc
+        elif isinstance(value, pd.Series):
+            for dat, rk, _ in iter_self(colkeys):
+                dat.loc[rk] = value
+
+        # no align, no merci
+        elif _is_nested_list_like(value):
+            if len(colkeys) != len(value):
+                raise ValueError(f"shape mismatch: values array of shape "
+                                 f"(.., {len(value)}) could not "
+                                 f"be broadcast to indexing result of "
+                                 f"shape (.., {len(colkeys)})")
+            for dat, rk, i in iter_self(colkeys):
+                dat.loc[rk] = value[i]
+
+        # no align, no merci
+        else:
+            for dat, rk, _ in iter_self(colkeys):
+                dat.loc[rk] = value
 
     def _unpack_key_aloc(self, key):
         """
         Return a list of row indexer and a list of existing(!) column labels.
         Both list always have the same length and also could be empty together.
-        """
 
-        # if a single column-key is given, we will
-        # return a single Series, instead of a dios
+        Note:
+            The items of the row indexer list should be passed to pd.Series.loc[]
+        """
+        # if a single column-key is given, the caller may
+        # want to return a single Series, instead of a dios
         lowdim = False
 
-        # multi-dim (var I) depend on the set method
-        if _is_dios_like(key):
+        def keys_from_bool_dios_like(key):
+            if not _is_bool_dios_like(key):
+                raise ValueError("Must pass dios-like key with boolean "
+                                 "values only if passed as single indexer")
+            colkey = self.obj.columns.intersection(key.columns)
+            rowkey = []
+            for c in colkey:
+                b = key[c]
+                rowkey += [self._data.at[c].index.intersection(b[b].index)]
+            return rowkey, colkey, lowdim
 
-            # bool dios / df
-            if self._use_bool_dios:
-                # todo: use a _is_bool_dioslike() helper function,
-                #  that check for dtype==bool for each series or
-                #  dtype of pd.Dataframe
-
-                colkey = self.obj.columns.intersection(key.columns)
-                rowkey = []
-                for c in colkey:
-                    b = key[c]
-                    if not _is_bool_indexer(b):
-                        raise ValueError("Must pass dios-like key with boolean "
-                                         "values only if passed as single indexer")
-                    rowkey += [self._data.at[c].index.intersection(b[b].index)]
-
-            # align any dios-like
-            else:
-                colkey = self.obj.columns.intersection(key.columns)
-                rowkey = [self._data.at[c].index.intersection(key[c].index) for c in colkey]
+        def keys_from_dios_like(key):
+            colkey = self.obj.columns.intersection(key.columns)
+            rowkey = [self._data.at[c].index.intersection(key[c].index) for c in colkey]
+            return rowkey, colkey, lowdim
 
+        def keys_from_nested_list(key):
+            key = key.values if isinstance(key, pd.Series) else key
+            if len(key) != len(self.obj.columns):
+                raise ValueError("nested arrays outer length must have same length as columns.")
+            colkey = self.obj.columns
+            rowkey = []
+            for i, k in enumerate(colkey):
+                rowkey.append(self._data.at[k].index.intersection(key[i]))
             return rowkey, colkey, lowdim
 
-        rowkey, colkey = self._unpack_key(key)
+        # handle multi-dim keys
+        if isinstance(key, tuple):
+            rowkey, colkey = self._unpack_key(key)
+            # .aloc[any, ...]
+            # The ellipsis is meant for dios only to indicate
+            # that alignment of dios is requested, instead of
+            # using (and checking) it as boolean dios
+            if colkey is Ellipsis:
+                if _is_dios_like(rowkey):
+                    return keys_from_dios_like(rowkey)
+                if _is_nested_list_like(rowkey):
+                    return keys_from_nested_list(rowkey)
+                colkey = slice(None)
 
-        # multi-dim (var II)
-        if colkey is Ellipsis:
-            if _is_dios_like(rowkey):
-                colkey = self.obj.columns.intersection(rowkey.columns)
-                rowkey = [self._data.at[c].index.intersection(rowkey[c].index) for c in colkey]
-                return rowkey, colkey, lowdim
+        # (I)   .aloc[dios] -> defaults to (III)
+        # (II)  .aloc(booldios=False)[dios] or
+        # (III) .aloc(booldios=True)[dios]
+        elif _is_dios_like(key):
+            if self._use_bool_dios:
+                return keys_from_bool_dios_like(key)
             else:
-                colkey = slice(None)
+                return keys_from_dios_like(key)
+
+        elif _is_nested_list_like(key):
+            return keys_from_nested_list(key)
+
+        # a single row indexer (not multi-dim)
+        # or just some random crap was given
+        else:
+            rowkey, colkey = self._unpack_key(key)
 
-        # if we come here no more multi-dim keys are allowed
-        elif _is_dios_like(rowkey):
-            raise ValueError("Could not index with multi-dimensional "
-                             "row key, if column key is not Ellipsis.")
-        elif _is_dios_like(colkey):
-            raise ValueError("Could not index with multi-dimensional "
-                             "column key.")
+        # all multi-dim indexer was already handled
+        if _is_dios_like(rowkey) or _is_nested_list_like(rowkey):
+            raise ValueError("Could not index with multi-dimensional row key"
+                             ", if column key is given and is not Ellipsis.")
+        elif _is_dios_like(colkey) or _is_nested_list_like(colkey):
+            raise ValueError("Could not index with multi-dimensional column key.")
 
         # handle gratefully: scalar
         if _is_hashable(colkey):
@@ -356,12 +383,11 @@ class _aLocIndexer(_Indexer):
             lowdim = True
 
         # column-alignable: list-like, filter only existing columns
-        elif _is_list_like_not_nested(colkey) and not _is_bool_indexer(colkey):
+        elif _is_list_like(colkey) and not _is_bool_indexer(colkey):
             colkey = colkey.values if isinstance(colkey, pd.Series) else colkey
             colkey = self.obj.columns.intersection(colkey)
 
-        # not alignable
-        # fall back to .loc (boolean list/series, slice(..), ...
+        # not alignable, fall back to .loc (boolean list/series, slice(..), etc.
         else:
             colkey = self._data.loc[colkey].index
 
@@ -381,11 +407,12 @@ class _aLocIndexer(_Indexer):
 
         # handle gratefully: list-like, filter only existing rows
         # NOTE: dios.aloc[series.index] is processed here
-        elif _is_list_like_not_nested(rowkey) and not _is_bool_indexer(rowkey):
+        elif _is_list_like(rowkey) and not _is_bool_indexer(rowkey):
             rowkey = [self._data.at[c].index.intersection(rowkey) for c in colkey]
 
         # not alignable
-        # fallback to .loc (processed by caller) - (eg. slice(..), boolean list-like, ...)
+        # the rowkey is processed by .loc someway in
+        # the calling function - (eg. slice(..), boolean list-like, etc.)
         else:
             rowkey = [rowkey] * len(colkey)
 
@@ -439,5 +466,3 @@ class _iAtIndexer(_Indexer):
         if _is_dios_like(value) or _is_nested_list_like(value):
             raise TypeError(".iat[] cannot be used to set multi-dimensional values, use .aloc[] instead.")
         self._data.iat[key[1]].iat[key[0]] = value
-
-