From d9b288ab7f2c35c9a30a9924c12973bf2538f3a5 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Wed, 18 Mar 2020 18:25:14 +0100 Subject: [PATCH] texttexttex --- Readme.md | 42 ++++++++++++++++------------- dios/dios.py | 64 ++++++++++++++++++++++++++++---------------- test/test_methods.py | 6 ++--- 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/Readme.md b/Readme.md index 0f99304..7bf6344 100644 --- a/Readme.md +++ b/Readme.md @@ -49,6 +49,17 @@ are scalars the stored element itself is returned. In all other cases a dios is For more pandas-like indexing magic and the differences between the indexers, see the pandas documentation. +**multi-dimensional indexer** + +`dios[boolean dios-like]` (as single key) - dios accepts a boolean multi-indexer (boolean pd.DataFrame +or boolean Dios). Columns and rows from the multi-indexer align with the dios. +This means that only matching columns are selected/written; the same applies to rows. +Rows or whole columns that are missing in the indexer, but are present in the Dios, are dropped, +but empty columns are preserved, with the effect that the resulting Dios always has the same +column dimension as the initial Dios. +This behavior is similar to pd.DataFrame's handling of a multi-indexer, except that pd.DataFrame +fills np.nan at missing locations and columns. + **setting values** Setting values with `di[]` and `.loc[]`, `.iloc[]` and `.at[]`, `.iat[]` work like in pandas. @@ -57,40 +68,35 @@ values can be: - *scalars*: these are broadcast to the selected positions - *nested lists*: the outer list must match selected columns length, the inner lists lengths must match selected rows. - *normal lists* : columns key must be a scalar(!), the list is passed down, and set to the underlying series. -- *pd.Series*: columns key must be a scalar(!), the series is passed down, and set to the underlying series, -where it is aligned. 
+- *pd.Series*: columns key must be a scalar(!), the series is passed down, and set to the underlying series +in the dios, where both are aligned. Examples: - `dios.loc[2:5, 'a'] = [1,2,3]` is the same as `a=dios['a']; a.loc[2:5]=[1,2,3]` - `dios.loc[2:5, :] = 99` : set 99 on rows 2 to 5 on all columns -**multi-dimensional indexing** - -`dios[BoolDiosLike]` - dios accept boolean multi-indexer (boolean pd.Dataframe -or boolean Dios). Columns and rows from the multi-indexer align with the dios. -This means that only matching columns are selected/written, the same apply for rows. -Rows or whole columns that are missing in the indexer, but are present in the Dios are dropped, -but empty columns are preserved, with the effect that the resulting Dios always have the same -column dimension than the initial Dios. -This is a similar behavior to pd.DataFrame handling of multi-indexer, despite that pd.DataFrame -fill np.nans at missing locations and columns. - **special indexer `.aloc`** Additional to the pandas like indexers we have a `.aloc[..]` (align locator) indexing method. Unlike `.iloc` and `.loc` indexers and/or values fully align if possible and 1D-array-likes can be broadcast to multiple columns at once. Also this method handle missing indexer-items gratefully. +It is used like `.loc`, so a single row-indexer (`.aloc[row-indexer]`) or a tuple of row-indexer and +column-indexer (`.aloc[row-indexer, column-indexer]`) can be given. 
*Alignable indexer* are: - `.aloc[pd.Series]` : only common indices are used in each column -- `.aloc[bool-dios]` (as single key) : only matching columns and matching indices are used +- `.aloc[boolean dios-like]` (as single key) : works the same as `di[boolean dios-like]` (see above) + +``` +only matching columns and matching indices are used if the value is `True` (Values that are `False` are dropped and handled as they would be missing) -In contrast to `di[BoolDiosLike]` (see above), missing rows are **not** filled with nan's, instead -they are dropped on selection operations and ignored on setting operations. Nevertheless empty columns -are still preserved. -- `.aloc[dios, ...]` (dios-like, **Ellipsis**) : "`...`" is not a placeholder, it refer to the ellipsis object. +In contrast to *normal* indexing, with `di[boolean dios-like]` (see above), missing rows are **not** +filled with NaNs; instead they are dropped on selection operations and ignored on setting operations. +Nevertheless empty columns are still preserved. +- `.aloc[dios-like, ...]` (dios-like, **Ellipsis**) : "`...`" is not a placeholder, it refers to the ellipsis object. Full align -> use only matching columns and indices. Alternatively, `.aloc(booldios=False)[dios]` can be used. +``` *Indexer* that are handled grateful: - `.aloc[list]` (lists or any iterable obj) : only present labels/positions are used diff --git a/dios/dios.py b/dios/dios.py index f999db6..3cdf86a 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -263,10 +263,16 @@ class DictOfSeries: key = list(key) if _is_iterator(key) else key if isinstance(key, tuple): raise KeyError("tuples are not allowed") - elif _is_hashable(key): + + if _is_hashable(key): + # NOTE: we use copy here to prevent index + # changes, that could result in an invalid + # itype. A shallow copy is not sufficient. 
+ # work on columns, return series - return self._data.at[key] - elif _is_dios_like(key): + return self._data.at[key].copy() + + if _is_dios_like(key): # work on rows and columns new = self._getitem_bool_dios(key) elif isinstance(key, slice): @@ -279,6 +285,7 @@ class DictOfSeries: # work on columns data = self._data.loc[key] new = DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True) + return new def _slice(self, key): @@ -292,27 +299,18 @@ class DictOfSeries: return new def _getitem_bool_dios(self, key): - """ Select items by a boolean dios-like drop un-selected indices. - - todo: Desired behaivior: fill nan's at un-selected indices. but with this - we cannot set values properly (in the current implementation), because - we use __getitem__ in __setitem__ and cannot decide where the nans come from - i) from data or ii) from prior indexing :/""" + """ Select items by a boolean dios-like drop un-selected indices. """ new = self.copy_empty(columns=True) - for k in self.columns: + for k in self.columns.intersection(key.columns): dat = self._data.at[k] - - if k in key.columns: - val = key[k] - if not _is_bool_indexer(val): - raise ValueError("Must pass DictOfSeries with boolean values only") - # align rows - idx = val[val].index.intersection(dat.index) - new._data.at[k] = dat[idx] - else: - new._insert(k, pd.Series(dtype='O')) + val = key[k] + if not _is_bool_indexer(val): + raise ValueError("Must pass DictOfSeries with boolean values only") + # align rows + idx = val[val].index.intersection(dat.index) + new._data.at[k] = dat[idx] return new @@ -341,10 +339,15 @@ class DictOfSeries: assert isinstance(data, self.__class__), f"getitem returned data of type {type(data)}" # special cases + # (I) Dios (ok) + # (II) list-like (fail) + # (III) nested lists-like (ok) + # NOTE: pd.Series considered list-like and also fail + # if they dont hold list-likes obj if _is_dios_like(value): self._setitem_dios(data, value) elif _is_list_like(value): - 
self._setitem_listlike(data, value) + self._setitem_nested_listlike(data, value) # default case else: @@ -353,7 +356,9 @@ class DictOfSeries: s[:] = value self._data.at[k][s.index] = s - def _setitem_listlike(self, data, value): + def _setitem_nested_listlike(self, data, value): + # nested series, eg. `pd.Series([[1,2], [4,4]], dtype='O')` + value = value.values if isinstance(value, pd.Series) else value if not _is_nested_list_like(value): raise ValueError(f"1D array-like value could not be broadcast to " f"indexing result of shape (.., {len(data.columns)})") @@ -371,10 +376,20 @@ class DictOfSeries: def _setitem_dios(self, data, value): """ Write values from a dios-like to self. - No justification or alignment on columns, but on indices. + No justification or alignment of columns, but of indices. If value has missing indices, nan's are inserted at that locations, just like `series.loc[:]=val` or `df[:]=val` do. + Eg. + di1[::2] = di[::3] -> di[::2] + + x | x | x | + ===== | ==== | ====== | + 0 x | 0 z | 0 z | + 2 x | = 3 z | -> 2 NaN | + 4 x | 6 z | 4 NaN | + 6 x | 6 z | + Parameter ---------- data : dios @@ -391,6 +406,9 @@ class DictOfSeries: for i, k in enumerate(data): dat = data._data.at[k] + # .loc cannot handle empty series + if dat.empty: + continue val = value[value.columns[i]] dat.loc[:] = val self._data.at[k].loc[dat.index] = dat diff --git a/test/test_methods.py b/test/test_methods.py index 0cb25a6..840909c 100644 --- a/test/test_methods.py +++ b/test/test_methods.py @@ -25,9 +25,9 @@ def test_copy_copy_empty(dios_aligned): assert not di.columns.equals(empty_no_cols.columns) for i in di: - assert di[i].index is shallow[i].index - assert di[i].index is not deep[i].index - di[i][0] = 999999 + assert di._data[i].index is shallow._data[i].index + assert di._data[i].index is not deep._data[i].index + di._data[i][0] = 999999 assert di[i][0] == shallow[i][0] assert di[i][0] != deep[i][0] -- GitLab