From b8d571f0fb7a12c7fe006052c6d6ad0b9703d44d Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Tue, 21 Apr 2020 15:52:07 +0200 Subject: [PATCH] docs docs docs --- dios/base.py | 39 +++++++++++ dios/dios.py | 157 +++++++++++++++++++++++++++++++++++++++++--- dox/doc_indexing.md | 52 +++++++-------- 3 files changed, 212 insertions(+), 36 deletions(-) diff --git a/dios/base.py b/dios/base.py index f6679cb..6e77bc4 100644 --- a/dios/base.py +++ b/dios/base.py @@ -460,6 +460,45 @@ class _DiosBase: return self._constructor(data=data, fastpath=True, **kws) def copy_empty(self, columns=True): + """ + Return a new DictOfSeries object, with the same properties as the original. + Parameters + ---------- + columns: bool, default True + If ``True``, the copy will have the same columns as the original, but empty. + + Returns + ------- + DictOfSeries: empty copy + + Examples + -------- + + >>> di = DictOfSeries({'A': range(2), 'B': range(3)}) + >>> di + A | B | + ==== | ==== | + 0 0 | 0 0 | + 1 1 | 1 1 | + | 2 2 | + + >>> empty = di.copy_empty() + >>> empty + Empty DictOfSeries + Columns: ['A', 'B'] + + The properties are the same, e.g. + + >>> empty.itype == di.itype + True + >>> empty.cast_policy == di.cast_policy + True + >>> empty.dtypes == di.dtypes + columns + A True + B True + dtype: bool + """ data = None if columns is True: # is correct data = pd.Series(dtype='O', index=self.columns) diff --git a/dios/dios.py b/dios/dios.py index 5742cd2..875b797 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -324,7 +324,69 @@ class DictOfSeries(_DiosBase): See Also -------- DictOfSeries.for_each: apply pd.Series methods or properties to each column - """ + + Examples + -------- + + We use the example DictOfSeries from :ref:`indexing <doc_indexing:Example dios>`. 
+ + >>> di = di[:5] + a | b | c | d | + ===== | ==== | ===== | ===== | + 0 0 | 2 5 | 4 7 | 6 0 | + 1 7 | 3 6 | 5 17 | 7 1 | + 2 14 | 4 7 | 6 27 | 8 2 | + 3 21 | 5 8 | 7 37 | 9 3 | + 4 28 | 6 9 | 8 47 | 10 4 | + + >>> di.apply(max) + columns + a 28 + b 9 + c 47 + d 4 + dtype: int64 + + >>> di.apply(pd.Series.count) + columns + a 5 + b 5 + c 5 + d 5 + dtype: int64 + + One can pass keyword arguments directly.. + + >>> di.apply(pd.Series.value_counts, normalize=True) + a | b | c | d | + ======= | ====== | ======= | ====== | + 7 0.2 | 7 0.2 | 7 0.2 | 4 0.2 | + 14 0.2 | 6 0.2 | 37 0.2 | 3 0.2 | + 21 0.2 | 5 0.2 | 47 0.2 | 2 0.2 | + 28 0.2 | 9 0.2 | 27 0.2 | 1 0.2 | + 0 0.2 | 8 0.2 | 17 0.2 | 0 0.2 | + + Or define an own function.. + + >>> di.apply(lambda s : 'high' if max(s) > 10 else 'low') + columns + a high + b low + c high + d low + dtype: object + + And also more advanced functions that return a list-like can be given. Note that + the returned lists do not necessarily need to have the same length. + + >>> func = lambda s : ('high', max(s), min(s)) if min(s) > (max(s)//2) else ('low',max(s)) + >>> di.apply(func) + a | b | c | d | + ====== | ======= | ====== | ====== | + 0 low | 0 high | 0 low | 0 low | + 1 28 | 1 9 | 1 47 | 1 4 | + | 2 5 | | | + """ if axis in [1, 'columns']: raise NotImplementedError @@ -485,11 +547,46 @@ class DictOfSeries(_DiosBase): return self.for_each(pd.Series.memory_usage, index=index, deep=deep).sum() def to_df(self): + """ + Transform DictOfSeries to a pandas.DataFrame. + + Because a pandas.DataFrame cannot handle Series of different + length, but DictOfSeries can, the missing data is filled with + NaNs. 
+ + Returns + ------- + pandas.DataFrame: transformed data + + Examples + -------- + + Missing data locations are filled with NaNs + + >>> a = pd.Series(11, index=range(2)) + >>> b = pd.Series(22, index=range(3)) + >>> c = pd.Series(33, index=range(1,9,3)) + >>> di = DictOfSeries(dict(a=a, b=b, c=c)) + >>> di + a | b | c | + ===== | ===== | ===== | + 0 11 | 0 22 | 1 33 | + 1 11 | 1 22 | 4 33 | + | 2 22 | 7 33 | + >>> di.to_df() + columns a b c + 0 11.0 22.0 NaN + 1 11.0 22.0 33.0 + 2 NaN 22.0 NaN + 4 NaN NaN 33.0 + 7 NaN NaN 33.0 + """ df_or_ser = self._data.apply(lambda s: s).transpose() return pd.DataFrame() if isinstance(df_or_ser, pd.Series) else df_or_ser @property def debugDf(self): + """ Alias for ``to_df()`` as property, for debugging purposes.""" return self.to_df() def min(self, axis=0, skipna=True): @@ -523,6 +620,29 @@ class DictOfSeries(_DiosBase): return DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True) def all(self, axis=0): + """ + Return whether all elements are True, potentially over an axis. + + Returns True unless there is at least one element within a series + or along a DictOfSeries axis that is False or equivalent (e.g. zero or empty). + + Parameters + ---------- + axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0 + Indicate which axis or axes should be reduced. + * 0 / ‘index’ : reduce the index, return a Series whose index is the original column labels. + * 1 / ‘columns’ : reduce the columns, return a Series whose index is the union of all columns indexes. + * None : reduce all axes, return a scalar. + + Returns + ------- + pandas.Series + + See Also + -------- + pandas.Series.all: Return True if all elements are True. + any: Return True if one (or more) elements are True. 
+ """ if axis in [0, 'index']: return self._data.apply(all) elif axis in [1, 'columns']: @@ -534,6 +654,29 @@ class DictOfSeries(_DiosBase): raise ValueError(axis) def any(self, axis=0): + """ + Return whether any element is True, potentially over an axis. + + Returns False unless there is at least one element within a series + or along a DictOfSeries axis that is True or equivalent (e.g. non-zero or non-empty). + + Parameters + ---------- + axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0 + Indicate which axis or axes should be reduced. + * 0 / ‘index’ : reduce the index, return a Series whose index is the original column labels. + * 1 / ‘columns’ : reduce the columns, return a Series whose index is the union of all columns indexes. + * None : reduce all axes, return a scalar. + + Returns + ------- + pandas.Series + + See Also + -------- + pandas.Series.any: Return whether any element is True. + all: Return True if all elements are True. + """ if axis in [0, 'index']: return self._data.apply(any) elif axis in [1, 'columns']: @@ -584,21 +727,15 @@ class DictOfSeries(_DiosBase): return ~ self.isempty() def isdata(self): - """ Alias for ``DictOfSeries.notna(drop_empty=True)``. """ + """ Alias for ``notna(drop_empty=True)``. """ return self.notna(drop_empty=True) def isnull(self, drop_empty=False): - """ Alias for `isna()` - - See Also - -------- - isna : some foo - - """ + """ Alias for ``isna()`` """ return self.isna(drop_empty=drop_empty) def notnull(self, drop_empty=False): - """ Alias, see ``DictOfSeries.notna``. """ + """ Alias, see ``notna()``. """ return self.notna(drop_empty=drop_empty) # ---------------------------------------------------------------------- diff --git a/dox/doc_indexing.md b/dox/doc_indexing.md index 9a9f830..fc430ed 100644 --- a/dox/doc_indexing.md +++ b/dox/doc_indexing.md @@ -101,15 +101,15 @@ each column separately. 
So maybe a first example gives an rough idea: ``` >>> s = pd.Series([11] * 4 ) ->>> d = DictOfSeries(dict(a=s[:2]*6, b=s[2:4]*7, c=s[:2]*8, d=s[1:3]*9)) ->>> d +>>> di = DictOfSeries(dict(a=s[:2]*6, b=s[2:4]*7, c=s[:2]*8, d=s[1:3]*9)) +>>> di a | b | c | d | ===== | ===== | ===== | ===== | 0 66 | 2 77 | 0 88 | 1 99 | 1 66 | 3 77 | 1 88 | 2 99 | ->>> d.aloc[[1,2], ['a', 'b', 'd', 'x']] +>>> di.aloc[[1,2], ['a', 'b', 'd', 'x']] a | b | d | ===== | ===== | ===== | 1 66 | 2 77 | 1 99 | @@ -201,18 +201,18 @@ Example dios The dios used in the examples, unless stated otherwise: ``` -# generate dict +# generate example DictOfSeries >>> sa = pd.Series(range(0, 70, 7)) >>> sb = pd.Series(range(5, 15, 1)) >>> sc = pd.Series(range(7, 107, 10)) >>> sd = pd.Series(range(0, 10, 1)) >>> for i, s in enumerate([sa,sb,sc,sd]): s.index += i*2 ->>> d = DictOfSeries(dict(a=sa, b=sb, c=sc, d=sd))[:5] +>>> di = DictOfSeries(dict(a=sa, b=sb, c=sc, d=sd))[:5] ``` Looks like so: ``` ->>> d +>>> di a | b | c | d | ===== | ==== | ===== | ===== | 0 0 | 2 5 | 4 7 | 6 0 | @@ -230,7 +230,7 @@ The underling pandas.Series is returned, if the key exist. Otherwise a empty pandas.Series with `dtype=object` is returned. ``` ->>> d.aloc[:, 'a'] +>>> di.aloc[:, 'a'] 0 0 1 7 2 14 @@ -238,7 +238,7 @@ Otherwise a empty pandas.Series with `dtype=object` is returned. 4 28 Name: a, dtype: int64 ->>> d.aloc[:, 'x'] +>>> di.aloc[:, 'x'] Series([], dtype: object) ``` @@ -250,7 +250,7 @@ A dios is returned, with a subset of the existing columns. If no key is present a empty dios is returned. ``` ->>> d.aloc[:, ['c', 99, None, 'a', 'x', 'y']] +>>> di.aloc[:, ['c', 99, None, 'a', 'x', 'y']] a | c | ===== | ===== | 0 0 | 4 7 | @@ -259,7 +259,7 @@ If no key is present a empty dios is returned. 
3 21 | 7 37 | 4 28 | 8 47 | ->>> d.aloc[:, ['x', 'y']] +>>> di.aloc[:, ['x', 'y']] Empty DictOfSeries Columns: [] @@ -298,16 +298,16 @@ For scalar and array-like indexer with label values, the keys are handled gracef array-like column indexers. ``` ->>> d.aloc[1] +>>> di.aloc[1] a | b | c | d | ==== | ======= | ======= | ======= | 1 7 | no data | no data | no data | ->>> d.aloc[99] +>>> di.aloc[99] Empty DictOfSeries Columns: ['a', 'b', 'c', 'd'] ->>> d.aloc[[3,6,7,18]] +>>> di.aloc[[3,6,7,18]] a | b | c | d | ===== | ==== | ===== | ==== | 3 21 | 3 6 | 6 27 | 6 0 | @@ -316,7 +316,7 @@ Columns: ['a', 'b', 'c', 'd'] The length of columns can differ: ``` ->>> d.aloc[[3,6,7,18]].aloc[[3,6]] +>>> di.aloc[[3,6,7,18]].aloc[[3,6]] a | b | c | d | ===== | ==== | ===== | ==== | 3 21 | 3 6 | 6 27 | 6 0 | @@ -329,7 +329,7 @@ Boolean array-likes as row indexer For array-like indexer that hold boolean values, the length of the indexer and the length of all column(s) to index must match. ``` ->>> d.aloc[[True,False,False,True,False]] +>>> di.aloc[[True,False,False,True,False]] a | b | c | d | ===== | ==== | ===== | ==== | 0 0 | 2 5 | 4 7 | 6 0 | @@ -337,7 +337,7 @@ the length of all column(s) to index must match. ``` If the length does not match a `IndexError` is raised: ``` ->>> d.aloc[[True,False,False]] +>>> di.aloc[[True,False,False]] Traceback (most recent call last): ... IndexError: failed for column a: Boolean index has wrong length: 3 instead of 5 @@ -365,7 +365,7 @@ When using a pandas.Series as row indexer with `aloc`, all its magic comes to li The index of the given series align itself with the index of each column separately and is this way used as a filter. 
``` ->>> s = d['b'] + 100 +>>> s = di['b'] + 100 >>> s 2 105 3 106 @@ -374,7 +374,7 @@ The index of the given series align itself with the index of each column separat 6 109 Name: b, dtype: int64 ->>> d.aloc[s] +>>> di.aloc[s] a | b | c | d | ===== | ==== | ===== | ==== | 2 14 | 2 5 | 4 7 | 6 0 | @@ -393,7 +393,7 @@ The series align the same way as explained above, but additional only the `True` Thus `False`-values are treated like missing indices. The behavior here is analogous to `s1.loc[s2[s2].index]`. ``` ->>> boolseries = d['b'] > 6 +>>> boolseries = di['b'] > 6 >>> boolseries 2 False 3 False @@ -402,7 +402,7 @@ Thus `False`-values are treated like missing indices. The behavior here is analo 6 True Name: b, dtype: bool ->>> d.aloc[boolseries] +>>> di.aloc[boolseries] a | b | c | d | ===== | ==== | ===== | ==== | 4 28 | 4 7 | 4 7 | 6 0 | @@ -414,14 +414,14 @@ To evaluate boolean values is a very handy feature, as it can easily used with m nicely with writing those as one-liner: ``` ->>> d.aloc[d['b'] > 6] +>>> di.aloc[di['b'] > 6] a | b | c | d | ===== | ==== | ===== | ==== | 4 28 | 4 7 | 4 7 | 6 0 | | 5 8 | 5 17 | | | 6 9 | 6 27 | | ->>> d.aloc[(d['a'] > 6) & (d['b'] > 6)] +>>> di.aloc[(di['a'] > 6) & (di['b'] > 6)] a | b | c | d | ===== | ==== | ==== | ======= | 4 28 | 4 7 | 4 7 | no data | @@ -430,7 +430,7 @@ nicely with writing those as one-liner: >**Note:** > ->Nevertheless, something like `d.aloc[d['a'] > d['b']]` do not work, because the comparison fails, +>Nevertheless, something like `di.aloc[di['a'] > di['b']]` does not work, because the comparison fails, >as long as the two series objects not have the same index. But maybe one want to checkout >[DictOfSeries.index_of()](/docs/methods_and_properties.md#diosdictofseriesindex_of). @@ -453,7 +453,7 @@ Every inner list-like item is applied as row indexer to the according column. 
3 21 | 5 8 | 7 37 | 9 3 | 4 28 | 6 9 | 8 47 | 10 4 | ->>> d.aloc[ [d['a'], [True,False,True,False,False], [], [7,8,10]] ] +>>> di.aloc[ [di['a'], [True,False,True,False,False], [], [7,8,10]] ] a | b | c | d | ===== | ==== | ======= | ===== | 0 0 | 2 5 | no data | 7 1 | @@ -463,7 +463,7 @@ Every inner list-like item is applied as row indexer to the according column. 4 28 | | | | >>> ar = np.array([2,3]) ->>> d.aloc[[ar, ar+1, ar+2, ar+3]] +>>> di.aloc[[ar, ar+1, ar+2, ar+3]] a | b | c | d | ===== | ==== | ===== | ==== | 2 14 | 3 6 | 4 7 | 6 0 | @@ -474,7 +474,7 @@ Even this looks like a 2D-indexer, that are explained in the next section, it is In contrast to the 2D-indexer, we also can provide a column key, to pre-filter the columns. ``` ->>> d.aloc[[ar, ar+1, ar+3], ['a','b','d']] +>>> di.aloc[[ar, ar+1, ar+3], ['a','b','d']] a | b | d | ===== | ==== | ==== | 2 14 | 3 6 | 6 0 | -- GitLab