def reduce_columns(self, func, initial=None, skipna=False):
    """
    Reduce all columns to a single pandas.Series by a given function.

    Apply a function of two pandas.Series as arguments, cumulatively to all
    columns, from left to right, so as to reduce the columns to a single
    pandas.Series. If ``initial`` is present, it is placed before the columns
    in the calculation, and serves as a default when the columns are empty.

    Parameters
    ----------
    func : function
        The function must take two identically indexed pandas.Series and
        should return a single pandas.Series with the same index.

    initial : column-label or pd.Series, default None
        The series to start with. If None, a dummy series is created, with
        the indices of all columns and the first seen values. Note that
        ``func`` is still applied to every column afterwards (including the
        one a first-seen value came from).

    skipna : bool, default False
        If True, NaN values are dropped from the start series and from the
        columns before ``func`` is applied.

    Returns
    -------
    pandas.Series
        A series with the reducing result and the index of the start series,
        defined by ``initial``. A passed ``initial`` series is copied and
        never modified.

    Raises
    ------
    ValueError
        If ``initial`` is neither None, a pandas.Series nor a column label.
    """
    if initial is None:
        # Explicit dtype: relying on the default dtype of a data-less Series
        # is deprecated; float64 is the NaN-filled default this code expects.
        value = pd.Series(index=self.index_of('all'), dtype='float64')
        for column in self._data:
            # keep the first seen (non-NaN) value for every index label
            value = value.combine_first(column)
    elif isinstance(initial, pd.Series):
        value = initial.copy()
    elif initial in self.columns:
        value = self._data.at[initial].copy()
    else:
        raise ValueError("initial must be pd.Series, a column label or None")

    if skipna:
        # work on a NaN-free copy; results are written back to `value` below
        val = value.dropna()
        data = self.dropna()._data
    else:
        val = value
        data = self._data

    for column in data:
        # `Index & Index` is no longer a set intersection in current pandas
        # (it is an element-wise logical op), so use the explicit method.
        idx = val.index.intersection(column.index)
        if len(idx) == 0:
            continue
        left, right = val.loc[idx], column.loc[idx]
        val.loc[idx] = func(left, right)

    if skipna:
        value.loc[val.index] = val
    return value
DictOfSeries(_DiosBase): return self.for_each(pd.Series.min, skipna=skipna) elif axis in [1, 'columns']: func = lambda s1, s2: s1.where(s1 > s2, s2) - res = self._reduce_horizontal(func, -np.inf) - if not skipna: - res.loc[self.isna().any(axis=1)] = np.nan - return res + return self.reduce_columns(func, skipna=skipna) raise ValueError(axis) # ---------------------------------------------------------------------- @@ -514,7 +527,8 @@ class DictOfSeries(_DiosBase): return self._data.apply(all) elif axis in [1, 'columns']: func = lambda s1, s2: s1.astype(bool) & s2.astype(bool) - return self._reduce_horizontal(func, True) + init = pd.Series(True, dtype=bool, index=self.index_of('all')) + return self.reduce_columns(func, init) elif axis is None: return self._data.apply(all).all() raise ValueError(axis) @@ -524,7 +538,8 @@ class DictOfSeries(_DiosBase): return self._data.apply(any) elif axis in [1, 'columns']: func = lambda s1, s2: s1.astype(bool) | s2.astype(bool) - return self._reduce_horizontal(func, False) + init = pd.Series(False, dtype=bool, index=self.index_of('all')) + return self.reduce_columns(func, init) elif axis is None: return self._data.apply(any).any() raise ValueError(axis) @@ -554,7 +569,8 @@ class DictOfSeries(_DiosBase): return data.for_each('hasnans') elif axis in [1, 'columns']: func = lambda s1, s2: s1.isna() | s2.isna() - return data._reduce_horizontal(func, False) + init = pd.Series(False, dtype=bool, index=self.index_of('all')) + return data.reduce_columns(func, init) elif axis is None: return self.isna(drop_empty=drop_empty) raise ValueError(axis)