diff --git a/dios/base.py b/dios/base.py new file mode 100644 index 0000000000000000000000000000000000000000..48a53002667b5493fab12c58978f038fec0b781f --- /dev/null +++ b/dios/base.py @@ -0,0 +1,527 @@ +#!/usr/bin/env python + +from . import operators as ops +from . import lib + +from .lib import ( + _CAST_POLICIES, + _throw_MixedItype_err_or_warn, + _find_least_common_itype, +) + +from abc import abstractmethod +import pandas as pd +import operator as op +import functools as ftools + +from pandas.core.common import is_bool_indexer as _is_bool_indexer +import pandas.core.dtypes.common as pdcom + +""" +Unlike the example says, return lists False, not True +>>is_iterator([1, 2, 3]) +>>False +""" +from pandas.core.dtypes.common import is_iterator as _is_iterator + +__author__ = "Bert Palm" +__email__ = "bert.palm@ufz.de" +__copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ" + + +class _DiosBase: + + @property + @abstractmethod + def _constructor(self): + pass + + def __init__(self, data=None, columns=None, index=None, itype=None, cast_policy='save', fastpath=False): + + self.cast_policy = cast_policy + + # we are called internally + if fastpath: + self._itype = itype or lib.ObjItype + if data is not None: + self._data = data + else: + # it is significantly faster, to provide an index and fill it, + # than to successively build the index by adding data + self._data = pd.Series(dtype='O', index=columns) + + else: + + if index is not None and not isinstance(index, pd.Index): + index = pd.Index(index) + + # itype=None means infer the itype by the data, so we first set to the highest + # possible itype, then insert data, then infer the best-fitting itype. 
+ if itype is None and index is None: + self._itype = lib.ObjItype + else: + if index is not None: + self._itype = lib.get_itype(index) + if itype is not None: + self._itype = lib.get_itype(itype) + + cols = pd.Index([] if columns is None else columns) + if not cols.is_unique: + raise ValueError("columns must be unique") + self._data = pd.Series(dtype='O', index=cols) + + if data is not None: + self._init_insert_data(data, columns, index) + + # self._data still contain nans at all positions, where + # no data was present, but a column-name was given + if self._data.hasnans: + e = pd.Series(dtype='O', index=index) + for c in self.columns[self._data.isna()]: + self._insert(c, e.copy()) + + self._data.index.name = 'columns' + + # we try to infer the itype, but if we still have + # no data, we will set the itype lazy, i.e. with + # the first non-empty _insert() + if itype is None: + if self.empty: + self._itype = 'INFER' + else: + self._itype = _find_least_common_itype(self._data) + if not self._itype.unique: + _throw_MixedItype_err_or_warn(self.itype) + + def _init_insert_data(self, data, columns, index): + """ Insert items of a iterable in self""" + + if _is_iterator(data): + data = list(data) + + if _is_dios_like(data) or isinstance(data, dict): + if columns is None: + pass # data is dict-like + else: + data = {k: data[k] for k in data if k in columns} + + elif isinstance(data, pd.Series): + name = data.name or 0 + if columns is not None and len(columns) > 0: + name = self.columns[0] + data = {name: data} + + elif pdcom.is_nested_list_like(data): + if columns is None: + data = {i: d for i, d in enumerate(data)} + elif len(data) == len(columns): + data = dict(zip(self.columns, data)) + else: + raise ValueError(f"{len(columns)} columns passed, data implies {len(data)} columns") + + elif pdcom.is_list_like(data): + name = 0 if columns is None or len(columns) < 1 else self.columns[0] + data = {name: data} + + else: + raise TypeError("data type not understood") + + for k 
in data: + self._insert(k, pd.Series(data[k], index=index)) + + # ---------------------------------------------------------------------- + # Indexing Methods + + def _insert(self, col, val): + """Insert a fresh new value as pd.Series into self""" + val = list(val) if _is_iterator(val) else val + + if _is_dios_like(val): + val = val.squeeze() + if not isinstance(val, pd.Series): + raise ValueError(f"Cannot insert frame-like with more than one column") + + elif val is None: + val = pd.Series() + + elif not isinstance(val, pd.Series): + raise TypeError(f"Only data of type pandas.Series can be inserted, passed was {type(val)}") + + # set the itype lazy, i.e. when first non-empty + # column is inserted + if self._itype == 'INFER': + if not val.empty: + self._itype = lib.get_itype(val.index) + # cast all pre-inserted empty series + self._cast_all(self._itype, self._policy) + if not self._itype.unique: + _throw_MixedItype_err_or_warn(self._itype) + else: + val = lib.cast_to_itype(val, self.itype, policy=self._policy) + + val.name = col + self._data.at[col] = val.copy(deep=True) + + def __getitem__(self, key): + """ dios[key] -> dios/series """ + key = list(key) if _is_iterator(key) else key + if isinstance(key, tuple): + raise KeyError("tuples are not allowed") + + if pdcom.is_hashable(key): + # NOTE: we use copy here to prevent index + # changes, that could result in an invalid + # itype. A shallow copy is not sufficient. 
+ + # work on columns, return series + return self._data.at[key].copy() + + if _is_dios_like(key): + # work on rows and columns + new = self._getitem_bool_dios(key) + elif isinstance(key, slice): + # work on rows + new = self._slice(key) + elif _is_bool_indexer(key): + # work on rows + new = self._getitem_bool_listlike(key) + else: + # work on columns + data = self._data.loc[key] + new = self._constructor(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True) + + return new + + def _slice(self, key): + """slices self, return copy""" + if key == slice(None): + return self.copy() + + new = self.copy_empty(columns=True) + for k in self.columns: + new._data.at[k] = self._data.at[k][key] + return new + + def _getitem_bool_dios(self, key): + """ Select items by a boolean dios-like drop un-selected indices. """ + + if not _is_bool_dios_like(key): + raise ValueError("Must pass DictOfSeries with boolean values only") + + new = self.copy_empty(columns=True) + for k in self.columns.intersection(key.columns): + dat = self._data.at[k] + val = key[k] + # align rows + idx = val[val].index.intersection(dat.index) + new._data.at[k] = dat[idx] + return new + + def _getitem_bool_listlike(self, key): + new = self.copy_empty(columns=True) + for k in self.columns: + new._data.at[k] = self._data.at[k].loc[key] + return new + + def __setitem__(self, key, value): + """ dios[key] = value """ + key = list(key) if _is_iterator(key) else key + if isinstance(key, tuple): + raise KeyError(f"{key}. 
tuples are not allowed") + + elif pdcom.is_hashable(key): + if isinstance(value, pd.Series) or key not in self.columns: + self._insert(key, value) + elif _is_dios_like(value) or pdcom.is_nested_list_like(value): + raise ValueError("Incompatible indexer with multi-dimensional value") + else: + self._data.at[key][:] = value + + else: + data = self.__getitem__(key) + assert isinstance(data, self.__class__), f"getitem returned data of type {type(data)}" + + # special cases + if _is_dios_like(value): + self._setitem_dios(data, value) + # NOTE: pd.Series also considered list-like + elif pdcom.is_list_like(value): + self._setitem_listlike(data, value) + + # default case + else: + for k in data.columns: + s = data._data.at[k] + s[:] = value + self._data.at[k][s.index] = s + + def _setitem_listlike(self, data, value): + + value = value.values if isinstance(value, pd.Series) else value + + if len(value) != len(data.columns): + raise ValueError(f"array-like value of length {len(value)} could " + f"not be broadcast to indexing result of shape " + f"(.., {len(data.columns)})") + + for i, k in enumerate(data.columns): + s = data._data.at[k] + s[:] = value[i] + self._data.at[k][s.index] = s + + def _setitem_dios(self, data, value): + """ Write values from a dios-like to self. + + No justification or alignment of columns, but of indices. + If value has missing indices, nan's are inserted at that + locations, just like `series.loc[:]=val` or `df[:]=val` do. + + Eg. 
+ di[::2] = di[::3] -> di[::2] + + x | x | x | + ===== | ==== | ====== | + 0 x | 0 z | 0 z | + 2 x | = 3 z | -> 2 NaN | + 4 x | 6 z | 4 NaN | + 6 x | 6 z | + + Parameter + ---------- + data : dios + A maybe trimmed version of self + value : dios, pd.Dataframe + The value to set with the same column dimension like data + """ + + if len(data) != len(value.columns): + raise ValueError(f"shape mismatch: values array of shape " + f"(.., {len(value.columns)}) could not " + f"be broadcast to indexing result of " + f"shape (.., {len(data.columns)})") + + for i, k in enumerate(data): + dat = data._data.at[k] + # .loc cannot handle empty series, + # like `emptySeries.loc[:] = [1,2]` + if dat.empty: + continue + val = value[value.columns[i]] + dat.loc[:] = val + self._data.at[k].loc[dat.index] = dat + + def __delitem__(self, key): + del self._data[key] + + # ------------------------------------------------------------------------------ + # Base properties and basic dunder magic + + @property + def columns(self): + return self._data.index + + @columns.setter + def columns(self, cols): + index = pd.Index(cols) + if not index.is_unique: + raise ValueError("columns index must have unique values") + self._data.index = index + + @property + def itype(self): + if self._itype == 'INFER': + return None + return self._itype + + @itype.setter + def itype(self, itype): + itype = lib.get_itype(itype) + self._cast_all(itype, policy=self._policy) + self._itype = itype + + @property + def cast_policy(self): + return self._policy + + @cast_policy.setter + def cast_policy(self, policy): + if policy not in _CAST_POLICIES: + raise ValueError(f"policy must be one of {_CAST_POLICIES}") + self._policy = policy + + def _cast_all(self, itype, policy): + c = '?' 
+ data = self.copy_empty() + try: + for c in self.columns: + data._data.at[c] = lib.cast_to_itype(self._data.at[c], itype, policy=policy) + except Exception as e: + raise type(e)(f"Column {c}: " + str(e)) from e + + def __len__(self): + return len(self.columns) + + @property + def empty(self): + return len(self) == 0 or all(s.empty for s in self._data) + + def __iter__(self): + yield from self.columns + + def __reversed__(self): + yield from reversed(self.columns) + + def __contains__(self, item): + return item in self.columns + + # ---------------------------------------------------------------------- + # if copy.copy() is copy.copy(): return copy.copy().copy() + + def __copy__(self): + return self.copy(deep=True) + + def __deepcopy__(self, memo=None): + return self.copy(deep=True) + + def copy(self, deep=True): + if deep: + data = pd.Series(dtype='O', index=self.columns) + for c in self.columns: + data.at[c] = self._data.at[c].copy(deep=True) + else: + data = self._data + kws = dict(itype=self._itype, cast_policy=self._policy) + return self._constructor(data=data, fastpath=True, **kws) + + def copy_empty(self, columns=True): + data = None + if columns is True: # is correct + data = pd.Series(dtype='O', index=self.columns) + for c in self.columns: + data.at[c] = pd.Series(dtype=self._data.at[c].dtype) + kws = dict(itype=self._itype, cast_policy=self._policy) + return self._constructor(data=data, fastpath=True, **kws) + + # ------------------------------------------------------------------------------ + # Operators + + def _op1(self, op): + new = self.copy_empty(columns=True) + try: + for k in self.columns: + new[k] = op(self[k]) + except Exception as e: + raise type(e)(f"'{ops.OP_MAP[op]} dios' failed: " + str(e)) from e + return new + + def _op2(self, op, other, align=True, inplace=False): + def raiseif(kself, kother, s): + if kself != kother: + raise ValueError(f"{s} does not match, {s} left: {kself}, {s} right: {kother}") + + def doalign(left, right): + return 
left.align(right, join='inner') if align else (left, right) + + def gen(): + if _is_dios_like(other): + raiseif(list(self), list(other), 'keys') + for k in self.columns: + left, right = self[k], other[k] + yield k, op(*doalign(left, right)) + elif isinstance(other, pd.Series): + for k in self.columns: + left, right = self[k], other + yield k, op(*doalign(left, right)) + elif pdcom.is_dict_like(other): + raiseif(sorted(self), sorted(other), 'keys') + for k in self.columns: + yield k, op(self[k], other[k]) + elif pdcom.is_nested_list_like(other): + raiseif(len(self), len(other), 'length') + for i, k in enumerate(self.columns): + yield k, op(self[k], other[i]) + elif pdcom.is_scalar(other) or pdcom.is_list_like(other): + for k in self.columns: + yield k, op(self[k], other) + else: + raise NotImplementedError + + new = self if inplace else self.copy_empty(columns=True) + try: + for k, val in gen(): + new[k] = val + except Exception as e: + raise type(e)(f"'dios {ops.OP_MAP[op]} other' failed: " + str(e)) from e + return new + + __neg__ = ftools.partialmethod(_op1, op.neg) + __abs__ = ftools.partialmethod(_op1, op.abs) + __invert__ = ftools.partialmethod(_op1, op.inv) + __eq__ = ftools.partialmethod(_op2, op.eq, align=False) + __ne__ = ftools.partialmethod(_op2, op.ne, align=False) + __le__ = ftools.partialmethod(_op2, op.le, align=False) + __ge__ = ftools.partialmethod(_op2, op.ge, align=False) + __lt__ = ftools.partialmethod(_op2, op.lt, align=False) + __gt__ = ftools.partialmethod(_op2, op.gt, align=False) + __add__ = ftools.partialmethod(_op2, op.add) + __sub__ = ftools.partialmethod(_op2, op.sub) + __mul__ = ftools.partialmethod(_op2, op.mul) + __mod__ = ftools.partialmethod(_op2, op.mod) + __truediv__ = ftools.partialmethod(_op2, op.truediv) + __floordiv__ = ftools.partialmethod(_op2, op.floordiv) + __pow__ = ftools.partialmethod(_op2, op.pow) + __and__ = ftools.partialmethod(_op2, op.and_) + __or__ = ftools.partialmethod(_op2, op.or_) + __xor__ = 
ftools.partialmethod(_op2, op.xor) + + # ------------------------------------------------------------------------------ + # Indexer + + @property + def loc(self): + return _LocIndexer(self) + + @property + def iloc(self): + return _iLocIndexer(self) + + @property + def aloc(self): + return _aLocIndexer(self) + + @property + def at(self): + return _AtIndexer(self) + + @property + def iat(self): + return _iAtIndexer(self) + + +def _is_dios_like(obj) -> bool: + # must have columns + # columns is some kind of pd.Index + # iter will iter through columns + # a `in` obj check if obj is in columns + # obj[key] will give a pd.Series + # obj.squeeze() give pd.Series if len(obj) == 1 + return isinstance(obj, _DiosBase) or isinstance(obj, pd.DataFrame) + + +def _is_bool_series(obj) -> bool: + return isinstance(obj, pd.Series) and obj.dtype == bool + + +def _is_bool_dios_like(obj) -> bool: + if not _is_dios_like(obj): + return False + dtypes = obj.dtypes + if (dtypes == bool).all(): + return True + if (dtypes == 'O').any(): + return obj.apply(_is_bool_indexer).all() + return False + + +# keep this here to prevent cyclic import +from .indexer import _aLocIndexer, _iLocIndexer, _LocIndexer, _iAtIndexer, _AtIndexer diff --git a/dios/dios.py b/dios/dios.py index 7f85b6a682515cf8f88f2a0fdb1decb50204bd4d..55725854b392110661697391b522f70fd2a9c648 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -1,42 +1,14 @@ -from .operators import OP_MAP as _OP_MAP - -from .lib import * -from .lib import ( - _CAST_POLICIES, - _itype_le, _itype_lt, - _throw_MixedItype_err_or_warn, - _find_least_common_itype, -) +from .base import _DiosBase, _is_dios_like +from .lib import Opts, OptsFields, dios_options +from .lib import _find_least_common_itype +import functools as ftools import pandas as pd +import pandas.core.dtypes.common as pdcom import numpy as np -import operator as op - -import functools as ftools - -import pandas.core.dtypes.common as dcom - -_is_list_like = dcom.is_list_like 
-_is_nested_list_like = dcom.is_nested_list_like -_is_scalar = dcom.is_scalar -_is_integer = dcom.is_integer -_is_dict_like = dcom.is_dict_like -_is_number = dcom.is_number -_is_hashable = dcom.is_hashable - -from pandas.core.common import is_bool_indexer as _is_bool_indexer - -""" -Unlike the example says, return lists False, not True ->>is_iterator([1, 2, 3]) ->>False -""" -from pandas.core.dtypes.common import is_iterator as _is_iterator -from typing import Union, Any - -class DictOfSeries: +class DictOfSeries(_DiosBase): """ A data frame where every column has its own index. DictOfSeries is a collection of pd.Series's which aim to be as close as possible similar to @@ -44,355 +16,40 @@ class DictOfSeries: unlike the former, which provide a single row-index for all columns. This solves problems with unaligned data and data which varies widely in length. - Indexing with ``di[]``, ``di.loc[]`` and ``di.iloc[]`` should work analogous to these methods - from pd.DataFrame. The indexer can be a single label, a slice, a list-like, a boolean list-like, - or a boolean dios/pd.DataFrame and can be used to selectively get or set data. - + Indexing with ``di[]``, ``di.loc[]`` and ``di.iloc[]`` should work analogous to these methods + from pd.DataFrame. The indexer can be a single label, a slice, a list-like, a boolean list-like, + or a boolean DictOfSeries/pd.DataFrame and can be used to selectively get or set data. + Parameters ---------- data : array-like, Iterable, dict, or scalar value Contains data stored in Series. - columns : array-like + columns : array-like Column labels to use for resulting frame. Will default to - RangeIndex (0, 1, 2, ..., n) if no column labels are provided. - - itype : Itype, pd.Index, Itype-string-repr, type - Index type that every series in this dios should have. - if None, the index-type is inferred each time a series is inserted - or deleted. - - cast_policy : str - Policy to use for down-casting an itype. 
- """ - - # ------------------------------------------------------------------------------ - # Constructors - - def __init__(self, data=None, columns=None, index=None, itype=None, cast_policy='save', fastpath=False): - - self.cast_policy = cast_policy - - # we are called internally - if fastpath: - self._itype = itype or ObjItype - if data is not None: - self._data = data - else: - # it is significantly faster, to provide an index and fill it, - # than to successively build the index by adding data - self._data = pd.Series(dtype='O', index=columns) - - else: - - if index is not None and not isinstance(index, pd.Index): - index = pd.Index(index) - - # itype=None means infer the itype by the data, so we first set to the highest - # possible itype, then insert data, then infer the best-fitting itype. - if itype is None and index is None: - self._itype = ObjItype - else: - if index is not None: - self._itype = get_itype(index) - if itype is not None: - self._itype = get_itype(itype) - - cols = pd.Index([] if columns is None else columns) - if not cols.is_unique: - raise ValueError("columns must be unique") - self._data = pd.Series(dtype='O', index=cols) - - if data is not None: - self._init_insert_data(data, columns, index) - - # self._data still contain nans at all positions, where - # no data was present, but a column-name was given - if self._data.hasnans: - e = pd.Series(dtype='O', index=index) - for c in self.columns[self._data.isna()]: - self._insert(c, e.copy()) - - self._data.index.name = 'columns' - - # we try to infer the itype, but if we still have - # no data, we will set the itype lazy, i.e. 
with - # the first non-empty _insert() - if itype is None: - if self.empty: - self._itype = 'INFER' - else: - self._itype = _find_least_common_itype(self._data) - if not self._itype.unique: - _throw_MixedItype_err_or_warn(self.itype) - - def _init_insert_data(self, data, columns, index): - """ Insert items of a iterable in self""" - - if _is_iterator(data): - data = list(data) - - if _is_dios_like(data) or isinstance(data, dict): - if columns is None: - pass # data is dict-like - else: - data = {k: data[k] for k in data if k in columns} - - elif isinstance(data, pd.Series): - name = data.name or 0 - if columns is not None and len(columns) > 0: - name = self.columns[0] - data = {name: data} - - elif _is_nested_list_like(data): - if columns is None: - data = {i: d for i, d in enumerate(data)} - elif len(data) == len(columns): - data = dict(zip(self.columns, data)) - else: - raise ValueError(f"{len(columns)} columns passed, data implies {len(data)} columns") - - elif _is_list_like(data): - name = 0 if columns is None or len(columns) < 1 else self.columns[0] - data = {name: data} - - else: - raise TypeError("data type not understood") + RangeIndex(0, 1, 2, ..., n) if no column labels are provided. - for k in data: - self._insert(k, pd.Series(data[k], index=index)) + index : Index or array-like + Index to use to reindex every given series during init. Ignored if omitted. - # ---------------------------------------------------------------------- - # Indexing Methods - - def _insert(self, col, val): - """Insert a fresh new value as pd.Series into self""" - val = list(val) if _is_iterator(val) else val - - if _is_dios_like(val): - val = val.squeeze() - if not isinstance(val, pd.Series): - raise ValueError(f"Cannot insert frame-like with more than one column") - - elif val is None: - val = pd.Series() - - elif not isinstance(val, pd.Series): - raise TypeError(f"Only data of type pandas.Series can be inserted, passed was {type(val)}") - - # set the itype lazy, i.e. 
when first non-empty - # column is inserted - if self._itype == 'INFER': - if not val.empty: - self._itype = get_itype(val.index) - # cast all pre-inserted empty series - self._cast_all(self._itype, self._policy) - if not self._itype.unique: - _throw_MixedItype_err_or_warn(self._itype) - else: - val = cast_to_itype(val, self.itype, policy=self._policy) - - val.name = col - self._data.at[col] = val.copy(deep=True) - - def __getitem__(self, key): - """ dios[key] -> dios/series """ - key = list(key) if _is_iterator(key) else key - if isinstance(key, tuple): - raise KeyError("tuples are not allowed") - - if _is_hashable(key): - # NOTE: we use copy here to prevent index - # changes, that could result in an invalid - # itype. A shallow copy is not sufficient. - - # work on columns, return series - return self._data.at[key].copy() - - if _is_dios_like(key): - # work on rows and columns - new = self._getitem_bool_dios(key) - elif isinstance(key, slice): - # work on rows - new = self._slice(key) - elif _is_bool_indexer(key): - # work on rows - new = self._getitem_bool_listlike(key) - else: - # work on columns - data = self._data.loc[key] - new = DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True) - - return new - - def _slice(self, key): - """slices self, return copy""" - if key == slice(None): - return self.copy() - - new = self.copy_empty(columns=True) - for k in self.columns: - new._data.at[k] = self._data.at[k][key] - return new - - def _getitem_bool_dios(self, key): - """ Select items by a boolean dios-like drop un-selected indices. 
""" - - if not _is_bool_dios_like(key): - raise ValueError("Must pass DictOfSeries with boolean values only") - - new = self.copy_empty(columns=True) - for k in self.columns.intersection(key.columns): - dat = self._data.at[k] - val = key[k] - # align rows - idx = val[val].index.intersection(dat.index) - new._data.at[k] = dat[idx] - return new - - def _getitem_bool_listlike(self, key): - new = self.copy_empty(columns=True) - for k in self.columns: - new._data.at[k] = self._data.at[k].loc[key] - return new - - def __setitem__(self, key, value): - """ dios[key] = value """ - key = list(key) if _is_iterator(key) else key - if isinstance(key, tuple): - raise KeyError(f"{key}. tuples are not allowed") - - elif _is_hashable(key): - if isinstance(value, pd.Series) or key not in self.columns: - self._insert(key, value) - elif _is_dios_like(value) or _is_nested_list_like(value): - raise ValueError("Incompatible indexer with multi-dimensional value") - else: - self._data.at[key][:] = value - - else: - data = self.__getitem__(key) - assert isinstance(data, self.__class__), f"getitem returned data of type {type(data)}" - - # special cases - if _is_dios_like(value): - self._setitem_dios(data, value) - # NOTE: pd.Series also considered list-like - elif _is_list_like(value): - self._setitem_listlike(data, value) - - # default case - else: - for k in data.columns: - s = data._data.at[k] - s[:] = value - self._data.at[k][s.index] = s - - def _setitem_listlike(self, data, value): - - value = value.values if isinstance(value, pd.Series) else value - - if len(value) != len(data.columns): - raise ValueError(f"array-like value of length {len(value)} could " - f"not be broadcast to indexing result of shape " - f"(.., {len(data.columns)})") - - for i, k in enumerate(data.columns): - s = data._data.at[k] - s[:] = value[i] - self._data.at[k][s.index] = s - - def _setitem_dios(self, data, value): - """ Write values from a dios-like to self. 
- - No justification or alignment of columns, but of indices. - If value has missing indices, nan's are inserted at that - locations, just like `series.loc[:]=val` or `df[:]=val` do. - - Eg. - di[::2] = di[::3] -> di[::2] + itype : Itype, pd.Index, Itype-string-repr or type + Every series that is inserted, must have an index of this type or any + of this types subtypes. + If None, the itype is inferred as soon as the first non-empty series is inserted. - x | x | x | - ===== | ==== | ====== | - 0 x | 0 z | 0 z | - 2 x | = 3 z | -> 2 NaN | - 4 x | 6 z | 4 NaN | - 6 x | 6 z | - - Parameter - ---------- - data : dios - A maybe trimmed version of self - value : dios, pd.Dataframe - The value to set with the same column dimension like data - """ - - if len(data) != len(value.columns): - raise ValueError(f"shape mismatch: values array of shape " - f"(.., {len(value.columns)}) could not " - f"be broadcast to indexing result of " - f"shape (.., {len(data.columns)})") - - for i, k in enumerate(data): - dat = data._data.at[k] - # .loc cannot handle empty series, - # like `emptySeries.loc[:] = [1,2]` - if dat.empty: - continue - val = value[value.columns[i]] - dat.loc[:] = val - self._data.at[k].loc[dat.index] = dat - - def __delitem__(self, key): - del self._data[key] - - # ------------------------------------------------------------------------------ - # Base properties and basic dunder magic - - @property - def columns(self): - return self._data.index - - @columns.setter - def columns(self, cols): - index = pd.Index(cols) - if not index.is_unique: - raise ValueError("columns index must have unique values") - self._data.index = index - - @property - def itype(self): - if self._itype == 'INFER': - return None - return self._itype + cast_policy : {'save', 'force', 'never'}, default 'save' + Policy used for (down-)casting the index of a series if its type does not match + the ``itype``. 
+ """ - @itype.setter - def itype(self, itype): - itype = get_itype(itype) - self._cast_all(itype, policy=self._policy) - self._itype = itype + def __init__(self, data=None, columns=None, index=None, itype=None, cast_policy='save', fastpath=False): + super().__init__( + data=data, columns=columns, index=index, itype=itype, cast_policy=cast_policy, fastpath=fastpath + ) @property - def cast_policy(self): - return self._policy - - @cast_policy.setter - def cast_policy(self, policy): - if policy not in _CAST_POLICIES: - raise ValueError(f"policy must be one of {_CAST_POLICIES}") - self._policy = policy - - def _cast_all(self, itype, policy): - c = '?' - data = self.copy_empty() - try: - for c in self.columns: - data._data.at[c] = cast_to_itype(self._data.at[c], itype, policy=policy) - except Exception as e: - raise type(e)(f"Column {c}: " + str(e)) from e - - def __len__(self): - return len(self.columns) + def _constructor(self): + return DictOfSeries @property def indexes(self): @@ -412,23 +69,10 @@ class DictOfSeries: def lengths(self): return self._data.apply(len) - @property - def empty(self): - return len(self) == 0 or all(s.empty for s in self._data) - @property def size(self): return self.lengths.sum() - def __iter__(self): - yield from self.columns - - def __reversed__(self): - yield from reversed(self.columns) - - def __contains__(self, item): - return item in self.columns - # ------------------------------------------------------------------------------ # Dict-like methods @@ -480,7 +124,7 @@ class DictOfSeries: # ------------------------------------------------------------------------------ # Broadcasting methods and helper - def for_each(self, attr_or_callable, **kwargs): + def for_each(self, attr_or_callable, **kwds): """ Apply a callable or a pandas.Series method or property on each column. @@ -492,7 +136,7 @@ class DictOfSeries: could be specified as string. If a callable is given it must take pandas.Series as the only positional argument. 
- **kwargs: any + **kwds: any kwargs to passed to callable Returns @@ -554,7 +198,7 @@ class DictOfSeries: for c in self.columns: dat = self._data.at[c] if call: - data.at[c] = attr_or_callable(dat, **kwargs) + data.at[c] = attr_or_callable(dat, **kwds) else: data.at[c] = attr_or_callable.fget(dat) return data @@ -618,7 +262,7 @@ class DictOfSeries: dat = self._data.at[c].values if raw else self._data.at[c] s = func(dat, *args, **kwds) result.at[c] = s - if _is_scalar(s): + if pdcom.is_scalar(s): need_convert = True else: need_dios = True @@ -628,7 +272,7 @@ class DictOfSeries: if need_convert: for c in result.index: result.at[c] = pd.Series(result[c]) - itype = _find_least_common_itype(result) + itype = _find_least_common_itype(result) result = DictOfSeries(data=result, itype=itype, fastpath=True) else: raise ValueError(axis) @@ -739,6 +383,17 @@ class DictOfSeries: data = self.for_each('astype', dtype=dtype, copy=copy, errors=errors) return DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True) + def memory_usage(self, index=True, deep=False): + return self.for_each(pd.Series.memory_usage, index=index, deep=deep).sum() + + def to_df(self): + df_or_ser = self._data.apply(lambda s: s).transpose() + return pd.DataFrame() if isinstance(df_or_ser, pd.Series) else df_or_ser + + @property + def debugDf(self): + return self.to_df() + # ---------------------------------------------------------------------- # Boolean stuff @@ -781,34 +436,6 @@ class DictOfSeries: data = self.for_each('notna') return DictOfSeries(data=data, itype=self.itype, cast_policy=self._policy, fastpath=True) - # ---------------------------------------------------------------------- - # if copy.copy() is copy.copy(): return copy.copy().copy() - - def __copy__(self): - return self.copy(deep=True) - - def __deepcopy__(self, memo=None): - return self.copy(deep=True) - - def copy(self, deep=True): - if deep: - data = pd.Series(dtype='O', index=self.columns) - for c in 
self.columns:
-                data.at[c] = self._data.at[c].copy(deep=True)
-        else:
-            data = self._data
-        kws = dict(itype=self._itype, cast_policy=self._policy)
-        return DictOfSeries(data=data, fastpath=True, **kws)
-
-    def copy_empty(self, columns=True):
-        data = None
-        if columns is True:  # is correct
-            data = pd.Series(dtype='O', index=self.columns)
-            for c in self.columns:
-                data.at[c] = pd.Series(dtype=self._data.at[c].dtype)
-        kws = dict(itype=self._itype, cast_policy=self._policy)
-        return DictOfSeries(data=data, fastpath=True, **kws)
-
     # ----------------------------------------------------------------------
     # Rendering Methods
 
@@ -875,113 +502,6 @@ class DictOfSeries:
         return pprint_dios(self, **kwargs)
 
-    def memory_usage(self, index=True, deep=False):
-        return self.for_each(pd.Series.memory_usage, index=index, deep=deep).sum()
-
-    def to_df(self):
-        df_or_ser = self._data.apply(lambda s: s).transpose()
-        return pd.DataFrame() if isinstance(df_or_ser, pd.Series) else df_or_ser
-
-    @property
-    def debugDf(self):
-        return self.to_df()
-
-    # ------------------------------------------------------------------------------
-    # Operators
-
-    def _op1(self, op):
-        new = self.copy_empty(columns=True)
-        try:
-            for k in self.columns:
-                new[k] = op(self[k])
-        except Exception as e:
-            raise type(e)(f"'{_OP_MAP[op]} dios' failed: " + str(e)) from e
-        return new
-
-    def _op2(self, op, other, align=True, inplace=False):
-        def raiseif(kself, kother, s):
-            if kself != kother:
-                raise ValueError(f"{s} does not match, {s} left: {kself}, {s} right: {kother}")
-
-        def doalign(left, right):
-            return left.align(right, join='inner') if align else (left, right)
-
-        def gen():
-            if _is_dios_like(other):
-                raiseif(list(self), list(other), 'keys')
-                for k in self.columns:
-                    left, right = self[k], other[k]
-                    yield k, op(*doalign(left, right))
-            elif isinstance(other, pd.Series):
-                for k in self.columns:
-                    left, right = self[k], other
-                    yield k, op(*doalign(left, right))
-            elif _is_dict_like(other):
-                raiseif(sorted(self), sorted(other), 'keys')
-                for k in self.columns:
-                    yield k, op(self[k], other[k])
-            elif _is_nested_list_like(other):
-                raiseif(len(self), len(other), 'length')
-                for i, k in enumerate(self.columns):
-                    yield k, op(self[k], other[i])
-            elif _is_scalar(other) or _is_list_like(other):
-                for k in self.columns:
-                    yield k, op(self[k], other)
-            else:
-                raise NotImplementedError
-
-        new = self if inplace else self.copy_empty(columns=True)
-        try:
-            for k, val in gen():
-                new[k] = val
-        except Exception as e:
-            raise type(e)(f"'dios {_OP_MAP[op]} other' failed: " + str(e)) from e
-        return new
-
-    __neg__ = ftools.partialmethod(_op1, op.neg)
-    __abs__ = ftools.partialmethod(_op1, op.abs)
-    __invert__ = ftools.partialmethod(_op1, op.inv)
-    __eq__ = ftools.partialmethod(_op2, op.eq, align=False)
-    __ne__ = ftools.partialmethod(_op2, op.ne, align=False)
-    __le__ = ftools.partialmethod(_op2, op.le, align=False)
-    __ge__ = ftools.partialmethod(_op2, op.ge, align=False)
-    __lt__ = ftools.partialmethod(_op2, op.lt, align=False)
-    __gt__ = ftools.partialmethod(_op2, op.gt, align=False)
-    __add__ = ftools.partialmethod(_op2, op.add)
-    __sub__ = ftools.partialmethod(_op2, op.sub)
-    __mul__ = ftools.partialmethod(_op2, op.mul)
-    __mod__ = ftools.partialmethod(_op2, op.mod)
-    __truediv__ = ftools.partialmethod(_op2, op.truediv)
-    __floordiv__ = ftools.partialmethod(_op2, op.floordiv)
-    __pow__ = ftools.partialmethod(_op2, op.pow)
-    __and__ = ftools.partialmethod(_op2, op.and_)
-    __or__ = ftools.partialmethod(_op2, op.or_)
-    __xor__ = ftools.partialmethod(_op2, op.xor)
-
-    # ------------------------------------------------------------------------------
-    # Indexer
-
-    @property
-    def loc(self):
-        return _LocIndexer(self)
-
-    @property
-    def iloc(self):
-        return _iLocIndexer(self)
-
-    @property
-    def aloc(self):
-        return _aLocIndexer(self)
-
-    @property
-    def at(self):
-        return _AtIndexer(self)
-
-    @property
-    def iat(self):
-        return _iAtIndexer(self)
-
-
 def _empty_repr(di):
     return f"Empty DictOfSeries\n" \
@@ -1119,35 +639,6 @@ def _to_aligned_df(dios, no_value=' '):
     return df
 
 
-def _is_list_like_not_nested(obj):
-    return _is_list_like(obj) and not _is_nested_list_like(obj)
-
-
-def _is_dios_like(obj) -> bool:
-    # must have columns
-    # columns is some kind of pd.Index
-    # iter will iter through columns
-    # a `in` obj check if obj is in columns
-    # obj[key] will give a pd.Series
-    # obj.squeeze() give pd.Series if len(obj) == 1
-    return isinstance(obj, DictOfSeries) or isinstance(obj, pd.DataFrame)
-
-
-def _is_bool_series(obj) -> bool:
-    return isinstance(obj, pd.Series) and obj.dtype == bool
-
-
-def _is_bool_dios_like(obj) -> bool:
-    if not _is_dios_like(obj):
-        return False
-    dtypes = obj.dtypes
-    if (dtypes == bool).all():
-        return True
-    if (dtypes == 'O').any():
-        return obj.apply(_is_bool_indexer).all()
-    return False
-
-
 def to_dios(obj) -> DictOfSeries:
     if isinstance(obj, DictOfSeries):
         return obj
@@ -1163,6 +654,3 @@ def __monkey_patch_pandas():
 
 
 __monkey_patch_pandas()
-
-# keep this here to prevent cyclic import
-from .indexer import _aLocIndexer, _iLocIndexer, _LocIndexer, _iAtIndexer, _AtIndexer
diff --git a/dios/indexer.py b/dios/indexer.py
index a8be985e8c86567d5638e85ff11fdd1fcc41239b..f32d01fd9284563b39e4bea99db7c96f68b16ada 100644
--- a/dios/indexer.py
+++ b/dios/indexer.py
@@ -1,8 +1,6 @@
-from .dios import (
-    DictOfSeries,
+from .base import (
+    _DiosBase,
     _is_dios_like,
-    _is_bool_series,
-    _is_list_like_not_nested,
     _is_bool_dios_like,
     _is_iterator)
 
@@ -23,7 +21,7 @@ _is_null_slice = ccom.is_null_slice
 
 class _Indexer:
 
-    def __init__(self, obj: DictOfSeries):
+    def __init__(self, obj: _DiosBase):
         self.obj = obj
         self._data = obj._data
 
@@ -271,7 +269,7 @@ class _aLocIndexer(_Indexer):
         if lowdim:
             return data.squeeze()
         else:
-            return DictOfSeries(data=data, fastpath=True, **kws)
+            return self.obj._constructor(data=data, fastpath=True, **kws)
 
     def __setitem__(self, key, value):
         rowkeys, colkeys, _ = self._unpack_key_aloc(key)