diff --git a/dios/__init__.py b/dios/__init__.py index 4b1349732ff98584f7397afe68f0d82d72225796..525c5b66cd5d9396859833ddabc016e0effb0b82 100644 --- a/dios/__init__.py +++ b/dios/__init__.py @@ -1,10 +1,4 @@ -# low level -from dios.errors import * from dios.lib import * -from dios.options import * - -# high level -from dios.itypes import * from dios.dios import * diff --git a/dios/dios.py b/dios/dios.py index 954ba13f6acccadd1cc572a2b084a99e4cb5b55d..0597d08680a2cb538a35a0fce5ca9588655dbded 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -1,34 +1,40 @@ +from dios.operators import OP_MAP as _OP_MAP + from dios.lib import * -from dios.options import * -from dios.itypes import * -from dios.errors import * +from dios.lib import _CAST_POLICIES, _itype_le, _throw_MixedItype_err_or_warn + import pandas as pd import numpy as np import operator as op -from functools import partialmethod -from itertools import zip_longest, compress, takewhile - -from functools import wraps -from collections import OrderedDict -from pandas._libs.lib import to_object_array, is_bool_array -from pandas.core.common import is_bool_indexer -from pandas.core.dtypes.common import ( - is_list_like, - is_nested_list_like, - is_scalar, - is_integer, - is_dict_like, - is_number, - is_hashable, -) -from pandas.core.dtypes.common import is_iterator as _is_iterator - - -def is_list_like_not_nested(obj): - return is_list_like(obj) and not is_nested_list_like(obj) - - -def is_dios_like(obj): + +import functools as ftools +import itertools + +import pandas.core.dtypes.common as dcom +_is_list_like = dcom.is_list_like +_is_nested_list_like = dcom.is_nested_list_like +_is_scalar = dcom.is_scalar +_is_integer = dcom.is_integer +_is_dict_like = dcom.is_dict_like +_is_number = dcom.is_number +_is_hashable = dcom.is_hashable + +import pandas.core.common as ccom +_is_bool_indexer = ccom.is_bool_indexer + +# """ +# Unlike the example says, return lists False, not True +# >>is_iterator([1, 2, 3]) +# >>False +# """ +# from pandas.core.dtypes.common import is_iterator + + +def _is_list_like_not_nested(obj): + return _is_list_like(obj) and not _is_nested_list_like(obj) + + +def _is_dios_like(obj): # must have columns # columns is some kind of pd.Index # iter will iter through columns @@ -38,19 +44,10 @@ def is_dios_like(obj): return isinstance(obj, DictOfSeries) or isinstance(obj, pd.DataFrame) -def is_bool_series(obj): +def _is_bool_series(obj): return isinstance(obj, pd.Series) and obj.dtype == bool -def is_iterator(obj): - """ This is only a dummy wrapper, to warn that the docu of this isnt't right. - Unlike the example says, - >>is_iterator([1, 2, 3]) - returns False, not True for lists - """ - return _is_iterator(obj) - - class DictOfSeries: """ DictionaryOfSeries is a collection of pd.Series's which aim to be as close as possible similar to @@ -78,13 +75,13 @@ class DictOfSeries: # (eg. a datetime-like slice cannot work on a numeric index and vice versa). self._itype = get_itype(itype) if not self._itype.unique: - throw_MixedItype_err_or_warn(self.itype) + _throw_MixedItype_err_or_warn(self.itype) - if downcast_policy not in CAST_POLICIES: - raise ValueError(f"downcast_policy must be one of {CAST_POLICIES}") + if downcast_policy not in _CAST_POLICIES: + raise ValueError(f"downcast_policy must be one of {_CAST_POLICIES}") self._policy = downcast_policy - if columns is not None and not is_list_like_not_nested(columns): + if columns is not None and not _is_list_like_not_nested(columns): raise TypeError("'columns' must be some kind of list-like collection.") if data is not None: @@ -103,13 +100,13 @@ class DictOfSeries: def incols(c): return c in columns if columns is not None else True - if isinstance(data, dict) or is_dios_like(data): + if isinstance(data, dict) or _is_dios_like(data): for k in data: if incols(k): self._insert(k, data[k]) - elif is_list_like(data): # also Series ! - data = data if is_nested_list_like(data) else [data] + elif _is_list_like(data): # also Series ! + data = data if _is_nested_list_like(data) else [data] if columns is None: for i, d in enumerate(data): self._insert(i, d) @@ -123,17 +120,15 @@ class DictOfSeries: def _insert(self, col, val): """Insert a fresh new value into self""" - if is_dios_like(val): + if _is_dios_like(val): if len(val) > 1: raise ValueError(f"Cannot insert DictOfSeries " f"with more than one series") val = val.squeeze() else: val = pd.Series(val) - if not isinstance(val, pd.Series): raise ValueError(f"Cannot insert data of type {type(val)}") - val = cast_to_itype(val, self.itype, policy=self._policy) self._data.at[col] = val.copy(deep=True) @@ -152,16 +147,13 @@ class DictOfSeries: @itype.setter def itype(self, newitype): itype = get_itype(newitype) - - if not itype_le(self._itype, itype): + if not _itype_le(self._itype, itype): self.__cast_all(itype) - self._itype = itype - if not itype.unique: - throw_MixedItype_err_or_warn(f"Using a {itype} as dios.itype is experimental. As soon as series with " - f"different index types are inserted, slicing will almost always fail. " - f"You are hereby warned!") + _throw_MixedItype_err_or_warn(f"Using a {itype} as dios.itype is experimental. As soon as series with " + f"different index types are inserted, slicing will almost always fail. " + f"You are hereby warned!") def __cast_all(self, itype): c = '?' @@ -176,15 +168,15 @@ class DictOfSeries: if isinstance(key, tuple): raise KeyError("tuples are not allowed") - elif is_hashable(key): + elif _is_hashable(key): new = self._data.at[key] - elif is_dios_like(key): + elif _is_dios_like(key): # work on rows and columns new = self._getitem_bool_dios(key) elif isinstance(key, slice): # work on rows new = self._slice(key) - elif is_bool_indexer(key): + elif _is_bool_indexer(key): # work on rows new = self._getitem_bool_listlike(key) else: @@ -213,7 +205,7 @@ class DictOfSeries: for k in keys: ser = self._data.at[k] boolser = key[k] - if not is_bool_indexer(boolser): + if not _is_bool_indexer(boolser): raise ValueError("Must pass DictOfSeries with boolean values only") # align rows idx = boolser[boolser].index.intersection(ser.index) @@ -232,17 +224,16 @@ class DictOfSeries: if isinstance(key, tuple): raise KeyError(f"{key}. tuples are not allowed") - elif is_hashable(key): + elif _is_hashable(key): if isinstance(value, pd.Series): self._insert(key, value) else: self._data.at[key][:] = value - else: data = self.__getitem__(key) assert isinstance(data, self.__class__), f"getitem returned data of type {type(data)}" - if is_dios_like(value): + if _is_dios_like(value): self._setitem_dios(data, value) else: for k in data.columns: @@ -283,14 +274,6 @@ class DictOfSeries: from dios.locator import _iAtIndexer return _iAtIndexer(self) - def __str__(self): - return self.__repr__() - - def __repr__(self): - return pprint(self, - max_rows=dios_options[OptsFields.disp_max_rows], - max_cols=dios_options[OptsFields.disp_max_vars]) - @property def empty(self): return len(self) == 0 or all(s.empty for s in self._data) @@ -360,6 +343,14 @@ class DictOfSeries: def copy_empty(self, columns=True): return DictOfSeries(columns=self.columns if columns is True else None, itype=self.itype) + def __str__(self): + return self.__repr__() + + def __repr__(self): + return pprint_dios(self, + max_rows=dios_options[OptsFields.disp_max_rows], + max_cols=dios_options[OptsFields.disp_max_vars]) + def to_df(self): return self._data.apply(lambda s: s).transpose() @@ -367,43 +358,33 @@ class DictOfSeries: def debugDf(self): return self.to_df() - def memory_usage(self, index=True, deep=False): - mem = 0 - for k in self.columns: - mem += self[k].memory_usage(index=index, deep=deep) - return mem - def apply(self, func, axis=0, raw=False, args=(), **kwds): if axis in [1, 'columns']: raise NotImplementedError elif axis in [0, 'index']: - pass + # we cannot use self._data.apply(func=func, args=args, **kwds) + # because this may return a df + need_dios = False + new = [] + for c in self.columns: + s = func(self[c].values if raw else self[c], *args, **kwds) + s = s.squeeze() if isinstance(s, pd.Series) else s + new.append(s) + try: + need_dios = True if len(s) > 1 else need_dios + except TypeError: + pass + + if need_dios: + dios = self.copy_empty(columns=False) + for i, c in enumerate(self.columns): + dios[c] = pd.Series(new[i]) + new = dios + else: + new = pd.Series(data=new, index=self.columns) else: raise ValueError(axis) - - # we cannot use self._data.apply(func=func, args=args, **kwds) - # because this may return a df - - need_dios = False - new = [] - for c in self.columns: - s = func(self[c].values if raw else self[c], *args, **kwds) - s = s.squeeze() if isinstance(s, pd.Series) else s - new.append(s) - try: - if len(s) > 1: - need_dios = True - except TypeError: - pass - - if need_dios: - dios = self.copy_empty(columns=False) - for i, c in enumerate(self.columns): - dios[c] = pd.Series(new[i]) - new = dios - else: - new = pd.Series(data=new, index=self.columns) return new @property @@ -419,13 +400,22 @@ class DictOfSeries: new._data.at[k] = self._data.at[k].astype(dtype=dtype, copy=True, errors=errors) return new + def memory_usage(self, index=True, deep=False): + mem = 0 + for k in self.columns: + mem += self[k].memory_usage(index=index, deep=deep) + return mem + + # ############################################################################# + # Operators + def _op1(self, op): new = self.copy_empty(columns=False) try: for k in self.columns: new[k] = op(self[k]) except Exception as e: - raise type(e)(f"'{OP_MAP[op]} dios' failed: " + str(e)) from e + raise type(e)(f"'{_OP_MAP[op]} dios' failed: " + str(e)) from e return new def _op2(self, op, other, align=True, inplace=False): @@ -437,7 +427,7 @@ class DictOfSeries: return left.align(right, join='inner') if align else (left, right) def gen(): - if is_dios_like(other): + if _is_dios_like(other): raiseif(list(self), list(other), 'keys') for k in self.columns: left, right = self[k], other[k] @@ -446,15 +436,15 @@ class DictOfSeries: for k in self.columns: left, right = self[k], other yield k, op(*doalign(left, right)) - elif is_dict_like(other): + elif _is_dict_like(other): raiseif(sorted(self), sorted(other), 'keys') for k in self.columns: yield k, op(self[k], other[k]) - elif is_nested_list_like(other): + elif _is_nested_list_like(other): raiseif(len(self), len(other), 'length') for i, k in enumerate(self.columns): yield k, op(self[k], other[i]) - elif is_scalar(other) or is_list_like(other): + elif _is_scalar(other) or _is_list_like(other): for k in self.columns: yield k, op(self[k], other) else: @@ -465,31 +455,31 @@ class DictOfSeries: for k, val in gen(): new[k] = val except Exception as e: - raise type(e)(f"'dios {OP_MAP[op]} other' failed: " + str(e)) from e + raise type(e)(f"'dios {_OP_MAP[op]} other' failed: " + str(e)) from e return new - __neg__ = partialmethod(_op1, op.neg) - __abs__ = partialmethod(_op1, op.abs) - __invert__ = partialmethod(_op1, op.inv) - __eq__ = partialmethod(_op2, op.eq, align=False) - __ne__ = partialmethod(_op2, op.ne, align=False) - __le__ = partialmethod(_op2, op.le, align=False) - __ge__ = partialmethod(_op2, op.ge, align=False) - __lt__ = partialmethod(_op2, op.lt, align=False) - __gt__ = partialmethod(_op2, op.gt, align=False) - __add__ = partialmethod(_op2, op.add) - __sub__ = partialmethod(_op2, op.sub) - __mul__ = partialmethod(_op2, op.mul) - __mod__ = partialmethod(_op2, op.mod) - __truediv__ = partialmethod(_op2, op.truediv) - __floordiv__ = partialmethod(_op2, op.floordiv) - __pow__ = partialmethod(_op2, op.pow) - __and__ = partialmethod(_op2, op.and_) - __or__ = partialmethod(_op2, op.or_) - __xor__ = partialmethod(_op2, op.xor) - - -def pprint(dios, max_rows=10, max_cols=2, delim=' '): + __neg__ = ftools.partialmethod(_op1, op.neg) + __abs__ = ftools.partialmethod(_op1, op.abs) + __invert__ = ftools.partialmethod(_op1, op.inv) + __eq__ = ftools.partialmethod(_op2, op.eq, align=False) + __ne__ = ftools.partialmethod(_op2, op.ne, align=False) + __le__ = ftools.partialmethod(_op2, op.le, align=False) + __ge__ = ftools.partialmethod(_op2, op.ge, align=False) + __lt__ = ftools.partialmethod(_op2, op.lt, align=False) + __gt__ = ftools.partialmethod(_op2, op.gt, align=False) + __add__ = ftools.partialmethod(_op2, op.add) + __sub__ = ftools.partialmethod(_op2, op.sub) + __mul__ = ftools.partialmethod(_op2, op.mul) + __mod__ = ftools.partialmethod(_op2, op.mod) + __truediv__ = ftools.partialmethod(_op2, op.truediv) + __floordiv__ = ftools.partialmethod(_op2, op.floordiv) + __pow__ = ftools.partialmethod(_op2, op.pow) + __and__ = ftools.partialmethod(_op2, op.and_) + __or__ = ftools.partialmethod(_op2, op.or_) + __xor__ = ftools.partialmethod(_op2, op.xor) + + +def pprint_dios(dios, max_rows=10, max_cols=2, delim=' '): if dios.empty: return f"Empty DictionaryOfSeries\n" \ f"Columns: {dios.columns.to_list()}" @@ -528,7 +518,7 @@ def pprint(dios, max_rows=10, max_cols=2, delim=' '): outer.append(inner + [" " * ml] * (maxlen - len(s))) # switch outer and inner - rstr = list(map(list, zip_longest(*outer, fillvalue=""))) + rstr = list(map(list, itertools.zip_longest(*outer, fillvalue=""))) txt = "" rstr = [header] + [subheader] + rstr diff --git a/dios/errors.py b/dios/errors.py deleted file mode 100644 index 9df116fb665182532e84a78beea3d558883ced55..0000000000000000000000000000000000000000 --- a/dios/errors.py +++ /dev/null @@ -1,24 +0,0 @@ -import warnings -# do not import dios-stuff here - - -class ItypeWarning(RuntimeWarning): - pass - - -class ItypeCastWarning(ItypeWarning): - pass - - -class ItypeCastError(RuntimeError): - pass - - -class OptionsWarning(UserWarning): - pass - - -class OptionsError(RuntimeError): - pass - - diff --git a/dios/itypes.py b/dios/itypes.py deleted file mode 100644 index 3005c2d8da6c570d597a1a7b90e8d8db02775224..0000000000000000000000000000000000000000 --- a/dios/itypes.py +++ /dev/null @@ -1,228 +0,0 @@ -import pandas as pd -from dios.options import * -from dios.lib import * -from dios.errors import * - - -class CastPolicy: - force = 'force' - save = 'save' - never = 'never' - - -CAST_POLICIES = get_storage_class_values(CastPolicy) - - -class __Itype: - def __init__(self): - raise RuntimeError("DatetimeItype does not allow instances of itself.") - - -class DatetimeItype(__Itype): - name = 'datetime' - repr = 'DatetimeItype' - unique = True - subtypes = (pd.DatetimeIndex,) - min_pditype = pd.DatetimeIndex([]) - - -class IntegerItype(__Itype): - name = 'integer' - repr = 'IntegerItype' - unique = True - subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int) - min_pditype = pd.Int64Index([]) - - -class FloatItype(__Itype): - name = 'float' - repr = 'FloatItype' - subtypes = (pd.Float64Index, float) - unique = True - min_pditype = pd.Float64Index([]) - - -# class MultiItype(__Itype): -# name = "multi" -# repr = 'MultiItype' -# subtypes = (pd.MultiIndex, ) -# unique = ?? - - -class NumericItype(__Itype): - name = "numeric" - repr = 'NumericItype' - _subitypes = (IntegerItype, FloatItype) - subtypes = (_subitypes + IntegerItype.subtypes + FloatItype.subtypes) - unique = False - min_pditype = pd.Float64Index([]) - - -class MixedItype(__Itype): - name = "mixed" - repr = 'MixedItype' - unique = False - _subitypes = (DatetimeItype, IntegerItype, FloatItype, NumericItype) - _otheritypes = (pd.CategoricalIndex, pd.IntervalIndex, pd.PeriodIndex, pd.TimedeltaIndex, pd.Index) - subtypes = (_subitypes + _otheritypes + DatetimeItype.subtypes + NumericItype.subtypes) - min_pditype = pd.Index([]) - - -def is_itype(obj, itype): - """ Check if obj is a instance of the given itype or its str-alias was given""" - - # todo: iter through itype as it could be a tuple, if called like ``is_itype(o, (t1,t2))`` - - # user gave a Itype, like ``DatetimeItype`` - if type(obj) == type and issubclass(obj, itype): - return True - - # user gave a string, like 'datetime' - if isinstance(obj, str) and obj == itype.name: - return True - - return False - - -def is_itype_subtype(obj, itype): - """ Check if obj is a subclass or a instance of a subclass of the given itype""" - - # user gave a subtype, like ``pd.DatetimeIndex`` - if type(obj) == type and issubclass(obj, itype.subtypes): - return True - - # user gave a instance of a subtype, like ``pd.Series(..).index`` - if isinstance(obj, itype.subtypes): - return True - - return False - - -def is_itype_like(obj, itype): - """ Check if obj is a subclass or a instance of the given itype or any of its subtypes""" - return is_itype(obj, itype) or is_itype_subtype(obj, itype) - - -def get_itype(obj): - """ - Return the according Itype, by any of any possible user input, like - - "datetime" - - DatetimeItype - - pd.Series(...).index - - pd.DatetimeIndex - and return the according Itype - """ - if type(obj) == type and issubclass(obj, __Itype): - return obj - - # check if it is the actual type, not a subtype - types = [DatetimeItype, IntegerItype, FloatItype, NumericItype, MixedItype] - for t in types: - if is_itype(obj, t): - return t - - for t in types: - if is_itype_subtype(obj, t): - return t - - raise ValueError(f"{obj} is not a itype, nor any known subtype of a itype, nor a itype string alias") - - -def itype_eq(a, b): - return is_itype(a, b) - - -def itype_lt(a, b): - return is_itype_subtype(a, b) - - -def itype_le(a, b): - return is_itype_like(a, b) - - -def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False): - """ Cast a series (more explicit the type of the index) to fit the itype of a dios. - - Return the casted series if successful, None otherwise. - - Note: - This is very basic number-casting, so in most cases, information from - the old index will be lost after the cast. - """ - - if policy not in CAST_POLICIES: - raise ValueError(f"policy={policy}") - if err not in ['raise', 'ignore']: - raise ValueError(f"err={err}") - if not inplace: - series = series.copy() - itype = get_itype(itype) - series.itype = get_itype(series.index) - - if series.empty: - return pd.Series(index=itype.min_pditype) - - - # up-cast issn't necessary because a dios with a higher - # itype always can take lower itypes. - # series can have dt/int/float/mixed - # dt -> dt -> mixed - # int -> int -> num -> mixed - # float -> float -> num -> mixed - # mixed -> mixed - if itype_le(series.itype, itype): # a <= b - return series - - e = f"A series index of type `{type(series.index)}` cannot be casted to Itype {itype.name}" - - # cast any -> dt always fail. - if is_itype(itype, DatetimeItype): - pass - else: - e += f", as forbidden by the cast-policy `{policy}`." - - if policy == CAST_POLICIES[CastPolicy.never]: - pass - - elif policy == CAST_POLICIES[CastPolicy.force]: - # cast any (dt/float/mixed) -> int - # cast any (dt/float/mixed) -> num - if is_itype(itype, IntegerItype) or is_itype(itype, NumericItype): # a == b or a == c - series.index = pd.RangeIndex(len(series)) - return series - # cast any (dt/int/mixed) -> float - if is_itype(itype, FloatItype): # a == b - series.index = pd.Float64Index(range(len(series))) - return series - - elif policy == CAST_POLICIES[CastPolicy.save]: - # cast int -> float - if is_itype(itype, IntegerItype) and is_itype(series.itype, FloatItype): # a == b and c == d - series.index = series.index.astype(float) - return series - # cast float -> int, maybe if unique - if is_itype(itype, FloatItype) and is_itype(series.itype, IntegerItype): # a == b and c == d - series.index = series.index.astype(int) - if series.index.is_unique: - return series - e = f"The cast with policy {policy} from series index type `{type(series.index)}` to " \ - f"itype {itype.name} resulted in a non-unique index." - # cast mixed -> int/float always fail - - if err == 'raise': - raise ItypeCastError(e) - else: - return None - - -def throw_MixedItype_err_or_warn(itype): - msg = f"'{itype.repr}'. Using a non-unique Itype for a DictionaryOfSeries, is not recommend. " \ - f"As soon as series \nwith different index types are inserted, selecting and slicing data will " \ - f"almost always fail. You are hereby warned!" - if dios_options[OptsFields.mixed_itype_warn_policy] in ['ignore', Opts.itype_ignore]: - pass - elif dios_options[OptsFields.mixed_itype_warn_policy] in ['error', 'err', Opts.itype_err]: - raise ItypeCastError(msg) - else: - warnings.warn(msg, ItypeWarning) - return diff --git a/dios/lib.py b/dios/lib.py index 438ec08a135a0bbd50c2f06a8de67a565b03ac62..4ca07cb3912f8df04b48c29d4ff662b746356211 100644 --- a/dios/lib.py +++ b/dios/lib.py @@ -1,50 +1,272 @@ import pandas as pd -import numpy as np -import contextlib -# do not import dios-stuff here -import operator as op +import warnings -def get_storage_class_values(cls): - return [getattr(cls, c) for c in cls.__dict__ if not c.startswith("_")] +class ItypeWarning(RuntimeWarning): + pass -_OP1_MAP = { - op.inv: "~", - op.neg: "-", - op.abs: "abs()", -} +class ItypeCastWarning(ItypeWarning): + pass -_OP2_COMP_MAP = { - op.eq: '==', - op.ne: '!=', - op.le: '<=', - op.ge: '>=', - op.gt: '>', - op.lt: '<', -} -_OP2_BOOL_MAP = { - op.and_: "&", - op.or_: "|", - op.xor: "^", +class ItypeCastError(RuntimeError): + pass -} -_OP2_ARITH_MAP = { - op.add: "+", - op.sub: "-", - op.mul: "*", - op.pow: "**", -} -_OP2_DIV_MAP = { - op.mod: "%", - op.truediv: "/", - op.floordiv: "//", +class __Itype: + def __init__(self): + raise RuntimeError("DatetimeItype does not allow instances of itself.") + + +class DatetimeItype(__Itype): + name = 'datetime' + repr = 'DatetimeItype' + unique = True + subtypes = (pd.DatetimeIndex,) + min_pditype = pd.DatetimeIndex([]) + + +class IntegerItype(__Itype): + name = 'integer' + repr = 'IntegerItype' + unique = True + subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int) + min_pditype = pd.Int64Index([]) + + +class FloatItype(__Itype): + name = 'float' + repr = 'FloatItype' + subtypes = (pd.Float64Index, float) + unique = True + min_pditype = pd.Float64Index([]) + + +# class MultiItype(__Itype): +# name = "multi" +# repr = 'MultiItype' +# subtypes = (pd.MultiIndex, ) +# unique = ?? + + +class NumericItype(__Itype): + name = "numeric" + repr = 'NumericItype' + _subitypes = (IntegerItype, FloatItype) + subtypes = (_subitypes + IntegerItype.subtypes + FloatItype.subtypes) + unique = False + min_pditype = pd.Float64Index([]) + + +class MixedItype(__Itype): + name = "mixed" + repr = 'MixedItype' + unique = False + _subitypes = (DatetimeItype, IntegerItype, FloatItype, NumericItype) + _otheritypes = (pd.CategoricalIndex, pd.IntervalIndex, pd.PeriodIndex, pd.TimedeltaIndex, pd.Index) + subtypes = (_subitypes + _otheritypes + DatetimeItype.subtypes + NumericItype.subtypes) + min_pditype = pd.Index([]) + + +def is_itype(obj, itype): + """ Check if obj is a instance of the given itype or its str-alias was given""" + + # todo: iter through itype as it could be a tuple, if called like ``is_itype(o, (t1,t2))`` + + # user gave a Itype, like ``DatetimeItype`` + if type(obj) == type and issubclass(obj, itype): + return True + + # user gave a string, like 'datetime' + if isinstance(obj, str) and obj == itype.name: + return True + + return False + + +def is_itype_subtype(obj, itype): + """ Check if obj is a subclass or a instance of a subclass of the given itype""" + + # user gave a subtype, like ``pd.DatetimeIndex`` + if type(obj) == type and issubclass(obj, itype.subtypes): + return True + + # user gave a instance of a subtype, like ``pd.Series(..).index`` + if isinstance(obj, itype.subtypes): + return True + + return False + + +def is_itype_like(obj, itype): + """ Check if obj is a subclass or a instance of the given itype or any of its subtypes""" + return is_itype(obj, itype) or is_itype_subtype(obj, itype) + + +def get_itype(obj): + """ + Return the according Itype, by any of any possible user input, like + - "datetime" + - DatetimeItype + - pd.Series(...).index + - pd.DatetimeIndex + and return the according Itype + """ + if type(obj) == type and issubclass(obj, __Itype): + return obj + + # check if it is the actual type, not a subtype + types = [DatetimeItype, IntegerItype, FloatItype, NumericItype, MixedItype] + for t in types: + if is_itype(obj, t): + return t + + for t in types: + if is_itype_subtype(obj, t): + return t + + raise ValueError(f"{obj} is not a itype, nor any known subtype of a itype, nor a itype string alias") + + +def _itype_eq(a, b): + return is_itype(a, b) + + +def _itype_lt(a, b): + return is_itype_subtype(a, b) + + +def _itype_le(a, b): + return is_itype_like(a, b) + + +################################################################################ +# Casting + +class CastPolicy: + force = 'force' + save = 'save' + never = 'never' + + +_CAST_POLICIES = [CastPolicy.force, CastPolicy.save, CastPolicy.never] + + +def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False): + """ Cast a series (more explicit the type of the index) to fit the itype of a dios. + + Return the casted series if successful, None otherwise. + + Note: + This is very basic number-casting, so in most cases, information from + the old index will be lost after the cast. + """ + + if policy not in _CAST_POLICIES: + raise ValueError(f"policy={policy}") + if err not in ['raise', 'ignore']: + raise ValueError(f"err={err}") + if not inplace: + series = series.copy() + itype = get_itype(itype) + series.itype = get_itype(series.index) + + if series.empty: + return pd.Series(index=itype.min_pditype) + + + # up-cast issn't necessary because a dios with a higher + # itype always can take lower itypes. + # series can have dt/int/float/mixed + # dt -> dt -> mixed + # int -> int -> num -> mixed + # float -> float -> num -> mixed + # mixed -> mixed + if _itype_le(series.itype, itype): # a <= b + return series + + e = f"A series index of type `{type(series.index)}` cannot be casted to Itype {itype.name}" + + # cast any -> dt always fail. + if is_itype(itype, DatetimeItype): + pass + else: + e += f", as forbidden by the cast-policy `{policy}`." + + if policy == CastPolicy.never: + pass + + elif policy == CastPolicy.force: + # cast any (dt/float/mixed) -> int + # cast any (dt/float/mixed) -> num + if is_itype(itype, IntegerItype) or is_itype(itype, NumericItype): # a == b or a == c + series.index = pd.RangeIndex(len(series)) + return series + # cast any (dt/int/mixed) -> float + if is_itype(itype, FloatItype): # a == b + series.index = pd.Float64Index(range(len(series))) + return series + + elif policy == CastPolicy.save: + # cast int -> float + if is_itype(itype, IntegerItype) and is_itype(series.itype, FloatItype): # a == b and c == d + series.index = series.index.astype(float) + return series + # cast float -> int, maybe if unique + if is_itype(itype, FloatItype) and is_itype(series.itype, IntegerItype): # a == b and c == d + series.index = series.index.astype(int) + if series.index.is_unique: + return series + e = f"The cast with policy {policy} from series index type `{type(series.index)}` to " \ + f"itype {itype.name} resulted in a non-unique index." + # cast mixed -> int/float always fail + + if err == 'raise': + raise ItypeCastError(e) + else: + return None + +################################################################################ +# OPTIONS + + +class OptsFields: + """storage class for the keys in ``dios_options``""" + + """How to inform user about mixed Itype (warn/err/ignore)""" + mixed_itype_warn_policy = "mixed_itype_policy" + + """ + Set the number of rows and variables to display in a call that use + ``__repr__`` or ``__str__`` like e.g. ``print(dios)`` do. + """ + disp_max_rows = "disp_max_rows " + disp_max_vars = "disp_max_vars" + + +class Opts: + itype_warn = 'warn' + itype_err = 'err' + itype_ignore = 'ignore' + + +# set default values +dios_options = { + OptsFields.disp_max_rows: 20, + OptsFields.disp_max_vars: 6, + OptsFields.mixed_itype_warn_policy: Opts.itype_warn, } -OP_MAP = _OP2_COMP_MAP.copy() -OP_MAP.update(_OP2_BOOL_MAP) -OP_MAP.update(_OP2_ARITH_MAP) -OP_MAP.update(_OP2_DIV_MAP) -OP_MAP.update(_OP1_MAP) + +def _throw_MixedItype_err_or_warn(itype): + msg = f"'{itype.repr}'. Using a non-unique Itype for a DictionaryOfSeries, is not recommend. " \ + f"As soon as series \nwith different index types are inserted, selecting and slicing data will " \ + f"almost always fail. You are hereby warned!" + if dios_options[OptsFields.mixed_itype_warn_policy] in ['ignore', Opts.itype_ignore]: + pass + elif dios_options[OptsFields.mixed_itype_warn_policy] in ['error', 'err', Opts.itype_err]: + raise ItypeCastError(msg) + else: + warnings.warn(msg, ItypeWarning) + return diff --git a/dios/locator.py b/dios/locator.py index 1d692a06a6c5e7bdd6d06e1e57fd8fc5e5d26f6d..ee4fceb9c82f773db634a5e21b790f73916cfc3b 100644 --- a/dios/locator.py +++ b/dios/locator.py @@ -1,5 +1,16 @@ -from dios.dios import * -from abc import abstractmethod +from dios.dios import _is_dios_like, _is_bool_series, _is_list_like_not_nested +import pandas as pd + +import pandas.core.common as ccom +import pandas.core.dtypes.common as dcom +_is_list_like = dcom.is_list_like +_is_nested_list_like = dcom.is_nested_list_like +_is_scalar = dcom.is_scalar +_is_integer = dcom.is_integer +_is_dict_like = dcom.is_dict_like +_is_number = dcom.is_number +_is_hashable = dcom.is_hashable +_is_bool_indexer = ccom.is_bool_indexer class _Indexer: @@ -33,20 +44,20 @@ class _LocIndexer(_Indexer): def __getitem__(self, key): rowkey, colkey = self._unpack_key(key) - if is_dios_like(rowkey) or is_dios_like(colkey): + if _is_dios_like(rowkey) or _is_dios_like(colkey): raise ValueError("Cannot index with multidimensional key") data = self._data.loc[colkey].copy() # .loc[any, scalar] - if is_hashable(colkey): + if _is_hashable(colkey): new = data.loc[rowkey] # .loc[any, non-scalar] else: for k in data.index: data.at[k] = data.at[k].loc[rowkey] - if is_hashable(rowkey): + if _is_hashable(rowkey): new = data else: new = self._dios.copy_empty(columns=False) @@ -57,11 +68,11 @@ class _LocIndexer(_Indexer): def __setitem__(self, key, value): rowkey, colkey = self._unpack_key(key) - if is_dios_like(rowkey) or is_dios_like(colkey): + if _is_dios_like(rowkey) or _is_dios_like(colkey): raise ValueError("Cannot index with multidimensional key") # .loc[any, scalar] - if is_hashable(colkey): + if _is_hashable(colkey): # .loc[dont-care, new-scalar] = val if colkey not in self._dios.columns: self._dios._insert(colkey, value) @@ -84,20 +95,20 @@ class _iLocIndexer(_Indexer): def __getitem__(self, key): rowkey, colkey = self._unpack_key(key) - if is_dios_like(rowkey) or is_dios_like(colkey): + if _is_dios_like(rowkey) or _is_dios_like(colkey): raise ValueError("Cannot index with multidimensional key") data = self._data.iloc[colkey].copy() # .iloc[any, scalar] - if is_integer(colkey): + if _is_integer(colkey): new = data.iloc[rowkey] # .iloc[any, non-scalar] else: for k in data.index: data.at[k] = data.at[k].iloc[rowkey] - if is_integer(rowkey): + if _is_integer(rowkey): new = data else: new = self._dios.copy_empty(columns=False) @@ -107,12 +118,11 @@ class _iLocIndexer(_Indexer): def __setitem__(self, key, value): rowkey, colkey = self._unpack_key(key) - if is_dios_like(rowkey) or is_dios_like(colkey): + if _is_dios_like(rowkey) or _is_dios_like(colkey): raise ValueError("Cannot index with multidimensional key") - # .iloc[any, scalar] - if is_integer(colkey): + if _is_integer(colkey): self._data.iat[colkey].iloc[rowkey] = value # .iloc[any, non-scalar] @@ -124,6 +134,56 @@ class _iLocIndexer(_Indexer): # ############################################################################# +class _AtIndexer(_Indexer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _check_key(self, key): + if not (isinstance(key, tuple) and len(key) == 2 + and _is_hashable(key[0]) and _is_hashable(key[1])): + raise KeyError(f"{key}. `.at` takes exactly one scalar row-key " + "and one scalar column-key") + + def __getitem__(self, key): + self._check_key(key) + return self._data.at[key[1]].at[key[0]] + + def __setitem__(self, key, value): + self._check_key(key) + if _is_dios_like(value) or _is_nested_list_like(value): + raise TypeError(".at[] cannot be used to set multi-dimensional values, use .aloc[] instead.") + self._data.at[key[1]].at[key[0]] = value + + +# ############################################################################# + + +class _iAtIndexer(_Indexer): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _check_key(self, key): + if not (isinstance(key, tuple) and len(key) == 2 + and _is_integer(key[0]) and _is_integer(key[1])): + raise KeyError(f"{key} `.iat` takes exactly one integer positional " + f"row-key and one integer positional scalar column-key") + + def __getitem__(self, key): + self._check_key(key) + return self._data.iat[key[1]].iat[key[0]] + + def __setitem__(self, key, value): + self._check_key(key) + if _is_dios_like(value) or _is_nested_list_like(value): + raise TypeError(".iat[] cannot be used to set multi-dimensional values, use .aloc[] instead.") + self._data.iat[key[1]].iat[key[0]] = value + + +# ############################################################################# + + class _aLocIndexer(_Indexer): """ align Indexer @@ -140,7 +200,7 @@ class _aLocIndexer(_Indexer): rowkeys, colkeys = self._unpack_key_aloc(key) # full align (rows+cols) - align given dios with ourself - if is_dios_like(value): + if _is_dios_like(value): colkeys = value.columns.intersection(colkeys) for i, c in enumerate(colkeys): l = self._data.at[c] @@ -171,31 +231,31 @@ class _aLocIndexer(_Indexer): Both list always have the same length and also could be empty together. """ # boolean dios - if is_dios_like(key): + if _is_dios_like(key): colkey = self._dios.columns.intersection(key.columns).to_list() rowkey = [] for c in colkey: b = key[c] - if not is_bool_indexer(b): + if not _is_bool_indexer(b): raise ValueError("Must pass DictOfSeries with boolean values only") rowkey += [self._data.at[c].index.intersection(b[b].index)] else: rowkey, colkey = self._unpack_key(key) - if is_dios_like(rowkey) or is_dios_like(colkey): + if _is_dios_like(rowkey) or _is_dios_like(colkey): raise ValueError("multidimensional key must be passed as single key") # make column-slice from scalar - if is_hashable(colkey): + if _is_hashable(colkey): colkey = [colkey] if colkey in self._dios.columns else [] # pd.Series(a=True, b=False, x=True), columns:[a,b,c,d,] -> [a,] - elif is_bool_series(colkey): + elif _is_bool_series(colkey): colkey = self._dios.columns.intersection(colkey[colkey].index).to_list() # filter only existing columns from list - elif is_list_like_not_nested(colkey): + elif _is_list_like_not_nested(colkey): colkey = [c for c in self._dios.columns if c in colkey] else: @@ -204,12 +264,12 @@ class _aLocIndexer(_Indexer): # filter row key # make row-slice from scalar - if is_hashable(rowkey): + if _is_hashable(rowkey): rowkey = [slice(rowkey, rowkey)] * len(colkey) # pd.Series(1=True, 4=False, 12=True) # align every series, in columns - elif is_bool_series(rowkey): + elif _is_bool_series(rowkey): rowkey = rowkey[rowkey] # kill False rkeys = [] for c in colkey: @@ -217,7 +277,7 @@ class _aLocIndexer(_Indexer): rowkey = rkeys # filter only existing rows from list - elif is_list_like_not_nested(rowkey): + elif _is_list_like_not_nested(rowkey): rkeys = [] for c in colkey: rkeys += [self._data.at[c].index.intersection(rowkey)] @@ -230,54 +290,3 @@ class _aLocIndexer(_Indexer): # ############################################################################# - -class _AtIndexer(_Indexer): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def _check_key(self, key): - if not (isinstance(key, tuple) and len(key) == 2 - and is_hashable(key[0]) and is_hashable(key[1])): - raise KeyError(f"{key}. `.at` takes exactly one scalar row-key " - "and one scalar column-key") - - def __getitem__(self, key): - self._check_key(key) - return self._data.at[key[1]].at[key[0]] - - def __setitem__(self, key, value): - self._check_key(key) - if is_dios_like(value) or is_nested_list_like(value): - raise TypeError(".at[] cannot be used to set multi-dimensional values, use .aloc[] instead.") - self._data.at[key[1]].at[key[0]] = value - - -# ############################################################################# - - -class _iAtIndexer(_Indexer): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def _check_key(self, key): - if not (isinstance(key, tuple) and len(key) == 2 - and is_integer(key[0]) and is_integer(key[1])): - raise KeyError(f"{key} `.iat` takes exactly one integer positional " - f"row-key and one integer positional scalar column-key") - - def __getitem__(self, key): - self._check_key(key) - return self._data.iat[key[1]].iat[key[0]] - - def __setitem__(self, key, value): - self._check_key(key) - if is_dios_like(value) or is_nested_list_like(value): - raise TypeError(".iat[] cannot be used to set multi-dimensional values, use .aloc[] instead.") - self._data.iat[key[1]].iat[key[0]] = value - - -# ############################################################################# - - diff --git a/dios/operators.py b/dios/operators.py new file mode 100644 index 0000000000000000000000000000000000000000..fbd2712b7d87a90da24b3be6ce6df00205c47765 --- /dev/null +++ b/dios/operators.py @@ -0,0 +1,43 @@ +# do not import dios-stuff here +import operator as op + + +_OP1_MAP = { + op.inv: "~", + op.neg: "-", + op.abs: "abs()", +} + +_OP2_COMP_MAP = { + op.eq: '==', + op.ne: '!=', + op.le: '<=', + op.ge: '>=', + op.gt: '>', + op.lt: '<', +} + +_OP2_BOOL_MAP = { + op.and_: "&", + op.or_: "|", + op.xor: "^", + +} +_OP2_ARITH_MAP = { + op.add: "+", + op.sub: "-", + op.mul: "*", + op.pow: "**", +} + +_OP2_DIV_MAP = { + op.mod: "%", + op.truediv: "/", + op.floordiv: "//", +} + +OP_MAP = _OP2_COMP_MAP.copy() +OP_MAP.update(_OP2_BOOL_MAP) +OP_MAP.update(_OP2_ARITH_MAP) +OP_MAP.update(_OP2_DIV_MAP) +OP_MAP.update(_OP1_MAP) diff --git a/dios/options.py b/dios/options.py deleted file mode 100644 index 7d202525dd2bdc0ac5d4bedd4a4fd592be3fb4b1..0000000000000000000000000000000000000000 --- a/dios/options.py +++ /dev/null @@ -1,145 +0,0 @@ -import pandas as pd -import numpy as np -# do not import dios-stuff here - - -class OptsFields: - """storage class for the keys in ``dios_options``""" - - """How to inform user about mixed Itype (warn/err/ignore)""" - mixed_itype_warn_policy = "mixed_itype_policy" - - """ - Set the number of rows and variables to display in a call that use - ``__repr__`` or ``__str__`` like e.g. ``print(dios)`` do. - """ - disp_max_rows = "disp_max_rows " - disp_max_vars = "disp_max_vars" - - """ - should nans be droppend during comparision(drop), - stay nans (keep), or be compared (nplike). - nplike is quite silly as `5 < NaN` like every comparison - will simply evaluate to False ! - """ - comparison_nan_policy = "comparison_nan_policy" - - """ - Set item nan policy: - How does self and other align. - d1 - ---- d2 - 1: 1 ----- - 2: 2 2: 99 - 3: 3 3: 99 - 4: 99 - - d1[:] = d2 - - pdlike/nplike/keepnan: - new d1: - ------ - 1: Nan - 2: 99 - 3: 99 - - dioslike/dropnan: - new d1: - ------ - 2: 99 - 3: 99 - """ - setitem_nan_policy = 'setitem_nan_policy' - - -class Opts: - none_or_more = 'none_more' - one_or_more = 'one_more' - all_or_more = 'all_more' - none_up2_all = 'none_all' - one_up2_all = 'one_all' - exactly_all = 'all_all' - - itype_warn = 'warn' - itype_err = 'err' - itype_ignore = 'ignore' - - keep_nans = 'keep' - nplike_nans = 'nplike' - pdlike_nans = 'nplike' - drop_nans = 'drop' - - -# set default values -dios_options = { - OptsFields.disp_max_rows: 20, - OptsFields.disp_max_vars: 6, - OptsFields.mixed_itype_warn_policy: Opts.itype_warn, - # OptsFields.comparison_nan_policy: Opts.keep_nans, # fixme: not implemented yet - OptsFields.setitem_nan_policy: Opts.pdlike_nans, -} - - -def align_dioslikes(self, other, nan=np.nan, policy=None): - new = self.copy_empty(columns=False) - for k in self.columns: - left = self.at[k] - if k not in other: - new[k] = pd.Series(data=nan, index=left.index) - continue - right = other.at[k].reindex_like(left) - r = right - r = right.reindex_like(left) - l,r = l.align(r) - - -def align_index_by_policy(left, right, policy=None): - if policy is None: - policy = dios_options[OptsFields.setitem_nan_policy] - - if policy in [Opts.keep_nans, Opts.pdlike_nans]: - # r = right.align(left, join='right')[0] - r = right.reindex_like(left) - elif policy in [Opts.drop_nans]: - # r = right.align(left, join='inner')[0] - r = right.loc[left.index.intersection(right.index)] - else: - raise ValueError(policy) - - return r, r.index - - -def get_keys_by_policy(tocheck, keys, policy): - filtered = [k for k in tocheck if k in keys] - if policy == Opts.none_up2_all: - fail = [k for k in tocheck if k not in keys] - if fail: - raise KeyError(f"Policy says: keys must be known. Unknown: {fail}") - - elif policy == Opts.none_or_more: - pass - - elif policy == Opts.one_up2_all: - fail = [k for k in tocheck if k not in keys] - if not filtered or fail: - if fail: - raise KeyError(f"Policy says: keys must be known and at least one must be shared. Unknown: {fail}") - raise KeyError("Policy says: keys must known and at least one key must be shared. None was shared.") - - elif policy == Opts.one_or_more: - if not filtered: - raise KeyError("Policy says: at least one key must be shared.") - - elif policy == Opts.exactly_all: - fail = set(tocheck).symmetric_difference(set(keys)) - if fail: - raise KeyError(f"Policy says: exactly all keys must be given.") - - elif Opts.all_or_more: - fail = set(filtered).symmetric_difference(set(keys)) - if fail: - raise KeyError(f"Policy says: all known keys must be given, unknown are ignored.") - else: - raise ValueError(policy) - - return filtered diff --git a/test/test_methods.py b/test/test_methods.py index 0b6ffbec2b4dd7577a5c9a1e15fc4d4dc3055fcf..221f189bd2e21ac23a2ca67df9078eae7b9fbe8c 100644 --- a/test/test_methods.py +++ b/test/test_methods.py @@ -1,7 +1,6 @@ from test.test_setup import * - def test_copy_copy_empty(getDtDiosAligned): dios = getDtDiosAligned.copy() shallow = dios.copy(deep=False) @@ -36,5 +35,3 @@ def test_all(left, op): for i in range(len(res)): assert isinstance(exp[i], pd.Series) assert (res[i] == exp[i]).all() - - diff --git a/test/test_ops.py b/test/test_ops.py index 0a0ad7e639e5cfb195e91dff56afef875f7e39e1..bb5acfe32b55a9bcdc17a283ae2a6d24901c20a7 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1,5 +1,6 @@ #!/usr/bin/env python from test.test_setup import * +from dios.operators import * __author__ = "Bert Palm" diff --git a/test/test_setup.py b/test/test_setup.py index 295a43ae4c73f293798a26d8c01d90d112d34a44..563cb5e501a5a1b28b88c0fc9c94abaed9d5ed94 100644 --- a/test/test_setup.py +++ b/test/test_setup.py @@ -1,6 +1,6 @@ import pytest from dios import * -from dios.lib import _OP1_MAP, _OP2_DIV_MAP, _OP2_ARITH_MAP, _OP2_BOOL_MAP, _OP2_COMP_MAP +from dios.operators import _OP1_MAP, _OP2_DIV_MAP, _OP2_ARITH_MAP, _OP2_BOOL_MAP, _OP2_COMP_MAP import pandas as pd import numpy as np from copy import deepcopy