From 60dff935042b9156718d79b1884d647ae9efa1a7 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Sun, 16 Feb 2020 06:57:11 +0100 Subject: [PATCH] iloc done --- dios/dios.py | 332 +++++++++++++++++++++-------------------------- dios/lib.py | 12 +- test/run_dios.py | 7 +- 3 files changed, 159 insertions(+), 192 deletions(-) diff --git a/dios/dios.py b/dios/dios.py index a3fb535..678a43c 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -88,9 +88,7 @@ class DictOfSeries: # If the itypes differ between different series, slicing will almost always fail # (eg. a datetime-like slice cannot work on a numeric index and vice versa). self._itype = None - - with reraise("param itype: "): - self.itype = get_itype(itype) + self.itype = get_itype(itype) if downcast_policy not in CAST_POLICIES: raise ValueError(f"downcast_policy must be one of {CAST_POLICIES}") @@ -105,20 +103,19 @@ class DictOfSeries: def __init_insert_data__(self, data): if isinstance(data, DictOfSeries): - for k in data: - self[k] = data[k] + g = ((k, data[k]) for k in data) else: - if is_iterator(data): - data = list(data) - + data = list(data) if is_iterator(data) else data if is_dict_like(data): - for k in data: - self[k] = data[k] + g = ((k, data[k]) for k in data) elif is_nested_list_like(data): - for i, d in enumerate(data): - self[str(i)] = d + g = ((str(i), d) for i, d in enumerate(data)) elif is_list_like(data): - self['0'] = data + g = [('0', data)] + else: + raise ValueError(f"init with data of type {type(data)} is not possible.") + for k, val in g: + self[k] = val return @property @@ -171,15 +168,13 @@ class DictOfSeries: f"You are hereby warned!") def __cast_all(self, itype): - for k in self.columns: - with reraise(f"Column {k}: "): + k = '?' + try: + for k in self.columns: casted = cast_to_itype(self._data[k], itype, policy=self._policy) - self._data[k] = casted - - def _check_keys(self, keys): - missing = [k for k in keys if k not in self.columns] - if missing: - raise KeyError(f"{missing} not in index") + self._data[k] = casted + except Exception as e: + raise type(e)(f"Column {k}: " + str(e)) from e def __getitem__(self, key): """ @@ -196,6 +191,7 @@ class DictOfSeries: new = self._get_item(key) else: raise KeyError(key) + # all other cases else: keys, ixs = self._get_keys_and_indexer(key) new = self.copy_empty() @@ -205,7 +201,7 @@ class DictOfSeries: return new def _get_item(self, key): - # return always a pd.Series + """Extract a pd.Series from self""" return self._data[key] def __setitem__(self, key, value): @@ -229,20 +225,31 @@ class DictOfSeries: return else: k, i = [key], [slice(None)] + # all other cases else: k, i = self._get_keys_and_indexer(key) - gen = self._setitem(k, i, value) + gen = self._yield_tuple_to_set(k, i, value) for tup in gen: self._set_item(*tup) - def _setitem(self, keys, ixs, val): - """Return a generator that yield (key, indexer, value) for all keys""" - if is_iterator(val): - val = list(val) + def _set_item(self, key, ix, val): + "Set a value (scalar or list or series)" + ser = self._data[key] + if is_series_like(val): + left = ser[ix] + index = left.index.intersection(val.index) + if not index.empty: + left.loc[index] = val.loc[index].copy() + else: + ser[ix] = val + def _yield_tuple_to_set(self, keys, ixs, val): + """Return a generator that yield (key, indexer, value) for all keys""" + val = list(val) if is_iterator(val) else val diosl, dfl, nlistl = is_dios_like(val), is_dataframe_like(val), is_nested_list_like(val) - if diosl or dfl or nlistl and len(val) != len(keys): + + if (diosl or dfl or nlistl) and len(val) != len(keys): raise ValueError(f"could not broadcast input array with length {len(val)}" f" into dios of length {len(keys)}") @@ -258,19 +265,8 @@ class DictOfSeries: else: yield key, ix, val - def _set_item(self, key, ix, val): - "Set a value (scalar or list or series)" - ser = self._data[key] - if is_series_like(val): - left = ser[ix] - index = left.index.intersection(val.index) - if not index.empty: - left.loc[index] = val.loc[index].copy() - else: - ser[ix] = val - def _insert(self, key, val): - """""" + """Insert a fresh new value into self""" if isinstance(val, DictOfSeries): val = val.squeeze() elif is_list_like(val) and not is_nested_list_like(val): @@ -283,19 +279,19 @@ class DictOfSeries: self._data[key] = val.copy(deep=True) def _get_keys_and_indexer(self, key): - """ Determine keys and indexer + """ Determine keys and indexer by type of key. This does not deal + with single label-access, only higher dimension objects are handled.. + Notes: - Which keys we get, depends on the policy in dios_options + Which keys we get, may depend on the policy in dios_options """ - err_bool = "only boolen values are allowed" keys = None indexers = None blowup = False # prevent consuming of a generator - if is_iterator(key): - key = list(key) + key = list(key) if is_iterator(key) else key if isinstance(key, slice): keys = self.columns @@ -445,92 +441,74 @@ class DictOfSeries: new._itype = self.itype return new - def anyop(self, op, ): + def _op1(self, op): new = self.copy_empty() - with reraise(f"'{OP_MAP[op]} dios' failed: "): + try: for k in self: new[k] = op(self._data[k]) + except Exception as e: + raise type(e)(f"'{OP_MAP[op]} dios' failed: " + str(e)) from e return new - def foo(self, x): - raise ValueError('dnsjkncsncj') - - __neg__ = partialmethod(anyop, op.neg) - __abs__ = partialmethod(anyop, op.abs) - # __invert__ = partialmethod(anyop, op.inv) - __invert__ = partialmethod(anyop, foo) - - def _op2(self, other, op, inplace=False): - new = self.copy_empty() - - # with index - if isinstance(other, (self.__class__, pd.DataFrame)): - if set(other) != set(self): - raise ValueError(f"keys does not match, left: {len(self)} keys, right: {len(other)} keys") - for k in self: - left, right = self._data[k], other[k] - l, r = left.align(right, join='inner') - val = op(l, r) - new._data[k] = val - - elif isinstance(other, pd.Series): - for k in self: - left, right = self._data[k], other - l, r = left.align(right, join='inner') - val = op(l, r) - new._data[k] = val - - # no index - elif is_dict_like(other): - if set(other) != set(self): - raise ValueError(f"keys does not match, left: {len(self)} keys, right: {len(other)} keys") - for k in self: - new._data[k] = op(self._data[k], other[k]) - - elif is_nested_list_like(other): - if len(other) != len(self): - raise ValueError(f"keys does not match, left: {len(self)} keys, right: {len(other)} keys") - for i, k in enumerate(self): - new._data[k] = op(self._data[k], other[i]) + def _op2(self, op, other, inplace=False): + def raiseif(cond, s='lenght'): + if cond: + raise ValueError(f"{s} does not match, {s} left: {len(self)}, {s} right: {len(other)} keys") + + def gen(): + if isinstance(other, (self.__class__, pd.DataFrame)): + raiseif(set(other) != set(self), '#keys') + for k in self.columns: + left, right = self._data[k], other[k] + yield k, op(*(left.align(right, join='inner'))) + elif isinstance(other, pd.Series): + for k in self.columns: + left, right = self._data[k], other + yield k, op(*(left.align(right, join='inner'))) + elif is_dict_like(other): + raiseif(set(other) != set(self), '#keys') + for k in self.columns: + yield k, op(self._data[k], other[k]) + elif is_nested_list_like(other): + raiseif(len(other) != len(self), 'length') + for i, k in enumerate(self.columns): + yield k, op(self._data[k], other[i]) + elif is_scalar(other) or is_list_like(other): + for k in self.columns: + yield k, op(self._data[k], other) + else: + raise NotImplementedError - elif is_scalar(other) or is_list_like(other): - for k in self: - new._data[k] = op(self._data[k], other) - else: - return NotImplemented + new = self if inplace else self.copy_empty() + try: + for k, val in gen(): + new[k] = val + except Exception as e: + raise type(e)(f"'dios {OP_MAP[op]} other' failed: " + str(e)) from e return new - def _op2_wrap(op, inplace=False): - def anyop(self, other): - with reraise(f"'dios {OP_MAP[op]} other' failed: "): - return self._op2(other, op, inplace=inplace) - - anyop.__name__ = '__' + op.__name__ + '__' - return anyop - - # comparision - __eq__ = _op2_wrap(op.eq) - __ne__ = _op2_wrap(op.ne) - __le__ = _op2_wrap(op.le) - __ge__ = _op2_wrap(op.ge) - __lt__ = _op2_wrap(op.lt) - __gt__ = _op2_wrap(op.gt) - __add__ = _op2_wrap(op.add) - __sub__ = _op2_wrap(op.sub) - __mul__ = _op2_wrap(op.mul) - __mod__ = _op2_wrap(op.mod) - __truediv__ = _op2_wrap(op.truediv) - __floordiv__ = _op2_wrap(op.floordiv) - __pow__ = _op2_wrap(op.pow) - __and__ = _op2_wrap(op.and_) - __or__ = _op2_wrap(op.or_) - __xor__ = _op2_wrap(op.xor) + __neg__ = partialmethod(_op1, op.neg) + __abs__ = partialmethod(_op1, op.abs) + __invert__ = partialmethod(_op1, op.inv) + __eq__ = partialmethod(_op2, op.eq) + __ne__ = partialmethod(_op2, op.ne) + __le__ = partialmethod(_op2, op.le) + __ge__ = partialmethod(_op2, op.ge) + __lt__ = partialmethod(_op2, op.lt) + __gt__ = partialmethod(_op2, op.gt) + __add__ = partialmethod(_op2, op.add) + __sub__ = partialmethod(_op2, op.sub) + __mul__ = partialmethod(_op2, op.mul) + __mod__ = partialmethod(_op2, op.mod) + __truediv__ = partialmethod(_op2, op.truediv) + __floordiv__ = partialmethod(_op2, op.floordiv) + __pow__ = partialmethod(_op2, op.pow) + __and__ = partialmethod(_op2, op.and_) + __or__ = partialmethod(_op2, op.or_) + __xor__ = partialmethod(_op2, op.xor) def squeeze(self): - if len(self) == 1: - return self[self.columns[0]] - else: - return self + return self[self.columns[0]] if len(self) == 1 else self def memory_usage(self, index=True, deep=False): mem = 0 @@ -585,54 +563,21 @@ class DictOfSeries: return None return news.squeeze() - # def __find_least_common_itype(self): - # def all_itypes_le(itypes, super_itype): - # for itype in itypes: - # if itype_le(itype, super_itype): - # continue - # return False - # return True - # - # itypes = [] - # for k in self.columns: - # itypes.append(get_itype(self._data[k].index)) - # - # if not itypes: - # return None - # - # found = None - # - # # check supertypes - # super_itypes = [MixedItype, NumericItype] - # for super_itype in super_itypes: - # if all_itypes_le(itypes, super_itype): - # found = super_itype - # continue - # break - # assert found, "At least this should be MixedItype" - # - # # check base types - # single_itypes = [DatetimeItype, IntegerItype, FloatItype] - # for single_itype in single_itypes: - # if all_itypes_le(itypes, single_itype): - # found = single_itype - # break - # return found - # - class _Indexer: def __init__(self, _dios): self._dios = _dios - # short handles self._data = _dios._data + self._yield_tuple_to_set = _dios._yield_tuple_to_set + class _LocIndexer(_Indexer): def __init__(self, _dios): super().__init__(_dios) - self._check_keys = _dios._check_keys + self._set_item = _dios._set_item + self._get_keys_and_indexer = _dios._get_keys_and_indexer def __getitem__(self, key): rkey, cols = self._unpack_key(key) @@ -642,15 +587,10 @@ class _LocIndexer(_Indexer): return new def __setitem__(self, key, value): - rkey, cols = self._unpack_key(key) - # todo: dios -> dios_to_dios, -> series - # scalar, -> automatically - # series, -> automatically - # list_like -> check length - for c in cols: - self._data[c].loc[rkey] = value - # todo loc.__setitem__(self, key, value): - return NotImplemented + ixs, keys = self._unpack_key(key) + gen = self._yield_tuple_to_set(keys, ixs, value) + for tup in gen: + self._set_item(*tup) def _unpack_key(self, key): # if we have a tuple, we have a rows- and a column-indexer @@ -661,23 +601,26 @@ class _LocIndexer(_Indexer): raise KeyError("To many indexers") # prepare ckey - if is_iterator(ckey): - ckey = list(ckey) + ckey = list(ckey) if is_iterator(ckey) else ckey # determine columns + if is_dataframe_like(ckey) or is_nested_list_like(ckey) or is_dios_like(ckey): + raise ValueError("Cannot index with multidimensional key") if isinstance(ckey, str): - self._check_keys([ckey]) cols = [ckey] elif isinstance(ckey, slice): cols = self._col_slice_to_col_list(ckey) - elif is_list_like(ckey): - self._check_keys(ckey) - cols = ckey else: - raise KeyError(f"Type {type(ckey)} is not supported to select columns.") + try: + # list and bool list like + cols, _ = self._get_keys_and_indexer(key) + except Exception: + raise else: cols = self._data.keys() rkey = key + # blowup + rkey = [rkey] * len(cols) return rkey, cols def _col_slice_to_col_list(self, cslice): @@ -689,9 +632,9 @@ class _LocIndexer(_Indexer): start = keys.index(cslice.start) if cslice.start is not None else None stop = keys.index(cslice.stop) if cslice.stop is not None else None except ValueError: - raise KeyError("The slice start label or the slice stop label is not present in the columns.") - if not is_integer(cslice.step) and cslice.step > 0: - raise TypeError("The step parameter of the slice must be positive integer.") + raise KeyError("The slice start label, or the slice stop label, is not present in columns.") + if not is_integer(cslice.step) or cslice.step <= 0: + return [] return keys[slice(start, stop + 1, cslice.step)] @@ -705,9 +648,24 @@ class _iLocIndexer(_Indexer): return new def __setitem__(self, key, value): - # todo iloc.__setitem__(self, key, value): + ixs, keys = self._unpack_key(key) + gen = self._yield_tuple_to_set(keys, ixs, value) + for tup in gen: + self._set_item_positional(*tup) raise NotImplemented + def _set_item_positional(self, key, ix, val): + ser = self._data[key] + if is_series_like(val): + index = ser.iloc[ix].index + index = index.intersection(val.index) + if not index.empty: + ser.loc[index] = val.loc[index].copy() + else: + ser.iloc[ix] = val + + + def _unpack_key(self, key): # if we have a tuple, we have a rows- and a column-indexer # if not, we only have a row-indexer and work on all columns @@ -717,8 +675,7 @@ class _iLocIndexer(_Indexer): raise KeyError("To many indexers") # prepare ckey - if is_iterator(ckey): - ckey = list(ckey) + ckey = list(ckey) if is_iterator(ckey) else ckey # determine columns if is_integer(ckey): @@ -726,11 +683,18 @@ class _iLocIndexer(_Indexer): cols = self._integers_to_col_list([ckey]) elif isinstance(ckey, slice): cols = self._col_slice_to_col_list(ckey) - elif is_list_like(ckey): + elif is_list_like(ckey) and not is_nested_list_like(ckey): + arr = np.array(ckey) + if is_bool_array(arr): + raise NotImplementedError self._check_keys(ckey) cols = self._integers_to_col_list(ckey) + elif is_series_like(ckey): + raise NotImplementedError + elif is_bool_indexer(ckey): + raise NotImplementedError else: - raise KeyError(f"Type {type(ckey)} is not supported for indexing on columns.") + raise KeyError(f"{ckey} of type {type(ckey)}") else: cols = self._data.keys() rkey = key @@ -739,6 +703,8 @@ class _iLocIndexer(_Indexer): def _check_keys(self, keys): bound = len(self._data) for k in keys: + if not is_integer(k): + raise ValueError(f"{type(k)} is not integer") if k not in range(-bound, bound): raise KeyError("positional indexer(s) are out-of-bounds in columns") @@ -752,5 +718,5 @@ class _iLocIndexer(_Indexer): def _col_slice_to_col_list(self, sl): for s in [sl.start, sl.stop, sl.step]: if not is_integer(s): - raise TypeError(f"positional indexing with slice must be integers, passed was {s} of {type(s)}") + raise TypeError(f"positional indexing with slice must be integers, passed type was {type(s)}") return list(self._data.keys())[sl] diff --git a/dios/lib.py b/dios/lib.py index ae3306d..ad4ba78 100644 --- a/dios/lib.py +++ b/dios/lib.py @@ -5,12 +5,12 @@ import contextlib import operator as op -@contextlib.contextmanager -def reraise(prefix="", postfix=""): - try: - yield - except Exception as e: - raise type(e)(prefix + str(e) + postfix) from e +# @contextlib.contextmanager +# def reraise(prefix="", postfix=""): +# try: +# yield +# except Exception as e: +# raise type(e)(prefix + str(e) + postfix) from e diff --git a/test/run_dios.py b/test/run_dios.py index 8aba9fd..220d683 100644 --- a/test/run_dios.py +++ b/test/run_dios.py @@ -5,12 +5,13 @@ import numpy as np if __name__ == '__main__': # dios_options[Options.mixed_itype_policy] = 'error' - df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01')) - df[[True, False]] + # df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01')) + # df[[True, False]] dios = DictOfSeries(data=[234.54, 5, 5, 4, np.nan, 5, 4, 5]) - dios = abs(-dios) + dios = abs(~dios) + print(all(dios == dios)) dtser = pd.Series([2,4,4123,122,4], index=pd.date_range(freq='1d', periods=5, start='2000-01-01')) -- GitLab