diff --git a/dios/dios.py b/dios/dios.py index c694722eeeaa20faea8b18ff4fa0a3f41156edba..32bbaca97ee1e4b799370058c0415da399b0d4cc 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -283,8 +283,8 @@ class DictOfSeries: # bool indexer # ------------ # bool indexer always work on rows, so they need to have - # an index wo which we can align to. This is necessary - # because we can hold series of different indices. + # an index, to which we can align to. This is necessary + # because we could hold series of different lenght/indexes. if is_bool_indexer(key): if not isinstance(key, pd.Series): raise ValueError("Must pass Series with boolean values only") @@ -321,10 +321,12 @@ class DictOfSeries: @property def loc(self): + from dios.locator import _LocIndexer return _LocIndexer(self) @property def iloc(self): + from dios.locator import _iLocIndexer return _iLocIndexer(self) def __str__(self): @@ -552,202 +554,3 @@ class DictOfSeries: if len(news) == 0: return None return news.squeeze() - - -class _Indexer: - def __init__(self, _dios): - self._dios = _dios - self._data = _dios._data - self._unpack_value = _dios._unpack_value - - -class _LocIndexer(_Indexer): - - def __init__(self, _dios): - super().__init__(_dios) - self._set_item = _dios._set_item - - def __getitem__(self, key): - rkey, cols, lowdim = self._unpack_key(key) - if is_scalar(rkey[0]): - return self._series(rkey, cols, lowdim) - elif lowdim: - return self._scalar(rkey[0], cols[0]) - else: - new = self._dios.copy_empty() - for i, _ in enumerate(cols): - c, r = cols[i], rkey[i] - new[c] = self._data[c].loc[r] - return new - - def _series(self, rkey, cols, lowdim): - if lowdim: - return self._scalar(rkey[0], cols[0]) - new = pd.Series() - for c in cols: - try: - new[c] = self._data[c].loc[rkey] - except KeyError: - new[c] = np.nan - - def _scalar(self, r, c): - return self._data[c].loc[r] - - def __setitem__(self, key, value): - ixs, keys, _ = self._unpack_key(key) - gen = self._unpack_value(keys, ixs, value) - for tup in gen: - self._set_item(*tup) - - def _unpack_key(self, key): - # if we have a tuple, we have a rows- and a column-indexer - # if not, we only have a row-indexer and work on all columns - lowdim = False - if isinstance(key, tuple): - rkey, ckey, *fail = key - if fail: - raise KeyError("To many indexers") - - # prepare ckey - ckey = list(ckey) if is_iterator(ckey) else ckey - - # determine columns - if is_nested_list_like(ckey) or is_dios_like(ckey): - raise ValueError("Cannot index with multidimensional key") - if isinstance(ckey, str): - cols = [ckey] - lowdim = True - elif isinstance(ckey, slice): - cols = self._col_slice_to_col_list(ckey) - else: - try: - # list, boolean-list or series - cols, *_ = self._dios._unpack_key(ckey) - except Exception as e: - raise e - else: - cols = list(self._data.index) - rkey = key - # blowup - rkey = [rkey] * len(cols) - return rkey, cols, lowdim - - def _col_slice_to_col_list(self, cslice): - """ see here: - https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-slicing-with-labels - """ - keys = list(self._data.index) - try: - start = keys.index(cslice.start) if cslice.start is not None else None - stop = keys.index(cslice.stop) if cslice.stop is not None else None - except ValueError: - raise KeyError("The slice start label, or the slice stop label, is not present in columns.") - if not is_integer(cslice.step) or cslice.step <= 0: - return [] - return keys[slice(start, stop + 1, cslice.step)] - - -class _iLocIndexer(_Indexer): - - def __getitem__(self, key): - rkey, cols, lowdim = self._unpack_key(key) - if is_scalar(rkey[0]): - return self._series(rkey, cols, lowdim) - elif lowdim: - return self._scalar(rkey[0], cols[0]) - else: - new = self._dios.copy_empty() - for i, _ in enumerate(cols): - c, r = cols[i], rkey[i] - new[c] = self._data[c].iloc[r] - return new - - def _series(self, rkey, cols, lowdim): - if lowdim: - return self._scalar(rkey[0], cols[0]) - new = pd.Series() - for c in cols: - try: - new[c] = self._data[c].iloc[rkey] - except KeyError: - new[c] = np.nan - - def _scalar(self, r, c): - return self._data[c].iloc[r] - - def __setitem__(self, key, value): - ixs, keys, _ = self._unpack_key(key) - gen = self._unpack_value(keys, ixs, value) - for tup in gen: - self._set_item_positional(*tup) - raise NotImplemented - - def _set_item_positional(self, key, ix, val): - ser = self._data[key] - if isinstance(val, pd.Series): - index = ser.iloc[ix].index - index = index.intersection(val.index) - if not index.empty: - ser.loc[index] = val.loc[index].copy() - else: - ser.iloc[ix] = val - - def _unpack_key(self, key): - # if we have a tuple, we have a rows- and a column-indexer - # if not, we only have a row-indexer and work on all columns - lowdim = False - if isinstance(key, tuple): - rkey, ckey, *fail = key - if fail: - raise KeyError("To many indexers") - - # prepare ckey - ckey = list(ckey) if is_iterator(ckey) else ckey - - # determine columns - if is_integer(ckey): - self._check_keys([ckey]) - cols = self._integers_to_col_list([ckey]) - lowdim = True - elif isinstance(ckey, slice): - cols = self._col_slice_to_col_list(ckey) - elif is_list_like(ckey) and not is_nested_list_like(ckey): - arr = np.array(ckey) - if is_bool_array(arr): - raise NotImplementedError - self._check_keys(ckey) - cols = self._integers_to_col_list(ckey) - elif isinstance(ckey, pd.Series): - raise NotImplementedError - elif is_bool_indexer(ckey): - raise NotImplementedError - else: - raise KeyError(f"{ckey} of type {type(ckey)}") - else: - cols = list(self._data.index) - rkey = key - - # blowup - rkey = [rkey] * len(cols) - return rkey, cols, lowdim - - def _check_keys(self, keys): - bound = len(self._data) - for k in keys: - if not is_integer(k): - raise ValueError(f"{type(k)} is not integer") - if k not in range(-bound, bound): - raise KeyError("positional indexer(s) are out-of-bounds in columns") - - def _integers_to_col_list(self, ints): - klist = list(self._data.index) - ks = set() - for i in ints: - ks.add(klist[i]) - return list(ks) - - def _col_slice_to_col_list(self, sl): - for s in [sl.start, sl.stop, sl.step]: - if not is_integer(s): - raise TypeError(f"positional indexing with slice must be integers, passed type was {type(s)}") - return list(self._data.index)[sl] diff --git a/dios/locator.py b/dios/locator.py new file mode 100644 index 0000000000000000000000000000000000000000..b7e4ad5b4074fe9ddd7558f54830c7320902595c --- /dev/null +++ b/dios/locator.py @@ -0,0 +1,196 @@ +from dios.dios import * + +class _Indexer: + def __init__(self, _dios): + self._dios = _dios + self.columns = _dios.columns + self._data = _dios._data + # self._unpack_value = _dios._unpack_value + + +class _LocIndexer(_Indexer): + + def __init__(self, _dios): + super().__init__(_dios) + self._set_item = _dios._set_item + + + def _series(self, rkey, cols, lowdim): + if lowdim: + return self._scalar(rkey[0], cols[0]) + new = pd.Series() + for c in cols: + try: + new[c] = self._data[c].loc[rkey] + except KeyError: + new[c] = np.nan + + def _scalar(self, r, c): + return self._data[c].loc[r] + + def __setitem__(self, key, value): + data, rkey = self._getdata(key) + if data.empty: + return + if isinstance(data, pd.Series): + pass + + def __getitem__(self, key): + data, rkey, lowdim = self._getdata(key) + colseries = is_hashable(rkey) + + if data.empty: + if colseries: + data.name = rkey + return data # a empty Series + return self._dios.copy_empty() + + if colseries: + new = pd.Series() + else: + new = self._dios.copy_empty() + + if lowdim: + return data.loc[rkey] + + for s in data.index: + new[s] = data[s].loc[rkey] + return new + + def _getdata(self, key): + lowdim = False + if isinstance(key, tuple): + key, ckey, *fail = key + if fail: + raise KeyError("To many indexers") + if is_dios_like(ckey): + raise ValueError("Cannot index with multidimensional key") + if is_hashable(ckey): + lowdim = True + try: + data = self._data.loc[ckey] + except Exception as e: + raise e + else: + data = self._data + return data, key, lowdim + + def _col_slice_to_col_list(self, cslice): + """ see here: + https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-slicing-with-labels + """ + keys = list(self._data.index) + try: + start = keys.index(cslice.start) if cslice.start is not None else None + stop = keys.index(cslice.stop) if cslice.stop is not None else None + except ValueError: + raise KeyError("The slice start label, or the slice stop label, is not present in columns.") + if not is_integer(cslice.step) or cslice.step <= 0: + return [] + return keys[slice(start, stop + 1, cslice.step)] + + +class _iLocIndexer(_Indexer): + + def __getitem__(self, key): + rkey, cols, lowdim = self._unpack_key(key) + if is_scalar(rkey[0]): + return self._series(rkey, cols, lowdim) + elif lowdim: + return self._scalar(rkey[0], cols[0]) + else: + new = self._dios.copy_empty() + for i, _ in enumerate(cols): + c, r = cols[i], rkey[i] + new[c] = self._data[c].iloc[r] + return new + + def _series(self, rkey, cols, lowdim): + if lowdim: + return self._scalar(rkey[0], cols[0]) + new = pd.Series() + for c in cols: + try: + new[c] = self._data[c].iloc[rkey] + except KeyError: + new[c] = np.nan + + def _scalar(self, r, c): + return self._data[c].iloc[r] + + def __setitem__(self, key, value): + ixs, keys, _ = self._unpack_key(key) + gen = self._unpack_value(keys, ixs, value) + for tup in gen: + self._set_item_positional(*tup) + raise NotImplemented + + def _set_item_positional(self, key, ix, val): + ser = self._data[key] + if isinstance(val, pd.Series): + index = ser.iloc[ix].index + index = index.intersection(val.index) + if not index.empty: + ser.loc[index] = val.loc[index].copy() + else: + ser.iloc[ix] = val + + def _unpack_key(self, key): + # if we have a tuple, we have a rows- and a column-indexer + # if not, we only have a row-indexer and work on all columns + lowdim = False + if isinstance(key, tuple): + rkey, ckey, *fail = key + if fail: + raise KeyError("To many indexers") + + # prepare ckey + ckey = list(ckey) if is_iterator(ckey) else ckey + + # determine columns + if is_integer(ckey): + self._check_keys([ckey]) + cols = self._integers_to_col_list([ckey]) + lowdim = True + elif isinstance(ckey, slice): + cols = self._col_slice_to_col_list(ckey) + elif is_list_like(ckey) and not is_nested_list_like(ckey): + arr = np.array(ckey) + if is_bool_array(arr): + raise NotImplementedError + self._check_keys(ckey) + cols = self._integers_to_col_list(ckey) + elif isinstance(ckey, pd.Series): + raise NotImplementedError + elif is_bool_indexer(ckey): + raise NotImplementedError + else: + raise KeyError(f"{ckey} of type {type(ckey)}") + else: + cols = list(self._data.index) + rkey = key + + # blowup + rkey = [rkey] * len(cols) + return rkey, cols, lowdim + + def _check_keys(self, keys): + bound = len(self._data) + for k in keys: + if not is_integer(k): + raise ValueError(f"{type(k)} is not integer") + if k not in range(-bound, bound): + raise KeyError("positional indexer(s) are out-of-bounds in columns") + + def _integers_to_col_list(self, ints): + klist = list(self._data.index) + ks = set() + for i in ints: + ks.add(klist[i]) + return list(ks) + + def _col_slice_to_col_list(self, sl): + for s in [sl.start, sl.stop, sl.step]: + if not is_integer(s): + raise TypeError(f"positional indexing with slice must be integers, passed type was {type(s)}") + return list(self._data.index)[sl] diff --git a/test/run_dios.py b/test/run_dios.py index acde9081cebf001667273fda602f9d808ea1b852..dc51f2f27ff36a304c4b2305545d107c299ecf9e 100644 --- a/test/run_dios.py +++ b/test/run_dios.py @@ -5,10 +5,9 @@ import numpy as np if __name__ == '__main__': # dios_options[OptsFields.mixed_itype_policy] = 'error' - df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1y', start='2000-01-01')) + # df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1y', start='2000-01-01')) # df[[True, False]] - df1 = pd.DataFrame(dict(a=range(5), b=range(0,50,10))) df2 = pd.DataFrame(dict(b=[99], a=[888732727]), index=range(3,8)) d1 = DictOfSeries(df1) @@ -17,12 +16,12 @@ if __name__ == '__main__': df1.to_string() d = DictOfSeries(dict(a=[1,2], b=[12,38,32,32,323], ss=[2,23,3,2,3,], z=pd.Series([1,2,3], index=list("abc")))) - d['ss'].index = df.index - # d=DictOfSeries(df) - - d1[:] = d2 - print(d) - print(d1) - print(df1) + print(d, type(d)) + d = d.loc[:,:] + print(d, type(d)) + a = d.loc[:,'a'] + print(a, type(a)) + x = d.loc[1,['a', 'ss', 'z']] + print(x, type(x))