diff --git a/dios/dios.py b/dios/dios.py index fa72814e5d84e00c0cecd7621bc8a0c5d366ce5d..48af474a7f1dceb6b7169a45cf8d5d6c799c30db 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -200,7 +200,9 @@ class DictOfSeries: new = self.copy_empty() for k in self.columns: - new._data.at[k] = self._data.at[k].loc[key] + # we cannot use loc here, because s.loc[:4] + # is inclusive, whereas s[:4] isn't :( + new._data.at[k] = self._data.at[k][key] return new @@ -213,7 +215,7 @@ class DictOfSeries: new = self.copy_empty() for k in keys: - ser = self._data.at[key] + ser = self._data.at[k] boolser = key[k] # align rows idx = boolser[boolser].index.intersection(ser.index) @@ -223,7 +225,7 @@ class DictOfSeries: def _getitem_bool_listlike(self, key): new = self.copy_empty() for k in self.columns: - ser = self._data.at[key] + ser = self._data.at[k] new._data.at[k] = ser.loc[key] return new @@ -361,6 +363,10 @@ class DictOfSeries: def to_df(self): return self._data.apply(lambda s: s).transpose() + @property + def debugDf(self): + return self.to_df() + def memory_usage(self, index=True, deep=False): mem = 0 for k in self.columns: @@ -473,8 +479,13 @@ class DictOfSeries: def pprint(dios, max_rows=10, max_cols=2, delim=' '): sstr = [] cols = list(dios.columns) + + if dios.empty: + return "Empty DictionaryOfSeries" + for c in dios.columns: sstr.append(dios[c].to_string(max_rows=max_rows).split('\n')) + maxlen = max([len(x) for x in sstr]) # reduce number of visible vars diff --git a/dios/locator.py b/dios/locator.py index ca1f78cb2cc102ba32b0b1eba0bca6c2893593fd..e43cd94725166a2ad4575d4ffdbdc61bfe80c244 100644 --- a/dios/locator.py +++ b/dios/locator.py @@ -29,34 +29,36 @@ class _LocIndexer(_Indexer): if is_dios_like(rowkey) or is_dios_like(colkey): raise ValueError("Cannot index with multidimensional key") - data = self._dios[colkey] + data = self._data.loc[colkey] - # .loc[:, any] - simple low-cost optimization - if isinstance(rowkey, slice) and rowkey == slice(None): - new = data.copy() - - # .loc[any, scalar] - elif isinstance(data, pd.Series): - new = data.loc[rowkey] + # in any case data is a series now, + # either a column-indexed series of series, + # or a simple single row-indexed series (of values) + if isinstance(data, pd.Series): - # .loc[any, non-scalar] - elif isinstance(data, self._dios.__class__): + # .loc[any, scalar] - got a single row indexed series + if is_hashable(colkey): + new = data.loc[rowkey] - # .loc[scalar, non-scalar] - if is_hashable(rowkey): + # .loc[scalar, any] + elif is_hashable(rowkey): # we do not override data directly to may get # a better fitting series dtype - new = pd.Series(index=type(data.columns)([])) + new = pd.Series(index=type(data.index)([])) for k in data.index: - s = data._data.at[k] + s = data.at[k] new.at[k] = s.loc[rowkey] + # .iloc[:, any] - simple low-cost optimization + elif isinstance(rowkey, slice) and rowkey == slice(None): + new = self._dios.copy_empty() + new._data = data.copy() + # .loc[non-scalar, non-scalar] else: new = self._dios.copy_empty() - for k in data.columns: - s = data._data.at[k] - new._data.at[k] = s.loc[rowkey] + for k in data.index: + new._data.at[k] = data.at[k].loc[rowkey] else: raise AssertionError(f"getitem returned data of type {type(data)}") @@ -140,10 +142,7 @@ class _iLocIndexer(_Indexer): if is_dios_like(rowkey) or is_dios_like(colkey): raise ValueError("Cannot index with multidimensional key") - if is_bool_indexer(colkey): - data = self._dios[colkey]._data - else: - data = self._data.iloc[colkey] + data = self._data.iloc[colkey] # in any case data is a series now, # either a column-indexed series of series, @@ -155,7 +154,7 @@ class _iLocIndexer(_Indexer): new = data.iloc[rowkey] # .loc[int, any] - if is_integer(colkey): + elif is_integer(rowkey): # we do not override data directly to may get # a better fitting series dtype new = pd.Series(index=type(data.index)([])) diff --git a/dios/options.py b/dios/options.py index abd991cd6fedd68a3aaa7ded7754aa6df6c4acf2..89d24308c4947ac759a192891fd0897c28213fe4 100644 --- a/dios/options.py +++ b/dios/options.py @@ -1,3 +1,5 @@ +import pandas as pd +import numpy as np # do not import dios-stuff here @@ -78,6 +80,19 @@ dios_options = { } +def align_dioslikes(self, other, nan=np.nan, policy=None): + new = self.copy_empty() + for k in self.columns: + left = self.at[k] + if k not in other: + new[k] = pd.Series(data=nan, index=left.index) + continue + right = other.at[k].reindex_like(left) + r = right + r = right.reindex_like(left) + l,r = l.align(r) + + def align_index_by_policy(left, right, policy=None): if policy is None: policy = dios_options[OptsFields.setitem_nan_policy] diff --git a/test/test__getitem__.py b/test/test__getitem__.py index 5eab0d82c673089cd7129262c74418f88feaea13..9b0ee84b6798ddb52ab098dc372da560c559a827 100644 --- a/test/test__getitem__.py +++ b/test/test__getitem__.py @@ -1,12 +1,14 @@ from dios import * -import pytest +from test.test_setup import * -s1 = pd.Series(range(10), index=range(10)) -s2 = pd.Series(range(5, 10), index=range(5, 10)) -s3 = pd.Series(range(1, 30, 2), index=range(1, 30, 2)) -s4 = pd.Series(np.linspace(7, 13, 9), index=range(3, 12)) -s1.name, s2.name, s3.name, s4.name = 'a', 'b', 'c', 'd' -d1 = DictOfSeries(data=dict(a=s1.copy(), b=s2.copy(), c=s3.copy(), d=s4.copy())) +# s1 = pd.Series(range(10), index=range(10)) +# s2 = pd.Series(range(5, 10), index=range(5, 10)) +# s3 = pd.Series(range(1, 30, 2), index=range(1, 30, 2)) +# s4 = pd.Series(np.linspace(7, 13, 9), index=range(3, 12)) +# s1.name, s2.name, s3.name, s4.name = 'a', 'b', 'c', 'd' +# d1 = DictOfSeries(data=dict(a=s1.copy(), b=s2.copy(), c=s3.copy(), d=s4.copy())) +# +d1 = dios__() @pytest.mark.parametrize(('idxer', 'exp'), [('a', s1), ('c', s3)]) @@ -60,26 +62,12 @@ def test__getitem_single_iloc_fail(idxer): a = d1.iloc[:, idxer] -BLIST = [True, False, False, True] - -LISTIDXER = [['a'], ['a', 'c'], pd.Series(['a', 'c'])] -BOOLIDXER = [pd.Series(BLIST), d1.copy() > 10] -SLICEIDXER = [slice(None), slice(-3, -1), slice(-1, 3), slice(None, None, 3)] -MULTIIDXER = [] # [d1 > 9, d1 != d1, d1 == d1] -EMPTYIDEXER = [[], pd.Series(), slice(3, 3), slice(3, -1), DictOfSeries()] - -INDEXERS = LISTIDXER + BOOLIDXER + SLICEIDXER + MULTIIDXER + EMPTYIDEXER - - @pytest.mark.parametrize('idxer', INDEXERS) def test__getitem__(idxer): d = d1[idxer] assert isinstance(d, DictOfSeries) -FAIL_INDEXERS = [['z'], ['a', 'z'], pd.Series(['a', 'z']), BLIST, DictOfSeries(dict(a=[1, 2, 3]))] - - @pytest.mark.parametrize('idxer', FAIL_INDEXERS) def test__getitem__fail(idxer): with pytest.raises((ValueError, KeyError)): diff --git a/test/test__setitem__.py b/test/test__setitem__.py index a88dec72873dd87cf761fb5382d8a356437c50ac..6f9d4f531fc25740fad183f945cb7d964988d5d4 100644 --- a/test/test__setitem__.py +++ b/test/test__setitem__.py @@ -1,4 +1,5 @@ from dios import * +from test.test_setup import * import pytest s1 = pd.Series(range(10), index=range(10)) diff --git a/test/test_dflike__setget__.py b/test/test_dflike__setget__.py new file mode 100644 index 0000000000000000000000000000000000000000..87695ad359554e03432cd46ba8e4e84e61b69773 --- /dev/null +++ b/test/test_dflike__setget__.py @@ -0,0 +1,80 @@ +from dios import * +from test.test_setup import * +import pytest + + +def _test(val, exp): + if isinstance(exp, pd.DataFrame): + assert isinstance(val, DictOfSeries) + + if val.empty: + for c in exp: + assert exp[c].dropna().empty + return + + assert (val.columns == exp.columns).all() + for c in exp: + l = val[c] + r = exp[c].dropna() + assert isinstance(l, pd.Series) + assert isinstance(r, pd.Series) + assert (l == r).all() + else: + assert type(exp) == type(val) + + if isinstance(exp, pd.Series): + assert (val == exp.dropna()).all() + else: + assert val == exp + + +@pytest.mark.parametrize('idxer', INDEXERS) +def test_dflike__getitem__(df_, dios_, idxer): + print(idxer) + exp = df_[idxer] + val = dios_[idxer] + _test(val, exp) + + +@pytest.mark.parametrize('locL', LOC_L) +@pytest.mark.parametrize('locR', LOC_R) +def test_dflike__get_loc__(df_, dios_, locL, locR): + print(locL) + print(locR) + exp = df_.loc[locL, locR] + val = dios_.loc[locL, locR] + _test(val, exp) + + +@pytest.mark.parametrize('ilocL', ILOC_L) +@pytest.mark.parametrize('ilocR', ILOC_R) +def test_dflike__get_iloc__(df_, dios_, ilocL, ilocR): + print(ilocL) + print(ilocR) + exp = df_.iloc[ilocL, ilocR] + val = dios_.iloc[ilocL, ilocR] + # _test(val, exp) + + if isinstance(exp, pd.DataFrame): + assert isinstance(val, DictOfSeries) + + if val.empty: + for c in exp: + assert exp[c].dropna().empty + return + + assert (val.columns == exp.columns).all() + for c in exp: + l = val[c] + r = exp[c].dropna() + assert isinstance(l, pd.Series) + assert isinstance(r, pd.Series) + assert (l == r).all() + else: + assert type(exp) == type(val) + + if isinstance(exp, pd.Series): + assert (val == exp.dropna()).all() + else: + assert val == exp + diff --git a/test/test_setup.py b/test/test_setup.py index d293a1ecab2e80a570e83d4f789358fd7b838318..295a43ae4c73f293798a26d8c01d90d112d34a44 100644 --- a/test/test_setup.py +++ b/test/test_setup.py @@ -6,6 +6,52 @@ import numpy as np from copy import deepcopy from pandas.tests.series.conftest import datetime_series +a = pd.Series(range(0, 70, 7)) +b = pd.Series(range(5, 15, 1)) +c = pd.Series(range(7, 107, 10)) +d = pd.Series(range(0, 10, 1)) + +s1, s2, s3, s4 = a, b, c, d + + +def df__(): + return pd.DataFrame(dict(a=a.copy(), b=b.copy(), c=c.copy(), d=d.copy(), )) + + +def dios__(): + return DictOfSeries(dict(a=a.copy(), b=b.copy(), c=c.copy(), d=d.copy(), )) + + +@pytest.fixture +def df_(): + return df__().copy() + + +@pytest.fixture +def dios_(): + return dios__().copy() + + +BLIST = [True, False, False, False, True] * 2 + +LISTIDXER = [['a'], ['a', 'c'], pd.Series(['a', 'c'])] +BOOLIDXER = [BLIST, pd.Series(BLIST), df__() > 10] +SLICEIDXER = [slice(None), slice(4), slice(-3, -1), slice(-1, 3), slice(None, None, 3)] +MULTIIDXER = [df__() > 9, df__() != df__(), df__() == df__()] +EMPTYIDEXER = [[], pd.Series(), slice(3, 3), slice(3, -1), pd.DataFrame(), []] + +INDEXERS = LISTIDXER + BOOLIDXER + SLICEIDXER + MULTIIDXER + EMPTYIDEXER + +LOC_L = [slice(None), slice(2, 8), pd.Series(BLIST), BLIST, [6,5], 2] +LOC_R = [slice(None), slice('a', 'c'), pd.Series([False,False,True,False], index=list("abcd")), + [False,False,True,False], "a"] + LISTIDXER + +ILIST = [[0], [1,3], 2] +IEMPTY = [[], slice(3, 3), slice(3, -1), []] +ILOC_L = [slice(None), slice(2, 8), BLIST, ] + ILIST +ILOC_R = [slice(None), slice(1, 3), [False,False,True,False], ] + ILIST + +FAIL_INDEXERS = [['z'], ['a', 'z'], pd.Series(['a', 'z']), BLIST, pd.DataFrame(dict(a=[1, 2, 3]))] O = [[0, 0, 0], [0, 0, 0]] I = [[1, 1, 1], [1, 1, 1]] @@ -53,5 +99,3 @@ def _get_dios(ser, i): @pytest.fixture() def getDtDiosAligned(datetime_series): return _get_dios(datetime_series, 5) - -