From ca31885ee6045b37325a0f739eee611f76f0e38b Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Tue, 10 Mar 2020 19:33:52 +0100 Subject: [PATCH] create like df --- dios/dios.py | 71 +++++++++++++++++++------------------------- dios/lib.py | 15 ++++++++++ test/test_dflike.py | 32 ++++++++------------ test/test_methods.py | 19 ++++++++---- 4 files changed, 72 insertions(+), 65 deletions(-) diff --git a/dios/dios.py b/dios/dios.py index cd5e7e1..ba6fad5 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -1,7 +1,12 @@ from .operators import OP_MAP as _OP_MAP from .lib import * -from .lib import _CAST_POLICIES, _itype_le, _itype_lt, _throw_MixedItype_err_or_warn +from .lib import ( + _CAST_POLICIES, + _itype_le, _itype_lt, + _throw_MixedItype_err_or_warn, + _find_least_common_itype, +) import pandas as pd import numpy as np @@ -105,10 +110,8 @@ class DictOfSeries: self._data = pd.Series(dtype='O', index=columns) else: - self._data = pd.Series(dtype='O', index=pd.Index([])) - # itype=None means infer the itype by the data, so we first set to the highest - # possible itype, then insert data, then find the best-fitting. + # possible itype, then insert data, then find the best-fitting itype. 
if itype is None: self._itype = MixedItype else: @@ -118,53 +121,53 @@ class DictOfSeries: raise ValueError(f"downcast_policy must be one of {_CAST_POLICIES}") self._policy = cast_policy - if columns is not None and not _is_list_like_not_nested(columns): - raise TypeError("'columns' must be some kind of list-like collection.") + index = pd.Index([] if columns is None else columns) + self._data = pd.Series(dtype='O', index=index.unique()) if data is not None: self._init_insert_data(data, columns) + # NOTE: self._data contains NaNs at locations + # where no data was present, but a column-name + # was given + if itype is None: - self._itype = self.__find_least_common_itype() + self._itype = _find_least_common_itype(self._data.dropna()) if not self._itype.unique: _throw_MixedItype_err_or_warn(self.itype) - # insert empty series for requested columns - if columns is not None: + # insert empty columns + if self._data.hasnans: e = pd.Series(dtype='O') - for c in columns: - if fastpath or c not in self.columns: - self._insert(c, e.copy()) + for c in self.columns[self._data.isna()]: + self._insert(c, e.copy()) def _init_insert_data(self, data, columns): - - def incols(c): - return c in columns if columns is not None else True + """ Insert items of an iterable into self""" data = list(data) if _is_iterator(data) else data if isinstance(data, dict) or _is_dios_like(data): for k in data: - if incols(k): + if columns is None or k in self.columns: self._insert(k, data[k]) elif _is_list_like(data): # also Series ! 
data = data if _is_nested_list_like(data) else [data] - if columns is None: - for i, d in enumerate(data): - self._insert(i, d) - else: - if len(data) != len(columns): - raise ValueError(f"length of passed values is {len(data)}, columns imply {len(columns)}") + if self.columns.empty: + self._data = pd.Series(dtype='O', index=pd.RangeIndex(len(data))) + + elif len(data) != len(self.columns): + raise ValueError(f"length of passed values is {len(data)}, columns imply {len(self.columns)}") - for i, c in enumerate(columns): - self._insert(c, data[i]) + for i, c in enumerate(self.columns): + self._insert(c, data[i]) else: raise ValueError(f"data must be some kind of iterable, type {type(data)} was given") def _insert(self, col, val): - """Insert a fresh new value into self""" + """Insert a fresh new value as pd.Series into self""" val = list(val) if _is_iterator(val) else val if _is_dios_like(val): @@ -183,8 +186,8 @@ class DictOfSeries: return self._data.index @columns.setter - def columns(self, newindex): - self._data.index = newindex + def columns(self, cols): + self._data.index = cols @property def itype(self): @@ -205,20 +208,6 @@ class DictOfSeries: except Exception as e: raise type(e)(f"Column {c}: " + str(e)) from e - def __find_least_common_itype(self): - itypes = [NumItype, FloatItype, IntItype, DtItype] - tlist = [get_itype(s.index) for s in self._data] - found = MixedItype - if tlist: - for itype in itypes: - for t in tlist: - if _itype_le(t, itype): - continue - break - else: - found = itype - return found - def __getitem__(self, key): """ dios[key] -> dios/series """ key = list(key) if _is_iterator(key) else key diff --git a/dios/lib.py b/dios/lib.py index 665959c..df152ae 100644 --- a/dios/lib.py +++ b/dios/lib.py @@ -135,6 +135,21 @@ def _itype_le(a, b): return is_itype_like(a, b) +def _find_least_common_itype(iterable_of_series): + itypes = [NumItype, FloatItype, IntItype, DtItype] + tlist = [get_itype(s.index) for s in iterable_of_series] + found = 
MixedItype + if tlist: + for itype in itypes: + for t in tlist: + if _itype_le(t, itype): + continue + break + else: + found = itype + return found + + ################################################################################ # Casting diff --git a/test/test_dflike.py b/test/test_dflike.py index a0b93a0..e0ec811 100644 --- a/test/test_dflike.py +++ b/test/test_dflike.py @@ -7,8 +7,6 @@ from pandas.core.dtypes.common import is_dict_like, is_nested_list_like import numpy as np from copy import deepcopy -pytestmark = pytest.mark.skip - __author__ = "Bert Palm" __email__ = "bert.palm@ufz.de" __copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ" @@ -16,10 +14,13 @@ __copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - U arr = np.random.rand(8) TESTDATA = [ - None, # empty - arr.copy(), # list - # np.array([arr.copy(), arr.copy(), arr.copy()]), # nested list - dict(a=arr.copy(), b=arr.copy()), # dict + None, # empty # 0 + [1], # 1 + arr.copy(), # 2 + np.array([arr.copy(), arr.copy(), arr.copy()]), # 3 - nested list + range(4), # 4 + dict(a=arr.copy(), b=arr.copy()), # 5 dict + pd.DataFrame(dict(a=arr.copy(), b=arr.copy())) # 6 df ] @@ -27,12 +28,10 @@ TESTDATA = [ @pytest.mark.parametrize("with_column_param", [False, True]) def test_dios_create(data, with_column_param): - if is_dict_like(data) and with_column_param: - # giving column names in dict-keys and in columns-parameter is special in df - pytest.skip() - data_copy0 = deepcopy(data) data_copy1 = deepcopy(data) + + # create columns list if with_column_param: df = pd.DataFrame(data=data_copy0) col = [f"new_{c}" for c in df] @@ -43,16 +42,11 @@ def test_dios_create(data, with_column_param): # giving nested lists, work different between df and dios data_copy1 = data_copy1.transpose() - df = pd.DataFrame(data=data_copy1, columns=col) - dios = DictOfSeries(data=data_copy0, columns=col) - - assert len(dios.columns) == len(df.columns) - assert np.all(dios.values 
== df.values) + df = pd.DataFrame(data=data_copy0, columns=col) + dios = DictOfSeries(data=data_copy1, columns=col) - # df columns may not be strings, but dios'es are always - columns = [str(c) for c in df.columns] - assert list(dios.columns) == columns + assert dios.columns.equals(df.columns) for c in df.columns: - assert np.all(dios[str(c)] == df[c]) + assert np.all(dios[c] == df[c].dropna()) diff --git a/test/test_methods.py b/test/test_methods.py index 60a2b29..d8ab08a 100644 --- a/test/test_methods.py +++ b/test/test_methods.py @@ -5,15 +5,24 @@ def test_copy_copy_empty(getDtDiosAligned): dios = getDtDiosAligned.copy() shallow = dios.copy(deep=False) deep = dios.copy(deep=True) - empty = dios.copy_empty() + empty_w_cols = dios.copy_empty(columns=True) + empty_no_cols = dios.copy_empty(columns=False) assert dios is not shallow assert dios is not deep - assert dios is not empty + assert dios is not empty_w_cols + assert dios is not empty_no_cols - assert dios.itype == shallow.itype - assert dios.itype == deep.itype - assert dios.itype == empty.itype + for attr in ['itype', '_itype', '_policy', ]: + dios_attr = getattr(dios, attr) + for cop in [shallow, deep, empty_w_cols, empty_no_cols]: + copy_attr = getattr(cop, attr) + assert dios_attr == copy_attr + + assert dios.columns.equals(shallow.columns) + assert dios.columns.equals(deep.columns) + assert dios.columns.equals(empty_w_cols.columns) + assert not dios.columns.equals(empty_no_cols.columns) for i in dios: assert dios[i].index is shallow[i].index -- GitLab