From 1655868dce44a2310e5a886a5c7519b2df30785d Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Wed, 15 Apr 2020 05:07:18 +0200 Subject: [PATCH] made itype lazy --- dios/dios.py | 50 ++++++++++++++++++++++++++++++++++---------------- dios/lib.py | 6 +++--- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/dios/dios.py b/dios/dios.py index 032113b..b5a0863 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -67,26 +67,25 @@ class DictOfSeries: """ def __init__(self, data=None, columns=None, index=None, itype=None, cast_policy='save', fastpath=False): - + # we are called internally if fastpath: - self._itype = itype or ObjItype self._policy = cast_policy - if data is not None: self._data = data else: # it is significantly faster, to provide an index and fill it, # than to successively build the index by adding data self._data = pd.Series(dtype='O', index=columns) + else: if index is not None and not isinstance(index, pd.Index): index = pd.Index(index) # itype=None means infer the itype by the data, so we first set to the highest - # possible itype, then insert data, then find the best-fitting itype. + # possible itype, then insert data, then infer the best-fitting itype. if itype is None and index is None: self._itype = ObjItype else: @@ -107,16 +106,8 @@ class DictOfSeries: if data is not None: self._init_insert_data(data, columns, index) - # Note: self._data still contain nans at all positions - # where no data was present, but a column-name was given - - if itype is None: - self._itype = _find_least_common_itype(self._data.dropna()) - if not self._itype.unique: - _throw_MixedItype_err_or_warn(self.itype) - - # finalise data: insert empty - # columns at nan positions + # self._data still contain nans at all positions, where + # no data was present, but a column-name was given if self._data.hasnans: e = pd.Series(dtype='O', index=index) for c in self.columns[self._data.isna()]: @@ -124,6 +115,17 @@ class DictOfSeries: self._data.index.name = 'columns' + # we try to infer the itype, but if we still have + # no data, we will set the itype lazy, i.e. with + # the first non-empty _insert() + if itype is None: + if self.empty: + self._itype = 'INFER' + else: + self._itype = _find_least_common_itype(self._data) + if not self._itype.unique: + _throw_MixedItype_err_or_warn(self.itype) + def _init_insert_data(self, data, columns, index): """ Insert items of a iterable in self""" @@ -160,6 +162,9 @@ class DictOfSeries: for k in data: self._insert(k, pd.Series(data[k], index=index)) + # ---------------------------------------------------------------------- + # + def _insert(self, col, val): """Insert a fresh new value as pd.Series into self""" val = list(val) if _is_iterator(val) else val @@ -175,9 +180,20 @@ class DictOfSeries: elif not isinstance(val, pd.Series): raise TypeError(f"Only data of type pandas.Series can be inserted, passed was {type(val)}") - val = cast_to_itype(val, self.itype, policy=self._policy).copy(deep=True) + # set the itype lazy, i.e. when first non-empty + # column is inserted + if self._itype == 'INFER': + if not val.empty: + self._itype = get_itype(val.index) + # cast all pre-inserted empty series + self.__cast_all(self._itype, self._policy) + if not self._itype.unique: + _throw_MixedItype_err_or_warn(self._itype) + else: + val = cast_to_itype(val, self.itype, policy=self._policy) + val.name = col - self._data.at[col] = val + self._data.at[col] = val.copy(deep=True) @property def columns(self): @@ -238,6 +254,8 @@ class DictOfSeries: @property def itype(self): + if self._itype == 'INFER': + return None return self._itype @itype.setter diff --git a/dios/lib.py b/dios/lib.py index 167f63c..10c87ae 100644 --- a/dios/lib.py +++ b/dios/lib.py @@ -195,13 +195,13 @@ def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False): if _itype_le(series.itype, itype): # a <= b return series - e = f"A series index of type `{type(series.index)}` cannot be casted to Itype {itype.name}" + e = f"A series index of type '{type(series.index)}' cannot be casted to Itype '{itype.name}'" # cast any -> dt always fail. if is_itype(itype, DtItype): pass else: - e += f", as forbidden by the cast-policy `{policy}`." + e += f", as forbidden by the cast-policy '{policy}'." if policy == CastPolicy.never: pass @@ -227,7 +227,7 @@ def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False): series.index = series.index.astype(int) if series.index.is_unique: return series - e = f"The cast with policy {policy} from series index type `{type(series.index)}` to " \ + e = f"The cast with policy {policy} from series index type '{type(series.index)}' to " \ f"itype {itype.name} resulted in a non-unique index." # cast mixed -> int/float always fail -- GitLab