From 994552389a8074c17cf143dcef84ddf338ffcb30 Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Tue, 11 Feb 2020 20:43:02 +0100 Subject: [PATCH] all basics and .loc done --- dios/dios.py | 129 +++++++++++++++++++++++++--------------------- dios/options.py | 53 ++++++++++++++++--- tests/run_dios.py | 2 + 3 files changed, 119 insertions(+), 65 deletions(-) diff --git a/dios/dios.py b/dios/dios.py index 8253c5c..c41bc33 100644 --- a/dios/dios.py +++ b/dios/dios.py @@ -9,6 +9,7 @@ from collections import OrderedDict from pandas.core.dtypes.common import ( is_list_like, is_iterator, + is_scalar, is_number, is_integer, ) @@ -81,7 +82,7 @@ class DictOfSeries: # to keep order, we iterate over self instead of new _d = OrderedDict() for i, k in enumerate(self.columns): - _d[new[i]] = self._data[k] + _d[new[i]] = self[k] self._data = _d @property @@ -94,16 +95,16 @@ class DictOfSeries: Note: If ``self._indextype`` and ``idx`` are of the same type, ``self._indextype`` stays unchanged. """ - i = get_indextype(idx) - check_allowed_indextypes(i) + idx = get_indextype(idx) + check_allowed_indextypes(idx) if self._indextype is None: - self._indextype = i - elif self._indextype != i: - if dios_options[Options.mixed_indextypes]: + self._indextype = idx + elif self._indextype != idx: + if dios_options[Options.allow_mixed_indextypes]: self._indextype = IdxTypes.mixed else: - raise ValueError(f"Only objects which have a index of type `{self._indextype}` are allowed.") + raise ValueError(f"Only objects which have a index of type `{self._indextype}` can be inserted.") def _check_keys(self, keys): missing = [k for k in keys if k not in self.columns] @@ -203,15 +204,14 @@ class DictOfSeries: def _setitem_new(self, key, value, bypass_checks=False): v = value + # if the checks was already done, we skip them here, + # also the Errormessage wouldn't fully apply. if not bypass_checks: if isinstance(v, DictOfSeries): v = v.squeeze() - if not isinstance(v, pd.Series): - raise ValueError(f"only DictOfSeries of length 1 can be assigned new") - if isinstance(v, list): - # upcast - v = pd.Series(v) + elif isinstance(v, list): + v = pd.Series(v) # upcast if not isinstance(v, pd.Series): raise ValueError(f"Only pd.Series and DictOfSeries (of length 1) can be assigned new") @@ -224,16 +224,14 @@ class DictOfSeries: # series, dios['a'] = series if isinstance(val, pd.Series) and sl is None: - # todo: update self._indextype - # todo: use self._setitem_new() here?, add no_check param? - self._data[key] = val.copy() + self._setitem_new(key, val, bypass_checks=True) return sl = sl or slice(None) # label, scalar: dios['a'] = 3.9 or # slice, scalar: dios[0:3] = 4.0 - if np.isscalar(val): + if is_scalar(val): self._data[key][sl] = val # label, list: dios['a'] = [0.0, 0.3, 0.0] @@ -249,34 +247,7 @@ class DictOfSeries: return def _setitem_dios(self, keys, slicer, other): - method = dios_options['dios_to_dios_method'] - err_append = "consider changing dios.option['dios_to_dios_method']" - - # assign where possible, otherwise ignore - if method == 0: - keys = [k for k in keys if k in other.columns] - - # at least one key must be in self - elif method == 1: - keys = [k for k in keys if k in other.columns] - if not keys: - raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append) - - # all keys must be in self, but more keys could exist - elif method == 2: - dest_missing = [k for k in other.columns if k not in keys] - if dest_missing: - raise KeyError(f"{dest_missing} are missing in the destiny-dios, " + err_append) - - # keys in both dios's must be equal - elif method == 3: - diff = set(keys).symmetric_difference(set(other.columns)) - if diff: - raise KeyError(f"{diff} is not in both of src- and dest-dios, " + err_append) - - else: - raise RuntimeError(f"{method} is an invalid value for dios.option[dios_to_dios]") - + keys = get_dios_to_dios_keys(keys, other) for k in keys: self._setitem(k, other[k], slicer) @@ -289,13 +260,13 @@ class DictOfSeries: def __repr__(self): pd_max_rows = pd.get_option('display.max_rows') - pd.set_option('display.max_rows', dios_options['disp_max_rows']) + pd.set_option('display.max_rows', dios_options[Options.disp_max_rows]) def item2str(k): kstr = str(self[k]).replace('\n', '\n ') return f'{k}:\n {kstr}\n' - maxrows = dios_options['disp_max_vars'] + maxrows = dios_options[Options.disp_max_vars] s = '' head = maxrows // 2 tail = len(self.columns) - head @@ -335,8 +306,44 @@ class DictOfSeries: yield from self._data def __delitem__(self, key): - # todo: update self._indextype del self._data[key] + self.__set_mixed_indextype_from_all_keys() + + def __set_mixed_indextype_from_all_keys(self): + """ If the index-type of dios is ``mixed`` and the index-type of any stored + Series change, we need to check the index-type of all other Series, to + validate the dios-wide index-type.""" + + if len(self) == 0: + self._indextype = None + return + + if len(self) == 1: + self._indextype = get_indextype(self.squeeze().index) + return + + # ``mixed`` isn't allowed in general, so we're done + if not dios_options[Options.allow_mixed_indextypes]: + return + + # index-type wasn't ``mixed``, so we're done + if self._indextype != IdxTypes.mixed: + return + + # check all types + types = set() + for k in self._data.keys(): + idx = self._data[k].index + types.add(get_indextype(idx)) + + # If we have at least two different + # index-types, ``mixed`` still apply. + if len(types) > 1: + return + + # index is of a single new type + self._indextype = types.pop() + return def __copy__(self): return self.copy(deep=True) @@ -346,8 +353,13 @@ class DictOfSeries: def copy(self, deep=True): new = DictOfSeries() - for k in self.columns: - new[k] = self[k].copy(deep=deep) + new._indextype = self.indextype + # We use `_data` here because all checks have already been done. + # So this should be much faster, especially because we use the underlying dict for + # getting and setting the values, instead of ``__setitem__`` and ``__getitem__``. + # Note: don't use same approach elsewhere, unless you're very sure what you do. + for k in self._data: + new._data[k] = self._data[k].copy(deep=deep) return new def __op1__(self, op): @@ -359,7 +371,8 @@ class DictOfSeries: def __op2__(self, other, op): new = DictOfSeries() if isinstance(other, DictOfSeries): - for k in self.columns: + keys = get_dios_to_dios_keys(self.columns, other) + for k in keys: new[k] = op(self[k], other[k]) else: for k in self.columns: @@ -424,8 +437,8 @@ class DictOfSeries: return self.__op2__(other, op.xor) def squeeze(self): - if len(self._data) == 1: - return self._data[self.columns[0]] + if len(self) == 1: + return self[self.columns[0]] else: return self @@ -524,18 +537,18 @@ class _LocIndexer: new[c] = self._data[c].loc[rkey] return new - def _col_slice_to_col_list(self, rslice): + def _col_slice_to_col_list(self, cslice): """ see here: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-slicing-with-labels """ - keys = list(self._data.keys) + keys = list(self._data.keys()) try: - start = keys.index(rslice.start) if rslice.start is not None else None - stop = keys.index(rslice.stop) if rslice.stop is not None else None + start = keys.index(cslice.start) if cslice.start is not None else None + stop = keys.index(cslice.stop) if cslice.stop is not None else None except ValueError: raise KeyError("The slice start label or the slice stop label is not present in the columns.") - if not is_integer(rslice) and rslice > 0: + if not is_integer(cslice.step) and cslice.step > 0: raise TypeError("The step parameter of the slice must be positive integer.") - return keys[slice(start, stop + 1, rslice.step)] + return keys[slice(start, stop + 1, cslice.step)] diff --git a/dios/options.py b/dios/options.py index 5e983cf..bfd1c21 100644 --- a/dios/options.py +++ b/dios/options.py @@ -2,7 +2,11 @@ from dios.lib import IdxTypes import warnings -class DiosOptionsWarning(UserWarning): +class OptionsWarning(UserWarning): + pass + + +class OptionsError(RuntimeError): pass @@ -27,7 +31,7 @@ class Options: It is because, eg. a numeric slice cannot work on a pd.DatetimeIndex and vice versa. To set this to True is highly experimental, any arising issues or errors should be handled by the user.""" - mixed_indextypes = "mixed_indextypes" + allow_mixed_indextypes = "allow_mixed_indextypes" allowed_indextypes = "allowed_indextypes" @@ -36,9 +40,9 @@ class __OptionsDict(dict): """ Simple dict that throw a warning, if a special value is inserted at a special key""" def __setitem__(self, key, value): # throw a warning when user set ``mixed_indextyes = True`` - if key == Options.mixed_indextypes and value: - warnings.warn(f"Using dios_option[{Options.mixed_indextypes}]=True is highly experimental, " - f"please do not report any bugs!", DiosOptionsWarning) + if key == Options.allow_mixed_indextypes and value: + warnings.warn(f"Using ``dios_option[{Options.allow_mixed_indextypes}]=True`` is highly experimental, " + f"please do not report any bugs!", OptionsWarning) return super().__setitem__(key, value) @@ -47,11 +51,46 @@ dios_options = __OptionsDict() dios_options[Options.disp_max_rows] = 10 dios_options[Options.disp_max_vars] = 4 dios_options[Options.dios_to_dios_method] = 3 -dios_options[Options.mixed_indextypes] = False +dios_options[Options.allow_mixed_indextypes] = False dios_options[Options.allowed_indextypes] = [IdxTypes.datetime, IdxTypes.nunmeric] def check_allowed_indextypes(idxtype): if idxtype not in dios_options[Options.allowed_indextypes]: - raise ValueError("The index of the given object is not of supported type") + raise RuntimeError(f"The index type `{idxtype}` is not allowed by the " + f"`dios_option[{Options.allowed_indextypes}] = {dios_options[Options.allowed_indextypes]}`") + + +def get_dios_to_dios_keys(keys, other): + # we can assume that all keys are exist in self._data + method = dios_options[Options.dios_to_dios_method] + err_append = "consider changing dios.option['dios_to_dios_method']" + + # assign where possible, otherwise ignore + if method == 0: + keys = [k for k in keys if k in other.columns] + + # at least one key must be in self + elif method == 1: + keys = [k for k in keys if k in other.columns] + if not keys: + raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append) + + # all keys must be in self, but more keys could exist in other, + # eg. ``dios['a'] = dios[['a','b']]`` will update column-a but not column-b + # eg. ``dios[['a','b']] = dios['a']`` will fail + elif method == 2: + fail = [k for k in keys if k not in other.columns] + if fail: + raise KeyError(f"{fail} are missing in the destiny-dios, " + err_append) + + # keys in both dios's must be equal + elif method == 3: + fail = set(keys).symmetric_difference(set(other.columns)) + if fail: + raise KeyError(f"{fail} is not in both of src- and dest-dios, " + err_append) + + else: + raise OptionsError(f"{method} is an invalid value for dios.option[dios_to_dios]") + return keys diff --git a/tests/run_dios.py b/tests/run_dios.py index 4fdd317..9cac320 100644 --- a/tests/run_dios.py +++ b/tests/run_dios.py @@ -6,6 +6,8 @@ if __name__ == '__main__': # dios['b'] = pd.Series([2,4,4123,122,4], pd.date_range(freq='1d', periods=5, start='2000-01-01')) dios2 = dios.copy() + dios_options[Options.allow_mixed_indextypes] = True + a = dios.loc[:] # df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01')) # a = df.loc["2000-01-02":] -- GitLab