diff --git a/Readme.md b/Readme.md
index ffd380abc60f0d8de9761019dfadc46dd6d3d93c..4257c10db0607be3adf78e104aadc8e4a1dbe32e 100644
--- a/Readme.md
+++ b/Readme.md
@@ -5,7 +5,9 @@ Features
 * fast as pd.DataFrame
 * every 'column' has its own index
 * uses much less memory than a disaligned pd.DataFrame
-* `dios[var]` where `var` can be a list, slice, string
-* `dios[ts,var]` where `ts` can be a slice (`ts:ts`) a single timestamp, or anything pd.Series will eat
-* `+,-,*,/,//,%` and `==,!=,>,>=,<=,<` and `~,&,|,^` and `is, in` implemented
-* harm_choice **any** method from pd.Series on dios with `dios.pipe(pd.Series.method)` or `dios.foreach()` (simple alias)
+* `dios[var]` where `var` can be a list (or any iterable object) or a string
+* `dios[slice]` where `slice` can be a row slicer
+* `dios.loc[rowindexer]` acts like `pandas.Series().loc[rowindexer]` for each series in the dios
+* `dios.loc[rowindexer, columns]` like `dios.loc[rowindexer]`, but for a subset of columns, which can be specified by an iterable, a slice (on columns) or a string
+* `+,-,*,/,//,%` and `==,!=,>,>=,<=,<` and `~,&,|,^` and `is, in, len, all, any, empty, columns` are implemented
+* also `pipe()` (and its alias `foreach()`) is implemented, which passes any `pandas.Series` method to every series in the dios and returns the resulting dios
diff --git a/dios/__init__.py b/dios/__init__.py
index 8875553b2e562e32d68485977a997a63e9122eb4..4b1349732ff98584f7397afe68f0d82d72225796 100644
--- a/dios/__init__.py
+++ b/dios/__init__.py
@@ -1,6 +1,10 @@
-from dios.dios import *
+# low level
+from dios.errors import *
+from dios.lib import *
+from dios.options import *
-from dios.profiling.generate_testsets import get_testset, var_prefix
+# high level
+from dios.itypes import *
+from dios.dios import *
-from dios.profiling.performance import gen_random_timestamps, find_index_range
diff --git a/dios/dios.py b/dios/dios.py
index 78a6ff7c9fad22f384c67c23b1b87c9b6e8c52b7..35efd8a05b7a29d8b1bd0135cb673c322c2c6854 100644
--- a/dios/dios.py
+++ b/dios/dios.py
@@ -1,53 +1,122 @@
+from dios.lib import *
+from dios.options import *
+from dios.itypes import *
+from dios.errors import *
 import pandas as pd
-import pandas._libs.lib as pdlib
-from pandas.core.dtypes.common import *
 import numpy as np
-import datetime as dt
-from collections import OrderedDict
-from itertools import islice
 import operator as op
-import functools
+from functools import partialmethod
-# _listlike = (list, set)
+from functools import wraps
+from collections import OrderedDict
+from pandas._libs.lib import to_object_array, is_bool_array
+from pandas.core.common import is_bool_indexer
+from pandas.core.dtypes.common import (
+    is_list_like,
+    is_nested_list_like,
+    is_scalar,
+    is_integer,
+    is_dict_like,
+    is_number,
+)
+from pandas.core.dtypes.common import is_iterator as _is_iterator

-dios_options = dict(
-    disp_max_rows=10,
-    disp_max_vars=4,
-    # 0: accept all
-    # 1: accept if at least one keys is is in both DioS
-    # 2: accept if all keys of the src-DioS in the dest-DioS
-    # 3: accept if both dios have the exact same keys (makes only sense for assignments with slicer,
-    #    otherwise its the same than creating a new dios)
-    dios_to_dios_method=3
-)

+def is_dios_like(obj):
+    return isinstance(obj, DictOfSeries)
+

-# Dataframe
-# - indexing with single label work on columns, return series
-# - indexing with list work on columns, return df
-# - indexing with slice work on rows, return df
-# - mixing slice and label/list not allowed
-#
+def is_pandas_like(obj):
+    """We consider ourselves (dios) as pandas-like."""
+    return is_series_like(obj) or is_dataframe_like(obj) or is_dios_like(obj)

-# DictionaryOfSeries
-# - indexing with single label work on columns, return series
-# - indexing with list work on columns, return dios
-# - indexing with slice not supported

+def is_series_like(obj):
+    return isinstance(obj, pd.Series)

-def item_from_zerodim(key):
-    # if isinstance(key, DictOfSeries) and len(key) == 1:
-    #     todo what if squeeze return a 1-value-series? squeeze again?
-    #     return key.squeeze()
-    return pdlib.item_from_zerodim(key)
+
+def is_dataframe_like(obj):
+    return isinstance(obj, pd.DataFrame)
+
+
+def is_iterator(obj):
+    """This is only a dummy wrapper, to warn that the docstring of the wrapped
+    function isn't right. Unlike its example says,
+    >>> is_iterator([1, 2, 3])
+    returns False, not True, for lists.
+    """
+    return _is_iterator(obj)
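+
+
+# A minimal usage sketch (values are hypothetical; the API is defined below):
+#
+#   >>> d = DictOfSeries(data={'a': pd.Series([1., 2., 3.]),
+#   ...                        'b': pd.Series([10., 20.], index=[8, 9])})
+#   >>> d['a']          # -> the pd.Series stored under 'a'
+#   >>> d[['a', 'b']]   # -> a new DictOfSeries holding both columns
+#   >>> d > 2.          # -> a boolean DictOfSeries, computed column-wise
+#
+# Note that 'a' and 'b' keep their own, differently aligned row-indexes.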
 class DictOfSeries:
+    """
+    DictionaryOfSeries is a collection of pd.Series' which aims to be as similar
+    to pd.DataFrame as possible. The advantage over pd.DataFrame is, that every
+    `column` has its own row-index, unlike the former, which provides a single
+    row-index for all columns. This solves problems with unaligned pd.Series.
+
+    Dataframe vs DictionaryOfSeries
+    -------------------------------
+
+    pd.DataFrame :
+     - ``df['a']``      indexing with a single label, works on columns, returns a series
+     - ``df[iterable]`` indexing with a list, works on columns, returns a df
+     - ``df[slice]``    indexing with a slice, works on rows, returns a df
+     - ``df.loc[..]``   loc[ation]-based indexing, works on row and/or column `labels`, returns a df
+     - ``df.iloc[..]``  i[nteger]loc[ation]-based indexing, works on rows and/or columns, returns a df
+
+    DictionaryOfSeries:
+     - ``dios['a']``      indexing with a single label, works on columns, returns a series
+     - ``dios[iterable]`` indexing with a list, works on columns, returns a dios
+     - ``dios[slice]``    indexing with a slice, works on (all) index'es, returns a dios
+     - ``dios.loc[..]``   loc[ation]-based indexing, works on row and/or column `labels`, returns a dios
+     - ``dios.iloc[..]``  i[nteger]loc[ation]-based indexing, works on rows and/or columns, returns a dios
+
+    Todos:
+    -----
+    todo: to_discuss!! allow any hashable obj as column identifier
+        Currently we only allow strings as identifiers; to be more df-like we should
+        allow any hashable object (unlike df, we maybe should exclude stuff
+        like ``None`` or ``np.nan`` ??)
+
+    """
+
+    def __init__(self, data=None, columns=None, itype=MixedItype, downcast_policy='lossless'):

-    def __init__(self, **kwargs):
         self._data = OrderedDict()
-        for kw in kwargs:
-            self[kw] = kwargs[kw]
+
+        # We need to keep track of the index-type (itype) of every new Series.
+        # If the itypes differ between different series, slicing will almost always fail
+        # (e.g. a datetime-like slice cannot work on a numeric index and vice versa).
+        self._itype = None
+        self.itype = get_itype(itype)
+
+        if downcast_policy not in CAST_POLICIES:
+            raise ValueError(f"downcast_policy must be one of {CAST_POLICIES}")
+        self._policy = downcast_policy
+
+        if data is not None:
+            self.__init_insert_data__(data)
+
+        # we use the columns.setter to make all necessary checks
+        if columns is not None:
+            self.columns = columns
+
+    def __init_insert_data__(self, data):
+        if isinstance(data, DictOfSeries):
+            g = ((k, data[k]) for k in data)
+        else:
+            data = list(data) if is_iterator(data) else data
+            if is_dict_like(data):
+                g = ((k, data[k]) for k in data)
+            elif is_nested_list_like(data):
+                g = ((str(i), d) for i, d in enumerate(data))
+            elif is_list_like(data):
+                g = [('0', data)]
+            else:
+                raise ValueError(f"init with data of type {type(data)} is not possible.")
+        for k, val in g:
+            self[k] = val
+        return

     @property
     def columns(self):
@@ -56,10 +125,10 @@ class DictOfSeries:
     @columns.setter
     def columns(self, new):
         if not isinstance(new, list):
-            raise NotImplementedError("Only lists supported so far")
+            raise TypeError("column names must be given as a list")

         if len(set(new)) != len(new):
-            raise ValueError("Names must be unique")
+            raise ValueError("column names must be unique")

         if len(new) != len(self.columns):
             raise ValueError(f"Length mismatch: Columns has {len(self.columns)} elements, "
@@ -68,204 +137,308 @@ class DictOfSeries:
         # to keep order, we iterate over self instead of new
         _d = OrderedDict()
         for i, k in enumerate(self.columns):
-            _d[new[i]] = self._data[k]
+            _d[new[i]] = self[k]
         self._data = _d

-    def _check_keys(self, keys):
-        missing = [k for k in keys if k not in self.columns]
-        if missing:
-            raise KeyError(f"{missing} not in index")
+    @property
+    def values(self):
+        # will make all series the same length, inserting NaNs where needed
+        return to_object_array(self._data.values()).transpose()

-    def __getitem__(self, key):
-        """
-        `dios['a']` return a pd.Series
-        `dios[iterable]` return a DictOfSeries
-        `dios[slice]` return a DictOfSeries
+    @property
+    def data(self):
+        return self._data.values()

-        If 'iterable' contains any(!) label that does not exist
-        a KeyError is raised
-        """
-        key = item_from_zerodim(key)
+    @property
+    def itype(self):
+        return self._itype

-        if isinstance(key, str):
-            self._check_keys([key])
-            return self._get_item(key)
+    @itype.setter
+    def itype(self, itype_like):
+        itype = get_itype(itype_like)

-        if isinstance(key, slice):
-            return self._slice(self.columns, key)
+        if not itype_le(self._itype, itype):
+            self.__cast_all(itype)

-        if is_list_like(key):
-            self._check_keys(key)
-            return self._getitem_listlike(key)
+        self._itype = itype

-        raise KeyError(f"{key}")
+        if not itype.unique:
+            throw_MixedItype_err_or_warn(f"Using a {itype} as dios.itype is experimental. As soon as series with "
+                                         f"different index types are inserted, slicing will almost always fail. "
+                                         f"You are hereby warned!")

-    def _get_item(self, key):
-        # return a pd.Series
-        return self._data[key]
+    def __cast_all(self, itype):
+        k = '?'
+        try:
+            for k in self.columns:
+                casted = cast_to_itype(self._data[k], itype, policy=self._policy)
+                self._data[k] = casted
+        except Exception as e:
+            raise type(e)(f"Column {k}: " + str(e)) from e

-    def _getitem_listlike(self, keys):
-        new = DictOfSeries()
-        for k in keys:
-            new[k] = self._data[k]
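+    # A sketch of the slicing semantics implemented below (hypothetical values):
+    #
+    #   >>> d = DictOfSeries({'a': pd.Series([1, 2, 3]), 'b': pd.Series([4, 5], index=[2, 3])})
+    #   >>> d[1:3]
+    #
+    # applies the row-slice to every series; each result is reindexed like its
+    # source series, so values outside the slice show up as NaN (see
+    # ``_get_item(.., insertna=True)`` below).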
+    def __getitem__(self, key):
+        """
+        Get items:
+         - ``dios['a']``      -> return a pd.Series
+         - ``dios[iterable]`` -> return a DictOfSeries of all given columns [1]
+         - ``dios[slice]``    -> return a sliced DictOfSeries of all(!) columns in the dios
+
+        Notes:
+         - [1] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
+        """
+        # special case single label
+        if isinstance(key, str):
+            if key in self.columns:
+                new = self._get_item(key)
+            else:
+                raise KeyError(key)
+        # all other cases
+        else:
+            keys, ixs, ixstype = self._unpack_key(key)
+            ixs = self._unpack_indexer(keys, ixs, ixstype)
+            new = self.copy_empty()
+            for i, _ in enumerate(keys):
+                key, ix = keys[i], ixs[i]
+                new._data[key] = self._get_item(key, ix, True)
         return new

-    def _slice(self, keys, slicer, axis=0):
-        """ Return a slice of self"""
-        if axis != 0:
-            raise NotImplementedError("currently sliceing only works on rows.")
-        new = DictOfSeries()
-        for k in keys:
-            # pd.Series[slice] always return a pd.Series
-            new[k] = self._data[k][slicer]
-        return new
+    def _get_item(self, key, ix=None, insertna=False):
+        """Extract a pd.Series from self"""
+        if ix is None:
+            return self._data[key]
+        elif insertna:
+            s = self._data[key]
+            return s[ix].reindex_like(s)
+        else:
+            return self._data[key][ix]

     def __setitem__(self, key, value):
         """
-        dios['a'] = pd.Series() -> set a existing or add a new item
-        dios['a'] = any -> pass any to pd.Series(any) then add or set
-        dios[iterable]
+        Set items:
+         - ``dios['a'] = pd.Series()``  -> Set a new pd.Series to an existing column or add it as a `new column`.
+         - ``dios['a'] = Any``          -> Pass ``Any`` to the pd.Series in the corresponding existing(!) column. [1],[2]
+         - ``dios[iterable] = Any``     -> Pass ``Any`` to the pd.Series' in the corresponding set of columns. [1],[2],[3]
+         - ``dios[slice] = Any``        -> Pass ``Any`` to all(!) sliced pd.Series' from the current dios. [1],[2]
+
+        Notes:
+         - [1] The length of ``Any`` must be equal to the length of every sliced pd.Series.
+         - [2] If ``Any`` is a ``DictOfSeries``, the behavior depends on the option
+               ``col_indexing_method`` in the ``dios_options`` dictionary.
+         - [3] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
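+
+        Example (hypothetical values):
+            >>> d = DictOfSeries({'a': pd.Series([1., 2., 3.])})
+            >>> d['a'] = 0.                    # set a scalar on the existing column
+            >>> d['b'] = pd.Series([1., 2.])   # insert a whole new column
+            >>> d[0:2] = 99.                   # set on a row-slice of all(!) columns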
""" - - # determine action by keys + # special case single label if isinstance(key, str): if key not in self.columns: - self._setitem_new(key, value) + self._insert(key, value) + return else: - self._setitem(key, value) - return - - elif is_list_like(key): - self._check_keys(key) - keys = key - slicer = None - - elif isinstance(key, slice): - keys = self.columns - slicer = key - + k, i, it = [key], [slice(None)], None + # all other cases else: - raise KeyError(f"{key}") - - if isinstance(value, DictOfSeries): - self._setitem_dios(keys, slicer, value) + k, i, it = self._unpack_key(key) + i = self._unpack_indexer(k, i, it) + gen = self._unpack_value(k, i, value) + for tup in gen: + self._set_item(*tup) + + def _set_item(self, key, ix, val): + "Set a value (scalar or list or series)" + ser = self._data[key] + if is_series_like(val): + left = ser[ix] + index = left.index.intersection(val.index) + if not index.empty: + left.loc[index] = val.loc[index].copy() else: - for k in keys: - self._setitem(k, value, slicer) - - def _setitem_new(self, key, value): - v = value - if isinstance(v, DictOfSeries): - v = v.squeeze() - if not isinstance(v, pd.Series): - raise ValueError(f"only DictOfSeries of length 1 can be assigned new") - - if isinstance(v, list): - # upcast - v = pd.Series(v) - - if isinstance(v, pd.Series): - self._data[key] = v.copy(deep=True) - else: - raise ValueError(f"Only pd.Series and DictOfSeries (of length 1) can be assigned new") + ser[ix] = val + + def _insert(self, key, val): + """Insert a fresh new value into self""" + if isinstance(val, DictOfSeries): + val = val.squeeze() + elif is_list_like(val) and not is_nested_list_like(val): + val = pd.Series(val) + + if not isinstance(val, pd.Series): + raise ValueError(f"Only pd.Series can be inserted directly, given type {type(val)}") + + val = cast_to_itype(val, self._itype, policy=self._policy) + self._data[key] = val.copy(deep=True) + + def _unpack_value(self, keys, ixs, val): + """Return a generator that yield (key, indexer, value) for all keys""" + val = list(val) if is_iterator(val) else val + diosl, dfl, nlistl = is_dios_like(val), is_dataframe_like(val), is_nested_list_like(val) + + if (diosl or dfl or nlistl) and len(val) != len(keys): + raise ValueError(f"could not broadcast input array with length {len(val)}" + f" into dios of length {len(keys)}") + + # now we have everything we need: key, indexer, value + # so we just pack it nice and cosy and let setitem + # do the dirty work. + for i, _ in enumerate(keys): + key, ix = keys[i], ixs[i] + if dfl or diosl: + yield key, ix, val[val.columns[i]] + elif nlistl: + yield key, ix, val[i] + else: + yield key, ix, val - def _setitem(self, key, val, sl=None): - """ Set a value or a set of values to a single key in self k""" + def _unpack_key(self, key): + """ Determine keys and indexer by type of key. This does not deal + with single label-access, only higher dimension objects are handled.. 
- # series, dios['a'] = series - if isinstance(val, pd.Series) and sl is None: - self._data[key] = val.copy() - return + Notes: + Which keys we get, may depend on the policy in dios_options + """ + len_err_msg = "length of given column-indexer does not match length of columns" + keys = None + indexer, idxtype = None, None - sl = sl or slice(None) + # prevent consuming of a generator + key = list(key) if is_iterator(key) else key - # label, scalar: dios['a'] = 3.9 or - # slice, scalar: dios[0:3] = 4.0 - if np.isscalar(val): - self._data[key][sl] = val + if isinstance(key, slice): + keys = self.columns + indexer, idxtype = [key], 'slice' - # label, list: dios['a'] = [0.0, 0.3, 0.0] - # sclice, list: dios[0:3] = [0.0, 0.3, 0.0] - elif is_list_like(val): - # ensure same size - if len(self._data[key]) == len(val): - self._data[key][sl] = val + # list, np.arrays, ... of list, np.arrays.. + elif is_nested_list_like(key): + # we only allow bool nlists + keys = self.columns + indexer, idxtype = key, 'nlist' + + # ser, df, dios + elif is_pandas_like(key): + if is_series_like(key): + mask = key.to_numpy() + if is_bool_indexer(mask): + # bool series are column indexer not row-indexer! + keys = [] + for k in self.columns: + try: + if key[k]: + keys.append(k) + except KeyError: + pass + else: + keys = key.to_list() + + elif is_dataframe_like(key): + # we only allow bool df's + keys = key.columns.to_list() + indexer, idxtype = key, 'df' + + elif is_dios_like(key): + # we only allow bool dios's + keys = key.columns + indexer, idxtype = key, 'dios' + + # list, np.array, np.ndarray, ... + # Note: series considered list-like, so we handle lists at last + elif is_list_like(key): + arr = np.array(key) + if is_bool_array(arr): + keys = self.columns + if len(arr) != len(keys): + raise ValueError(len_err_msg) + keys = np.array(keys)[arr] else: - raise ValueError('not the same length') # todo more info here + keys = key else: - raise ValueError(f"assignments with a values of type {type(val)} are not supported") - return - - def _setitem_dios(self, keys, slicer, other): - method = dios_options['dios_to_dios_method'] - err_append = "consider changing dios.option['dios_to_dios_method']" - - # assign where possible, otherwise ignore - if method == 0: - keys = [k for k in keys if k in other._data.keys()] - - # at least one key must be in self - elif method == 1: - keys = [k for k in keys if k in other._data.keys()] - if not keys: - raise KeyError("src-DioS and dest-DioS need to share at least one key, " + err_append) + raise KeyError(f"{key}") - # all keys must be in self, but more keys could exist - elif method == 2: - dest_missing = [k for k in other._data.keys() if k not in keys] - if dest_missing: - raise KeyError(f"{dest_missing} are missing in the destiny-dios, " + err_append) + # check keys + method = dios_options[OptsFields.col_indexing_method] + keys = check_keys_by_policy(keys, self.columns, method) - # keys in both dios's must be equal - elif method == 3: - diff = set(keys).symmetric_difference(set(other._data.keys())) - if diff: - raise KeyError(f"{diff} is not in both of src- and dest-dios, " + err_append) + return keys, indexer, idxtype + def _unpack_indexer(self, keys, indexer, idxtype): + err_bool = "only boolean values are allowed" + idxerlist = [] + if idxtype == 'slice': + idxerlist = indexer * len(keys) + elif idxtype in ['df', 'dios']: + for k in keys: + ix = indexer[k] + idxerlist.append(ix) + if not is_bool_indexer(ix): + raise ValueError(err_bool) + elif idxtype == 'nlist': + for i in indexer: + 
ix = np.array(i)
+                idxerlist.append(ix)
+                if not is_bool_array(ix):
+                    raise ValueError(err_bool)
         else:
-            raise RuntimeError(f"{method} is an invalid value for dios.option[dios_to_dios]")
+            idxerlist = [slice(None)] * len(keys)
+        return idxerlist

-        for k in keys:
-            self._setitem(k, other[k], slicer)
+    @property
+    def loc(self):
+        return _LocIndexer(self)
+
+    @property
+    def iloc(self):
+        return _iLocIndexer(self)

     def __str__(self):
         return self.__repr__()

     def __repr__(self):
         pd_max_rows = pd.get_option('display.max_rows')
-        pd.set_option('display.max_rows', dios_options['disp_max_rows'])
+        pd.set_option('display.max_rows', dios_options[OptsFields.disp_max_rows])

         def item2str(k):
             kstr = str(self[k]).replace('\n', '\n ')
             return f'{k}:\n {kstr}\n'

-        try:
-            s = ''
-            head = dios_options['disp_max_vars'] // 2
-            tail = len(self.columns) - head
-            for i, k in enumerate(self.columns):
-                if i < head:
-                    s += item2str(k)
-                if i == head:
-                    s += ' ... \t\t\t\t ...\n' * 2
-                if i > tail:
-                    s += item2str(k)
-        finally:
-            pd.set_option('display.max_rows', pd_max_rows)
+        maxvars = dios_options[OptsFields.disp_max_vars]
+        s = ''
+        head = maxvars // 2
+        tail = len(self.columns) - head
+        for i, k in enumerate(self.columns):
+            if i < head:
+                s += item2str(k)
+            elif i == head:
+                s += ' ... \t\t\t\t ...\n' * 2
+            elif i > tail:
+                s += item2str(k)
+        pd.set_option('display.max_rows', pd_max_rows)
         return s

+    @property
+    def empty(self):
+        return len(self) == 0 or all(self._data[c].empty for c in self._data)
+
+    def all(self):
+        return pd.Series([self._data[c].all() for c in self._data])
+
+    def any(self):
+        return pd.Series([self._data[c].any() for c in self._data])
+
+    def aaall(self):
+        """absolute all all"""
+        return self.all().all()
+
+    def aaany(self):
+        """absolute any any"""
+        return self.any().any()
+
     def __len__(self):
         return len(self._data)

+    def __iter__(self):
+        yield from self._data
+
     def __reversed__(self):
-        return reversed(self._data)
+        yield from reversed(self._data)

     def __contains__(self, item):
         return item in self._data

-    def __iter__(self):
-        yield from self._data
-
     def __delitem__(self, key):
         del self._data[key]

@@ -276,89 +449,87 @@ class DictOfSeries:
         return self.copy(deep=True)

     def copy(self, deep=True):
-        new = DictOfSeries()
-        for k in self.columns:
-            new[k] = self[k].copy(deep=deep)
+        new = self.copy_empty()
+        # We use `_data` here, because all checks are already done.
+        # So this should be much faster, especially because we use the underlying
+        # dict for getting and setting the values, instead of ``__setitem__``
+        # and ``__getitem__``.
+        for k in self._data:
+            new._data[k] = self._data[k].copy(deep=deep)
         return new

-    def __op1__(self, op):
+    def copy_empty(self):
         new = DictOfSeries()
-        for k in self.columns:
-            new[k] = op(self[k])
+        new._itype = self.itype
         return new

-    def __op2__(self, other, op):
-        new = DictOfSeries()
-        if isinstance(other, DictOfSeries):
-            for k in self.columns:
-                new[k] = op(self[k], other[k])
-        else:
-            for k in self.columns:
-                new[k] = op(self[k], other)
+    def _op1(self, op):
+        new = self.copy_empty()
+        try:
+            for k in self:
+                new[k] = op(self._data[k])
+        except Exception as e:
+            raise type(e)(f"'{OP_MAP[op]} dios' failed: " + str(e)) from e
         return new

-    def __neg__(self):
-        return self.__op1__(op.neg)
-
-    def __invert__(self):
-        return self.__op1__(op.inv)
-
-    def __abs__(self):
-        return self.__op1__(op.abs)
-
-    def __lt__(self, other):
-        return self.__op2__(other, op.lt)
-
-    def __le__(self, other):
-        return self.__op2__(other, op.le)
-
-    def __eq__(self, other):
-        return self.__op2__(other, op.eq)
-
-    def __ne__(self, other):
-        return self.__op2__(other, op.ne)
-
-    def __ge__(self, other):
-        return self.__op2__(other, op.ge)
-
-    def __gt__(self, other):
-        return self.__op2__(other, op.gt)
-
-    def __add__(self, other):
-        return self.__op2__(other, op.add)
-
-    def __sub__(self, other):
-        return self.__op2__(other, op.sub)
-
-    def __mul__(self, other):
-        return self.__op2__(other, op.mul)
-
-    def __mod__(self, other):
-        return self.__op2__(other, op.mod)
-
-    def __truediv__(self, other):
-        return self.__op2__(other, op.truediv)
-
-    def __floordiv__(self, other):
-        return self.__op2__(other, op.floordiv)
-
-    def __pow__(self, other):
-        return self.__op2__(other, op.pow)
-
-    def __and__(self, other):
-        return self.__op2__(other, op.and_)
+    def _op2(self, op, other, inplace=False):
+        def raiseif(cond, s='length'):
+            if cond:
+                raise ValueError(f"{s} does not match, {s} left: {len(self)}, {s} right: {len(other)} keys")
+
+        def gen():
+            if isinstance(other, (self.__class__, pd.DataFrame)):
+                raiseif(set(other) != set(self), '#keys')
+                for k in self.columns:
+                    left, right = self._data[k], other[k]
+                    yield k, op(*(left.align(right, join='inner')))
+            elif isinstance(other, pd.Series):
+                for k in self.columns:
+                    left, right = self._data[k], other
+                    yield k, op(*(left.align(right, join='inner')))
+            elif is_dict_like(other):
+                raiseif(set(other) != set(self), '#keys')
+                for k in self.columns:
+                    yield k, op(self._data[k], other[k])
+            elif is_nested_list_like(other):
+                raiseif(len(other) != len(self), 'length')
+                for i, k in enumerate(self.columns):
+                    yield k, op(self._data[k], other[i])
+            elif is_scalar(other) or is_list_like(other):
+                for k in self.columns:
+                    yield k, op(self._data[k], other)
+            else:
+                raise NotImplementedError

-    def __or__(self, other):
-        return self.__op2__(other, op.or_)
+        new = self if inplace else self.copy_empty()
+        try:
+            for k, val in gen():
+                new[k] = val
+        except Exception as e:
+            raise type(e)(f"'dios {OP_MAP[op]} other' failed: " + str(e)) from e
+        return new

-    def __xor__(self, other):
-        return self.__op2__(other, op.xor)
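+    # All arithmetic, comparison and boolean dunder methods share the machinery
+    # of ``_op1``/``_op2`` above, so they can simply be generated with
+    # ``functools.partialmethod``. A sketch of the result (hypothetical values):
+    #
+    #   >>> d = DictOfSeries({'a': pd.Series([1., 2.])})
+    #   >>> (d + d)['a'].tolist()       # __add__           -> [2.0, 4.0]
+    #   >>> (~(d > 1.))['a'].tolist()   # __gt__/__invert__ -> [True, False]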
+    __neg__ = partialmethod(_op1, op.neg)
+    __abs__ = partialmethod(_op1, op.abs)
+    __invert__ = partialmethod(_op1, op.inv)
+    __eq__ = partialmethod(_op2, op.eq)
+    __ne__ = partialmethod(_op2, op.ne)
+    __le__ = partialmethod(_op2, op.le)
+    __ge__ = partialmethod(_op2, op.ge)
+    __lt__ = partialmethod(_op2, op.lt)
+    __gt__ = partialmethod(_op2, op.gt)
+    __add__ = partialmethod(_op2, op.add)
+    __sub__ = partialmethod(_op2, op.sub)
+    __mul__ = partialmethod(_op2, op.mul)
+    __mod__ = partialmethod(_op2, op.mod)
+    __truediv__ = partialmethod(_op2, op.truediv)
+    __floordiv__ = partialmethod(_op2, op.floordiv)
+    __pow__ = partialmethod(_op2, op.pow)
+    __and__ = partialmethod(_op2, op.and_)
+    __or__ = partialmethod(_op2, op.or_)
+    __xor__ = partialmethod(_op2, op.xor)

     def squeeze(self):
-        if len(self) == 1:
-            return self._data[self.columns[0]]
-        else:
-            return self
+        return self[self.columns[0]] if len(self) == 1 else self

     def memory_usage(self, index=True, deep=False):
         mem = 0
@@ -379,7 +550,7 @@ class DictOfSeries:
         :return:
         """
         news = pd.Series()
-        newd = DictOfSeries()
+        newd = self.copy_empty()
         need_dios = False
         # return a copy nevertheless, but also run inplace if inplace=True or
         # if the function not has this option, but work inplace.
@@ -414,28 +585,200 @@ class DictOfSeries:
         return news.squeeze()


-if __name__ == '__main__':
-    dios = DictOfSeries(a=[234.54, 5, 5, 4, np.nan, 5, 4, 5])
-    dios['b'] = dios * 2
-    dios2 = dios.copy()
+class _Indexer:
+    def __init__(self, _dios):
+        self._dios = _dios
+        self._data = _dios._data
+        self._unpack_value = _dios._unpack_value
+
+
+class _LocIndexer(_Indexer):
+
+    def __init__(self, _dios):
+        super().__init__(_dios)
+        self._set_item = _dios._set_item
+
+    def __getitem__(self, key):
+        rkey, cols, lowdim = self._unpack_key(key)
+        if is_scalar(rkey[0]):
+            return self._series(rkey, cols, lowdim)
+        elif lowdim:
+            return self._scalar(rkey[0], cols[0])
+        else:
+            new = self._dios.copy_empty()
+            for i, _ in enumerate(cols):
+                c, r = cols[i], rkey[i]
+                new[c] = self._data[c].loc[r]
+            return new
+
+    def _series(self, rkey, cols, lowdim):
+        if lowdim:
+            return self._scalar(rkey[0], cols[0])
+        new = pd.Series()
+        for c in cols:
+            try:
+                new[c] = self._data[c].loc[rkey[0]]
+            except KeyError:
+                new[c] = np.nan
+        return new
+
+    def _scalar(self, r, c):
+        return self._data[c].loc[r]
+
+    def __setitem__(self, key, value):
+        ixs, keys, _ = self._unpack_key(key)
+        gen = self._unpack_value(keys, ixs, value)
+        for tup in gen:
+            self._set_item(*tup)
+
+    def _unpack_key(self, key):
+        # if we have a tuple, we have a rows- and a column-indexer
+        # if not, we only have a row-indexer and work on all columns
+        lowdim = False
+        if isinstance(key, tuple):
+            rkey, ckey, *fail = key
+            if fail:
+                raise KeyError("Too many indexers")
+
+            # prepare ckey
+            ckey = list(ckey) if is_iterator(ckey) else ckey
+
+            # determine columns
+            if is_dataframe_like(ckey) or is_nested_list_like(ckey) or is_dios_like(ckey):
+                raise ValueError("Cannot index with multidimensional key")
+            if isinstance(ckey, str):
+                cols = [ckey]
+                lowdim = True
+            elif isinstance(ckey, slice):
+                cols = self._col_slice_to_col_list(ckey)
+            else:
+                # list, boolean-list or series
+                cols, *_ = self._dios._unpack_key(ckey)
+        else:
+            cols = list(self._data.keys())
+            rkey = key
+        # blowup
+        rkey = [rkey] * len(cols)
+        return rkey, cols, lowdim
+
+    def _col_slice_to_col_list(self, cslice):
+        """ see here:
+        https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-slicing-with-labels
+        """
+        keys = list(self._data.keys())
+        try:
+            start = keys.index(cslice.start) if cslice.start is not None else None
+            stop = keys.index(cslice.stop) if cslice.stop is not None else None
+        except ValueError:
+            raise KeyError("The slice start label, or the slice stop label, is not present in columns.")
+        if cslice.step is not None and (not is_integer(cslice.step) or cslice.step <= 0):
+            return []
+        # label-slicing includes the stop label
+        stop = stop + 1 if stop is not None else None
+        return keys[slice(start, stop, cslice.step)]


-    print(dios == dios2)
+class _iLocIndexer(_Indexer):

-    dios.columns = ['foo', 'bar']
+    def __getitem__(self, key):
+        rkey, cols, lowdim = self._unpack_key(key)
+        if is_scalar(rkey[0]):
+            return self._series(rkey, cols, lowdim)
+        elif lowdim:
+            return self._scalar(rkey[0], cols[0])
+        else:
+            new = self._dios.copy_empty()
+            for i, _ in enumerate(cols):
+                c, r = cols[i], rkey[i]
+                new[c] = self._data[c].iloc[r]
+            return new
+
+    def _series(self, rkey, cols, lowdim):
+        if lowdim:
+            return self._scalar(rkey[0], cols[0])
+        new = pd.Series()
+        for c in cols:
+            try:
+                new[c] = self._data[c].iloc[rkey[0]]
+            except KeyError:
+                new[c] = np.nan
+        return new
+
+    def _scalar(self, r, c):
+        return self._data[c].iloc[r]

-    for k in reversed(dios):
-        print(k, dios[k], "\n")
+    def __setitem__(self, key, value):
+        ixs, keys, _ = self._unpack_key(key)
+        gen = self._unpack_value(keys, ixs, value)
+        for tup in gen:
+            self._set_item_positional(*tup)
+        raise NotImplementedError
+
+    def _set_item_positional(self, key, ix, val):
+        ser = self._data[key]
+        if is_series_like(val):
+            index = ser.iloc[ix].index
+            index = index.intersection(val.index)
+            if not index.empty:
+                ser.loc[index] = val.loc[index].copy()
+        else:
+            ser.iloc[ix] = val
+
+    def _unpack_key(self, key):
+        # if we have a tuple, we have a rows- and a column-indexer
+        # if not, we only have a row-indexer and work on all columns
+        lowdim = False
+        if isinstance(key, tuple):
+            rkey, ckey, *fail = key
+            if fail:
+                raise KeyError("Too many indexers")
+
+            # prepare ckey
+            ckey = list(ckey) if is_iterator(ckey) else ckey
+
+            # determine columns
+            if is_integer(ckey):
+                self._check_keys([ckey])
+                cols = self._integers_to_col_list([ckey])
+                lowdim = True
+            elif isinstance(ckey, slice):
+                cols = self._col_slice_to_col_list(ckey)
+            elif is_list_like(ckey) and not is_nested_list_like(ckey):
+                arr = np.array(ckey)
+                if is_bool_array(arr):
+                    raise NotImplementedError
+                self._check_keys(ckey)
+                cols = self._integers_to_col_list(ckey)
+            elif is_series_like(ckey):
+                raise NotImplementedError
+            elif is_bool_indexer(ckey):
+                raise NotImplementedError
+            else:
+                raise KeyError(f"{ckey} of type {type(ckey)}")
+        else:
+            cols = list(self._data.keys())
+            rkey = key

-    exit(99)
+        # blowup
+        rkey = [rkey] * len(cols)
+        return rkey, cols, lowdim

-    dios.squeeze()
-    print(dios)
-    dd = dios + dios
-    dios.pipe(pd.Series.squeeze)
-    print()
-    print(dd)
-    # dios.dropna(inplace=True)
-    # print(dios)
-    ts = None
-    dios['var']
-    dios['var', ts:ts]
+    def _check_keys(self, keys):
+        bound = len(self._data)
+        for k in keys:
+            if not is_integer(k):
+                raise ValueError(f"{type(k)} is not integer")
+            if k not in range(-bound, bound):
+                raise KeyError("positional indexer(s) are out-of-bounds in columns")
+
+    def _integers_to_col_list(self, ints):
+        klist = list(self._data.keys())
+        # keep the column order, but drop duplicates
+        ks = []
+        for i in ints:
+            k = klist[i]
+            if k not in ks:
+                ks.append(k)
+        return ks
+
+    def _col_slice_to_col_list(self, sl):
+        for s in [sl.start, sl.stop, sl.step]:
+            if s is not None and not is_integer(s):
+                raise TypeError(f"positional indexing with slice must be integers, passed type was {type(s)}")
+        return list(self._data.keys())[sl]
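# A rough sketch of how the two indexers above differ (hypothetical values):
#
#   >>> d = DictOfSeries({'a': pd.Series([1, 2, 3]), 'b': pd.Series([4, 5, 6])})
#   >>> d.loc[0:1, 'a']    # label-based: the rows labeled 0..1 (inclusive) of 'a'
#   >>> d.iloc[0:1, 0]     # position-based: the first row of the first column
#   >>> d.loc[:, 'a':'b']  # column slices are resolved by _col_slice_to_col_list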
diff --git a/dios/errors.py b/dios/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..9df116fb665182532e84a78beea3d558883ced55
--- /dev/null
+++ b/dios/errors.py
@@ -0,0 +1,24 @@
+import warnings
+# do not import dios-stuff here
+
+
+class ItypeWarning(RuntimeWarning):
+    pass
+
+
+class ItypeCastWarning(ItypeWarning):
+    pass
+
+
+class ItypeCastError(RuntimeError):
+    pass
+
+
+class OptionsWarning(UserWarning):
+    pass
+
+
+class OptionsError(RuntimeError):
+    pass
+
+
diff --git a/dios/itypes.py b/dios/itypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b755605919c9aca509022777d4cbb689d818b36
--- /dev/null
+++ b/dios/itypes.py
@@ -0,0 +1,213 @@
+import pandas as pd
+from dios.options import *
+from dios.lib import *
+from dios.errors import *
+
+
+class CastPolicy:
+    force = 'force'
+    lossless = 'lossless'
+    never = 'never'
+
+
+CAST_POLICIES = get_storage_class_values(CastPolicy)
+
+
+class __Itype:
+    def __init__(self):
+        raise RuntimeError("Itype classes do not allow instances of themselves.")
+
+
+class DatetimeItype(__Itype):
+    name = 'datetime'
+    unique = True
+    subtypes = (pd.DatetimeIndex,)
+
+
+class IntegerItype(__Itype):
+    name = 'integer'
+    unique = True
+    subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int)
+
+
+class FloatItype(__Itype):
+    name = 'float'
+    subtypes = (pd.Float64Index, float)
+    unique = True
+
+
+# class MultiItype(__Itype):
+#     name = "multi"
+#     subtypes = (pd.MultiIndex, )
+#     unique = ??
+
+
+class NumericItype(__Itype):
+    name = "numeric"
+    _subitypes = (IntegerItype, FloatItype)
+    subtypes = (_subitypes + IntegerItype.subtypes + FloatItype.subtypes)
+    unique = False
+
+
+class MixedItype(__Itype):
+    name = "mixed"
+    unique = False
+    _subitypes = (DatetimeItype, IntegerItype, FloatItype, NumericItype)
+    _otheritypes = (pd.CategoricalIndex, pd.IntervalIndex, pd.PeriodIndex, pd.TimedeltaIndex)
+    subtypes = (_subitypes + _otheritypes + DatetimeItype.subtypes + NumericItype.subtypes)
+
+
+def is_itype(obj, itype):
+    """ Check if obj is an instance of the given itype, or if its str-alias was given"""
+
+    # todo: iter through itype as it could be a tuple, if called like ``is_itype(o, (t1, t2))``
+
+    # user gave an Itype, like ``DatetimeItype``
+    if type(obj) == type and issubclass(obj, itype):
+        return True
+
+    # user gave a string, like 'datetime'
+    if isinstance(obj, str) and obj == itype.name:
+        return True
+
+    return False
+
+
+def is_itype_subtype(obj, itype):
+    """ Check if obj is a subclass, or an instance of a subclass, of the given itype"""
+
+    # user gave a subtype, like ``pd.DatetimeIndex``
+    if type(obj) == type and issubclass(obj, itype.subtypes):
+        return True
+
+    # user gave an instance of a subtype, like ``pd.Series(..).index``
+    if isinstance(obj, itype.subtypes):
+        return True
+
+    return False
+
+
+def is_itype_like(obj, itype):
+    """ Check if obj is a subclass or an instance of the given itype or any of its subtypes"""
+    return is_itype(obj, itype) or is_itype_subtype(obj, itype)
+
+
+def get_itype(obj):
+    """
+    Return the according Itype for any possible user input, like
+     - "datetime"
+     - DatetimeItype
+     - pd.Series(...).index
+     - pd.DatetimeIndex
+    """
+    if type(obj) == type and issubclass(obj, __Itype):
+        return obj
+
+    # check if it is the actual type, not a subtype
+    types = [DatetimeItype, IntegerItype, FloatItype, NumericItype, MixedItype]
+    for t in types:
+        if is_itype(obj, t):
+            return t
+
+    # If the above failed, we try to infer the itype by its subtypes.
+    # We just check the unique types, because the non-unique ones are just
+    # collections of unique subtypes, and would have been detected by any
+    # of the upper if-statements.
+    for t in types:
+        if is_itype_subtype(obj, t) and t.unique:
+            return t
+
+    raise ValueError(f"{obj} is not an itype, nor any known subtype of an itype, nor an itype string alias")
+
+
+def itype_eq(a, b):
+    return is_itype(a, b)
+
+
+def itype_lt(a, b):
+    return is_itype_subtype(a, b)
+
+
+def itype_le(a, b):
+    return is_itype_like(a, b)
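+
+
+# A short sketch of the itype helpers above (all lines hypothetical):
+#
+#   >>> get_itype('datetime') is DatetimeItype        # resolve a str alias
+#   >>> get_itype(pd.DatetimeIndex) is DatetimeItype  # resolve a subtype
+#   >>> itype_le(IntegerItype, NumericItype)          # int is a numeric subtype -> True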
+
+
+def cast_to_itype(series, itype, policy='lossless', err='raise', inplace=False):
+    """ Cast a series (more explicitly: the type of its index) to fit the itype of a dios.
+
+    Return the casted series if successful, None otherwise.
+
+    Note:
+        This is very basic number-casting, so in most cases, information from
+        the old index will be lost after the cast.
+    """
+
+    if policy not in CAST_POLICIES:
+        raise ValueError(f"policy={policy}")
+    if err not in ['raise', 'ignore']:
+        raise ValueError(f"err={err}")
+    if not inplace:
+        series = series.copy()
+    series.itype = get_itype(series.index)
+
+    # An up-cast isn't necessary, because a dios with a higher
+    # itype can always take lower itypes.
+    # series can have dt/int/float/mixed
+    # dt    -> dt    -> mixed
+    # int   -> int   -> num -> mixed
+    # float -> float -> num -> mixed
+    # mixed -> mixed
+    if itype_le(series.itype, itype):  # a <= b
+        return series
+
+    e = f"A series index of type `{type(series.index)}` cannot be cast to Itype {itype.name}"
+
+    # casting any -> dt always fails.
+    if is_itype(itype, DatetimeItype):
+        pass
+    else:
+        e += f", as forbidden by the cast-policy `{policy}`."
+
+    if policy == CAST_POLICIES[CastPolicy.never]:
+        pass
+
+    elif policy == CAST_POLICIES[CastPolicy.force]:
+        # cast any (dt/float/mixed) -> int
+        # cast any (dt/float/mixed) -> num
+        if is_itype(itype, IntegerItype) or is_itype(itype, NumericItype):  # a == b or a == c
+            series.index = pd.RangeIndex(len(series))
+            return series
+        # cast any (dt/int/mixed) -> float
+        if is_itype(itype, FloatItype):  # a == b
+            series.index = pd.Float64Index(range(len(series)))
+            return series
+
+    elif policy == CAST_POLICIES[CastPolicy.lossless]:
+        # cast int -> float
+        if is_itype(itype, FloatItype) and is_itype(series.itype, IntegerItype):  # a == b and c == d
+            series.index = series.index.astype(float)
+            return series
+        # cast float -> int, maybe, if unique
+        if is_itype(itype, IntegerItype) and is_itype(series.itype, FloatItype):  # a == b and c == d
+            series.index = series.index.astype(int)
+            if series.index.is_unique:
+                return series
+            e = f"The cast with policy {policy} from series index type `{type(series.index)}` to " \
+                f"itype {itype.name} resulted in a non-unique index."
+        # cast mixed -> int/float always fails
+
+    if err == 'raise':
+        raise ItypeCastError(e)
+    else:
+        return None
+
+
+def throw_MixedItype_err_or_warn(msg):
+    if dios_options[OptsFields.mixed_itype_warn_policy] in ['ignore', 'silent']:
+        pass
+    elif dios_options[OptsFields.mixed_itype_warn_policy] in ['error', 'err']:
+        raise ItypeCastError(msg)
+    else:
+        warnings.warn(msg, ItypeWarning)
+    return
diff --git a/dios/lib.py b/dios/lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad4ba780d4e91450ae89e457a6a67f3f12053c49
--- /dev/null
+++ b/dios/lib.py
@@ -0,0 +1,59 @@
+import pandas as pd
+import numpy as np
+import contextlib
+# do not import dios-stuff here
+import operator as op
+
+
+# @contextlib.contextmanager
+# def reraise(prefix="", postfix=""):
+#     try:
+#         yield
+#     except Exception as e:
+#         raise type(e)(prefix + str(e) + postfix) from e
+
+
+def get_storage_class_values(cls):
+    return [getattr(cls, c) for c in cls.__dict__ if not c.startswith("_")]
+
+
+_OP1_MAP = {
+    op.inv: "~",
+    op.neg: "-",
+    op.abs: "abs()",
+}
+
+_OP2_COMP_MAP = {
+    op.eq: '==',
+    op.ne: '!=',
+    op.le: '<=',
+    op.ge: '>=',
+    op.gt: '>',
+    op.lt: '<',
+}
+
+_OP2_BOOL_MAP = {
+    op.and_: "&",
+    op.or_: "|",
+    op.xor: "^",
+}
+
+_OP2_ARITH_MAP = {
+    op.add: "+",
+    op.sub: "-",
+    op.mul: "*",
+    op.pow: "**",
+}
+
+_OP2_DIV_MAP = {
+    op.mod: "%",
+    op.truediv: "/",
+    op.floordiv: "//",
+}
+
+OP_MAP = _OP2_COMP_MAP.copy()
+OP_MAP.update(_OP2_BOOL_MAP)
+OP_MAP.update(_OP2_ARITH_MAP)
+OP_MAP.update(_OP2_DIV_MAP)
+OP_MAP.update(_OP1_MAP)
diff --git a/dios/options.py b/dios/options.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae735ea2d5740020a48b86fec9f0dc216952545e
--- /dev/null
+++ b/dios/options.py
@@ -0,0 +1,74 @@
+# do not import dios-stuff here
+
+
+class OptsFields:
+    """storage class for the keys in ``dios_options``"""
+
+    """Set the number of rows and variables to display in a call that uses
+    ``__repr__`` or ``__str__``, like e.g. ``print(dios)`` does."""
+    disp_max_rows = "disp_max_rows"
+    disp_max_vars = "disp_max_vars"
+
+    """
+    none_plus:    none or more columns than in self may be given
+    at_least_one: accept if at least one given column is present in self
+    all_present:  all given columns must be present in self
+    """
+    col_indexing_method = "col_indexing_method"
+
+    mixed_itype_warn_policy = "mixed_itype_policy"
+
+    """
+    Should NaNs be dropped during comparison (drop),
+    stay NaNs (keep), or be compared (nplike)?
+    nplike is quite silly, as `5 < NaN` will simply evaluate to False."""
+    comparison_nan_policy = "comparison_nan_policy"
+
+    """
+    Get item nan policy:
+
+    """
+
+
+class Opts:
+    none_plus = 'none_plus'
+    at_least_one = 'at_least_one'
+    all_present = 'all_present'
+
+    itype_warn = 'warn'
+    itype_err = 'err'
+    itype_ignore = 'ignore'
+
+    keep_nans = 'keep'
+    nplike_nans = 'nplike'
+    drop_nans = 'drop'
+
+
+# set default values
+dios_options = {
+    OptsFields.disp_max_rows: 10,
+    OptsFields.disp_max_vars: 4,
+    OptsFields.col_indexing_method: Opts.none_plus,
+    OptsFields.mixed_itype_warn_policy: Opts.itype_ignore,
+    OptsFields.comparison_nan_policy: Opts.keep_nans,
+}
+
+
+def check_keys_by_policy(check, keys, policy):
+
+    filtered = [k for k in check if k in keys]
+    if policy == Opts.none_plus:
+        pass
+
+    elif policy == Opts.at_least_one:
+        if not filtered:
+            raise KeyError("Policy says: at least one key must be shared.")
+
+    elif policy == Opts.all_present:
+        fail = set(filtered).symmetric_difference(set(check))
+        if fail:
+            raise KeyError(f"Unknown keys {fail}. 
Policy says: all given keys must be known.") + else: + raise ValueError(policy) + + return filtered diff --git a/dios/profiling/__init__.py b/dios/profiling/__init__.py deleted file mode 100644 index 139597f9cb07c5d48bed18984ec4747f4b4f3438..0000000000000000000000000000000000000000 --- a/dios/profiling/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/profiling/__init__.py b/profiling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..34c8fe901fd5efd3428df1304e8159bc5f173c3c --- /dev/null +++ b/profiling/__init__.py @@ -0,0 +1,3 @@ +from .generate_testsets import * +from profiling.performance import find_index_range, gen_random_timestamps + diff --git a/dios/profiling/generate_testsets.py b/profiling/generate_testsets.py similarity index 72% rename from dios/profiling/generate_testsets.py rename to profiling/generate_testsets.py index df2d97e8a2fe34392f400336ecaa001c27f667ca..9d95cc3405fa7be21ab474302cd237f3e7a54489 100644 --- a/dios/profiling/generate_testsets.py +++ b/profiling/generate_testsets.py @@ -44,32 +44,41 @@ def _gen_testset(rowsz, colsz, freq='1min', disalign=True, randstart=True): return df, dos -def _gen_df(rowsz, colsz, freq='1min', disalign=True, randstart=True): - df, _ = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart) - return df +def get_random_df_and_dios(rowsz, colsz, freq='1min', disalign=True, randstart=True): + df, _, _, dios, *_ = get_testset(rowsz, colsz, freq=freq, disalign=disalign, randstart=randstart) + return df, dios -def gen_dos(rowsz, colsz, freq='1min', disalign=True, randstart=True): - _, dos = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart) - return dos +def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir=None, noresult=False): + if storagedir is None: + storagedir = os.path.dirname(__file__) + storagedir = os.path.join(storagedir, 'testsets') - -def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir='testsets', noresult=False): fname = f'set_f{freq}_d{disalign}_r{randstart}_dim{rows}x{cols}.pkl' fpath = os.path.join(storagedir, fname) + + # try to get pickled data try: with open(fpath, 'rb') as fh: if noresult: return tup = pickle.load(fh) + + # file/data was present + return tup except (pickle.UnpicklingError, FileNotFoundError): - df_, dos_ = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart) - df_ = df_.sort_index(axis=0, level=0) - a_ = df_.copy().stack(dropna=False).sort_index(axis=0, level=0).copy() - b_ = df_.copy().unstack().sort_index(axis=0, level=0).copy() - tup = df_, a_, b_, dos_ - with open(fpath, 'wb') as fh: - pickle.dump(tup, fh) + pass + + # generate testset(s) + df, dios = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart) + df = df.sort_index(axis=0, level=0) + df_type_a = df.copy().stack(dropna=False).sort_index(axis=0, level=0).copy() + df_type_b = df.copy().unstack().sort_index(axis=0, level=0).copy() + tup = df, df_type_a, df_type_b, dios + + # store testsets + with open(fpath, 'wb') as fh: + pickle.dump(tup, fh) if noresult: return diff --git a/dios/profiling/memory.py b/profiling/memory.py similarity index 72% rename from dios/profiling/memory.py rename to profiling/memory.py index 33539ee5839cbff631d24014410d86fa69d6b719..d577464ea22b2e51416a5ac452393a5d3e23f14c 100644 --- a/dios/profiling/memory.py +++ b/profiling/memory.py @@ -1,5 +1,5 @@ import gc -from 
dios.profiling.generate_testsets import get_testset, _gen_testset +from profiling.generate_testsets import get_random_df_and_dios def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)): @@ -36,7 +36,7 @@ def rows_by_time(nsec, mdays): if __name__ == '__main__': - # dos - linear in rows and colums, same size for r=10,c=100 or r=100,c=10 + # dios - linear in rows and colums, same size for r=10,c=100 or r=100,c=10 do_real_check = True cols = 10 rows = 100000 @@ -45,14 +45,14 @@ if __name__ == '__main__': mem = calc_mem(rows, cols, shifted=False) memsh = calc_mem(rows, cols, shifted=True) - df, _, _, dos = get_testset(rows, cols, disalign=False, randstart=True) - dos_mem = dos.memory_usage() - print(f"dos:\n-----------") - print("mem: ", *bytes2hread(dos_mem)) - print("entries:", sum([len(dos[e]) for e in dos])) + df, dios = get_random_df_and_dios(rows, cols, disalign=False, randstart=True) + dios_mem = dios.memory_usage() + print(f"dios:\n-----------") + print("mem: ", *bytes2hread(dios_mem)) + print("entries:", sum([len(dios[e]) for e in dios])) print() - ratio = (1 / (memsh - mem) ) * dos_mem + ratio = (1 / (memsh - mem) ) * dios_mem mem = bytes2hread(mem) memsh = bytes2hread(memsh) @@ -66,7 +66,7 @@ if __name__ == '__main__': print("entries:", rows * cols) print() - print(f"dfbest, dos, dfworst: 0%, {round(ratio, 4)*100}%, 100% ") + print(f"dfbest, dios, dfworst: 0%, {round(ratio, 4)*100}%, 100% ") if not do_real_check: exit(0) @@ -77,7 +77,7 @@ if __name__ == '__main__': # best case print() print('best case proove') - dfb, _ = _gen_testset(rows, cols, disalign=False, randstart=False) + dfb, _ = get_random_df_and_dios(rows, cols, disalign=False, randstart=False) dfb.info(memory_usage='deep', verbose=False) print() @@ -87,7 +87,7 @@ if __name__ == '__main__': print() print('rand start, rand freq') - df, _ = get_testset(rows, cols, disalign='random', randstart=True) + df, _ = get_random_df_and_dios(rows, cols, disalign='random', randstart=True) df.info(memory_usage='deep', verbose=False) print("entries:", sum([len(df[e]) for e in df])) @@ -95,7 +95,7 @@ if __name__ == '__main__': # worst case print() print('worst case proove') - df, _ = _gen_testset(rows, cols, disalign=True, randstart=False) + df, _ = get_random_df_and_dios(rows, cols, disalign=True, randstart=False) df.info(memory_usage='deep', verbose=False) gc.collect() diff --git a/dios/profiling/performance.py b/profiling/performance.py similarity index 93% rename from dios/profiling/performance.py rename to profiling/performance.py index ab9a3a91f681f56db25aaa36a34c9d429fa8b491..1be82e8fb98d1d96e1615e9a3d11bb21984274d5 100644 --- a/dios/profiling/performance.py +++ b/profiling/performance.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np import time -from dios.profiling.generate_testsets import get_testset, var_prefix +from profiling.generate_testsets import get_testset, var_prefix profile_assignment = False @@ -61,20 +61,20 @@ def b_timings(df, t0, t1, v1, v2): return a, b, df -def dos_timings(dos, t0, t1, v1, v2): +def dios_timings(dios, t0, t1, v1, v2): _t0 = time.time() - a = dos[t0:t1, :] + a = dios[t0:t1, :] _t1 = time.time() - b = dos[:, v1] + b = dios[:, v1] _t2 = time.time() if profile_assignment: - dos[t0:t1, v1] = dos[t0:t1, v1] * 1111 + dios[t0:t1, v1] = dios[t0:t1, v1] * 1111 _t3 = time.time() timingsdf.at[rows, ('ts', 'dios')] += _t1 - _t0 timingsdf.at[rows, ('var', 'dios')] += _t2 - _t1 timingsdf.at[rows, ('ass', 'dios')] += _t3 - _t2 - return a, b, dos + return a, b, dios def gen_random_timestamps(m, M): @@ 
-116,7 +116,7 @@ if __name__ == '__main__': use_df = True use_a = False use_b = False - use_dos = True + use_dios = True # plot options normalize_to_df = False @@ -132,7 +132,7 @@ if __name__ == '__main__': timingsdf.loc[rows] = (0,) * len(timingsdf.columns) - df, a, b, dos = get_testset(rows, cols) + df, a, b, dios = get_testset(rows, cols) t0, t4 = find_index_range(df) if use_df or normalize_to_df: @@ -153,11 +153,11 @@ if __name__ == '__main__': vr1 = var_prefix + str(np.random.randint(0, cols)) b_timings(b, t1, t2, vr1, None) - if use_dos: + if use_dios: for r in range(runs): t1, t2 = gen_random_timestamps(t0, t4) vr1 = var_prefix + str(np.random.randint(0, cols)) - dos_timings(dos, t1, t2, vr1, None) + dios_timings(dios, t1, t2, vr1, None) # calc the average timingsdf /= runs @@ -198,7 +198,7 @@ if __name__ == '__main__': a.plot(logy=plot_ylog, logx=plot_xlog, linestyle='--', ax=ax) if use_b: b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=':', ax=ax) - if use_dos: + if use_dios: dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-.', ax=ax) plt.show() diff --git a/dios/profiling/testsets/.gitignore b/profiling/testsets/.gitignore similarity index 100% rename from dios/profiling/testsets/.gitignore rename to profiling/testsets/.gitignore diff --git a/tests/__init__.py b/test/__init__.py similarity index 100% rename from tests/__init__.py rename to test/__init__.py diff --git a/test/run_dios.py b/test/run_dios.py new file mode 100644 index 0000000000000000000000000000000000000000..eaf78ab58adb164665fd26c5ddadd6e1ca5924eb --- /dev/null +++ b/test/run_dios.py @@ -0,0 +1,45 @@ + +from dios import * +import numpy as np + +if __name__ == '__main__': + # dios_options[OptsFields.mixed_itype_policy] = 'error' + + # df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01')) + # df[[True, False]] + + dios = DictOfSeries(data=[234.54, 5, 5, 4, np.nan, 5, 4, 5]) + + dios = abs(~dios) + + print(all(dios == dios)) + + dtser = pd.Series([2,4,4123,122,4], index=pd.date_range(freq='1d', periods=5, start='2000-01-01')) + dios['b'] = dtser + dios2 = dios.copy() + + a = dios.loc[:] + df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01')) + dios == df + a = df.iloc[:,0] + print(dios) + exit(4) + + dios.columns = ['foo', 'bar'] + + for k in reversed(dios): + print(k, dios[k], "\n") + + exit(99) + + dios.squeeze() + print(dios) + dd = dios + dios + dios.pipe(pd.Series.squeeze) + print() + print(dd) + # dios.dropna(inplace=True) + # print(dios) + ts = None + dios['var'] + dios['var', ts:ts] diff --git a/test/test__getsetitem__.py b/test/test__getsetitem__.py new file mode 100644 index 0000000000000000000000000000000000000000..56a2d7ff0a30d965c2da68bb0cdfb695d4afc51b --- /dev/null +++ b/test/test__getsetitem__.py @@ -0,0 +1,104 @@ +from dios import * +import pytest + +s1 = pd.Series(range(10), index=range(10)) +s2 = pd.Series(range(5, 10), index=range(5, 10)) +s3 = pd.Series(range(1, 30, 2), index=range(1, 30, 2)) +s4 = pd.Series(np.linspace(7, 13, 9), index=range(3, 12)) +s1.name, s2.name, s3.name, s4.name = 'a', 'b', 'c', 'd' +d1 = DictOfSeries(data=dict(a=s1.copy(), b=s2.copy(), c=s3.copy(), d=s4.copy())) + + +@pytest.mark.parametrize(('idxer', 'exp'), [('a', s1), ('c', s3)]) +def test__getitem_single(idxer, exp): + a = d1[idxer] + b = d1.loc[:, idxer] + assert isinstance(a, pd.Series) + assert isinstance(b, pd.Series) + assert (a == exp).all() + assert (b == exp).all() + + +@pytest.mark.parametrize(('idxer', 'exp'), [((1, 
'a'), s1), ((3, 'c'), s3)]) +def test__getitem_scalar_loc(idxer, exp): + a = d1.loc[idxer] + assert is_scalar(a) + assert a == exp.loc[idxer[0]] + + +@pytest.mark.parametrize(('idxer', 'exp'), [(0, s1), (1, s2), (2, s3), (3, s4), + (-1, s4), (-2, s3), (-3, s2), (-4, s1)]) +def test__getitem_single_iloc(idxer, exp): + a = d1.iloc[:, idxer] + assert isinstance(a, pd.Series) + assert (a == exp).all() + + +@pytest.mark.parametrize(('idxer', 'exp'), [((1,0), s1), ((3,-2), s3), ((-1,-1), s4)]) +def test__getitem_scalar_iloc(idxer, exp): + a = d1.iloc[idxer] + assert is_scalar(a) + assert a == exp.iloc[idxer[0]] + +@pytest.mark.parametrize('idxer', ['x', '1', 1, None, ]) +def test__getitem_single_fail(idxer): + with pytest.raises(KeyError): + a = d1[idxer] + + +@pytest.mark.parametrize('idxer', ['x', '1', 1, None, ]) +def test__getitem_single_loc_fail(idxer): + with pytest.raises(KeyError): + a = d1.loc[:, idxer] + + +@pytest.mark.parametrize('idxer', [-5, 99, 'a', '2', ]) +def test__getitem_single_iloc_fail(idxer): + with pytest.raises(KeyError): + a = d1.iloc[:, idxer] + + +BLIST = [True, False, False, True] +BOOLIDXER = [BLIST, pd.Series(BLIST), d1.copy() > 10] +INDEXERS = [['a'], ['a', 'c'], pd.Series(['a', 'c'])] + BOOLIDXER + + +@pytest.mark.parametrize('idxer', INDEXERS) +def test__getitem__(idxer): + print() + print(d1.values) + + d = d1[idxer] + assert isinstance(d, DictOfSeries) + +# return +# d = d1[['a', 'b']] +# assert isinstance(d, DictOfSeries) +# a = d['a'] +# b = d['b'] +# assert (a == s1).all() +# assert (b == s2).all() +# +# l, r = 1, 7 +# d = d1[l:r] +# assert isinstance(d, DictOfSeries) +# for s in [s1, s2, s3, s4]: +# exp = s[l:r] +# assert (d[s.name] == exp).all() +# +# d = d1.loc[l:r] +# assert isinstance(d, DictOfSeries) +# for s in [s1, s2, s3, s4]: +# exp = s.loc[l:r] +# assert (d[s.name] == exp).all() +# +# d = d1.iloc[l:r] +# assert isinstance(d, DictOfSeries) +# for s in [s1, s2, s3, s4]: +# exp = s.iloc[l:r] +# assert (d[s.name] == exp).all() +# +# +# if __name__ == '__main__': +# for i in INDEXERS: +# test__geting_items(i) diff --git a/test/test_df_like.py b/test/test_df_like.py new file mode 100644 index 0000000000000000000000000000000000000000..d21f18b3bdaf4c91fcfa9bde4ed21a3904b314bd --- /dev/null +++ b/test/test_df_like.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +import pytest +from dios import * + +import pandas as pd +import numpy as np +from copy import deepcopy + + +__author__ = "Bert Palm" +__email__ = "bert.palm@ufz.de" +__copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ" + + +arr = np.random.rand(8) +TESTDATA = [ + None, # empty + arr.copy(), # list + # np.array([arr.copy(), arr.copy(), arr.copy()]), # nested list + dict(a=arr.copy(), b=arr.copy()), # dict +] + + +@pytest.mark.parametrize("data", TESTDATA) +@pytest.mark.parametrize("with_column_param", [False, True]) +def test_dios_create(data, with_column_param): + + if is_dict_like(data) and with_column_param: + # giving column names in dict-keys and in columns-parameter is special in df + pytest.skip() + + data_copy0 = deepcopy(data) + data_copy1 = deepcopy(data) + if with_column_param: + df = pd.DataFrame(data=data_copy0) + col = [f"new_{c}" for c in df] + else: + col = None + + if is_nested_list_like(data): + # giving nested lists, work different between df and dios + data_copy1 = data_copy1.transpose() + + df = pd.DataFrame(data=data_copy1, columns=col) + dios = DictOfSeries(data=data_copy0, columns=col) + + assert len(dios.columns) == len(df.columns) + assert 
np.all(dios.values == df.values) + + # df columns may not be strings, but dios'es are always + columns = [str(c) for c in df.columns] + assert list(dios.columns) == columns + + for c in df.columns: + assert np.all(dios[str(c)] == df[c]) + diff --git a/test/test_dios.py b/test/test_dios.py new file mode 100644 index 0000000000000000000000000000000000000000..77f884c09025ae33f1d4c8e79f37ba827e3b6d3d --- /dev/null +++ b/test/test_dios.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python +import pytest +from dios import * +from dios.lib import _OP1_MAP, _OP2_DIV_MAP, _OP2_ARITH_MAP, _OP2_BOOL_MAP, _OP2_COMP_MAP + +import pandas as pd +import numpy as np +from copy import deepcopy +from pandas.tests.series.conftest import datetime_series + +__author__ = "Bert Palm" +__email__ = "bert.palm@ufz.de" +__copyright__ = "Copyright 2018, Helmholtz-Centrum für Umweltforschung GmbH - UFC" + + +def get_dios(ser, i): + dios = DictOfSeries() + for i in range(i): + dios[f'c{i}'] = ser.copy() * (i + 1) // 2 + return dios + + +EE = [[], [], []] + +O = [[0, 0, 0], [0, 0, 0], [0]] +I = [[1, 1, 1], [1, 1, 1], [1]] +# A = [[1, 2, 3], [4, 5, 6]] +# B = [[0, 2, 2], [5, 5, 5]] +# C = [[3, 2, 0], [1, 0, 3]] +# D = [[6, 5, 4], [3, 2, 1]] +A = [[1], [2, 3], [4, 5, 6]] +B = [[0, 2, 2], [5], [5, 5]] +C = [[3, 2, 0], [1, 0, 3], [0,0,0]] +D = [[6], [2], [9]] + + +DATA23 = [O, I, A, B, C, D] +ALL = [EE, O, I, A, B, C, D] + + +OP2 = list(_OP2_ARITH_MAP) + list(_OP2_COMP_MAP) + list(_OP2_BOOL_MAP) + list(_OP2_DIV_MAP) +OP1 = list(_OP1_MAP) + + +def _diosmatr(mlist): + l = [] + for m in mlist: + m = np.array(m) + l.append(DictOfSeries(m.copy())) + return tuple(l) + + +@pytest.fixture() +def dictofseries(datetime_series): + return get_dios(datetime_series, 5) + + +def test_len__len__(datetime_series, maxlen=10): + dios = DictOfSeries() + assert len(dios) == 0 + + for i in range(maxlen): + dios[f'c{i}'] = datetime_series.copy() + assert len(dios) == i + 1 + + for i in reversed(range(maxlen)): + assert len(dios) == i + 1 + del dios[f'c{i}'] + + assert len(dios) == 0 + + +def test_copy_copy_empty__copy__(dictofseries): + dios = dictofseries.copy() + shallow = dios.copy(deep=False) + deep = dios.copy(deep=True) + empty = dios.copy_empty() + + assert dios is not shallow + assert dios is not deep + assert dios is not empty + + assert dios.itype == shallow.itype + assert dios.itype == deep.itype + assert dios.itype == empty.itype + + for i in dios: + assert dios[i].index is shallow[i].index + assert dios[i].index is not deep[i].index + dios[i][0] = 999999 + assert dios[i][0] == shallow[i][0] + assert dios[i][0] != deep[i][0] + + +@pytest.mark.parametrize('left', _diosmatr(DATA23)) +@pytest.mark.parametrize('right', _diosmatr(DATA23)) +def test__eq__(left, right): + a, b = left, right + _test = a == b + for c in _test: + for i in _test[c].index: + res = (_test[c])[i] + e1 = a[c][i] + e2 = b[c][i] + exp = e1 == e2 + assert res == exp + + +@pytest.mark.parametrize('left', _diosmatr(DATA23)) +@pytest.mark.parametrize('right', _diosmatr(DATA23)) +@pytest.mark.parametrize('op', _OP2_COMP_MAP) +def test_all(left, right, op): + # op's are only comparators + a = left + ser = (op(a,a)).all() + res = [e for e in ser] + exp = [(op(a[col],a[col])).all() for col in a] + for i in range(len(res)): + assert res[i] == exp[i] + + + +@pytest.mark.parametrize('left', _diosmatr(ALL)) +@pytest.mark.parametrize('right', _diosmatr(ALL)) +@pytest.mark.parametrize('op', OP2) +def test__op2__(left, right, op): + try: + a, b = left, right + test = op(a, b) + for c in test: 
+            for j in test[c].index:
+                exp = op(a[c][j], b[c][j])
+                res = test[c][j]
+                if not np.isfinite(res):
+                    print(f"\n\nres: {res}, exp: {exp}, op: {op.__name__}")
+                    pytest.skip('test does not support non-finite values')
+                assert res == exp
+
+    except ZeroDivisionError:
+        pytest.skip('ZeroDivisionError')
+
+
+@pytest.mark.parametrize('data', _diosmatr(ALL))
+@pytest.mark.parametrize('op', OP1)
+def test__op1__(data, op):
+    test = op(data)
+    res = [entry for col in test for entry in test[col]]
+    e = [entry for col in data for entry in data[col]]
+    for i in range(len(res)):
+        exp = op(e[i])
+        assert res[i] == exp
diff --git a/tests/tests.py b/test/test_dios_old.py
similarity index 69%
rename from tests/tests.py
rename to test/test_dios_old.py
index 2223dfb616d4afd18542c31faaa235092d94eb73..b36aa35f8231b64eb062c169169fc7f96ad0888d 100644
--- a/tests/tests.py
+++ b/test/test_dios_old.py
@@ -1,4 +1,12 @@
 from dios import *
+from profiling import *
+import pandas as pd
+import datetime as dt
+import numpy as np
+import pytest
+
+# pytestmark = pytest.mark.skip
+
 
 v0 = 'var0'
 v1 = 'var1'
@@ -30,15 +38,15 @@ def test_getitem():
     begin = dt.datetime.strptime("2000-01-10 00:00:00", "%Y-%m-%d %H:%M:%S")
     t0 = begin + pd.Timedelta('20s')
     t1 = t0 + pd.Timedelta('50s')
-    dos_aligned = DictOfSeries()
-    dos_rand = DictOfSeries()
+    dios_aligned = DictOfSeries()
+    dios_rand = DictOfSeries()
 
     # fill
     vars = 10
     rows = 10
     for i in range(0, vars):
-        dos_aligned[f'var{i}'] = gen_series(rows, randomize=False)
-        dos_rand[f'var{i}'] = gen_series(rows, randomize=True)
+        dios_aligned[f'var{i}'] = gen_series(rows, randomize=False)
+        dios_rand[f'var{i}'] = gen_series(rows, randomize=True)
 
     # testsets
     var = [v0, [], [v1], [v0, v0], [v0, v2]]
@@ -47,20 +55,20 @@
     for v in var:
         print(v)
-        dos_rand[v]
+        dios_rand[v]
 
     # use aligned dios for time stamps instead of time-ranges
     for t in tssl:
         print(t)
-        dos_rand[v]
+        dios_rand[t]
 
     try:
-        dos_aligned[v1, v2]
+        dios_aligned[v1, v2]
     except KeyError:
         pass
 
     try:
-        dos_aligned[v1, v2, v3]
+        dios_aligned[v1, v2, v3]
     except KeyError:
         pass
@@ -70,59 +78,59 @@ def test_setitem():
     begin = dt.datetime.strptime("2000-01-10 00:00:00", "%Y-%m-%d %H:%M:%S")
     t0 = begin + pd.Timedelta('30s')
     t1 = t0 + pd.Timedelta('50s')
-    dos_aligned = DictOfSeries()
-    dos_aligned.name = 'aligned'
-    dos_rand = DictOfSeries()
-    dos_rand.name = 'rand'
+    dios_aligned = DictOfSeries()
+    dios_aligned.name = 'aligned'
+    dios_rand = DictOfSeries()
+    dios_rand.name = 'rand'
 
     # fill
     vars = 10
     rows = 100
    for i in range(0, vars):
-        dos_aligned[f'var{i}'] = gen_series(rows, randomize=False)
-        dos_rand[f'var{i}'] = gen_series(rows, randomize=True)
+        dios_aligned[f'var{i}'] = gen_series(rows, randomize=False)
+        dios_rand[f'var{i}'] = gen_series(rows, randomize=True)
 
     # testsets
     keys = [v0, [v1], [v0, v0], [v0, v2]]
     tssl = [slice(None), slice(t0, None), slice(None, t1), slice(t0, t1), slice(t0, t0)]
     scalars = [1, 'deadbeef']
     l = list(np.random.randint(0, 100, rows))
-    dos = [dos_aligned.copy(), dos_rand.copy()]
-    for d in dos:
+    dios = [dios_aligned.copy(), dios_rand.copy()]
+    for d in dios:
         d.name = 'src-dios'
 
     # assign scalars
     for val in scalars:
         for v in keys:
             print(v, '=', val)
-            dos_rand[v] = val
+            dios_rand[v] = val
 
     # assign scalars to row slices
     for val in scalars:
         for t in tssl:
             print(t, '=', val)
-            dos_rand[t] = val
+            dios_rand[t] = val
 
     # assign list
     for v in keys:
         print(v, '=', l)
-        dos_aligned[v] = l
+        dios_aligned[v] = l
 
     # assign series
     for v in keys:
         print(v, '=', 'series')
-        dos_aligned[v] = dos_aligned[v4]
+        dios_aligned[v] = dios_aligned[v4]
 
     # assign dios
     for v in keys:
-        for d in dos:
+        for d in dios:
             print(f'{v} = dios[{v}]')
-            dos_aligned[v] = d[v]
+            dios_aligned[v] = d[v]
 
     # probkeys = [[], slice(v0, v0), ]
     # for v in probkeys:
     #     try:
-    #         dos_aligned[v] = l
+    #         dios_aligned[v] = l
     #     except ValueError:
     #         pass
     #     else:
@@ -132,76 +140,76 @@ def test_integrity():
     rows = 1000
     cols = 10
-    df, _, _, dos = get_testset(1000, 10, storagedir='../dios/profiling/testsets')
+    df, _, _, dios = get_testset(rows, cols)
     v = var_prefix + str(np.random.randint(0, cols))
-    t = find_index_range(dos)
+    t = find_index_range(dios)
     t0, t1 = gen_random_timestamps(*t)
 
     # originals
-    dos_ = dos.copy(deep=False)
+    dios_ = dios.copy(deep=False)
     df_ = df.copy()
     s_ = df[v].dropna()
 
     # identity
-    assert dos_ == dos
-    assert (dos_[v] == dos[v]).all()
-    assert dos_ is not dos
-    assert dos_[v] is not dos[v]
+    assert (dios_ == dios).all().all()
+    assert (dios_[v] == dios[v]).all()
+    assert dios_ is not dios
+    assert dios_[v] is not dios[v]
 
     # equal to df
-    assert dos[v] is not df[v]
-    assert (dos[v] == df[v].dropna()).all()
+    assert dios[v] is not df[v]
+    assert (dios[v] == df[v].dropna()).all()
 
     # write
-    dos = dos_.copy()
+    dios = dios_.copy()
     s = s_.copy()
-    dos[t0:t1] = 4
+    dios[t0:t1] = 4
     s[t0:t1] = 4
-    assert (dos[v] == s).all()
+    assert (dios[v] == s).all()
 
     # write all
     s = s_.copy()
-    dos = dos_.copy()
-    dos[v] = 111
+    dios = dios_.copy()
+    dios[v] = 111
     s[:] = 111
-    assert (dos[v] == s).all()
+    assert (dios[v] == s).all()
 
     # multi variables - slice
     df = df_.copy()
-    dos = dos_.copy()
+    dios = dios_.copy()
     li = [v0, v1, v2]
-    dos[t0:t1] = 222
+    dios[t0:t1] = 222
     for x in li:
         s = df[x].dropna()
         s[t0:t1] = 222
-        assert (dos[x] == s).all()
+        assert (dios[x] == s).all()
 
     # on all
-    dos[t0:t1] = 222.111
+    dios[t0:t1] = 222.111
     m = df.loc[t0:t1, :].notna()
     df[m] = 222.111
     for x in df:
         s = df[x].dropna()
-        assert (dos[x] == s).all()
+        assert (dios[x] == s).all()
 
     # multi variables - list
     df = df_.copy()
-    dos = dos_.copy()
+    dios = dios_.copy()
     li = [v0, v5, v3, v9]
-    dos[t0:t1] = 333
+    dios[t0:t1] = 333
     for x in li:
         s = df[x].dropna()
         s[t0:t1] = 333
-        assert (dos[x] == s).all()
+        assert (dios[x] == s).all()
 
-    # dos to dos
+    # dios to dios
     df = df_.copy()
-    dos = dos_.copy()
-    dos[v] = 444
-    dos[v5] = dos[v] * 0.1
+    dios = dios_.copy()
+    dios[v] = 444
+    dios[v5] = dios[v] * 0.1
     s = df[v].dropna()
     s[:] = 444 * 0.1
-    assert (dos[v5] == s).all()
+    assert (dios[v5] == s).all()
 
 
 def test_foreach():
@@ -237,7 +245,7 @@ def test_foreach():
     d1 = dios.copy()
     d2 = d.pipe(pd.Series.drop_duplicates, inplace=False)
     # original didn't change
-    assert d is not d1 and d == d1
+    assert d is not d1 and (d == d1).all().all()
     # tests on d2
     assert d2 is not d
     assert isinstance(d2, DictOfSeries)
@@ -261,30 +269,30 @@ def tmptest():
     begin = dt.datetime.strptime("2000-01-10 00:00:00", "%Y-%m-%d %H:%M:%S")
     t0 = begin + pd.Timedelta('20s')
     t1 = t0 + pd.Timedelta('50s')
-    dosa = DictOfSeries()
-    dosr = DictOfSeries()
+    diosa = DictOfSeries()
+    diosr = DictOfSeries()
 
     # fill
     vars = 10
     rows = 8
     for i in range(0, vars):
-        dosa[f'v{i}'] = gen_series(rows, randomize=False)
-        dosr[f'v{i}'] = gen_series(rows, randomize=True)
+        diosa[f'v{i}'] = gen_series(rows, randomize=False)
+        diosr[f'v{i}'] = gen_series(rows, randomize=True)
 
-    dosa[[], slice(None)] = 99
+    diosa[[], slice(None)] = 99
     exit(9)
 
-    print(dosa)
-    d = dosa[t0:t1, v1]
+    print(diosa)
+    d = diosa[t0:t1, v1]
     d = d * 1000
     print(d)
     print()
-    print(dosa[t0:t1, v1])
-    l1 = len(dosa[t0:t1, v1])
+    print(diosa[t0:t1, v1])
+    l1 = len(diosa[t0:t1, v1])
     l2 = len(d)
     print(l1, l2)
-    dosa[t0:t1, v1] = d
-    print(dosa[v1])
+    diosa[t0:t1, v1] = d
+    print(diosa[v1])
 
     exit(3)
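
Note on the assertion changes above (reviewer sketch, not part of the patch): `==` on a DictOfSeries is element-wise and returns a boolean DictOfSeries, so a bare `assert dios1 == dios2` no longer reduces to a single truth value. The tests therefore reduce twice, first per column, then across columns. A minimal sketch of that idiom, assuming only the constructor, `copy()`, and `all()` behaviour exercised by the tests in this patch:

    import pandas as pd
    from dios import DictOfSeries

    a = DictOfSeries()
    a['x'] = pd.Series([1, 2, 3])
    a['y'] = pd.Series([4, 5])
    b = a.copy(deep=True)

    eq = a == b              # element-wise: a DictOfSeries of booleans
    per_column = eq.all()    # reduces each column: one boolean per column
    assert per_column.all()  # reduces over columns: a single boolean

For single-column results (plain pd.Series, as returned by `dios['x']`), one `.all()` already yields a scalar, which is why the series-level asserts in test_integrity() keep the single reduction.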