diff --git a/dios/dios.py b/dios/dios.py
index 35efd8a05b7a29d8b1bd0135cb673c322c2c6854..17755a2ce0482510d8ddbf82b83bb858bc953e77 100644
--- a/dios/dios.py
+++ b/dios/dios.py
@@ -18,6 +18,7 @@ from pandas.core.dtypes.common import (
     is_integer,
     is_dict_like,
     is_number,
+    is_hashable,
 )
 from pandas.core.dtypes.common import is_iterator as _is_iterator
 
@@ -48,6 +49,16 @@ def is_iterator(obj):
     return _is_iterator(obj)
 
 
+def align(s1, s2, method='dropna'):
+    if method == 'keepna':
+        s = s1.reindex_like(s2)
+    elif method == 'dropna':
+        s = s1.reindex_like(s2).dropna()
+    else:
+        raise ValueError(method)
+    return s
+
+
 class DictOfSeries:
     """
     DictionaryOfSeries is a collection of pd.Series's which aim to be as close as possible similar to
@@ -82,80 +93,61 @@
 
     def __init__(self, data=None, columns=None, itype=MixedItype, downcast_policy='lossless'):
-        self._data = OrderedDict()
+        def to_ser(item):
+            if not isinstance(item, pd.Series):
+                return pd.Series(item)
+            return item
 
-        # We need to keep track of the index-type (itype) of every new Series.
-        # If the itypes differ between different series, slicing will almost always fail
-        # (eg. a datetime-like slice cannot work on a numeric index and vice versa).
-        self._itype = None
-        self.itype = get_itype(itype)
+        self._data = pd.Series()
+        if data is not None:
+            if isinstance(data, dict):
+                for k in data:
+                    self._data.loc[k] = to_ser(data[k])
+            elif is_list_like(data):
+                data = data if is_nested_list_like(data) else [data]
+                for i, d in enumerate(data):
+                    self._data.loc[i] = to_ser(d)
+            else:
+                self._data.loc[0] = pd.Series(data)
+
+        if columns is not None:
+            self.columns = columns
 
         if downcast_policy not in CAST_POLICIES:
             raise ValueError(f"downcast_policy must be one of {CAST_POLICIES}")
         self._policy = downcast_policy
 
-        if data is not None:
-            self.__init_insert_data__(data)
-
-        # we use the columns.setter to make all necessary checks
-        if columns is not None:
-            self.columns = columns
+        # We need to keep track of the index-type (itype) of every new Series.
+        # If the itypes differ between different series, slicing will almost always fail
+        # (eg. a datetime-like slice cannot work on a numeric index and vice versa).
+        self._itype = get_itype(itype)
 
-    def __init_insert_data__(self, data):
-        if isinstance(data, DictOfSeries):
-            g = ((k, data[k]) for k in data)
-        else:
-            data = list(data) if is_iterator(data) else data
-            if is_dict_like(data):
-                g = ((k, data[k]) for k in data)
-            elif is_nested_list_like(data):
-                g = ((str(i), d) for i, d in enumerate(data))
-            elif is_list_like(data):
-                g = [('0', data)]
-            else:
-                raise ValueError(f"init with data of type {type(data)} is not possible.")
-        for k, val in g:
-            self[k] = val
-        return
+        for s in self._data:
+            cast_to_itype(s, self._itype, policy=self._policy, inplace=True)
 
     @property
     def columns(self):
-        return list(self._data.keys())
+        return self._data.index
 
     @columns.setter
-    def columns(self, new):
-        if not isinstance(new, list):
-            raise TypeError("column names must be given as a list")
-
-        if len(set(new)) != len(new):
-            raise ValueError("column names must be unique")
-
-        if len(new) != len(self.columns):
-            raise ValueError(f"Length mismatch: Columns has {len(self.columns)} elements, "
-                             f"new values have {len(new)} elements")
-
-        # to keep order, we iterate over self instead of new
-        _d = OrderedDict()
-        for i, k in enumerate(self.columns):
-            _d[new[i]] = self[k]
-        self._data = _d
+    def columns(self, newindex):
+        self._data.index = newindex
 
     @property
     def values(self):
-        # will make all series same length, inset nan's
-        return to_object_array(self._data.values()).transpose()
+        return np.array([c.values for c in self._data])
 
     @property
     def data(self):
-        return self._data.values()
+        return self._data.values
 
     @property
     def itype(self):
         return self._itype
 
     @itype.setter
-    def itype(self, itype_like):
-        itype = get_itype(itype_like)
+    def itype(self, newitype):
+        itype = get_itype(newitype)
         if not itype_le(self._itype, itype):
             self.__cast_all(itype)
@@ -170,9 +162,8 @@
 
     def __cast_all(self, itype):
         k = '?'
         try:
-            for k in self.columns:
-                casted = cast_to_itype(self._data[k], itype, policy=self._policy)
-                self._data[k] = casted
+            for k in self._data.index:
+                cast_to_itype(self._data[k], itype, policy=self._policy, inplace=True)
         except Exception as e:
             raise type(e)(f"Column {k}: " + str(e)) from e
@@ -185,31 +176,34 @@
         Notes:
            - [1] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
        """
-        # special case single label
-        if isinstance(key, str):
-            if key in self.columns:
-                new = self._get_item(key)
-            else:
-                raise KeyError(key)
-        # all other cases
-        else:
-            keys, ixs, ixstype = self._unpack_key(key)
-            ixs = self._unpack_indexer(keys, ixs, ixstype)
+        if is_bool_indexer(key):
+            if not is_series_like(key):
+                raise ValueError("Only boolean series are allowed as boolean indexer.")
             new = self.copy_empty()
-            for i, _ in enumerate(keys):
-                key, ix = keys[i], ixs[i]
-                new._data[key] = self._get_item(key, ix, True)
-        return new
+            for c in self.columns:
+                new._data[c] = align(self._data[c], key, method='dropna')
+
+        elif is_series_like(key):
+            raise ValueError("Only series with boolean values are allowed as indexer")
+
+        elif isinstance(key, slice):
+            new = self.copy_empty()
+            for c in self._data.index:
+                new._data[c] = self._data[c][key]
+
+        elif isinstance(key, self.__class__):
+            new = self.copy_empty()
+            cols = self.columns.intersection(key.columns)
+            for c in cols:
+                new._data[c] = align(key._data[c], self._data[c])
+
+        elif is_hashable(key):
+            new = self._data[key]
 
-    def _get_item(self, key, ix=None, insertna=False):
-        """Extract a pd.Series from self"""
-        if ix is None:
-            return self._data[key]
-        elif insertna:
-            s = self._data[key]
-            return s[ix].reindex_like(s)
         else:
-            return self._data[key][ix]
+            new = self.copy()
+            new._data = self._data[key]
+        return new
 
     def __setitem__(self, key, value):
         """
@@ -225,18 +219,19 @@
            in the ``options`` dictionary.
            - [3] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
        """
-        # special case single label
-        if isinstance(key, str):
+        doalign = False
+
+        if is_hashable(key):
             if key not in self.columns:
                 self._insert(key, value)
                 return
             else:
-                k, i, it = [key], [slice(None)], None
-        # all other cases
+                keys, ixs = [key], [slice(None)]
         else:
-            k, i, it = self._unpack_key(key)
-            i = self._unpack_indexer(k, i, it)
-            gen = self._unpack_value(k, i, value)
+            keys, ixs, doalign = self._unpack_key(key)
+
+        assert len(keys) == len(ixs)
+        gen = self._unpack_value(keys, ixs, value)
 
         for tup in gen:
             self._set_item(*tup)
@@ -262,7 +257,7 @@
             raise ValueError(f"Only pd.Series can be inserted directly, given type {type(val)}")
 
         val = cast_to_itype(val, self._itype, policy=self._policy)
-        self._data[key] = val.copy(deep=True)
+        self._data.loc[key] = val.copy(deep=True)
 
     def _unpack_value(self, keys, ixs, val):
         """Return a generator that yield (key, indexer, value) for all keys"""
@@ -287,73 +282,27 @@
     def _unpack_key(self, key):
         """
        Determine keys and indexer by type of key. This does not deal
-        with single label-access, only higher dimension objects are handled..
-
-        Notes:
-            Which keys we get, may depend on the policy in dios_options
+        with single (hashable) label-access; only higher-dimensional objects
+        are handled.
        """
-        len_err_msg = "length of given column-indexer does not match length of columns"
-        keys = None
-        indexer, idxtype = None, None
-
-        # prevent consuming of a generator
-        key = list(key) if is_iterator(key) else key
-
-        if isinstance(key, slice):
+        if is_bool_indexer(key):
+            if not is_series_like(key):
+                raise ValueError("Only boolean series are allowed as boolean indexer.")
             keys = self.columns
-            indexer, idxtype = [key], 'slice'
-
-        # list, np.arrays, ... of list, np.arrays..
-        elif is_nested_list_like(key):
-            # we only allow bool nlists
+            indexer, doalign = [key] * len(keys), True
+        elif isinstance(key, slice):
             keys = self.columns
-            indexer, idxtype = key, 'nlist'
-
-        # ser, df, dios
-        elif is_pandas_like(key):
-            if is_series_like(key):
-                mask = key.to_numpy()
-                if is_bool_indexer(mask):
-                    # bool series are column indexer not row-indexer!
-                    keys = []
-                    for k in self.columns:
-                        try:
-                            if key[k]:
-                                keys.append(k)
-                        except KeyError:
-                            pass
-                else:
-                    keys = key.to_list()
-
-            elif is_dataframe_like(key):
-                # we only allow bool df's
-                keys = key.columns.to_list()
-                indexer, idxtype = key, 'df'
-
-            elif is_dios_like(key):
-                # we only allow bool dios's
-                keys = key.columns
-                indexer, idxtype = key, 'dios'
-
-        # list, np.array, np.ndarray, ...
-        # Note: series considered list-like, so we handle lists at last
-        elif is_list_like(key):
-            arr = np.array(key)
-            if is_bool_array(arr):
-                keys = self.columns
-                if len(arr) != len(keys):
-                    raise ValueError(len_err_msg)
-                keys = np.array(keys)[arr]
-            else:
-                keys = key
+            indexer = [key] * len(keys)
+        elif isinstance(key, self.__class__):
+            keys = self.columns.intersection(key.columns).to_list()
+            indexer, doalign = key[keys].to_list(), True
+        elif is_list_like(key) and not is_nested_list_like(key):
+            # the policy is fixed here: only known keys (or fewer) are allowed; an empty list is ok
+            keys = check_keys_by_policy(key, self.columns, Opts.none_up2_all)
+            indexer = [slice(None)] * len(keys)
         else:
-            raise KeyError(f"{key}")
-
-        # check keys
-        method = dios_options[OptsFields.col_indexing_method]
-        keys = check_keys_by_policy(keys, self.columns, method)
-
-        return keys, indexer, idxtype
+            raise TypeError(f"Unknown indexer type: {type(key)}")
+        return keys, indexer, doalign
 
     def _unpack_indexer(self, keys, indexer, idxtype):
         err_bool = "only boolean values are allowed"
@@ -453,8 +402,8 @@
         # We use `_data` here, because all checks are already done.
         # So this should be much faster, especially, because we use the underlying dict for
         # getting and setting the values, instead of ``__setitem__`` and ``__getitem__``.
-        for k in self._data:
-            new._data[k] = self._data[k].copy(deep=deep)
+        for i in self._data.index:
+            new._data.loc[i] = self._data[i].copy(deep=deep)
         return new
 
     def copy_empty(self):
diff --git a/dios/options.py b/dios/options.py
index ae735ea2d5740020a48b86fec9f0dc216952545e..5705615df785c3dbdfbdca785a6b263c1d14c6c8 100644
--- a/dios/options.py
+++ b/dios/options.py
@@ -31,9 +31,12 @@ class OptsFields:
 
 
 class Opts:
-    none_plus = 'none_plus'
-    at_least_one = 'at_least_one'
-    all_present = 'all_present'
+    none_up2_all = 'none_all'
+    none_up2_more = 'none_more'
+    one_up2_all = 'one_all'
+    one_up2_more = 'one_more'
+    exactly_all = 'all_all'
+    all_or_more = 'all_more'
 
     itype_warn = 'warn'
     itype_err = 'err'
@@ -54,20 +57,37 @@
 }
 
 
-def check_keys_by_policy(check, keys, policy):
+def check_keys_by_policy(tocheck, keys, policy):
-    filtered = [k for k in check if k in keys]
-    if policy == Opts.none_plus:
+    filtered = [k for k in tocheck if k in keys]
+    if policy == Opts.none_up2_all:
+        fail = [k for k in tocheck if k not in keys]
+        if fail:
+            raise KeyError(f"Policy says: keys must be known. Unknown: {fail}")
+
+    elif policy == Opts.none_up2_more:
         pass
 
-    elif policy == Opts.at_least_one:
+    elif policy == Opts.one_up2_all:
+        fail = [k for k in tocheck if k not in keys]
+        if not filtered or fail:
+            if fail:
+                raise KeyError(f"Policy says: keys must be known and at least one must be shared. Unknown: {fail}")
+            raise KeyError("Policy says: keys must be known and at least one key must be shared. None was shared.")
+
+    elif policy == Opts.one_up2_more:
         if not filtered:
             raise KeyError("Policy says: at least one key must be shared.")
 
-    elif Opts.all_present:
-        fail = set(filtered).symmetric_difference(set(check))
+    elif policy == Opts.exactly_all:
+        fail = set(tocheck).symmetric_difference(set(keys))
+        if fail:
+            raise KeyError(f"Policy says: exactly all keys must be given. Mismatch: {fail}")
+
+    elif policy == Opts.all_or_more:
+        fail = set(filtered).symmetric_difference(set(keys))
         if fail:
-            raise KeyError(f"Unknown keys {fail}. Policy says: all given keys must be known.")
+            raise KeyError(f"Policy says: all known keys must be given, unknown keys are ignored. Missing: {fail}")
 
     else:
         raise ValueError(policy)
diff --git a/test/run_dios.py b/test/run_dios.py
index eaf78ab58adb164665fd26c5ddadd6e1ca5924eb..77e680838960e15e0010099b50a34de66f0645c1 100644
--- a/test/run_dios.py
+++ b/test/run_dios.py
@@ -8,6 +8,17 @@ if __name__ == '__main__':
 
     # df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01'))
     # df[[True, False]]
 
+    a = pd.Series([1, 12, 2])
+    b = pd.Series([2, 12, 2])
+    c = pd.Series([2, 12, 2])
+    d = pd.Series([3, 12, 2])
+    x = pd.Series([a, b, c])
+    y = pd.Series([a, b, d])
+    k = x == y
+    print(k)
+
+    exit(9384)
+
     dios = DictOfSeries(data=[234.54, 5, 5, 4, np.nan, 5, 4, 5])
     dios = abs(~dios)