Skip to content
Snippets Groups Projects
Commit 058bded9 authored by Bert Palm 🎇
Browse files

wip

parent 8b0d2ca8
No related branches found
No related tags found
1 merge request!2Develop
......@@ -18,6 +18,7 @@ from pandas.core.dtypes.common import (
is_integer,
is_dict_like,
is_number,
is_hashable,
)
from pandas.core.dtypes.common import is_iterator as _is_iterator
......@@ -48,6 +49,16 @@ def is_iterator(obj):
return _is_iterator(obj)
def align(s1, s2, method='dropna'):
    """Align series ``s1`` to the index of series ``s2``.

    Parameters
    ----------
    s1 : pd.Series
        The series to align.
    s2 : pd.Series
        The series whose index is the alignment target.
    method : str, default 'dropna'
        ``'keepna'``: keep NaN rows introduced by the reindexing.
        ``'dropna'``: drop NaN rows from the result.

    Returns
    -------
    pd.Series
        ``s1`` reindexed to ``s2``'s index, with NaNs dropped if requested.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'keepna'`` nor ``'dropna'``.
    """
    if method == 'keepna':
        return s1.reindex_like(s2)
    if method == 'dropna':
        return s1.reindex_like(s2).dropna()
    # fail with a self-explanatory message instead of the bare `ValueError(method)`
    raise ValueError(f"unknown method {method!r}, expected one of: 'keepna', 'dropna'")
class DictOfSeries:
"""
DictionaryOfSeries is a collection of pd.Series's which aim to be as close as possible similar to
......@@ -82,80 +93,61 @@ class DictOfSeries:
def __init__(self, data=None, columns=None, itype=MixedItype, downcast_policy='lossless'):
self._data = OrderedDict()
def to_ser(item):
if not isinstance(item, pd.Series):
return pd.Series(item)
return item
# We need to keep track of the index-type (itype) of every new Series.
# If the itypes differ between different series, slicing will almost always fail
# (eg. a datetime-like slice cannot work on a numeric index and vice versa).
self._itype = None
self.itype = get_itype(itype)
self._data = pd.Series()
if data is not None:
if isinstance(data, dict):
for k in data:
self._data.loc[k] = to_ser(k)
if is_list_like(data):
data = data if is_nested_list_like(data) else [data]
for i, d in enumerate(data):
self._data.loc[i] = to_ser(d)
else:
self._data.loc[0] = pd.Series(data)
if columns is not None:
self.columns = columns
if downcast_policy not in CAST_POLICIES:
raise ValueError(f"downcast_policy must be one of {CAST_POLICIES}")
self._policy = downcast_policy
if data is not None:
self.__init_insert_data__(data)
# we use the columns.setter to make all necessary checks
if columns is not None:
self.columns = columns
# We need to keep track of the index-type (itype) of every new Series.
# If the itypes differ between different series, slicing will almost always fail
# (eg. a datetime-like slice cannot work on a numeric index and vice versa).
self._itype = get_itype(itype)
def __init_insert_data__(self, data):
    """Populate ``self`` from *data* during construction.

    Accepts another DictOfSeries, any dict-like, a nested list-like
    (one column per inner element, labelled '0', '1', ...) or a flat
    list-like (a single column labelled '0').  Each (label, value)
    pair is inserted via ``self[label] = value`` so all regular
    __setitem__ checks apply.

    Raises
    ------
    ValueError
        If *data* is of a type that cannot be interpreted as columns.
    """
    if isinstance(data, DictOfSeries):
        pairs = ((name, data[name]) for name in data)
    else:
        if is_iterator(data):
            # materialize once, so the type checks below don't consume it
            data = list(data)
        if is_dict_like(data):
            pairs = ((name, data[name]) for name in data)
        elif is_nested_list_like(data):
            pairs = ((str(pos), column) for pos, column in enumerate(data))
        elif is_list_like(data):
            pairs = [('0', data)]
        else:
            raise ValueError(f"init with data of type {type(data)} is not possible.")
    for name, value in pairs:
        self[name] = value
    return
for s in self._data:
cast_to_itype(s, self._itype, policy=self._policy, inplace=True)
@property
def columns(self):
return list(self._data.keys())
return self._data.index
@columns.setter
def columns(self, new):
if not isinstance(new, list):
raise TypeError("column names must be given as a list")
if len(set(new)) != len(new):
raise ValueError("column names must be unique")
if len(new) != len(self.columns):
raise ValueError(f"Length mismatch: Columns has {len(self.columns)} elements, "
f"new values have {len(new)} elements")
# to keep order, we iterate over self instead of new
_d = OrderedDict()
for i, k in enumerate(self.columns):
_d[new[i]] = self[k]
self._data = _d
def columns(self, newindex):
self._data.index = newindex
@property
def values(self):
# will make all series same length, inset nan's
return to_object_array(self._data.values()).transpose()
return np.array([c.values for c in self._data])
@property
def data(self):
return self._data.values()
return self._data.values
@property
def itype(self):
return self._itype
@itype.setter
def itype(self, itype_like):
itype = get_itype(itype_like)
def itype(self, newitype):
itype = get_itype(newitype)
if not itype_le(self._itype, itype):
self.__cast_all(itype)
......@@ -170,9 +162,8 @@ class DictOfSeries:
def __cast_all(self, itype):
k = '?'
try:
for k in self.columns:
casted = cast_to_itype(self._data[k], itype, policy=self._policy)
self._data[k] = casted
for k in self._data:
cast_to_itype(k, itype, policy=self._policy, inplace=True)
except Exception as e:
raise type(e)(f"Column {k}: " + str(e)) from e
......@@ -185,31 +176,34 @@ class DictOfSeries:
Notes:
- [1] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
"""
# special case single label
if isinstance(key, str):
if key in self.columns:
new = self._get_item(key)
else:
raise KeyError(key)
# all other cases
else:
keys, ixs, ixstype = self._unpack_key(key)
ixs = self._unpack_indexer(keys, ixs, ixstype)
if is_bool_indexer(key):
if not is_series_like(key):
raise ValueError("Only boolean series are allowed as boolean indexer.")
new = self.copy_empty()
for i, _ in enumerate(keys):
key, ix = keys[i], ixs[i]
new._data[key] = self._get_item(key, ix, True)
return new
for c in self.columns:
new._data[c] = align(self._data[c], key, method='dropna')
elif is_series_like(key):
raise ValueError("Only series with boolean values are allowed as indexer")
elif isinstance(key, slice):
new = self.copy_empty()
for c in self._data.index:
new._data[c] = self._data[c][key]
elif isinstance(key, self.__class__):
new = self.copy_empty()
cols = self.columns.intersection(key.columns)
for c in cols:
new._data[c] = align(key._data[c], self._data[c])
elif is_hashable(key):
new = self._data[key]
def _get_item(self, key, ix=None, insertna=False):
"""Extract a pd.Series from self"""
if ix is None:
return self._data[key]
elif insertna:
s = self._data[key]
return s[ix].reindex_like(s)
else:
return self._data[key][ix]
new = self.copy()
new._data = self._data[key]
return new
def __setitem__(self, key, value):
"""
......@@ -225,18 +219,19 @@ class DictOfSeries:
in the ``options`` dictionary.
- [3] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
"""
# special case single label
if isinstance(key, str):
doalign = False
if is_hashable(key):
if key not in self.columns:
self._insert(key, value)
return
else:
k, i, it = [key], [slice(None)], None
# all other cases
keys, ixs = [key], [slice(None)]
else:
k, i, it = self._unpack_key(key)
i = self._unpack_indexer(k, i, it)
gen = self._unpack_value(k, i, value)
keys, ixs, doalign = self._unpack_key(key)
assert len(keys) == len(ixs)
gen = self._unpack_value(keys, ixs, value)
for tup in gen:
self._set_item(*tup)
......@@ -262,7 +257,7 @@ class DictOfSeries:
raise ValueError(f"Only pd.Series can be inserted directly, given type {type(val)}")
val = cast_to_itype(val, self._itype, policy=self._policy)
self._data[key] = val.copy(deep=True)
self._data.loc[key] = val.copy(deep=True)
def _unpack_value(self, keys, ixs, val):
"""Return a generator that yield (key, indexer, value) for all keys"""
......@@ -287,73 +282,27 @@ class DictOfSeries:
def _unpack_key(self, key):
""" Determine keys and indexer by type of key. This does not deal
with single label-access, only higher dimension objects are handled..
Notes:
Which keys we get, may depend on the policy in dios_options
with single (hashable) label-access, only higher dimension objects
are handled..
"""
len_err_msg = "length of given column-indexer does not match length of columns"
keys = None
indexer, idxtype = None, None
# prevent consuming of a generator
key = list(key) if is_iterator(key) else key
if isinstance(key, slice):
if is_bool_indexer(key):
if not is_series_like(key):
raise ValueError("Only boolean series are allowed as boolean indexer.")
keys = self.columns
indexer, idxtype = [key], 'slice'
# list, np.arrays, ... of list, np.arrays..
elif is_nested_list_like(key):
# we only allow bool nlists
indexer, doalign = [key] * len(keys), True
elif isinstance(key, slice):
keys = self.columns
indexer, idxtype = key, 'nlist'
# ser, df, dios
elif is_pandas_like(key):
if is_series_like(key):
mask = key.to_numpy()
if is_bool_indexer(mask):
# bool series are column indexer not row-indexer!
keys = []
for k in self.columns:
try:
if key[k]:
keys.append(k)
except KeyError:
pass
else:
keys = key.to_list()
elif is_dataframe_like(key):
# we only allow bool df's
keys = key.columns.to_list()
indexer, idxtype = key, 'df'
elif is_dios_like(key):
# we only allow bool dios's
keys = key.columns
indexer, idxtype = key, 'dios'
# list, np.array, np.ndarray, ...
# Note: series considered list-like, so we handle lists at last
elif is_list_like(key):
arr = np.array(key)
if is_bool_array(arr):
keys = self.columns
if len(arr) != len(keys):
raise ValueError(len_err_msg)
keys = np.array(keys)[arr]
else:
keys = key
indexer = [key] * len(keys)
elif isinstance(key, self.__class__):
keys = self.columns.intersection(key.columns).to_list()
indexer, doalign = key[keys].to_list(), True
elif is_list_like(key) and not is_nested_list_like(key):
# policy is fix here. we only want to allow known keys or less; empty list is ok
keys = check_keys_by_policy(key, self.columns, Opts.none_up2_all)
indexer = [slice(None)] * len(keys)
else:
raise KeyError(f"{key}")
# check keys
method = dios_options[OptsFields.col_indexing_method]
keys = check_keys_by_policy(keys, self.columns, method)
return keys, indexer, idxtype
raise TypeError(f"Unknown indexer type: {type(key)}")
return keys, indexer, doalign
def _unpack_indexer(self, keys, indexer, idxtype):
err_bool = "only boolean values are allowed"
......@@ -453,8 +402,8 @@ class DictOfSeries:
# We use `_data` here, because all checks are already done.
# So this should be much faster, especially, because we use the underlying dict for
# getting and setting the values, instead of ``__setitem__`` and ``__getitem__``.
for k in self._data:
new._data[k] = self._data[k].copy(deep=deep)
for i in self._data.index:
new._data.loc[i] = self._data[i].copy(deep=deep)
return new
def copy_empty(self):
......
......@@ -31,9 +31,12 @@ class OptsFields:
class Opts:
    """Constants for column-indexing policies (consumed by
    ``check_keys_by_policy``) and for itype-mismatch handling."""
    # NOTE(review): this span is a diff rendering in which two generations of
    # policy names are superimposed -- the short names (none_plus,
    # at_least_one, all_present) and the ``*_up2_*``/``exactly_all``/
    # ``all_or_more`` names.  Confirm which set the current code uses.
    none_plus = 'none_plus'
    at_least_one = 'at_least_one'
    all_present = 'all_present'
    # "none up to all": any subset of the known keys; unknown keys raise
    none_up2_all = 'none_all'
    # "none up to more": any keys allowed; unknown keys silently ignored
    none_up2_more = 'none_more'
    # like none_up2_all, but at least one key must be shared
    one_up2_all = 'one_all'
    # like none_up2_more, but at least one key must be shared
    one_up2_more = 'one_more'
    # exactly the full set of known keys must be given
    exactly_all = 'all_all'
    # all known keys must be given; extra unknown keys are ignored
    all_or_more = 'all_more'
    # reaction on itype mismatches -- presumably warn vs. raise; confirm at use sites
    itype_warn = 'warn'
    itype_err = 'err'
......@@ -54,20 +57,37 @@ dios_options = {
}
def check_keys_by_policy(tocheck, keys, policy):
    """Validate the labels in *tocheck* against the known labels *keys*
    according to *policy* and return the shared labels.

    Parameters
    ----------
    tocheck : iterable of labels
        The labels the caller wants to use.
    keys : iterable of labels
        The known (existing) labels, e.g. the current columns.
    policy : str
        One of the ``Opts`` policy constants.

    Returns
    -------
    list
        The labels from *tocheck* that are present in *keys*, in order.

    Raises
    ------
    KeyError
        If *tocheck* violates the given policy.
    ValueError
        If *policy* is not a known policy.
    """
    # NOTE: the original rendered both the old and the new branch sets on top
    # of each other; this is the merged, coherent version.  Two old branches
    # (`elif Opts.all_present:` / `elif Opts.all_or_more:`) tested the
    # constant's truthiness instead of comparing against `policy` -- fixed
    # here by comparing explicitly in every branch.
    filtered = [k for k in tocheck if k in keys]
    unknown = [k for k in tocheck if k not in keys]

    if policy == Opts.none_up2_all:
        # any subset of the known keys; unknown keys are an error
        if unknown:
            raise KeyError(f"Policy says: keys must be known. Unknown: {unknown}")
    elif policy == Opts.none_up2_more:
        # anything goes; unknown keys are silently ignored
        pass
    elif policy == Opts.one_up2_all:
        if unknown:
            raise KeyError(f"Policy says: keys must be known and at least one must be shared. Unknown: {unknown}")
        if not filtered:
            raise KeyError("Policy says: keys must be known and at least one key must be shared. None was shared.")
    elif policy == Opts.one_up2_more:
        if not filtered:
            raise KeyError("Policy says: at least one key must be shared.")
    elif policy == Opts.exactly_all:
        # exactly the full set of known keys must be given
        fail = set(tocheck).symmetric_difference(set(keys))
        if fail:
            raise KeyError("Policy says: exactly all keys must be given.")
    elif policy == Opts.all_or_more:
        # all known keys must be present; extras are ignored
        # (filtered is a subset of keys, so this is keys - filtered)
        fail = set(filtered).symmetric_difference(set(keys))
        if fail:
            raise KeyError("Policy says: all known keys must be given, unknown are ignored.")
    else:
        raise ValueError(policy)
    # callers (e.g. __setitem__) use the filtered keys for further indexing
    return filtered
......
......@@ -8,6 +8,17 @@ if __name__ == '__main__':
# df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01'))
# df[[True, False]]
a = pd.Series([1, 12, 2])
b = pd.Series([2, 12, 2])
c = pd.Series([2, 12, 2])
d = pd.Series([3, 12, 2])
x = pd.Series([a, b, c])
y = pd.Series([a, b, d])
k = x == y
print(k)
exit(9384)
dios = DictOfSeries(data=[234.54, 5, 5, 4, np.nan, 5, 4, 5])
dios = abs(~dios)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment