Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
dios
Manage
Activity
Members
Labels
Plan
Issues
11
Issue boards
Milestones
Wiki
Jira
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
RDM
dios
Commits
058bded9
Commit
058bded9
authored
5 years ago
by
Bert Palm
🎇
Browse files
Options
Downloads
Patches
Plain Diff
wip
parent
8b0d2ca8
No related branches found
Branches containing commit
No related tags found
Tags containing commit
1 merge request
!2
Develop
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
dios/dios.py
+97
-148
97 additions, 148 deletions
dios/dios.py
dios/options.py
+30
-10
30 additions, 10 deletions
dios/options.py
test/run_dios.py
+11
-0
11 additions, 0 deletions
test/run_dios.py
with
138 additions
and
158 deletions
dios/dios.py
+
97
−
148
View file @
058bded9
...
...
@@ -18,6 +18,7 @@ from pandas.core.dtypes.common import (
is_integer
,
is_dict_like
,
is_number
,
is_hashable
,
)
from
pandas.core.dtypes.common
import
is_iterator
as
_is_iterator
...
...
@@ -48,6 +49,16 @@ def is_iterator(obj):
return
_is_iterator
(
obj
)
def
align
(
s1
,
s2
,
method
=
'
dropna
'
):
if
method
==
'
keepna
'
:
s
=
s1
.
reindex_like
(
s2
)
elif
method
==
'
dropna
'
:
s
=
s1
.
reindex_like
(
s2
).
dropna
()
else
:
raise
ValueError
(
method
)
return
s
class
DictOfSeries
:
"""
DictionaryOfSeries is a collection of pd.Series
'
s which aim to be as close as possible similar to
...
...
@@ -82,80 +93,61 @@ class DictOfSeries:
def
__init__
(
self
,
data
=
None
,
columns
=
None
,
itype
=
MixedItype
,
downcast_policy
=
'
lossless
'
):
self
.
_data
=
OrderedDict
()
def
to_ser
(
item
):
if
not
isinstance
(
item
,
pd
.
Series
):
return
pd
.
Series
(
item
)
return
item
# We need to keep track of the index-type (itype) of every new Series.
# If the itypes differ between different series, slicing will almost always fail
# (eg. a datetime-like slice cannot work on a numeric index and vice versa).
self
.
_itype
=
None
self
.
itype
=
get_itype
(
itype
)
self
.
_data
=
pd
.
Series
()
if
data
is
not
None
:
if
isinstance
(
data
,
dict
):
for
k
in
data
:
self
.
_data
.
loc
[
k
]
=
to_ser
(
k
)
if
is_list_like
(
data
):
data
=
data
if
is_nested_list_like
(
data
)
else
[
data
]
for
i
,
d
in
enumerate
(
data
):
self
.
_data
.
loc
[
i
]
=
to_ser
(
d
)
else
:
self
.
_data
.
loc
[
0
]
=
pd
.
Series
(
data
)
if
columns
is
not
None
:
self
.
columns
=
columns
if
downcast_policy
not
in
CAST_POLICIES
:
raise
ValueError
(
f
"
downcast_policy must be one of
{
CAST_POLICIES
}
"
)
self
.
_policy
=
downcast_policy
if
data
is
not
None
:
self
.
__init_insert_data__
(
data
)
# we use the columns.setter to make all necessary checks
if
columns
is
not
None
:
self
.
columns
=
columns
# We need to keep track of the index-type (itype) of every new Series.
# If the itypes differ between different series, slicing will almost always fail
# (eg. a datetime-like slice cannot work on a numeric index and vice versa).
self
.
_itype
=
get_itype
(
itype
)
def
__init_insert_data__
(
self
,
data
):
if
isinstance
(
data
,
DictOfSeries
):
g
=
((
k
,
data
[
k
])
for
k
in
data
)
else
:
data
=
list
(
data
)
if
is_iterator
(
data
)
else
data
if
is_dict_like
(
data
):
g
=
((
k
,
data
[
k
])
for
k
in
data
)
elif
is_nested_list_like
(
data
):
g
=
((
str
(
i
),
d
)
for
i
,
d
in
enumerate
(
data
))
elif
is_list_like
(
data
):
g
=
[(
'
0
'
,
data
)]
else
:
raise
ValueError
(
f
"
init with data of type
{
type
(
data
)
}
is not possible.
"
)
for
k
,
val
in
g
:
self
[
k
]
=
val
return
for
s
in
self
.
_data
:
cast_to_itype
(
s
,
self
.
_itype
,
policy
=
self
.
_policy
,
inplace
=
True
)
@property
def
columns
(
self
):
return
list
(
self
.
_data
.
keys
())
return
self
.
_data
.
index
@columns.setter
def
columns
(
self
,
new
):
if
not
isinstance
(
new
,
list
):
raise
TypeError
(
"
column names must be given as a list
"
)
if
len
(
set
(
new
))
!=
len
(
new
):
raise
ValueError
(
"
column names must be unique
"
)
if
len
(
new
)
!=
len
(
self
.
columns
):
raise
ValueError
(
f
"
Length mismatch: Columns has
{
len
(
self
.
columns
)
}
elements,
"
f
"
new values have
{
len
(
new
)
}
elements
"
)
# to keep order, we iterate over self instead of new
_d
=
OrderedDict
()
for
i
,
k
in
enumerate
(
self
.
columns
):
_d
[
new
[
i
]]
=
self
[
k
]
self
.
_data
=
_d
def
columns
(
self
,
newindex
):
self
.
_data
.
index
=
newindex
@property
def
values
(
self
):
# will make all series the same length, inserting NaNs
return
to_object_array
(
self
.
_data
.
values
()).
transpose
()
return
np
.
array
([
c
.
values
for
c
in
self
.
_data
])
@property
def
data
(
self
):
return
self
.
_data
.
values
()
return
self
.
_data
.
values
@property
def
itype
(
self
):
return
self
.
_itype
@itype.setter
def
itype
(
self
,
itype
_like
):
itype
=
get_itype
(
itype
_like
)
def
itype
(
self
,
new
itype
):
itype
=
get_itype
(
new
itype
)
if
not
itype_le
(
self
.
_itype
,
itype
):
self
.
__cast_all
(
itype
)
...
...
@@ -170,9 +162,8 @@ class DictOfSeries:
def
__cast_all
(
self
,
itype
):
k
=
'
?
'
try
:
for
k
in
self
.
columns
:
casted
=
cast_to_itype
(
self
.
_data
[
k
],
itype
,
policy
=
self
.
_policy
)
self
.
_data
[
k
]
=
casted
for
k
in
self
.
_data
:
cast_to_itype
(
k
,
itype
,
policy
=
self
.
_policy
,
inplace
=
True
)
except
Exception
as
e
:
raise
type
(
e
)(
f
"
Column
{
k
}
:
"
+
str
(
e
))
from
e
...
...
@@ -185,31 +176,34 @@ class DictOfSeries:
Notes:
- [1] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
"""
# special case single label
if
isinstance
(
key
,
str
):
if
key
in
self
.
columns
:
new
=
self
.
_get_item
(
key
)
else
:
raise
KeyError
(
key
)
# all other cases
else
:
keys
,
ixs
,
ixstype
=
self
.
_unpack_key
(
key
)
ixs
=
self
.
_unpack_indexer
(
keys
,
ixs
,
ixstype
)
if
is_bool_indexer
(
key
):
if
not
is_series_like
(
key
):
raise
ValueError
(
"
Only boolean series are allowed as boolean indexer.
"
)
new
=
self
.
copy_empty
()
for
i
,
_
in
enumerate
(
keys
):
key
,
ix
=
keys
[
i
],
ixs
[
i
]
new
.
_data
[
key
]
=
self
.
_get_item
(
key
,
ix
,
True
)
return
new
for
c
in
self
.
columns
:
new
.
_data
[
c
]
=
align
(
self
.
_data
[
c
],
key
,
method
=
'
dropna
'
)
elif
is_series_like
(
key
):
raise
ValueError
(
"
Only series with boolean values are allowed as indexer
"
)
elif
isinstance
(
key
,
slice
):
new
=
self
.
copy_empty
()
for
c
in
self
.
_data
.
index
:
new
.
_data
[
c
]
=
self
.
_data
[
c
][
key
]
elif
isinstance
(
key
,
self
.
__class__
):
new
=
self
.
copy_empty
()
cols
=
self
.
columns
.
intersection
(
key
.
columns
)
for
c
in
cols
:
new
.
_data
[
c
]
=
align
(
key
.
_data
[
c
],
self
.
_data
[
c
])
elif
is_hashable
(
key
):
new
=
self
.
_data
[
key
]
def
_get_item
(
self
,
key
,
ix
=
None
,
insertna
=
False
):
"""
Extract a pd.Series from self
"""
if
ix
is
None
:
return
self
.
_data
[
key
]
elif
insertna
:
s
=
self
.
_data
[
key
]
return
s
[
ix
].
reindex_like
(
s
)
else
:
return
self
.
_data
[
key
][
ix
]
new
=
self
.
copy
()
new
.
_data
=
self
.
_data
[
key
]
return
new
def
__setitem__
(
self
,
key
,
value
):
"""
...
...
@@ -225,18 +219,19 @@ class DictOfSeries:
in the ``options`` dictionary.
- [3] If ``iterable`` contains any(!) label that does not exist, a KeyError is raised.
"""
# special case single label
if
isinstance
(
key
,
str
):
doalign
=
False
if
is_hashable
(
key
):
if
key
not
in
self
.
columns
:
self
.
_insert
(
key
,
value
)
return
else
:
k
,
i
,
it
=
[
key
],
[
slice
(
None
)],
None
# all other cases
keys
,
ixs
=
[
key
],
[
slice
(
None
)]
else
:
k
,
i
,
it
=
self
.
_unpack_key
(
key
)
i
=
self
.
_unpack_indexer
(
k
,
i
,
it
)
gen
=
self
.
_unpack_value
(
k
,
i
,
value
)
keys
,
ixs
,
doalign
=
self
.
_unpack_key
(
key
)
assert
len
(
keys
)
==
len
(
ixs
)
gen
=
self
.
_unpack_value
(
keys
,
ixs
,
value
)
for
tup
in
gen
:
self
.
_set_item
(
*
tup
)
...
...
@@ -262,7 +257,7 @@ class DictOfSeries:
raise
ValueError
(
f
"
Only pd.Series can be inserted directly, given type
{
type
(
val
)
}
"
)
val
=
cast_to_itype
(
val
,
self
.
_itype
,
policy
=
self
.
_policy
)
self
.
_data
[
key
]
=
val
.
copy
(
deep
=
True
)
self
.
_data
.
loc
[
key
]
=
val
.
copy
(
deep
=
True
)
def
_unpack_value
(
self
,
keys
,
ixs
,
val
):
"""
Return a generator that yields (key, indexer, value) for all keys
"""
...
...
@@ -287,73 +282,27 @@ class DictOfSeries:
def
_unpack_key
(
self
,
key
):
"""
Determine keys and indexer by type of key. This does not deal
with single label-access, only higher dimension objects are handled.
Notes:
Which keys we get, may depend on the policy in dios_options
with single (hashable) label-access, only higher dimension objects
are handled.
"""
len_err_msg
=
"
length of given column-indexer does not match length of columns
"
keys
=
None
indexer
,
idxtype
=
None
,
None
# prevent consuming of a generator
key
=
list
(
key
)
if
is_iterator
(
key
)
else
key
if
isinstance
(
key
,
slice
):
if
is_bool_indexer
(
key
):
if
not
is_series_like
(
key
):
raise
ValueError
(
"
Only boolean series are allowed as boolean indexer.
"
)
keys
=
self
.
columns
indexer
,
idxtype
=
[
key
],
'
slice
'
# list, np.arrays, ... of list, np.arrays..
elif
is_nested_list_like
(
key
):
# we only allow bool nlists
indexer
,
doalign
=
[
key
]
*
len
(
keys
),
True
elif
isinstance
(
key
,
slice
):
keys
=
self
.
columns
indexer
,
idxtype
=
key
,
'
nlist
'
# ser, df, dios
elif
is_pandas_like
(
key
):
if
is_series_like
(
key
):
mask
=
key
.
to_numpy
()
if
is_bool_indexer
(
mask
):
# bool series are column indexer not row-indexer!
keys
=
[]
for
k
in
self
.
columns
:
try
:
if
key
[
k
]:
keys
.
append
(
k
)
except
KeyError
:
pass
else
:
keys
=
key
.
to_list
()
elif
is_dataframe_like
(
key
):
# we only allow bool df's
keys
=
key
.
columns
.
to_list
()
indexer
,
idxtype
=
key
,
'
df
'
elif
is_dios_like
(
key
):
# we only allow bool dios's
keys
=
key
.
columns
indexer
,
idxtype
=
key
,
'
dios
'
# list, np.array, np.ndarray, ...
# Note: series are considered list-like, so we handle lists last
elif
is_list_like
(
key
):
arr
=
np
.
array
(
key
)
if
is_bool_array
(
arr
):
keys
=
self
.
columns
if
len
(
arr
)
!=
len
(
keys
):
raise
ValueError
(
len_err_msg
)
keys
=
np
.
array
(
keys
)[
arr
]
else
:
keys
=
key
indexer
=
[
key
]
*
len
(
keys
)
elif
isinstance
(
key
,
self
.
__class__
):
keys
=
self
.
columns
.
intersection
(
key
.
columns
).
to_list
()
indexer
,
doalign
=
key
[
keys
].
to_list
(),
True
elif
is_list_like
(
key
)
and
not
is_nested_list_like
(
key
):
# policy is fixed here: we only want to allow known keys or fewer; an empty list is ok
keys
=
check_keys_by_policy
(
key
,
self
.
columns
,
Opts
.
none_up2_all
)
indexer
=
[
slice
(
None
)]
*
len
(
keys
)
else
:
raise
KeyError
(
f
"
{
key
}
"
)
# check keys
method
=
dios_options
[
OptsFields
.
col_indexing_method
]
keys
=
check_keys_by_policy
(
keys
,
self
.
columns
,
method
)
return
keys
,
indexer
,
idxtype
raise
TypeError
(
f
"
Unknown indexer type:
{
type
(
key
)
}
"
)
return
keys
,
indexer
,
doalign
def
_unpack_indexer
(
self
,
keys
,
indexer
,
idxtype
):
err_bool
=
"
only boolean values are allowed
"
...
...
@@ -453,8 +402,8 @@ class DictOfSeries:
# We use `_data` here, because all checks are already done.
# So this should be much faster, especially, because we use the underlying dict for
# getting and setting the values, instead of ``__setitem__`` and ``__getitem__``.
for
k
in
self
.
_data
:
new
.
_data
[
k
]
=
self
.
_data
[
k
].
copy
(
deep
=
deep
)
for
i
in
self
.
_data
.
index
:
new
.
_data
.
loc
[
i
]
=
self
.
_data
[
i
].
copy
(
deep
=
deep
)
return
new
def
copy_empty
(
self
):
...
...
This diff is collapsed.
Click to expand it.
dios/options.py
+
30
−
10
View file @
058bded9
...
...
@@ -31,9 +31,12 @@ class OptsFields:
class
Opts
:
none_plus
=
'
none_plus
'
at_least_one
=
'
at_least_one
'
all_present
=
'
all_present
'
none_up2_all
=
'
none_all
'
none_up2_more
=
'
none_more
'
one_up2_all
=
'
one_all
'
one_up2_more
=
'
one_more
'
exactly_all
=
'
all_all
'
all_or_more
=
'
all_more
'
itype_warn
=
'
warn
'
itype_err
=
'
err
'
...
...
@@ -54,20 +57,37 @@ dios_options = {
}
def
check_keys_by_policy
(
check
,
keys
,
policy
):
def
check_keys_by_policy
(
to
check
,
keys
,
policy
):
filtered
=
[
k
for
k
in
check
if
k
in
keys
]
if
policy
==
Opts
.
none_plus
:
filtered
=
[
k
for
k
in
tocheck
if
k
in
keys
]
if
policy
==
Opts
.
none_up2_all
:
fail
=
[
k
for
k
in
tocheck
if
k
not
in
keys
]
if
fail
:
raise
KeyError
(
f
"
Policy says: keys must be known. Unknown:
{
fail
}
"
)
elif
policy
==
Opts
.
none_up2_more
:
pass
elif
policy
==
Opts
.
at_least_one
:
elif
policy
==
Opts
.
one_up2_all
:
fail
=
[
k
for
k
in
tocheck
if
k
not
in
keys
]
if
not
filtered
or
fail
:
if
fail
:
raise
KeyError
(
f
"
Policy says: keys must be known and at least one must be shared. Unknown:
{
fail
}
"
)
raise
KeyError
(
"
Policy says: keys must known and at least one key must be shared. None was shared.
"
)
elif
policy
==
Opts
.
one_up2_more
:
if
not
filtered
:
raise
KeyError
(
"
Policy says: at least one key must be shared.
"
)
elif
Opts
.
all_present
:
fail
=
set
(
filtered
).
symmetric_difference
(
set
(
check
))
elif
policy
==
Opts
.
exactly_all
:
fail
=
set
(
tocheck
).
symmetric_difference
(
set
(
keys
))
if
fail
:
raise
KeyError
(
f
"
Policy says: exactly all keys must be given.
"
)
elif
Opts
.
all_or_more
:
fail
=
set
(
filtered
).
symmetric_difference
(
set
(
keys
))
if
fail
:
raise
KeyError
(
f
"
Unknown keys
{
fail
}
.
Policy says: all
give
n keys must be
known
.
"
)
raise
KeyError
(
f
"
Policy says: all
know
n keys must be
given, unknown are ignored
.
"
)
else
:
raise
ValueError
(
policy
)
...
...
This diff is collapsed.
Click to expand it.
test/run_dios.py
+
11
−
0
View file @
058bded9
...
...
@@ -8,6 +8,17 @@ if __name__ == '__main__':
# df = pd.DataFrame([1,24,5,456,45], index=pd.date_range(periods=5, freq='1d', start='2000-01-01'))
# df[[True, False]]
a
=
pd
.
Series
([
1
,
12
,
2
])
b
=
pd
.
Series
([
2
,
12
,
2
])
c
=
pd
.
Series
([
2
,
12
,
2
])
d
=
pd
.
Series
([
3
,
12
,
2
])
x
=
pd
.
Series
([
a
,
b
,
c
])
y
=
pd
.
Series
([
a
,
b
,
d
])
k
=
x
==
y
print
(
k
)
exit
(
9384
)
dios
=
DictOfSeries
(
data
=
[
234.54
,
5
,
5
,
4
,
np
.
nan
,
5
,
4
,
5
])
dios
=
abs
(
~
dios
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment