Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
SaQC
Manage
Activity
Members
Labels
Plan
Issues
36
Issue boards
Milestones
Wiki
Code
Merge requests
8
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
rdm-software
SaQC
Commits
df3dbe9e
Commit
df3dbe9e
authored
4 years ago
by
Bert Palm
🎇
Browse files
Options
Downloads
Patches
Plain Diff
docu, test, minore improves
parent
74ec7f57
No related branches found
No related tags found
1 merge request
!218
Flags
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
saqc/flagger/flags.py
+93
-11
93 additions, 11 deletions
saqc/flagger/flags.py
test/flagger/test_flags.py
+79
-0
79 additions, 0 deletions
test/flagger/test_flags.py
with
172 additions
and
11 deletions
saqc/flagger/flags.py
+
93
−
11
View file @
df3dbe9e
...
@@ -31,16 +31,43 @@ class _HistAccess:
...
@@ -31,16 +31,43 @@ class _HistAccess:
self
.
obj
=
obj
self
.
obj
=
obj
def
__getitem__
(
self
,
key
:
str
)
->
History
:
def
__getitem__
(
self
,
key
:
str
)
->
History
:
# We don't know what the user wants. Although we're not
# encouraging in-place modification of the history, the
# user may do it, so we remove the cached column here.
self
.
obj
.
_cache
.
pop
(
key
,
None
)
return
self
.
obj
.
_data
[
key
]
return
self
.
obj
.
_data
[
key
]
def
__setitem__
(
self
,
key
:
str
,
value
:
Union
[
History
,
pd
.
DataFrame
]):
def
__setitem__
(
self
,
key
:
str
,
value
:
Union
[
History
,
pd
.
DataFrame
]):
if
not
isinstance
(
value
,
History
):
if
not
isinstance
(
value
,
History
):
value
=
History
(
value
)
value
=
History
(
value
)
self
.
obj
.
_data
[
key
]
=
value
self
.
obj
.
_data
[
key
]
=
value
self
.
obj
.
_cache
.
clear
(
)
self
.
obj
.
_cache
.
pop
(
key
,
None
)
class
Flags
:
class
Flags
:
"""
flags manipulation
------------------
insert new -> flags[
'
new
'
] = pd.Series(...)
set items -> flags[
'
v1
'
] = pd.Series(...)
get items -> v0 = flags[
'
v0
'
]
delete items -> del flags[
'
v0
'
] / drop(
'
v0
'
)
metadata
--------
reading columns -> flags.columns
renaming column(s) -> flags.columns = pd.Index([
'
a
'
,
'
b
'
,
'
c
'
])
history
-------
get history -> flags.history[
'
v0
'
]
set history -> flags.history[
'
v0
'
] = History(...)
conversion
----------
make a dios -> flags.to_dios()
make a df -> flags.to_frame()
"""
def
__init__
(
self
,
raw_data
:
Optional
[
Union
[
DictLike
,
Flags
]]
=
None
,
copy
:
bool
=
False
):
def
__init__
(
self
,
raw_data
:
Optional
[
Union
[
DictLike
,
Flags
]]
=
None
,
copy
:
bool
=
False
):
...
@@ -50,14 +77,15 @@ class Flags:
...
@@ -50,14 +77,15 @@ class Flags:
if
isinstance
(
raw_data
,
Flags
):
if
isinstance
(
raw_data
,
Flags
):
raw_data
=
raw_data
.
_data
raw_data
=
raw_data
.
_data
# with python 3.7 dicts are insertion-ordered by default
self
.
_data
:
Dict
[
str
,
History
]
self
.
_data
:
Dict
[
str
,
History
]
self
.
_data
=
self
.
_init_from_raw
(
raw_data
,
copy
)
self
.
_data
=
self
.
_init_from_raw
(
raw_data
,
copy
)
# this is a simple cache that reduce the calculation of the flags
# this is a simple cache that reduce the calculation of the flags
# from the entire history of a flag column. The _cache is filled
# from the entire history of a flag column. The _cache is filled
# with __getitem__ and cleared
i
n
__setitem__ or if the whole history
# with __getitem__ and cleared
o
n
any write access to self_data.
#
is written in _HistAccess.__setitem__. There is no other access, so
#
There are not to may write access possibilities here so we don't
#
we don't
have to much trouble.
# have to much trouble.
self
.
_cache
=
{}
self
.
_cache
=
{}
def
_init_from_raw
(
self
,
data
,
copy
)
->
Dict
[
str
,
History
]:
def
_init_from_raw
(
self
,
data
,
copy
)
->
Dict
[
str
,
History
]:
...
@@ -80,23 +108,55 @@ class Flags:
...
@@ -80,23 +108,55 @@ class Flags:
return
result
return
result
def
__getitem__
(
self
,
key
:
str
)
->
pd
.
Series
:
def
__getitem__
(
self
,
key
:
str
)
->
pd
.
Series
:
if
key
not
in
self
.
_cache
:
if
key
not
in
self
.
_cache
:
self
.
_cache
[
key
]
=
self
.
_data
[
key
].
max
()
self
.
_cache
[
key
]
=
self
.
_data
[
key
].
max
()
return
self
.
_cache
[
key
]
return
self
.
_cache
[
key
].
copy
()
def
__setitem__
(
self
,
key
:
str
,
value
:
pd
.
Series
):
def
__setitem__
(
self
,
key
:
str
,
value
:
pd
.
Series
):
if
key
not
in
self
.
_data
:
if
key
not
in
self
.
_data
:
hist
=
History
()
hist
=
History
()
else
:
else
:
hist
=
self
.
_data
[
key
]
hist
=
self
.
_data
[
key
]
hist
.
append
(
value
)
hist
.
append
(
value
)
self
.
_cache
.
pop
(
key
,
None
)
self
.
_cache
.
pop
(
key
,
None
)
def
__delitem__
(
self
,
key
):
del
self
.
_data
[
key
]
self
.
_cache
.
pop
(
key
,
None
)
def
drop
(
self
,
key
):
self
.
__delitem__
(
key
)
@property
@property
def
columns
(
self
)
->
pd
.
Index
:
def
columns
(
self
)
->
pd
.
Index
:
return
pd
.
Index
(
self
.
_data
.
keys
())
return
pd
.
Index
(
self
.
_data
.
keys
())
@columns.setter
def
columns
(
self
,
value
:
pd
.
Index
):
if
not
isinstance
(
value
,
pd
.
Index
):
value
=
pd
.
Index
(
value
)
if
(
not
value
.
is_unique
or
not
pd
.
api
.
types
.
is_string_dtype
(
value
)
):
raise
TypeError
(
'
value must be pd.Index, with unique indices of type str
'
)
if
not
len
(
value
)
==
len
(
self
):
raise
ValueError
(
"
index must match current index in length
"
)
_data
,
_cache
=
{},
{}
for
old
,
new
in
zip
(
self
.
columns
,
value
):
_data
[
new
]
=
self
.
_data
.
pop
(
old
)
if
old
in
self
.
_cache
:
_cache
[
new
]
=
self
.
_cache
[
old
]
self
.
_data
=
_data
self
.
_cache
=
_cache
@property
@property
def
history
(
self
)
->
_HistAccess
:
def
history
(
self
)
->
_HistAccess
:
return
_HistAccess
(
self
)
return
_HistAccess
(
self
)
...
@@ -105,18 +165,25 @@ class Flags:
...
@@ -105,18 +165,25 @@ class Flags:
di
=
dios
.
DictOfSeries
(
columns
=
self
.
columns
)
di
=
dios
.
DictOfSeries
(
columns
=
self
.
columns
)
for
k
,
v
in
self
.
_data
.
items
():
for
k
,
v
in
self
.
_data
.
items
():
di
[
k
]
=
self
[
k
]
# cache
d
di
[
k
]
=
self
[
k
]
#
use
cache
return
di
.
copy
()
return
di
.
copy
()
def
to_frame
(
self
)
->
pd
.
DataFrame
:
def
to_frame
(
self
)
->
pd
.
DataFrame
:
return
self
.
to_dios
().
to_df
()
return
self
.
to_dios
().
to_df
()
@property
def
empty
(
self
)
->
bool
:
return
len
(
self
.
_data
)
==
0
def
__len__
(
self
)
->
int
:
return
len
(
self
.
_data
)
def
__repr__
(
self
)
->
str
:
def
__repr__
(
self
)
->
str
:
return
str
(
self
.
to_dios
())
return
str
(
self
.
to_dios
())
.
replace
(
'
DictOfSeries
'
,
type
(
self
).
__name__
)
def
init_flags_like
(
reference
:
Union
[
pd
.
Series
,
DictLike
,
Flags
])
->
Flags
:
def
init_flags_like
(
reference
:
Union
[
pd
.
Series
,
DictLike
,
Flags
]
,
initial_value
:
float
=
UNFLAGGED
)
->
Flags
:
"""
"""
Create empty Flags from a reference data structure.
Create empty Flags from a reference data structure.
...
@@ -125,9 +192,22 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
...
@@ -125,9 +192,22 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
reference : pd.DataFrame, pd.Series, dios.DictOfSeries, dict of pd.Series
reference : pd.DataFrame, pd.Series, dios.DictOfSeries, dict of pd.Series
The reference structure to initialize for.
The reference structure to initialize for.
initial_value : float, default 0
value to initialize the columns with
Notes
-----
Implementation detail:
The resulting Flags does not necessarily have exactly the same (inner) dimensions as the reference.
This may happen if the passed structure already holds History objects. Those are
reduced to 1D-DataFrames (1-column-History). Nevertheless, the returned flags are perfectly suitable
to be used in Saqc as a flags container along with the passed reference structure (data).
Returns
Returns
-------
-------
flags: Flags
a flags object.
"""
"""
result
=
{}
result
=
{}
...
@@ -163,5 +243,7 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
...
@@ -163,5 +243,7 @@ def init_flags_like(reference: Union[pd.Series, DictLike, Flags]) -> Flags:
if
__name__
==
'
__main__
'
:
if
__name__
==
'
__main__
'
:
from
dios
import
example_DictOfSeries
from
dios
import
example_DictOfSeries
f
=
Flags
(
example_DictOfSeries
().
astype
(
float
))
print
(
f
)
f
=
init_flags_like
(
example_DictOfSeries
())
\ No newline at end of file
print
(
f
)
print
(
Flags
())
This diff is collapsed.
Click to expand it.
test/flagger/test_flags.py
0 → 100644
+
79
−
0
View file @
df3dbe9e
#!/usr/bin/env python
import
dios
import
pytest
import
numpy
as
np
import
pandas
as
pd
from
pandas.api.types
import
is_bool_dtype
from
test.common
import
TESTFLAGGER
,
initData
from
saqc.flagger.flags
import
Flags
_data
=
[
np
.
array
([[]]),
np
.
zeros
((
1
,
1
)),
np
.
zeros
((
3
,
4
)),
np
.
ones
((
3
,
4
)),
np
.
ones
((
3
,
4
))
*
np
.
nan
,
np
.
array
([
[
0
,
0
,
0
,
0
],
[
0
,
1
,
2
,
3
],
[
0
,
1
,
2
,
3
],
]),
np
.
array
([
[
0
,
0
,
0
,
0
],
[
0
,
1
,
np
.
nan
,
3
],
[
0
,
1
,
2
,
3
],
]),
]
data
=
[]
for
d
in
_data
:
columns
=
list
(
'
abcdefgh
'
)[:
d
.
shape
[
1
]]
df
=
pd
.
DataFrame
(
d
,
dtype
=
float
,
columns
=
columns
)
dis
=
dios
.
DictOfSeries
(
df
)
di
=
{}
di
.
update
(
df
.
items
())
data
.
append
(
df
)
data
.
append
(
di
)
data
.
append
(
dis
)
@pytest.mark.parametrize
(
'
data
'
,
data
)
def
test_init
(
data
:
np
.
array
):
flags
=
Flags
(
data
)
assert
isinstance
(
flags
,
Flags
)
assert
len
(
data
.
keys
())
==
len
(
flags
)
def
test_cache
():
arr
=
np
.
array
([
[
0
,
0
,
0
,
0
],
[
0
,
1
,
2
,
3
],
[
0
,
1
,
2
,
3
],
])
data
=
pd
.
DataFrame
(
arr
,
dtype
=
float
,
columns
=
list
(
'
abcd
'
))
flags
=
Flags
(
data
)
# cache empty
assert
flags
.
_cache
==
{}
# invoke caching
flags
[
'
a
'
]
assert
'
a
'
in
flags
.
_cache
# clears cache
flags
[
'
a
'
]
=
pd
.
Series
([
0
,
0
,
0
],
dtype
=
float
)
assert
'
a
'
not
in
flags
.
_cache
# cache all
flags
.
to_dios
()
for
c
in
flags
.
columns
:
assert
c
in
flags
.
_cache
# cache survive renaming
flags
.
columns
=
list
(
'
xyzq
'
)
for
c
in
flags
.
columns
:
assert
c
in
flags
.
_cache
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment