Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
SaQC
Manage
Activity
Members
Labels
Plan
Issues
36
Issue boards
Milestones
Wiki
Code
Merge requests
8
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
rdm-software
SaQC
Commits
24845912
Commit
24845912
authored
5 years ago
by
Bert Palm
🎇
Browse files
Options
Downloads
Patches
Plain Diff
fixed some bugs, improved setFlags, added tests for setFlags
parent
8044ded7
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
saqc/flagger/baseflagger.py
+26
-8
26 additions, 8 deletions
saqc/flagger/baseflagger.py
saqc/flagger/dmpflagger.py
+14
-7
14 additions, 7 deletions
saqc/flagger/dmpflagger.py
test/flagger/test_flagger.py
+147
-5
147 additions, 5 deletions
test/flagger/test_flagger.py
with
187 additions
and
20 deletions
saqc/flagger/baseflagger.py
+
26
−
8
View file @
24845912
...
@@ -66,15 +66,30 @@ class BaseFlagger:
...
@@ -66,15 +66,30 @@ class BaseFlagger:
check_ispdlike
(
flags
,
'
flags
'
,
allow_multiindex
=
False
)
check_ispdlike
(
flags
,
'
flags
'
,
allow_multiindex
=
False
)
return
flags
return
flags
def
setFlags
(
self
,
flags
:
pd
.
DataFrame
,
field
,
loc
=
None
,
iloc
=
None
,
flag
=
None
,
**
kwargs
)
->
pd
.
DataFrame
:
def
setFlags
(
self
,
flags
:
pd
.
DataFrame
,
field
,
loc
=
None
,
iloc
=
None
,
flag
=
None
,
force
=
False
,
**
kwargs
)
:
check_isdf
(
flags
,
'
flags
'
,
allow_multiindex
=
False
)
check_isdf
(
flags
,
'
flags
'
,
allow_multiindex
=
False
)
# prepare
# prepare
flags
=
self
.
_assureDtype
(
flags
,
field
)
flags
=
self
.
_assureDtype
(
flags
,
field
)
.
copy
()
flag
=
self
.
BAD
if
flag
is
None
else
self
.
_checkFlag
(
flag
)
flag
=
self
.
BAD
if
flag
is
None
else
self
.
_checkFlag
(
flag
)
flags_loc
,
rows
,
col
=
self
.
_getIndexer
(
flags
,
field
,
loc
,
iloc
)
flags_loc
,
rows
,
col
=
self
.
_getIndexer
(
flags
,
field
,
loc
,
iloc
)
# set
# set
mask
=
flags_loc
[
rows
,
col
]
<
flag
if
isinstance
(
flag
,
pd
.
Series
):
idx
=
mask
[
mask
].
index
if
len
(
flags
.
index
)
!=
len
(
flags
):
raise
ValueError
(
'
Length of flags and flag must match
'
)
i
,
r
,
_
=
self
.
_getIndexer
(
flag
,
field
,
loc
,
iloc
)
flag
=
i
[
r
].
squeeze
()
if
force
:
mask
=
[
True
]
*
len
(
rows
)
idx
=
flags_loc
[
rows
,
col
].
index
else
:
mask
=
flags_loc
[
rows
,
col
]
<
flag
idx
=
mask
[
mask
].
index
if
isinstance
(
flag
,
pd
.
Series
):
flag
=
flag
[
mask
]
flags
.
loc
[
idx
,
field
]
=
flag
flags
.
loc
[
idx
,
field
]
=
flag
return
self
.
_assureDtype
(
flags
,
field
)
return
self
.
_assureDtype
(
flags
,
field
)
...
@@ -86,9 +101,9 @@ class BaseFlagger:
...
@@ -86,9 +101,9 @@ class BaseFlagger:
def
_checkFlag
(
self
,
flag
):
def
_checkFlag
(
self
,
flag
):
if
isinstance
(
flag
,
pd
.
Series
):
if
isinstance
(
flag
,
pd
.
Series
):
if
f
lag
.
d
type
!=
self
.
flags
:
if
not
self
.
_isF
lag
sD
type
(
flag
.
dtype
)
:
raise
TypeError
(
f
"
Passed
flag
s
series is
of invalid
'
{
flag
.
dtype
}
'
dtype.
"
raise
TypeError
(
f
"
flag
(-
series
)
is
not of expected
'
{
self
.
flags
}
'
-dtype with ordered categories
"
f
"
Expected
{
self
.
flags
}
type with ordered categories
{
list
(
self
.
flags
.
categories
)
}
"
)
f
"
{
list
(
self
.
flags
.
categories
)
}
,
'
{
flag
.
dtype
}
'
-dtype was passed.
"
)
else
:
else
:
if
flag
not
in
self
.
flags
:
if
flag
not
in
self
.
flags
:
raise
ValueError
(
f
"
Invalid flag
'
{
flag
}
'
. Possible choices are
{
list
(
self
.
flags
.
categories
)[
1
:
]
}
"
)
raise
ValueError
(
f
"
Invalid flag
'
{
flag
}
'
. Possible choices are
{
list
(
self
.
flags
.
categories
)[
1
:
]
}
"
)
...
@@ -108,10 +123,13 @@ class BaseFlagger:
...
@@ -108,10 +123,13 @@ class BaseFlagger:
def
_assureDtype
(
self
,
flags
,
field
=
None
):
def
_assureDtype
(
self
,
flags
,
field
=
None
):
if
field
is
None
:
# we got a df
if
field
is
None
:
# we got a df
flags
=
flags
.
astype
(
self
.
flags
)
flags
=
flags
.
astype
(
self
.
flags
)
elif
not
isinstanc
e
(
flags
[
field
].
dtype
,
pd
.
Categorical
):
elif
not
self
.
_isFlagsDtyp
e
(
flags
[
field
].
dtype
):
flags
[
field
]
=
flags
[
field
].
astype
(
self
.
flags
)
flags
[
field
]
=
flags
[
field
].
astype
(
self
.
flags
)
return
flags
return
flags
def
_isFlagsDtype
(
self
,
dtype
):
return
isinstance
(
dtype
,
pd
.
CategoricalDtype
)
and
dtype
==
self
.
flags
def
nextTest
(
self
):
def
nextTest
(
self
):
pass
pass
...
...
This diff is collapsed.
Click to expand it.
saqc/flagger/dmpflagger.py
+
14
−
7
View file @
24845912
...
@@ -66,19 +66,26 @@ class DmpFlagger(BaseFlagger):
...
@@ -66,19 +66,26 @@ class DmpFlagger(BaseFlagger):
check_isdfmi
(
flags
,
'
flags
'
)
check_isdfmi
(
flags
,
'
flags
'
)
# prepare
# prepare
comment
=
json
.
dumps
(
dict
(
comment
=
comment
,
commit
=
self
.
project_version
,
test
=
kwargs
.
get
(
"
func_name
"
,
""
)))
comment
=
json
.
dumps
(
dict
(
comment
=
comment
,
commit
=
self
.
project_version
,
test
=
kwargs
.
get
(
"
func_name
"
,
""
)))
flags
=
self
.
_assureDtype
(
flags
,
field
)
flags
=
self
.
_assureDtype
(
flags
,
field
)
.
copy
()
flag
=
self
.
BAD
if
flag
is
None
else
self
.
_checkFlag
(
flag
)
flag
=
self
.
BAD
if
flag
is
None
else
self
.
_checkFlag
(
flag
)
# set
# set
flags
=
flags
.
copy
()
flags_loc
,
rows
,
col
=
self
.
_getIndexer
(
self
.
getFlags
(
flags
),
field
,
loc
,
iloc
)
indexer
,
rows
,
col
=
self
.
_getIndexer
(
self
.
getFlags
(
flags
),
field
,
loc
,
iloc
)
if
isinstance
(
flag
,
pd
.
Series
):
if
isinstance
(
flag
,
pd
.
Series
):
if
len
(
flags
.
index
)
!=
len
(
flags
):
raise
ValueError
(
'
Length of flags and flag must match
'
)
i
,
r
,
_
=
self
.
_getIndexer
(
flag
,
field
,
loc
,
iloc
)
i
,
r
,
_
=
self
.
_getIndexer
(
flag
,
field
,
loc
,
iloc
)
flag
=
i
[
r
]
flag
=
i
[
r
].
squeeze
()
if
force
:
if
force
:
idx
=
indexer
[
rows
,
col
].
index
mask
=
[
True
]
*
len
(
rows
)
idx
=
flags_loc
[
rows
,
col
].
index
else
:
else
:
mask
=
indexer
[
rows
,
col
]
<
flag
mask
=
flags_loc
[
rows
,
col
]
<
flag
idx
=
mask
[
mask
].
index
idx
=
mask
[
mask
].
index
if
isinstance
(
flag
,
pd
.
Series
):
flag
=
flag
[
mask
]
flags
.
loc
[
idx
,
field
]
=
flag
,
cause
,
comment
flags
.
loc
[
idx
,
field
]
=
flag
,
cause
,
comment
return
self
.
_assureDtype
(
flags
,
field
)
return
self
.
_assureDtype
(
flags
,
field
)
...
@@ -95,6 +102,6 @@ class DmpFlagger(BaseFlagger):
...
@@ -95,6 +102,6 @@ class DmpFlagger(BaseFlagger):
flags
=
super
().
_assureDtype
(
flags
,
None
)
flags
=
super
().
_assureDtype
(
flags
,
None
)
else
:
# we got a df with a multi-index
else
:
# we got a df with a multi-index
flags
=
flags
.
astype
({
c
:
self
.
flags
for
c
in
flags
.
columns
if
FlagFields
.
FLAG
in
c
})
flags
=
flags
.
astype
({
c
:
self
.
flags
for
c
in
flags
.
columns
if
FlagFields
.
FLAG
in
c
})
elif
not
isinstance
(
flags
[(
field
,
FlagFields
.
FLAG
)].
dtype
,
pd
.
Categorical
):
elif
not
isinstance
(
flags
[(
field
,
FlagFields
.
FLAG
)].
dtype
,
pd
.
Categorical
Dtype
):
flags
[(
field
,
FlagFields
.
FLAG
)]
=
flags
[(
field
,
FlagFields
.
FLAG
)].
astype
(
self
.
flags
)
flags
[(
field
,
FlagFields
.
FLAG
)]
=
flags
[(
field
,
FlagFields
.
FLAG
)].
astype
(
self
.
flags
)
return
flags
return
flags
This diff is collapsed.
Click to expand it.
test/flagger/test_flagger.py
+
147
−
5
View file @
24845912
#!/usr/bin/env python
#!/usr/bin/env python
"""
docstring: TODO
"""
__author__
=
"
Bert Palm
"
__author__
=
"
Bert Palm
"
__email__
=
"
bert.palm@ufz.de
"
__email__
=
"
bert.palm@ufz.de
"
__copyright__
=
"
Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
"
__copyright__
=
"
Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
"
\ No newline at end of file
import
pytest
import
numpy
as
np
import
pandas
as
pd
from
saqc.flagger.baseflagger
import
BaseFlagger
from
saqc.flagger.dmpflagger
import
DmpFlagger
from
saqc.flagger.simpleflagger
import
SimpleFlagger
from
pandas.core.indexing
import
IndexingError
from
saqc.funcs.functions
import
flagRange
,
flagSesonalRange
,
forceFlags
,
clearFlags
TESTFLAGGERS
=
[
BaseFlagger
([
'
NIL
'
,
'
GOOD
'
,
'
BAD
'
]),
DmpFlagger
(),
SimpleFlagger
()]
@pytest.mark.parametrize
(
'
flagger
'
,
TESTFLAGGERS
)
def
test_initFlags
(
flagger
):
field
=
'
testdata
'
index
=
pd
.
date_range
(
start
=
'
2011-01-01
'
,
end
=
'
2011-01-02
'
,
periods
=
100
)
data
=
pd
.
DataFrame
(
data
=
{
field
:
np
.
linspace
(
0
,
index
.
size
-
1
,
index
.
size
)},
index
=
index
)
flags
=
flagger
.
initFlags
(
data
)
assert
len
(
flags
)
==
100
assert
isinstance
(
flags
,
pd
.
DataFrame
)
@pytest.mark.parametrize
(
'
flagger
'
,
TESTFLAGGERS
)
def
test_getsetFlags
(
flagger
):
field
=
'
testdata
'
index
=
pd
.
date_range
(
start
=
'
2011-01-01
'
,
end
=
'
2011-01-02
'
,
periods
=
100
)
data
=
pd
.
DataFrame
(
data
=
{
field
:
np
.
linspace
(
0
,
index
.
size
-
1
,
index
.
size
)},
index
=
index
)
flags
=
flagger
.
initFlags
(
data
)
flags
=
flagger
.
setFlags
(
flags
,
field
,
flag
=
flagger
.
GOOD
)
flagged
=
flagger
.
getFlags
(
flags
)[
field
]
assert
isinstance
(
flagged
.
dtype
,
pd
.
CategoricalDtype
)
assert
(
flagged
==
flagger
.
GOOD
).
all
()
flags
=
flagger
.
setFlags
(
flags
,
field
,
flag
=
flagger
.
BAD
)
flagged
=
flagger
.
getFlags
(
flags
)[
field
]
assert
(
flagged
==
flagger
.
BAD
).
all
()
flags
=
flagger
.
setFlags
(
flags
,
field
,
flag
=
flagger
.
GOOD
)
flagged
=
flagger
.
getFlags
(
flags
)[
field
]
assert
(
flagged
==
flagger
.
BAD
).
all
()
@pytest.mark.parametrize
(
'
flagger
'
,
TESTFLAGGERS
)
def
test_setFlags_isFlagged
(
flagger
,
**
kwargs
):
field
=
'
testdata
'
index
=
pd
.
date_range
(
start
=
'
2011-01-01
'
,
end
=
'
2011-01-02
'
,
periods
=
100
)
data
=
pd
.
DataFrame
(
data
=
{
field
:
np
.
linspace
(
0
,
index
.
size
-
1
,
index
.
size
)},
index
=
index
)
flags
=
flagger
.
initFlags
(
data
)
d
=
data
[
field
]
mask
=
d
<
(
d
.
max
()
-
d
.
min
())
//
2
assert
len
(
mask
)
==
len
(
flags
.
index
)
f
=
flagger
.
setFlags
(
flags
,
field
,
loc
=
mask
.
values
,
flag
=
flagger
.
BAD
)
# test isFlagged
isflagged
=
flagger
.
isFlagged
(
f
[
field
])
assert
(
isflagged
==
mask
).
all
()
# test setFlag with mask
flagged
=
flagger
.
getFlags
(
f
[
field
])
isflagged
=
flagged
==
flagger
.
BAD
assert
(
isflagged
==
mask
).
all
()
# ok we can use isFlagged now :D
# test with mask and iloc
f
=
flagger
.
setFlags
(
flags
,
field
,
iloc
=
mask
.
values
,
flag
=
flagger
.
BAD
)
isflagged
=
flagger
.
isFlagged
(
f
[
field
])
assert
(
isflagged
==
mask
).
all
()
try
:
m
=
mask
[
mask
]
m
.
iloc
[
0
:
10
]
=
False
m
=
m
[
m
]
f
=
flagger
.
setFlags
(
flags
,
field
,
loc
=
m
,
flag
=
flagger
.
BAD
)
except
IndexingError
:
pass
else
:
raise
AssertionError
# test setFlags with loc and index
idx
=
mask
[
mask
].
index
assert
len
(
idx
)
<
len
(
flags
.
index
)
f
=
flagger
.
setFlags
(
flags
,
field
,
loc
=
idx
,
flag
=
flagger
.
BAD
)
isflagged
=
flagger
.
isFlagged
(
f
[
field
])
assert
(
isflagged
==
mask
).
all
()
# test setFlags with iloc and index
idx
=
mask
[
mask
].
reset_index
(
drop
=
True
).
index
assert
len
(
idx
)
<
len
(
flags
.
index
)
f
=
flagger
.
setFlags
(
flags
,
field
,
iloc
=
idx
,
flag
=
flagger
.
BAD
)
isflagged
=
flagger
.
isFlagged
(
f
[
field
])
assert
(
isflagged
==
mask
).
all
()
# test passing a series of flags as flag-arg
every
=
5
flagseries
=
pd
.
Series
(
data
=
flagger
.
GOOD
,
index
=
flags
.
index
)
flagseries
.
iloc
[::
every
]
=
flagger
.
BAD
flagseries
=
flagseries
.
astype
(
flagger
.
flags
)
idx
=
mask
[
mask
].
index
assert
len
(
flags
)
==
len
(
flagseries
)
assert
len
(
flags
)
!=
len
(
idx
)
f
=
flagger
.
setFlags
(
flags
,
field
,
loc
=
idx
,
flag
=
flagseries
)
bads
=
flagger
.
isFlagged
(
f
[
field
],
flag
=
flagger
.
BAD
,
comparator
=
'
==
'
)
bads
=
bads
[
bads
]
valid
=
mask
[
mask
].
iloc
[::
every
]
assert
len
(
valid
)
==
len
(
bads
)
and
(
valid
==
bads
).
all
()
# test passing a series of flags as flag-arg and force
f
=
flagger
.
setFlags
(
flags
,
field
,
flag
=
flagger
.
BAD
)
every
=
5
flagseries
=
pd
.
Series
(
data
=
flagger
.
GOOD
,
index
=
flags
.
index
)
flagseries
.
iloc
[::
every
]
=
flagger
.
UNFLAGGED
flagseries
=
flagseries
.
astype
(
flagger
.
flags
)
idx
=
mask
[
mask
].
index
assert
len
(
flags
)
==
len
(
flagseries
)
assert
len
(
flags
)
!=
len
(
idx
)
f
=
flagger
.
setFlags
(
f
,
field
,
loc
=
idx
,
flag
=
flagseries
,
force
=
True
)
unflagged
=
flagger
.
isFlagged
(
f
[
field
],
flag
=
flagger
.
UNFLAGGED
,
comparator
=
'
==
'
)
unflagged
=
unflagged
[
unflagged
]
valid
=
mask
[
mask
].
iloc
[::
every
]
assert
len
(
valid
)
==
len
(
unflagged
)
and
(
valid
==
unflagged
).
all
()
if
__name__
==
'
__main__
'
:
flagger
=
TESTFLAGGERS
[
0
]
test_setFlags_isFlagged
(
flagger
)
print
(
'
done
'
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment