Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
SaQC
Manage
Activity
Members
Labels
Plan
Issues
36
Issue boards
Milestones
Wiki
Code
Merge requests
8
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
rdm-software
SaQC
Commits
206cefc9
Commit
206cefc9
authored
4 years ago
by
Peter Lünenschloß
Browse files
Options
Downloads
Patches
Plain Diff
data modelling test module implemented/ bugfixes in polynomial fit
parent
c80b919e
No related branches found
Branches containing commit
No related tags found
Tags containing commit
3 merge requests
!193
Release 1.4
,
!188
Release 1.4
,
!49
Dataprocessing features
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
saqc/funcs/data_modelling.py
+25
-14
25 additions, 14 deletions
saqc/funcs/data_modelling.py
saqc/lib/ts_operators.py
+2
-2
2 additions, 2 deletions
saqc/lib/ts_operators.py
test/funcs/test_data_modelling.py
+15
-8
15 additions, 8 deletions
test/funcs/test_data_modelling.py
with
42 additions
and
24 deletions
saqc/funcs/data_modelling.py
+
25
−
14
View file @
206cefc9
...
...
@@ -38,6 +38,8 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f
(1) Harmonization/resampling of your data will have a noticeable impact on polyfitting's performance - since
numba_boost doesn't apply for irregularly sampled data in the current implementation.
Note, that in the current implementation, the initial and final winsz/2 values do not get fitted.
Parameters
----------
winsz : integer or offset String
...
...
@@ -55,10 +57,11 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f
eval_flags : boolean, default True
Whether or not to assign new flags to the calculated residuals. If True, a residual gets assigned the worst
flag present in the interval from which the data for its calculation was obtained.
min_periods : integer, default 0
min_periods : integer
or np.nan
, default 0
The minimum number of periods that have to be available in every value's fitting surrounding for the polynomial
fit to be performed. If there are not enough values, np.nan gets assigned. Default (0) results in fitting
regardless of the number of values present (results in overfitting for too sparse intervals).
regardless of the number of values present (results in overfitting for too sparse intervals). To automatically
set the minimum number of periods to the number of values in an offset defined window size, pass np.nan.
kwargs
Returns
...
...
@@ -90,9 +93,9 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f
residues
[
residues
.
index
[
centers_iloc
[
-
1
]]:
residues
.
index
[
-
1
]]
=
np
.
nan
else
:
if
isinstance
(
winsz
,
str
):
winsz
=
np
.
floor
(
pd
.
Timedelta
(
winsz
)
/
pd
.
Timedelta
(
to_fit
.
index
.
freqstr
))
if
winsz
%
2
==
1
:
winsz
=
winsz
-
1
winsz
=
int
(
np
.
floor
(
pd
.
Timedelta
(
winsz
)
/
pd
.
Timedelta
(
to_fit
.
index
.
freqstr
))
)
if
winsz
%
2
==
0
:
winsz
=
int
(
winsz
-
1
)
if
numba
==
'
auto
'
:
if
to_fit
.
shape
[
0
]
<
200000
:
numba
=
False
...
...
@@ -100,11 +103,13 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f
numba
=
True
val_range
=
np
.
arange
(
0
,
winsz
)
center_index
=
np
.
floor
(
winsz
/
2
)
center_index
=
int
(
np
.
floor
(
winsz
/
2
)
)
if
min_periods
<
winsz
:
if
min_periods
>
0
:
max_nan_total
=
winsz
-
min_periods
to_fit
=
to_fit
.
rolling
(
winsz
,
center
=
True
).
apply
(
validationAgg
,
raw
=
True
,
args
=
(
max_nan_total
))
to_fit
=
to_fit
.
rolling
(
winsz
,
min_periods
=
min_periods
,
center
=
True
).
apply
(
lambda
x
,
y
:
x
[
y
],
raw
=
True
,
args
=
(
center_index
,))
# we need a missing value marker that is not nan, because nan values don't get passed by pandas' rolling
# method
miss_marker
=
to_fit
.
min
()
...
...
@@ -112,20 +117,26 @@ def modelling_polyFit(data, field, flagger, winsz, polydeg, numba='auto', eval_f
na_mask
=
to_fit
.
isna
()
to_fit
[
na_mask
]
=
miss_marker
if
numba
:
residues
=
to_fit
.
rolling
(
winsz
,
center
=
True
).
apply
(
polyRoller_numba
,
args
=
(
miss_marker
,
val_range
,
center_index
,
polydeg
),
residues
=
to_fit
.
rolling
(
winsz
).
apply
(
polyRoller_numba
,
args
=
(
miss_marker
,
val_range
,
center_index
,
polydeg
),
raw
=
True
,
engine
=
'
numba
'
,
engine_kwargs
=
{
'
no_python
'
:
True
})
# due to a tiny bug - rolling with center=True doesn't work when using the numba engine.
residues
=
residues
.
shift
(
-
int
(
center_index
))
else
:
residues
=
to_fit
.
rolling
(
winsz
,
center
=
True
).
apply
(
polyRoller
,
args
=
(
miss_marker
,
val_range
,
center_index
,
polydeg
),
raw
=
True
)
residues
=
to_fit
.
rolling
(
winsz
,
center
=
True
).
apply
(
polyRoller
,
args
=
(
miss_marker
,
val_range
,
center_index
,
polydeg
),
raw
=
True
)
residues
[
na_mask
]
=
np
.
nan
else
:
# we only fit fully populated intervals:
if
numba
:
residues
=
to_fit
.
rolling
(
winsz
,
center
=
True
).
apply
(
polyRollerNoMissing_numba
,
args
=
(
val_range
,
center_index
,
polydeg
),
residues
=
to_fit
.
rolling
(
winsz
).
apply
(
polyRollerNoMissing_numba
,
args
=
(
val_range
,
center_index
,
polydeg
),
engine
=
'
numba
'
,
engine_kwargs
=
{
'
no_python
'
:
True
},
raw
=
True
)
# due to a tiny bug - rolling with center=True doesn't work when using the numba engine.
residues
=
residues
.
shift
(
-
int
(
center_index
))
else
:
residues
=
to_fit
.
rolling
(
winsz
,
center
=
True
).
apply
(
polyRollerNoMissing
,
args
=
(
val_range
,
center_index
,
polydeg
),
raw
=
True
)
...
...
This diff is collapsed.
Click to expand it.
saqc/lib/ts_operators.py
+
2
−
2
View file @
206cefc9
...
...
@@ -342,8 +342,8 @@ def polynomialInterpolation(data, inter_limit=2, inter_order=2):
return
interpolateNANs
(
data
,
'
polynomial
'
,
inter_limit
=
inter_limit
,
order
=
inter_order
)
def
validationAgg
(
x
,
max_nan_total
):
return
validationTrafo
(
x
,
max_nan_total
=
max_nan_total
)[
0
]
def
validationAgg
(
x
,
max_nan_total
,
max_nan_consec
):
return
validationTrafo
(
x
,
max_nan_total
=
max_nan_total
,
max_nan_consec
=
max_nan_consec
)[
0
]
@nb.njit
...
...
This diff is collapsed.
Click to expand it.
test/funcs/test_data_modelling.py
+
15
−
8
View file @
206cefc9
...
...
@@ -8,23 +8,30 @@ import pandas as pd
from
dios
import
dios
from
test.common
import
TESTFLAGGER
import
matplotlib.pyplot
as
plt
from
saqc.funcs.data_modelling
import
(
modelling_polyFit
)
import
numpy.polynomial.polynomial
as
poly
@pytest.mark.parametrize
(
"
flagger
"
,
T
ESTFLAGGER
)
TF
=
TESTFLAGGER
[:
1
]
@pytest.mark.parametrize
(
"
flagger
"
,
T
F
)
@pytest.mark.parametrize
(
"
dat
"
,
[
pytest
.
lazy_fixture
(
"
course_2
"
)])
def
test_modelling_polyFit_forRegular
(
dat
,
flagger
):
data
,
_
=
dat
(
freq
=
'
10min
'
,
periods
=
10
0
,
initial_level
=
0
,
final_level
=
100
,
out_val
=-
100
)
data
,
_
=
dat
(
freq
=
'
10min
'
,
periods
=
3
0
,
initial_level
=
0
,
final_level
=
100
,
out_val
=-
100
)
# add some nice sine distortion
data
+
=
np
.
sin
(
np
.
arange
(
0
,
len
(
data
)))
data
=
data
+
10
*
np
.
sin
(
np
.
arange
(
0
,
len
(
data
.
indexes
[
0
]
)))
data
=
dios
.
DictOfSeries
(
data
)
flagger
=
flagger
.
initFlags
(
data
)
result1
,
_
=
modelling_polyFit
(
data
,
'
data
'
,
flagger
,
11
,
2
,
numba
=
False
)
result2
,
_
=
modelling_polyFit
(
data
,
'
data
'
,
flagger
,
11
,
2
,
numba
=
True
)
assert
(
result1
[
'
data
'
]
-
result2
[
'
data
'
]).
abs
().
max
()
<
10
**-
10
result3
,
_
=
modelling_polyFit
(
data
,
'
data
'
,
flagger
,
'
110min
'
,
2
,
numba
=
False
)
assert
result3
[
'
data
'
].
equals
(
result1
[
'
data
'
])
result4
,
_
=
modelling_polyFit
(
data
,
'
data
'
,
flagger
,
11
,
2
,
numba
=
True
,
min_periods
=
11
)
assert
(
result4
[
'
data
'
]
-
result2
[
'
data
'
]).
abs
().
max
()
<
10
**-
10
data
.
iloc
[
13
:
16
]
=
np
.
nan
result5
,
_
=
modelling_polyFit
(
data
,
'
data
'
,
flagger
,
11
,
2
,
numba
=
True
,
min_periods
=
9
)
assert
result5
[
'
data
'
].
iloc
[
10
:
19
].
isna
().
all
()
result1
,
_
=
modelling_polyFit
(
data
,
'
data
'
,
flagger
,
11
,
numba
=
False
)
result2
,
_
=
modelling_polyFit
(
data
,
'
data
'
,
flagger
,
11
,
numba
=
True
)
result3
,
_
=
modelling_polyfit
(
data
,
'
data
'
,
flagger
,
'
2h
'
,
numba
=
False
)
result4
,
_
=
modelling_polyfit
(
data
,
'
data
'
,
flagger
,
'
2h
'
,
numba
=
True
)
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment