Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • berntm/saqc
  • rdm-software/saqc
  • schueler/saqc
3 results
Show changes
Commits on Source (84)
Showing
with 199 additions and 235 deletions
...@@ -5,3 +5,4 @@ ...@@ -5,3 +5,4 @@
*.feather filter=lfs diff=lfs merge=lfs -text *.feather filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text *.pkl filter=lfs diff=lfs merge=lfs -text
resources/machine_learning/data/soil_moisture_mwe.feather filter=lfs diff=lfs merge=lfs -text resources/machine_learning/data/soil_moisture_mwe.feather filter=lfs diff=lfs merge=lfs -text
saqc/_version.py export-subst
...@@ -29,7 +29,7 @@ jobs: ...@@ -29,7 +29,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
os: ["windows-latest", "ubuntu-latest", "macos-latest"] os: ["windows-latest", "ubuntu-latest", "macos-latest"]
python-version: ["3.8", "3.9", "3.10"] python-version: ["3.9", "3.10", "3.11"]
defaults: defaults:
run: run:
# somehow this also works for windows O.o ?? # somehow this also works for windows O.o ??
......
...@@ -2,6 +2,20 @@ ...@@ -2,6 +2,20 @@
# #
# SPDX-License-Identifier: GPL-3.0-or-later # SPDX-License-Identifier: GPL-3.0-or-later
# ===========================================================
# Hints
# ===========================================================
# $PYPI_PKG_NAME
# The variable PYPI_PKG_NAME is used in setup.py to determine
# how to name the tarball package. If not set the package is
# named 'saqc'.
# $TESTPYPI_TOKEN
# The upload token used for testpypi. Set it on the GitLab
# page and enable masking to prevent revealing it.
# =========================================================== # ===========================================================
# preparation # preparation
# =========================================================== # ===========================================================
...@@ -75,9 +89,9 @@ coverage: ...@@ -75,9 +89,9 @@ coverage:
path: coverage.xml path: coverage.xml
python38: python39:
stage: test stage: test
image: python:3.8 image: python:3.9
script: script:
- pytest tests -Werror --junitxml=report.xml - pytest tests -Werror --junitxml=report.xml
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv - python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
...@@ -87,9 +101,9 @@ python38: ...@@ -87,9 +101,9 @@ python38:
junit: report.xml junit: report.xml
python39: python310:
stage: test stage: test
image: python:3.9 image: python:3.10
script: script:
- pytest tests -Werror --junitxml=report.xml - pytest tests -Werror --junitxml=report.xml
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv - python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
...@@ -98,10 +112,9 @@ python39: ...@@ -98,10 +112,9 @@ python39:
reports: reports:
junit: report.xml junit: report.xml
python311:
python310:
stage: test stage: test
image: python:3.10 image: python:3.11
script: script:
- pytest tests -Werror --junitxml=report.xml - pytest tests -Werror --junitxml=report.xml
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv - python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
...@@ -110,17 +123,6 @@ python310: ...@@ -110,17 +123,6 @@ python310:
reports: reports:
junit: report.xml junit: report.xml
# python311:
# stage: test
# image: python:3.11
# script:
# - pytest tests -Werror --junitxml=report.xml
# - python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
# artifacts:
# when: always
# reports:
# junit: report.xml
doctest: doctest:
stage: test stage: test
script: script:
...@@ -135,50 +137,61 @@ doctest: ...@@ -135,50 +137,61 @@ doctest:
# =========================================================== # ===========================================================
# check if we are able to build a wheel # check if we are able to build a wheel
# and if the import works # and if the import works
wheel38:
stage: build
image: python:3.8
script:
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
wheel39: wheel39:
stage: build stage: build
image: python:3.9 image: python:3.9
variables:
PYPI_PKG_NAME: "saqc-dev"
script: script:
- pip install wheel - pip install wheel
- pip wheel . - pip wheel .
- pip install . - pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")' - python -c 'import saqc; print(f"{saqc.__version__=}")'
wheel310: wheel310:
stage: build stage: build
image: python:3.10 image: python:3.10
variables:
PYPI_PKG_NAME: "saqc-dev"
script: script:
- pip install wheel - pip install wheel
- pip wheel . - pip wheel .
- pip install . - pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")' - python -c 'import saqc; print(f"{saqc.__version__=}")'
wheel311: wheel311:
stage: build stage: build
image: python:3.11 image: python:3.11
variables:
PYPI_PKG_NAME: "saqc-dev"
script: script:
- pip install wheel - pip install wheel
- pip wheel . - pip wheel .
- pip install . - pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")' - python -c 'import saqc; print(f"{saqc.__version__=}")'
docs:
stage: build
script:
- cd docs
- pip install -r requirements.txt
- make doc
# =========================================================== # ===========================================================
# Extra Pipeline (run with a successful run of all other jobs on develop) # Extra Pipeline (run with a successful run of all other jobs on develop)
# =========================================================== # ===========================================================
upload_testpypi:
stage: deploy
only:
- develop
except:
- schedules
variables:
PYPI_PKG_NAME: "saqc-dev"
TWINE_USERNAME: __token__
TWINE_PASSWORD: $TESTPYPI_TOKEN
script:
- pip install build twine
- python -m build
- twine check --strict dist/*
- twine upload -r testpypi dist/*
# make html docu with sphinx # make html docu with sphinx
pages: pages:
stage: deploy stage: deploy
......
...@@ -5,19 +5,48 @@ SPDX-License-Identifier: GPL-3.0-or-later ...@@ -5,19 +5,48 @@ SPDX-License-Identifier: GPL-3.0-or-later
--> -->
# Changelog # Changelog
## Unreleased ## Unreleased
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.0...develop) [List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.5.0...develop)
### Added ### Added
### Changed ### Changed
- pin pandas to versions >= 2.0
### Removed ### Removed
- removed deprecated `DictOfSeries.to_df`
### Fixed ### Fixed
## [2.5.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.5.0) - 2023-06-22
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.1...v2.5.0)
### Added
- `SaQC.plot`:
- enable multivariate plots
- keyword `plot_kwargs` to pass matplotlib related arguments
- CLI:
- `--version` to print the SaQC version
- `-ll` as a shorthand for `--log-level`
- `--json-field` to use a non-root element of a json file.
- basic json support for CLI config files, which are detected by `.json`-extension.
- `SaQC.flagScatterLowpass`: option to select function based on string names.
- Checks and unified error message for common function inputs.
### Changed
- Require pandas >= 2.0
- `SaQC.flagUniLOF` and `SaQC.assignUniLOF`: changed parameter `fill_na` to type `bool`.
- `SaQC.plot`:
- changed default color for single variables to `black` with `80% transparency`
- added separate legend for flags
### Removed
- `SaQC.plot`: option to plot with complete history (`history="complete"`)
- Support for Python 3.8
### Fixed
- `SaQC.assignChangePointCluster` and `SaQC.flagChangePoints`: A tuple passed to `min_period`
was only recognised if `window` was also a tuple.
- `SaQC.propagateFlags` was overwriting existing flags
### Deprecated ### Deprecated
- `SaQC.andGroup` and `SaQC.orGroup`: option to pass dictionaries to `group`.
- `SaQC.plot`:
- `phaseplot` in favor of usage with `mode="biplot"`
- `cyclestart` in favor of usage with `marker_kwargs`
- `SaQC.flagStatLowPass` in favor of `SaQC.flagScatterLowpass`
## [2.4.1](https://git.ufz.de/rdm-software/saqc/-/tags/v2.4.1) - 2023-06-22 ## [2.4.1](https://git.ufz.de/rdm-software/saqc/-/tags/v2.4.1) - 2023-06-22
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.0...develop) [List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.0...v2.4.1)
### Added ### Added
### Changed ### Changed
- pin pandas to versions >= 2.0 - pin pandas to versions >= 2.0
...@@ -44,14 +73,13 @@ SPDX-License-Identifier: GPL-3.0-or-later ...@@ -44,14 +73,13 @@ SPDX-License-Identifier: GPL-3.0-or-later
- `func` arguments in text configurations were not parsed correctly - `func` arguments in text configurations were not parsed correctly
- fail on duplicated arguments to test methods - fail on duplicated arguments to test methods
- `reample` was not writing meta entries - `reample` was not writing meta entries
- `flagByStatLowPass` was overwriting existing flags - `flagByScatterLowpass` was overwriting existing flags
- `flagUniLOF` and `flagLOF` were overwriting existing flags - `flagUniLOF` and `flagLOF` were overwriting existing flags
### Deprecated ### Deprecated
- Deprecate `flagMVScore` parameters: `partition` in favor of `window`, `partition_min` in favor of `min_periods`, `min_periods` in favor of `min_periods_r` - Deprecate `flagMVScore` parameters: `partition` in favor of `window`, `partition_min` in favor of `min_periods`, `min_periods` in favor of `min_periods_r`
- Deprecate `interpolate`, `linear` and `shift` in favor of `align` - Deprecate `interpolate`, `linear` and `shift` in favor of `align`
- Deprecate `roll` in favor of `rolling` - Deprecate `roll` in favor of `rolling`
- Deprecate `DictOfSeries.to_df` in favor of `DictOfSeries.to_pandas` - Deprecate `DictOfSeries.to_df` in favor of `DictOfSeries.to_pandas`
## [2.3.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.3.0) - 2023-01-17 ## [2.3.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.3.0) - 2023-01-17
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.2.1...v2.3.0) [List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.2.1...v2.3.0)
### Added ### Added
......
...@@ -59,7 +59,7 @@ It is not a shame to name a parameter just `n` or `alpha` etc., if, for example, ...@@ -59,7 +59,7 @@ It is not a shame to name a parameter just `n` or `alpha` etc., if, for example,
### Test Functions ### Test Functions
- testnames: [testmodule_]flagTestName - testnames: flagTestName
## Formatting ## Formatting
We use [black](https://black.readthedocs.io/en/stable/) in its default settings. We use [black](https://black.readthedocs.io/en/stable/) in its default settings.
...@@ -70,13 +70,17 @@ Only absolute imports are accepted. ...@@ -70,13 +70,17 @@ Only absolute imports are accepted.
# Development Workflow # Development Workflow
## Releases
Every release is planned by an associated Milestone. This milestone should have an end date, usually the first of the month in which the next release is planned, and contain all issues/merge requests to include.
## Repository Structure ## Repository Structure
- `master` - branch: - `main` - branch:
+ Stable and usually protected. + Stable and usually protected.
+ Regular merges from `develop`, these merges are tagged and increasing at least the minor version. + Regular merges from `develop`, these merges are tagged and increasing at least the minor version.
+ Irregular merges from `develop` in case of critical bugs. Such merges increase at least the patch level. + Irregular merges from `develop` in case of critical bugs. Such merges increase at least the patch level.
+ Merges into `master` usually lead to a PyPI release. + Merges into `main` usually lead to a PyPI release.
- `develop` - branch: - `develop` - branch:
+ The main development branch, no hard stability requirements/guarantees. + The main development branch, no hard stability requirements/guarantees.
+ Merges into `develop` should mostly follow a [Merge Request Workflow](#merge-request-workflow), minor changes can however be committed directly. Such minor changes include: + Merges into `develop` should mostly follow a [Merge Request Workflow](#merge-request-workflow), minor changes can however be committed directly. Such minor changes include:
...@@ -105,6 +109,6 @@ Only absolute imports are accepted. ...@@ -105,6 +109,6 @@ Only absolute imports are accepted.
release date. Commits to `develop` after the merge window of a release closes need to be integrated during the subsequent release release date. Commits to `develop` after the merge window of a release closes need to be integrated during the subsequent release
cycle cycle
- The release cycle is organized by Gitlab Milestones, the expiration date of a certain milestone indicates the end of the - The release cycle is organized by Gitlab Milestones, the expiration date of a certain milestone indicates the end of the
related merge window, the actual merge into `master` and the accompanying release is scheduled for the week after the related merge window, the actual merge into `main` and the accompanying release is scheduled for the week after the
milestones expiration date. milestones expiration date.
- Issues and Merge Requests can and should be associated with these milestones, as this helps in the organization of review activities.
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means.
In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>
...@@ -14,30 +14,39 @@ SPDX-License-Identifier: GPL-3.0-or-later ...@@ -14,30 +14,39 @@ SPDX-License-Identifier: GPL-3.0-or-later
# SaQC: System for automated Quality Control # SaQC: System for automated Quality Control
Anomalies and errors are the rule not the exception when working with `SaQC` is a tool/framework/application to quality control time series data.
time series data. This is especially true, if such data originates It provides
from in-situ measurements of environmental properties. a growing collection of algorithms and methods to analyze, annotate and
Almost all applications, however, implicily rely on data, that complies process timeseries data. It supports the end to end enrichment of metadata
with some definition of 'correct'. and provides various user interfaces: 1) a Python API, 2) a command line interface
In order to infer reliable data products and tools, there is no alternative with a text based configuration system and a
to quality control. SaQC provides all the building blocks to comfortably [web based user interface](https://webapp.ufz.de/saqc-config-app/)
bridge the gap between 'usually faulty' and 'expected to be corrected' in
a accessible, consistent, objective and reproducible way. `SaQC` is designed with a particular focus on the needs of active data professionals,
including sensor hardware-oriented engineers, domain experts, and data scientists,
all of whom can benefit from its capabilities to improve the quality standards of given data products.
For a (continously improving) overview of features, typical usage patterns, For a (continously improving) overview of features, typical usage patterns,
the specific system components and how to customize `SaQC` to your specific the specific system components and how to customize `SaQC` to your own
needs, please refer to our needs, please refer to our
[online documentation](https://rdm-software.pages.ufz.de/saqc/index.html). [online documentation](https://rdm-software.pages.ufz.de/saqc/index.html).
## Installation ## Installation
SaQC is available on the Python Package Index ([PyPI](https://pypi.org/)) and `SaQC` is available on the Python Package Index ([PyPI](https://pypi.org/)) and
can be installed using [pip](https://pip.pypa.io/en/stable/): can be installed using [pip](https://pip.pypa.io/en/stable/):
```sh ```sh
python -m pip install saqc python -m pip install saqc
``` ```
For a more detailed installion guide, see the [installation guide](https://rdm-software.pages.ufz.de/saqc/gettingstarted/InstallationGuide.html). Additionally `SaQC` is available via conda and can be installed with:
```sh
conda create -c conda-forge -n saqc saqc
```
For more details, see the [installation guide](https://rdm-software.pages.ufz.de/saqc/gettingstarted/InstallationGuide.html).
## Usage ## Usage
...@@ -57,7 +66,7 @@ SM2 ; shift(freq="15Min") ...@@ -57,7 +66,7 @@ SM2 ; shift(freq="15Min")
'SM(1|2)+' ; flagMissing() 'SM(1|2)+' ; flagMissing()
SM1 ; flagRange(min=10, max=60) SM1 ; flagRange(min=10, max=60)
SM2 ; flagRange(min=10, max=40) SM2 ; flagRange(min=10, max=40)
SM2 ; flagMAD(window="30d", z=3.5) SM2 ; flagZScore(window="30d", thresh=3.5, method='modified', center=False)
Dummy ; flagGeneric(field=["SM1", "SM2"], func=(isflagged(x) | isflagged(y))) Dummy ; flagGeneric(field=["SM1", "SM2"], func=(isflagged(x) | isflagged(y)))
``` ```
...@@ -92,30 +101,27 @@ data = pd.read_csv( ...@@ -92,30 +101,27 @@ data = pd.read_csv(
index_col=0, parse_dates=True, index_col=0, parse_dates=True,
) )
saqc = SaQC(data=data) qc = SaQC(data=data)
saqc = (saqc qc = (qc
.shift("SM2", freq="15Min") .shift("SM2", freq="15Min")
.flagMissing("SM(1|2)+", regex=True) .flagMissing("SM(1|2)+", regex=True)
.flagRange("SM1", min=10, max=60) .flagRange("SM1", min=10, max=60)
.flagRange("SM2", min=10, max=40) .flagRange("SM2", min=10, max=40)
.flagMAD("SM2", window="30d", z=3.5) .flagZScore("SM2", window="30d", thresh=3.5, method='modified', center=False)
.flagGeneric(field=["SM1", "SM2"], target="Dummy", func=lambda x, y: (isflagged(x) | isflagged(y)))) .flagGeneric(field=["SM1", "SM2"], target="Dummy", func=lambda x, y: (isflagged(x) | isflagged(y))))
``` ```
A more detailed description of the Python API is available in the A more detailed description of the Python API is available in the
[respective section](https://rdm-software.pages.ufz.de/saqc/gettingstarted/TutorialAPI.html) [respective section](https://rdm-software.pages.ufz.de/saqc/gettingstarted/TutorialAPI.html)
of the documentation. of the documentation.
## Changelog
All notable changes to this project will be documented in [CHANGELOG.md](CHANGELOG.md).
## Get involved ## Get involved
### Contributing ### Contributing
You found a bug or you want to suggest some cool features? Please refer to our [contributing guidelines](CONTRIBUTING.md) to see how you can contribute to SaQC. You found a bug or you want to suggest new features? Please refer to our [contributing guidelines](CONTRIBUTING.md) to see how you can contribute to SaQC.
### User support ### User support
If you need help or have a question, you can use the SaQC user support mailing list: [saqc-support@ufz.de](mailto:saqc-support@ufz.de) If you need help or have questions, send us an email to [saqc-support@ufz.de](mailto:saqc-support@ufz.de)
## Copyright and License ## Copyright and License
Copyright(c) 2021, [Helmholtz-Zentrum für Umweltforschung GmbH -- UFZ](https://www.ufz.de). All rights reserved. Copyright(c) 2021, [Helmholtz-Zentrum für Umweltforschung GmbH -- UFZ](https://www.ufz.de). All rights reserved.
...@@ -125,17 +131,18 @@ Copyright(c) 2021, [Helmholtz-Zentrum für Umweltforschung GmbH -- UFZ](https:// ...@@ -125,17 +131,18 @@ Copyright(c) 2021, [Helmholtz-Zentrum für Umweltforschung GmbH -- UFZ](https://
For full details, see [LICENSE](LICENSE.md). For full details, see [LICENSE](LICENSE.md).
## Acknowledgements
...
## Publications ## Publications
coming soon... > Lennart Schmidt, David Schäfer, Juliane Geller, Peter Lünenschloss, Bert Palm, Karsten Rinke, Corinna Rebmann, Michael Rode, Jan Bumberger, System for automated Quality Control (SaQC) to enable traceable and reproducible data streams in environmental science, Environmental Modelling & Software, 2023, 105809, ISSN 1364-8152, https://doi.org/10.1016/j.envsoft.2023.105809. (https://www.sciencedirect.com/science/article/pii/S1364815223001950)
## How to cite SaQC ## How to cite SaQC
If SaQC is advancing your research, please cite as: If SaQC is advancing your research, please cite as:
> Schäfer, David, Palm, Bert, Lünenschloß, Peter, Schmidt, Lennart, & Bumberger, Jan. (2023). System for automated Quality Control - SaQC (2.3.0). Zenodo. https://doi.org/10.5281/zenodo.5888547 > Schäfer, David, Palm, Bert, Lünenschloß, Peter, Schmidt, Lennart, & Bumberger, Jan. (2023). System for automated Quality Control - SaQC (2.3.0). Zenodo. https://doi.org/10.5281/zenodo.5888547
or
> Lennart Schmidt, David Schäfer, Juliane Geller, Peter Lünenschloss, Bert Palm, Karsten Rinke, Corinna Rebmann, Michael Rode, Jan Bumberger, System for automated Quality Control (SaQC) to enable traceable and reproducible data streams in environmental science, Environmental Modelling & Software, 2023, 105809, ISSN 1364-8152, https://doi.org/10.1016/j.envsoft.2023.105809. (https://www.sciencedirect.com/science/article/pii/S1364815223001950)
----------------- -----------------
<a href="https://www.ufz.de/index.php?en=33573"> <a href="https://www.ufz.de/index.php?en=33573">
......
...@@ -24,11 +24,14 @@ package_path = os.path.abspath("..") ...@@ -24,11 +24,14 @@ package_path = os.path.abspath("..")
os.environ["PYTHONPATH"] = ":".join((package_path, os.environ.get("PYTHONPATH", ""))) os.environ["PYTHONPATH"] = ":".join((package_path, os.environ.get("PYTHONPATH", "")))
# ---------- Version string -------------------------------------------------- # ---------- Version string --------------------------------------------------
# read the version string without importing it # TODO: what do we need the `version` and `release` variables for?
vdict = {}
with open("../saqc/version.py") as f: # import saqc for versioning, but prevent plots to pop up
exec(f.read(), vdict) # by setting mpl backend to non-interactive
version = vdict["__version__"] import saqc.funcs
version = saqc.__version__
saqc.funcs.tools._MPL_DEFAULT_BACKEND = "Agg"
# -- Customize logging ------------------------------------------------------- # -- Customize logging -------------------------------------------------------
......
...@@ -100,18 +100,22 @@ Example Data Import ...@@ -100,18 +100,22 @@ Example Data Import
import pandas as pd import pandas as pd
data = pd.read_csv('../resources/data/tempSensorGroup.csv', index_col=0) data = pd.read_csv('../resources/data/tempSensorGroup.csv', index_col=0)
data.index = pd.DatetimeIndex(data.index) data.index = pd.DatetimeIndex(data.index)
variables = ['temp1 [degC]', 'temp2 [degC]', 'temp3 [degC]', 'temp4 [degC]', 'temp5 [degC]']
qc = saqc.SaQC(data) qc = saqc.SaQC(data)
We load the example `data set <https://git.ufz.de/rdm-software/saqc/-/blob/develop/docs/resources/data/tempsenorGroup.csv>`_ We load the example `data set <https://git.ufz.de/rdm-software/saqc/-/blob/develop/docs/resources/data/tempsenorGroup.csv>`_
from the *saqc* repository using the `pandas <https://pandas.pydata.org/>`_ csv from the *saqc* repository using the `pandas <https://pandas.pydata.org/>`_ csv
file reader. Subsequently, we cast the index of the imported data to `DatetimeIndex` file reader. Subsequently, we cast the index of the imported data to `DatetimeIndex`
and use the dataframe's `plot` method, to inspect the imported data: instantiate a saqc object and plot the data:
.. doctest:: flagDriftFromNorm .. doctest:: flagDriftFromNorm
>>> import saqc
>>> data = pd.read_csv('./resources/data/tempSensorGroup.csv', index_col=0) >>> data = pd.read_csv('./resources/data/tempSensorGroup.csv', index_col=0)
>>> data.index = pd.DatetimeIndex(data.index) >>> data.index = pd.DatetimeIndex(data.index)
>>> data.plot() # doctest: +SKIP >>> variables = ['temp1 [degC]', 'temp2 [degC]', 'temp3 [degC]', 'temp4 [degC]', 'temp5 [degC]']
>>> qc = saqc.SaQC(data)
>>> qc.plot(variables) # doctest: +SKIP
.. plot:: .. plot::
...@@ -119,22 +123,13 @@ and use the dataframe's `plot` method, to inspect the imported data: ...@@ -119,22 +123,13 @@ and use the dataframe's `plot` method, to inspect the imported data:
:include-source: False :include-source: False
:class: center :class: center
data.plot() qc.plot(variables)
Example Algorithm Application Example Algorithm Application
----------------------------- -----------------------------
Looking at our example data set more closely, we see that 2 of the 5 variables start to drift away. Looking at the example data set more closely, we see that 2 of the 5 variables start to drift away.
.. plot::
:context: close-figs
:include-source: False
:class: center
:caption: 2 variables start departing the majority group of variables (the group containing more than ``frac`` variables) around july.
data['2017-05':'2017-11'].plot()
.. plot:: .. plot::
:context: close-figs :context: close-figs
...@@ -142,17 +137,9 @@ Looking at our example data set more closely, we see that 2 of the 5 variables s ...@@ -142,17 +137,9 @@ Looking at our example data set more closely, we see that 2 of the 5 variables s
:class: center :class: center
:caption: 2 variables are departed from the majority group of variables (the group containing more than ``frac`` variables) by the end of the year. :caption: 2 variables are departed from the majority group of variables (the group containing more than ``frac`` variables) by the end of the year.
data['2017-09':'2018-01'].plot() qc.plot(variables, xscope=slice('2017-05', '2017-11'))
Lets try to detect those drifts via saqc. There for we import the *saqc* package and instantiate a :py:class:`saqc.SaQC`
object with the data:
.. doctest:: flagDriftFromNorm
>>> import saqc
>>> qc = saqc.SaQC(data)
The changes we observe in the data seem to develop significantly only in temporal spans over a month, Let's try to detect those drifts via saqc. The changes we observe in the data seem to develop significantly only in temporal spans over a month,
so we go for ``"1M"`` as value for the so we go for ``"1M"`` as value for the
``window`` parameter. We identified the majority group as a group containing three variables, whereby two variables ``window`` parameter. We identified the majority group as a group containing three variables, whereby two variables
seem to be scattered away, so that we can leave the ``frac`` value at its default ``.5`` level. seem to be scattered away, so that we can leave the ``frac`` value at its default ``.5`` level.
...@@ -179,55 +166,12 @@ Lets check the results: ...@@ -179,55 +166,12 @@ Lets check the results:
.. doctest:: flagDriftFromNorm .. doctest:: flagDriftFromNorm
>>> qc.plot('temp1 [degC]') # doctest: +SKIP >>> qc.plot(variables, marker_kwargs={'alpha':.3, 's': 1, 'color': 'red', 'edgecolor': 'face'}) # doctest: +SKIP
.. plot:: .. plot::
:context: close-figs :context: close-figs
:include-source: False :include-source: False
:class: center :class: center
qc.plot('temp1 [degC]') qc.plot(variables, marker_kwargs={'alpha':.3, 's': 1, 'color': 'red', 'edgecolor': 'face'})
.. doctest:: flagDriftFromNorm
>>> qc.plot('temp2 [degC]') # doctest: +SKIP
.. plot::
:context: close-figs
:include-source: False
:class: center
qc.plot('temp2 [degC]')
.. doctest:: flagDriftFromNorm
>>> qc.plot('temp3 [degC]') # doctest: +SKIP
.. plot::
:context: close-figs
:include-source: False
:class: center
qc.plot('temp3 [degC]')
.. doctest:: flagDriftFromNorm
>>> qc.plot('temp4 [degC]') # doctest: +SKIP
.. plot::
:context: close-figs
:include-source: False
:class: center
qc.plot('temp4 [degC]')
.. doctest:: flagDriftFromNorm
>>> qc.plot('temp5 [degC]') # doctest: +SKIP
.. plot::
:context: close-figs
:include-source: False
:class: center
qc.plot('temp5 [degC]')
\ No newline at end of file
...@@ -246,17 +246,14 @@ Check out the results for the year *2016* ...@@ -246,17 +246,14 @@ Check out the results for the year *2016*
.. doctest:: exampleMV .. doctest:: exampleMV
>>> plt.plot(qc.data['sac254_raw']['2016'], alpha=.5, color='black', label='original') # doctest:+SKIP >>> qc.plot(['sac254_raw','sac254_corrected'], xscope='2016', plot_kwargs={'color':['black', 'black'], 'alpha':[.5, 1], 'label':['original', 'corrrected']}) # doctest:+SKIP
>>> plt.plot(qc.data['sac254_corrected']['2016'], color='black', label='corrected') # doctest:+SKIP
.. plot:: .. plot::
:context: :context:
:include-source: False :include-source: False
plt.figure(figsize=(16,9)) >>> qc.plot(['sac254_raw','sac254_corrected'], xscope='2016', plot_kwargs={'color':['black', 'black'], 'alpha':[.5, 1], 'label':['original', 'corrrected']})
plt.plot(qc.data['sac254_raw']['2016'], alpha=.5, color='black', label='original')
plt.plot(qc.data['sac254_corrected']['2016'], color='black', label='corrected')
plt.legend()
Multivariate Flagging Procedure Multivariate Flagging Procedure
------------------------------- -------------------------------
...@@ -345,7 +342,7 @@ correlated with relatively high *kNNscores*, we could try to calculate a thresho ...@@ -345,7 +342,7 @@ correlated with relatively high *kNNscores*, we could try to calculate a thresho
`STRAY <https://arxiv.org/pdf/1908.04000.pdf>`_ algorithm, which is available as the method: `STRAY <https://arxiv.org/pdf/1908.04000.pdf>`_ algorithm, which is available as the method:
:py:meth:`~saqc.SaQC.flagByStray`. This method will mark some samples of the `kNNscore` variable as anomaly. :py:meth:`~saqc.SaQC.flagByStray`. This method will mark some samples of the `kNNscore` variable as anomaly.
Subsequently we project this marks (or *flags*) on to the *sac* variable with a call to Subsequently we project this marks (or *flags*) on to the *sac* variable with a call to
:py:meth:`~saqc.SaQC.concatFlags`. For the sake of demonstration, we also project the flags :py:meth:`~saqc.SaQC.transferFlags`. For the sake of demonstration, we also project the flags
on the normalized *sac* and plot the flagged values in the *sac254_norm* - *level_norm* feature space. on the normalized *sac* and plot the flagged values in the *sac254_norm* - *level_norm* feature space.
...@@ -353,8 +350,8 @@ on the normalized *sac* and plot the flagged values in the *sac254_norm* - *leve ...@@ -353,8 +350,8 @@ on the normalized *sac* and plot the flagged values in the *sac254_norm* - *leve
.. doctest:: exampleMV .. doctest:: exampleMV
>>> qc = qc.flagByStray(field='kNNscores', freq='30D', alpha=.3) >>> qc = qc.flagByStray(field='kNNscores', freq='30D', alpha=.3)
>>> qc = qc.concatFlags(field='kNNscores', target='sac254_corrected', label='STRAY') >>> qc = qc.transferFlags(field='kNNscores', target='sac254_corrected', label='STRAY')
>>> qc = qc.concatFlags(field='kNNscores', target='sac254_norm', label='STRAY') >>> qc = qc.transferFlags(field='kNNscores', target='sac254_norm', label='STRAY')
>>> qc.plot('sac254_corrected', xscope='2016-11') # doctest:+SKIP >>> qc.plot('sac254_corrected', xscope='2016-11') # doctest:+SKIP
>>> qc.plot('sac254_norm', phaseplot='level_norm', xscope='2016-11') # doctest:+SKIP >>> qc.plot('sac254_norm', phaseplot='level_norm', xscope='2016-11') # doctest:+SKIP
...@@ -363,8 +360,8 @@ on the normalized *sac* and plot the flagged values in the *sac254_norm* - *leve ...@@ -363,8 +360,8 @@ on the normalized *sac* and plot the flagged values in the *sac254_norm* - *leve
:include-source: False :include-source: False
qc = qc.flagByStray(field='kNNscores', freq='30D', alpha=.3) qc = qc.flagByStray(field='kNNscores', freq='30D', alpha=.3)
qc = qc.concatFlags(field='kNNscores', target='sac254_corrected', label='STRAY') qc = qc.transferFlags(field='kNNscores', target='sac254_corrected', label='STRAY')
qc = qc.concatFlags(field='kNNscores', target='sac254_norm', label='STRAY') qc = qc.transferFlags(field='kNNscores', target='sac254_norm', label='STRAY')
.. plot:: .. plot::
:context: close-figs :context: close-figs
...@@ -393,4 +390,4 @@ Config ...@@ -393,4 +390,4 @@ Config
To configure `saqc` to execute the above data processing and flagging steps, the config file would have to look To configure `saqc` to execute the above data processing and flagging steps, the config file would have to look
as follows: as follows:
.. literalinclude:: ../resources/data/hydro_config.csv .. literalinclude:: ../resources/data/hydro_config.csv
\ No newline at end of file
...@@ -255,25 +255,11 @@ This function object, we can pass on to the :py:meth:`~saqc.SaQC.processGeneric` ...@@ -255,25 +255,11 @@ This function object, we can pass on to the :py:meth:`~saqc.SaQC.processGeneric`
Visualisation Visualisation
------------- -------------
We can obtain those updated informations by generating a `pandas dataframe <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_ To see all the results obtained so far, plotted in one figure window, we make use of the :py:meth:`~saqc.SaQC.plot` method.
representation of it, with the :py:attr:`data <saqc.core.core.SaQC.data>` method:
.. doctest:: exampleOD .. doctest:: exampleOD
>>> data = qc.data >>> qc.plot(".", regex=True) # doctest: +SKIP
.. plot::
:context:
:include-source: False
data = qc.data
To see all the results obtained so far, plotted in one figure window, we make use of the dataframes `plot <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html>`_ method.
.. doctest:: exampleOD
>>> data.to_pandas().plot()
<Axes...>
.. plot:: .. plot::
:context: :context:
...@@ -281,7 +267,7 @@ To see all the results obtained so far, plotted in one figure window, we make us ...@@ -281,7 +267,7 @@ To see all the results obtained so far, plotted in one figure window, we make us
:width: 80 % :width: 80 %
:class: center :class: center
data.to_pandas().plot() qc.plot(".", regex=True)
Residuals and Scores Residuals and Scores
......
...@@ -51,8 +51,7 @@ dummy dataset, to lead us through the following code snippets: ...@@ -51,8 +51,7 @@ dummy dataset, to lead us through the following code snippets:
.. testsetup:: python .. testsetup:: python
from saqc import fromConfig from saqc.parsing.reader import _ConfigReader as ConfigReader
from tests.common import writeIO
.. testcode:: python .. testcode:: python
...@@ -116,16 +115,13 @@ Simple constraints ...@@ -116,16 +115,13 @@ Simple constraints
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;------------------------ ... #-------;------------------------
... x ; flagGeneric(func=x < 30) ... x ; flagGeneric(func=x < 30)
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.flags == qc1.flags #doctest:+NORMALIZE_WHITESPACE >>> tmp.flags == qc1.flags #doctest:+NORMALIZE_WHITESPACE
True True
...@@ -177,16 +173,13 @@ Cross variable constraints ...@@ -177,16 +173,13 @@ Cross variable constraints
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;------------------------------------ ... #-------;------------------------------------
... x ; flagGeneric(field="y", func=y > 30) ... x ; flagGeneric(field="y", func=y > 30)
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.flags == qc2.flags #doctest:+NORMALIZE_WHITESPACE >>> tmp.flags == qc2.flags #doctest:+NORMALIZE_WHITESPACE
True True
...@@ -241,16 +234,13 @@ need to be put in parentheses. ...@@ -241,16 +234,13 @@ need to be put in parentheses.
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;-------------------------------------------------------- ... #-------;--------------------------------------------------------
... x ; flagGeneric(field=["y", "z"], func=(y > 30) & (z < 50)) ... x ; flagGeneric(field=["y", "z"], func=(y > 30) & (z < 50))
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.flags == qc3.flags #doctest:+NORMALIZE_WHITESPACE >>> tmp.flags == qc3.flags #doctest:+NORMALIZE_WHITESPACE
True True
...@@ -293,16 +283,13 @@ Arithmetics ...@@ -293,16 +283,13 @@ Arithmetics
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;------------------------------------------------------- ... #-------;-------------------------------------------------------
... x ; flagGeneric(field=["x", "y", "z"], func=x > (y + z)/2) ... x ; flagGeneric(field=["x", "y", "z"], func=x > (y + z)/2)
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.flags == qc4.flags #doctest:+NORMALIZE_WHITESPACE >>> tmp.flags == qc4.flags #doctest:+NORMALIZE_WHITESPACE
True True
...@@ -351,16 +338,13 @@ Special functions ...@@ -351,16 +338,13 @@ Special functions
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;--------------------------------------------------- ... #-------;---------------------------------------------------
... x ; flagGeneric(field=["x", "z"], func=x > std(z) * 2) ... x ; flagGeneric(field=["x", "z"], func=x > std(z) * 2)
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.flags == qc5.flags #doctest:+NORMALIZE_WHITESPACE >>> tmp.flags == qc5.flags #doctest:+NORMALIZE_WHITESPACE
True True
...@@ -402,17 +386,14 @@ Special functions ...@@ -402,17 +386,14 @@ Special functions
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;------------------------------------------ ... #-------;------------------------------------------
... y ; flagRange(min=10, max=60) ... y ; flagRange(min=10, max=60)
... x ; flagGeneric(field="y", func=isflagged(y)) ... x ; flagGeneric(field="y", func=isflagged(y))
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.flags == qc6.flags #doctest:+NORMALIZE_WHITESPACE >>> tmp.flags == qc6.flags #doctest:+NORMALIZE_WHITESPACE
True True
...@@ -481,16 +462,13 @@ Let's consider the following dataset: ...@@ -481,16 +462,13 @@ Let's consider the following dataset:
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;--------------------------------------------------------------- ... #-------;---------------------------------------------------------------
... meas ; flagGeneric(field=["fan", "volt"], func=(x == 0) | (y < 12.0)) ... meas ; flagGeneric(field=["fan", "volt"], func=(x == 0) | (y < 12.0))
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.flags == qc7.flags #doctest:+NORMALIZE_WHITESPACE >>> tmp.flags == qc7.flags #doctest:+NORMALIZE_WHITESPACE
True True
...@@ -533,8 +511,7 @@ But we could also quality check our independent variables first and than leverag ...@@ -533,8 +511,7 @@ But we could also quality check our independent variables first and than leverag
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;-------------------------------------------------------------------------- ... #-------;--------------------------------------------------------------------------
...@@ -543,9 +520,7 @@ But we could also quality check our independent variables first and than leverag ...@@ -543,9 +520,7 @@ But we could also quality check our independent variables first and than leverag
... volt ; flagGeneric(func=volt < 12.0) ... volt ; flagGeneric(func=volt < 12.0)
... meas ; flagGeneric(field=["fan", "volt"], func=isflagged(fan) | isflagged(volt)) ... meas ; flagGeneric(field=["fan", "volt"], func=isflagged(fan) | isflagged(volt))
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.flags == qc8.flags #doctest:+NORMALIZE_WHITESPACE >>> tmp.flags == qc8.flags #doctest:+NORMALIZE_WHITESPACE
True True
...@@ -634,16 +609,13 @@ variables in a given dataset. We start with dummy data again: ...@@ -634,16 +609,13 @@ variables in a given dataset. We start with dummy data again:
.. doctest:: python .. doctest:: python
:hide: :hide:
>>> tmp = fromConfig( >>> tmp = ConfigReader(data).readString(
... writeIO(
... """ ... """
... varname ; test ... varname ; test
... #-------;------------------------------------------------------ ... #-------;------------------------------------------------------
... mean ; processGeneric(field=["x", "y", "z"], func=(x+y+z)/2) ... mean ; processGeneric(field=["x", "y", "z"], func=(x+y+z)/2)
... """ ... """
... ), ... ).run()
... data
... )
>>> tmp.data == qc1.data #doctest:+NORMALIZE_WHITESPACE >>> tmp.data == qc1.data #doctest:+NORMALIZE_WHITESPACE
True True
......
...@@ -37,7 +37,6 @@ Example Data ...@@ -37,7 +37,6 @@ Example Data
:context: close-figs :context: close-figs
:include-source: False :include-source: False
import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import saqc import saqc
......
...@@ -13,4 +13,4 @@ Change Points and Noise ...@@ -13,4 +13,4 @@ Change Points and Noise
~SaQC.flagChangePoints ~SaQC.flagChangePoints
~SaQC.assignChangePointCluster ~SaQC.assignChangePointCluster
~SaQC.flagByStatLowPass ~SaQC.flagByScatterLowpass
...@@ -78,7 +78,7 @@ or ...@@ -78,7 +78,7 @@ or
.. code-block:: sh .. code-block:: sh
pip install git+https://git.ufz.de/rdm-software/saqc@master pip install git+https://git.ufz.de/rdm-software/saqc@main
If you feel more adventurous, feel free to use the latest development version from our If you feel more adventurous, feel free to use the latest development version from our
......
...@@ -4,11 +4,11 @@ ...@@ -4,11 +4,11 @@
recommonmark==0.7.1 recommonmark==0.7.1
sphinx==6.2.1 sphinx==6.2.1
sphinx-automodapi==0.15.0 sphinx-automodapi==0.16.0
sphinxcontrib-fulltoc==1.2.0 sphinxcontrib-fulltoc==1.2.0
sphinx-markdown-tables==0.0.17 sphinx-markdown-tables==0.0.17
jupyter-sphinx==0.4.0 jupyter-sphinx==0.4.0
sphinx_autodoc_typehints==1.23 sphinx_autodoc_typehints==1.23
sphinx-tabs==3.4.1 sphinx-tabs==3.4.1
sphinx-design==0.4.1 sphinx-design==0.5.0
pydata-sphinx-theme==0.13.3 pydata-sphinx-theme==0.13.3
...@@ -3,4 +3,4 @@ varname ; test ...@@ -3,4 +3,4 @@ varname ; test
SM2 ; align(freq="15Min", method="nshift") SM2 ; align(freq="15Min", method="nshift")
SM2 ; flagMissing() SM2 ; flagMissing()
'SM(1|2)+' ; flagRange(min=10, max=60) 'SM(1|2)+' ; flagRange(min=10, max=60)
SM2 ; flagMAD(window="30d", z=3.5) SM2 ; flagZScore(window="30d", thresh=3.5, method='modified', center=False)
...@@ -3,5 +3,5 @@ SM2;align(freq="15Min", method="nshift");False ...@@ -3,5 +3,5 @@ SM2;align(freq="15Min", method="nshift");False
'.*';flagRange(min=10, max=60);False '.*';flagRange(min=10, max=60);False
SM2;flagMissing();False SM2;flagMissing();False
SM2;flagRange(min=10, max=60);False SM2;flagRange(min=10, max=60);False
SM2;flagMAD(window="30d", z=3.5);False SM2;flagZScore(window="30d", thresh=3.5, method='modified', center=False);False
Dummy;flag(func=(isflagged(SM1) | isflagged(SM2))) Dummy;flag(func=(isflagged(SM1) | isflagged(SM2)))
...@@ -16,6 +16,6 @@ water_z ; transform(field=['water_temp_raw'], func=zScore(x), fr ...@@ -16,6 +16,6 @@ water_z ; transform(field=['water_temp_raw'], func=zScore(x), fr
sac_z ; transform(field=['sac254_raw'], func=zScore(x), freq='20D') sac_z ; transform(field=['sac254_raw'], func=zScore(x), freq='20D')
kNN_scores ; assignKNNScore(field=['level_z', 'water_z', 'sac_z'], freq='20D') kNN_scores ; assignKNNScore(field=['level_z', 'water_z', 'sac_z'], freq='20D')
kNN_scores ; flagByStray(freq='20D') kNN_scores ; flagByStray(freq='20D')
level_raw ; concatFlags(field=['kNN_scores'], label='STRAY') level_raw ; transferFlags(field=['kNN_scores'], label='STRAY')
sac254_corr ; concatFlags(field=['kNN_scores'], label='STRAY') sac254_corr ; transferFlags(field=['kNN_scores'], label='STRAY')
water_temp_raw ; concatFlags(field=['kNN_scores'], label='STRAY') water_temp_raw ; transferFlags(field=['kNN_scores'], label='STRAY')
varname;test varname;test
#------;-------------------------- #------;--------------------------
SM2 ;flagRange(min=10, max=60) SM2 ;flagRange(min=10, max=60)
SM2 ;flagMAD(window="30d", z=3.5) SM2 ;flagZScore(window="30d", thresh=3.5, method="modified", center=False)
SM2 ;plot() SM2 ;plot()
\ No newline at end of file