Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (283)
Showing with 636 additions and 222 deletions
@@ -5,3 +5,4 @@
*.feather filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
resources/machine_learning/data/soil_moisture_mwe.feather filter=lfs diff=lfs merge=lfs -text
saqc/_version.py export-subst
@@ -29,7 +29,7 @@ jobs:
fail-fast: false
matrix:
os: ["windows-latest", "ubuntu-latest", "macos-latest"]
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
defaults:
run:
# somehow this also works for windows O.o ??
@@ -58,7 +58,6 @@ jobs:
- name: run SaQC test suite
run: |
pytest tests dios/test -Werror
python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
# - name: run doc tests
@@ -2,6 +2,20 @@
#
# SPDX-License-Identifier: GPL-3.0-or-later
# ===========================================================
# Hints
# ===========================================================
# $PYPI_PKG_NAME
# The variable PYPI_PKG_NAME is used in setup.py to determine
# how to name the tarball package. If not set the package is
# named 'saqc'.
# $TESTPYPI_TOKEN
# The upload token used for testpypi. Set it on the GitLab
# page and enable masking to prevent it from being revealed.
# ===========================================================
# preparation
# ===========================================================
@@ -16,11 +30,13 @@ stages:
- deploy
default:
image: python:3.10
image: python:3.11
before_script:
- pip install --upgrade pip
- pip install -r requirements.txt
- pip install -r tests/requirements.txt
- apt update
- apt install -y xvfb
# ===========================================================
# Compliance stage
@@ -61,8 +77,10 @@ coverage:
stage: test
allow_failure: true
script:
- export DISPLAY=:99
- Xvfb :99 &
- pip install pytest-cov coverage
- pytest --cov=saqc tests --ignore=tests/fuzzy -Werror
- pytest --cov=saqc tests --ignore=tests/fuzzy tests/extras -Werror
after_script:
- coverage xml
# regex to find the coverage percentage in the job output
@@ -75,11 +93,13 @@ coverage:
path: coverage.xml
python38:
python39:
stage: test
image: python:3.8
image: python:3.9
script:
- pytest tests -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
@@ -87,42 +107,49 @@ python38:
junit: report.xml
python39:
python310:
stage: test
image: python:3.9
image: python:3.10
script:
- pytest tests -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
reports:
junit: report.xml
python310:
python311:
stage: test
image: python:3.10
image: python:3.11
script:
- pytest tests -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
reports:
junit: report.xml
# python311:
# stage: test
# image: python:3.11
# script:
# - pytest tests -Werror --junitxml=report.xml
# - python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
# artifacts:
# when: always
# reports:
# junit: report.xml
python312:
stage: test
image: python:3.12
script:
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
reports:
junit: report.xml
doctest:
stage: test
variables:
COLUMNS: 200
script:
- cd docs
- pip install -r requirements.txt
@@ -134,24 +161,72 @@ doctest:
# Building stage
# ===========================================================
# check if we are able to build a wheel
wheel:
# and if the import works
wheel39:
stage: build
image: python:3.9
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
docs:
wheel310:
stage: build
image: python:3.10
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- cd docs
- pip install -r requirements.txt
- make doc
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
wheel311:
stage: build
image: python:3.11
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
wheel312:
stage: build
image: python:3.12
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
# ===========================================================
# Extra Pipeline (run with a successful run of all other jobs on develop)
# ===========================================================
upload_testpypi:
stage: deploy
only:
- develop
except:
- schedules
variables:
PYPI_PKG_NAME: "saqc-dev"
TWINE_USERNAME: __token__
TWINE_PASSWORD: $TESTPYPI_TOKEN
script:
- pip install build twine
- python -m build
- twine check --strict dist/*
- twine upload -r testpypi dist/*
# make html docu with sphinx
pages:
stage: deploy
@@ -5,13 +5,101 @@ SPDX-License-Identifier: GPL-3.0-or-later
-->
# Changelog
## Unreleased
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.0...develop)
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.6.0...develop)
### Added
- `flagPlateaus`: added function to search and flag outlierish value plateaus of certain temporal extension
- `flagUniLOF`: added dispatch to Local Outlier Probability (*LoOP*) variant
- `flagUniLOF`: made `thresh` optional
- `flagPlateaus`: added function to search and flag anomalous value plateaus of certain temporal extension
### Changed
### Removed
### Fixed
- `flagConstants`: fixed bug where the last `min_periods` values would never get flagged
### Deprecated
## [2.6.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.6.0) - 2024-04-15
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.5.0...v2.6.0)
### Added
- `reindex`: base reindexer function
- `flagGeneric`, `processGeneric`: target broadcasting and numpy array support
- `SaQC`: automatic translation of incoming flags
- Option to change the flagging scheme after initialization
- `flagByClick`: manually assign flags using a graphical user interface
- `SaQC`: support for selection, slicing and setting of items by subscription on `SaQC` objects
- `transferFlags` is a multivariate function
- `plot`: added `yscope` keyword
- `setFlags`: function to replace `flagManual`
- `flagUniLOF`: added parameter `slope_correct` to correct for overflagging at relatively steep data value slopes
- `History`: added option to change aggregation behavior
- "horizontal" axis / multivariate mode for `rolling`
- Translation scheme `AnnotatedFloatScheme`
### Changed
- `SaQC.flags` always returns a `DictOfSeries`
### Removed
- `SaQC` methods deprecated in version 2.4: `interpolate`, `interpolateIndex`, `interpolateInvalid`, `roll`, `linear`, `shift`, `flagCrossStatistics`
- Method `Flags.toDios` deprecated in version 2.4
- Method `DictOfSeries.index_of` deprecated in version 2.4
- Option `"complete"` for parameter `history` of method `plot`
- Option `"cycleskip"` for parameter `ax_kwargs` of method `plot`
- Parameter `phaseplot` from method `plot`
### Fixed
- `flagConstants`: fixed flagging of rolling ramps
- `Flags`: add meta entry to imported flags
- group operations were overwriting existing flags
- `SaQC._construct`: was not working for inherited classes
- `processGeneric`: improved numpy function compatibility
### Deprecated
- `flagManual` in favor of `setFlags`
- `inverse_**` options for `concatFlags` parameter `method` in favor of `invert=True`
- `flagRaise` with delegation to better replacements `flagZScore`, `flagUniLOF`, `flagJumps` or `flagOffset`
- `flagByGrubbs` with delegation to better replacements `flagZScore`, `flagUniLOF`
- `flagMVScore` with delegation to manual application of the steps
## [2.5.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.5.0) - 2023-09-05
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.1...v2.5.0)
### Added
- WMO standard mean aggregations
- Function selection via strings for most function-expecting parameters
- `SaQC.plot`:
- enable multivariate plots
- keyword `plot_kwargs` to pass matplotlib related arguments
- CLI:
- `--version` to print the SaQC version
- `-ll` as a shorthand for `--log-level`
- `--json-field` to use a non-root element of a json file.
- basic json support for CLI config files, which are detected by their `.json` extension.
- `SaQC.flagScatterLowpass`: option to select function based on string names.
- Checks and unified error message for common function inputs.
### Changed
- Require pandas >= 2.0
- `SaQC.flagUniLOF` and `SaQC.assignUniLOF`: changed parameter `fill_na` to type `bool`.
- `SaQC.plot`:
- changed default color for single variables to `black` with `80% transparency`
- added separate legend for flags
### Removed
- `SaQC.plot`: option to plot with complete history (`history="complete"`)
- Support for Python 3.8
### Fixed
- `SaQC.assignChangePointCluster` and `SaQC.flagChangePoints`: A tuple passed to `min_period`
  was only recognised if `window` was also a tuple.
- `SaQC.propagateFlags` was overwriting existing flags
### Deprecated
- `SaQC.andGroup` and `SaQC.orGroup`: option to pass dictionaries to `group`.
- `SaQC.plot`:
- `phaseplot` in favor of usage with `mode="biplot"`
- `cyclestart` in favor of usage with `marker_kwargs`
- `SaQC.flagStatLowPass` in favor of `SaQC.flagScatterLowpass`
## [2.4.1](https://git.ufz.de/rdm-software/saqc/-/tags/v2.4.1) - 2023-06-22
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.0...v.2.4.1)
### Added
### Changed
- pin pandas to versions >= 2.0
### Removed
- removed deprecated `DictOfSeries.to_df`
### Fixed
### Deprecated
## [2.4.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.4.0) - 2023-04-25
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.3.0...v2.4.0)
@@ -21,11 +109,9 @@ SPDX-License-Identifier: GPL-3.0-or-later
- Expose the `History` via `SaQC._history`
- Config function `cv` (coefficient of variation)
### Changed
- Deprecate `interpolate`, `linear` and `shift` in favor of `align`
- Deprecate `roll` in favor of `rolling`
- Rename `interpolateInvalid` to `interpolate`
- Rename `interpolateIndex` to `align`
- Deprecate `flagMVScore` parameters: `partition_min` in favor of `window`, `partition_min` in favor of `min_periods`, `min_periods` in favor of `min_periods_r`
- Rewrite of `dios.DictOfSeries`
### Removed
- Parameter `limit` from `align`
- Parameter `max_na_group_flags`, `max_na_flags`, `flag_func`, `freq_check` from `resample`
@@ -33,9 +119,13 @@ SPDX-License-Identifier: GPL-3.0-or-later
- `func` arguments in text configurations were not parsed correctly
- fail on duplicated arguments to test methods
- `resample` was not writing meta entries
- `flagByStatLowPass` was overwriting existing flags
- `flagByScatterLowpass` was overwriting existing flags
- `flagUniLOF` and `flagLOF` were overwriting existing flags
### Deprecated
- Deprecate `flagMVScore` parameters: `partition` in favor of `window`, `partition_min` in favor of `min_periods`, `min_periods` in favor of `min_periods_r`
- Deprecate `interpolate`, `linear` and `shift` in favor of `align`
- Deprecate `roll` in favor of `rolling`
- Deprecate `DictOfSeries.to_df` in favor of `DictOfSeries.to_pandas`
## [2.3.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.3.0) - 2023-01-17
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.2.1...v2.3.0)
### Added
......
@@ -3,7 +3,7 @@ title: SaQC - System for automated Quality Control
message: "Please cite this software using these metadata."
type: software
version: 2.0.0
doi: https://doi.org/10.5281/zenodo.5888547
doi: 10.5281/zenodo.5888547
date-released: "2021-11-25"
license: "GPL-3.0"
repository-code: "https://git.ufz.de/rdm-software/saqc"
@@ -59,7 +59,7 @@ It is not a shame to name a parameter just `n` or `alpha` etc., if, for example,
### Test Functions
- testnames: [testmodule_]flagTestName
- testnames: flagTestName
## Formatting
We use [black](https://black.readthedocs.io/en/stable/) in its default settings.
@@ -70,13 +70,17 @@ Only absolute imports are accepted.
# Development Workflow
## Releases
Every release is planned through an associated milestone. This milestone should have an end date, usually the first of the month the next release is planned for, and should contain all issues/merge requests to include.
## Repository Structure
- `master` - branch:
- `main` - branch:
+ Stable and usually protected.
+ Regular merges from `develop`; these merges are tagged and increase at least the minor version.
+ Irregular merges from `develop` in case of critical bugs. Such merges increase at least the patch level.
+ Merges into `master` usually lead to a PyPI release.
+ Merges into `main` usually lead to a PyPI release.
- `develop` - branch:
+ The main development branch, no hard stability requirements/guarantees.
+ Merges into `develop` should mostly follow a [Merge Request Workflow](#merge-request-workflow); minor changes can, however, be committed directly. Such minor changes include:
@@ -105,6 +109,6 @@ Only absolute imports are accepted.
release date. Commits to `develop` after the merge window of a release closes need to be integrated during the subsequent release
cycle.
- The release cycle is organized by Gitlab Milestones, the expiration date of a certain milestone indicates the end of the
related merge window, the actual merge into `master` and the accompanying release is scheduled for the week after the
related merge window, the actual merge into `main` and the accompanying release is scheduled for the week after the
milestone's expiration date.
- Issues and Merge Requests can and should be associated with these milestones, as this helps in the organization of review activities.
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means.
In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>
@@ -14,30 +14,39 @@ SPDX-License-Identifier: GPL-3.0-or-later
# SaQC: System for automated Quality Control
Anomalies and errors are the rule, not the exception, when working with
time series data. This is especially true if such data originates
from in-situ measurements of environmental properties.
Almost all applications, however, implicitly rely on data that complies
with some definition of 'correct'.
In order to infer reliable data products and tools, there is no alternative
to quality control. SaQC provides all the building blocks to comfortably
bridge the gap between 'usually faulty' and 'expected to be corrected' in
an accessible, consistent, objective and reproducible way.
`SaQC` is a tool/framework/application to quality control time series data.
It provides a growing collection of algorithms and methods to analyze, annotate and
process timeseries data. It supports the end-to-end enrichment of metadata
and provides various user interfaces: 1) a Python API, 2) a command line interface
with a text-based configuration system, and 3) a
[web-based user interface](https://webapp.ufz.de/saqc-config-app/).
`SaQC` is designed with a particular focus on the needs of active data professionals,
including sensor hardware-oriented engineers, domain experts, and data scientists,
all of whom can benefit from its capabilities to improve the quality standards of given data products.
For a (continuously improving) overview of features, typical usage patterns,
the specific system components and how to customize `SaQC` to your specific
the specific system components and how to customize `SaQC` to your own
needs, please refer to our
[online documentation](https://rdm-software.pages.ufz.de/saqc/index.html).
## Installation
SaQC is available on the Python Package Index ([PyPI](https://pypi.org/)) and
`SaQC` is available on the Python Package Index ([PyPI](https://pypi.org/)) and
can be installed using [pip](https://pip.pypa.io/en/stable/):
```sh
python -m pip install saqc
```
For a more detailed installation guide, see the [installation guide](https://rdm-software.pages.ufz.de/saqc/gettingstarted/InstallationGuide.html).
Additionally, `SaQC` is available via conda and can be installed with:
```sh
conda create -c conda-forge -n saqc saqc
```
For more details, see the [installation guide](https://rdm-software.pages.ufz.de/saqc/gettingstarted/InstallationGuide.html).
## Usage
@@ -53,11 +62,11 @@ could look like [this](https://git.ufz.de/rdm-software/saqc/raw/develop/docs/res
```
varname ; test
#----------; ---------------------------------------------------------------------
SM2 ; shift(freq="15Min")
SM2 ; align(freq="15Min")
'SM(1|2)+' ; flagMissing()
SM1 ; flagRange(min=10, max=60)
SM2 ; flagRange(min=10, max=40)
SM2 ; flagMAD(window="30d", z=3.5)
SM2 ; flagZScore(window="30d", thresh=3.5, method='modified', center=False)
Dummy ; flagGeneric(field=["SM1", "SM2"], func=(isflagged(x) | isflagged(y)))
```
@@ -92,30 +101,27 @@ data = pd.read_csv(
index_col=0, parse_dates=True,
)
saqc = SaQC(data=data)
saqc = (saqc
.shift("SM2", freq="15Min")
.flagMissing("SM(1|2)+", regex=True)
.flagRange("SM1", min=10, max=60)
.flagRange("SM2", min=10, max=40)
.flagMAD("SM2", window="30d", z=3.5)
.flagGeneric(field=["SM1", "SM2"], target="Dummy", func=lambda x, y: (isflagged(x) | isflagged(y))))
qc = SaQC(data=data)
qc = (qc
.align("SM2", freq="15Min")
.flagMissing("SM(1|2)+", regex=True)
.flagRange("SM1", min=10, max=60)
.flagRange("SM2", min=10, max=40)
.flagZScore("SM2", window="30d", thresh=3.5, method='modified', center=False)
.flagGeneric(field=["SM1", "SM2"], target="Dummy", func=lambda x, y: (isflagged(x) | isflagged(y))))
```
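After processing, both the data and the flags can be retrieved from the `qc` object. A quick sketch (continuing the example above; both accessors return `DictOfSeries` containers):

```python
# Retrieve processed data and final flags from the pipeline above.
# Both are DictOfSeries containers, convertible to plain pandas objects.
processed = qc.data
flags = qc.flags
```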
A more detailed description of the Python API is available in the
A more detailed description of the Python API is available in the
[respective section](https://rdm-software.pages.ufz.de/saqc/gettingstarted/TutorialAPI.html)
of the documentation.
## Changelog
All notable changes to this project will be documented in [CHANGELOG.md](CHANGELOG.md).
## Get involved
### Contributing
You found a bug or you want to suggest some cool features? Please refer to our [contributing guidelines](CONTRIBUTING.md) to see how you can contribute to SaQC.
You found a bug or you want to suggest new features? Please refer to our [contributing guidelines](CONTRIBUTING.md) to see how you can contribute to SaQC.
### User support
If you need help or have a question, you can use the SaQC user support mailing list: [saqc-support@ufz.de](mailto:saqc-support@ufz.de)
If you need help or have questions, send us an email to [saqc-support@ufz.de](mailto:saqc-support@ufz.de)
## Copyright and License
Copyright(c) 2021, [Helmholtz-Zentrum für Umweltforschung GmbH -- UFZ](https://www.ufz.de). All rights reserved.
@@ -125,17 +131,18 @@ Copyright(c) 2021, [Helmholtz-Zentrum für Umweltforschung GmbH -- UFZ](https://
For full details, see [LICENSE](LICENSE.md).
## Acknowledgements
...
## Publications
coming soon...
> Lennart Schmidt, David Schäfer, Juliane Geller, Peter Lünenschloss, Bert Palm, Karsten Rinke, Corinna Rebmann, Michael Rode, Jan Bumberger, System for automated Quality Control (SaQC) to enable traceable and reproducible data streams in environmental science, Environmental Modelling & Software, 2023, 105809, ISSN 1364-8152, https://doi.org/10.1016/j.envsoft.2023.105809. (https://www.sciencedirect.com/science/article/pii/S1364815223001950)
## How to cite SaQC
If SaQC is advancing your research, please cite as:
> Schäfer, David, Palm, Bert, Lünenschloß, Peter, Schmidt, Lennart, & Bumberger, Jan. (2023). System for automated Quality Control - SaQC (2.3.0). Zenodo. https://doi.org/10.5281/zenodo.5888547
or
> Lennart Schmidt, David Schäfer, Juliane Geller, Peter Lünenschloss, Bert Palm, Karsten Rinke, Corinna Rebmann, Michael Rode, Jan Bumberger, System for automated Quality Control (SaQC) to enable traceable and reproducible data streams in environmental science, Environmental Modelling & Software, 2023, 105809, ISSN 1364-8152, https://doi.org/10.1016/j.envsoft.2023.105809. (https://www.sciencedirect.com/science/article/pii/S1364815223001950)
-----------------
<a href="https://www.ufz.de/index.php?en=33573">
@@ -30,7 +30,7 @@ clean:
# make documentation
doc:
# generate environment table from dictionary
@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@ $(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# run tests
test:
@@ -24,11 +24,14 @@ package_path = os.path.abspath("..")
os.environ["PYTHONPATH"] = ":".join((package_path, os.environ.get("PYTHONPATH", "")))
# ---------- Version string --------------------------------------------------
# read the version string without importing it
vdict = {}
with open("../saqc/version.py") as f:
exec(f.read(), vdict)
version = vdict["__version__"]
# TODO: why do we need both the `version` and the `release` variables?
# import saqc for versioning, but prevent plots to pop up
# by setting mpl backend to non-interactive
import saqc.funcs
version = saqc.__version__
saqc.funcs.tools._MPL_DEFAULT_BACKEND = "Agg"
# -- Customize logging -------------------------------------------------------
@@ -7,6 +7,18 @@
Cook Books
==========
.. toctree::
:caption: Cookbooks
:maxdepth: 1
:hidden:
DataRegularisation
OutlierDetection
ResidualOutlierDetection
DriftDetection
MultivariateFlagging
../documentation/GenericFunctions
.. grid:: 2
:gutter: 2
......@@ -52,6 +64,16 @@ Cook Books
+++
*Wrap your custom logical and arithmetic expressions with the generic functions*
.. grid-item-card:: Drift Detection
:link: DriftDetection
:link-type: doc
* define metrics to measure distance between data series
* automatically determine majority and anomalous data groups
+++
*Detecting data chunks drifting apart from a reference group*
.. grid-item-card:: Modelling, Residuals and Arithmetics
:link: ResidualOutlierDetection
:link-type: doc
@@ -315,10 +315,10 @@ Aggregation
If we want to comprise several values by aggregation and assign the result to the new regular timestamp, instead of
selecting a single one, we can do this, with the :py:meth:`~saqc.SaQC.resample` method.
Let's resample the *SoilMoisture* data to have a *20* minutes sample rate by aggregating every *20* minutes interval's
content with the arithmetic mean (which is provided by the ``numpy.mean`` function for example).
content with the arithmetic mean.
>>> import numpy as np
>>> qc = qc.resample('SoilMoisture', target='SoilMoisture_mean', freq='20min', method='bagg', func=np.mean)
>>> qc = qc.resample('SoilMoisture', target='SoilMoisture_mean', freq='20min', method='bagg', func="mean")
>>> qc.data # doctest: +SKIP
SoilMoisture | SoilMoisture_mean |
================================ | ===================================== |
.. SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
..
.. SPDX-License-Identifier: GPL-3.0-or-later
Drift Detection
===============
Overview
--------
The guide briefly introduces the usage of the :py:meth:`~saqc.SaQC.flagDriftFromNorm` method.
The method detects sections in timeseries that deviate from the majority in a group of variables.
* :ref:`Parameters <cookbooks/DriftDetection:Parameters>`
* :ref:`Algorithm <cookbooks/DriftDetection:Algorithm>`
* :ref:`Example Data import <cookbooks/DriftDetection:Example Data import>`
* :ref:`Example Algorithm Application <cookbooks/DriftDetection:Example Algorithm Application>`
Parameters
----------
Although there seems to be a lot of user input to parametrize, most parameters are easy to interpret and can be left
at their defaults.
window
^^^^^^
Length of the partitions the target group of data series is divided into.
For example, if ``1D`` (one day) is selected, the group to check will be divided into one-day chunks, and every chunk is checked for time series deviating from the normal group.
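For intuition, this chunking corresponds to a plain time grouping. A minimal sketch in pandas (an illustration of the windowing idea, not SaQC internals):

.. code-block:: python

   import pandas as pd

   # two dummy series sampled hourly over two days
   frame = pd.DataFrame(
       {"temp1": 0.0, "temp2": 0.0},
       index=pd.date_range("2017-01-01", periods=48, freq="h"),
   )
   # split into one-day chunks, as ``window='1D'`` would
   for _, chunk in frame.groupby(pd.Grouper(freq="1D")):
       pass  # every chunk is checked for deviating series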
frac
^^^^
The fraction of variables needed to define the "normal" group, expressed as a number in :math:`[0,1]`.
This must, of course, be above 50 percent (:math:`0.5`) and can be
selected according to the maximum number of drifting variables one expects in the data.
method
^^^^^^
The linkage method can have some impact on the clustering, but sticking to the default value ``single`` might be
sufficient for most tasks.
spread
^^^^^^
The main parameter to control the algorithm's behavior. It has to be selected carefully.
It determines the maximum spread of a normal group by limiting the costs a cluster agglomeration must not exceed in
any linkage step.
For singleton clusters, that cost equals half the distance the timeseries in the clusters have to each other. So, only timeseries with a distance of less than two times the spreading norm can be clustered.
When timeseries get clustered together, the new cluster's distance to all the other timeseries/clusters is calculated
according to the linkage method specified. By default, it is the minimum distance the members of the clusters have to
each other.
Having that in mind, it is advisable to choose as metric a distance function that can be well interpreted in the unit
dimension of the measurement and whose interpretation is invariant over the length of the timeseries.
metric
^^^^^^
The default *averaged Manhattan metric* roughly represents the averaged value distance of two timeseries (as opposed to *euclidean*, which scales non-linearly with the
compared timeseries' length). For the selection of the :py:attr:`spread` parameter the default metric is helpful, since it allows interpreting the spread in the dimension of the measurements.
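To get a feeling for sensible ``spread`` values in the unit of your measurements, the averaged Manhattan distance between two series can be computed directly. A minimal sketch in plain numpy (an illustration, not SaQC internals):

.. code-block:: python

   import numpy as np

   def averaged_manhattan(x: np.ndarray, y: np.ndarray) -> float:
       """Averaged value distance of two equally indexed series."""
       return float(np.abs(x - y).mean())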
Algorithm
---------
The aim of the algorithm is to flag sections in timeseries that significantly deviate from a normal group of timeseries running in parallel within a given section.
"Normality" is determined in terms of a maximum spreading distance that members of a normal group must not exceed.
In addition, a group is only considered "normal" if it contains more than a certain percentage of the timeseries to be clustered into "normal" and "abnormal" ones.
The steps of the algorithm are the following (a sketch of the clustering core follows below):

* Calculate the distances :math:`d(x_i,x_j)` for all timeseries :math:`x_i` that are to be clustered, with a metric specified by the user.
* Calculate a dendrogram using a hierarchical linkage algorithm specified by the user.
* Flatten the dendrogram at the level where the agglomeration costs exceed the value given by the spreading norm specified by the user.
* Check if there is a cluster containing more than a certain percentage of variables, as specified by the user.

  * If yes: flag all the variables that are not in that cluster.
  * If no: flag nothing.
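The clustering core of these steps can be sketched as follows, assuming ``scipy`` is available. This is an illustration of the idea only, not SaQC's actual implementation:

.. code-block:: python

   import numpy as np
   from scipy.cluster.hierarchy import fcluster, linkage
   from scipy.spatial.distance import pdist

   def normal_group(chunk: np.ndarray, spread: float, frac: float = 0.5):
       """Return the indices of the 'normal' series in a
       (n_series, n_samples) chunk, or None if no cluster is large enough."""
       # averaged Manhattan distances between all pairs of series
       dists = pdist(chunk, metric="cityblock") / chunk.shape[1]
       # hierarchical clustering, flattened where costs exceed `spread`
       labels = fcluster(
           linkage(dists, method="single"), t=spread, criterion="distance"
       )
       values, counts = np.unique(labels, return_counts=True)
       if counts.max() <= frac * chunk.shape[0]:
           return None  # no cluster qualifies as 'normal' -> flag nothing
       return np.flatnonzero(labels == values[counts.argmax()])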
Example Data Import
-------------------
.. plot::
:context: reset
:include-source: False
import matplotlib
import saqc
import pandas as pd
data = pd.read_csv('../resources/data/tempSensorGroup.csv', index_col=0)
data.index = pd.DatetimeIndex(data.index)
variables = ['temp1 [degC]', 'temp2 [degC]', 'temp3 [degC]', 'temp4 [degC]', 'temp5 [degC]']
qc = saqc.SaQC(data)
We load the example `data set <https://git.ufz.de/rdm-software/saqc/-/blob/develop/docs/resources/data/tempSensorGroup.csv>`_
from the *saqc* repository using the `pandas <https://pandas.pydata.org/>`_ csv
file reader. Subsequently, we cast the index of the imported data to `DatetimeIndex`,
instantiate a saqc object, and plot the data:
.. doctest:: flagDriftFromNorm
>>> import saqc
>>> data = pd.read_csv('./resources/data/tempSensorGroup.csv', index_col=0)
>>> data.index = pd.DatetimeIndex(data.index)
>>> variables = ['temp1 [degC]', 'temp2 [degC]', 'temp3 [degC]', 'temp4 [degC]', 'temp5 [degC]']
>>> qc = saqc.SaQC(data)
>>> qc.plot(variables) # doctest: +SKIP
.. plot::
:context: close-figs
:include-source: False
:class: center
qc.plot(variables)
Example Algorithm Application
-----------------------------
Looking at the example data set more closely, we see that 2 of the 5 variables start to drift away.
.. plot::
:context: close-figs
:include-source: False
:class: center
:caption: Two variables have departed from the majority group (the group containing more than ``frac`` of the variables) by the end of the year.
qc.plot(variables, xscope=slice('2017-05', '2017-11'))
Let's try to detect those drifts via saqc. The changes we observe in the data only develop significantly over temporal spans of more than a month,
so we go for ``"1ME"`` as the value of the
``window`` parameter. We identified the majority group as a group containing three variables, with two variables
scattering away, so we can leave the ``frac`` value at its default level of ``.5``.
The majority group seems, on average, not to be spread out by more than 3 or 4 degrees, so for the ``spread`` value
we go for ``3``. This can be interpreted as follows: for every member of the group, there is another member that
is, on average over one month, not more than ``3`` degrees away from it. This should be sufficient to bundle
the majority group and to discriminate against the drifting variables, which seem to deviate from any member of the
majority group by more than 3 degrees on average in a month.
.. doctest:: flagDriftFromNorm
>>> variables = ['temp1 [degC]', 'temp2 [degC]', 'temp3 [degC]', 'temp4 [degC]', 'temp5 [degC]']
>>> qc = qc.flagDriftFromNorm(variables, window='1ME', spread=3)
.. plot::
:context: close-figs
:include-source: False
:class: center
variables = ['temp1 [degC]', 'temp2 [degC]', 'temp3 [degC]', 'temp4 [degC]', 'temp5 [degC]']
qc = qc.flagDriftFromNorm(variables, window='1ME', spread=3)
Lets check the results:
.. doctest:: flagDriftFromNorm
>>> qc.plot(variables, marker_kwargs={'alpha':.3, 's': 1, 'color': 'red', 'edgecolor': 'face'}) # doctest: +SKIP
.. plot::
:context: close-figs
:include-source: False
:class: center
qc.plot(variables, marker_kwargs={'alpha':.3, 's': 1, 'color': 'red', 'edgecolors': 'face'})
@@ -191,7 +191,6 @@ The resulting timeseries now has a regular timestamp.
.. doctest:: exampleMV
>>> qc.data['sac254_raw'] #doctest:+NORMALIZE_WHITESPACE
Timestamp
2016-01-01 00:00:00 NaN
2016-01-01 00:15:00 18.617873
2016-01-01 00:30:00 18.942700
@@ -246,17 +245,14 @@ Check out the results for the year *2016*
.. doctest:: exampleMV
>>> plt.plot(qc.data['sac254_raw']['2016'], alpha=.5, color='black', label='original') # doctest:+SKIP
>>> plt.plot(qc.data['sac254_corrected']['2016'], color='black', label='corrected') # doctest:+SKIP
>>> qc.plot(['sac254_raw','sac254_corrected'], xscope='2016', plot_kwargs={'color':['black', 'black'], 'alpha':[.5, 1], 'label':['original', 'corrected']}) # doctest:+SKIP
.. plot::
:context:
:include-source: False
plt.figure(figsize=(16,9))
plt.plot(qc.data['sac254_raw']['2016'], alpha=.5, color='black', label='original')
plt.plot(qc.data['sac254_corrected']['2016'], color='black', label='corrected')
plt.legend()
qc.plot(['sac254_raw','sac254_corrected'], xscope='2016', plot_kwargs={'color':['black', 'black'], 'alpha':[.5, 1], 'label':['original', 'corrected']})
Multivariate Flagging Procedure
-------------------------------
@@ -345,7 +341,7 @@ correlated with relatively high *kNNscores*, we could try to calculate a thresho
`STRAY <https://arxiv.org/pdf/1908.04000.pdf>`_ algorithm, which is available as the method:
:py:meth:`~saqc.SaQC.flagByStray`. This method will mark some samples of the `kNNscore` variable as anomaly.
Subsequently we project this marks (or *flags*) on to the *sac* variable with a call to
:py:meth:`~saqc.SaQC.concatFlags`. For the sake of demonstration, we also project the flags
:py:meth:`~saqc.SaQC.transferFlags`. For the sake of demonstration, we also project the flags
on the normalized *sac* and plot the flagged values in the *sac254_norm* - *level_norm* feature space.
@@ -353,8 +349,8 @@ on the normalized *sac* and plot the flagged values in the *sac254_norm* - *leve
.. doctest:: exampleMV
>>> qc = qc.flagByStray(field='kNNscores', freq='30D', alpha=.3)
>>> qc = qc.concatFlags(field='kNNscores', target='sac254_corrected', label='STRAY')
>>> qc = qc.concatFlags(field='kNNscores', target='sac254_norm', label='STRAY')
>>> qc = qc.transferFlags(field='kNNscores', target='sac254_corrected', label='STRAY')
>>> qc = qc.transferFlags(field='kNNscores', target='sac254_norm', label='STRAY')
>>> qc.plot('sac254_corrected', xscope='2016-11') # doctest:+SKIP
>>> qc.plot('sac254_norm', phaseplot='level_norm', xscope='2016-11') # doctest:+SKIP
@@ -363,8 +359,8 @@ on the normalized *sac* and plot the flagged values in the *sac254_norm* - *leve
:include-source: False
qc = qc.flagByStray(field='kNNscores', freq='30D', alpha=.3)
qc = qc.concatFlags(field='kNNscores', target='sac254_corrected', label='STRAY')
qc = qc.concatFlags(field='kNNscores', target='sac254_norm', label='STRAY')
qc = qc.transferFlags(field='kNNscores', target='sac254_corrected', label='STRAY')
qc = qc.transferFlags(field='kNNscores', target='sac254_norm', label='STRAY')
.. plot::
:context: close-figs
@@ -393,4 +389,4 @@ Config
To configure `saqc` to execute the above data processing and flagging steps, the config file would have to look
as follows:
.. literalinclude:: ../resources/data/hydro_config.csv
\ No newline at end of file
.. literalinclude:: ../resources/data/hydro_config.csv
@@ -147,19 +147,19 @@ Rolling Mean
^^^^^^^^^^^^
The easiest thing to do would be to apply a rolling mean
model via the method :py:meth:`saqc.SaQC.roll`.
model via the method :py:meth:`saqc.SaQC.rolling`.
.. doctest:: exampleOD
>>> import numpy as np
>>> qc = qc.roll(field='incidents', target='incidents_mean', func=np.mean, window='13D')
>>> qc = qc.rolling(field='incidents', target='incidents_mean', func=np.mean, window='13D')
.. plot::
:context:
:include-source: False
import numpy as np
qc = qc.roll(field='incidents', target='incidents_mean', func=np.mean, window='13D')
qc = qc.rolling(field='incidents', target='incidents_mean', func=np.mean, window='13D')
The ``field`` parameter is passed the name of the variable we want to calculate the rolling mean of.
The ``target`` parameter holds the name we want to store the results of the calculation under.
@@ -174,13 +174,13 @@ under the name ``np.median``. We just calculate another model curve for the ``"i
.. doctest:: exampleOD
>>> qc = qc.roll(field='incidents', target='incidents_median', func=np.median, window='13D')
>>> qc = qc.rolling(field='incidents', target='incidents_median', func=np.median, window='13D')
.. plot::
:context:
:include-source: False
qc = qc.roll(field='incidents', target='incidents_median', func=np.median, window='13D')
qc = qc.rolling(field='incidents', target='incidents_median', func=np.median, window='13D')
We chose another :py:attr:`target` value for the rolling *median* calculation, in order not to overwrite our results from
the previous rolling *mean* calculation.
@@ -255,25 +255,11 @@ This function object, we can pass on to the :py:meth:`~saqc.SaQC.processGeneric`
Visualisation
-------------
We can obtain this updated information by generating a `pandas dataframe <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>`_
representation of it, with the :py:attr:`data <saqc.core.core.SaQC.data>` method:
To see all the results obtained so far, plotted in one figure window, we make use of the :py:meth:`~saqc.SaQC.plot` method.
.. doctest:: exampleOD
>>> data = qc.data
.. plot::
:context:
:include-source: False
data = qc.data
To see all the results obtained so far, plotted in one figure window, we make use of the dataframes `plot <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html>`_ method.
.. doctest:: exampleOD
>>> data.to_df().plot()
<Axes...>
>>> qc.plot(".", regex=True) # doctest: +SKIP
.. plot::
:context:
@@ -281,7 +267,7 @@ To see all the results obtained so far, plotted in one figure window, we make us
:width: 80 %
:class: center
data.to_df().plot()
qc.plot(".", regex=True)
Residuals and Scores
@@ -332,18 +318,18 @@ for the point lying in the center of every window, we would define our function,
z_score = lambda D: abs((D[14] - np.mean(D)) / np.std(D))
And subsequently, use the :py:meth:`~saqc.SaQC.roll` method to make a rolling window application with the scoring
And subsequently, use the :py:meth:`~saqc.SaQC.rolling` method to make a rolling window application with the scoring
function:
.. doctest:: exampleOD
>>> qc = qc.roll(field='incidents_residuals', target='incidents_scores', func=z_score, window='27D')
>>> qc = qc.rolling(field='incidents_residuals', target='incidents_scores', func=z_score, window='27D', min_periods=27)
.. plot::
:context: close-figs
:include-source: False
qc = qc.roll(field='incidents_residuals', target='incidents_scores', func=z_score, window='27D')
qc = qc.rolling(field='incidents_residuals', target='incidents_scores', func=z_score, window='27D', min_periods=27)
Optimization by Decomposition
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
So the attempt works fine only because our data set is small and strictly regular,
meaning that it has constant temporal distances between subsequent measurements.
In order to tweak our calculations and make them much more stable, it might be useful to decompose the scoring
into separate calls to the :py:meth:`~saqc.SaQC.roll` function, by calculating the series of the
into separate calls to the :py:meth:`~saqc.SaQC.rolling` function, by calculating the series of the
residuals *mean* and *standard deviation* separately:
.. doctest:: exampleOD
>>> qc = qc.roll(field='incidents_residuals', target='residuals_mean', window='27D', func=np.mean)
>>> qc = qc.roll(field='incidents_residuals', target='residuals_std', window='27D', func=np.std)
>>> qc = qc.rolling(field='incidents_residuals', target='residuals_mean', window='27D', func=np.mean)
>>> qc = qc.rolling(field='incidents_residuals', target='residuals_std', window='27D', func=np.std)
>>> qc = qc.processGeneric(field=['incidents_scores', "residuals_mean", "residuals_std"], target="residuals_norm",
... func=lambda this, mean, std: (this - mean) / std)
@@ -376,15 +362,15 @@ residuals *mean* and *standard deviation* separately:
:context: close-figs
:include-source: False
qc = qc.roll(field='incidents_residuals', target='residuals_mean', window='27D', func=np.mean)
qc = qc.roll(field='incidents_residuals', target='residuals_std', window='27D', func=np.std)
qc = qc.rolling(field='incidents_residuals', target='residuals_mean', window='27D', func=np.mean)
qc = qc.rolling(field='incidents_residuals', target='residuals_std', window='27D', func=np.std)
qc = qc.processGeneric(field=['incidents_scores', "residuals_mean", "residuals_std"], target="residuals_norm", func=lambda this, mean, std: (this - mean) / std)
With huge datasets, this will be noticeably faster compared to the method presented :ref:`initially <cookbooks/ResidualOutlierDetection:Scores>`\ ,
because ``saqc`` dispatches the rolling with the basic numpy statistic methods to an optimized pandas built-in.
Also, as a result of the :py:meth:`~saqc.SaQC.roll` assigning its results to the center of every window,
Also, as a result of the :py:meth:`~saqc.SaQC.rolling` assigning its results to the center of every window,
all the values are centered and we don't have to care about window center indices when we are generating
the *Z*\ -Scores from the two series.
@@ -5,6 +5,13 @@
Developers Resources
====================
.. toctree::
:caption: Developer Resources
:hidden:
:maxdepth: 1
Documentation Guide <HowToDoc>
Writing Functions <WritingFunctions>
.. grid:: 3
:gutter: 2
@@ -5,88 +5,136 @@
Customizations
==============
SaQC comes with a continuously growing number of pre-implemented
quality checking and processing routines as well as flagging schemes.
For any sufficiently large use case, however, it is very likely that the
functions provided won't fulfill all your needs and requirements.
Acknowledging the impossibility of addressing all imaginable use cases, we
designed the system to allow for extensions and customizations. The main extension options, namely
SaQC comes with a continuously growing number of pre-implemented quality-checking and processing
routines as well as flagging schemes. For a sufficiently large use case, however, it might
nevertheless become necessary to extend the system. The main extension options, namely
:ref:`quality check routines <documentation/Customizations:custom quality check routines>`
and the :ref:`flagging scheme <documentation/Customizations:custom flagging schemes>`
are described within this document.
and the :ref:`flagging scheme <documentation/Customizations:custom flagging schemes>`.
Both of these mechanisms are described within this document.
Custom quality check routines
Custom Quality Check Routines
-----------------------------
In case you are missing quality check routines, you are of course very
welcome to file a feature request issue on the project's
`gitlab repository <https://git.ufz.de/rdm-software/saqc>`_. However, if
you are more the "I-get-this-done-by-myself" type of person,
SaQC provides two ways to integrate custom routines into the system:
In case you are missing quality check routines, you are, of course, very welcome to file a feature request issue on the project's `GitLab repository <https://git.ufz.de/rdm-software/saqc>`_. However, if you are more the "I-get-this-done-by-myself" type of person, SaQC offers the possibility to directly extend its functionality using its interface to the evaluation machinery.
#. The :ref:`extension language <documentation/GenericFunctions:Generic Functions>`
#. An :ref:`interface <documentation/Customizations:interface>` to the evaluation machinery
In order to make a function usable within the evaluation framework of SaQC, it needs to implement the following function interface:
Interface
^^^^^^^^^
In order to make a function usable within the evaluation framework of SaQC, it needs to
implement the following function interface
.. code-block:: python
import pandas
import saqc
def yourTestFunction(
    saqc: SaQC,
    field: str,
    *args,
    **kwargs
) -> SaQC:
def yourTestFunction(qc: SaQC, field: str | list[str], *args, **kwargs) -> SaQC:
# your code
return qc
Argument Descriptions
~~~~~~~~~~~~~~~~~~~~~
with the following parameters
.. list-table::
:header-rows: 1
* - Name
- Description
* - ``data``
- The actual dataset, an instance of ``saqc.DictOfSeries``.
* - ``qc``
- An instance of ``SaQC``
* - ``field``
- The field/column within ``data``, that function is processing.
* - ``flags``
- An instance of saqc.Flags, responsible for the translation of test results into quality attributes.
- The field(s)/column(s) of ``data`` the function is processing/flagging.
* - ``args``
- Any other arguments needed to parameterize the function.
- Any number of named arguments needed to parameterize the function.
* - ``kwargs``
- Any other keyword arguments needed to parameterize the function.
- Any number of named keyword arguments needed to parameterize the function. ``kwargs``
need to be present, even if the function needs no keyword arguments at all.
Integrate into SaQC
^^^^^^^^^^^^^^^^^^^
In order to make your function available to the system, it needs to be registered. We provide the decorator
`flagging <saqc/functions/register.py>`_ with saqc to integrate your
test functions into SaQC. Here is a complete dummy example:
SaQC provides two decorators, :py:func:`@flagging` and :py:func:`@register`, to integrate custom functions
into its workflow. The choice between them depends on the nature of your algorithm. :py:func:`@register`
is a more versatile decorator, allowing you to handle masking, demasking, and squeezing of data and flags, while
:py:func:`@flagging` is simpler and suitable for univariate flagging functions without the need for complex
data manipulations.
Use :py:func:`@flagging` for simple univariate flagging tasks without the need for complex data manipulations.
:py:func:`@flagging` is especially suitable when your algorithm operates on a single column:
.. code-block:: python
from saqc import register
from saqc import SaQC
from saqc.core.register import flagging
@flagging()
def yourTestFunction(saqc: SaQC, field: str, *args, **kwargs):
def simpleFlagging(saqc: SaQC, field: str | list[str], param1: ..., param2: ..., **kwargs) -> SaQC:
"""
Your simple univariate flagging logic goes here.
Parameters
----------
saqc : SaQC
The SaQC instance.
field : str
The field or fields on which to apply anomaly detection.
param1 : ...
Additional parameters needed for your algorithm.
param2 : ...
Additional parameters needed for your algorithm.
Returns
-------
SaQC
The modified SaQC instance.
"""
# Your flagging logic here
# Modify saqc._flags as needed
return saqc
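Once registered, the decorated function is available like any built-in method on ``SaQC`` objects. A usage sketch (``simpleFlagging`` and its parameters are the hypothetical names from the example above):

.. code-block:: python

   import pandas as pd
   import saqc

   data = pd.DataFrame(
       {"a": [1.0, 2.0, 3.0]},
       index=pd.date_range("2021-01-01", periods=3, freq="D"),
   )
   qc = saqc.SaQC(data)
   # the decorated function can now be called as a regular SaQC method
   qc = qc.simpleFlagging("a", param1=1, param2=2)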
Use :py:func:`@register` when your algorithm needs to handle multiple columns simultaneously (``multivariate=True``)
and/or you need explicit control over masking, demasking, and squeezing of data and flags.
:py:func:`@register` is especially suited for complex algorithms that involve interactions between different columns:
.. code-block:: python
from saqc import SaQC
from saqc.core.register import register
@register(
mask=["field"], # Parameter(s) of the decorated functions giving the names of columns in SaQC._data to mask before the call
demask=["field"], # Parameter(s) of the decorated functions giving the names of columns in SaQC._data to unmask after the call
squeeze=["field"], # Parameter(s) of the decorated functions giving the names of columns in SaQC._flags to squeeze into a single flags column after the call
multivariate=True, # Set to True to handle multiple columns
handles_target=False,
)
def complexAlgorithm(
saqc: SaQC, field: str | list[str], param1: ..., param2: ..., **kwargs
) -> SaQC:
"""
Your custom anomaly detection logic goes here.
Parameters
----------
saqc : SaQC
The SaQC instance.
field : str or list of str
The field or fields on which to apply anomaly detection.
param1 : ...
Additional parameters needed for your algorithm.
param2 : ...
Additional parameters needed for your algorithm.
Returns
-------
SaQC
The modified SaQC instance.
"""
# Your anomaly detection logic here
# Modify saqc._flags and saqc._data as needed
return saqc
Example
^^^^^^^
The function `flagRange <saqc/funcs/outliers.py>`_ provides a simple, yet complete implementation of
a quality check routine. You might want to look into its implementation as an example.
Custom flagging schemes
-----------------------
@@ -51,8 +51,7 @@ dummy dataset, to lead us through the following code snippets:
.. testsetup:: python
from saqc import fromConfig
from tests.common import writeIO
from saqc.parsing.reader import _ConfigReader as ConfigReader
.. testcode:: python
@@ -116,16 +115,13 @@ Simple constraints
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;------------------------
... x ; flagGeneric(func=x < 30)
... """
... ),
... data
... )
... ).run()
>>> tmp.flags == qc1.flags #doctest:+NORMALIZE_WHITESPACE
True
@@ -177,16 +173,13 @@ Cross variable constraints
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;------------------------------------
... x ; flagGeneric(field="y", func=y > 30)
... """
... ),
... data
... )
... ).run()
>>> tmp.flags == qc2.flags #doctest:+NORMALIZE_WHITESPACE
True
@@ -241,16 +234,13 @@ need to be put in parentheses.
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;--------------------------------------------------------
... x ; flagGeneric(field=["y", "z"], func=(y > 30) & (z < 50))
... """
... ),
... data
... )
... ).run()
>>> tmp.flags == qc3.flags #doctest:+NORMALIZE_WHITESPACE
True
@@ -293,16 +283,13 @@ Arithmetics
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;-------------------------------------------------------
... x ; flagGeneric(field=["x", "y", "z"], func=x > (y + z)/2)
... """
... ),
... data
... )
... ).run()
>>> tmp.flags == qc4.flags #doctest:+NORMALIZE_WHITESPACE
True
@@ -351,16 +338,13 @@ Special functions
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;---------------------------------------------------
... x ; flagGeneric(field=["x", "z"], func=x > std(z) * 2)
... """
... ),
... data
... )
... ).run()
>>> tmp.flags == qc5.flags #doctest:+NORMALIZE_WHITESPACE
True
@@ -402,17 +386,14 @@ Special functions
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;------------------------------------------
... y ; flagRange(min=10, max=60)
... x ; flagGeneric(field="y", func=isflagged(y))
... """
... ),
... data
... )
... ).run()
>>> tmp.flags == qc6.flags #doctest:+NORMALIZE_WHITESPACE
True
@@ -481,16 +462,13 @@ Let's consider the following dataset:
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;---------------------------------------------------------------
... meas ; flagGeneric(field=["fan", "volt"], func=(x == 0) | (y < 12.0))
... """
... ),
... data
... )
... ).run()
>>> tmp.flags == qc7.flags #doctest:+NORMALIZE_WHITESPACE
True
@@ -533,8 +511,7 @@ But we could also quality check our independent variables first and then leverag
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;--------------------------------------------------------------------------
@@ -543,9 +520,7 @@ But we could also quality check our independent variables first and then leverag
... volt ; flagGeneric(func=volt < 12.0)
... meas ; flagGeneric(field=["fan", "volt"], func=isflagged(fan) | isflagged(volt))
... """
... ),
... data
... )
... ).run()
>>> tmp.flags == qc8.flags #doctest:+NORMALIZE_WHITESPACE
True
@@ -634,16 +609,13 @@ variables in a given dataset. We start with dummy data again:
.. doctest:: python
:hide:
>>> tmp = fromConfig(
... writeIO(
>>> tmp = ConfigReader(data).readString(
... """
... varname ; test
... #-------;------------------------------------------------------
... mean ; processGeneric(field=["x", "y", "z"], func=(x+y+z)/2)
... """
... ),
... data
... )
... ).run()
>>> tmp.data == qc1.data #doctest:+NORMALIZE_WHITESPACE
True
@@ -37,7 +37,6 @@ Example Data
:context: close-figs
:include-source: False
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import saqc
@@ -7,7 +7,18 @@ Documentation
=============
.. grid:: 3
.. toctree::
:caption: Documentation
:maxdepth: 1
:hidden:
ConfigurationFiles
GlobalKeywords
Customizations
SourceTarget
FlaggingTranslation
.. grid:: 2
:gutter: 2
.. grid-item-card:: Configuration files (csv)
@@ -30,5 +41,12 @@ Documentation
+++
*Keywords shared by all the flagging functions*
.. grid-item-card:: Customizations
:link: Customizations
:link-type: doc
* add custom functions to SaQC
+++
*Add your own functions and flagging schemes to SaQC*