Compare revisions
Commits on Source (442)
Showing with 326 additions and 3402 deletions
@@ -5,3 +5,4 @@
*.feather filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
resources/machine_learning/data/soil_moisture_mwe.feather filter=lfs diff=lfs merge=lfs -text
saqc/_version.py export-subst
@@ -12,9 +12,9 @@ on:
branches:
- master
- develop
tags:
tags:
- v**
pull_request:
# Allow to run this workflow manually from the Actions tab
@@ -29,14 +29,14 @@ jobs:
fail-fast: false
matrix:
os: ["windows-latest", "ubuntu-latest", "macos-latest"]
python-version: ["3.7", "3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
defaults:
run:
# somehow this also works for windows O.o ??
shell: bash -l {0}
steps:
# checkout the repository under $GITHUB_WORKSPACE
- uses: actions/checkout@v3
- uses: conda-incubator/setup-miniconda@v2
@@ -44,28 +44,25 @@ jobs:
auto-update-conda: true
python-version: ${{ matrix.python-version }}
activate-environment: venv
- name: show conda info
run: conda info
- name: install requirements
run: |
run: |
pip install -r requirements.txt
pip install -r tests/requirements.txt
- name: show installed packages
- name: show installed packages
run: conda list
- name: run SaQC test suite
run: |
pytest tests dios/test -Werror
python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
- name: run doc tests
run: |
cd docs
pip install -r requirements.txt
make doc
make test
# - name: run doc tests
# run: |
# cd docs
# pip install -r requirements.txt
# make doc
# make test
@@ -8,6 +8,7 @@
*.automodapi
docs/_api
docs/_build
docs/resources/temp/*
coverage.xml
venv*/
**/.*
@@ -2,6 +2,20 @@
#
# SPDX-License-Identifier: GPL-3.0-or-later
# ===========================================================
# Hints
# ===========================================================
# $PYPI_PKG_NAME
# The variable PYPI_PKG_NAME is used in setup.py to determine
# how to name the tarball package. If not set the package is
# named 'saqc'.
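#   A sketch of how such a lookup in setup.py may look (assumed, not
#   verbatim): name = os.environ.get("PYPI_PKG_NAME", "saqc")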
# $TESTPYPI_TOKEN
# The upload token used for testpypi, set it on the gitlab
# page and enable masking to prevent it from being revealed
# ===========================================================
# preparation
# ===========================================================
@@ -16,11 +30,13 @@ stages:
- deploy
default:
image: python:3.10
image: python:3.11
before_script:
- pip install --upgrade pip
- pip install -r requirements.txt
- pip install -r tests/requirements.txt
- apt update
- apt install -y xvfb
# ===========================================================
# Compliance stage
@@ -61,8 +77,10 @@ coverage:
stage: test
allow_failure: true
script:
- export DISPLAY=:99
- Xvfb :99 &
- pip install pytest-cov coverage
- pytest --cov=saqc tests --ignore=tests/fuzzy -Werror
- pytest --cov=saqc tests --ignore=tests/fuzzy tests/extras -Werror
after_script:
- coverage xml
# regex to find the coverage percentage in the job output
@@ -75,12 +93,13 @@ coverage:
path: coverage.xml
# test saqc with python 3.7
python37:
python39:
stage: test
image: python:3.7
image: python:3.9
script:
- pytest tests dios/test -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
@@ -88,46 +107,49 @@ python37:
junit: report.xml
# test saqc with python 3.8
python38:
python310:
stage: test
image: python:3.10
script:
- pytest tests dios/test -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
reports:
junit: report.xml
# test saqc with python 3.9
python39:
python311:
stage: test
image: python:3.9
image: python:3.11
script:
- pytest tests dios/test -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
reports:
junit: report.xml
# test saqc with python 3.10
python310:
python312:
stage: test
image: python:3.10
image: python:3.12
script:
- pytest tests dios/test -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
reports:
junit: report.xml
doctest:
stage: test
variables:
COLUMNS: 200
script:
- cd docs
- pip install -r requirements.txt
@@ -139,24 +161,72 @@ doctest:
# Building stage
# ===========================================================
# check if we are able to build a wheel
wheel:
# and if the import works
wheel39:
stage: build
image: python:3.9
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
docs:
wheel310:
stage: build
image: python:3.10
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- cd docs
- pip install -r requirements.txt
- make doc
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
wheel311:
stage: build
image: python:3.11
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
wheel312:
stage: build
image: python:3.12
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
# ===========================================================
# Extra Pipeline (runs after a successful run of all other jobs on develop)
# ===========================================================
upload_testpypi:
stage: deploy
only:
- develop
except:
- schedules
variables:
PYPI_PKG_NAME: "saqc-dev"
TWINE_USERNAME: __token__
TWINE_PASSWORD: $TESTPYPI_TOKEN
script:
- pip install build twine
- python -m build
- twine check --strict dist/*
- twine upload -r testpypi dist/*
# make html docu with sphinx
pages:
stage: deploy
@@ -5,13 +5,146 @@ SPDX-License-Identifier: GPL-3.0-or-later
-->
# Changelog
## Unreleased
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.2.1...develop)
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.6.0...develop)
### Added
- `flagPlateaus`: added function to search and flag anomalous value plateaus of a certain temporal extent
- `flagUniLOF`: added dispatch to the Local Outlier Probability (*LoOP*) variant
- `flagUniLOF`: made `thresh` optional
### Changed
### Removed
### Fixed
- `flagConstants`: fixed bug where the last `min_periods` values would never get flagged
### Deprecated
## [2.6.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.6.0) - 2024-04-15
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.5.0...v2.6.0)
### Added
- `reindex`: base reindexer function
- `flagGeneric`, `processGeneric`: target broadcasting and numpy array support
- `SaQC`: automatic translation of incoming flags
- Option to change the flagging scheme after initialization
- `flagByClick`: manually assign flags using a graphical user interface
- `SaQC`: support for selection, slicing and setting of items by subscription on `SaQC` objects
- `transferFlags` is a multivariate function
- `plot`: added `yscope` keyword
- `setFlags`: function to replace `flagManual`
- `flagUniLOF`: added parameter `slope_correct` to correct for overflagging at relatively steep data value slopes
- `History`: added option to change aggregation behavior
- "horizontal" axis / multivariate mode for `rolling`
- Translation scheme `AnnotatedFloatScheme`
### Changed
- `SaQC.flags` always returns a `DictOfSeries`
### Removed
- `SaQC` methods deprecated in version 2.4: `interpolate`, `interpolateIndex`, `interpolateInvalid`, `roll`, `linear`, `shift`, `flagCrossStatistics`
- Method `Flags.toDios` deprecated in version 2.4
- Method `DictOfSeries.index_of` deprecated in version 2.4
- Option `"complete"` for parameter `history` of method `plot`
- Option `"cycleskip"` for parameter `ax_kwargs` of method `plot`
- Parameter `phaseplot` from method `plot`
### Fixed
- `flagConstants`: fixed flagging of rolling ramps
- `Flags`: add meta entry to imported flags
- group operations were overwriting existing flags
- `SaQC._construct` : was not working for inherited classes
- `processGeneric`: improved numpy function compatibility
### Deprecated
- `flagManual` in favor of `setFlags`
- `inverse_**` options for `concatFlags` parameter `method` in favor of `invert=True`
- `flagRaise` with delegation to better replacements `flagZScore`, `flagUniLOF`, `flagJumps` or `flagOffset`
- `flagByGrubbs` with delegation to better replacements `flagZScore`, `flagUniLOF`
- `flagMVScore` with delegation to manual application of the steps
## [2.5.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.5.0) - 2023-09-05
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.1...v2.5.0)
### Added
- WMO standard mean aggregations
- Function selection via strings for most function-expecting parameters
- `SaQC.plot`:
- enable multivariate plots
- keyword `plot_kwargs` to pass matplotlib related arguments
- CLI:
- `--version` to print the SaQC version
- `-ll` as a shorthand for `--log-level`
- `--json-field` to use a non-root element of a json file.
- basic json support for CLI config files, which are detected by `.json`-extension.
- `SaQC.flagScatterLowpass`: option to select function based on string names.
- Checks and unified error message for common function inputs.
### Changed
- Require pandas >= 2.0
- `SaQC.flagUniLOF` and `SaQC.assignUniLOF`: changed parameter `fill_na` to type `bool`.
- `SaQC.plot`:
- changed default color for single variables to `black` with `80% transparency`
- added separate legend for flags
### Removed
- `SaQC.plot`: option to plot with complete history (`history="complete"`)
- Support for Python 3.8
### Fixed
- `SaQC.assignChangePointCluster` and `SaQC.flagChangePoints`: A tuple passed as `min_period`
was only recognised if `window` was also a tuple.
- `SaQC.propagateFlags` was overwriting existing flags
### Deprecated
- `SaQC.andGroup` and `SaQC.orGroup`: option to pass dictionaries to `group`.
- `SaQC.plot`:
- `phaseplot` in favor of usage with `mode="biplot"`
- `cyclestart` in favor of usage with `marker_kwargs`
- `SaQC.flagStatLowPass` in favor of `SaQC.flagScatterLowpass`
## [2.4.1](https://git.ufz.de/rdm-software/saqc/-/tags/v2.4.1) - 2023-06-22
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.0...v.2.4.1)
### Added
### Changed
- pin pandas to versions >= 2.0
### Removed
- removed deprecated `DictOfSeries.to_df`
### Fixed
### Deprecated
## [2.4.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.4.0) - 2023-04-25
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.3.0...v2.4.0)
### Added
- Methods `logicalAnd` and `logicalOr`
- `Flags` support slicing and column selection with `list` or a `pd.Index`.
- Expose the `History` via `SaQC._history`
- Config function `cv` (coefficient of variation)
### Changed
- Rename `interpolateInvalid` to `interpolate`
- Rename `interpolateIndex` to `align`
- Rewrite of `dios.DictOfSeries`
### Removed
- Parameter `limit` from `align`
- Parameter `max_na_group_flags`, `max_na_flags`, `flag_func`, `freq_check` from `resample`
### Fixed
- `func` arguments in text configurations were not parsed correctly
- fail on duplicated arguments to test methods
- `resample` was not writing meta entries
- `flagByScatterLowpass` was overwriting existing flags
- `flagUniLOF` and `flagLOF` were overwriting existing flags
### Deprecated
- Deprecate `flagMVScore` parameters: `partition` in favor of `window`, `partition_min` in favor of `min_periods`, `min_periods` in favor of `min_periods_r`
- Deprecate `interpolate`, `linear` and `shift` in favor of `align`
- Deprecate `roll` in favor of `rolling`
- Deprecate `DictOfSeries.to_df` in favor of `DictOfSeries.to_pandas`
## [2.3.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.3.0) - 2023-01-17
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.2.1...v2.3.0)
### Added
- add option to not overwrite existing flags to `concatFlags`
- add option to pass existing axis object to `plot`
- python 3.11 support
- added Local Outlier Factor functionality
### Changed
- Remove all flag value restrictions from the default flagging scheme `FloatTranslator`
- Renamed `TranslationScheme.forward` to `TranslationScheme.toInternal`
- Renamed `TranslationScheme.backward` to `TranslationScheme.toExternal`
- Changed default value of the parameter `limit` for `SaQC.interpolateIndex` and `SaQC.interpolateInvalid` to ``None``
- Changed default value of the parameter ``overwrite`` for ``concatFlags`` to ``False``
- Deprecate ``transferFlags`` in favor of ``concatFlags``
### Removed
- python 3.7 support
### Fixed
- Error for interpolations with limits set to be greater than 2 (`interpolateNANs`)
- Error when fitting polynomials to irregularly sampled data (`fitPolynomial`)
## [2.2.1](https://git.ufz.de/rdm-software/saqc/-/tags/v2.2.1) - 2022-10-29
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.2.0...v2.2.1)
@@ -29,7 +162,7 @@ SPDX-License-Identifier: GPL-3.0-or-later
- translation of `dfilter`
- new generic function `clip`
- parameter `min_periods` to `SaQC.flagConstants`
- function `fitButterworth`
- function `fitLowpassFilter`
- tracking interpolation routines in `History`
### Changed
- test function interface changed to `func(saqc: SaQC, field: str | Sequence[str], *args, **kwargs)`
@@ -38,7 +171,7 @@ SPDX-License-Identifier: GPL-3.0-or-later
### Removed
- `closed` keyword in `flagJumps`
### Fixed
- fixed undesired behavior in `flagIsolated` for not harmonized data
- fixed undesired behavior in `flagIsolated` for not harmonized data
- fixed failing translation of `dfilter`-defaults
- fixed unbound recursion error when interpolating with order-independent methods in `interpolateIndex`
- fixed not working min_periods condition if `window=None` in `assignZScore`
@@ -3,7 +3,7 @@ title: SaQC - System for automated Quality Control
message: "Please cite this software using these metadata."
type: software
version: 2.0.0
doi: https://doi.org/10.5281/zenodo.5888547
doi: 10.5281/zenodo.5888547
date-released: "2021-11-25"
license: "GPL-3.0"
repository-code: "https://git.ufz.de/rdm-software/saqc"
@@ -59,7 +59,7 @@ It is not a shame to name a parameter just `n` or `alpha` etc., if, for example,
### Test Functions
- testnames: [testmodule_]flagTestName
- testnames: flagTestName
## Formatting
We use [black](https://black.readthedocs.io/en/stable/) in its default settings.
@@ -70,13 +70,17 @@ Only absolute imports are accepted.
# Development Workflow
## Releases
Every release is planned by an associated Milestone. This milestone should have an end date, usually the first of the month in which the next release is planned, and should contain all issues/merge requests to include.
## Repository Structure
- `master` - branch:
- `main` - branch:
+ Stable and usually protected.
+ Regular merges from `develop`; these merges are tagged and increase at least the minor version.
+ Irregular merges from `develop` in case of critical bugs. Such merges increase at least the patch level.
+ Merges into `master` usually lead to a PyPI release.
+ Merges into `main` usually lead to a PyPI release.
- `develop` - branch:
+ The main development branch, no hard stability requirements/guarantees.
+ Merges into `develop` should mostly follow a [Merge Request Workflow](#merge-request-workflow), minor changes can however be committed directly. Such minor changes include:
@@ -105,6 +109,6 @@ Only absolute imports are accepted.
release date. Commits to `develop` after the merge window of a release closes need to be integrated during the subsequent release
cycle.
- The release cycle is organized by GitLab Milestones; the expiration date of a certain milestone indicates the end of the
related merge window, the actual merge into `master` and the accompanying release is scheduled for the week after the
related merge window, the actual merge into `main` and the accompanying release is scheduled for the week after the
milestone's expiration date.
- Issues and Merge Requests can and should be associated with these milestones, as this helps in the organization of review activities.
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means.
In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>
@@ -4,43 +4,49 @@ SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
SPDX-License-Identifier: GPL-3.0-or-later
-->
<a href="https://www.ufz.de/index.php?en=33573">
<img src="https://git.ufz.de/rdm-software/saqc/raw/develop/docs/resources/images/representative/UFZLogo.png" width="400"/>
</a>
<a href="https://www.ufz.de/index.php?en=45348">
<img src="https://git.ufz.de/rdm-software/saqc/raw/develop/docs/resources/images/representative/RDMLogo.png" align="right" width="220"/>
</a>
<br>
<div align="center">
<img src="https://git.ufz.de/rdm-software/saqc/raw/develop/docs/resources/images/representative/SaQCLogo.png" width="300">
</div>
-----------------
[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
# SaQC: System for automated Quality Control
# System for automated Quality Control (SaQC)
`SaQC` is a tool/framework/application to quality control time series data.
It provides
a growing collection of algorithms and methods to analyze, annotate and
process time series data. It supports the end-to-end enrichment of metadata
and provides various user interfaces: 1) a Python API, 2) a command line interface
with a text-based configuration system, and 3) a
[web-based user interface](https://webapp.ufz.de/saqc-config-app/).
Anomalies and errors are the rule, not the exception, when working with
time series data. This is especially true if such data originates
from in-situ measurements of environmental properties.
Almost all applications, however, implicitly rely on data that complies
with some definition of 'correct'.
In order to infer reliable data products and tools, there is no alternative
to quality control. SaQC provides all the building blocks to comfortably
bridge the gap between 'usually faulty' and 'expected to be corrected' in
an accessible, consistent, objective and reproducible way.
`SaQC` is designed with a particular focus on the needs of active data professionals,
including sensor hardware-oriented engineers, domain experts, and data scientists,
all of whom can benefit from its capabilities to improve the quality standards of given data products.
For a (continuously improving) overview of features, typical usage patterns,
the specific system components and how to customize `SaQC` to your specific
the specific system components and how to customize `SaQC` to your own
needs, please refer to our
[online documentation](https://rdm-software.pages.ufz.de/saqc/index.html).
## Installation
SaQC is available on the Python Package Index ([PyPI](https://pypi.org/)) and
`SaQC` is available on the Python Package Index ([PyPI](https://pypi.org/)) and
can be installed using [pip](https://pip.pypa.io/en/stable/):
```sh
python -m pip install saqc
```
For a more detailed installation guide, see the [installation guide](https://rdm-software.pages.ufz.de/saqc/gettingstarted/InstallationGuide.html).
Additionally `SaQC` is available via conda and can be installed with:
```sh
conda create -c conda-forge -n saqc saqc
```
For more details, see the [installation guide](https://rdm-software.pages.ufz.de/saqc/gettingstarted/InstallationGuide.html).
## Usage
@@ -56,11 +62,11 @@ could look like [this](https://git.ufz.de/rdm-software/saqc/raw/develop/docs/res
```
varname ; test
#----------; ---------------------------------------------------------------------
SM2 ; shift(freq="15Min")
SM2 ; align(freq="15Min")
'SM(1|2)+' ; flagMissing()
SM1 ; flagRange(min=10, max=60)
SM2 ; flagRange(min=10, max=40)
SM2 ; flagMAD(window="30d", z=3.5)
SM2 ; flagZScore(window="30d", thresh=3.5, method='modified', center=False)
Dummy ; flagGeneric(field=["SM1", "SM2"], func=(isflagged(x) | isflagged(y)))
```
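Such a configuration is applied via the command line interface, e.g. `python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv` (the same invocation used in the CI jobs above).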
@@ -95,30 +101,27 @@ data = pd.read_csv(
index_col=0, parse_dates=True,
)
saqc = SaQC(data=data)
saqc = (saqc
.shift("SM2", freq="15Min")
.flagMissing("SM(1|2)+", regex=True)
.flagRange("SM1", min=10, max=60)
.flagRange("SM2", min=10, max=40)
.flagMAD("SM2", window="30d", z=3.5)
.flagGeneric(field=["SM1", "SM2"], target="Dummy", func=lambda x, y: (isflagged(x) | isflagged(y))))
qc = SaQC(data=data)
qc = (qc
.align("SM2", freq="15Min")
.flagMissing("SM(1|2)+", regex=True)
.flagRange("SM1", min=10, max=60)
.flagRange("SM2", min=10, max=40)
.flagZScore("SM2", window="30d", thresh=3.5, method='modified', center=False)
.flagGeneric(field=["SM1", "SM2"], target="Dummy", func=lambda x, y: (isflagged(x) | isflagged(y))))
```
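The processed data and the resulting flags can afterwards be retrieved from the `qc` object. A minimal sketch, assuming the pipeline above ran through:

```python
data_out = qc.data    # the processed data
flags_out = qc.flags  # the associated flags, a DictOfSeries
```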
A more detailed description of the Python API is available in the
A more detailed description of the Python API is available in the
[respective section](https://rdm-software.pages.ufz.de/saqc/gettingstarted/TutorialAPI.html)
of the documentation.
## Changelog
All notable changes to this project will be documented in [CHANGELOG.md](CHANGELOG.md).
## Get involved
### Contributing
You found a bug or you want to suggest some cool features? Please refer to our [contributing guidelines](CONTRIBUTING.md) to see how you can contribute to SaQC.
You found a bug or you want to suggest new features? Please refer to our [contributing guidelines](CONTRIBUTING.md) to see how you can contribute to SaQC.
### User support
If you need help or have a question, you can use the SaQC user support mailing list: [saqc-support@ufz.de](mailto:saqc-support@ufz.de)
If you need help or have questions, send us an email to [saqc-support@ufz.de](mailto:saqc-support@ufz.de)
## Copyright and License
Copyright(c) 2021, [Helmholtz-Zentrum für Umweltforschung GmbH -- UFZ](https://www.ufz.de). All rights reserved.
@@ -128,13 +131,24 @@ Copyright(c) 2021, [Helmholtz-Zentrum für Umweltforschung GmbH -- UFZ](https://
For full details, see [LICENSE](LICENSE.md).
## Acknowledgements
...
## Publications
coming soon...
> Lennart Schmidt, David Schäfer, Juliane Geller, Peter Lünenschloss, Bert Palm, Karsten Rinke, Corinna Rebmann, Michael Rode, Jan Bumberger, System for automated Quality Control (SaQC) to enable traceable and reproducible data streams in environmental science, Environmental Modelling & Software, 2023, 105809, ISSN 1364-8152, https://doi.org/10.1016/j.envsoft.2023.105809. (https://www.sciencedirect.com/science/article/pii/S1364815223001950)
## How to cite SaQC
If SaQC is advancing your research, please cite as:
> Schäfer, David; Palm, Bert; Lünenschloß, Peter. (2021). System for automated Quality Control - SaQC. Zenodo. https://doi.org/10.5281/zenodo.5888547
> Schäfer, David, Palm, Bert, Lünenschloß, Peter, Schmidt, Lennart, & Bumberger, Jan. (2023). System for automated Quality Control - SaQC (2.3.0). Zenodo. https://doi.org/10.5281/zenodo.5888547
or
> Lennart Schmidt, David Schäfer, Juliane Geller, Peter Lünenschloss, Bert Palm, Karsten Rinke, Corinna Rebmann, Michael Rode, Jan Bumberger, System for automated Quality Control (SaQC) to enable traceable and reproducible data streams in environmental science, Environmental Modelling & Software, 2023, 105809, ISSN 1364-8152, https://doi.org/10.1016/j.envsoft.2023.105809. (https://www.sciencedirect.com/science/article/pii/S1364815223001950)
-----------------
<a href="https://www.ufz.de/index.php?en=33573">
<img src="https://git.ufz.de/rdm-software/saqc/raw/develop/docs/resources/images/representative/UFZLogo.png" width="400"/>
</a>
<a href="https://www.ufz.de/index.php?en=45348">
<img src="https://git.ufz.de/rdm-software/saqc/raw/develop/docs/resources/images/representative/RDMLogo.png" align="right" width="220"/>
</a>
<!--
SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
SPDX-License-Identifier: GPL-3.0-or-later
-->
DictOfSeries
============
DictOfSeries is a pandas.Series of pandas.Series objects which aims to behave as similarly as possible to pandas.DataFrame.
Nomenclature
------------
- series/ser: instance of pandas.Series
- dios: instance of dios.DictOfSeries
- df: instance of pandas.DataFrame
- dios-like: a *dios* or a *df*
- alignable object: a *dios*, *df* or a *series*
Features
--------
* every *column* has its own index
* uses much less memory than a misaligned pandas.DataFrame
* behaves quite like a pandas.DataFrame
* additional align locator (`.aloc[]`)
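A minimal sketch of the align locator mentioned above (assuming that, as with pandas.DataFrame, elementwise comparisons on a dios yield a boolean dios):

```
>>> import pandas as pd
>>> from dios import DictOfSeries

>>> di = DictOfSeries(dict(a=pd.Series([1, 2, 3]), b=pd.Series([10, 20], index=[1, 2])))
>>> mask = di >= 2        # boolean dios, aligned per column
>>> sub = di.aloc[mask]   # 'a' keeps rows 1 and 2 (values 2, 3), 'b' keeps both rows
```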
Install
-------
todo: PyPi
```
import dios
# Have fun :)
```
Documentation
-------------
The main documentation is hosted on ReadTheDocs:
* [dios.rtfd.io](https://dios.rtfd.io)
but some docs are also available locally:
* [Indexing](/docs/doc_indexing.md)
* [Cookbook](/docs/doc_cookbook.md)
* [Itype](/docs/doc_itype.md)
TL;DR
-----
**get it**
```
>>> from dios import DictOfSeries
```
**empty**
```
>>> DictOfSeries()
Empty DictOfSeries
Columns: []
>>> DictOfSeries(columns=['x', 'y'])
Empty DictOfSeries
Columns: ['x', 'y']
>>> DictOfSeries(columns=['x', 'y'], index=[3,4,5])
x | y |
====== | ====== |
3 NaN | 3 NaN |
4 NaN | 4 NaN |
5 NaN | 5 NaN |
```
**with data**
```
>>> DictOfSeries([range(4), range(2), range(3)])
0 | 1 | 2 |
==== | ==== | ==== |
0 0 | 0 0 | 0 0 |
1 1 | 1 1 | 1 1 |
2 2 | | 2 2 |
3 3 | | |
>>> DictOfSeries(np.random.random([2,4]))
0 | 1 |
=========== | =========== |
0 0.112020 | 0 0.509881 |
1 0.108070 | 1 0.285779 |
2 0.851453 | 2 0.805933 |
3 0.138352 | 3 0.812339 |
>>> DictOfSeries(np.random.random([2,4]), columns=['a','b'], index=[11,12,13,14])
a | b |
============ | ============ |
11 0.394304 | 11 0.356206 |
12 0.943689 | 12 0.735356 |
13 0.791820 | 13 0.066947 |
14 0.759802 | 14 0.496321 |
>>> DictOfSeries(dict(today=['spam']*3, tomorrow=['spam']*2))
today | tomorrow |
======= | ========== |
0 spam | 0 spam |
1 spam | 1 spam |
2 spam | |
```
# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
#
# SPDX-License-Identifier: GPL-3.0-or-later
from .dios import *
from .lib import *
__all__ = [
"DictOfSeries",
"to_dios",
"pprint_dios",
"IntItype",
"FloatItype",
"NumItype",
"DtItype",
"ObjItype",
"ItypeWarning",
"ItypeCastWarning",
"ItypeCastError",
"is_itype",
"is_itype_subtype",
"is_itype_like",
"get_itype",
"cast_to_itype",
"CastPolicy",
"Opts",
"OptsFields",
"OptsFields",
"dios_options",
"example_DictOfSeries",
]
# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
#
# SPDX-License-Identifier: GPL-3.0-or-later
import numpy as np
import pandas as pd
from . import pandas_bridge as pdextra
from .base import _DiosBase, _is_bool_dios_like, _is_dios_like
class _Indexer:
def __init__(self, obj: _DiosBase):
self.obj = obj
self._data = obj._data
def _unpack_key(self, key):
key = list(key) if pdextra.is_iterator(key) else key
if isinstance(key, tuple):
if len(key) > 2:
raise KeyError("To many indexers")
rowkey, colkey = key
else:
rowkey, colkey = key, slice(None)
if isinstance(rowkey, tuple) or isinstance(colkey, tuple):
raise KeyError(f"{key}. tuples are not allowed.")
rowkey = list(rowkey) if pdextra.is_iterator(rowkey) else rowkey
colkey = list(colkey) if pdextra.is_iterator(colkey) else colkey
return rowkey, colkey
def _set_value_multi_column(self, rowkey, colkey, value, xloc="loc"):
"""set value helper for loc and iloc"""
data = getattr(self._data, xloc)[colkey]
hashable_rkey = pdextra.is_hashable(rowkey)
dioslike_value = False
iter_value = False
if _is_dios_like(value):
dioslike_value = True
if hashable_rkey:
raise ValueError(f"Incompatible indexer with DictOfSeries")
elif pdextra.is_list_like(value):
value = value.values if isinstance(value, pd.Series) else value
iter_value = True
if len(value) != len(data):
raise ValueError(
f"shape mismatch: value array of shape (.., {len(value)}) could "
f"not be broadcast to indexing result of shape (.., {len(data)})"
)
c = "?"
try:
for i, c in enumerate(data.index):
dat = data.at[c]
dat_xloc = getattr(dat, xloc)
if dioslike_value:
# setting on an empty series would fail, e.g. emptySer.loc[:] = [2, 1]
# len(<scalar>) would fail here, but cannot happen, because the
# dios-like value + hashable rowkey combination was already rejected
if len(dat_xloc[rowkey]) == 0:
continue
# unpack the value if necessary
if iter_value:
val = value[i]
elif dioslike_value:
val = value[c] if c in value else np.nan
else:
val = value
dat_xloc[rowkey] = val
except Exception as e:
raise type(e)(f"failed for column {c}: " + str(e)) from e
# #############################################################################
class _LocIndexer(_Indexer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def __getitem__(self, key):
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(rowkey) or _is_dios_like(colkey):
raise ValueError("Could not index with multidimensional key")
# simple optimisation
if pdextra.is_null_slice(rowkey) and pdextra.is_null_slice(colkey):
return self.obj.copy()
data = self._data.loc[colkey].copy()
# .loc[any, scalar] -> (a single) series
# .loc[scalar, scalar] -> (a single) value
if pdextra.is_hashable(colkey):
new = data.loc[rowkey]
# .loc[any, non-scalar]
else:
k = "?"
try:
for k in data.index:
data.at[k] = data.at[k].loc[rowkey]
except Exception as e:
raise type(e)(f"failed for column {k}: " + str(e)) from e
# .loc[scalar, non-scalar] -> column-indexed series
if pdextra.is_hashable(rowkey):
new = data
# .loc[non-scalar, non-scalar] -> dios
else:
new = self.obj.copy_empty(columns=False)
new._data = data
return new
def __setitem__(self, key, value):
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(rowkey) or _is_dios_like(colkey):
raise ValueError("Cannot index with multi-dimensional key")
# .loc[any, scalar] - set on single column
if pdextra.is_hashable(colkey):
# .loc[dont-care, new-scalar] = val
if colkey not in self.obj.columns:
self.obj._insert(colkey, value)
# .loc[any, scalar] = multi-dim
elif _is_dios_like(value) or pdextra.is_nested_list_like(value):
raise ValueError("Incompatible indexer with multi-dimensional value")
# .loc[any, scalar] = val
else:
self._data.at[colkey].loc[rowkey] = value
# .loc[any, non-scalar] = any
else:
self._set_value_muli_column(rowkey, colkey, value, xloc="loc")
# #############################################################################
class _iLocIndexer(_Indexer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def __getitem__(self, key):
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(rowkey) or _is_dios_like(colkey):
raise ValueError("Cannot index with multidimensional key")
# simple optimisation
if pdextra.is_null_slice(rowkey) and pdextra.is_null_slice(colkey):
return self.obj.copy()
data = self._data.iloc[colkey].copy()
# .iloc[any, int] -> single series
# .iloc[int, int] -> single value
if pdextra.is_integer(colkey):
new = data.iloc[rowkey]
# .iloc[any, non-int]
else:
k = "?"
try:
for k in data.index:
data.at[k] = data.at[k].iloc[rowkey]
except Exception as e:
raise type(e)(f"failed for column {k}: " + str(e)) from e
# .iloc[int, non-int] -> column-indexed series
if pdextra.is_integer(rowkey):
new = data
# .iloc[non-int, non-int] -> dios
else:
new = self.obj.copy_empty(columns=False)
new._data = data
return new
def __setitem__(self, key, value):
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(rowkey) or _is_dios_like(colkey):
raise ValueError("Cannot index with multidimensional key")
# .iloc[any, int] = Any
if pdextra.is_integer(colkey):
if _is_dios_like(value) or pdextra.is_nested_list_like(value):
raise ValueError("Incompatible indexer with multi-dimensional value")
self._data.iat[colkey].iloc[rowkey] = value
# .iloc[any, non-int] = Any
else:
self._set_value_muli_column(rowkey, colkey, value, xloc="iloc")
# #############################################################################
class _aLocIndexer(_Indexer):
"""align Indexer
Automatically align (alignable) indexer on all possible axis,
and handle indexing with non-existent or missing keys gracefully.
Also align (alignable) values before setting them with .loc
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._usebool = True
def __call__(self, usebool=True):
"""We are called if the user want to set `usebool=False', which make
boolean alignable indexer treat as non-boolean alignable indexer.
Explanation: A boolean dios indexer align its indices with the indices
of the receiving dios like a non-boolean dios indexer also would do.
Additionally all rows with False values are kicked too. To disable
that `usebool=False` can be given."""
self._usebool = usebool
return self
def __getitem__(self, key):
rowkeys, colkeys, lowdim = self._unpack_key_aloc(key)
data = pd.Series(dtype="O", index=colkeys)
kws = dict(itype=self.obj.itype, cast_policy=self.obj._policy)
c = "?"
try:
for i, c in enumerate(data.index):
data.at[c] = self._data.at[c].loc[rowkeys[i]]
except Exception as e:
raise type(e)(f"failed for column {c}: " + str(e)) from e
if lowdim:
return data.squeeze()
else:
return self.obj._constructor(data=data, fastpath=True, **kws)._finalize(
self.obj
)
def __setitem__(self, key, value):
rowkeys, colkeys, _ = self._unpack_key_aloc(key)
def iter_self(colkeys, position=False):
c = "?"
try:
for i, c in enumerate(colkeys):
dat = self._data.at[c]
rk = rowkeys[i]
if len(dat.loc[rk]) == 0:
continue
yield dat, rk, i if position else c
except Exception as e:
raise type(e)(f"failed for column {c}: " + str(e)) from e
# align columns, for rows use series.loc to align
if _is_dios_like(value):
colkeys = value.columns.intersection(colkeys)
for dat, rk, c in iter_self(colkeys):
dat.loc[rk] = value[c]
# no align, no merci
elif pdextra.is_nested_list_like(value):
if len(colkeys) != len(value):
raise ValueError(
f"shape mismatch: values array of shape "
f"(.., {len(value)}) could not "
f"be broadcast to indexing result of "
f"shape (.., {len(colkeys)})"
)
for dat, rk, i in iter_self(colkeys, position=True):
dat.loc[rk] = value[i]
# align rows by using series.loc
elif isinstance(value, pd.Series):
for dat, rk, _ in iter_self(colkeys):
dat.loc[rk] = value
# no align, no merci
else:
for dat, rk, _ in iter_self(colkeys):
dat.loc[rk] = value
def _unpack_key_aloc(self, key):
"""
Return a list of row indexers and a list of existing(!) column labels.
Both lists always have the same length and may also both be empty.
Note:
The items of the row indexer list should be passed to pd.Series.loc[]
"""
# if a single column-key is given, the caller may
# want to return a single Series, instead of a dios
lowdim = False
def keys_from_bool_dios_like(key):
if not _is_bool_dios_like(key):
raise ValueError("Must pass dios-like key with boolean values only.")
colkey = self.obj.columns.intersection(key.columns)
rowkey = []
for c in colkey:
b = key[c]
rowkey += [self._data.at[c].index.intersection(b[b].index)]
return rowkey, colkey, lowdim
def keys_from_dios_like(key):
colkey = self.obj.columns.intersection(key.columns)
rowkey = [self._data.at[c].index.intersection(key[c].index) for c in colkey]
return rowkey, colkey, lowdim
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(colkey) or pdextra.is_nested_list_like(colkey):
raise ValueError("Could not index with multi-dimensional column key.")
# giving the ellipsis as column key, is an alias
# for giving `usebool=False`. see self.__call__()
if colkey is Ellipsis:
self._usebool = False
colkey = slice(None)
# .aloc[dios]
if _is_dios_like(rowkey):
if not pdextra.is_null_slice(colkey):
raise ValueError(
f"Could not index with a dios-like indexer as rowkey,"
f"and a column key of that type {type(colkey)}"
)
if self._usebool:
return keys_from_bool_dios_like(rowkey)
else:
return keys_from_dios_like(rowkey)
# handle gracefully: scalar
elif pdextra.is_hashable(colkey):
colkey = [colkey] if colkey in self.obj.columns else []
lowdim = True
# column-alignable: list-like, filter only existing columns
elif pdextra.is_list_like(colkey) and not pdextra.is_bool_indexer(colkey):
colkey = colkey.values if isinstance(colkey, pd.Series) else colkey
colkey = self.obj.columns.intersection(colkey)
# handle gracefully (automatically)
# just a simple optimisation
elif pdextra.is_null_slice(colkey):
colkey = self.obj.columns
# not alignable, fall back to .loc (boolean list/series, slice(..), etc.
else:
colkey = self._data.loc[colkey].index
if len(colkey) == 0: # (!) `if not colkey:` fails for pd.Index
return [], [], lowdim
rowkey = self._get_rowkey(rowkey, colkey)
return rowkey, colkey, lowdim
def _get_rowkey(self, rowkey, colkey, depth=0):
if pdextra.is_nested_list_like(rowkey) and depth == 0:
rowkey = rowkey.values if isinstance(rowkey, pd.Series) else rowkey
if len(rowkey) != len(colkey):
raise ValueError(
"Nested arrays indexer must have same (outer) "
"length than the number of selected columns."
)
indexer = []
for i, c in enumerate(colkey):
# recurse to get the row indexer from inner element
indexer += self._get_rowkey(rowkey[i], [c], depth=depth + 1)
rowkey = indexer
# row-alignable: pd.Series(), align rows to every series in colkey (columns)
elif isinstance(rowkey, pd.Series):
if self._usebool and pdextra.is_bool_indexer(rowkey):
rowkey = [
self._data.at[c].index.intersection(rowkey[rowkey].index)
for c in colkey
]
else:
rowkey = [
self._data.at[c].index.intersection(rowkey.index) for c in colkey
]
# handle gracefully: scalar, transform to row-slice
elif pdextra.is_hashable(rowkey):
rowkey = [slice(rowkey, rowkey)] * len(colkey)
# handle gracefully: list-like, filter only existing rows
# NOTE: dios.aloc[series.index] is processed here
elif pdextra.is_list_like(rowkey) and not pdextra.is_bool_indexer(rowkey):
rowkey = [self._data.at[c].index.intersection(rowkey) for c in colkey]
# not alignable
# the rowkey is processed by .loc someway in
# the calling function - (eg. slice(..), boolean list-like, etc.)
else:
rowkey = [rowkey] * len(colkey)
return rowkey
# #############################################################################
class _AtIndexer(_Indexer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _check_key(self, key):
if not (
isinstance(key, tuple)
and len(key) == 2
and pdextra.is_hashable(key[0])
and pdextra.is_hashable(key[1])
):
raise KeyError(
f"{key}. `.at` takes exactly one scalar row-key "
"and one scalar column-key"
)
def __getitem__(self, key):
self._check_key(key)
return self._data.at[key[1]].at[key[0]]
def __setitem__(self, key, value):
self._check_key(key)
if _is_dios_like(value) or pdextra.is_nested_list_like(value):
raise TypeError(
".at[] cannot be used to set multi-dimensional values, use .aloc[] instead."
)
self._data.at[key[1]].at[key[0]] = value
# #############################################################################
class _iAtIndexer(_Indexer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _check_key(self, key):
if not (
isinstance(key, tuple)
and len(key) == 2
and pdextra.is_integer(key[0])
and pdextra.is_integer(key[1])
):
raise KeyError(
f"{key} `.iat` takes exactly one integer positional "
f"row-key and one integer positional scalar column-key"
)
def __getitem__(self, key):
self._check_key(key)
return self._data.iat[key[1]].iat[key[0]]
def __setitem__(self, key, value):
self._check_key(key)
if _is_dios_like(value) or pdextra.is_nested_list_like(value):
raise TypeError(
".iat[] cannot be used to set multi-dimensional values, use .aloc[] instead."
)
self._data.iat[key[1]].iat[key[0]] = value
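# Usage sketch (illustrative, not part of the module): assuming a DictOfSeries
# exposes these indexers as `.at`/`.iat` properties, `di.at[row, col]` reads or
# writes a single scalar and `di.iat[rowpos, colpos]` is the positional
# counterpart; note that both resolve the column first, then the row.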
# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
#
# SPDX-License-Identifier: GPL-3.0-or-later
import warnings
from contextlib import contextmanager
import pandas as pd
@contextmanager
def no_index_warning():
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=FutureWarning)
yield
class ItypeWarning(RuntimeWarning):
pass
class ItypeCastWarning(ItypeWarning):
pass
class ItypeCastError(RuntimeError):
pass
class __Itype:
def __init__(self):
raise RuntimeError("a Itype class does not allow instances of itself.")
class DtItype(__Itype):
name = "datetime"
unique = True
subtypes = (pd.DatetimeIndex,)
min_pdindex = pd.DatetimeIndex([])
class IntItype(__Itype):
name = "integer"
unique = True
with no_index_warning():
subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int)
min_pdindex = pd.Int64Index([])
class FloatItype(__Itype):
name = "float"
unique = True
with no_index_warning():
subtypes = (pd.Float64Index, float)
min_pdindex = pd.Float64Index([])
# class MultiItype(__Itype):
# name = "multi"
# subtypes = (pd.MultiIndex, )
# unique = ??
class NumItype(__Itype):
name = "numeric"
_subitypes = (IntItype, FloatItype)
subtypes = _subitypes + IntItype.subtypes + FloatItype.subtypes
unique = False
with no_index_warning():
min_pdindex = pd.Float64Index([])
class ObjItype(__Itype):
name = "object"
unique = False
_subitypes = (DtItype, IntItype, FloatItype, NumItype, str)
_otheritypes = (
pd.CategoricalIndex,
pd.IntervalIndex,
pd.PeriodIndex,
pd.TimedeltaIndex,
pd.Index,
)
subtypes = _subitypes + _otheritypes + DtItype.subtypes + NumItype.subtypes
min_pdindex = pd.Index([])
def is_itype(obj, itype):
"""Check if obj is a instance of the given itype or its str-alias was given"""
# todo: iter through itype as it could be a tuple, if called like ``is_itype(o, (t1,t2))``
# user gave a Itype, like ``DtItype``
if type(obj) == type and issubclass(obj, itype):
return True
# user gave a string, like 'datetime'
if isinstance(obj, str) and obj == itype.name:
return True
return False
def is_itype_subtype(obj, itype):
"""Check if obj is a subclass or a instance of a subclass of the given itype"""
# user gave a subtype, like ``pd.DatetimeIndex``
if type(obj) == type and issubclass(obj, itype.subtypes):
return True
# user gave an instance of a subtype, like ``pd.Series(..).index``
if isinstance(obj, itype.subtypes):
return True
return False
def is_itype_like(obj, itype):
"""Check if obj is a subclass or a instance of the given itype or any of its subtypes"""
return is_itype(obj, itype) or is_itype_subtype(obj, itype)
def get_itype(obj):
"""
Return the corresponding Itype for the given object.
Parameters
----------
obj : {itype string, Itype, subclass of pd.Index, instance of pd.Index}
    the object to determine the fitting itype for
Examples
--------
>>> get_itype("datetime")
<class 'dios.lib.DtItype'>
>>> s = pd.Series(index=pd.to_datetime([]))
>>> get_itype(s.index)
<class 'dios.lib.DtItype'>
>>> get_itype(DtItype)
<class 'dios.lib.DtItype'>
>>> get_itype(pd.DatetimeIndex)
<class 'dios.lib.DtItype'>
"""
if type(obj) == type and issubclass(obj, __Itype):
return obj
# check if it is the actual type, not a subtype
types = [DtItype, IntItype, FloatItype, NumItype, ObjItype]
for t in types:
if is_itype(obj, t):
return t
for t in types:
if is_itype_subtype(obj, t):
return t
raise ValueError(
f"{obj} is not a itype, nor any known subtype of a itype, nor a itype string alias"
)
def _itype_eq(a, b):
return is_itype(a, b)
def _itype_lt(a, b):
return is_itype_subtype(a, b)
def _itype_le(a, b):
return is_itype_like(a, b)
def _find_least_common_itype(iterable_of_series):
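# candidates are ordered from general to specific; the inner for/else only
# sets `found` if every series' itype fits the candidate, so the most
# specific fitting candidate wins; ObjItype is the fallback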
itypes = [NumItype, FloatItype, IntItype, DtItype]
tlist = [get_itype(s.index) for s in iterable_of_series]
found = ObjItype
if tlist:
for itype in itypes:
for t in tlist:
if _itype_le(t, itype):
continue
break
else:
found = itype
return found
################################################################################
# Casting
class CastPolicy:
force = "force"
save = "save"
never = "never"
_CAST_POLICIES = [CastPolicy.force, CastPolicy.save, CastPolicy.never]
def cast_to_itype(series, itype, policy="lossless", err="raise", inplace=False):
"""Cast a series (more explicit the type of the index) to fit the itype of a dios.
Return the casted series if successful, None otherwise.
Note:
This is very basic number-casting, so in most cases, information from
the old index will be lost after the cast.
"""
if policy not in _CAST_POLICIES:
raise ValueError(f"policy={policy}")
if err not in ["raise", "ignore"]:
raise ValueError(f"err={err}")
if not inplace:
series = series.copy()
itype = get_itype(itype)
if series.empty:
return pd.Series(index=itype.min_pdindex, dtype=series.dtype)
series.itype = get_itype(series.index)
# an up-cast isn't necessary because a dios with a higher
# itype can always take lower itypes.
# series can have dt/int/float/mixed
# dt -> dt -> mixed
# int -> int -> num -> mixed
# float -> float -> num -> mixed
# mixed -> mixed
if _itype_le(series.itype, itype): # a <= b
return series
e = f"A series index of type '{type(series.index)}' cannot be casted to Itype '{itype.name}'"
# cast any -> dt always fails.
if is_itype(itype, DtItype):
pass
else:
e += f", as forbidden by the cast-policy '{policy}'."
if policy == CastPolicy.never:
pass
elif policy == CastPolicy.force:
# cast any (dt/float/mixed) -> int
if is_itype(itype, IntItype): # a == b
series.index = pd.RangeIndex(len(series))
return series
# cast any (dt/int/mixed) -> float
# cast any (dt/float/mixed) -> num
if is_itype(itype, FloatItype) or is_itype(itype, NumItype): # a == b or a == c
series.index = pd.Float64Index(range(len(series)))
return series
elif policy == CastPolicy.save:
# cast int -> float
if is_itype(itype, IntItype) and is_itype(
series.itype, FloatItype
): # a == b and c == d
series.index = series.index.astype(float)
return series
# cast float -> int, maybe if unique
if is_itype(itype, FloatItype) and is_itype(
series.itype, IntItype
): # a == b and c == d
series.index = series.index.astype(int)
if series.index.is_unique:
return series
e = (
f"The cast with policy {policy} from series index type '{type(series.index)}' to "
f"itype {itype.name} resulted in a non-unique index."
)
# cast mixed -> int/float always fail
if err == "raise":
raise ItypeCastError(e)
else:
return None
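# Usage sketch (illustrative, not part of the module): force-cast a
# datetime-indexed series to an integer itype; per the 'force' branch above,
# the old index is replaced by a fresh RangeIndex:
#
#   s = pd.Series([1.0, 2.0], index=pd.to_datetime(["2021-01-01", "2021-01-02"]))
#   s = cast_to_itype(s, IntItype, policy="force")
#   s.index  # -> RangeIndex(start=0, stop=2, step=1)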
################################################################################
# OPTIONS
class OptsFields:
"""storage class for the keys in `dios_options`
Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
See Also
--------
Opts: values for the options dict
dios_options: options dict for module
"""
mixed_itype_warn_policy = "mixed_itype_policy"
disp_max_rows = "disp_max_rows "
disp_min_rows = "disp_min_rows "
disp_max_cols = "disp_max_vars"
dios_repr = "dios_repr"
class Opts:
"""storage class for string values for `dios_options`
Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
See Also
--------
OptsFields: keys for the options dict
dios_options: options dict for module
"""
itype_warn = "warn"
itype_err = "err"
itype_ignore = "ignore"
repr_aligned = "aligned"
repr_indexed = "indexed"
class __DocDummy(dict):
pass
dios_options = __DocDummy()
dios_options.update(
**{
OptsFields.disp_max_rows: 60,
OptsFields.disp_min_rows: 10,
OptsFields.disp_max_cols: 10,
OptsFields.mixed_itype_warn_policy: Opts.itype_warn,
OptsFields.dios_repr: Opts.repr_indexed,
}
)
opdoc = f"""Options dictionary for module `dios`.
Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
**Items**:
* {OptsFields.dios_repr}: {{'indexed', 'aligned'}} default: 'indexed'
dios default representation:
* `indexed`: show every column with its own index
* `aligned`: transform to a pandas.DataFrame with the indices merged together
* {OptsFields.disp_max_rows} : int
Maximum number of rows before the representation of a DictOfSeries
is truncated to `disp_min_rows`
* {OptsFields.disp_min_rows} : int
Minimum number of rows to display if `disp_max_rows` is exceeded
* {OptsFields.disp_max_cols} : int
Maximum number of columns before the representation is truncated
* {OptsFields.mixed_itype_warn_policy} : {{'warn', 'err', 'ignore'}}
How to inform the user about mixed Itypes
See Also
--------
OptsFields: keys for the options dict
Opts: values for the options dict
"""
dios_options.__doc__ = opdoc
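# Usage sketch (illustrative): switch the default repr to the aligned,
# DataFrame-like style and allow more columns before truncation:
#
#   dios_options[OptsFields.dios_repr] = Opts.repr_aligned
#   dios_options[OptsFields.disp_max_cols] = 20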
def _throw_MixedItype_err_or_warn(itype):
msg = (
f"Using '{itype.name}' as itype is not recommend. "
f"As soon as series with different index types are inserted,\n"
f"indexing and slicing will almost always fail. "
)
if dios_options[OptsFields.mixed_itype_warn_policy] in [
"ignore",
Opts.itype_ignore,
]:
pass
elif dios_options[OptsFields.mixed_itype_warn_policy] in [
"error",
"err",
Opts.itype_err,
]:
msg += "Suppress this error by specifying an unitary 'itype' or giving an 'index' to DictOfSeries."
raise ItypeCastError(msg)
else:
msg += "Silence this warning by specifying an unitary 'itype' or giving an 'index' to DictOfSeries."
warnings.warn(msg, ItypeWarning)
return
def example_DictOfSeries():
"""Return a example dios.
Returns
-------
DictOfSeries: an example
Examples
--------
>>> from dios import example_DictOfSeries
>>> di = example_DictOfSeries()
>>> di
a | b | c | d |
===== | ====== | ====== | ===== |
0 0 | 2 5 | 4 7 | 6 0 |
1 7 | 3 6 | 5 17 | 7 1 |
2 14 | 4 7 | 6 27 | 8 2 |
3 21 | 5 8 | 7 37 | 9 3 |
4 28 | 6 9 | 8 47 | 10 4 |
5 35 | 7 10 | 9 57 | 11 5 |
6 42 | 8 11 | 10 67 | 12 6 |
7 49 | 9 12 | 11 77 | 13 7 |
8 56 | 10 13 | 12 87 | 14 8 |
9 63 | 11 14 | 13 97 | 15 9 |
"""
from dios import DictOfSeries
a = pd.Series(range(0, 70, 7))
b = pd.Series(range(5, 15, 1))
c = pd.Series(range(7, 107, 10))
d = pd.Series(range(0, 10, 1))
for i, s in enumerate([a, b, c, d]):
s.index += i * 2
di = DictOfSeries(dict(a=a, b=b, c=c, d=d))
return di.copy()
# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
#
# SPDX-License-Identifier: GPL-3.0-or-later
# do not import dios-stuff here
import operator as op
_OP1_MAP = {
op.inv: "~",
op.neg: "-",
op.abs: "abs()",
}
_OP2_COMP_MAP = {
op.eq: "==",
op.ne: "!=",
op.le: "<=",
op.ge: ">=",
op.gt: ">",
op.lt: "<",
}
_OP2_BOOL_MAP = {
op.and_: "&",
op.or_: "|",
op.xor: "^",
}
_OP2_ARITH_MAP = {
op.add: "+",
op.sub: "-",
op.mul: "*",
op.pow: "**",
}
_OP2_DIV_MAP = {
op.mod: "%",
op.truediv: "/",
op.floordiv: "//",
}
OP_MAP = _OP2_COMP_MAP.copy()
OP_MAP.update(_OP2_BOOL_MAP)
OP_MAP.update(_OP2_ARITH_MAP)
OP_MAP.update(_OP2_DIV_MAP)
OP_MAP.update(_OP1_MAP)
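# Illustrative: OP_MAP maps an operator function to its display symbol,
# e.g. OP_MAP[op.add] == "+" and OP_MAP[op.inv] == "~" (presumably used to
# render readable messages for elementwise dios operations).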
#!/usr/bin/env python
# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
#
# SPDX-License-Identifier: GPL-3.0-or-later
__author__ = "Bert Palm"
__email__ = "bert.palm@ufz.de"
__copyright__ = "Copyright 2020, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ"
from pandas.api.types import (  # note: is_iterator returns False for lists, e.g. is_iterator([1, 2, 3]) -> False
is_dict_like,
is_hashable,
is_integer,
is_iterator,
is_list_like,
is_scalar,
)
from pandas.core.common import is_bool_indexer, is_null_slice
from pandas.core.dtypes.common import is_nested_list_like
# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
#
# SPDX-License-Identifier: GPL-3.0-or-later
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile clean
clean:
rm -rf _build _static _api
rm -f *.automodsumm
mkdir _static
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
#
# SPDX-License-Identifier: GPL-3.0-or-later
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath(".."))
# -- Project information -----------------------------------------------------
project = "dios"
copyright = "2020, Bert Palm"
author = "Bert Palm"
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
# "sphinx.ext.doctest",
# "sphinx.ext.extlinks",
# "sphinx.ext.todo",
# "sphinx.ext.intersphinx",
# "sphinx.ext.coverage",
# "sphinx.ext.mathjax",
# "sphinx.ext.ifconfig",
"sphinx.ext.autosectionlabel",
# link source code
"sphinx.ext.viewcode",
# add support for NumPy style docstrings
"sphinx.ext.napoleon",
# doc the whole module
"sphinx_automodapi.automodapi",
"sphinxcontrib.fulltoc",
# markdown sources support
"recommonmark",
"sphinx_markdown_tables",
]
numpydoc_show_class_members = False
automodsumm_inherited_members = True
automodapi_inheritance_diagram = False
automodapi_toctreedirnm = "_api"
# automodsumm_writereprocessed = True
autosectionlabel_prefix_document = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
source_suffix = [".rst", ".md"]
# -- Options for HTML output -------------------------------------------------
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = "nature"
# use pandas theme
# html_theme = "pydata_sphinx_theme"
# html_theme_options = {
# }
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
.. SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
..
.. SPDX-License-Identifier: GPL-3.0-or-later
API
====
.. automodapi:: dios
:include-all-objects:
:no-heading: