Commit 945700e4 authored by Peter Lünenschloß

Merge branch 'develop' into flagConstantsFix

parents 62e9a08c 33ac032e
Merge request !867: Flag constants fix
Showing changes with 25140 additions and 76 deletions
......@@ -29,7 +29,7 @@ jobs:
fail-fast: false
matrix:
os: ["windows-latest", "ubuntu-latest", "macos-latest"]
python-version: ["3.9", "3.10", "3.11"]
python-version: ["3.9", "3.10", "3.11", "3.12"]
defaults:
run:
# somehow this also works for windows O.o ??
......
......@@ -30,11 +30,13 @@ stages:
- deploy
default:
image: python:3.10
image: python:3.11
before_script:
- pip install --upgrade pip
- pip install -r requirements.txt
- pip install -r tests/requirements.txt
- apt update
- apt install -y xvfb
# ===========================================================
# Compliance stage
......@@ -75,8 +77,10 @@ coverage:
stage: test
allow_failure: true
script:
- export DISPLAY=:99
- Xvfb :99 &
- pip install pytest-cov coverage
- pytest --cov=saqc tests --ignore=tests/fuzzy -Werror
- pytest --cov=saqc tests --ignore=tests/fuzzy tests/extras -Werror
after_script:
- coverage xml
# regex to find the coverage percentage in the job output
......@@ -93,7 +97,9 @@ python39:
stage: test
image: python:3.9
script:
- pytest tests -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
......@@ -105,7 +111,9 @@ python310:
stage: test
image: python:3.10
script:
- pytest tests -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
......@@ -116,7 +124,22 @@ python311:
stage: test
image: python:3.11
script:
- pytest tests -Werror --junitxml=report.xml
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
reports:
junit: report.xml
python312:
stage: test
image: python:3.12
script:
- export DISPLAY=:99
- Xvfb :99 &
- pytest tests -Werror --junitxml=report.xml --ignore=tests/extras
- python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv
artifacts:
when: always
......@@ -125,6 +148,8 @@ python311:
doctest:
stage: test
variables:
COLUMNS: 200
script:
- cd docs
- pip install -r requirements.txt
......@@ -170,6 +195,16 @@ wheel311:
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
wheel312:
stage: build
image: python:3.12
variables:
PYPI_PKG_NAME: "saqc-dev"
script:
- pip install wheel
- pip wheel .
- pip install .
- python -c 'import saqc; print(f"{saqc.__version__=}")'
# ===========================================================
# Extra Pipeline (run with a successful run of all other jobs on develop)
......
......@@ -6,27 +6,56 @@ SPDX-License-Identifier: GPL-3.0-or-later
# Changelog
## Unreleased
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.5.0...develop)
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.6.0...develop)
### Added
- `flagGeneric`: target broadcasting
- `flagPlateaus`: added function to search and flag outlierish value plateaus of certain temporal extension
- `flagUniLOF`: added dispatch to Local Outlier Probability (*LoOP*) variant
- `flagUniLOF`: made `thresh` Optional
- `flagPlateaus`: added function to search and flag anomalous value plateaus of certain temporal extension
### Changed
### Removed
### Fixed
### Deprecated
## [2.6.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.6.0) - 2024-04-15
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.5.0...v2.6.0)
### Added
- `reindex`: base reindexer function
- `flagGeneric`, `processGeneric`: target broadcasting and numpy array support
- `SaQC`: automatic translation of incoming flags
- Option to change the flagging scheme after initialization
- `SaQC`: support for selection, slicing and setting of items by use of subscription on SaQC objects (e.g. `qc[key]` and `qc[key] = value`).
Selection works with single keys, collections of keys and string slices (e.g. `qc["a":"f"]`). Values can be SaQC objects, pd.Series,
Iterable of Series and dict-like with series values.
- `flagByClick`: manually assign flags using a graphical user interface
- `SaQC`: support for selection, slicing and setting of items by subscription on `SaQC` objects
- `transferFlags` is a multivariate function
- `plot`: added `yscope` keyword
- `setFlags`: function to replace `flagManual`
- `flagUniLOF`: added parameter `slope_correct` to correct for overflagging at relatively steep data value slopes
- `History`: added option to change aggregation behavior
- "horizontal" axis / multivariate mode for `rolling`
- Translation scheme `AnnotatedFloatScheme`
### Changed
- `SaQC.flags` always returns a `DictOfSeries`
### Removed
- `SaQC` methods deprecated in version 2.4: `interpolate`, `interpolateIndex`, `interpolateInvalid`, `roll`, `linear`, `shift`, `flagCrossStatistics`
- Method `Flags.toDios` deprecated in version 2.4
- Method `DictOfSeries.index_of` method deprecated in version 2.4
- Option `"complete"` for parameter `history` of method `plot`
- Option `"cycleskip"` for parameter `ax_kwargs` of method `plot`
- Parameter `phaseplot` from method `plot`
### Fixed
- `flagConstants`: fixed flagging of rolling ramps
- `Flags`: add meta entry to imported flags
- group operations were overwriting existing flags
- `SaQC._construct` : was not working for inherit classes (used hardcoded `SaQC` to construct a new instance).
- `SaQC._construct` : was not working for inherited classes
- `processGeneric`: improved numpy function compatibility
### Deprecated
- `flagManual` in favor of `setFlags`
- `inverse_**` options for `concatFlags` parameter `method` in favor of `invert=True`
- `flagRaise` with delegation to better replacements `flagZScore`, `flagUniLOF`, `flagJumps` or `flagOffset`
- `flagByGrubbs` with delegation to better replacements `flagZScore`, `flagUniLOF`
- `flagMVScore` with delegation to manual application of the steps
## [2.5.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.4.1) - 2023-06-22
## [2.5.0](https://git.ufz.de/rdm-software/saqc/-/tags/v2.5.0) - 2023-09-05
[List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.4.1...v2.5.0)
### Added
- WMO standard mean aggregations
......
......@@ -62,7 +62,7 @@ could look like [this](https://git.ufz.de/rdm-software/saqc/raw/develop/docs/res
```
varname ; test
#----------; ---------------------------------------------------------------------
SM2 ; shift(freq="15Min")
SM2 ; align(freq="15Min")
'SM(1|2)+' ; flagMissing()
SM1 ; flagRange(min=10, max=60)
SM2 ; flagRange(min=10, max=40)
......@@ -103,7 +103,7 @@ data = pd.read_csv(
qc = SaQC(data=data)
qc = (qc
.shift("SM2", freq="15Min")
.align("SM2", freq="15Min")
.flagMissing("SM(1|2)+", regex=True)
.flagRange("SM1", min=10, max=60)
.flagRange("SM2", min=10, max=40)
......
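The CSV config and the chained Python calls above express the same pipeline. For readers new to the range check, a minimal pandas sketch of what ``flagRange(min=10, max=60)`` conceptually marks (the mask logic is an illustration, not saqc's internal code, and the sample values are made up):

```python
import pandas as pd

# hypothetical soil moisture readings
sm1 = pd.Series([5.0, 20.0, 65.0, 30.0])

# flagRange(min=10, max=60) conceptually marks values outside [10, 60]
out_of_range = (sm1 < 10) | (sm1 > 60)
print(out_of_range.tolist())  # → [True, False, True, False]
```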
......@@ -30,7 +30,7 @@ clean:
# make documentation
doc:
# generate environment table from dictionary
@$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@ $(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# run tests
test:
......
......@@ -315,10 +315,10 @@ Aggregation
If we want to combine several values by aggregation and assign the result to the new regular timestamp, instead of
selecting a single one, we can do this with the :py:meth:`~saqc.SaQC.resample` method.
Let's resample the *SoilMoisture* data to a *20* minutes sample rate by aggregating every *20* minutes interval's
content with the arithmetic mean (which is provided by the ``numpy.mean`` function for example).
content with the arithmetic mean.
>>> import numpy as np
>>> qc = qc.resample('SoilMoisture', target='SoilMoisture_mean', freq='20min', method='bagg', func=np.mean)
>>> qc = qc.resample('SoilMoisture', target='SoilMoisture_mean', freq='20min', method='bagg', func="mean")
>>> qc.data # doctest: +SKIP
SoilMoisture | SoilMoisture_mean |
================================ | ===================================== |
......
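The interval-mean idea behind the ``resample`` call can be pictured with plain pandas. The sketch below assumes a regular 10-minute synthetic series and only illustrates aggregating each 20-minute interval, not saqc's exact bucket assignment for ``method='bagg'``:

```python
import numpy as np
import pandas as pd

# synthetic 10-minute series
idx = pd.date_range("2021-01-01", periods=12, freq="10min")
s = pd.Series(np.arange(12.0), index=idx)

# aggregate every 20-minute interval with the arithmetic mean
out = s.resample("20min").mean()
print(out.iloc[0])  # → 0.5, the mean of the first two values
```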
......@@ -140,7 +140,7 @@ Looking at the example data set more closely, we see that 2 of the 5 variables s
qc.plot(variables, xscope=slice('2017-05', '2017-11'))
Let's try to detect those drifts via saqc. The changes we observe in the data seem to develop significantly only in temporal spans over a month,
so we go for ``"1M"`` as value for the
so we go for ``"1ME"`` as value for the
``window`` parameter. We identified the majority group as a group containing three variables, whereby two variables
seem to be scattered away, so that we can leave the ``frac`` value at its default ``.5`` level.
The majority group seems on average not to be spread out more than 3 or 4 degrees. So, for the ``spread`` value
......@@ -152,7 +152,7 @@ average in a month from any member of the majority group.
.. doctest:: flagDriftFromNorm
>>> variables = ['temp1 [degC]', 'temp2 [degC]', 'temp3 [degC]', 'temp4 [degC]', 'temp5 [degC]']
>>> qc = qc.flagDriftFromNorm(variables, window='1M', spread=3)
>>> qc = qc.flagDriftFromNorm(variables, window='1ME', spread=3)
.. plot::
:context: close-figs
......@@ -160,7 +160,7 @@ average in a month from any member of the majority group.
:class: center
>>> variables = ['temp1 [degC]', 'temp2 [degC]', 'temp3 [degC]', 'temp4 [degC]', 'temp5 [degC]']
>>> qc = qc.flagDriftFromNorm(variables, window='1M', spread=3)
>>> qc = qc.flagDriftFromNorm(variables, window='1ME', spread=3)
Let's check the results:
......
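The intuition behind :py:meth:`~saqc.SaQC.flagDriftFromNorm` can be sketched with plain pandas/numpy: compute a windowed mean per variable and check how far each variable strays from the group. The threshold logic below is a simplified stand-in for the real algorithm (which forms the majority group more carefully), and all data is synthetic:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.date_range("2017-01-01", periods=365, freq="D")
frame = pd.DataFrame(
    {f"temp{i} [degC]": rng.normal(20.0, 1.0, 365) for i in range(1, 6)},
    index=idx,
)
# let one sensor drift away over the year
frame["temp5 [degC]"] += np.linspace(0.0, 8.0, 365)

# monthly means per variable, deviation from the cross-variable median
monthly = frame.groupby(frame.index.month).mean()
deviation = monthly.sub(monthly.median(axis=1), axis=0).abs()
drifting = deviation.max() > 3  # analogous to spread=3
```

With four of five sensors agreeing, the median tracks the majority, so only the drifting sensor exceeds the spread threshold.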
......@@ -191,7 +191,6 @@ The resulting timeseries now has a regular timestamp.
.. doctest:: exampleMV
>>> qc.data['sac254_raw'] #doctest:+NORMALIZE_WHITESPACE
Timestamp
2016-01-01 00:00:00 NaN
2016-01-01 00:15:00 18.617873
2016-01-01 00:30:00 18.942700
......
......@@ -147,19 +147,19 @@ Rolling Mean
^^^^^^^^^^^^
The easiest thing to do would be to apply some rolling mean
model via the method :py:meth:`saqc.SaQC.roll`.
model via the method :py:meth:`saqc.SaQC.rolling`.
.. doctest:: exampleOD
>>> import numpy as np
>>> qc = qc.roll(field='incidents', target='incidents_mean', func=np.mean, window='13D')
>>> qc = qc.rolling(field='incidents', target='incidents_mean', func=np.mean, window='13D')
.. plot::
:context:
:include-source: False
import numpy as np
qc = qc.roll(field='incidents', target='incidents_mean', func=np.mean, window='13D')
qc = qc.rolling(field='incidents', target='incidents_mean', func=np.mean, window='13D')
The ``field`` parameter is passed the variable name we want to calculate the rolling mean of.
The ``target`` parameter holds the name we want to store the results of the calculation to.
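Since rolling with basic numpy statistics dispatches to pandas' optimized rolling machinery, the mean model corresponds to a plain pandas call. A minimal sketch on synthetic data (the series and names are illustrative; ``center=True`` mirrors saqc assigning results to the window center):

```python
import numpy as np
import pandas as pd

# synthetic daily series standing in for the incidents data
idx = pd.date_range("2020-01-01", periods=30, freq="D")
incidents = pd.Series(np.arange(30.0), index=idx)

# centered 13-day rolling mean, the pandas operation backing the call above
incidents_mean = incidents.rolling("13D", center=True).mean()
```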
......@@ -174,13 +174,13 @@ under the name ``np.median``. We just calculate another model curve for the ``"i
.. doctest:: exampleOD
>>> qc = qc.roll(field='incidents', target='incidents_median', func=np.median, window='13D')
>>> qc = qc.rolling(field='incidents', target='incidents_median', func=np.median, window='13D')
.. plot::
:context:
:include-source: False
qc = qc.roll(field='incidents', target='incidents_median', func=np.median, window='13D')
qc = qc.rolling(field='incidents', target='incidents_median', func=np.median, window='13D')
We chose another :py:attr:`target` value for the rolling *median* calculation, in order not to override our results from
the previous rolling *mean* calculation.
......@@ -318,18 +318,18 @@ for the point lying in the center of every window, we would define our function
z_score = lambda D: abs((D[14] - np.mean(D)) / np.std(D))
And subsequently, use the :py:meth:`~saqc.SaQC.roll` method to make a rolling window application with the scoring
And subsequently, use the :py:meth:`~saqc.SaQC.rolling` method to make a rolling window application with the scoring
function:
.. doctest:: exampleOD
>>> qc = qc.roll(field='incidents_residuals', target='incidents_scores', func=z_score, window='27D')
>>> qc = qc.rolling(field='incidents_residuals', target='incidents_scores', func=z_score, window='27D', min_periods=27)
.. plot::
:context: close-figs
:include-source: False
qc = qc.roll(field='incidents_residuals', target='incidents_scores', func=z_score, window='27D')
qc = qc.rolling(field='incidents_residuals', target='incidents_scores', func=z_score, window='27D', min_periods=27)
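To make the scoring concrete, here is the lambda from above as a standalone function, applied to a single made-up 29-element window with a spike at its center:

```python
import numpy as np

# z-score of the value at the window center (index 14 of a 29-element window)
def z_score(window):
    return abs((window[14] - np.mean(window)) / np.std(window))

window = np.ones(29)
window[14] = 5.0  # a single spike at the center
score = z_score(window)
```

The spike scores far above the rest of the window, which is exactly what makes it stand out in the later thresholding step.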
Optimization by Decomposition
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
......@@ -347,13 +347,13 @@ So the attempt works fine, only because our data set is small and strictly regul
Meaning that it has constant temporal distances between subsequent measurements.
In order to tweak our calculations and make them much more stable, it might be useful to decompose the scoring
into separate calls to the :py:meth:`~saqc.SaQC.roll` function, by calculating the series of the
into separate calls to the :py:meth:`~saqc.SaQC.rolling` function, by calculating the series of the
residuals *mean* and *standard deviation* separately:
.. doctest:: exampleOD
>>> qc = qc.roll(field='incidents_residuals', target='residuals_mean', window='27D', func=np.mean)
>>> qc = qc.roll(field='incidents_residuals', target='residuals_std', window='27D', func=np.std)
>>> qc = qc.rolling(field='incidents_residuals', target='residuals_mean', window='27D', func=np.mean)
>>> qc = qc.rolling(field='incidents_residuals', target='residuals_std', window='27D', func=np.std)
>>> qc = qc.processGeneric(field=['incidents_scores', "residuals_mean", "residuals_std"], target="residuals_norm",
... func=lambda this, mean, std: (this - mean) / std)
......@@ -362,15 +362,15 @@ residuals *mean* and *standard deviation* seperately:
:context: close-figs
:include-source: False
qc = qc.roll(field='incidents_residuals', target='residuals_mean', window='27D', func=np.mean)
qc = qc.roll(field='incidents_residuals', target='residuals_std', window='27D', func=np.std)
qc = qc.rolling(field='incidents_residuals', target='residuals_mean', window='27D', func=np.mean)
qc = qc.rolling(field='incidents_residuals', target='residuals_std', window='27D', func=np.std)
qc = qc.processGeneric(field=['incidents_scores', "residuals_mean", "residuals_std"], target="residuals_norm", func=lambda this, mean, std: (this - mean) / std)
With huge datasets, this will be noticeably faster, compared to the method presented :ref:`initially <cookbooks/ResidualOutlierDetection:Scores>`\ ,
because ``saqc`` dispatches the rolling with the basic numpy statistic methods to an optimized pandas built-in.
Also, as a result of the :py:meth:`~saqc.SaQC.roll` assigning its results to the center of every window,
Also, as a result of the :py:meth:`~saqc.SaQC.rolling` assigning its results to the center of every window,
all the values are centered and we don't have to care about window center indices when we are generating
the *Z*\ -Scores from the two series.
......
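The decomposed variant corresponds directly to pandas rolling statistics followed by an elementwise combination. A self-contained sketch on a synthetic residual series (names are illustrative):

```python
import numpy as np
import pandas as pd

# synthetic residual series standing in for incidents_residuals
idx = pd.date_range("2020-01-01", periods=60, freq="D")
residuals = pd.Series(np.sin(np.arange(60.0)), index=idx)

# separate rolling mean and std, centered like saqc's rolling
mean = residuals.rolling("27D", center=True).mean()
std = residuals.rolling("27D", center=True).std()

# combine into Z-scores without worrying about window center indices
scores = (residuals - mean) / std
```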
......@@ -16,5 +16,6 @@ Basic Anomalies
~SaQC.flagRaise
~SaQC.flagConstants
~SaQC.flagByVariance
~SaQC.flagPlateau
......@@ -11,4 +11,3 @@ Gap filling
:nosignatures:
~SaQC.interpolateByRolling
~SaQC.interpolate
......@@ -15,3 +15,5 @@ Flagtools
~SaQC.flagManual
~SaQC.flagDummy
~SaQC.transferFlags
~SaQC.andGroup
~SaQC.orGroup
......@@ -13,6 +13,6 @@ Generic Functions
~SaQC.processGeneric
~SaQC.flagGeneric
~SaQC.roll
~SaQC.transform
~SaQC.resample
~SaQC.andGroup
~SaQC.orGroup
......@@ -12,7 +12,6 @@ Multivariate outlier detection.
.. autosummary::
:nosignatures:
~SaQC.flagMVScores
~SaQC.flagCrossStatistics
~SaQC.flagLOF
~SaQC.flagZScore
......@@ -16,8 +16,8 @@ Univariate Outlier Detection
~SaQC.flagByStray
~SaQC.flagMAD
~SaQC.flagOffset
~SaQC.flagByGrubbs
~SaQC.flagRange
~SaQC.flagLOF
~SaQC.flagZScore
~SaQC.flagPlateau
......@@ -10,9 +10,7 @@ Sampling Alignment
.. autosummary::
:nosignatures:
~SaQC.linear
~SaQC.shift
~SaQC.align
~SaQC.concatFlags
~SaQC.interpolateIndex
~SaQC.resample
~SaQC.reindex
......@@ -15,3 +15,4 @@ Tools
~SaQC.renameField
~SaQC.selectTime
~SaQC.plot
......@@ -50,7 +50,7 @@ with something more elaborate, is in fact a one line change. So let's start with
from saqc import SaQC
# we need some dummy data
values = np.array([12, 24, 36, 33, 89, 87, 45, 31, 18, 99])
values = np.array([12, 24, 36, 33, 89, 87, 45, 31, 18, 99], dtype="float")
dates = pd.date_range(start="2020-01-01", periods=len(values), freq="D")
data = pd.DataFrame({"a": values}, index=dates)
# let's insert some constant values ...
......@@ -103,32 +103,32 @@ number of different attributes, of which you likely might want to use the follow
.. doctest:: python
>>> qc.data #doctest:+NORMALIZE_WHITESPACE
a |
================= |
2020-01-01 12.0 |
2020-01-02 24.0 |
2020-01-03 36.0 |
2020-01-04 47.4 |
2020-01-05 47.4 |
2020-01-06 47.4 |
2020-01-07 45.0 |
2020-01-08 31.0 |
2020-01-09 175.0 |
2020-01-10 99.0 |
a |
================= |
2020-01-01 12.0 |
2020-01-02 24.0 |
2020-01-03 36.0 |
2020-01-04 47.4 |
2020-01-05 47.4 |
2020-01-06 47.4 |
2020-01-07 45.0 |
2020-01-08 31.0 |
2020-01-09 175.0 |
2020-01-10 99.0 |
>>> qc.flags #doctest:+NORMALIZE_WHITESPACE
a |
===================== |
2020-01-01 BAD |
2020-01-02 UNFLAGGED |
2020-01-03 UNFLAGGED |
2020-01-04 UNFLAGGED |
2020-01-05 UNFLAGGED |
2020-01-06 UNFLAGGED |
2020-01-07 UNFLAGGED |
2020-01-08 UNFLAGGED |
2020-01-09 BAD |
2020-01-10 BAD |
a |
===================== |
2020-01-01 BAD |
2020-01-02 UNFLAGGED |
2020-01-03 UNFLAGGED |
2020-01-04 UNFLAGGED |
2020-01-05 UNFLAGGED |
2020-01-06 UNFLAGGED |
2020-01-07 UNFLAGGED |
2020-01-08 UNFLAGGED |
2020-01-09 BAD |
2020-01-10 BAD |
Putting it together - The complete workflow
......@@ -142,7 +142,7 @@ The snippet below provides you with a complete example from the things we have se
from saqc import SaQC
# we need some dummy data
values = np.random.randint(low=0, high=100, size=100)
values = np.random.randint(low=0, high=100, size=100).astype(float)
dates = pd.date_range(start="2020-01-01", periods=len(values), freq="D")
data = pd.DataFrame({"a": values}, index=dates)
# let's insert some constant values ...
......
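The ``dtype="float"`` additions in the snippets above matter because the examples go on to work with NaN, and an integer-dtyped numpy array cannot hold NaN. This is my reading of the change, not stated in the commit; a minimal demonstration of the pitfall:

```python
import numpy as np

int_values = np.array([12, 24, 36])
float_values = np.array([12, 24, 36], dtype="float")

# NaN is a float concept: assigning it into a float array works ...
float_values[1] = np.nan

# ... while assigning it into an integer array raises a ValueError
try:
    int_values[1] = np.nan
    nan_rejected = False
except ValueError:
    nan_rejected = True
```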
......@@ -4,11 +4,11 @@
recommonmark==0.7.1
sphinx==7.2.6
sphinx-automodapi==0.16.0
sphinx-automodapi==0.17.0
sphinxcontrib-fulltoc==1.2.0
sphinx-markdown-tables==0.0.17
jupyter-sphinx==0.5.3
sphinx_autodoc_typehints==1.25.2
sphinx-tabs==3.4.4
sphinx_autodoc_typehints==1.23.0
sphinx-tabs==3.4.5
sphinx-design==0.5.0
pydata-sphinx-theme==0.14.4
pydata-sphinx-theme==0.15.2