diff --git a/.gitignore b/.gitignore index dfacde84c06ace3bf99af6b853b13350a6a5cd06..90558b0d9b52ae581d58614ff3c054f8dc348fe1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ *.automodapi docs/_api docs/_build +docs/resources/temp/* coverage.xml venv*/ **/.* diff --git a/CHANGELOG.md b/CHANGELOG.md index 6731aa25e3a53a09da14156df029fb9f6976d5a7..839310af7c89e234921ce0d54afdbadd97558a88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ SPDX-License-Identifier: GPL-3.0-or-later ## Unreleased [List of commits](https://git.ufz.de/rdm-software/saqc/-/compare/v2.3.0...develop) ### Added +- Methods `logicalAnd` and `logicalOr` ### Changed ### Removed ### Fixed diff --git a/README.md b/README.md index 05ca0fdd51adedb1f593a719e9fddc69c9b907ec..c1d63a74f60c76b5f5d4e4b62b534c7e21ec638e 100644 --- a/README.md +++ b/README.md @@ -137,4 +137,4 @@ coming soon... ## How to cite SaQC If SaQC is advancing your research, please cite as: -> Schäfer, David; Palm, Bert; Lünenschloß, Peter. (2021). System for automated Quality Control - SaQC. Zenodo. https://doi.org/10.5281/zenodo.5888547 +> Schäfer, David, Palm, Bert, Lünenschloß, Peter, Schmidt, Lennart, & Bumberger, Jan. (2023). System for automated Quality Control - SaQC (2.3.0). Zenodo. 
https://doi.org/10.5281/zenodo.5888547 diff --git a/dios/dios/dios.py b/dios/dios/dios.py index 659718d9df655806a964b563dc7f5efa10a6f00e..8a5ee9433be9e59d1e81ed4ec9d1c19012a092b1 100644 --- a/dios/dios/dios.py +++ b/dios/dios/dios.py @@ -1179,8 +1179,9 @@ def pprint_dios( # stringified values of the series upto max_rows+1, where # the additional row is the column-name outer = [] - for colname in data.index: - s = data.at[colname] + for i, colname in enumerate(data.index): + # use iat instead of at, see #GL391 + s = data.iat[i] isempty = s.empty if isempty: @@ -1277,6 +1278,7 @@ def _to_aligned_df(dios, no_value=" "): def to_dios(obj) -> DictOfSeries: + """try cast obj to DictOfSeries.""" if isinstance(obj, DictOfSeries): return obj return DictOfSeries(data=obj) diff --git a/dios/requirements.txt b/dios/requirements.txt index 981ad1b6463cf6f897cc0efe1c9c25d1e344c9ce..dfd11d04b95a848f21759d79f50b716ef03860ab 100644 --- a/dios/requirements.txt +++ b/dios/requirements.txt @@ -5,5 +5,5 @@ numpy==1.21.2 pandas==1.3.5 python-dateutil==2.8.2 -pytz==2022.2.1 +pytz==2022.7.1 six==1.16.0 diff --git a/docs/conf.py b/docs/conf.py index 11ebee06bf571f1995e3c2607094824a4d4d386c..bfe7e814164dd79dd837e219e598c82490623516 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,7 +32,7 @@ version = vdict["__version__"] # -- Customize logging ------------------------------------------------------- -# couldn't get rid of a ignorable warning, so filter it +# couldn't get rid of an ignorable warning, so filter it # also see: https://issuemode.com/issues/sphinx-doc/sphinx/73994507 diff --git a/docs/documentation/Customizations.rst b/docs/documentation/Customizations.rst index 3c76088f20a08079c12087d5f9a263124009e6a0..989e4d4e1770e1b82a3ae4e6326d04ca612e523a 100644 --- a/docs/documentation/Customizations.rst +++ b/docs/documentation/Customizations.rst @@ -38,7 +38,6 @@ implement the following function interface .. 
code-block:: python import pandas - import dios import saqc def yourTestFunction( diff --git a/docs/modules/SaQCCore.rst b/docs/modules/SaQCCore.rst index 197da183d6809feed687dcf07a360f7fc76c5aaf..93569cdb1b17699415c1b5455ef15685a6bb9bcf 100644 --- a/docs/modules/SaQCCore.rst +++ b/docs/modules/SaQCCore.rst @@ -4,6 +4,17 @@ SaQC ==== +.. currentmodule:: saqc + +.. HACK: add 'our' external imported objects to core, but dont make it show up here + .. autosummary:: + :toctree: ../_api + + saqc.core.to_dios + saqc.core.DictOfSeries + .. automodapi:: saqc.core :include-all-objects: + + diff --git a/docs/requirements.txt b/docs/requirements.txt index 412a452a20ca0b408634ec6172a52d36cd492bde..8798e4f94e2f51abacc0b7ead5fd46c860c988ab 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,5 +8,5 @@ sphinx-automodapi==0.14.1 sphinxcontrib-fulltoc==1.2.0 sphinx-markdown-tables==0.0.17 jupyter-sphinx==0.3.2 -sphinx_autodoc_typehints==1.18.2 +sphinx_autodoc_typehints==1.22 sphinx-tabs==3.4.1 diff --git a/requirements.txt b/requirements.txt index 2c1599bdcc6a53acaccedc5f00fadd2879816802..20eafea505c2213fb136a41f814321a419223fbd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,12 +4,12 @@ Click==8.1.3 dtw==1.4.0 -matplotlib==3.6.2 +matplotlib==3.6.3 numba==0.56.4 numpy==1.23.5 outlier-utils==0.0.3 -pyarrow==10.0.1 +pyarrow==11.0.0 pandas==1.3.5 -scikit-learn==1.2.0 +scikit-learn==1.2.1 scipy==1.10.0 typing_extensions==4.4.0 diff --git a/saqc/__init__.py b/saqc/__init__.py index c6db9151c878416e559f1b78824df83596dea397..082baf5bc8a209dd8163b2d35eab33ae2aaf82a4 100644 --- a/saqc/__init__.py +++ b/saqc/__init__.py @@ -1,15 +1,14 @@ #! 
/usr/bin/env python - # SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# # SPDX-License-Identifier: GPL-3.0-or-later - # -*- coding: utf-8 -*- +# isort: skip_file + """The System for automated Quality Control package.""" from saqc.constants import BAD, DOUBTFUL, FILTER_ALL, FILTER_NONE, GOOD, UNFLAGGED - -# import order: from small to big, to a void cycles -from saqc.core import Flags, SaQC, fromConfig +from saqc.core import Flags, SaQC +from saqc.core.translation import DmpScheme, FloatScheme, PositionalScheme, SimpleScheme +from saqc.parsing.reader import fromConfig from saqc.version import __version__ diff --git a/saqc/__main__.py b/saqc/__main__.py index 24ba5b8b09ac4ce7ee049da2b177728580b72c3d..eadfef4b6d53f149472919833a648bd79facae62 100644 --- a/saqc/__main__.py +++ b/saqc/__main__.py @@ -15,9 +15,9 @@ import numpy as np import pandas as pd import pyarrow as pa -from dios.dios.dios import DictOfSeries +from saqc.core import DictOfSeries from saqc.core.core import TRANSLATION_SCHEMES -from saqc.core.reader import fromConfig +from saqc.parsing.reader import fromConfig logger = logging.getLogger("SaQC") diff --git a/saqc/constants.py b/saqc/constants.py index 985bcbdc12348c96def01dd41def071559dc28d4..5cc9ae0bc378d7ba72fddd91597ce2d3a0c544f1 100644 --- a/saqc/constants.py +++ b/saqc/constants.py @@ -29,16 +29,12 @@ __all__ = [ "DOUBTFUL", "BAD", "GOOD", - "ENVIRONMENT", "FILTER_ALL", "FILTER_NONE", ] import numpy as np -import scipy.stats as st - -import saqc.lib.ts_operators as ts_ops # ---------------------------------------------------------------------- # global flag constants @@ -55,65 +51,3 @@ BAD = 255.0 FILTER_ALL = -np.inf FILTER_NONE = np.inf - - -# ---------------------------------------------------------------------- -# other -# ---------------------------------------------------------------------- -def clip(series, lower=None, upper=None): - return series.clip(lower=lower, upper=upper) - - -ENVIRONMENT = { - # 
Infinity constant - "inf": np.inf, - "INF": np.inf, - # Not A number Constant. - "NAN": np.nan, - "nan": np.nan, - # Pointwise absolute Value Function. - "abs": np.abs, - # Maximum Value Function. Ignores NaN. - "max": np.nanmax, - # Minimum Value Function. Ignores NaN. - "min": np.nanmin, - # Mean Value Function. Ignores NaN. - "mean": np.nanmean, - # Summation. Ignores NaN. - "sum": np.nansum, - # Standart Deviation. Ignores NaN. - "len": len, - # Pointwise Exponential. - "exp": np.exp, - # Pointwise Logarithm. - "log": np.log, - # Logarithm, returning NaN for zero input, instead of -inf. - "nanLog": ts_ops.zeroLog, - # Standart Deviation. Ignores NaN. - "std": np.nanstd, - # Variance. Ignores NaN. - "var": np.nanvar, - # Median. Ignores NaN. - "median": np.nanmedian, - # Count Number of values. Ignores NaNs. - "count": ts_ops.count, - # Identity. - "id": ts_ops.identity, - # Returns a Series` diff. - "diff": ts_ops.difference, - # Scales data to [0,1] Interval. - "scale": ts_ops.normScale, - # Standardize with Standart Deviation. - "zScore": lambda x: st.zscore(x, nan_policy="omit"), - # Standardize with Median and MAD. - "madScore": ts_ops.standardizeByMedian, - # Standardize with Median and inter quantile range. - "iqsScore": ts_ops.standardizeByIQR, - "clip": clip, - "GOOD": GOOD, - "BAD": BAD, - "UNFLAGGED": UNFLAGGED, - "DOUBTFUL": DOUBTFUL, - "FILTER_ALL": FILTER_ALL, - "FILTER_NONE": FILTER_NONE, -} diff --git a/saqc/core/__init__.py b/saqc/core/__init__.py index 7d30f6416abb5611e17cbfb7fae6025e48243296..775e7f23fda3afc45663371042f418fd924dde21 100644 --- a/saqc/core/__init__.py +++ b/saqc/core/__init__.py @@ -1,14 +1,11 @@ #! 
/usr/bin/env python - # SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# # SPDX-License-Identifier: GPL-3.0-or-later - # -*- coding: utf-8 -*- -from saqc.core.core import SaQC -from saqc.core.flags import Flags, initFlagsLike +# isort: skip_file +from saqc.core.frame import DictOfSeries, to_dios # noqa from saqc.core.history import History -from saqc.core.reader import fromConfig +from saqc.core.flags import Flags, initFlagsLike from saqc.core.register import flagging, processing, register -from saqc.core.translation import DmpScheme, FloatScheme, PositionalScheme, SimpleScheme +from saqc.core.core import SaQC diff --git a/saqc/core/core.py b/saqc/core/core.py index ebce7c049c1ed9ded44bbc04202106e46b6f84f4..c719375bc4e99bcd911e4b8961c63b8b71909663 100644 --- a/saqc/core/core.py +++ b/saqc/core/core.py @@ -15,8 +15,8 @@ from typing import Any, Hashable, MutableMapping import numpy as np import pandas as pd -from dios import DictOfSeries, to_dios from saqc.core.flags import Flags, initFlagsLike +from saqc.core.frame import DictOfSeries, concatDios, to_dios from saqc.core.history import History from saqc.core.register import FUNC_MAP from saqc.core.translation import ( @@ -27,7 +27,6 @@ from saqc.core.translation import ( TranslationScheme, ) from saqc.funcs import FunctionsMixin -from saqc.lib.tools import concatDios # warnings pd.set_option("mode.chained_assignment", "warn") diff --git a/saqc/core/flags.py b/saqc/core/flags.py index 34042c8a9c2b505c79d5cae097b9ac6b824bd18b..ebff8349b01683f05d297c54824d07a8f99c467f 100644 --- a/saqc/core/flags.py +++ b/saqc/core/flags.py @@ -11,13 +11,12 @@ from typing import DefaultDict, Dict, Iterable, Mapping, Tuple, Type, Union import numpy as np import pandas as pd -import dios -from saqc.core.history import History +from saqc.core import DictOfSeries, History _VAL = Union[pd.Series, History] DictLike = Union[ pd.DataFrame, - dios.DictOfSeries, + DictOfSeries, Dict[str, _VAL], DefaultDict[str, 
_VAL], ] @@ -77,8 +76,8 @@ class Flags: .. doctest:: exampleFlags - >>> from saqc.constants import UNFLAGGED, BAD, DOUBTFUL - >>> flags = saqc.Flags() + >>> from saqc import UNFLAGGED, BAD, DOUBTFUL, Flags + >>> flags = Flags() >>> flags Empty Flags Columns: [] @@ -394,7 +393,7 @@ class Flags: Access via ``flags.history['var']``. To set a new history use ``flags.history['var'] = value``. - The passed value must be a instance of History or must be convertible to a + The passed value must be an instance of History or must be convertible to a history. Returns @@ -444,15 +443,15 @@ class Flags: # ---------------------------------------------------------------------- # transformation and representation - def toDios(self) -> dios.DictOfSeries: + def toDios(self) -> DictOfSeries: """ - Transform the flags container to a ``dios.DictOfSeries``. + Transform the flags container to a ``DictOfSeries``. Returns ------- - dios.DictOfSeries + DictOfSeries """ - di = dios.DictOfSeries(columns=self.columns) + di = DictOfSeries(columns=self.columns) for k in self._data.keys(): di[k] = self[k] @@ -478,11 +477,11 @@ def initFlagsLike( name: str = None, ) -> Flags: """ - Create empty Flags, from an reference data structure. + Create empty Flags, from a reference data structure. Parameters ---------- - reference : pd.DataFrame, pd.Series, dios.DictOfSeries, dict of pd.Series + reference : pd.DataFrame, pd.Series, DictOfSeries, dict of pd.Series The reference structure to initialize for. name : str, default None diff --git a/saqc/core/frame.py b/saqc/core/frame.py new file mode 100644 index 0000000000000000000000000000000000000000..225ec64144c1730f50750ccfdf7ff784468b233c --- /dev/null +++ b/saqc/core/frame.py @@ -0,0 +1,62 @@ +#! 
/usr/bin/env python +# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ +# SPDX-License-Identifier: GPL-3.0-or-later +# -*- coding: utf-8 -*- + +import warnings +from typing import List + +from dios import DictOfSeries, to_dios # noqa + + +def mergeDios(left: DictOfSeries, right: DictOfSeries, subset=None, join="merge"): + # use dios.merge() as soon as it implemented + # see https://git.ufz.de/rdm/dios/issues/15 + + merged = left.copy() + if subset is not None: + right_subset_cols = right.columns.intersection(subset) + else: + right_subset_cols = right.columns + + shared_cols = left.columns.intersection(right_subset_cols) + + for c in shared_cols: + l, r = left[c], right[c] + if join == "merge": + # NOTE: + # our merge behavior is nothing more than an + # outer join, where the right join argument + # overwrites the left at the shared indices, + # while on a normal outer join common indices + # hold the values from the left join argument + r, l = l.align(r, join="outer") + else: + l, r = l.align(r, join=join) + merged[c] = l.combine_first(r) + + newcols = right_subset_cols.difference(left.columns) + for c in newcols: + merged[c] = right[c].copy() + + return merged + + +def concatDios(data: List[DictOfSeries], warn: bool = True, stacklevel: int = 2): + # fast path for most common case + if len(data) == 1 and data[0].columns.is_unique: + return data[0] + + result = DictOfSeries() + for di in data: + for c in di.columns: + if c in result.columns: + if warn: + warnings.warn( + f"Column {c} already exist. Data is overwritten. 
" + f"Avoid duplicate columns names over all inputs.", + stacklevel=stacklevel, + ) + result[c] = di[c] + + return result diff --git a/saqc/core/history.py b/saqc/core/history.py index 480c9593a56fa0bfcdea15db7375ef8711c2ca6b..16abacdf57feb2a039de6148f59af31762dd8a7b 100644 --- a/saqc/core/history.py +++ b/saqc/core/history.py @@ -6,15 +6,14 @@ from __future__ import annotations -from copy import copy as shallowcopy -from copy import deepcopy +import copy as _copy from typing import Any, Callable, Dict, List, Tuple import numpy as np import pandas as pd from pandas.api.types import is_categorical_dtype, is_float_dtype -from saqc.constants import UNFLAGGED +from saqc import UNFLAGGED class History: @@ -70,7 +69,7 @@ class History: @meta.setter def meta(self, value: list[dict[str, Any]]) -> None: self._validateMetaList(value, self._hist) - self._meta = deepcopy(value) + self._meta = _copy.deepcopy(value) @property def index(self) -> pd.Index: @@ -428,7 +427,7 @@ class History: copy : History the copied FH """ - copyfunc = deepcopy if deep else shallowcopy + copyfunc = _copy.deepcopy if deep else _copy.copy new = History(self.index) new._hist = self._hist.copy(deep) new._meta = copyfunc(self._meta) @@ -564,7 +563,7 @@ class History: if copy: hist = hist.copy() - meta = deepcopy(meta) + meta = _copy.deepcopy(meta) history = cls(index=None) # noqa history._hist = hist.astype("category", copy=False) diff --git a/saqc/core/register.py b/saqc/core/register.py index 7ac2c33ec8bae3386878d1fbd3c6d56b3d59e1d5..7f364ce5e72e36facf5648849131deba37a6732a 100644 --- a/saqc/core/register.py +++ b/saqc/core/register.py @@ -3,7 +3,6 @@ # SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ # # SPDX-License-Identifier: GPL-3.0-or-later - from __future__ import annotations import functools @@ -15,15 +14,20 @@ import numpy as np import pandas as pd from typing_extensions import ParamSpec -import dios -from saqc.constants import FILTER_ALL, FILTER_NONE, UNFLAGGED 
-from saqc.core.flags import Flags, History +from saqc import FILTER_ALL, FILTER_NONE +from saqc.core import DictOfSeries, Flags, History from saqc.core.translation.basescheme import TranslationScheme -from saqc.lib.tools import squeezeSequence, toSequence +from saqc.lib.tools import isflagged, squeezeSequence, toSequence from saqc.lib.types import ExternalFlag, OptionalNone if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC + +__all__ = [ + "register", + "processing", + "flagging", +] # NOTE: # the global SaQC function store, @@ -153,24 +157,24 @@ def _squeezeFlags(old_flags, new_flags: Flags, columns: pd.Index, meta) -> Flags def _maskData( - data: dios.DictOfSeries, flags: Flags, columns: Sequence[str], thresh: float -) -> Tuple[dios.DictOfSeries, dios.DictOfSeries]: + data: DictOfSeries, flags: Flags, columns: Sequence[str], thresh: float +) -> Tuple[DictOfSeries, DictOfSeries]: """ Mask data with Nans, if the flags are worse than a threshold. - mask only passed `columns` (preselected by `datamask`-kw from decorator) Returns ------- - masked : dios.DictOfSeries + masked : DictOfSeries masked data, same dim as original - mask : dios.DictOfSeries + mask : DictOfSeries dios holding iloc-data-pairs for every column in `data` """ - mask = dios.DictOfSeries(columns=columns) + mask = DictOfSeries(columns=columns) # we use numpy here because it is faster for c in columns: - col_mask = _isflagged(flags[c], thresh) + col_mask = isflagged(flags[c], thresh) if col_mask.any(): col_data = data[c].to_numpy(dtype=np.float64) @@ -184,8 +188,8 @@ def _maskData( def _unmaskData( - data: dios.DictOfSeries, mask: dios.DictOfSeries, columns: pd.Index | None = None -) -> dios.DictOfSeries: + data: DictOfSeries, mask: DictOfSeries, columns: pd.Index | None = None +) -> DictOfSeries: """ Restore the masked data. 
@@ -462,19 +466,3 @@ def processing(**kwargs): if kwargs: raise ValueError("use '@register' to pass keywords") return register(mask=[], demask=[], squeeze=[]) - - -A = TypeVar("A", np.ndarray, pd.Series) - - -def _isflagged(flagscol: A, thresh: float) -> A: - """ - Return a mask of flags accordingly to `thresh`. Return type is same as flags. - """ - if not isinstance(thresh, (float, int)): - raise TypeError(f"thresh must be of type float, not {repr(type(thresh))}") - - if thresh == FILTER_ALL: - return flagscol > UNFLAGGED - - return flagscol >= thresh diff --git a/saqc/core/translation/__init__.py b/saqc/core/translation/__init__.py index c40914611bbe1fad26132d3520f2e39e4a803dc6..fe2d85790a1f5d516c832527931c10ece45464a0 100644 --- a/saqc/core/translation/__init__.py +++ b/saqc/core/translation/__init__.py @@ -8,8 +8,8 @@ from saqc.core.translation.basescheme import ( FloatScheme, MappingScheme, - SimpleScheme, TranslationScheme, ) from saqc.core.translation.dmpscheme import DmpScheme from saqc.core.translation.positionalscheme import PositionalScheme +from saqc.core.translation.simplescheme import SimpleScheme diff --git a/saqc/core/translation/basescheme.py b/saqc/core/translation/basescheme.py index b7a3d67f4d8df76d996aa9dcee77144fbc77e957..d469e02b33edbb8467e7471a982d8fc3128611a8 100644 --- a/saqc/core/translation/basescheme.py +++ b/saqc/core/translation/basescheme.py @@ -8,15 +8,14 @@ from __future__ import annotations -from abc import abstractmethod, abstractproperty +from abc import abstractmethod from typing import Any, Dict import numpy as np import pandas as pd -from dios import DictOfSeries -from saqc.constants import BAD, FILTER_ALL, GOOD, UNFLAGGED -from saqc.core.flags import Flags +from saqc import BAD, FILTER_ALL, GOOD, UNFLAGGED +from saqc.core import DictOfSeries, Flags from saqc.lib.types import ExternalFlag ForwardMap = Dict[ExternalFlag, float] @@ -35,25 +34,52 @@ class TranslationScheme: # pragma: no cover @abstractmethod def toInternal(self, 
flags: pd.DataFrame | DictOfSeries) -> Flags: + """ + Translate from 'external flags' to 'internal flags' + + Parameters + ---------- + flags : pd.DataFrame + The external flags to translate + + Returns + ------- + Flags object + """ pass @abstractmethod def toExternal(self, flags: Flags, attrs: dict | None = None) -> DictOfSeries: + """ + Translate from 'internal flags' to 'external flags' + + Parameters + ---------- + flags : pd.DataFrame + The external flags to translate + + attrs : dict or None, default None + global meta information of saqc-object + + Returns + ------- + pd.DataFrame + """ pass class MappingScheme(TranslationScheme): """ This class provides the basic translation mechanism and should serve as - a base class for every other translation scheme. + a base class for most other translation scheme. - The general translation is realized through dictionary lookups, altough + The general translation is realized through dictionary lookups, although we might need to extend this logic to also allow calls to translation - functions in the future. Currently at least one `dict` defining the + functions in the future. Currently, at least one `dict` defining the 'forward' translation from 'user flags' -> 'internal flags' needs to be provided. Optionally a second `dict` can be passed to map 'internal flags' -> 'user flags', - if the latter is not given, this 'backward' translation will inferred as + if the latter is not given, this 'backward' translation is inferred as the inverse of the 'forward' translation. 
The translation mechanism imposes a few restrictions: @@ -217,27 +243,3 @@ class FloatScheme(TranslationScheme): out = flags.toDios() out.attrs = attrs or {} return out - - -class SimpleScheme(MappingScheme): - - """ - Acts as the default Translator, provides a changeable subset of the - internal float flags - """ - - _FORWARD = { - "UNFLAGGED": UNFLAGGED, - "BAD": BAD, - "OK": GOOD, - } - - _BACKWARD = { - UNFLAGGED: "UNFLAGGED", - np.nan: "UNFLAGGED", - BAD: "BAD", - GOOD: "OK", - } - - def __init__(self): - super().__init__(forward=self._FORWARD, backward=self._BACKWARD) diff --git a/saqc/core/translation/dmpscheme.py b/saqc/core/translation/dmpscheme.py index 17f958484643941cc1641d0cebb1ad69a4e93d67..bf35b7b895c04cda129f698763ebaf9f711ae3ab 100644 --- a/saqc/core/translation/dmpscheme.py +++ b/saqc/core/translation/dmpscheme.py @@ -14,9 +14,8 @@ from functools import reduce import numpy as np import pandas as pd -from saqc.constants import BAD, DOUBTFUL, GOOD, UNFLAGGED -from saqc.core.flags import Flags -from saqc.core.history import History +from saqc import BAD, DOUBTFUL, GOOD, UNFLAGGED +from saqc.core import Flags, History from saqc.core.translation.basescheme import BackwardMap, ForwardMap, MappingScheme _QUALITY_CAUSES = [ diff --git a/saqc/core/translation/positionalscheme.py b/saqc/core/translation/positionalscheme.py index 23b724293b121a15ba0ba3988ea60e250e1b085a..f503b0fe379019d62c40c9ce9d76bbaa21545f09 100644 --- a/saqc/core/translation/positionalscheme.py +++ b/saqc/core/translation/positionalscheme.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd from saqc.constants import BAD, DOUBTFUL, GOOD, UNFLAGGED -from saqc.core.flags import Flags, History +from saqc.core import Flags, History from saqc.core.translation.basescheme import BackwardMap, ForwardMap, MappingScheme diff --git a/saqc/core/translation/simplescheme.py b/saqc/core/translation/simplescheme.py new file mode 100644 index 
0000000000000000000000000000000000000000..aabd0472b55817696156ec156d036484a7f7264b --- /dev/null +++ b/saqc/core/translation/simplescheme.py @@ -0,0 +1,33 @@ +#! /usr/bin/env python +# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ +# SPDX-License-Identifier: GPL-3.0-or-later +# -*- coding: utf-8 -*- + +import numpy as np + +from saqc.constants import BAD, GOOD, UNFLAGGED +from saqc.core.translation import MappingScheme + + +class SimpleScheme(MappingScheme): + + """ + Acts as the default Translator, provides a changeable subset of the + internal float flags + """ + + _FORWARD = { + "UNFLAGGED": UNFLAGGED, + "BAD": BAD, + "OK": GOOD, + } + + _BACKWARD = { + UNFLAGGED: "UNFLAGGED", + np.nan: "UNFLAGGED", + BAD: "BAD", + GOOD: "OK", + } + + def __init__(self): + super().__init__(forward=self._FORWARD, backward=self._BACKWARD) diff --git a/saqc/funcs/breaks.py b/saqc/funcs/breaks.py index 0b700d6ae5a3c80c38e5987468fe8642d91ce195..5d9e1fc58b44b234311475a12e0e459c9064bfc4 100644 --- a/saqc/funcs/breaks.py +++ b/saqc/funcs/breaks.py @@ -22,9 +22,10 @@ from typing import TYPE_CHECKING import numpy as np import pandas as pd -from saqc.constants import BAD, FILTER_ALL -from saqc.core.register import _isflagged, flagging, register +from saqc import BAD, FILTER_ALL +from saqc.core import flagging, register from saqc.funcs.changepoints import _assignChangePointCluster +from saqc.lib.tools import isflagged if TYPE_CHECKING: from saqc.core.core import SaQC @@ -64,7 +65,7 @@ class BreaksMixin: datacol = self._data[field] mask = datacol.isna() - mask = ~_isflagged(self._flags[field], dfilter) & mask + mask = ~isflagged(self._flags[field], dfilter) & mask self._flags[mask, field] = flag return self diff --git a/saqc/funcs/changepoints.py b/saqc/funcs/changepoints.py index c4d6db0955e555b5728b4c531487cf56f10125c7..9b9267c380826ab92d3af89bd94554b37570c1a9 100644 --- a/saqc/funcs/changepoints.py +++ b/saqc/funcs/changepoints.py @@ -12,16 +12,13 @@ from 
typing import TYPE_CHECKING, Callable, Tuple import numba import numpy as np import pandas as pd -from typing_extensions import Literal -from dios import DictOfSeries -from saqc.constants import BAD, UNFLAGGED -from saqc.core.flags import Flags -from saqc.core.register import flagging, register +from saqc import BAD, UNFLAGGED +from saqc.core import DictOfSeries, Flags, flagging, register from saqc.lib.tools import customRoller, filterKwargs if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC class ChangepointsMixin: @@ -39,12 +36,10 @@ class ChangepointsMixin: **kwargs, ) -> "SaQC": """ - Flag data where it significantly changes. + Flag values that represent a system state transition. - Flag data points, where the parametrization of the process, the data is assumed to - generate by, significantly changes. - - The change points detection is based on a sliding window search. + Flag data points, where the parametrization of the assumed process generating this data, + significantly changes. 
Parameters ---------- diff --git a/saqc/funcs/constants.py b/saqc/funcs/constants.py index fc1a77f99b0435496ea607c1922522848038c464..ffd2acb07ba45d64e9efd3a4daa205e1f9a6e622 100644 --- a/saqc/funcs/constants.py +++ b/saqc/funcs/constants.py @@ -14,13 +14,13 @@ from typing import TYPE_CHECKING import numpy as np import pandas as pd -from saqc.constants import BAD -from saqc.core.register import flagging +from saqc import BAD +from saqc.core import flagging from saqc.lib.tools import customRoller, getFreqDelta, statPass from saqc.lib.ts_operators import varQC if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC class ConstantsMixin: diff --git a/saqc/funcs/curvefit.py b/saqc/funcs/curvefit.py index 0444d7983ddf695d1458757f1b506ea5b756c292..c0a86fd5185869c181af20b45b7259b1cc28801a 100644 --- a/saqc/funcs/curvefit.py +++ b/saqc/funcs/curvefit.py @@ -13,9 +13,7 @@ import numpy as np import pandas as pd from typing_extensions import Literal -from dios import DictOfSeries -from saqc.core.flags import Flags -from saqc.core.register import register +from saqc.core import DictOfSeries, Flags, register from saqc.lib.tools import getFreqDelta from saqc.lib.ts_operators import ( butterFilter, @@ -27,7 +25,7 @@ from saqc.lib.ts_operators import ( ) if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC _FILL_METHODS = Literal[ "linear", diff --git a/saqc/funcs/drift.py b/saqc/funcs/drift.py index 52d1b5997d6b68797015fbcf5d1e6c1bc1993935..d3555ac51ab8d7989aec5b34440a40e18677432b 100644 --- a/saqc/funcs/drift.py +++ b/saqc/funcs/drift.py @@ -19,16 +19,15 @@ from scipy.optimize import curve_fit from scipy.spatial.distance import pdist from typing_extensions import Literal -from dios import DictOfSeries -from saqc.constants import BAD -from saqc.core.register import Flags, flagging, register +from saqc import BAD +from saqc.core import DictOfSeries, Flags, flagging, register from saqc.funcs.changepoints import _assignChangePointCluster 
from saqc.lib.tools import detectDeviants, filterKwargs, toSequence from saqc.lib.ts_operators import expDriftModel, linearDriftModel from saqc.lib.types import CurveFitter if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC LinkageString = Literal[ @@ -569,7 +568,7 @@ class DriftMixin: **kwargs, ) -> "SaQC": """ - Flags anomalous regimes regarding to modelling regimes of field. + Flags anomalous regimes regarding to modelling regimes of ``field``. "Normality" is determined in terms of a maximum spreading distance, regimes must not exceed in respect to a certain metric and linkage method. diff --git a/saqc/funcs/flagtools.py b/saqc/funcs/flagtools.py index 80420a1cdd18ac939a7981a0cda961219e263b3a..208aecd667a1b67ded026150bd0025eb700e0db1 100644 --- a/saqc/funcs/flagtools.py +++ b/saqc/funcs/flagtools.py @@ -7,19 +7,20 @@ # -*- coding: utf-8 -*- from __future__ import annotations +import operator import warnings -from typing import TYPE_CHECKING, Any, Union +from typing import TYPE_CHECKING, Any, Callable, Sequence, Union import numpy as np import pandas as pd from typing_extensions import Literal -from dios import DictOfSeries -from saqc.constants import BAD, FILTER_ALL, UNFLAGGED -from saqc.core.register import _isflagged, flagging, register +from saqc import BAD, FILTER_ALL, UNFLAGGED +from saqc.core import DictOfSeries, flagging, register +from saqc.lib.tools import isflagged, toSequence if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC class FlagtoolsMixin: @@ -516,7 +517,7 @@ class FlagtoolsMixin: # get dfilter from meta or get of rid of this and # consider everything != np.nan as flag - flagged = _isflagged(hc, dfilter) + flagged = isflagged(hc, dfilter) repeated = ( flagged.rolling(window, min_periods=1, closed="left") @@ -531,3 +532,139 @@ class FlagtoolsMixin: self._flags[repeated, field] = flag return self + + @register( + mask=["field"], + demask=["field"], + squeeze=["field"], + multivariate=False, + 
handles_target=True, + ) + def andGroup( + self: "SaQC", + field: str, + group: Sequence["SaQC"] | dict["SaQC", str | Sequence[str]], + target: str | None = None, + flag: float = BAD, + **kwargs, + ) -> "SaQC": + """ + Flag all values, if a given variable is also flagged in all other given SaQC objects. + + Parameters + ---------- + field : str + Name of the field to check for flags. 'field' needs to present in all + objects in 'qcs'. + + qcs : list of SaQC + A list of SaQC objects to check for flags. + + target : str, default none + Name of the field the generated flags will be written to. If None, the result + will be written to 'field', + + flag: float, default ``BAD`` + The quality flag to set. + + Returns + ------- + saqc.SaQC + """ + + return _groupOperation( + base=self, + field=field, + target=target, + func=operator.and_, + group=group, + flag=flag, + **kwargs, + ) + + @register( + mask=["field"], + demask=["field"], + squeeze=["field"], + multivariate=False, + handles_target=True, + ) + def orGroup( + self: "SaQC", + field: str, + group: Sequence["SaQC"] | dict["SaQC", str | Sequence[str]], + target: str | None = None, + flag: float = BAD, + **kwargs, + ) -> "SaQC": + """ + Flag all values, if a given variable is also flagged in at least one other of the given SaQC objects. + + Parameters + ---------- + field : str + Name of the field to check for flags. 'field' needs to present in all + objects in 'qcs'. + + qcs : list of SaQC + A list of SaQC objects to check for flags. + + target : str, default none + Name of the field the generated flags will be written to. If None, the result + will be written to 'field', + + flag: float, default ``BAD`` + The quality flag to set. 
+ + Returns + ------- + saqc.SaQC + """ + return _groupOperation( + base=self, + field=field, + target=target, + func=operator.or_, + group=group, + flag=flag, + **kwargs, + ) + + +def _groupOperation( + base: "SaQC", + field: str, + func: Callable[[pd.Series, pd.Series], pd.Series], + group: Sequence["SaQC"] | dict["SaQC", str | Sequence[str]], + target: str | None = None, + flag: float = BAD, + **kwargs, +) -> "SaQC": + # Should this be multivariate? And what would multivariate mean in this context + + dfilter = kwargs.get("dfilter", FILTER_ALL) + if target is None: + target = field + + # harmonise `group` to type dict[SaQC, list[str]] + if not isinstance(group, dict): + group = {qc: field for qc in group} + + for k, v in group.items(): + group[k] = toSequence(v) + + qcs_items: list[tuple["SaQC", list[str]]] = list(group.items()) + # generate initial mask from the first `qc` object on the popped first field + mask = isflagged(qcs_items[0][0]._flags[qcs_items[0][1].pop(0)], thresh=dfilter) + + for qc, fields in qcs_items: + if field not in qc._flags: + raise KeyError(f"variable {field} is missing in given SaQC object") + for field in fields: + mask = func(mask, isflagged(qc._flags[field], thresh=FILTER_ALL)) + + if target not in base._data: + base = base.copyField(field=field, target=target) + + base._flags[mask, target] = flag + return base diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py index 7f5ec19b53523a768aaab0947f89b866cd3714c9..e70b4c17e4ff9a41f5170acff21f1ed05be14e3a 100644 --- a/saqc/funcs/generic.py +++ b/saqc/funcs/generic.py @@ -12,16 +12,15 @@ from typing import TYPE_CHECKING, Sequence, Tuple, Union import numpy as np import pandas as pd -from dios import DictOfSeries -from saqc.constants import BAD, ENVIRONMENT, FILTER_ALL -from saqc.core.flags import Flags -from saqc.core.history import History -from saqc.core.register import _isflagged, _maskData, register -from saqc.lib.tools import toSequence +from saqc import BAD, FILTER_ALL +from 
saqc.core import DictOfSeries, Flags, History, register +from saqc.core.register import _maskData +from saqc.lib.tools import isflagged, toSequence from saqc.lib.types import GenericFunction, PandasLike +from saqc.parsing.environ import ENVIRONMENT if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC def _flagSelect(field, flags, label=None): @@ -64,7 +63,7 @@ def _execGeneric( dfilter: float = FILTER_ALL, ) -> DictOfSeries: globs = { - "isflagged": lambda data, label=None: _isflagged( + "isflagged": lambda data, label=None: isflagged( _flagSelect(data.name, flags, label), thresh=dfilter ), **ENVIRONMENT, @@ -104,14 +103,7 @@ class GenericMixin: """ Generate/process data with user defined functions. - Formally, what the function does, is the following: - - 1. Let F be a Callable, depending on fields f_1, f_2,...f_K, (F = F(f_1, f_2,...f_K)) - Than, for every timestamp t_i that occurs in at least one of the timeseries data[f_j] (outer join), - The value v_i is computed via: - v_i = data([f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]), if all data[f_j][t_i] do exist - v_i = ``np.nan``, if at least one of the data[f_j][t_i] is missing. - 2. The result is stored to ``data[target]``, if ``target`` is given or to ``data[field]`` otherwise + Call the given ``func`` on the variables given in ``field``. Parameters ---------- @@ -212,12 +204,9 @@ class GenericMixin: **kwargs, ) -> "SaQC": """ - Flag data with user defined functions. + Flag data based on a given function. - Formally, what the function does, is the following: - Let X be a Callable, depending on fields f_1, f_2,...f_K, (X = X(f_1, f_2,...f_K)) - Than for every timestamp t_i in data[field]: - data[field][t_i] is flagged if X(data[f_1][t_i], data[f_2][t_i], ..., data[f_K][t_i]) is True. + Evaluate ``func`` on all variables given in ``field``. Parameters ---------- @@ -225,21 +214,18 @@ class GenericMixin: The variable(s) passed to func. 
func : callable - Function to call on the variables given in ``field``. The function needs to accept the same - number of arguments (of type pandas.Series) as variables given in ``field`` and return an - iterable of array-like objects of with dtype bool and with the same number of elements as - given in ``target`` (or ``field`` if ``target`` is not specified). The function output - determines the values to flag. + Function to call. The function needs to accept the same number of arguments + (of type pandas.Series) as variables given in ``field`` and return an + iterable of array-like objects of data type ``bool`` with the same length as + ``target``. target: str or list of str The variable(s) to write the result of ``func`` to. If not given, the variable(s) - specified in ``field`` will be overwritten. If a ``target`` is not given, it will be - created. + specified in ``field`` will be overwritten. Non-existing ``target``s will be created + as all ``NaN`` timeseries. flag: float, default ``BAD`` - The quality flag to set. The default ``BAD`` states the general idea, that - ``processGeneric`` generates 'new' data without direct relation to the potentially - already present flags. + Quality flag to set. dfilter: float, default ``FILTER_ALL`` Threshold flag. Flag values greater than ``dfilter`` indicate that the associated @@ -249,10 +235,6 @@ class GenericMixin: ------- saqc.SaQC - Note - ----- - All the numpy functions are available within the generic expressions. 
- Examples -------- diff --git a/saqc/funcs/interpolation.py b/saqc/funcs/interpolation.py index 99344f4c11ad915aa3208508b40df73765c689cd..9def348d0594a7528536675ff33a3e18aa5e459f 100644 --- a/saqc/funcs/interpolation.py +++ b/saqc/funcs/interpolation.py @@ -13,12 +13,13 @@ import numpy as np import pandas as pd from typing_extensions import Literal -from saqc.constants import UNFLAGGED -from saqc.core.register import _isflagged, register +from saqc import UNFLAGGED +from saqc.core import register +from saqc.lib.tools import isflagged from saqc.lib.ts_operators import interpolateNANs if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC _SUPPORTED_METHODS = Literal[ @@ -341,7 +342,7 @@ class InterpolationMixin: # TODO: # in future we could use `register(mask=[field], [], [])` # and dont handle masking manually here - flagged = _isflagged(self._flags[field], kwargs["dfilter"]) + flagged = isflagged(self._flags[field], kwargs["dfilter"]) # drop all points that hold no relevant grid information datcol = datcol[~flagged].dropna() diff --git a/saqc/funcs/noise.py b/saqc/funcs/noise.py index 8945f22332a763794d6beb7c4179ac34767df7dd..feca72039d0fb1808daf2f6bb97bd3e398cfc182 100644 --- a/saqc/funcs/noise.py +++ b/saqc/funcs/noise.py @@ -13,12 +13,12 @@ from typing import TYPE_CHECKING, Callable import numpy as np import pandas as pd -from saqc.constants import BAD -from saqc.core.register import flagging +from saqc import BAD +from saqc.core import flagging from saqc.lib.tools import statPass if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC class NoiseMixin: @@ -36,11 +36,11 @@ class NoiseMixin: **kwargs, ) -> "SaQC": """ - Flag *chunks* of length, `window`: + Flag data chunks of length ``window``, if: - 1. If they excexceed `thresh` with regard to `stat`: - 2. If all (maybe overlapping) *sub-chunks* of *chunk*, with length `sub_window`, - `excexceed `sub_thresh` with regard to `stat`: + 1. 
they exceed ``thresh`` with regard to ``func`` and + 2. all (maybe overlapping) sub-chunks of the data chunks with length ``sub_window``, + exceed ``sub_thresh`` with regard to ``func`` Parameters ---------- @@ -48,21 +48,24 @@ The fieldname of the column, holding the data-to-be-flagged. func: Callable[[np.array, pd.Series], float] - Function to aggregate chunk contnent with. + Aggregation function applied on every chunk. window: str - Temporal extension of the chunks to test + Window (i.e. chunk) size. thresh: float - Threshold, that triggers flagging, if exceeded by stat value. + Threshold. A given chunk is flagged, if the return value of ``func`` exceeds ``thresh``. sub_window: str, default None, - Window size of the sub chunks, that are additionally tested for exceeding - `sub_thresh` with respect to `stat`. + Window size of sub chunks, that are additionally tested for exceeding ``sub_thresh`` + with respect to ``func``. sub_thresh: float, default None + Threshold. A given sub chunk is flagged, if the return value of ``func`` exceeds ``sub_thresh``. min_periods: int, default None + Minimum number of values needed in a chunk to perform the test. + Ignored if ``window`` is an integer.
flag : float, default BAD flag to set diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py index 50d5c156f836e6edc8dc2fb7aa9ad0976fb5c0e6..182205ec39429b316fb9dba93ef7acfc7f1c25fd 100644 --- a/saqc/funcs/outliers.py +++ b/saqc/funcs/outliers.py @@ -15,19 +15,17 @@ import numba import numpy as np import numpy.polynomial.polynomial as poly import pandas as pd -from outliers import smirnov_grubbs +from outliers import smirnov_grubbs # noqa, on pypi as outlier-utils from scipy.stats import median_abs_deviation from typing_extensions import Literal -from dios import DictOfSeries -from saqc.constants import BAD, UNFLAGGED -from saqc.core.flags import Flags -from saqc.core.register import flagging, register +from saqc import BAD, UNFLAGGED +from saqc.core import DictOfSeries, Flags, flagging, register from saqc.funcs.scores import _univarScoring from saqc.lib.tools import customRoller, getFreqDelta, toSequence if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC class OutliersMixin: @@ -122,8 +120,10 @@ class OutliersMixin: References ---------- - [1] Talagala, P. D., Hyndman, R. J., & Smith-Miles, K. (2019). Anomaly detection in - high dimensional data. arXiv preprint arXiv:1908.04000. + [1] Priyanga Dilini Talagala, Rob J. Hyndman & Kate Smith-Miles (2021): + Anomaly Detection in High-Dimensional Data, + Journal of Computational and Graphical Statistics, 30:2, 360-374, + DOI: 10.1080/10618600.2020.1807997 """ scores = self._data[field].dropna() @@ -279,10 +279,6 @@ class OutliersMixin: flag : float, default BAD flag to set. - Returns - ------- - saqc.SaQC - Notes ----- The basic steps are: @@ -320,6 +316,18 @@ class OutliersMixin: this gap, get flagged outliers. See description of the `threshing` parameter for more details. Although [2] gives a fully detailed overview over the `stray` algorithm. + + Returns + ------- + saqc.SaQC + + References + ---------- + [1] Priyanga Dilini Talagala, Rob J. 
Hyndman & Kate Smith-Miles (2021): + Anomaly Detection in High-Dimensional Data, + Journal of Computational and Graphical Statistics, 30:2, 360-374, + DOI: 10.1080/10618600.2020.1807997 + """ fields = toSequence(field) @@ -570,11 +578,13 @@ **kwargs, ) -> "SaQC": """ - The function represents an implementation of the modyfied Z-score outlier detection method. + Flag outliers using the modified Z-score outlier detection method. See references [1] for more details on the algorithm. - Note, that the test needs the input data to be sampled regularly (fixed sampling rate). + Note + ---- + Data needs to be sampled at a regular equidistant time grid. Parameters ---------- @@ -857,20 +867,19 @@ **kwargs, ) -> "SaQC": """ - The function flags values that are regarded outliers due to the grubbs test. - - See reference [1] for more information on the grubbs tests definition. + Flag outliers using the Grubbs algorithm. - The (two-sided) test gets applied onto data chunks of size "window". The tests - application will be iterated on each data-chunk under test, till no more - outliers are detected in that chunk. + See [1] for more information on the grubbs tests definition. - Note, that the test performs poorely for small data chunks (resulting in heavy - overflagging). Therefor you should select "window" so that every window contains - at least > 8 values and also adjust the min_periods values accordingly. + The (two-sided) test gets applied to data chunks of size ``window``. The + tests will be iterated chunkwise until no more outliers are detected. + Note + ---- + * The test performs poorly for small data chunks, resulting in considerable + overflagging. Select ``window`` such that every data chunk contains at + least 8 values and also adjust the ``min_periods`` values accordingly.
+ * The data is expected to be normally distributed Parameters ---------- @@ -878,25 +887,22 @@ The fieldname of the column, holding the data-to-be-flagged. window : {int, str} - The size of the window you want to use for outlier testing. If an integer is - passed, the size refers to the number of periods of every testing window. If a - string is passed, it has to be an offset string, and will denote the total - temporal extension of every window. + Size of the testing window. + If an integer, the fixed number of observations used for each window. + If an offset string the time period of each window. alpha : float, default 0.05 - The level of significance, the grubbs test is to be performed at. (between 0 and 1) + Level of significance, the grubbs test is to be performed at. Must be between 0 and 1 min_periods : int, default 8 - The minimum number of values that have to be present in an interval under test, - for a grubbs test result to be accepted. Only makes sence in case `window` is - an offset string. + Minimum number of values needed in a ``window`` in order to perform the grubbs test. + Ignored if ``window`` is an integer. pedantic: boolean, default False - If True, every value gets checked twice for being an outlier. Ones in the - initial rolling window and one more time in a rolling window that is lagged - by half the windows delimeter (window/2). Recommended for avoiding false - positives at the window edges. Only available when rolling with integer - defined window size. + If ``True``, every value gets checked twice. First in the initial rolling ``window`` + and second in a rolling window that is lagging by ``window``/2. Recommended to avoid + false positives at the window edges. + Ignored if ``window`` is an offset string. flag : float, default BAD flag to set.
diff --git a/saqc/funcs/pattern.py b/saqc/funcs/pattern.py index c3d02be0a55ad27ed4a3551858a8334de9f34155..39b15dc63ca20ea10173ecaf98a76036045d5fda 100644 --- a/saqc/funcs/pattern.py +++ b/saqc/funcs/pattern.py @@ -12,12 +12,12 @@ from typing import TYPE_CHECKING import dtw import pandas as pd -from saqc.constants import BAD -from saqc.core.register import flagging +from saqc import BAD +from saqc.core import flagging from saqc.lib.tools import customRoller if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC def calculateDistanceByDTW( diff --git a/saqc/funcs/resampling.py b/saqc/funcs/resampling.py index c7dcf511e5a25b0345ff65c4556993bee2d736ac..4d2a59e96a06ac8b0eba3e9cf8898436c8f47412 100644 --- a/saqc/funcs/resampling.py +++ b/saqc/funcs/resampling.py @@ -14,14 +14,13 @@ import numpy as np import pandas as pd from typing_extensions import Literal -from dios import DtItype -from saqc.core.register import _isflagged, register +from saqc.core import register from saqc.funcs.interpolation import _SUPPORTED_METHODS -from saqc.lib.tools import evalFreqStr, filterKwargs, getFreqDelta +from saqc.lib.tools import evalFreqStr, filterKwargs, getFreqDelta, isflagged from saqc.lib.ts_operators import aggregate2Freq, shift2Freq if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC METHOD2ARGS = { @@ -132,7 +131,7 @@ class ResamplingMixin: **kwargs, ) -> "SaQC": """ - Function to shift data and flags to a regular (equidistant) timestamp grid, according to ``method``. + Shift data points and flags to a regular frequency grid. Parameters ---------- @@ -140,24 +139,19 @@ class ResamplingMixin: The fieldname of the column, holding the data-to-be-shifted. freq : str - An frequency Offset String that will be interpreted as the sampling rate you want the data to be shifted to. + Offset string. Sampling rate of the target frequency. 
method : {'fshift', 'bshift', 'nshift'}, default 'nshift' - Specifies how misaligned data-points get propagated to a grid timestamp. - Following choices are available: + Method to propagate values: - * 'nshift' : every grid point gets assigned the nearest value in its range. (range = +/- 0.5 * `freq`) - * 'bshift' : every grid point gets assigned its first succeeding value, if one is available in - the succeeding sampling interval. - * 'fshift' : every grid point gets assigned its ultimately preceding value, if one is available in - the preceeding sampling interval. + * 'nshift' : shift grid points to the nearest time stamp in the range = +/- 0.5 * ``freq`` + * 'bshift' : shift grid points to the first succeeding time stamp (if any) + * 'fshift' : shift grid points to the last preceding time stamp (if any) freq_check : {None, 'check', 'auto'}, default None - - * ``None`` : do not validate frequency-string passed to `freq` - * 'check' : estimate frequency and log a warning if estimate miss matches frequency string passed to `freq`, - or if no uniform sampling rate could be estimated - * 'auto' : estimate frequency and use estimate. (Ignores `freq` parameter.) + * ``None`` : do not validate the ``freq`` string. + * 'check' : check ``freq`` against a frequency estimation, produces a warning in case of mismatches. + * 'auto' : estimate frequency, `freq` is ignored. Returns ------- @@ -202,12 +196,12 @@ **kwargs, ) -> "SaQC": """ - Function to resample the data. + Resample data points and flags to a regular frequency. - The data will be sampled at regular (equidistant) timestamps aka. Grid points. + The data will be sampled to regular (equidistant) timestamps. Sampling intervals therefore get aggregated with a function, specified by - 'func' parameter and the result gets projected onto the new timestamps with a - method, specified by "method".
The following method (keywords) are available: + ``func``, the result is projected to the new timestamps using + ``method``. The following methods are available: * ``'nagg'``: all values in the range (+/- `freq`/2) of a grid point get aggregated with func and assigned to it. @@ -217,15 +211,13 @@ class ResamplingMixin: the result gets assigned to the next grid point. - Note, that. if possible, functions passed to func will get projected - internally onto pandas.resample methods, wich results in some reasonable - performance boost - however, for this to work, you should pass functions that - have the __name__ attribute initialised and the according methods name assigned - to it. Furthermore, you shouldnt pass numpys nan-functions (``nansum``, - ``nanmean``,...) because those for example, have ``__name__ == 'nansum'`` and - they will thus not trigger ``resample.func()``, but the slower ``resample.apply( - nanfunc)``. Also, internally, no nans get passed to the functions anyway, - so that there is no point in passing the nan functions. + Note + ---- + For perfomance reasons, ``func`` will be mapped to pandas.resample methods, + if possible. However, for this to work, functions need an initialized + ``__name__`` attribute, holding the function's name. Furthermore, you should + not pass numpys nan-functions (``nansum``, ``nanmean``,...) because they + cannot be optimised and the handling of ``NaN`` is already taken care of. Parameters ---------- @@ -233,19 +225,18 @@ class ResamplingMixin: The fieldname of the column, holding the data-to-be-resampled. freq : str - An Offset String, that will be interpreted as the frequency you want to - resample your data with. + Offset string. Sampling rate of the target frequency grid. func : Callable - The function you want to use for aggregation. + Aggregation function. See notes for performance considerations. method: {'fagg', 'bagg', 'nagg'}, default 'bagg' Specifies which intervals to be aggregated for a certain timestamp. 
(preceding, succeeding or "surrounding" interval). See description above for more details. maxna : {None, int}, default None - Maximum number NaNs in a resampling interval. If maxna is exceeded, the interval - is set entirely to NaN. + Maximum number of allowed ``NaN``s in a resampling interval. If exceeded, the + entire interval is filled with ``NaN``. maxna_group : {None, int}, default None Same as `maxna` but for consecutive NaNs. @@ -280,8 +271,12 @@ class ResamplingMixin: datcol = self._data[field] # workaround for #GL-333 - if datcol.empty and self._data.itype in [None, DtItype]: - datcol = pd.Series(index=pd.DatetimeIndex([]), dtype=datcol.dtype) + if datcol.empty: + if self._data.itype is None: + index = pd.DatetimeIndex([]) + else: + index = self._data.itype.min_pdindex + datcol = pd.Series(index=index, dtype=datcol.dtype) freq = evalFreqStr(freq, freq_check, datcol.index) @@ -341,37 +336,14 @@ class ResamplingMixin: **kwargs, ) -> "SaQC": """ - The Function appends flags history of ``fields`` to flags history of ``target``. - Before appending, columns in ``field`` history are projected onto the target index via ``method`` - - method: (field_flag associated with "field", source_flags associated with "source") - - * 'inverse_nagg' - all target_flags within the range +/- freq/2 of a field_flag, get assigned this field flags value. - (if field_flag > target_flag) - - * 'inverse_bagg' - all target_flags succeeding a field_flag within the range of "freq", get assigned this field flags - value. (if field_flag > target_flag) - - * 'inverse_fagg' - all target_flags preceeding a field_flag within the range of "freq", get assigned this field flags - value. (if field_flag > target_flag) + Append the flags/history of ``field`` to ``target``. If necessary the flags are + projected to the ``target`` frequency grid. - * 'inverse_interpolation' - all target_flags within the range +/- freq of a field_flag, get assigned this source flags value. 
- (if field_flag > target_flag) - - * 'inverse_nshift' - That target_flag within the range +/- freq/2, that is nearest to a field_flag, gets the source - flags value. (if field_flag > target_flag) - - * 'inverse_bshift' - That target_flag succeeding a field flag within the range freq, that is nearest to a - field_flag, gets assigned this field flags value. (if field_flag > target_flag) - - * 'inverse_nshift' - That target_flag preceeding a field flag within the range freq, that is nearest to a - field_flag, gets assigned this field flags value. (if field_flag > target_flag) - - * 'match' - any target_flag with a timestamp matching a field_flags timestamp gets this field_flags value - (if field_flag > target_flag) - - Note, to undo or backtrack a resampling/shifting/interpolation that has been performed with a certain method, - you can just pass the associated "inverse" method. Also you should pass the same ``drop`` keyword. + Note + ---- + To undo or backtrack resampling, shifting or interpolation operations, use the + associated inversion method (e.g. to undo a former interpolation use + ``method="inverse_interpolation"``). Parameters ---------- @@ -382,22 +354,28 @@ class ResamplingMixin: Field name of flags history to append to. method : {'inverse_fagg', 'inverse_bagg', 'inverse_nagg', 'inverse_fshift', 'inverse_bshift', 'inverse_nshift', 'match'}, default 'match' - The method used for projection of ``field`` flags onto ``target`` flags. See description above for more details. + Method to project the flags of ``field`` the flags to ``target``: + + * 'inverse_nagg': project a flag of ``field`` to all timestamps of ``target`` within the range +/- ``freq``/2. 
+ * 'inverse_bagg': project a flag of ``field`` to all preceding timestamps of ``target`` within the range ``freq`` + * 'inverse_fagg': project a flag of ``field`` to all succeeding timestamps of ``target`` within the range ``freq`` + * 'inverse_interpolation' - project a flag of ``field`` to all timestamps of ``target`` within the range +/- ``freq`` + * 'inverse_nshift' - project a flag of ``field`` to the nearest timestamps in ``target`` within the range +/- ``freq``/2 + * 'inverse_bshift' - project a flag of ``field`` to nearest preceding timestamps in ``target`` + * 'inverse_fshift' - project a flag of ``field`` to nearest succeeding timestamps in ``target`` + * 'match' - project a flag of ``field`` to all identical timestamps in ``target`` freq : str or None, default None - The ``freq`` determines the projection range for the projection method. See above description for more details. - Defaultly (None), the sampling frequency of ``field`` is used. + Projection range. If ``None`` the sampling frequency of ``field`` is used. drop : bool, default False - If set to `True`, the `field` column will be removed after processing + Remove ``field`` if ``True`` squeeze : bool, default False - If set to `True`, the appended flags frame will be squeezed - resulting in function specific flags informations - getting lost. + Squeeze the history into a single column if ``True``. Function specific flag information is lost. overwrite: bool, default False - If set to True, the newly appended flags will overwrite exsiting flags. This might result in a loss of previous - flagging information.
+ Overwrite existing flags if ``True`` Returns ------- @@ -432,7 +410,7 @@ class ResamplingMixin: func_kws = dict(freq=tolerance, method=projection_method, target=dummy) elif method[-5:] == "shift": - drop_mask = target_datcol.isna() | _isflagged( + drop_mask = target_datcol.isna() | isflagged( target_flagscol, kwargs["dfilter"] ) projection_method = METHOD2ARGS[method][0] @@ -456,7 +434,7 @@ class ResamplingMixin: history = self._flags.history[field].apply(dummy.index, func, func_kws) if overwrite is False: - mask = _isflagged(self._flags[target], thresh=kwargs["dfilter"]) + mask = isflagged(self._flags[target], thresh=kwargs["dfilter"]) history._hist[mask] = np.nan if squeeze: diff --git a/saqc/funcs/residuals.py b/saqc/funcs/residuals.py index dec6681e2be5c2f3b16e112bbe2b4984703e99da..635b71da51e27edee8130f42b35bbdef05d62ce1 100644 --- a/saqc/funcs/residuals.py +++ b/saqc/funcs/residuals.py @@ -12,12 +12,12 @@ from typing import TYPE_CHECKING, Callable, Optional, Union import numpy as np import pandas as pd -from saqc.core.register import register +from saqc.core import register from saqc.funcs.curvefit import _fitPolynomial from saqc.funcs.rolling import _roll if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC class ResidualsMixin: diff --git a/saqc/funcs/rolling.py b/saqc/funcs/rolling.py index 80699200d866e7bf498426d0c214de3e0e815fae..125991c4f8d5e16bde0adc2703f7646a113500a3 100644 --- a/saqc/funcs/rolling.py +++ b/saqc/funcs/rolling.py @@ -11,13 +11,11 @@ from typing import TYPE_CHECKING, Callable, Union import numpy as np import pandas as pd -from dios import DictOfSeries -from saqc.core.flags import Flags -from saqc.core.register import register +from saqc.core import DictOfSeries, Flags, register from saqc.lib.tools import getFreqDelta if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC class RollingMixin: diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py index 
8e0be0b36f6d2bad5810096798eaf23470b606ae..1846cab2b40f8b4837acce6b0ff11c581d2691f3 100644 --- a/saqc/funcs/scores.py +++ b/saqc/funcs/scores.py @@ -13,13 +13,13 @@ import numpy as np import pandas as pd from typing_extensions import Literal -import saqc.lib.ts_operators as ts_ops -from saqc.constants import UNFLAGGED -from saqc.core.register import register +from saqc import UNFLAGGED +from saqc.core import register from saqc.lib.tools import getApply, toSequence +from saqc.lib.ts_operators import kNN if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC def _univarScoring( @@ -210,7 +210,7 @@ class ScoresMixin: sample_size = partition.shape[0] nn_neighbors = min(n, max(sample_size, 2) - 1) - dist, *_ = ts_ops.kNN( + dist, *_ = kNN( partition.values, nn_neighbors, algorithm=method, metric=metric, p=p ) try: diff --git a/saqc/funcs/tools.py b/saqc/funcs/tools.py index 9888bb1ff43b98b84627fd7f51170b676bba2267..1036b7207c674f116ed24a34645b23478ead73ad 100644 --- a/saqc/funcs/tools.py +++ b/saqc/funcs/tools.py @@ -15,13 +15,13 @@ import matplotlib.pyplot as plt import numpy as np from typing_extensions import Literal -from saqc.constants import FILTER_NONE, UNFLAGGED -from saqc.core.register import processing, register +from saqc import FILTER_NONE, UNFLAGGED +from saqc.core import processing, register from saqc.lib.plotting import makeFig from saqc.lib.tools import periodicMask if TYPE_CHECKING: - from saqc.core.core import SaQC + from saqc import SaQC _MPL_DEFAULT_BACKEND = mpl.get_backend() diff --git a/saqc/funcs/transformation.py b/saqc/funcs/transformation.py index a55568ddb1b300cf438898513c07a11a73fdec3f..0fe642131f21b2279bffa3ffc89b43dc3a060e3a 100644 --- a/saqc/funcs/transformation.py +++ b/saqc/funcs/transformation.py @@ -12,10 +12,10 @@ from typing import TYPE_CHECKING, Callable, Optional, Union import numpy as np import pandas as pd -from saqc.core.register import register +from saqc.core import register if TYPE_CHECKING: - from 
saqc.core.core import SaQC + from saqc import SaQC class TransformationMixin: @@ -23,14 +23,12 @@ def transform( self: "SaQC", field: str, - func: Callable[[pd.Series], pd.Series], + func: Callable[[pd.Series | np.ndarray], pd.Series], freq: Optional[Union[float, str]] = None, **kwargs, ) -> "SaQC": """ - Function to transform data columns with a transformation that maps series onto series of the same length. - - Note, that flags get preserved. + Transform data by applying a custom function on data chunks of variable size. Existing flags are preserved. Parameters ---------- @@ -38,15 +36,14 @@ The fieldname of the column, holding the data-to-be-transformed. func : Callable[{pd.Series, np.array}, np.array] - Function to transform data[field] with. + Transformation function. freq : {None, float, str}, default None - Determines the segmentation of the data into partitions, the transformation is applied on individually + Size of the data partition. The transformation is applied on each partition individually - * ``np.inf``: Apply transformation on whole data set at once - * ``x`` > 0 : Apply transformation on successive data chunks of periods length ``x`` - * Offset String : Apply transformation on successive partitions of temporal extension matching the passed offset - string + * ``None``: Apply transformation on the entire data set at once + * ``int`` : Apply transformation on successive data chunks of the given length. Must be greater than 0. + * Offset String : Apply transformation on successive data chunks of the given temporal extension.
Returns ------- diff --git a/saqc/lib/docurator.py b/saqc/lib/docurator.py index 59b27d7393d7943c7818e018914cf8f1911aeca4..3fd272b07b7c2bb58a38e1639f1e98487504b020 100644 --- a/saqc/lib/docurator.py +++ b/saqc/lib/docurator.py @@ -145,7 +145,7 @@ def saqcMethodsTemplate(doc_string: str, source="function_string"): out_para = mkParameter( parameter_name="out", parameter_type="saqc.SaQC", - parameter_doc="An :py:meth:`saqc.SaQC` object, holding the (possibly) modified data", + parameter_doc="An :py:meth:`saqc.SaQC` object, holding the data", indent_str=indent_string, ) returns_section["Returns"] += out_para["out"] diff --git a/saqc/lib/plotting.py b/saqc/lib/plotting.py index d3f20acb791cac6a7c78d87854a42066c71af37b..56a27e8f29828f7c6b65c207c800c2e18f8fb6ea 100644 --- a/saqc/lib/plotting.py +++ b/saqc/lib/plotting.py @@ -9,7 +9,6 @@ from __future__ import annotations import itertools -from typing import Optional, Union import matplotlib as mpl import matplotlib.pyplot as plt @@ -17,7 +16,7 @@ import numpy as np import pandas as pd from typing_extensions import Literal -from saqc.core.flags import Flags +from saqc.core import Flags from saqc.lib.tools import toSequence from saqc.lib.types import DiosLikeT diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py index 41a3cd96172b0c25bd06eb706c5e1b7425a140ec..3a454bd181f6c938ee21c136c40cd25a7cdc7772 100644 --- a/saqc/lib/tools.py +++ b/saqc/lib/tools.py @@ -18,11 +18,9 @@ import pandas as pd from scipy import fft from scipy.cluster.hierarchy import fcluster, linkage -import dios - # keep this for external imports # TODO: fix the external imports -from saqc.lib.rolling import customRoller +from saqc.lib.rolling import customRoller # noqa from saqc.lib.types import CompT T = TypeVar("T", str, float, int) @@ -171,59 +169,6 @@ def periodicMask(dtindex, season_start, season_end, include_bounds): return out -def concatDios(data: List[dios.DictOfSeries], warn: bool = True, stacklevel: int = 2): - # fast path for most common case - 
if len(data) == 1 and data[0].columns.is_unique: - return data[0] - - result = dios.DictOfSeries() - for di in data: - for c in di.columns: - if c in result.columns: - if warn: - warnings.warn( - f"Column {c} already exist. Data is overwritten. " - f"Avoid duplicate columns names over all inputs.", - stacklevel=stacklevel, - ) - result[c] = di[c] - - return result - - -def mergeDios(left, right, subset=None, join="merge"): - # use dios.merge() as soon as it implemented - # see https://git.ufz.de/rdm/dios/issues/15 - - merged = left.copy() - if subset is not None: - right_subset_cols = right.columns.intersection(subset) - else: - right_subset_cols = right.columns - - shared_cols = left.columns.intersection(right_subset_cols) - - for c in shared_cols: - l, r = left[c], right[c] - if join == "merge": - # NOTE: - # our merge behavior is nothing more than an - # outer join, where the right join argument - # overwrites the left at the shared indices, - # while on a normal outer join common indices - # hold the values from the left join argument - r, l = l.align(r, join="outer") - else: - l, r = l.align(r, join=join) - merged[c] = l.combine_first(r) - - newcols = right_subset_cols.difference(left.columns) - for c in newcols: - merged[c] = right[c].copy() - - return merged - - def isQuoted(string): return bool(re.search(r"'.*'|\".*\"", string)) @@ -594,3 +539,21 @@ def filterKwargs( ) kwargs.pop(key, None) return kwargs + + +from saqc import FILTER_ALL, UNFLAGGED + +A = TypeVar("A", np.ndarray, pd.Series) + + +def isflagged(flagscol: A, thresh: float) -> A: + """ + Return a mask of flags accordingly to `thresh`. Return type is same as flags. 
+ """ + if not isinstance(thresh, (float, int)): + raise TypeError(f"thresh must be of type float, not {repr(type(thresh))}") + + if thresh == FILTER_ALL: + return flagscol > UNFLAGGED + + return flagscol >= thresh diff --git a/saqc/lib/types.py b/saqc/lib/types.py index 2c11b3c7768342192a431d0b7ce94027892b6ab2..4bdb7e4e2841dd3dbd6e8d01151de86c6e2961bc 100644 --- a/saqc/lib/types.py +++ b/saqc/lib/types.py @@ -7,7 +7,14 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from abc import abstractmethod +import abc +from typing import Any, Dict, TypeVar, Union + +import numpy as np +import pandas as pd +from typing_extensions import Protocol + +from saqc.core import DictOfSeries __all__ = [ "T", @@ -19,15 +26,6 @@ __all__ = [ "OptionalNone", ] - -from typing import Any, Dict, TypeVar, Union - -import numpy as np -import pandas as pd -from typing_extensions import Protocol - -from dios import DictOfSeries - T = TypeVar("T") ArrayLike = TypeVar("ArrayLike", np.ndarray, pd.Series, pd.DataFrame) PandasLike = Union[pd.Series, pd.DataFrame, DictOfSeries] @@ -51,7 +49,7 @@ class GenericFunction(Protocol): class Comparable(Protocol): - @abstractmethod + @abc.abstractmethod def __gt__(self: CompT, other: CompT) -> bool: pass diff --git a/saqc/parsing/__init__.py b/saqc/parsing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d74ada495ad00a20887cdee50a426347f37efae --- /dev/null +++ b/saqc/parsing/__init__.py @@ -0,0 +1,7 @@ +#! /usr/bin/env python + +# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ +# +# SPDX-License-Identifier: GPL-3.0-or-later + +# -*- coding: utf-8 -*- diff --git a/saqc/parsing/environ.py b/saqc/parsing/environ.py new file mode 100644 index 0000000000000000000000000000000000000000..65bc04fa4795cff0f04c6423e3b4280ffc02a124 --- /dev/null +++ b/saqc/parsing/environ.py @@ -0,0 +1,76 @@ +#! 
/usr/bin/env python + +# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ +# +# SPDX-License-Identifier: GPL-3.0-or-later + +# -*- coding: utf-8 -*- + +import numpy as np +import scipy.stats as st + +import saqc.lib.ts_operators as ts_ops +from saqc import BAD, DOUBTFUL, FILTER_ALL, FILTER_NONE, GOOD, UNFLAGGED + + +def clip(series, lower=None, upper=None): + return series.clip(lower=lower, upper=upper) + + +def zscore(obj): + return st.zscore(obj, nan_policy="omit") + + +ENVIRONMENT = { + # Infinity constant + "inf": np.inf, + "INF": np.inf, + # Not a number constant. + "NAN": np.nan, + "nan": np.nan, + # Absolute value function. + "abs": np.abs, + # Maximum value function. Ignores NaN. + "max": np.nanmax, + # Minimum Value function. Ignores NaN. + "min": np.nanmin, + # Mean value function. Ignores NaN. + "mean": np.nanmean, + # Summation. Ignores NaN. + "sum": np.nansum, + # Standard deviation. Ignores NaN. + "len": len, + # exponential function. + "exp": np.exp, + # Logarithm. + "log": np.log, + # Logarithm, returning NaN for zero input, instead of -inf. + "nanLog": ts_ops.zeroLog, + # Standard deviation. Ignores NaN. + "std": np.nanstd, + # Variance. Ignores NaN. + "var": np.nanvar, + # Median. Ignores NaN. + "median": np.nanmedian, + # Count Number of values. Ignores NaNs. + "count": ts_ops.count, + # Identity. + "id": ts_ops.identity, + # Returns a series` diff. + "diff": ts_ops.difference, + # Scales data to [0,1] interval. + "scale": ts_ops.normScale, + # Standardize with standard deviation. + "zScore": zscore, + # Standardize with median and MAD. + "madScore": ts_ops.standardizeByMedian, + # Standardize with median and inter quantile range. 
+ "iqsScore": ts_ops.standardizeByIQR, + "clip": clip, + "GOOD": GOOD, + "BAD": BAD, + "UNFLAGGED": UNFLAGGED, + "DOUBTFUL": DOUBTFUL, + "FILTER_ALL": FILTER_ALL, + "FILTER_NONE": FILTER_NONE, +} diff --git a/saqc/core/reader.py b/saqc/parsing/reader.py similarity index 97% rename from saqc/core/reader.py rename to saqc/parsing/reader.py index b5ff437875f2bb225a05f5a2ae082b7a8ee1db3e..8c8673e31817361b16b64daaf20889ca2335d975 100644 --- a/saqc/core/reader.py +++ b/saqc/parsing/reader.py @@ -15,9 +15,9 @@ from urllib.request import urlopen import pandas as pd -from saqc.core.core import SaQC -from saqc.core.visitor import ConfigFunctionParser +from saqc import SaQC from saqc.lib.tools import isQuoted +from saqc.parsing.visitor import ConfigFunctionParser COMMENT = "#" SEPARATOR = ";" diff --git a/saqc/core/visitor.py b/saqc/parsing/visitor.py similarity index 99% rename from saqc/core/visitor.py rename to saqc/parsing/visitor.py index 294a5a812807d91326ce8ea635e1f4b353c84f4c..91f086e2f4aaa81065d2c0430d65aa9ad7ce8c7a 100644 --- a/saqc/core/visitor.py +++ b/saqc/parsing/visitor.py @@ -8,8 +8,8 @@ import ast -from saqc.constants import ENVIRONMENT from saqc.core.register import FUNC_MAP +from saqc.parsing.environ import ENVIRONMENT class ConfigExpressionParser(ast.NodeVisitor): diff --git a/tests/api/test_creation.py b/tests/api/test_creation.py index 85fbebeb871c5d1d379fae84859b0162a463a8ac..60ae95a4ab7f66305a1f7cf420a05a4cba55e3e9 100644 --- a/tests/api/test_creation.py +++ b/tests/api/test_creation.py @@ -7,6 +7,8 @@ import numpy as np import pandas as pd +# directly import container class to avoid importing +# saqc here. 
import dios diff --git a/tests/common.py b/tests/common.py index 3a973e4c9f5d60d3ced0a5f1068588ef96ea8d0a..c82b5d2bf791a499dd9f5345ae9b3f69d0f18580 100644 --- a/tests/common.py +++ b/tests/common.py @@ -11,8 +11,8 @@ import io import numpy as np import pandas as pd -import dios from saqc.core import Flags +from saqc.core.frame import DictOfSeries from saqc.core.history import History, createHistoryFromData @@ -22,7 +22,7 @@ def initData( if rows is None: freq = freq or "1h" - di = dios.DictOfSeries(itype=dios.DtItype) + di = DictOfSeries(itype="datetime") dates = pd.date_range(start=start_date, end=end_date, freq=freq, periods=rows) dummy = np.arange(len(dates)) @@ -74,7 +74,7 @@ def checkDataFlagsInvariants(data, flags, field, identical=True): whether to check indexes of data and flags to be identical (True, default) of just for equality. """ - assert isinstance(data, dios.DictOfSeries) + assert isinstance(data, DictOfSeries) assert isinstance(flags, Flags) # all columns in data are in flags diff --git a/tests/core/test_core.py b/tests/core/test_core.py index 82b30186491a2ec8c1540902e8f530806087e57a..e0dedae95b498008aa52b4e345c68ff2d7c214c2 100644 --- a/tests/core/test_core.py +++ b/tests/core/test_core.py @@ -12,11 +12,8 @@ import numpy as np import pandas as pd import pytest -import saqc -from saqc.constants import BAD, FILTER_ALL, FILTER_NONE, UNFLAGGED -from saqc.core import SaQC, initFlagsLike -from saqc.core.flags import Flags -from saqc.core.register import flagging, processing, register +from saqc import BAD, FILTER_ALL, FILTER_NONE, UNFLAGGED, SaQC +from saqc.core import Flags, flagging, initFlagsLike, processing, register from saqc.lib.types import OptionalNone from tests.common import initData @@ -68,12 +65,12 @@ def test_dtypes(data, flags): def test_new_call(data): - qc = saqc.SaQC(data) + qc = SaQC(data) qc = qc.flagRange("var1", max=5) def test_copy(data): - qc = saqc.SaQC(data) + qc = SaQC(data) qc = qc.flagRange("var1").flagRange("var1", min=0, 
max=0) diff --git a/tests/core/test_flags.py b/tests/core/test_flags.py index 6be9479869c464c7de609ca1618479fdf1668a30..fbb09e0d3c335401050535201ebea0f15568eabe 100644 --- a/tests/core/test_flags.py +++ b/tests/core/test_flags.py @@ -10,11 +10,9 @@ import numpy as np import pandas as pd import pytest -import dios -from saqc.constants import UNFLAGGED -from saqc.core.flags import Flags -from tests.core.test_history import History -from tests.core.test_history import is_equal as hist_equal +import tests.core.test_history as test_hist +from saqc import UNFLAGGED +from saqc.core import DictOfSeries, Flags, History _arrays = [ np.array([[]]), @@ -42,7 +40,7 @@ testdata = [] for d in _arrays: columns = list("abcdefgh")[: d.shape[1]] df = pd.DataFrame(d, dtype=float, columns=columns) - dis = dios.DictOfSeries(df) + dis = DictOfSeries(df) di = {} di.update(df.items()) testdata.append(df) @@ -53,7 +51,7 @@ for d in _arrays: def is_equal(f1, f2): assert f1.columns.equals(f2.columns) for c in f1.columns: - assert hist_equal(f1.history[c], f2.history[c]) + assert test_hist.is_equal(f1.history[c], f2.history[c]) @pytest.mark.parametrize("data", testdata) @@ -103,7 +101,7 @@ def test_init_raise_TypeError(data, msg): @pytest.mark.parametrize("data", testdata) -def test_copy(data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Series]]): +def test_copy(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): flags = Flags(data) shallow = flags.copy(deep=False) deep = flags.copy(deep=True) @@ -131,9 +129,7 @@ def test_copy(data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Series]] @pytest.mark.parametrize("data", testdata) -def test_flags_history( - data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Series]] -): +def test_flags_history(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): flags = Flags(data) # get @@ -153,7 +149,7 @@ def test_flags_history( @pytest.mark.parametrize("data", testdata) -def test_get_flags(data: Union[pd.DataFrame, 
dios.DictOfSeries, Dict[str, pd.Series]]): +def test_get_flags(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): flags = Flags(data) for c in flags.columns: @@ -172,7 +168,7 @@ def test_get_flags(data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Ser @pytest.mark.parametrize("data", testdata) -def test_set_flags(data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Series]]): +def test_set_flags(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): flags = Flags(data) for c in flags.columns: @@ -202,7 +198,7 @@ def test_set_flags(data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Ser @pytest.mark.parametrize("data", testdata) def test_set_flags_with_mask( - data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Series]] + data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]] ): flags = Flags(data) @@ -249,7 +245,7 @@ def test_set_flags_with_mask( @pytest.mark.parametrize("data", testdata) def test_set_flags_with_index( - data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Series]] + data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]] ): flags = Flags(data) @@ -292,16 +288,16 @@ def _validate_flags_equals_frame(flags, df): @pytest.mark.parametrize("data", testdata) -def test_to_dios(data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Series]]): +def test_to_dios(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): flags = Flags(data) df = flags.toDios() - assert isinstance(df, dios.DictOfSeries) + assert isinstance(df, DictOfSeries) _validate_flags_equals_frame(flags, df) @pytest.mark.parametrize("data", testdata) -def test_to_frame(data: Union[pd.DataFrame, dios.DictOfSeries, Dict[str, pd.Series]]): +def test_to_frame(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): flags = Flags(data) df = flags.toFrame() diff --git a/tests/core/test_frame.py b/tests/core/test_frame.py new file mode 100644 index 
0000000000000000000000000000000000000000..267e522e0b5cb4f509572a5b7572a2c41a1d8549 --- /dev/null +++ b/tests/core/test_frame.py @@ -0,0 +1,40 @@ +#! /usr/bin/env python +# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ +# SPDX-License-Identifier: GPL-3.0-or-later +# -*- coding: utf-8 -*- + + +import pytest + +from saqc.core.frame import DictOfSeries as DoS +from saqc.core.frame import concatDios + + +@pytest.mark.parametrize( + "data, expected", + [ + # 2c + 1c -> 3c + ([DoS(dict(a=[1], b=[2])), DoS(dict(c=[3]))], DoS(dict(a=[1], b=[2], c=[3]))), + # 1c + 1c + 1c -> 3c + ( + [DoS(dict(a=[1])), DoS(dict(b=[1])), DoS(dict(c=[1]))], + DoS(dict(a=[1], b=[1], c=[1])), + ), + # 2c + 1c (overwrite) = 2c + ([DoS(dict(a=[1], b=[2])), DoS(dict(b=[22]))], DoS(dict(a=[1], b=[22]))), + # 1c + 1c + 1c (all overwrite) -> 1c + ( + [DoS(dict(a=[1])), DoS(dict(a=[11])), DoS(dict(a=[111]))], + DoS(dict(a=[111])), + ), + ], +) +def test_concatDios(data, expected): + result = concatDios(data, warn=False) + assert result == expected + + +@pytest.mark.parametrize("data", [[DoS(dict(a=[1], b=[2])), DoS(dict(b=[22]))]]) +def test_concatDios_warning(data): + with pytest.warns(UserWarning): + concatDios(data, warn=True, stacklevel=0) diff --git a/tests/core/test_reader.py b/tests/core/test_reader.py index f6b55f31b240a396e662663acc7533c8e2724bfe..c3de90c160f046d9dd860cbeda11574a6335651f 100644 --- a/tests/core/test_reader.py +++ b/tests/core/test_reader.py @@ -9,15 +9,13 @@ import numpy as np import pytest -import dios -from saqc.core.flags import Flags -from saqc.core.reader import fromConfig, readFile -from saqc.core.register import flagging +from saqc.core import DictOfSeries, Flags, flagging +from saqc.parsing.reader import fromConfig, readFile from tests.common import initData, writeIO @pytest.fixture -def data() -> dios.DictOfSeries: +def data() -> DictOfSeries: return initData(3) diff --git a/tests/core/test_translator.py b/tests/core/test_translator.py 
index cfacdbd12135c363c6145e458b0694524878da00..1da0075e90d378ba77465faa2f826fc8ee5f2158 100644 --- a/tests/core/test_translator.py +++ b/tests/core/test_translator.py @@ -13,9 +13,8 @@ import numpy as np import pandas as pd import pytest -from saqc.constants import BAD, DOUBTFUL, FILTER_NONE, UNFLAGGED -from saqc.core.core import SaQC -from saqc.core.flags import Flags +from saqc import BAD, DOUBTFUL, FILTER_NONE, UNFLAGGED, SaQC +from saqc.core import Flags from saqc.core.translation import DmpScheme, MappingScheme, PositionalScheme from tests.common import initData diff --git a/tests/fixtures.py b/tests/fixtures.py index ea11559296a8b59b4cfbd8d3ae18ab5cd8ccc010..b8d05ceb5d12d07a41ab27a8e8cd7a67b00e222e 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd import pytest -from dios import DictOfSeries +from saqc.core import DictOfSeries # TODO: this is odd # Why not simple fixtures with talking-names, diff --git a/tests/funcs/test_constants_detection.py b/tests/funcs/test_constants_detection.py index 381d5260fafb2d3ac3382d9c0110e6342cce5922..29a5a251458cc225fa012652b2c509cfd39b866f 100644 --- a/tests/funcs/test_constants_detection.py +++ b/tests/funcs/test_constants_detection.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from saqc.constants import BAD, UNFLAGGED +from saqc import BAD, UNFLAGGED from saqc.core import SaQC, initFlagsLike from tests.common import initData diff --git a/tests/funcs/test_flagtools.py b/tests/funcs/test_flagtools.py index 457edab8ab666175cafc9bb5df6cf01f5bb70af1..6286a2145425ab1856f490704bc9ab0a6b2a1166 100644 --- a/tests/funcs/test_flagtools.py +++ b/tests/funcs/test_flagtools.py @@ -4,15 +4,16 @@ # # SPDX-License-Identifier: GPL-3.0-or-later +import operator + import numpy as np import pandas as pd - -# -*- coding: utf-8 -*- import pytest from saqc import BAD as B from saqc import UNFLAGGED as U from saqc import SaQC +from saqc.funcs.flagtools import _groupOperation N = 
np.nan @@ -98,3 +99,67 @@ def test_propagateFlagsIrregularIndex(got, expected, kwargs): saqc = SaQC(data=data, flags=flags).propagateFlags(field="x", **kwargs) result = saqc._flags.history["x"].hist[1].astype(float) assert result.equals(expected) + + +@pytest.mark.parametrize( + "left,right,expected", + [ + ([B, U, U, B], [B, B, U, U], [B, U, U, U]), + ([B, B, B, B], [B, B, B, B], [B, B, B, B]), + ([U, U, U, U], [U, U, U, U], [U, U, U, U]), + ], +) +def test_andGroup(left, right, expected): + data = pd.DataFrame({"data": [1, 2, 3, 4]}) + + base = SaQC(data=data) + this = SaQC(data=data, flags=pd.DataFrame({"data": pd.Series(left)})) + that = SaQC(data=data, flags=pd.DataFrame({"data": pd.Series(right)})) + result = base.andGroup(field="data", group=[this, that]) + + assert pd.Series(expected).equals(result.flags["data"]) + + +@pytest.mark.parametrize( + "left,right,expected", + [ + ([B, U, U, B], [B, B, U, U], [B, B, U, B]), + ([B, B, B, B], [B, B, B, B], [B, B, B, B]), + ([U, U, U, U], [U, U, U, U], [U, U, U, U]), + ], +) +def test_orGroup(left, right, expected): + data = pd.DataFrame({"data": [1, 2, 3, 4]}) + + base = SaQC(data=data) + this = SaQC(data=data, flags=pd.DataFrame({"data": pd.Series(left)})) + that = SaQC(data=data, flags=pd.DataFrame({"data": pd.Series(right)})) + result = base.orGroup(field="data", group=[this, that]) + + assert pd.Series(expected).equals(result.flags["data"]) + + +@pytest.mark.parametrize( + "left,right,expected", + [ + ([B, U, U, B], [B, B, U, U], [B, B, U, B]), + ([B, B, B, B], [B, B, B, B], [B, B, B, B]), + ([U, U, U, U], [U, U, U, U], [U, U, U, U]), + ], +) +def test__groupOperation(left, right, expected): + data = pd.DataFrame( + {"x": [0, 1, 2, 3], "y": [0, 11, 22, 33], "z": [0, 111, 222, 333]} + ) + base = SaQC(data=data) + this = SaQC( + data=data, flags=pd.DataFrame({k: pd.Series(left) for k in data.columns}) + ) + that = SaQC( + data=data, flags=pd.DataFrame({k: pd.Series(right) for k in data.columns}) + ) + result = 
_groupOperation( + base=base, field="x", func=operator.or_, group={this: "y", that: ["y", "z"]} + ) + + assert pd.Series(expected).equals(result.flags["x"]) diff --git a/tests/funcs/test_functions.py b/tests/funcs/test_functions.py index 53c13a6deac9909b23ae0c85b591fa6a7aa301e8..9ae18b532101c1822140aaf502913719644ddd33 100644 --- a/tests/funcs/test_functions.py +++ b/tests/funcs/test_functions.py @@ -10,13 +10,11 @@ import numpy as np import pandas as pd import pytest -import dios import saqc -from saqc.constants import BAD, DOUBTFUL, UNFLAGGED -from saqc.core import initFlagsLike -from saqc.core.core import SaQC +from saqc import BAD, DOUBTFUL, UNFLAGGED, SaQC +from saqc.core import DictOfSeries, initFlagsLike from tests.common import initData -from tests.fixtures import char_dict, course_1 +from tests.fixtures import char_dict, course_1 # noqa, todo: fix fixtures @pytest.fixture @@ -34,7 +32,7 @@ def test_statPass(): noise = [-1, 1] * 10 data[100:120] = noise data[200:210] = noise[:10] - data = dios.DictOfSeries(data) + data = DictOfSeries(data) flags = initFlagsLike(data) qc = SaQC(data, flags).flagByStatLowPass( "data", np.std, "20D", 0.999, "5D", 0.999, 0, flag=BAD @@ -287,7 +285,7 @@ def test_transferFlags(): qc = saqc.SaQC(data) qc = qc.flagRange("a", max=1.5) with pytest.deprecated_call(): - qc = qc.transferFlags(["a", "a"], ["b", "c"]) + qc = qc.transferFlags(["a", "a"], ["b", "c"]) # noqa assert np.all(qc.flags["b"].values == np.array([UNFLAGGED, BAD])) assert np.all(qc.flags["c"].values == np.array([UNFLAGGED, BAD])) diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py index dd1c7f4aea68e8cea28cd8fdd6172707bea1ab21..e2181e9bd30d5c3c75472b8a06cd3b4cc062a321 100644 --- a/tests/funcs/test_generic_api_functions.py +++ b/tests/funcs/test_generic_api_functions.py @@ -9,10 +9,8 @@ import pandas as pd import pytest -from dios.dios.dios import DictOfSeries -from saqc import SaQC -from saqc.constants import BAD, 
FILTER_ALL, UNFLAGGED -from saqc.core.flags import Flags +from saqc import BAD, FILTER_ALL, UNFLAGGED, SaQC +from saqc.core import DictOfSeries, Flags from saqc.lib.tools import toSequence from tests.common import initData diff --git a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py index 3f61e775647c705512cb21ed572bfcf2c9237a9c..0dc9d20ec672887a4977ad7c2cead67f1511473b 100644 --- a/tests/funcs/test_generic_config_functions.py +++ b/tests/funcs/test_generic_config_functions.py @@ -12,15 +12,11 @@ import numpy as np import pandas as pd import pytest -import dios -from saqc import SaQC -from saqc.constants import BAD, UNFLAGGED -from saqc.core import initFlagsLike -from saqc.core.flags import Flags -from saqc.core.reader import fromConfig -from saqc.core.register import register -from saqc.core.visitor import ConfigFunctionParser +from saqc import BAD, UNFLAGGED, SaQC +from saqc.core import DictOfSeries, Flags, initFlagsLike, register from saqc.funcs.generic import _execGeneric +from saqc.parsing.reader import fromConfig +from saqc.parsing.visitor import ConfigFunctionParser from tests.common import initData, writeIO @@ -36,7 +32,7 @@ def data_diff(): col1 = data[data.columns[1]] mid = len(col0) // 2 offset = len(col0) // 8 - return dios.DictOfSeries( + return DictOfSeries( data={ col0.name: col0.iloc[: mid + offset], col1.name: col1.iloc[mid - offset :], diff --git a/tests/funcs/test_outlier_detection.py b/tests/funcs/test_outlier_detection.py index 921a82e5300c426217669c800b5521c90ec57839..4fdc17ec0e310526281357d8249bb39836a48a10 100644 --- a/tests/funcs/test_outlier_detection.py +++ b/tests/funcs/test_outlier_detection.py @@ -12,10 +12,9 @@ import pandas as pd # see test/functs/fixtures.py for global fixtures "course_..." 
import pytest -import dios import saqc -from saqc.constants import BAD, UNFLAGGED -from saqc.core import SaQC, initFlagsLike +from saqc import BAD, UNFLAGGED +from saqc.core import DictOfSeries, SaQC, initFlagsLike from tests.fixtures import char_dict, course_1, course_2, course_3, course_4 @@ -26,7 +25,7 @@ def spiky_data(): s.iloc[100] = 100 s.iloc[1000] = -100 flag_assertion = [100, 1000] - return dios.DictOfSeries(s), flag_assertion + return DictOfSeries(s), flag_assertion def test_flagMad(spiky_data): @@ -99,7 +98,7 @@ def test_flagMVScores(dat): s1, s2 = data1.squeeze(), data2.squeeze() s1 = pd.Series(data=s1.values, index=s1.index) s2 = pd.Series(data=s2.values, index=s1.index) - data = dios.DictOfSeries([s1, s2], columns=["field1", "field2"]) + data = DictOfSeries([s1, s2], columns=["field1", "field2"]) flags = initFlagsLike(data) qc = SaQC(data, flags).flagMVScores( field=fields, @@ -135,7 +134,7 @@ def test_flagCrossStatistics(dat): s1, s2 = data1.squeeze(), data2.squeeze() s1 = pd.Series(data=s1.values, index=s1.index) s2 = pd.Series(data=s2.values, index=s1.index) - data = dios.DictOfSeries([s1, s2], columns=["field1", "field2"]) + data = DictOfSeries([s1, s2], columns=["field1", "field2"]) flags = initFlagsLike(data) qc = SaQC(data, flags).flagCrossStatistics( diff --git a/tests/funcs/test_pattern_rec.py b/tests/funcs/test_pattern_rec.py index 7b9e087164e7227f0a7ac58ec741150b02d6215e..f885522c42e040e99837dab58c4c24569fa66789 100644 --- a/tests/funcs/test_pattern_rec.py +++ b/tests/funcs/test_pattern_rec.py @@ -9,9 +9,8 @@ import pandas as pd import pytest -import dios -from saqc.constants import BAD, UNFLAGGED -from saqc.core import SaQC, initFlagsLike +from saqc import BAD, UNFLAGGED, SaQC +from saqc.core import DictOfSeries, initFlagsLike from tests.common import initData @@ -32,7 +31,7 @@ def test_flagPattern_dtw(plot, normalize): data.iloc[10:18] = [0, 5, 6, 7, 6, 8, 5, 0] pattern = data.iloc[10:18] - data = dios.DictOfSeries(dict(data=data, 
pattern_data=pattern)) + data = DictOfSeries(dict(data=data, pattern_data=pattern)) flags = initFlagsLike(data, name="data") qc = SaQC(data, flags).flagPatternByDTW( "data", diff --git a/tests/funcs/test_proc_functions.py b/tests/funcs/test_proc_functions.py index b1dd896078b14ca9dafb9aec2bfc03619c6ecf8b..4843842f37c4e31c5991e0a14231630aba7cb319 100644 --- a/tests/funcs/test_proc_functions.py +++ b/tests/funcs/test_proc_functions.py @@ -13,18 +13,17 @@ import pandas as pd # see test/functs/fixtures.py for global fixtures "course_..." import pytest -import dios import saqc -from saqc.constants import UNFLAGGED -from saqc.core import SaQC, initFlagsLike +from saqc import UNFLAGGED, SaQC +from saqc.core import DictOfSeries, initFlagsLike from saqc.lib.ts_operators import linearInterpolation, polynomialInterpolation -from tests.fixtures import char_dict, course_3, course_5 +from tests.fixtures import char_dict, course_3, course_5 # noqa, todo: fix fixtures def test_rollingInterpolateMissing(course_5): data, characteristics = course_5(periods=10, nan_slice=[5, 6]) field = data.columns[0] - data = dios.DictOfSeries(data) + data = DictOfSeries(data) flags = initFlagsLike(data) qc = SaQC(data, flags).interpolateByRolling( field, @@ -49,7 +48,7 @@ def test_rollingInterpolateMissing(course_5): def test_interpolateMissing(course_5): data, characteristics = course_5(periods=10, nan_slice=[5]) field = data.columns[0] - data = dios.DictOfSeries(data) + data = DictOfSeries(data) flags = initFlagsLike(data) qc = SaQC(data, flags) @@ -73,7 +72,7 @@ def test_interpolateMissing(course_5): def test_transform(course_5): data, characteristics = course_5(periods=10, nan_slice=[5, 6]) field = data.columns[0] - data = dios.DictOfSeries(data) + data = DictOfSeries(data) flags = initFlagsLike(data) qc = SaQC(data, flags) @@ -93,7 +92,7 @@ def test_transform(course_5): def test_resample(course_5): data, _ = course_5(freq="1min", periods=30, nan_slice=[1, 11, 12, 22, 24, 26]) field = 
data.columns[0] - data = dios.DictOfSeries(data) + data = DictOfSeries(data) flags = initFlagsLike(data) qc = SaQC(data, flags).resample( field, diff --git a/tests/funcs/test_resampling.py b/tests/funcs/test_resampling.py index c5dbd1570b9eb3b34990c37844312a20284126cc..2eb7316afc7a5faca5dfd7796c12c486aef17add 100644 --- a/tests/funcs/test_resampling.py +++ b/tests/funcs/test_resampling.py @@ -10,9 +10,8 @@ import numpy as np import pandas as pd import pytest -import dios -from saqc.constants import BAD, UNFLAGGED -from saqc.core import SaQC, initFlagsLike +from saqc import BAD, UNFLAGGED, SaQC +from saqc.core import DictOfSeries, initFlagsLike from tests.common import checkDataFlagsInvariants @@ -30,7 +29,7 @@ def data(): dat = pd.Series(np.linspace(-50, 50, index.size), index=index, name="data") # good to have some nan dat[-3] = np.nan - data = dios.DictOfSeries(dat) + data = DictOfSeries(dat) return data @@ -88,7 +87,7 @@ def test_gridInterpolation(data, method, fill_history): field = "data" data = data[field] data = pd.concat([data * np.sin(data), data.shift(1, "2h")]).shift(1, "3s") - data = dios.DictOfSeries(data) + data = DictOfSeries(data) flags = initFlagsLike(data) if fill_history == "none": diff --git a/tests/funcs/test_tools.py b/tests/funcs/test_tools.py index 6c63f5ec4358f44d3ddc81d44e8341f820b54a2d..75e51273734a2dcfb0ca8b5253fff0616219bdec 100644 --- a/tests/funcs/test_tools.py +++ b/tests/funcs/test_tools.py @@ -8,14 +8,14 @@ import numpy as np import pandas as pd import pytest -import dios import saqc +from saqc.core import DictOfSeries @pytest.mark.slow def test_makeFig(tmp_path): # just testing for no errors to occure... 
- data = dios.DictOfSeries( + data = DictOfSeries( pd.Series( np.linspace(0, 1000, 1000), pd.date_range("2000", "2001", periods=1000), diff --git a/tests/fuzzy/lib.py b/tests/fuzzy/lib.py index 6210389fb18e13333da604ea6f82c207af2ac672..a1bf9ed274e8b8e069ab4945e49ff92f331f0bdd 100644 --- a/tests/fuzzy/lib.py +++ b/tests/fuzzy/lib.py @@ -23,9 +23,8 @@ from hypothesis.strategies import ( ) from hypothesis.strategies._internal.types import _global_type_lookup -import dios -from saqc.constants import BAD -from saqc.core import initFlagsLike +from saqc import BAD +from saqc.core import DictOfSeries, initFlagsLike from saqc.core.register import FUNC_MAP MAX_EXAMPLES = 50 @@ -46,7 +45,7 @@ def dioses(draw, min_cols=1): cols = draw(lists(columnNames(), unique=True, min_size=min_cols)) columns = {c: draw(dataSeries(min_size=3)) for c in cols} - return dios.DictOfSeries(columns) + return DictOfSeries(columns) @composite diff --git a/tests/fuzzy/test_masking.py b/tests/fuzzy/test_masking.py index 0d0a49e92e88e932227e28bb6a4f0487fc8149b7..472601037279556a5f069ff703ea4cb95a789c6e 100644 --- a/tests/fuzzy/test_masking.py +++ b/tests/fuzzy/test_masking.py @@ -10,7 +10,7 @@ import pandas as pd import pytest from hypothesis import given, settings -from saqc.constants import BAD, UNFLAGGED +from saqc import BAD, UNFLAGGED from saqc.core.register import _maskData, _unmaskData from tests.fuzzy.lib import MAX_EXAMPLES, dataFieldFlags diff --git a/tests/lib/test_tools.py b/tests/lib/test_tools.py index b282f24758ce4735f42dce82c69c43dddaf5af1c..2e1534ce39616add875fb4ab6a40ae05cfe8c7fe 100644 --- a/tests/lib/test_tools.py +++ b/tests/lib/test_tools.py @@ -6,7 +6,6 @@ import pandas as pd import pytest import saqc.lib.tools as tools -from dios import DictOfSeries as DoS @pytest.mark.parametrize("optional", [False, True]) @@ -69,33 +68,3 @@ def test_toSequence(value, expected): def test_squeezeSequence(value, expected): result = tools.squeezeSequence(value) assert result == expected - - 
-@pytest.mark.parametrize( - "data, expected", - [ - # 2c + 1c -> 3c - ([DoS(dict(a=[1], b=[2])), DoS(dict(c=[3]))], DoS(dict(a=[1], b=[2], c=[3]))), - # 1c + 1c + 1c -> 3c - ( - [DoS(dict(a=[1])), DoS(dict(b=[1])), DoS(dict(c=[1]))], - DoS(dict(a=[1], b=[1], c=[1])), - ), - # 2c + 1c (overwrite) = 2c - ([DoS(dict(a=[1], b=[2])), DoS(dict(b=[22]))], DoS(dict(a=[1], b=[22]))), - # 1c + 1c + 1c (all overwrite) -> 1c - ( - [DoS(dict(a=[1])), DoS(dict(a=[11])), DoS(dict(a=[111]))], - DoS(dict(a=[111])), - ), - ], -) -def test_concatDios(data, expected): - result = tools.concatDios(data, warn=False) - assert result == expected - - -@pytest.mark.parametrize("data", [[DoS(dict(a=[1], b=[2])), DoS(dict(b=[22]))]]) -def test_concatDios_warning(data): - with pytest.warns(UserWarning): - tools.concatDios(data, warn=True, stacklevel=0) diff --git a/tests/lib/test_ts_operators.py b/tests/lib/test_ts_operators.py index c7288ce2e3f0b9ca62128c4bb312d3808eea7af2..96fed103586c828ca5ac4003d0d6dc2fd349aa76 100644 --- a/tests/lib/test_ts_operators.py +++ b/tests/lib/test_ts_operators.py @@ -9,7 +9,6 @@ import pytest from pandas.testing import assert_series_equal import saqc.lib.ts_operators as tsops -from saqc.lib.ts_operators import interpolateNANs def test_butterFilter(): @@ -251,7 +250,7 @@ def test_rateOfChange(data, expected): ], ) def test_interpolatNANs(limit, extrapolate, data, expected): - got = interpolateNANs( + got = tsops.interpolateNANs( pd.Series(data), gap_limit=limit, method="linear", extrapolate=extrapolate ) try: diff --git a/tests/requirements.txt b/tests/requirements.txt index 9dbae67138f14716991f2e650fc7e3a14c890875..97b1713299194c16e72d5ccec10b157521b872c9 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: GPL-3.0-or-later -beautifulsoup4==4.11.1 -hypothesis==6.61.0 +beautifulsoup4==4.11.2 +hypothesis==6.65.2 Markdown==3.3.7 -pytest==7.1.3 +pytest==7.2.1 pytest-lazy-fixture==0.6.3 -requests==2.27.1 
+requests==2.28.2