diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ceb758210f9d0e3834968b5241bb3174163721e7..9b17bd55b10d149958d13eeec989b0e7587943f0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -79,7 +79,7 @@ python38: stage: test image: python:3.8 script: - - pytest tests dios/test -Werror --junitxml=report.xml + - pytest tests -Werror --junitxml=report.xml - python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv artifacts: when: always @@ -91,7 +91,7 @@ python39: stage: test image: python:3.9 script: - - pytest tests dios/test -Werror --junitxml=report.xml + - pytest tests -Werror --junitxml=report.xml - python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv artifacts: when: always @@ -103,7 +103,7 @@ python310: stage: test image: python:3.10 script: - - pytest tests dios/test -Werror --junitxml=report.xml + - pytest tests -Werror --junitxml=report.xml - python -m saqc --config docs/resources/data/config.csv --data docs/resources/data/data.csv --outfile /tmp/test.csv artifacts: when: always diff --git a/dios/.gitignore b/dios/.gitignore deleted file mode 100644 index 54b82b15527c60fd8a706a410f8d81ad22d379ed..0000000000000000000000000000000000000000 --- a/dios/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -__pycache__/ diff --git a/dios/Readme.md b/dios/Readme.md deleted file mode 100644 index 195b46a128fcf06aa2f71ff3d5d705004f650773..0000000000000000000000000000000000000000 --- a/dios/Readme.md +++ /dev/null @@ -1,108 +0,0 @@ -<!-- -SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ - -SPDX-License-Identifier: GPL-3.0-or-later ---> - -DictOfSeries -============ - -DictOfSeries is a pandas.Series of pandas.Series objects which aims to behave as similar as possible to pandas.DataFrame. 
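[Reviewer note] For anyone not familiar with the package being removed: the core idea is an outer pandas.Series of pandas.Series values, where each column keeps its own independent row index. A minimal plain-pandas sketch of that data-model difference (illustrative names only, no dios imports):

```
import pandas as pd

# Two columns with disjoint, differently sized indices.
a = pd.Series([1, 2], index=[0, 1])
b = pd.Series([10, 20, 30], index=[5, 6, 7])

# A DataFrame must align both onto one shared row index and pad the
# gaps with NaN (here 5 of 10 cells are padding).
df = pd.DataFrame({"a": a, "b": b})

# The dios model instead keeps an object-dtype outer Series whose
# values are the unaligned columns -- no padding is stored.
dios_like = pd.Series({"a": a, "b": b}, dtype="O")
print(dios_like["b"])  # column "b" keeps its own index
```

This is what the feature list below means by "every column has its own index" and "uses much less memory than a misaligned pandas.DataFrame".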
- - -Nomenclature ------------- -- series/ser: instance of pandas.Series -- dios: instance of dios.DictOfSeries -- df: instance of pandas.DataFrame -- dios-like: a *dios* or a *df* -- alignable object: a *dios*, *df* or a *series* - - -Features --------- -* every *column* has its own index -* uses much less memory than a misaligned pandas.DataFrame -* behaves quite like a pandas.DataFrame -* additional align locator (`.aloc[]`) - -Install -------- - -todo: PyPi - -``` -import dios - -# Have fun :) -``` - -Documentation -------------- - -The main docu is on ReadTheDocs at: - -* [dios.rtfd.io](https://dios.rtfd.io) - -but some docs are also available local: -* [Indexing](/docs/doc_indexing.md) -* [Cookbook](/docs/doc_cookbook.md) -* [Itype](/docs/doc_itype.md) - -TL;DR ------ -**get it** -``` ->>> from dios import DictOfSeries -``` -**empty** -``` ->>> DictOfSeries() -Empty DictOfSeries -Columns: [] - ->>> DictOfSeries(columns=['x', 'y']) -Empty DictOfSeries -Columns: ['x', 'y'] - ->>> DictOfSeries(columns=['x', 'y'], index=[3,4,5]) - x | y | -====== | ====== | -3 NaN | 3 NaN | -4 NaN | 4 NaN | -5 NaN | 5 NaN | -``` -**with data** -``` ->>> DictOfSeries([range(4), range(2), range(3)]) - 0 | 1 | 2 | -==== | ==== | ==== | -0 0 | 0 0 | 0 0 | -1 1 | 1 1 | 1 1 | -2 2 | | 2 2 | -3 3 | | | - ->>> DictOfSeries(np.random.random([2,4])) - 0 | 1 | -=========== | =========== | -0 0.112020 | 0 0.509881 | -1 0.108070 | 1 0.285779 | -2 0.851453 | 2 0.805933 | -3 0.138352 | 3 0.812339 | - ->>> DictOfSeries(np.random.random([2,4]), columns=['a','b'], index=[11,12,13,14]) - a | b | -============ | ============ | -11 0.394304 | 11 0.356206 | -12 0.943689 | 12 0.735356 | -13 0.791820 | 13 0.066947 | -14 0.759802 | 14 0.496321 | - ->>> DictOfSeries(dict(today=['spam']*3, tomorrow=['spam']*2)) - today | tomorrow | -======= | ========== | -0 spam | 0 spam | -1 spam | 1 spam | -2 spam | | -``` - diff --git a/dios/__init__.py b/dios/__init__.py deleted file mode 100644 index 7ed11a503edeb985d9a9beecedbbbac7d319f196..0000000000000000000000000000000000000000 --- a/dios/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env python - -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from .dios import * diff --git a/dios/dios/__init__.py b/dios/dios/__init__.py deleted file mode 100644 index 6814f6d18cfde0d6b4d11f0398fbaabda041dc6a..0000000000000000000000000000000000000000 --- a/dios/dios/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from .dios import * -from .lib import * - -__all__ = [ - "DictOfSeries", - "to_dios", - "pprint_dios", - "IntItype", - "FloatItype", - "NumItype", - "DtItype", - "ObjItype", - "ItypeWarning", - "ItypeCastWarning", - "ItypeCastError", - "is_itype", - "is_itype_subtype", - "is_itype_like", - "get_itype", - "cast_to_itype", - "CastPolicy", - "Opts", - "OptsFields", - "OptsFields", - "dios_options", - "example_DictOfSeries", -] diff --git a/dios/dios/base.py b/dios/dios/base.py deleted file mode 100644 index ad9a3698511d13c8d912e6e7a16ac0ddab03ff8c..0000000000000000000000000000000000000000 --- a/dios/dios/base.py +++ /dev/null @@ -1,737 +0,0 @@ -#!/usr/bin/env python - -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from __future__ import annotations - -import functools as ftools -import operator as op 
-from abc import abstractmethod -from copy import copy as shallowcopy -from copy import deepcopy -from typing import Any, Hashable, Mapping, Sequence, TypeVar, overload - -import pandas as pd - -from . import lib -from . import operators as ops -from . import pandas_bridge as pdextra -from .lib import _CAST_POLICIES, _find_least_common_itype, _throw_MixedItype_err_or_warn - -__author__ = "Bert Palm" -__email__ = "bert.palm@ufz.de" -__copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ" - - -D = TypeVar("D", bound="_DiosBase") - - -class _DiosBase: - @property - @abstractmethod - def _constructor(self: D) -> type[D]: - raise NotImplementedError - - def _finalize(self, other: _DiosBase): - self._attrs = other._attrs - return self - - def __init__( - self, - data=None, - columns=None, - index=None, - itype=None, - cast_policy="save", - fastpath=False, - ): - self._attrs = {} - self.cast_policy = cast_policy # set via property - - # we are called internally - if fastpath: - self._itype = itype or lib.ObjItype - if data is not None: - self._data = data - else: - # it is significantly faster, to provide an index and fill it, - # than to successively build the index by adding data - self._data = pd.Series(dtype="O", index=columns) - - else: - if index is not None and not isinstance(index, pd.Index): - index = pd.Index(index) - - # itype=None means infer the itype by the data, so we first set to the highest - # possible itype, then insert data, then infer the best-fitting itype. - if itype is None and index is None: - self._itype = lib.ObjItype - else: - if index is not None: - self._itype = lib.get_itype(index) - if itype is not None: - self._itype = lib.get_itype(itype) - - cols = pd.Index([] if columns is None else columns) - if not cols.is_unique: - raise ValueError("columns must be unique") - self._data = pd.Series(dtype="O", index=cols) - - if data is not None: - self._init_insert_data(data, columns, index) - - # self._data may still contain nans; at positions where - # no data was present, but a column-name was given - if self._data.hasnans: - e = pd.Series(dtype="O", index=index) - for c in self.columns[self._data.isna()]: - self._insert(c, e.copy()) - - self._data.index.name = "columns" - - # we try to infer the itype, but if we still have - # no data, we will set the itype lazy, i.e. 
with - # the first non-empty _insert() - if itype is None: - if self.empty: - self._itype = "INFER" - else: - self._itype = _find_least_common_itype(self._data) - if not self._itype.unique: - _throw_MixedItype_err_or_warn(self.itype) - - def _init_insert_data(self, data, columns, index): - """Insert items of a iterable in self""" - - if pdextra.is_iterator(data): - data = list(data) - - if _is_dios_like(data) and not data.columns.is_unique: - raise ValueError("columns index must have unique values") - - if _is_dios_like(data) or isinstance(data, dict): - if columns is None: - pass # data is dict-like - else: - data = {k: data[k] for k in data if k in columns} - - elif isinstance(data, pd.Series): - name = data.name or 0 - if columns is not None and len(columns) > 0: - name = self.columns[0] - data = {name: data} - - elif pdextra.is_nested_list_like(data): - if columns is None: - data = {i: d for i, d in enumerate(data)} - elif len(data) == len(columns): - data = dict(zip(self.columns, data)) - else: - raise ValueError( - f"{len(columns)} columns passed, data implies {len(data)} columns" - ) - - elif pdextra.is_list_like(data): - name = 0 if columns is None or len(columns) < 1 else self.columns[0] - data = {name: data} - - else: - raise TypeError("data type not understood") - - for k in data: - s = pd.Series(data[k], index=index, dtype=object).infer_objects() - self._insert(k, s) - - # ---------------------------------------------------------------------- - # checks - - def _is_valid_columns_index(self, obj): - if isinstance(obj, pd.Series) and obj.dtype == "O": - return True - return False - - # ---------------------------------------------------------------------- - # Indexing Methods - - def _insert(self, col, val): - """Insert a fresh new value as pd.Series into self""" - val = list(val) if pdextra.is_iterator(val) else val - - if _is_dios_like(val): - val = val.squeeze() - if not isinstance(val, pd.Series): - raise ValueError(f"Cannot insert frame-like with more than one column") - - elif val is None: - val = pd.Series() - - elif not isinstance(val, pd.Series): - raise TypeError( - f"Only data of type pandas.Series can be inserted, passed was {type(val)}" - ) - - # set the itype lazy, i.e. when first non-empty - # column is inserted - if self._itype == "INFER": - if not val.empty: - self._itype = lib.get_itype(val.index) - # cast all pre-inserted empty series - self._cast_all(self._itype, self.cast_policy) - if not self._itype.unique: - _throw_MixedItype_err_or_warn(self._itype) - else: - val = lib.cast_to_itype(val, self.itype, policy=self.cast_policy) - - val.name = col - self._data.at[col] = val.copy(deep=True) - - @overload - def __getitem__(self, key: str | int | slice) -> pd.Series: - ... - - @overload - def __getitem__( - self: D, key: "_DiosBase" | pd.DataFrame | Sequence[str | int] - ) -> D: - ... - - def __getitem__(self, key): - """dios[key] -> dios/series""" - # scalar -> select a column - # slice -> select rows (on all columns) - # bool dios -> select columns, select rows - # mask -> select rows (on all columns) - # list-like -> select columns - - if pdextra.is_scalar(key): - # NOTE: we shallow copy, to prevent changes on the - # index mirror back to us and may mess up the itype. 
- s = self._data.at[key] - s.index = s.index.copy() - return s - - if isinstance(key, slice): - return self._slice(key) - - if _is_dios_like(key): - return self._getitem_bool_dios(key) - - if pdextra.is_bool_indexer(key): - return self._getitem_bool_listlike(key) - - # select columns and let pandas handle it - data = self._data.loc[key] - if self._is_valid_columns_index(data): - return self._constructor( - data=data, itype=self.itype, cast_policy=self.cast_policy, fastpath=True - )._finalize(self) - - raise TypeError(f"cannot index columns with this type, {type(key)}") - - def _slice(self, key): - """slices self, return copy""" - if key == slice(None): - return self.copy() - - new = self.copy_empty(columns=True) - for c, series in self.items(): - new._data.at[c] = series[key] - return new - - def _getitem_bool_dios(self, key): - """Select items by a boolean dios-like drop un-selected indices.""" - - if not _is_bool_dios_like(key): - raise ValueError("Must pass DictOfSeries with boolean values only") - - new = self.copy_empty(columns=True) - for c, series in self.items(): - if c in key: - val = key[c].reindex(index=series.index, fill_value=False) - new._data.at[c] = series.loc[val] - return new - - def _getitem_bool_listlike(self, key): - new = self.copy_empty(columns=True) - for c, series in self.items(): - new._data.at[c] = series.loc[key] - return new - - def __setitem__(self, key, value): - """dios[key] = value""" - key = list(key) if pdextra.is_iterator(key) else key - if isinstance(key, tuple): - raise KeyError(f"{key}. tuples are not allowed") - - elif pdextra.is_hashable(key): - if isinstance(value, pd.Series) or key not in self.columns: - self._insert(key, value) - elif _is_dios_like(value) or pdextra.is_nested_list_like(value): - raise ValueError("Incompatible indexer with multi-dimensional value") - else: - self._data.at[key][:] = value - - else: - data = self.__getitem__(key) - assert isinstance( - data, self.__class__ - ), f"getitem returned data of type {type(data)}" - - # special cases - if _is_dios_like(value): - self._setitem_dios(data, value) - # NOTE: pd.Series also considered list-like - elif pdextra.is_list_like(value): - self._setitem_listlike(data, value) - - # default case - else: - for c, series in data.items(): - series[:] = value - self._data.at[c][series.index] = series - - def _setitem_listlike(self, data, value): - value = value.values if isinstance(value, pd.Series) else value - - if len(value) != len(data.columns): - raise ValueError( - f"array-like value of length {len(value)} could " - f"not be broadcast to indexing result of shape " - f"(.., {len(data.columns)})" - ) - - for i, (c, series) in enumerate(data.items()): - series[:] = value[i] - self._data.at[c][series.index] = series - - def _setitem_dios(self, data, value): - """Write values from a dios-like to self. - - No justification or alignment of columns, but of indices. - If value has missing indices, nan's are inserted at that - locations, just like `series.loc[:]=val` or `df[:]=val` do. - - Eg. 
- di[::2] = di[::3] -> di[::2] - - x | x | x | - ===== | ==== | ====== | - 0 x | 0 z | 0 z | - 2 x | = 3 z | -> 2 NaN | - 4 x | 6 z | 4 NaN | - 6 x | 6 z | - - Parameter - ---------- - data : dios - A maybe trimmed version of self - value : dios, pd.Dataframe - The value to set with the same column dimension like data - """ - - if len(data) != len(value.columns): - raise ValueError( - f"shape mismatch: values array of shape " - f"(.., {len(value.columns)}) could not " - f"be broadcast to indexing result of " - f"shape (.., {len(data.columns)})" - ) - - for i, (c, series) in enumerate(data.items()): - # .loc cannot handle empty series, - # like `emptySeries.loc[:] = [1,2]` - if series.empty: - continue - val = value[value.columns[i]] - series.loc[:] = val - self._data.at[c].loc[series.index] = series - - def __delitem__(self, key): - del self._data[key] - - # ------------------------------------------------------------------------------ - # Base properties and basic dunder magic - - @property - def attrs(self) -> dict[Hashable, Any]: - """ - Dictionary of global attributes of this dataset. - """ - if self._attrs is None: - self._attrs = {} - return self._attrs - - @attrs.setter - def attrs(self, value: Mapping[Hashable, Any]) -> None: - self._attrs = dict(value) - - @property - def columns(self): - """The column labels of the DictOfSeries""" - return self._data.index - - @columns.setter - def columns(self, cols): - index = pd.Index(cols) - if not index.is_unique: - raise ValueError("columns index must have unique values") - self._data.index = index - # rename all columns - for i, s in enumerate(self._data): - s.name = index[i] - - @property - def itype(self): - """The ``Itype`` of the DictOfSeries. - - See :ref:`Itype documentation <doc_itype:Itype>` for more info. - """ - if self._itype == "INFER": - return None - return self._itype - - @itype.setter - def itype(self, itype): - itype = lib.get_itype(itype) - self._cast_all(itype, policy=self.cast_policy) - self._itype = itype - - @property - def cast_policy(self): - """The policy to use for casting new columns if its initial itype does not fit. - - See :ref:`Itype documentation <doc_itype:Itype>` for more info. - """ - return self._policy - - @cast_policy.setter - def cast_policy(self, policy): - if policy not in _CAST_POLICIES: - raise ValueError(f"policy must be one of {_CAST_POLICIES}") - self._policy = policy - - def _cast_all(self, itype, policy): - c = "?" - new = self.copy_empty() - try: - for c, series in self.items(): - new._data.at[c] = lib.cast_to_itype(series, itype, policy=policy) - except Exception as e: - raise type(e)(f"Column {c}: " + str(e)) from e - - def __len__(self): - return len(self.columns) - - @property - def empty(self): - """Indicator whether DictOfSeries is empty. - - Returns - ------- - bool : - If DictOfSeries is empty, return True, if not return False. - - See Also - -------- - DictOfSeries.dropempty : drop empty columns - DictOfSeries.dropna : drop NAN's from a DictOfSeries - pandas.Series.dropna : drop NAN's from a Series - - Notes - ----- - If DictOfSeries contains only NaNs, it is still not considered empty. See the example below. - - Examples - -------- - An example of an actual empty DictOfSeries. - - >>> di_empty = DictOfSeries(columns=['A']) - >>> di_empty - Empty DictOfSeries - Columns: ['A'] - >>> di_empty.empty - True - - If we only have NaNs in our DictOfSeries, it is not considered empty! 
- We will need to drop the NaNs to make the DictOfSeries empty: - - >>> di = DictOfSeries({'A' : [np.nan]}) - >>> di - A | - ===== | - 0 NaN | - >>> di.empty - False - >>> di.dropna().empty - True - """ - return len(self) == 0 or all(s.empty for s in self._data) - - def __iter__(self): - yield from self.columns - - def __reversed__(self): - yield from reversed(self.columns) - - def __contains__(self, item): - return item in self.columns - - def items(self): - yield from self._data.items() - - # ---------------------------------------------------------------------- - # if copy.copy() is copy.copy(): return copy.copy().copy() - - def __deepcopy__(self, memo=None): - return self.copy(deep=True) - - def __copy__(self): - return self.copy(deep=False) - - def copy(self, deep=True): - """Make a copy of this DictOfSeries' indices and data. - - Parameters - ---------- - deep : bool, default True - Make a deep copy, including a copy of the data and the indices. - With deep=False neither the indices nor the data are copied. - - Returns - ------- - copy : DictOfSeries - - See Also - -------- - pandas.DataFrame.copy - """ - data = self._data.copy() # always copy the outer hull series - if deep: - for c, series in self.items(): - data.at[c] = series.copy() - - new = self._constructor( - data=data, itype=self.itype, cast_policy=self.cast_policy, fastpath=True - ) - - copyfunc = deepcopy if deep else shallowcopy - new._attrs = copyfunc(self._attrs) - - return new - - def copy_empty(self, columns=True): - """ - Return a new DictOfSeries object, with the same properties as the original. - - Parameters - ---------- - columns: bool, default True - If ``True``, the copy will have the same, but empty, columns as the original. - - Returns - ------- - DictOfSeries: empty copy - - Examples - -------- - - >>> di = DictOfSeries({'A': range(2), 'B': range(3)}) - >>> di - A | B | - ==== | ==== | - 0 0 | 0 0 | - 1 1 | 1 1 | - | 2 2 | - - >>> empty = di.copy_empty() - >>> empty - Empty DictOfSeries - Columns: ['A', 'B'] - - The properties are the same, e.g. 
- - >>> empty.itype == di.itype - True - >>> empty.cast_policy == di.cast_policy - True - >>> empty.dtypes == di.dtypes - columns - A True - B True - dtype: bool - """ - data = None - if columns is True: # is correct - data = pd.Series(dtype="O", index=self.columns) - for c, series in self.items(): - # OPTIM: the following code is about 2x faster than - # data.at[c] = pd.Series(dtype=self._data.at[c].dtype) - data.at[c] = series.reindex([]) - - return self._constructor( - data=data, itype=self.itype, cast_policy=self.cast_policy, fastpath=True - )._finalize(self) - - # ------------------------------------------------------------------------------ - # Operators - - def _op1(self, op): - new = self.copy_empty(columns=True) - try: - for k, series in self.items(): - new[k] = op(series) - except Exception as e: - raise type(e)(f"'{ops.OP_MAP[op]} dios' failed: " + str(e)) from e - return new - - def _op2_inplace(self, op, other, align=True) -> None: - new = self._op2(op, other, align) - self._data = new._data - - def _op2(self, op, other, align=True) -> "_DiosBase": - def raiseif(kself, kother, s): - if kself != kother: - raise ValueError( - f"{s} does not match, {s} left: {kself}, {s} right: {kother}" - ) - - def doalign(left, right): - return left.align(right, join="inner") if align else (left, right) - - def get_operants(): - if _is_dios_like(other): - raiseif(list(self), list(other), "keys") - for k, series in self.items(): - yield (k, *doalign(series, other[k])) - elif isinstance(other, pd.Series): - for k, series in self.items(): - yield (k, *doalign(series, other)) - elif pdextra.is_dict_like(other): - raiseif(sorted(self), sorted(other), "keys") - for k, series in self.items(): - yield (k, series, other[k]) - elif pdextra.is_nested_list_like(other): - raiseif(len(self), len(other), "length") - for i, (k, series) in enumerate(self.items()): - yield (k, series, other[i]) - elif pdextra.is_scalar(other) or pdextra.is_list_like(other): - for k, series in self.items(): - yield (k, series, other) - else: - raise NotImplementedError - - new = self.copy_empty(columns=True) - try: - for k, ser, oth in get_operants(): - new[k] = op(ser, oth) - except Exception as e: - raise type(e)(f"'dios {ops.OP_MAP[op]} other' failed: " + str(e)) from e - - return new - - # unary - __neg__ = ftools.partialmethod(_op1, op.neg) - __abs__ = ftools.partialmethod(_op1, op.abs) - __invert__ = ftools.partialmethod(_op1, op.inv) - # comparison - __eq__ = ftools.partialmethod(_op2, op.eq, align=False) - __ne__ = ftools.partialmethod(_op2, op.ne, align=False) - __le__ = ftools.partialmethod(_op2, op.le, align=False) - __ge__ = ftools.partialmethod(_op2, op.ge, align=False) - __lt__ = ftools.partialmethod(_op2, op.lt, align=False) - __gt__ = ftools.partialmethod(_op2, op.gt, align=False) - # arithmetic - __add__ = ftools.partialmethod(_op2, op.add) - __sub__ = ftools.partialmethod(_op2, op.sub) - __mul__ = ftools.partialmethod(_op2, op.mul) - __mod__ = ftools.partialmethod(_op2, op.mod) - __truediv__ = ftools.partialmethod(_op2, op.truediv) - __floordiv__ = ftools.partialmethod(_op2, op.floordiv) - __pow__ = ftools.partialmethod(_op2, op.pow) - __iadd__ = ftools.partialmethod(_op2_inplace, op.add) - __isub__ = ftools.partialmethod(_op2_inplace, op.sub) - __imul__ = ftools.partialmethod(_op2_inplace, op.mul) - __imod__ = ftools.partialmethod(_op2_inplace, op.mod) - __itruediv__ = ftools.partialmethod(_op2_inplace, op.truediv) - __ifloordiv__ = ftools.partialmethod(_op2_inplace, op.floordiv) - __ipow__ = 
ftools.partialmethod(_op2_inplace, op.pow) - # bool - __and__ = ftools.partialmethod(_op2, op.and_) - __or__ = ftools.partialmethod(_op2, op.or_) - __xor__ = ftools.partialmethod(_op2, op.xor) - __iand__ = ftools.partialmethod(_op2_inplace, op.and_) - __ior__ = ftools.partialmethod(_op2_inplace, op.or_) - __ixor__ = ftools.partialmethod(_op2_inplace, op.xor) - - # ------------------------------------------------------------------------------ - # Indexer - - @property - def loc(self): - """Access a group of rows and columns by label(s) or a boolean array. - - See :ref:`indexing docs <doc_indexing:Pandas-like indexing>` - """ - return _LocIndexer(self) - - @property - def iloc(self): - """Purely integer-location based indexing for selection by position. - - See :ref:`indexing docs <doc_indexing:Pandas-like indexing>` - """ - return _iLocIndexer(self) - - @property - def aloc(self): - """Access a group of rows and columns by label(s) or a boolean array with automatic alignment of indexers. - - See :ref:`indexing docs <doc_indexing:Special indexer .aloc>` - """ - return _aLocIndexer(self) - - @property - def at(self): - """Access a single value for a row/column label pair. - - See :ref:`indexing docs <doc_indexing:Pandas-like indexing>` - """ - return _AtIndexer(self) - - @property - def iat(self): - """Access a single value for a row/column pair by integer position. - - See :ref:`indexing docs <doc_indexing:Pandas-like indexing>` - """ - return _iAtIndexer(self) - - -def _is_dios_like(obj) -> bool: - # must have columns - # columns is some kind of pd.Index - # iter will iterate through columns - # `a in obj` checks if a is in columns - # obj[key] will give a pd.Series - # obj.squeeze() gives pd.Series if len(obj) == 1 - return isinstance(obj, (_DiosBase, pd.DataFrame)) - - -def _is_bool_series(obj) -> bool: - return isinstance(obj, pd.Series) and obj.dtype == bool - - -def _is_bool_dios_like(obj) -> bool: - if not _is_dios_like(obj): - return False - dtypes = obj.dtypes - if (dtypes == bool).all(): - return True - if (dtypes == "O").any(): - return obj.apply(pdextra.is_bool_indexer).all() - return False - - -# keep this here to prevent cyclic import -from .indexer import _aLocIndexer, _AtIndexer, _iAtIndexer, _iLocIndexer, _LocIndexer diff --git a/dios/dios/dios.py b/dios/dios/dios.py deleted file mode 100644 index 8a5ee9433be9e59d1e81ed4ec9d1c19012a092b1..0000000000000000000000000000000000000000 --- a/dios/dios/dios.py +++ /dev/null @@ -1,1295 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from __future__ import annotations - -import functools as ftools -from typing import Any, Hashable, Mapping - -import numpy as np -import pandas as pd - -from . import pandas_bridge as pdextra -from .base import _DiosBase, _is_dios_like -from .lib import Opts, OptsFields, _find_least_common_itype, dios_options - - -class DictOfSeries(_DiosBase): - """A data frame where every column has its own index. - - DictOfSeries is a collection of pd.Series's which aims to be as similar as possible to - pd.DataFrame. The advantage over pd.DataFrame is that every `column` has its own row-index, - unlike the former, which provides a single row-index for all columns. This solves problems with - unaligned data and data which varies widely in length. - - Indexing with ``di[]``, ``di.loc[]`` and ``di.iloc[]`` should work analogously to these methods - from pd.DataFrame. 
The indexer can be a single label, a slice, a list-like, a boolean list-like, - or a boolean DictOfSeries/pd.DataFrame and can be used to selectively get or set data. - - Parameters - ---------- - data : array-like, Iterable, dict, or scalar value - Contains data stored in Series. - - columns : array-like - Column labels to use for resulting frame. Will default to - RangeIndex(0, 1, 2, ..., n) if no column labels are provided. - - index : Index or array-like - Index to use to reindex every given series during init. Ignored if omitted. - - itype : Itype, pd.Index, Itype-string-repr or type - Every series that is inserted must have an index of this type or any - of this type's subtypes. - If None, the itype is inferred as soon as the first non-empty series is inserted. - - cast_policy : {'save', 'force', 'never'}, default 'save' - Policy used for (down-)casting the index of a series if its type does not match - the ``itype``. - """ - - @property - def _constructor(self) -> type[DictOfSeries]: - """Return the class. Useful for construction in the base class. - An import of DictOfSeries would end up cyclic.""" - return DictOfSeries - - def _construct_like_self(self, **kwargs): - kwargs.setdefault("itype", self.itype) - kwargs.setdefault("cast_policy", self.cast_policy) - return self._constructor(**kwargs)._finalize(self) - - @property - def indexes(self): - """Return pandas.Series with the indexes of all columns.""" - return self.for_each("index") - - def values(self): - """Return a numpy.array of numpy.arrays with the values of all columns. - - The outer array has the length of the columns; each inner array holds the values of a column. - """ - s = self.for_each("values") - return s.values - - @property - def dtypes(self): - """Return pandas.Series with the dtypes of all columns.""" - return self.for_each("dtype") - - @property - def lengths(self): - """Return pandas.Series with the length of all columns.""" - return self._data.apply(len) - - @property - def size(self): - return self.lengths.sum() - - @property - def shape(self): - return tuple(self.lengths), len(self.columns) - - # ------------------------------------------------------------------------------ - # Dict-like methods - - def clear(self): - d = self._data - self._data = pd.Series(dtype=d.dtype, index=type(d.index)([])) - - def get(self, key, default=None): - return self._data.get(key, default) - - # implemented in _BaseClass - # def items(self): - # return self._data.items() - - def keys(self): - return self.columns - - def pop(self, *args): - # We support a default value, like dict, contrary to pd. - # Therefore we need to handle args manually, because dict-style pop() - # differs between a single arg and a tuple-arg, with arg and default, - # where the second arg can be anything, including None. If the key is - # not present, and a single arg is given, a KeyError is raised, but - # with a given default value, it is returned instead. 
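[Reviewer note] The comment above describes builtin-dict semantics; for concreteness, a plain-dict illustration that the implementation following this note mirrors:

```
d = {"a": 1}
print(d.pop("a"))            # -> 1, key removed
print(d.pop("missing", 42))  # -> 42, default returned instead of raising
try:
    d.pop("missing")         # no default -> KeyError
except KeyError as e:
    print("KeyError:", e)
```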
- if len(args) == 0: - raise TypeError("pop expected at least 1 arguments, got 0") - if len(args) > 2: - raise TypeError(f"pop expected at most 2 arguments, got {len(args)}") - key, *rest = args - if key in self.columns: - return self._data.pop(key) - elif rest: - return rest.pop() - raise KeyError(key) - - def popitem(self): - last = self.columns[-1] - return last, self._data.pop(last) - - def setdefault(self, key, default=None): - if key not in self.columns: - self._insert(key, default) - return self._data[key] - - def update(self, other): - if not _is_dios_like(other): - other = to_dios(other) - self.aloc[other, ...] = other - - # ------------------------------------------------------------------------------ - # High-Level Iteration - - def iteritems(self): - yield from self.items() - - def iterrows(self, fill_value=np.nan, squeeze=True): - """ - Iterate over DictOfSeries rows as (index, pandas.Series/DictOfSeries) pairs. - **MAY BE VERY PERFORMANCE AND/OR MEMORY EXPENSIVE** - - Parameters - ---------- - fill_value: scalar, default numpy.nan - Fill value for row entry, if the column does not have an entry - at the current index location. This ensures that the returned - Row always contain all columns. If ``None`` is given no value - is filled. - - If ``fill_value=None`` and ``squeeze=True`` the resulting Row - (a pandas.Series) may differ in length between iterator calls. - That's because an entry, that is not present in a column, will - also not be present in the resulting Row. - - squeeze: bool, default False - * ``True`` : A pandas.Series is returned for each row. - * ``False`` : A single-rowed DictOfSeries is returned for each row. - - Yields - ------ - index : label - The index of the row. - data : Series or DictOfSeries - The data of the row as a Series if squeeze is True, as - a DictOfSeries otherwise. - - See Also - -------- - DictOfSeries.iteritems : Iterate over (column name, Series) pairs. - """ - - # todo: 2nd posibility for fill_value=Any, squeeze=False - # do it like in case fill_value=None -> - # 1. row = aloc the row - # 2. e = row.isempty() - # 3. row.loc[idx,e] = fill_value - # This approach could be much better, because the dtype of - # the columns is preserved. - - # PROBABLY PERFORMANCE EXPENSIVE - if fill_value is None: - allidx = self.index_of("all") - if squeeze: - for i in allidx: - yield i, self.aloc[i:i].dropempty().squeeze(axis=0) - else: - for i in allidx: - yield self.aloc[i:i] - - # PROBABLY MEMORY EXPENSIVE - else: - if fill_value is np.nan: - df = self.to_df() - else: - nans = self.isna().to_df().fillna(False) - df = self.to_df().fillna(fill_value) - df[nans] = np.nan - if squeeze: - yield from df.iterrows() - else: - for idx, row in df.iterrows(): - yield idx, self._constructor( - data=row.to_dict(), index=[idx] - )._finalize(self) - - # ------------------------------------------------------------------------------ - # Broadcasting and Reducing - - def for_each(self, attr_or_callable, **kwds): - """ - Apply a callable or a pandas.Series method or property on each column. - - Parameters - ---------- - attr_or_callable: Any - A pandas.Series attribute or any callable, to apply on each column. - A series attribute can be any property, field or method and also - could be specified as string. If a callable is given it must take - pandas.Series as the only positional argument and return a scalar. - - **kwds: any - kwargs to passed to callable - - Returns - ------- - pandas.Series - A series with the results, indexed by the column labels. 
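[Reviewer note] A condensed sketch of the dispatch rule described above (``resolve`` is an illustrative helper, not part of the package): strings are looked up on pd.Series first; whatever ends up callable is called with the column, everything else is read as a plain attribute.

```
import pandas as pd

def resolve(attr_or_callable, series, **kwds):
    target = attr_or_callable
    if isinstance(target, str):        # e.g. 'max' or 'dtype'
        target = getattr(pd.Series, target)
    if callable(target):               # method or user function
        return target(series, **kwds)
    # non-callable, e.g. a property like 'dtype'
    return getattr(series, attr_or_callable)

s = pd.Series([1, 2, 3])
print(resolve("max", s), resolve(len, s), resolve("dtype", s))
# 3 3 int64
```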
- - Notes - ----- - The called function or the attribute works on the actual underlying series. - If the provided function works inplace it can and will modify the actual data. - If this is not desired one can should make an explicit copy beforehand. If the - function returns new objects or copies, explicit copying is not needed. - - See Also - -------- - DictOfSeries.apply : Apply functions to columns and convert - result to DictOfSeries. - - Examples - -------- - >>> d = DictOfSeries([range(3), range(4)], columns=['a', 'b']) - >>> d - a | b | - ==== | ==== | - 0 0 | 0 0 | - 1 1 | 1 1 | - 2 2 | 2 2 | - | 3 3 | - - Use with a callable.. - - >>> d.for_each(max) - columns - a 2 - b 3 - dtype: object - - ..or with a string, denoting a pd.Series attribute and - therefor is the same as giving the latter. - - >>> d.for_each('max') - columns - a 2 - b 3 - dtype: object - - >>> d.for_each(pd.Series.max) - columns - a 2 - b 3 - dtype: object - - Both also works with properties: - - >>> d.for_each('dtype') - columns - a int64 - b int64 - dtype: object - """ - attrOcall = attr_or_callable - if isinstance(attrOcall, str): - attrOcall = getattr(pd.Series, attrOcall) - call = callable(attrOcall) - if not call: - attrOcall = attr_or_callable - data = pd.Series(dtype="O", index=self.columns) - for c, series in self.items(): - if call: - data.at[c] = attrOcall(series, **kwds) - else: - data.at[c] = getattr(series, attrOcall) - return data - - def apply(self, func, axis=0, raw=False, args=(), **kwds): - """ - Apply a function along an axis of the DictOfSeries. - - Parameters - ---------- - func : callable - Function to apply on each column. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Axis along which the function is applied: - - * 0 or 'index': apply function to each column. - * 1 or 'columns': NOT IMPLEMENTED - - raw : bool, default False - Determines if row or column is passed as a Series or ndarray object: - - * ``False`` : passes each row or column as a Series to the - function. - * ``True`` : the passed function will receive ndarray objects - instead. - If you are just applying a NumPy reduction function this will - achieve much better performance. - - args : tuple - Positional arguments to pass to `func` in addition to the - array/series. - **kwds - Additional keyword arguments to pass as keywords arguments to - `func`. - - Returns - ------- - Series or DataFrame - Result of applying ``func`` along the given axis of the - DataFrame. - - Notes - ----- - The called function or the attribute works on the actual underlying series. - If the provided function works inplace it can and will modify the actual data. - If this is not desired one should make an explicit copy beforehand. If the - function returns new objects or copies, and does not mess with the data, explicit - copying is not needed. - - - Raises - ------ - NotImplementedError - * if axis is 'columns' or 1 - - See Also - -------- - DictOfSeries.for_each: apply pd.Series methods or properties to each column - - Examples - -------- - - We use the example DictOfSeries from :ref:`indexing <doc_indexing:Example dios>`. - - >>> di = di[:5] - a | b | c | d | - ===== | ==== | ===== | ===== | - 0 0 | 2 5 | 4 7 | 6 0 | - 1 7 | 3 6 | 5 17 | 7 1 | - 2 14 | 4 7 | 6 27 | 8 2 | - 3 21 | 5 8 | 7 37 | 9 3 | - 4 28 | 6 9 | 8 47 | 10 4 | - - >>> di.apply(max) - columns - a 28 - b 9 - c 47 - d 4 - dtype: int64 - - >>> di.apply(pd.Series.count) - columns - a 5 - b 5 - c 5 - d 5 - dtype: int64 - - One can pass keyword arguments directly.. 
- - >>> di.apply(pd.Series.value_counts, normalize=True) - a | b | c | d | - ======= | ====== | ======= | ====== | - 7 0.2 | 7 0.2 | 7 0.2 | 4 0.2 | - 14 0.2 | 6 0.2 | 37 0.2 | 3 0.2 | - 21 0.2 | 5 0.2 | 47 0.2 | 2 0.2 | - 28 0.2 | 9 0.2 | 27 0.2 | 1 0.2 | - 0 0.2 | 8 0.2 | 17 0.2 | 0 0.2 | - - Or define your own function.. - - >>> di.apply(lambda s : 'high' if max(s) > 10 else 'low') - columns - a high - b low - c high - d low - dtype: object - - More advanced functions that return a list-like can also be given. Note that - the returned lists do not necessarily have the same length. - - >>> func = lambda s : ('high', max(s), min(s)) if min(s) > (max(s)//2) else ('low',max(s)) - >>> di.apply(func) - a | b | c | d | - ====== | ======= | ====== | ====== | - 0 low | 0 high | 0 low | 0 low | - 1 28 | 1 9 | 1 47 | 1 4 | - | 2 5 | | | - """ - if axis in [1, "columns"]: - raise NotImplementedError - - if axis not in [0, "index"]: - raise ValueError(axis) - - # we cannot use self._data.apply(func=func, args=args, **kwds) - # because this may return a pandas.DataFrame. Also we cannot - # use pandas.Series.apply(), because this works on its values. - need_dios = need_convert = False - result = pd.Series(dtype="O", index=self.columns) - for c, series in self.items(): - series = series.values if raw else series - s = func(series, *args, **kwds) - result.at[c] = s - if pdextra.is_scalar(s): - need_convert = True - else: - need_dios = True - if not isinstance(s, pd.Series): - need_convert = True - if need_dios: - if need_convert: - for c, val in result.items(): - result.at[c] = pd.Series(val) - itype = _find_least_common_itype(result) - result = self._constructor(data=result, itype=itype, fastpath=True) - result._finalize(self) - - return result - - def reduce_columns(self, func, initial=None, skipna=False): - """ - Reduce all columns to a single pandas.Series by a given function. - - Apply a function of two pandas.Series as arguments, cumulatively to all - columns, from left to right, so as to reduce the columns to a single - pandas.Series. If initial is present, it is placed before the columns - in the calculation, and serves as a default when the columns are empty. - - Parameters - ---------- - func : function - The function must take two identically indexed pandas.Series and should - return a single pandas.Series with the same index. - - initial : column-label or pd.Series, default None - The series to start with. If None a dummy series is created, with the - indices of all columns and the first seen values. - - skipna : bool, default False - If True, skip NaN values. - - Returns - ------- - pandas.Series - A series with the reducing result and the index of the start series, - defined by ``initial``. 
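[Reviewer note] For orientation, a minimal plain-pandas sketch of the left fold described above, with addition standing in for ``func`` (names are illustrative):

```
import functools
import pandas as pd

cols = [pd.Series([1, 2], index=[0, 1]), pd.Series([3, 4], index=[1, 2])]
initial = pd.Series(0, index=[0, 1, 2])

def step(acc, col):
    # apply func only on the index intersection, as reduce_columns does
    idx = acc.index.intersection(col.index)
    acc = acc.copy()
    acc.loc[idx] = acc.loc[idx] + col.loc[idx]
    return acc

print(functools.reduce(step, cols, initial))
# 0    1
# 1    5
# 2    4
```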
- """ - if initial is None: - value = pd.Series(index=self.index_of("all")) - for d in self._data: - value = value.combine_first(d) - elif isinstance(initial, pd.Series): - value = initial.copy() - elif initial in self.columns: - value = self._data.at[initial].copy() - else: - raise ValueError("initial must be pd.Series, a column label or None") - - if skipna: - val = value.dropna() - data = self.dropna()._data - else: - val = value - data = self._data - - for d in data: - idx = val.index & d.index - if len(idx) > 0: - l, r = val.loc[idx], d.loc[idx] - val.loc[idx] = func(l, r) - - if skipna: - value.loc[val.index] = val - return value - - # ------------------------------------------------------------------------------ - # Merging and Joining - - def combine_first(self, other, keepna=False): - """ - Update null elements with value in the same location in other. - - Combine two DictOfSeries objects by filling null values in one DictOfSeries with - non-null values from other DictOfSeries. The row and column indexes of the resulting - DictOfSeries will be the union of the two. - - Parameters - ---------- - keepna : bool, default False - By default Nan's are updated by other and new value-index pairs from other are - inserted. If set to True, NaN's are not updated and only new value-index pair are inserted. - - other : DictOfSeries - Provided DictOfSeries to use to fill null values. - - Returns - ------- - DictOfSeries - """ - if keepna: - nans = self.isna() - - new: DictOfSeries = self.copy() - for c in other.columns: - if c in self.columns: - col = self._data.at[c].combine_first(other[c]) - else: - col = other[c] - new._data.at[c] = col - - if keepna: - new.aloc[nans] = np.nan - - return new - - # ------------------------------------------------------------------------------ - # Misc methods - - def index_of(self, method="all"): - """Return an single index with indices from all columns. - - Parameters - ---------- - method : string, default 'all' - * 'all' : get all indices from all columns - * 'union' : alias for 'all' - * 'shared' : get indices that are present in every columns - * 'intersection' : alias for 'shared' - * 'uniques' : get indices that are only present in a single column - * 'non-uniques' : get indices that are present in more than one column - - Returns - ------- - pd.Index - A single duplicate-free index, somehow representing indices of all columns. - - Examples - -------- - We use the example DictOfSeries from :ref:`indexing <doc_indexing:Example dios>`. 
- - >>> di - a | b | c | d | - ===== | ====== | ====== | ===== | - 0 0 | 2 5 | 4 7 | 6 0 | - 1 7 | 3 6 | 5 17 | 7 1 | - 2 14 | 4 7 | 6 27 | 8 2 | - 3 21 | 5 8 | 7 37 | 9 3 | - 4 28 | 6 9 | 8 47 | 10 4 | - 5 35 | 7 10 | 9 57 | 11 5 | - 6 42 | 8 11 | 10 67 | 12 6 | - 7 49 | 9 12 | 11 77 | 13 7 | - 8 56 | 10 13 | 12 87 | 14 8 | - 9 63 | 11 14 | 13 97 | 15 9 | - - >>> di.index_of() - RangeIndex(start=0, stop=16, step=1) - - >>> di.index_of("shared") - Int64Index([6, 7, 8, 9], dtype='int64') - - >>> di.index_of("uniques") - Int64Index([0, 1, 14, 15], dtype='int64') - """ - indexes = self.indexes - if len(indexes) <= 1: - return indexes.squeeze() - - if method in ["union", "all"]: - res = ftools.reduce(pd.Index.union, indexes) - elif method in ["intersection", "shared"]: - res = ftools.reduce(pd.Index.intersection, indexes) - elif method in ["uniques", "non-uniques"]: - res = ftools.reduce(pd.Index.append, indexes) - res = res.value_counts(sort=False, dropna=False) - if method == "uniques": - res = res[res == 1].index - else: - res = res[res > 1].index - else: - raise ValueError(method) - return res if res.is_unique else res.unique() - - def squeeze(self, axis=None): - """Squeeze 1-dimensional axis objects into scalars.""" - if axis in [0, "index"]: - if (self.lengths == 1).all(): - return self._data.apply(pd.Series.squeeze) - return self - elif axis in [1, "columns"]: - if len(self) == 1: - return self._data.squeeze() - return self - elif axis is None: - if len(self) == 1: - return self._data.squeeze().squeeze() - if (self.lengths == 1).all(): - return self._data.apply(pd.Series.squeeze).squeeze() - return self - raise ValueError(axis) - - def dropna(self, inplace=False): - """Drop NaN values from every column. Return a new DictOfSeries, or None if ``inplace=True``.""" - data = self.for_each("dropna", inplace=inplace) - if inplace: - return - return self._construct_like_self(data=data, fastpath=True) - - def dropempty(self): - """Drop empty columns. Return copy.""" - return self.loc[:, self.notempty()] - - def astype(self, dtype, copy=True, errors="raise"): - """Cast the data to the given data type.""" - data = self.for_each("astype", dtype=dtype, copy=copy, errors=errors) - return self._construct_like_self(data=data, fastpath=True) - - def _mask_or_where(self, cond, other=np.nan, inplace=False, mask=True): - """helper to mask/where""" - data = self if inplace else self.copy() - - if callable(other): - other = other(data) - - if callable(cond): - cond = cond(data) - # whether a DictOfSeries holds only booleans - # is already checked in aloc - elif not _is_dios_like(cond): - if not pdextra.is_bool_indexer(cond): - raise ValueError( - "Object with boolean values only expected as condition" - ) - - if mask: - data.aloc[cond] = other - else: - data.aloc[~cond] = other - - if inplace: - return None - return data - - def where(self, cond, other=np.nan, inplace=False): - """ - Replace values where the condition is False. - - Parameters - ---------- - cond : bool DictOfSeries, Series, array-like, or callable - Where cond is True, keep the original value. Where False, replace - with corresponding value from other. If cond is callable, it is computed - on the DictOfSeries and should return boolean DictOfSeries or array. - The callable must not change input DictOfSeries (though dios doesn’t check it). - If cond is a bool Series, every column is (row-)aligned against it, before the - boolean values are evaluated. Missing indices are treated like False values. 
- - other : scalar, Series, DictOfSeries, or callable - Entries where cond is False are replaced with corresponding value from other. - If other is callable, it is computed on the DictOfSeries and should return scalar - or DictOfSeries. The callable must not change input DictOfSeries (though dios doesn’t check it). - If other is a Series, every column is (row-)aligned against it, before the values - are written. NAN's are written for missing indices. - - inplace : bool, default False - Whether to perform the operation in place on the data. - - Returns - ------- - DictOfSeries - - See Also - -------- - mask: Mask data where condition is True - """ - return self._mask_or_where(cond=cond, other=other, inplace=inplace, mask=False) - - def mask(self, cond, other=np.nan, inplace=False): - """ - Replace values where the condition is True. - - Parameters - ---------- - cond : bool DictOfSeries, Series, array-like, or callable - Where cond is False, keep the original value. Where True, replace - with corresponding value from other. If cond is callable, it is computed - on the DictOfSeries and should return boolean DictOfSeries or array. - The callable must not change input DictOfSeries (though dios doesn’t check it). - If cond is a bool Series, every column is (row-)aligned against it, before the - boolean values are evaluated. Missing indices are treated like False values. - - other : scalar, Series, DictOfSeries, or callable - Entries where cond is True are replaced with corresponding value from other. - If other is callable, it is computed on the DictOfSeries and should return scalar - or DictOfSeries. The callable must not change input DictOfSeries (though dios doesn’t check it). - If other is a Series, every column is (row-)aligned against it, before the values - are written. NAN's are written for missing indices. - - inplace : bool, default False - Whether to perform the operation in place on the data. - - Returns - ------- - DictOfSeries - - See Also - -------- - mask: Mask data where condition is False - """ - return self._mask_or_where(cond=cond, other=other, inplace=inplace, mask=True) - - def memory_usage(self, index=True, deep=False): - return self.for_each(pd.Series.memory_usage, index=index, deep=deep).sum() - - def to_df(self, how="outer"): - """ - Transform DictOfSeries to a pandas.DataFrame. - - Because a pandas.DataFrame can not handle Series of different - length, but DictOfSeries can, the missing data is filled with - NaNs or is dropped, depending on the keyword `how`. - - Parameters - ---------- - how: {'outer', 'inner'}, default 'outer' - define how the resulting DataFrame index is generated: - * 'outer': The indices of all columns, merged into one index is used. - If a column misses values at the new index location, `NaN`s are filled. - * 'inner': Only indices that are present in all columns are used, filling - logic is not needed, but values are dropped, if a column has indices - that are not known to all other columns. 
- - Returns - ------- - pandas.DataFrame: transformed data - - Examples - -------- - - Missing data locations are filled with NaN's - - >>> a = pd.Series(11, index=range(2)) - >>> b = pd.Series(22, index=range(3)) - >>> c = pd.Series(33, index=range(1,9,3)) - >>> di = DictOfSeries(dict(a=a, b=b, c=c)) - >>> di - a | b | c | - ===== | ===== | ===== | - 0 11 | 0 22 | 1 33 | - 1 11 | 1 22 | 4 33 | - | 2 22 | 7 33 | - - >>> di.to_df() - columns a b c - 0 11.0 22.0 NaN - 1 11.0 22.0 33.0 - 2 NaN 22.0 NaN - 4 NaN NaN 33.0 - 7 NaN NaN 33.0 - - or is dropped if `how='inner'` - - >>> di.to_df(how='inner') - columns a b c - 1 11 22 33 - """ - if how == "inner": - how = "shared" - elif how == "outer": - how = "all" - else: - raise ValueError(how) - - index = self.index_of(how) - df = pd.DataFrame(columns=self.columns, index=index) - for c, series in self.items(): - # this automatically respects the df-index, that - # was set before. Missing locations are already - # nans, present locations are set. - df[c] = series.copy() - - df.attrs = self.attrs - return df - - @property - def debugDf(self): - """Alias for ``to_df()`` as property, for debugging purpose.""" - return self.to_df() - - def min(self, axis=0, skipna=True): - if axis is None: - return self.for_each(pd.Series.min, skipna=skipna).min() - if axis in [0, "index"]: - return self.for_each(pd.Series.min, skipna=skipna) - if axis in [1, "columns"]: - func = lambda s1, s2: s1.where(s1 < s2, s2) - return self.reduce_columns(func, skipna=skipna) - raise ValueError(axis) - - def max(self, axis=0, skipna=None): - if axis is None: - return self.for_each(pd.Series.max, skipna=skipna).max() - if axis in [0, "index"]: - return self.for_each(pd.Series.max, skipna=skipna) - if axis in [1, "columns"]: - func = lambda s1, s2: s1.where(s1 > s2, s2) - return self.reduce_columns(func, skipna=skipna) - raise ValueError(axis) - - # ---------------------------------------------------------------------- - # Boolean and empty stuff - - def equals(self, other): - """ - Test whether two DictOfSeries contain the same elements. - - This function allows two DictOfSeries to be compared against each other to see - if they have the same shape and elements. NaNs in the same location are considered equal. - The column headers do not need to have the same type, but the elements within the columns - must be the same dtype. - - Parameters - ---------- - other: DictOfSeries - The other DictOfSeries to compare with. - - Returns - ------- - bool - True if all elements are the same in both DictOfSeries, False otherwise. - """ - if not isinstance(other, _DiosBase): - return False - try: - eq_nans = (self.isna() == other.isna()).all(None) - eq_data = (self.dropna() == other.dropna()).all(None) - eq_dtypes = (self.dtypes == other.dtypes).all() - return eq_nans and eq_dtypes and eq_data - except Exception: - return False - - def isin(self, values): - """Return a boolean dios, that indicates if the corresponding value is in the given array-like.""" - data = self.for_each("isin", values=values) - return self._construct_like_self(data=data, fastpath=True) - - def all(self, axis=0): - """ - Return whether all elements are True, potentially over an axis. - - Returns True unless there at least one element within a series - or along a DictOfSeries axis that is False or equivalent (e.g. zero or empty). - - Parameters - ---------- - axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0 - Indicate which axis or axes should be reduced. 
* 0 / ‘index’ : reduce the index, return a Series whose index is the original column labels. - * 1 / ‘columns’ : reduce the columns, return a Series whose index is the union of all column indexes. - * None : reduce all axes, return a scalar. - - Returns - ------- - pandas.Series - - See Also - -------- - pandas.Series.all: Return True if all elements are True. - any: Return True if one (or more) elements are True. - """ - if axis is None: - return self._data.apply(all).all() - if axis in [0, "index"]: - return self._data.apply(all) - if axis in [1, "columns"]: - func = lambda s1, s2: s1.astype(bool) & s2.astype(bool) - init = pd.Series(True, dtype=bool, index=self.index_of("all")) - return self.reduce_columns(func, init) - raise ValueError(axis) - - def any(self, axis=0): - """ - Return whether any element is True, potentially over an axis. - - Returns False unless there is at least one element within a series - or along a DictOfSeries axis that is True or equivalent (e.g. non-zero or non-empty). - - Parameters - ---------- - axis : {0 or ‘index’, 1 or ‘columns’, None}, default 0 - Indicate which axis or axes should be reduced. - * 0 / ‘index’ : reduce the index, return a Series whose index is the original column labels. - * 1 / ‘columns’ : reduce the columns, return a Series whose index is the union of all column indexes. - * None : reduce all axes, return a scalar. - - Returns - ------- - pandas.Series - - See Also - -------- - pandas.Series.any: Return whether any element is True. - all: Return True if all elements are True. - """ - if axis is None: - return self._data.apply(any).any() - if axis in [0, "index"]: - return self._data.apply(any) - if axis in [1, "columns"]: - func = lambda s1, s2: s1.astype(bool) | s2.astype(bool) - init = pd.Series(False, dtype=bool, index=self.index_of("all")) - return self.reduce_columns(func, init) - raise ValueError(axis) - - def isna(self, drop_empty=False): - """ - Return a boolean DictOfSeries which indicates NA positions. - """ - data = self.dropempty() if drop_empty else self - data = data.for_each("isna") - return self._construct_like_self(data=data, fastpath=True) - - def notna(self, drop_empty=False): - """ - Return a boolean DictOfSeries which indicates non-NA positions. - """ - data = self.dropempty() if drop_empty else self - data = data.for_each("notna") - return self._construct_like_self(data=data, fastpath=True) - - def hasnans(self, axis=0, drop_empty=False): - """ - Returns a boolean Series along an axis, which indicates if it contains NA-entries. 
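[Reviewer note] The axis=0 case simply delegates to pandas' ``Series.hasnans`` per column; a plain-pandas equivalent of that path (illustrative only):

```
import numpy as np
import pandas as pd

cols = {"a": pd.Series([1.0, np.nan]), "b": pd.Series([2.0, 3.0])}
print(pd.Series({k: s.hasnans for k, s in cols.items()}))
# a     True
# b    False
```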
- """ - data = self.dropempty() if drop_empty else self - if axis is None: - return data.for_each("hasnans").any() - if axis in [0, "index"]: - return data.for_each("hasnans") - if axis in [1, "columns"]: - func = lambda s1, s2: s1.isna() | s2.isna() - init = pd.Series(False, dtype=bool, index=self.index_of("all")) - return data.reduce_columns(func, init) - raise ValueError(axis) - - def fillna( - self, - value=None, - method=None, - axis=None, - inplace=False, - limit=None, - downcast=None, - ): - if axis in [None, 0, "index"]: - kws = dict(value=value, method=method, limit=limit, downcast=downcast) - data = self.for_each("fillna", inplace=inplace, **kws) - if inplace: - return - return self._construct_like_self(data=data, fastpath=True) - - if axis in [1, "columns"]: - raise NotImplementedError - raise ValueError(axis) - - def isempty(self): - """Returns a boolean Series, which indicates if an column is empty""" - return self.for_each("empty").astype(bool) - - def notempty(self): - """Returns a boolean Series, which indicates if an column is not empty""" - return ~self.isempty() - - def isdata(self): - """Alias for ``notna(drop_empty=True)``.""" - return self.notna(drop_empty=True) - - def isnull(self, drop_empty=False): - """Alias for ``isna()``""" - return self.isna(drop_empty=drop_empty) - - def notnull(self, drop_empty=False): - """Alias, see ``notna()``.""" - return self.notna(drop_empty=drop_empty) - - def to_dios(self): - """ - A dummy to allow unconditional to_dios calls - on pd.DataFrame, pd.Series and dios.DictOfSeries - """ - return self - - # ---------------------------------------------------------------------- - # Rendering Methods - - def __str__(self): - return self.__repr__() - - def __repr__(self): - repr = dios_options[OptsFields.dios_repr] - showdim = self.lengths.max() > dios_options[OptsFields.disp_max_rows] - return self.to_string(method=repr, show_dimensions=showdim) - - def to_string( - self, - max_rows=None, - min_rows=None, - max_cols=None, - na_rep="NaN", - show_dimensions=False, - method=Opts.repr_indexed, - no_value=" ", - empty_series_rep="no data", - col_delim=" | ", - header_delim="=", - col_space=None, - ): - """Pretty print a dios. - - if `method` == `indexed` (default): - every column is represented by a own index and corresponding values - - if `method` == `aligned` [2]: - one(!) global index is generated and values from a column appear at - the corresponding index-location. - - Parameters - --------- - - max_cols : - not more column than `max_cols` are printed [1] - - max_rows : - see `min_rows` [1] - - min_rows : - not more rows than `min_rows` are printed, if rows of any series exceed `max_rows` [1] - - na_rep : - all NaN-values are replaced by `na_rep`. Default `NaN` - - empty_series_rep : - Ignored if not `method='indexed'`. - Empty series are represented by the string in `empty_series_rep` - - col_delim : str - Ignored if not `method='indexed'`. - between all columns `col_delim` is inserted. - - header_delim : - Ignored if not `method='indexed'`. - between the column names (header) and the data, `header_delim` is inserted, - if not None. The string is repeated, up to the width of the column. (str or None). - - no_value : - Ignored if not `method='aligned'`. - value that indicates, that no entry in the underling series is present. Bear in mind - that this should differ from `na_rep`, otherwise you cannot differ missing- from NaN- values. 
- - Notes - ----- - [1]: defaults to the corresponding value in `dios_options` - [2]: the common-params are directly passed to pd.DataFrame.to_string(..) - under the hood, if method is `aligned` - - """ - if self.empty: - return _empty_repr(self) - - max_cols = max_cols or dios_options[OptsFields.disp_max_cols] or 100 - max_rows = max_rows or dios_options[OptsFields.disp_max_rows] or 200 - min_rows = min_rows or dios_options[OptsFields.disp_min_rows] or 100 - - kwargs = dict( - max_rows=max_rows, - min_rows=min_rows, - max_cols=max_cols, - na_rep=na_rep, - col_space=col_space, - show_dimensions=show_dimensions, - ) - - if method == Opts.repr_aligned: - return _to_aligned_df(self, no_value=no_value).to_string(**kwargs) - - # add pprint relevant options - kwargs.update( - empty_series_rep=empty_series_rep, - col_delim=col_delim, - header_delim=header_delim, - ) - - return pprint_dios(self, **kwargs) - - def to_csv(self, *args, **kwargs): - self.to_df().to_csv(*args, **kwargs) - - to_csv.__doc__ = pd.DataFrame.to_csv.__doc__ - - -def _empty_repr(di): - return f"Empty DictOfSeries\n" f"Columns: {di.columns.to_list()}" - - -def pprint_dios( - dios, - max_rows=None, - min_rows=None, - max_cols=None, - na_rep="NaN", - empty_series_rep="no data", - col_space=None, - show_dimensions=True, - col_delim=" | ", - header_delim="=", -): - na_rep = str(na_rep) - empty_series_rep = str(empty_series_rep) - col_delim = col_delim or " " - - min_rows = min(max_rows, min_rows) - - if dios.empty: - return _empty_repr(dios) - - maxlen = dios.lengths.max() - data = dios._data - - trunc_cols = len(data) > max_cols - if trunc_cols: - left, right = data.head(max_cols // 2), data.tail(max_cols // 2) - data = left.append(right) - - # now data only contains series that we want to print. - - # if any series exceed max_rows we trim all series to min_rows - series_lengths = data.apply(len).to_list() - series_maxlen = max(series_lengths) - trunc_rows = series_maxlen > max_rows - max_rows = min_rows if trunc_rows else series_maxlen - - # we make a list of list, where the inner contains all - # stringified values of the series upto max_rows+1, where - # the additional row is the column-name - outer = [] - for i, colname in enumerate(data.index): - # use iat instead of at, see #GL391 - s = data.iat[i] - - isempty = s.empty - if isempty: - s = pd.Series(empty_series_rep) - idx = False - cspace = col_space - else: - idx = True - cspace = col_space // 2 if col_space else col_space - - sstr = s.to_frame().to_string( - col_space=cspace, - header=[str(colname)], - index=idx, - na_rep=na_rep, - max_rows=max_rows, - min_rows=min_rows, - ) - li = sstr.split("\n") - - # HACK: empty series produce a unnecessary space, - # because index is omitted - if isempty: - cstr, vstr = li - if len(cstr.lstrip()) < len(vstr) and (cspace or 0) < len(vstr): - li = [cstr[1:], vstr[1:]] - - outer.append(li) - - # now the length of every value-string per series are the same. - # we need this length's to know, how many chars we need to fill, - # once we exceed the length of the series, or if we insert whole - # columns. - valstr_len = [len(c[0]) for c in outer] - - rows = max_rows + 1 # colnames aka. header - rows += 1 if trunc_rows else 0 # `...` in rows - rows += 1 if header_delim else 0 # underline header - - if header_delim: - for i, c in enumerate(outer): - colheader = (header_delim * valstr_len[i])[: valstr_len[i]] - c.insert(1, colheader) - - dots = " ... 
" - if trunc_cols: - outer.insert(max_cols // 2, [dots] * rows) - valstr_len.insert(max_cols // 2, len(dots)) - series_lengths.insert(max_cols // 2, rows) - - txt = "" - for r in range(rows): - for i, c in enumerate(outer): - try: - vstr = c[r] - except IndexError: - vstr = " " * valstr_len[i] - txt += vstr + col_delim - txt += "\n" - - # add footer - if show_dimensions: - for i, c in enumerate(outer): - # ignore the dot-column - if trunc_cols and i == max_cols // 2: - txt += dots + " " * len(col_delim) - else: - txt += f"[{series_lengths[i]}]".ljust(valstr_len[i] + len(col_delim)) - - txt += f"\n\nmax: [{maxlen} rows x {len(dios.columns)} columns]" - txt += "\n" - - return txt - - -def _to_aligned_df(dios, no_value=" "): - if dios.empty: - return pd.DataFrame(columns=dios.columns) - - # keep track of all real nans - nandict = {} - for c in dios: - nans = dios[c].isna() - nandict[c] = nans[nans].index - - df = dios.to_df() - df[df.isna()] = no_value - - # reinsert all real nans - for c in df: - df.loc[nandict[c], c] = np.nan - - return df - - -def to_dios(obj) -> DictOfSeries: - """try cast obj to DictOfSeries.""" - if isinstance(obj, DictOfSeries): - return obj - return DictOfSeries(data=obj) - - -def __monkey_patch_pandas(): - def to_dios(self): - return DictOfSeries(data=self) - - pd.Series.to_dios = to_dios - pd.DataFrame.to_dios = to_dios - - -__monkey_patch_pandas() diff --git a/dios/dios/indexer.py b/dios/dios/indexer.py deleted file mode 100644 index cbf0ffa5631f74048e8994875532c49627263d3e..0000000000000000000000000000000000000000 --- a/dios/dios/indexer.py +++ /dev/null @@ -1,483 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import numpy as np -import pandas as pd - -from . import pandas_bridge as pdextra -from .base import _DiosBase, _is_bool_dios_like, _is_dios_like - - -class _Indexer: - def __init__(self, obj: _DiosBase): - self.obj = obj - self._data = obj._data - - def _unpack_key(self, key): - key = list(key) if pdextra.is_iterator(key) else key - - if isinstance(key, tuple): - if len(key) > 2: - raise KeyError("To many indexers") - rowkey, colkey = key - else: - rowkey, colkey = key, slice(None) - - if isinstance(rowkey, tuple) or isinstance(colkey, tuple): - raise KeyError(f"{key}. tuples are not allowed.") - - rowkey = list(rowkey) if pdextra.is_iterator(rowkey) else rowkey - colkey = list(colkey) if pdextra.is_iterator(colkey) else colkey - return rowkey, colkey - - def _set_value_muli_column(self, rowkey, colkey, value, xloc="loc"): - """set value helper for loc and iloc""" - - data = getattr(self._data, xloc)[colkey] - - hashable_rkey = pdextra.is_hashable(rowkey) - dioslike_value = False - iter_value = False - - if _is_dios_like(value): - dioslike_value = True - if hashable_rkey: - raise ValueError(f"Incompatible indexer with DictOfSeries") - - elif pdextra.is_list_like(value): - value = value.values if isinstance(value, pd.Series) else value - iter_value = True - if len(value) != len(data): - raise ValueError( - f"shape mismatch: value array of shape (.., {len(value)}) could " - f"not be broadcast to indexing result of shape (.., {len(data)})" - ) - c = "?" 
- try: - for i, c in enumerate(data.index): - dat = data.at[c] - dat_xloc = getattr(dat, xloc) - - if dioslike_value: - # set to empty series fail; emptySer.loc[:] = [2,1] - # len(scalar) -> would fail, but cannot happen, - # because dioslike+hashable, already was checked - if len(dat_xloc[rowkey]) == 0: - continue - - # unpack the value if necessary - if iter_value: - val = value[i] - elif dioslike_value: - val = value[c] if c in value else np.nan - else: - val = value - - dat_xloc[rowkey] = val - - except Exception as e: - raise type(e)(f"failed for column {c}: " + str(e)) from e - - -# ############################################################################# - - -class _LocIndexer(_Indexer): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __getitem__(self, key): - rowkey, colkey = self._unpack_key(key) - if _is_dios_like(rowkey) or _is_dios_like(colkey): - raise ValueError("Could not index with multidimensional key") - - # simple optimisation - if pdextra.is_null_slice(rowkey) and pdextra.is_null_slice(colkey): - return self.obj.copy() - - data = self._data.loc[colkey].copy() - - # .loc[any, scalar] -> (a single) series - # .loc[scalar, scalar] -> (a single) value - if pdextra.is_hashable(colkey): - new = data.loc[rowkey] - - # .loc[any, non-scalar] - else: - k = "?" - try: - for k in data.index: - data.at[k] = data.at[k].loc[rowkey] - - except Exception as e: - raise type(e)(f"failed for column {k}: " + str(e)) from e - - # .loc[scalar, non-scalar] -> column-indexed series - if pdextra.is_hashable(rowkey): - new = data - - # .loc[non-scalar, non-scalar] -> dios - else: - new = self.obj.copy_empty(columns=False) - new._data = data - - return new - - def __setitem__(self, key, value): - rowkey, colkey = self._unpack_key(key) - if _is_dios_like(rowkey) or _is_dios_like(colkey): - raise ValueError("Cannot index with multi-dimensional key") - - # .loc[any, scalar] - set on single column - if pdextra.is_hashable(colkey): - # .loc[dont-care, new-scalar] = val - if colkey not in self.obj.columns: - self.obj._insert(colkey, value) - - # .loc[any, scalar] = multi-dim - elif _is_dios_like(value) or pdextra.is_nested_list_like(value): - raise ValueError("Incompatible indexer with multi-dimensional value") - - # .loc[any, scalar] = val - else: - self._data.at[colkey].loc[rowkey] = value - - # .loc[any, non-scalar] = any - else: - self._set_value_muli_column(rowkey, colkey, value, xloc="loc") - - -# ############################################################################# - - -class _iLocIndexer(_Indexer): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __getitem__(self, key): - rowkey, colkey = self._unpack_key(key) - if _is_dios_like(rowkey) or _is_dios_like(colkey): - raise ValueError("Cannot index with multidimensional key") - - # simple optimisation - if pdextra.is_null_slice(rowkey) and pdextra.is_null_slice(colkey): - return self.obj.copy() - - data = self._data.iloc[colkey].copy() - - # .iloc[any, int] -> single series - # .iloc[int, int] -> single value - if pdextra.is_integer(colkey): - new = data.iloc[rowkey] - - # .iloc[any, non-int] - else: - k = "?" 
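-            # apply the positional row key to each selected column separately;
-            # `k` keeps the current column label for the error message below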
- try: - for k in data.index: - data.at[k] = data.at[k].iloc[rowkey] - - except Exception as e: - raise type(e)(f"failed for column {k}: " + str(e)) from e - - # .iloc[int, non-int] -> column-indexed series - if pdextra.is_integer(rowkey): - new = data - - # .iloc[non-int, non-int] -> dios - else: - new = self.obj.copy_empty(columns=False) - new._data = data - - return new - - def __setitem__(self, key, value): - rowkey, colkey = self._unpack_key(key) - if _is_dios_like(rowkey) or _is_dios_like(colkey): - raise ValueError("Cannot index with multidimensional key") - - # .iloc[any, int] = Any - if pdextra.is_integer(colkey): - if _is_dios_like(value) or pdextra.is_nested_list_like(value): - raise ValueError("Incompatible indexer with multi-dimensional value") - self._data.iat[colkey].iloc[rowkey] = value - - # .iloc[any, non-int] = Any - else: - self._set_value_muli_column(rowkey, colkey, value, xloc="iloc") - - -# ############################################################################# - - -class _aLocIndexer(_Indexer): - """align Indexer - - Automatically align (alignable) indexer on all possible axis, - and handle indexing with non-existent or missing keys gracefully. - - Also align (alignable) values before setting them with .loc - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._usebool = True - - def __call__(self, usebool=True): - """We are called if the user want to set `usebool=False', which make - boolean alignable indexer treat as non-boolean alignable indexer. - - Explanation: A boolean dios indexer align its indices with the indices - of the receiving dios like a non-boolean dios indexer also would do. - Additionally all rows with False values are kicked too. To disable - that `usebool=False` can be given.""" - self._usebool = usebool - return self - - def __getitem__(self, key): - rowkeys, colkeys, lowdim = self._unpack_key_aloc(key) - data = pd.Series(dtype="O", index=colkeys) - kws = dict(itype=self.obj.itype, cast_policy=self.obj._policy) - - c = "?" - try: - for i, c in enumerate(data.index): - data.at[c] = self._data.at[c].loc[rowkeys[i]] - - except Exception as e: - raise type(e)(f"failed for column {c}: " + str(e)) from e - - if lowdim: - return data.squeeze() - else: - return self.obj._constructor(data=data, fastpath=True, **kws)._finalize( - self.obj - ) - - def __setitem__(self, key, value): - rowkeys, colkeys, _ = self._unpack_key_aloc(key) - - def iter_self(colkeys, position=False): - c = "?" 
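-            # yield only columns where the row key selects something;
-            # columns with an empty selection are skipped silently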
- try: - for i, c in enumerate(colkeys): - dat = self._data.at[c] - rk = rowkeys[i] - if len(dat.loc[rk]) == 0: - continue - yield dat, rk, i if position else c - - except Exception as e: - raise type(e)(f"failed for column {c}: " + str(e)) from e - - # align columns, for rows use series.loc to align - if _is_dios_like(value): - colkeys = value.columns.intersection(colkeys) - for dat, rk, c in iter_self(colkeys): - dat.loc[rk] = value[c] - - # no align, no merci - elif pdextra.is_nested_list_like(value): - if len(colkeys) != len(value): - raise ValueError( - f"shape mismatch: values array of shape " - f"(.., {len(value)}) could not " - f"be broadcast to indexing result of " - f"shape (.., {len(colkeys)})" - ) - for dat, rk, i in iter_self(colkeys, position=True): - dat.loc[rk] = value[i] - - # align rows by using series.loc - elif isinstance(value, pd.Series): - for dat, rk, _ in iter_self(colkeys): - dat.loc[rk] = value - - # no align, no merci - else: - for dat, rk, _ in iter_self(colkeys): - dat.loc[rk] = value - - def _unpack_key_aloc(self, key): - """ - Return a list of row indexer and a list of existing(!) column labels. - Both list always have the same length and also could be empty together. - - Note: - The items of the row indexer list should be passed to pd.Series.loc[] - """ - # if a single column-key is given, the caller may - # want to return a single Series, instead of a dios - lowdim = False - - def keys_from_bool_dios_like(key): - if not _is_bool_dios_like(key): - raise ValueError("Must pass dios-like key with boolean values only.") - colkey = self.obj.columns.intersection(key.columns) - rowkey = [] - for c in colkey: - b = key[c] - rowkey += [self._data.at[c].index.intersection(b[b].index)] - return rowkey, colkey, lowdim - - def keys_from_dios_like(key): - colkey = self.obj.columns.intersection(key.columns) - rowkey = [self._data.at[c].index.intersection(key[c].index) for c in colkey] - return rowkey, colkey, lowdim - - rowkey, colkey = self._unpack_key(key) - - if _is_dios_like(colkey) or pdextra.is_nested_list_like(colkey): - raise ValueError("Could not index with multi-dimensional column key.") - - # giving the ellipsis as column key, is an alias - # for giving `usebool=False`. see self.__call__() - if colkey is Ellipsis: - self._usebool = False - colkey = slice(None) - - # .aloc[dios] - if _is_dios_like(rowkey): - if not pdextra.is_null_slice(colkey): - raise ValueError( - f"Could not index with a dios-like indexer as rowkey," - f"and a column key of that type {type(colkey)}" - ) - if self._usebool: - return keys_from_bool_dios_like(rowkey) - else: - return keys_from_dios_like(rowkey) - - # handle gracefully: scalar - elif pdextra.is_hashable(colkey): - colkey = [colkey] if colkey in self.obj.columns else [] - lowdim = True - - # column-alignable: list-like, filter only existing columns - elif pdextra.is_list_like(colkey) and not pdextra.is_bool_indexer(colkey): - colkey = colkey.values if isinstance(colkey, pd.Series) else colkey - colkey = self.obj.columns.intersection(colkey) - - # handle gracefully (automatically) - # just a simple optimisation - elif pdextra.is_null_slice(colkey): - colkey = self.obj.columns - - # not alignable, fall back to .loc (boolean list/series, slice(..), etc. - else: - colkey = self._data.loc[colkey].index - - if len(colkey) == 0: # (!) 
`if not colkey:` fails for pd.Index - return [], [], lowdim - - rowkey = self._get_rowkey(rowkey, colkey) - - return rowkey, colkey, lowdim - - def _get_rowkey(self, rowkey, colkey, depth=0): - if pdextra.is_nested_list_like(rowkey) and depth == 0: - rowkey = rowkey.values if isinstance(rowkey, pd.Series) else rowkey - if len(rowkey) != len(colkey): - raise ValueError( - "Nested arrays indexer must have same (outer) " - "length than the number of selected columns." - ) - indexer = [] - for i, c in enumerate(colkey): - # recurse to get the row indexer from inner element - indexer += self._get_rowkey(rowkey[i], [c], depth=depth + 1) - rowkey = indexer - - # row-alignable: pd.Series(), align rows to every series in colkey (columns) - elif isinstance(rowkey, pd.Series): - if self._usebool and pdextra.is_bool_indexer(rowkey): - rowkey = [ - self._data.at[c].index.intersection(rowkey[rowkey].index) - for c in colkey - ] - else: - rowkey = [ - self._data.at[c].index.intersection(rowkey.index) for c in colkey - ] - - # handle gracefully: scalar, transform to row-slice - elif pdextra.is_hashable(rowkey): - rowkey = [slice(rowkey, rowkey)] * len(colkey) - - # handle gracefully: list-like, filter only existing rows - # NOTE: dios.aloc[series.index] is processed here - elif pdextra.is_list_like(rowkey) and not pdextra.is_bool_indexer(rowkey): - rowkey = [self._data.at[c].index.intersection(rowkey) for c in colkey] - - # not alignable - # the rowkey is processed by .loc someway in - # the calling function - (eg. slice(..), boolean list-like, etc.) - else: - rowkey = [rowkey] * len(colkey) - - return rowkey - - -# ############################################################################# - - -class _AtIndexer(_Indexer): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def _check_key(self, key): - if not ( - isinstance(key, tuple) - and len(key) == 2 - and pdextra.is_hashable(key[0]) - and pdextra.is_hashable(key[1]) - ): - raise KeyError( - f"{key}. `.at` takes exactly one scalar row-key " - "and one scalar column-key" - ) - - def __getitem__(self, key): - self._check_key(key) - return self._data.at[key[1]].at[key[0]] - - def __setitem__(self, key, value): - self._check_key(key) - if _is_dios_like(value) or pdextra.is_nested_list_like(value): - raise TypeError( - ".at[] cannot be used to set multi-dimensional values, use .aloc[] instead." - ) - self._data.at[key[1]].at[key[0]] = value - - -# ############################################################################# - - -class _iAtIndexer(_Indexer): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def _check_key(self, key): - if not ( - isinstance(key, tuple) - and len(key) == 2 - and pdextra.is_integer(key[0]) - and pdextra.is_integer(key[1]) - ): - raise KeyError( - f"{key} `.iat` takes exactly one integer positional " - f"row-key and one integer positional scalar column-key" - ) - - def __getitem__(self, key): - self._check_key(key) - return self._data.iat[key[1]].iat[key[0]] - - def __setitem__(self, key, value): - self._check_key(key) - if _is_dios_like(value) or pdextra.is_nested_list_like(value): - raise TypeError( - ".iat[] cannot be used to set multi-dimensional values, use .aloc[] instead." 
- ) - self._data.iat[key[1]].iat[key[0]] = value diff --git a/dios/dios/lib.py b/dios/dios/lib.py deleted file mode 100644 index c034e87b50f3033aee6893725dea1a0e27d55cd1..0000000000000000000000000000000000000000 --- a/dios/dios/lib.py +++ /dev/null @@ -1,437 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import warnings -from contextlib import contextmanager - -import pandas as pd - - -@contextmanager -def no_index_warning(): - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=FutureWarning) - yield - - -class ItypeWarning(RuntimeWarning): - pass - - -class ItypeCastWarning(ItypeWarning): - pass - - -class ItypeCastError(RuntimeError): - pass - - -class __Itype: - def __init__(self): - raise RuntimeError("a Itype class does not allow instances of itself.") - - -class DtItype(__Itype): - name = "datetime" - unique = True - subtypes = (pd.DatetimeIndex,) - min_pdindex = pd.DatetimeIndex([]) - - -class IntItype(__Itype): - name = "integer" - unique = True - with no_index_warning(): - subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int) - min_pdindex = pd.Int64Index([]) - - -class FloatItype(__Itype): - name = "float" - unique = True - - with no_index_warning(): - subtypes = (pd.Float64Index, float) - min_pdindex = pd.Float64Index([]) - - -# class MultiItype(__Itype): -# name = "multi" -# subtypes = (pd.MultiIndex, ) -# unique = ?? - - -class NumItype(__Itype): - name = "numeric" - _subitypes = (IntItype, FloatItype) - subtypes = _subitypes + IntItype.subtypes + FloatItype.subtypes - unique = False - with no_index_warning(): - min_pdindex = pd.Float64Index([]) - - -class ObjItype(__Itype): - name = "object" - unique = False - _subitypes = (DtItype, IntItype, FloatItype, NumItype, str) - _otheritypes = ( - pd.CategoricalIndex, - pd.IntervalIndex, - pd.PeriodIndex, - pd.TimedeltaIndex, - pd.Index, - ) - subtypes = _subitypes + _otheritypes + DtItype.subtypes + NumItype.subtypes - min_pdindex = pd.Index([]) - - -def is_itype(obj, itype): - """Check if obj is a instance of the given itype or its str-alias was given""" - - # todo: iter through itype as it could be a tuple, if called like ``is_itype(o, (t1,t2))`` - - # user gave a Itype, like ``DtItype`` - if type(obj) == type and issubclass(obj, itype): - return True - - # user gave a string, like 'datetime' - if isinstance(obj, str) and obj == itype.name: - return True - - return False - - -def is_itype_subtype(obj, itype): - """Check if obj is a subclass or a instance of a subclass of the given itype""" - - # user gave a subtype, like ``pd.DatetimeIndex`` - if type(obj) == type and issubclass(obj, itype.subtypes): - return True - - # user gave a instance of a subtype, like ``pd.Series(..).index`` - if isinstance(obj, itype.subtypes): - return True - - return False - - -def is_itype_like(obj, itype): - """Check if obj is a subclass or a instance of the given itype or any of its subtypes""" - return is_itype(obj, itype) or is_itype_subtype(obj, itype) - - -def get_itype(obj): - """ - - Return the according Itype. 
-
-    Parameters
-    ----------
-    obj : {itype string, Itype, pandas.Index, instance of pd.Index}
-        The object to find the fitting itype for.
-
-    Examples
-    --------
-    >>> get_itype("datetime")
-    <class 'dios.lib.DtItype'>
-
-    >>> s = pd.Series(index=pd.to_datetime([]))
-    >>> get_itype(s.index)
-    <class 'dios.lib.DtItype'>
-
-    >>> get_itype(DtItype)
-    <class 'dios.lib.DtItype'>
-
-    >>> get_itype(pd.DatetimeIndex)
-    <class 'dios.lib.DtItype'>
-    """
-    if type(obj) == type and issubclass(obj, __Itype):
-        return obj
-
-    # check if it is the actual type, not a subtype
-    types = [DtItype, IntItype, FloatItype, NumItype, ObjItype]
-    for t in types:
-        if is_itype(obj, t):
-            return t
-
-    for t in types:
-        if is_itype_subtype(obj, t):
-            return t
-
-    raise ValueError(
-        f"{obj} is not an itype, nor any known subtype of an itype, nor an itype string alias"
-    )
-
-
-def _itype_eq(a, b):
-    return is_itype(a, b)
-
-
-def _itype_lt(a, b):
-    return is_itype_subtype(a, b)
-
-
-def _itype_le(a, b):
-    return is_itype_like(a, b)
-
-
-def _find_least_common_itype(iterable_of_series):
-    itypes = [NumItype, FloatItype, IntItype, DtItype]
-    tlist = [get_itype(s.index) for s in iterable_of_series]
-    found = ObjItype
-    if tlist:
-        for itype in itypes:
-            for t in tlist:
-                if _itype_le(t, itype):
-                    continue
-                break
-            else:
-                found = itype
-    return found
-
-
-################################################################################
-# Casting
-
-
-class CastPolicy:
-    force = "force"
-    save = "save"
-    never = "never"
-
-
-_CAST_POLICIES = [CastPolicy.force, CastPolicy.save, CastPolicy.never]
-
-
-def cast_to_itype(series, itype, policy=CastPolicy.save, err="raise", inplace=False):
-    """Cast a series (more explicitly, the type of its index) to fit the itype of a dios.
-
-    Return the cast series if successful, None otherwise.
-
-    Note:
-        This is very basic number-casting, so in most cases, information from
-        the old index will be lost after the cast.
-    """
-
-    if policy not in _CAST_POLICIES:
-        raise ValueError(f"policy={policy}")
-    if err not in ["raise", "ignore"]:
-        raise ValueError(f"err={err}")
-    if not inplace:
-        series = series.copy()
-    itype = get_itype(itype)
-
-    if series.empty:
-        return pd.Series(index=itype.min_pdindex, dtype=series.dtype)
-
-    series.itype = get_itype(series.index)
-
-    # up-casting isn't necessary, because a dios with a higher
-    # itype can always take lower itypes.
-    # series can have dt/int/float/mixed
-    # dt    -> dt       -> mixed
-    # int   -> int      -> num  -> mixed
-    # float -> float    -> num  -> mixed
-    # mixed -> mixed
-    if _itype_le(series.itype, itype):  # a <= b
-        return series
-
-    e = f"A series index of type '{type(series.index)}' cannot be casted to Itype '{itype.name}'"
-
-    # cast any -> dt always fails.
-    if is_itype(itype, DtItype):
-        pass
-    else:
-        e += f", as forbidden by the cast-policy '{policy}'."
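-
-    # dispatch on the cast policy: 'never' refuses any cast, 'force'
-    # replaces the index unconditionally, 'save' casts only losslessly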
-    if policy == CastPolicy.never:
-        pass
-
-    elif policy == CastPolicy.force:
-        # cast any (dt/float/mixed) -> int
-        if is_itype(itype, IntItype):  # a == b
-            series.index = pd.RangeIndex(len(series))
-            return series
-        # cast any (dt/int/mixed) -> float
-        # cast any (dt/float/mixed) -> num
-        if is_itype(itype, FloatItype) or is_itype(itype, NumItype):  # a == b or a == c
-            series.index = pd.Float64Index(range(len(series)))
-            return series
-
-    elif policy == CastPolicy.save:
-        # cast int -> float
-        if is_itype(itype, IntItype) and is_itype(
-            series.itype, FloatItype
-        ):  # a == b and c == d
-            series.index = series.index.astype(float)
-            return series
-        # cast float -> int, but only if the resulting index is unique
-        if is_itype(itype, FloatItype) and is_itype(
-            series.itype, IntItype
-        ):  # a == b and c == d
-            series.index = series.index.astype(int)
-            if series.index.is_unique:
-                return series
-            e = (
-                f"The cast with policy {policy} from series index type '{type(series.index)}' to "
-                f"itype {itype.name} resulted in a non-unique index."
-            )
-        # cast mixed -> int/float always fails
-
-    if err == "raise":
-        raise ItypeCastError(e)
-    else:
-        return None
-
-
-################################################################################
-# OPTIONS
-
-
-class OptsFields:
-    """storage class for the keys in `dios_options`
-
-    Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
-
-    See Also
-    --------
-    Opts: values for the options dict
-    dios_options: options dict for module
-    """
-
-    mixed_itype_warn_policy = "mixed_itype_policy"
-    disp_max_rows = "disp_max_rows"
-    disp_min_rows = "disp_min_rows"
-    disp_max_cols = "disp_max_vars"
-    dios_repr = "dios_repr"
-
-
-class Opts:
-    """storage class for string values for `dios_options`
-
-    Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
-
-    See Also
-    --------
-    OptsFields: keys for the options dict
-    dios_options: options dict for module
-    """
-
-    itype_warn = "warn"
-    itype_err = "err"
-    itype_ignore = "ignore"
-    repr_aligned = "aligned"
-    repr_indexed = "indexed"
-
-
-class __DocDummy(dict):
-    pass
-
-
-dios_options = __DocDummy()
-dios_options.update(
-    **{
-        OptsFields.disp_max_rows: 60,
-        OptsFields.disp_min_rows: 10,
-        OptsFields.disp_max_cols: 10,
-        OptsFields.mixed_itype_warn_policy: Opts.itype_warn,
-        OptsFields.dios_repr: Opts.repr_indexed,
-    }
-)
-
-opdoc = f"""Options dictionary for module `dios`.
-
-Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
-
-**Items**:
- * {OptsFields.dios_repr}: {{'indexed', 'aligned'}} default: 'indexed'
-   the default dios representation:
-    * `indexed`:  show every column with its index
-    * `aligned`:  transform to pandas.DataFrame with the indexes merged together.
- * {OptsFields.disp_max_rows} : int
-   Maximum number of rows before the representation of a DictOfSeries
-   is truncated to `disp_min_rows`
-
- * {OptsFields.disp_min_rows} : int
-   Minimum number of rows to display if `disp_max_rows` is exceeded
-
- * {OptsFields.disp_max_cols} : int
-   Maximum number of columns before the representation is truncated
-
- * {OptsFields.mixed_itype_warn_policy} : {{'warn', 'err', 'ignore'}}
-   How to inform the user about a mixed Itype
-
-See Also
---------
-OptsFields: keys for the options dict
-Opts: values for the options dict
-
-"""
-dios_options.__doc__ = opdoc
-
-
-def _throw_MixedItype_err_or_warn(itype):
-    msg = (
-        f"Using '{itype.name}' as itype is not recommended. "
-        f"As soon as series with different index types are inserted,\n"
-        f"indexing and slicing will almost always fail. 
" - ) - - if dios_options[OptsFields.mixed_itype_warn_policy] in [ - "ignore", - Opts.itype_ignore, - ]: - pass - elif dios_options[OptsFields.mixed_itype_warn_policy] in [ - "error", - "err", - Opts.itype_err, - ]: - msg += "Suppress this error by specifying an unitary 'itype' or giving an 'index' to DictOfSeries." - raise ItypeCastError(msg) - else: - msg += "Silence this warning by specifying an unitary 'itype' or giving an 'index' to DictOfSeries." - warnings.warn(msg, ItypeWarning) - return - - -def example_DictOfSeries(): - """Return a example dios. - - Returns - ------- - DictOfSeries: an example - - Examples - -------- - - >>> from dios import example_DictOfSeries - >>> di = example_DictOfSeries() - >>> di - a | b | c | d | - ===== | ====== | ====== | ===== | - 0 0 | 2 5 | 4 7 | 6 0 | - 1 7 | 3 6 | 5 17 | 7 1 | - 2 14 | 4 7 | 6 27 | 8 2 | - 3 21 | 5 8 | 7 37 | 9 3 | - 4 28 | 6 9 | 8 47 | 10 4 | - 5 35 | 7 10 | 9 57 | 11 5 | - 6 42 | 8 11 | 10 67 | 12 6 | - 7 49 | 9 12 | 11 77 | 13 7 | - 8 56 | 10 13 | 12 87 | 14 8 | - 9 63 | 11 14 | 13 97 | 15 9 | - """ - from dios import DictOfSeries - - a = pd.Series(range(0, 70, 7)) - b = pd.Series(range(5, 15, 1)) - c = pd.Series(range(7, 107, 10)) - d = pd.Series(range(0, 10, 1)) - - for i, s in enumerate([a, b, c, d]): - s.index += i * 2 - - di = DictOfSeries(dict(a=a, b=b, c=c, d=d)) - return di.copy() diff --git a/dios/dios/operators.py b/dios/dios/operators.py deleted file mode 100644 index e040310a41f2952b52c2e8d5420857381b1bdc68..0000000000000000000000000000000000000000 --- a/dios/dios/operators.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -# do not import dios-stuff here -import operator as op - -_OP1_MAP = { - op.inv: "~", - op.neg: "-", - op.abs: "abs()", -} - -_OP2_COMP_MAP = { - op.eq: "==", - op.ne: "!=", - op.le: "<=", - op.ge: ">=", - op.gt: ">", - op.lt: "<", -} - -_OP2_BOOL_MAP = { - op.and_: "&", - op.or_: "|", - op.xor: "^", -} -_OP2_ARITH_MAP = { - op.add: "+", - op.sub: "-", - op.mul: "*", - op.pow: "**", -} - -_OP2_DIV_MAP = { - op.mod: "%", - op.truediv: "/", - op.floordiv: "//", -} - -OP_MAP = _OP2_COMP_MAP.copy() -OP_MAP.update(_OP2_BOOL_MAP) -OP_MAP.update(_OP2_ARITH_MAP) -OP_MAP.update(_OP2_DIV_MAP) -OP_MAP.update(_OP1_MAP) diff --git a/dios/dios/pandas_bridge.py b/dios/dios/pandas_bridge.py deleted file mode 100644 index 1057785a7bd1e52eb9ecc0f375ef31e33871c9e4..0000000000000000000000000000000000000000 --- a/dios/dios/pandas_bridge.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -__author__ = "Bert Palm" -__email__ = "bert.palm@ufz.de" -__copyright__ = "Copyright 2020, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ" - - -from pandas.api.types import ( # Unlike the example says, return lists False, not True; >>is_iterator([1, 2, 3]); >>False - is_dict_like, - is_hashable, - is_integer, - is_iterator, - is_list_like, - is_scalar, -) -from pandas.core.common import is_bool_indexer, is_null_slice -from pandas.core.dtypes.common import is_nested_list_like diff --git a/dios/docs/.gitignore b/dios/docs/.gitignore deleted file mode 100644 index 94b2ce4f6c97aab8fdc5b2a6fb7f5b3e4bb00da1..0000000000000000000000000000000000000000 --- a/dios/docs/.gitignore +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# 
-# SPDX-License-Identifier: GPL-3.0-or-later - -# ignore everything -_api -_build -_static -*.automodsumm - - - diff --git a/dios/docs/Makefile b/dios/docs/Makefile deleted file mode 100644 index 5dc68a0e6a975d755f25cd006df1b5e66a041830..0000000000000000000000000000000000000000 --- a/dios/docs/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile clean - -clean: - rm -rf _build _static _api - rm -f *.automodsumm - mkdir _static - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/dios/docs/conf.py b/dios/docs/conf.py deleted file mode 100644 index 245102bcab2ac28231cea6c109dad3092cfd4326..0000000000000000000000000000000000000000 --- a/dios/docs/conf.py +++ /dev/null @@ -1,95 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -import os -import sys - -sys.path.insert(0, os.path.abspath("..")) - - -# -- Project information ----------------------------------------------------- - -project = "dios" -copyright = "2020, Bert Palm" -author = "Bert Palm" - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - # "sphinx.ext.doctest", - # "sphinx.ext.extlinks", - # "sphinx.ext.todo", - # "sphinx.ext.intersphinx", - # "sphinx.ext.coverage", - # "sphinx.ext.mathjax", - # "sphinx.ext.ifconfig", - "sphinx.ext.autosectionlabel", - # link source code - "sphinx.ext.viewcode", - # add suupport for NumPy style docstrings - "sphinx.ext.napoleon", - # doc the whole module - "sphinx_automodapi.automodapi", - "sphinxcontrib.fulltoc", - # markdown sources support - "recommonmark", - "sphinx_markdown_tables", -] -numpydoc_show_class_members = False -automodsumm_inherited_members = True -automodapi_inheritance_diagram = False -automodapi_toctreedirnm = "_api" -# automodsumm_writereprocessed = True -autosectionlabel_prefix_document = True - - -# Add any paths that contain templates here, relative to this directory. 
-templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -source_suffix = [".rst", ".md"] - - -# -- Options for HTML output ------------------------------------------------- - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = "nature" - -# use pandas theme -# html_theme = "pydata_sphinx_theme" - - -# html_theme_options = { -# } - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] diff --git a/dios/docs/dios_api.rst b/dios/docs/dios_api.rst deleted file mode 100644 index 4d3ad1ad820dbd18a470cd90891f59f9bae97174..0000000000000000000000000000000000000000 --- a/dios/docs/dios_api.rst +++ /dev/null @@ -1,14 +0,0 @@ -.. SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -.. -.. SPDX-License-Identifier: GPL-3.0-or-later - -API -==== - -.. automodapi:: dios - :include-all-objects: - :no-heading: - - - - diff --git a/dios/docs/doc_cookbook.md b/dios/docs/doc_cookbook.md deleted file mode 100644 index 94a3478495bf8ecaaf75be6f7132e908892a8cb8..0000000000000000000000000000000000000000 --- a/dios/docs/doc_cookbook.md +++ /dev/null @@ -1,31 +0,0 @@ -<!-- -SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ - -SPDX-License-Identifier: GPL-3.0-or-later ---> - -Cookbook -========= - -Recipes -------- -- select common rows from all columns -- align columns to an other column -- align columns to a given index -- align dios with dios -- get/set values by condition -- apply a value to multiple columns -- [Broadcast array-likes to multiple columns](#broadcast-array-likes-to-multiple-columns) -- apply a array-like value to multiple columns -- nan-policy - mask vs. drop values, when nan's are inserted (mv to Readme ??) -- itype - when to use, pitfalls and best-practise -- changing the index of series' in dios (one, some, all) -- changing the dtype of series' in dios (one, some, all) -- changing properties of series' in dios (one, some, all) - -**T_O_D_O** - - -Broadcast array-likes to multiple columns ------------------------------------------ -**T_O_D_O** diff --git a/dios/docs/doc_indexing.md b/dios/docs/doc_indexing.md deleted file mode 100644 index 7d372f291ef27f46d22960736e277ed2de84e816..0000000000000000000000000000000000000000 --- a/dios/docs/doc_indexing.md +++ /dev/null @@ -1,531 +0,0 @@ -<!-- -SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ - -SPDX-License-Identifier: GPL-3.0-or-later ---> - -Pandas-like indexing -==================== - -`[]` and `.loc[]`, `.iloc[]` and `.at[]`, `.iat[]` - should behave exactly like -their counter-parts from pandas.DataFrame. They can take as indexer -- lists, array-like objects and in general all iterables -- boolean lists and iterables -- slices -- scalars and any hashable object - -Most indexers are directly passed to the underling columns-series or row-series depending -on the position of the indexer and the complexity of the operation. 
For `.loc`, `.iloc`, `.at`
-and `iat` the first position is the *row indexer*, the second the *column indexer*. The second
-can be omitted and will default to `slice(None)`. Examples:
-- `di.loc[[1,2,3], ['a']]` : select labels 1,2,3 from column a
-- `di.iloc[[1,2,3], [0,3]]` : select positions 1,2,3 from the columns 0 and 3
-- `di.loc[:, 'a':'c']` : select all rows from columns a to c
-- `di.at[4,'c']` : select the element with label 4 in column c
-- `di.loc[:]` -> `di.loc[:,:]` : select everything.
-
-Scalar indexing always returns a pandas Series if the other indexer is a non-scalar. If both indexers
-are scalars, the element itself is returned. In all other cases a dios is returned.
-For more pandas-like indexing magic and the differences between the indexers,
-see the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html).
-
->**Note:**
->
->In contrast to pandas.DataFrame, `.loc[:]` and `.loc[:, :]` always behave identically. The same applies to `iloc` and
->[`aloc`](#the-special-indexer-aloc). For example, two pandas.DataFrames `df1` and `df2` with different columns
->align columns with `df1.loc[:, :] = df2`, but do **not** with `df1.loc[:] = df2`.
->
->Whether this is the desired behavior or a bug, I couldn't verify so far. -- Bert Palm
-
-**2D-indexer**
-
-`dios[boolean dios-like]` (as single key) - dios accepts a boolean 2D-indexer (a boolean pandas.DataFrame
-or a boolean Dios).
-
-Columns and rows from the indexer align with the dios.
-This means that only matching columns are selected, and in these columns those rows are selected where
-i) the indices match and ii) the value in the indexer-bool-dios is True. Missing indices are treated
-no differently than present indices holding False values.
-
-Values from unselected rows and columns are dropped, but empty columns are still preserved,
-with the effect that the resulting Dios always has the same column dimension as the initial dios.
-
->**Note:**
->This is the exact same behavior as pandas.DataFrame's handling of 2D-indexers, except that pandas.DataFrame
->fills numpy.nan's at missing locations and therefore also fills up whole missing columns with numpy.nan's.
-
-**setting values**
-
-Setting values with `[]` and `.loc[]`, `.iloc[]` and `.at[]`, `.iat[]` works like in pandas.
-With `.at`/`.iat` only single items can be set; for the others the
-right-hand side values can be:
- - *scalars*: these are broadcast to the selected positions
- - *lists*: the length of the list must match the number of indexed columns. The items can be everything that
-   can be applied to a series with the respective indexing method (`loc`, `iloc`, `[]`).
- - *dios*: the number of columns must match the number of indexed columns - columns do *not* align,
-   they are just iterated.
-   Rows do align. Rows that are present on the right but not on the left are ignored.
-   Rows that are present on the left (bear in mind: these rows were explicitly chosen for writing!), but not
-   present on the right, are filled with `NaN`s, like in pandas.
- - *pandas.Series*: the column indexer must be a scalar(!); the series is passed down and set with `loc`, `iloc`
-   or `[]`, where it may align, depending on the method. 
-
-**Examples:**
-
-- `dios.loc[2:5, 'a'] = [1,2,3]` is the same as `a=dios['a']; a.loc[2:5]=[1,2,3]; dios['a']=a`
-- `dios.loc[2:5, :] = 99` : set 99 on rows 2 to 5 of all columns
-
-Special indexer `.aloc`
-========================
-
-In addition to the pandas-like indexers we have a `.aloc[..]` (align locator) indexing method.
-Unlike `.iloc` and `.loc`, indexers fully align if possible, and 1D array-likes can be broadcast
-to multiple columns at once. This method also handles missing indexer items gracefully.
-It is used like `.loc`, so a single indexer (`.aloc[indexer]`) or a tuple of row-indexer and
-column-indexer (`.aloc[row-indexer, column-indexer]`) can be given. It can also handle boolean
-and *non-boolean* 2D-indexers.
-
-The main **purpose** of `.aloc` is:
-- to select gracefully, so rows or columns that were given as indexer but don't exist do not raise an error
-- to align series/dios-indexers
-- vertical broadcasting, aka. setting multiple columns at once with a list-like value
-
-Aloc usage
-----------
-
-`aloc` is *called* like `loc`, with a single key that acts as row indexer (`aloc[rowkey]`), or with a tuple of
-row indexer and column indexer (`aloc[rowkey, columnkey]`). A 2D-indexer (like a dios or a df) can also be given,
-but only as a single key, like `.aloc[2D-indexer]`, or with the special column key `...`,
-the ellipsis (`.aloc[2D-indexer, ...]`). The ellipsis may change how the 2D-indexer is
-interpreted, but this will be explained [later](#the-power-of-2d-indexer) in detail.
-
-If a normal (non-2D) row indexer is given, but no column indexer, the latter defaults to `:` aka.
-`slice(None)`, so `.aloc[row-indexer]` becomes `.aloc[row-indexer, :]`, which means that all columns are used.
-In general, a normal row-indexer is applied to every column that was chosen by the column indexer, but to
-each column separately.
-
-So maybe a first example gives a rough idea:
-```
->>> s = pd.Series([11] * 4)
->>> di = DictOfSeries(dict(a=s[:2]*6, b=s[2:4]*7, c=s[:2]*8, d=s[1:3]*9))
->>> di
-    a |     b |     c |     d |
-===== | ===== | ===== | ===== |
-0  66 | 2  77 | 0  88 | 1  99 |
-1  66 | 3  77 | 1  88 | 2  99 |
-
-
->>> di.aloc[[1,2], ['a', 'b', 'd', 'x']]
-    a |     b |     d |
-===== | ===== | ===== |
-1  66 | 2  77 | 1  99 |
-      |       | 2  99 |
-```
-
-The return type
-----------------
-
-Unlike with the other two indexer methods `loc` and `iloc`, it is not possible to get a single item returned;
-the return type is either a pandas.Series, iff the column-indexer is a single key (eg. `'a'`), or a dios otherwise.
-The row-indexer does not play any role in the return type choice.
-
-> **Note for the curious:**
->
-> This is because a scalar (`.aloc[key]`) translates to `.loc[key:key]` under the hood.
-
-Indexer types
--------------
-In the following, the `.aloc`-specific indexers are listed. Any indexer that is not listed below (slices,
-boolean lists, ...), but is known to work with `.loc`, is treated as if it was passed to `.loc`, as it
-actually is under the hood.
-
-Some indexers are linked to later sections, where a more detailed explanation and examples are given.
-
-*special [Column indexer](#select-columns-gracefully) are:*
-- *list / array-like* (or any iterable object): Only labels that are present in the columns are used, others are
-  ignored.
-- *pd.Series* : `.values` are taken from the series and handled like a *list*.
-- *scalar* (or any hashable obj) : Select a single column, if the label is present, otherwise nothing.
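-
-A quick sketch of these cases (a hypothetical session; see the linked section for
-full, worked examples):
-
-```
->>> di.aloc[:, ['a', 'x']]   # 'x' does not exist and is silently ignored
->>> di.aloc[:, 'x']          # -> an empty pandas.Series
->>> di.aloc[:, 'a']          # -> the column itself, a pandas.Series
-```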
-
-
-*special [Row indexer](#selecting-rows-a-smart-way) are:*
-- *list / array-like* (or any iterable object): Only rows whose indices are present in the index of the column
-  are used, others are ignored. A dios is returned.
-- *scalar* (or any hashable obj) : Select a single row from a column, if the value is present in the index of
-  the column, otherwise nothing is selected. [1]
-- *pd.Series* : align the index of the given Series with the column, which means that only common indices are
-  used. The actual values of the series are ignored(!).
-- *boolean pd.Series* : like *pd.Series*, but only True values are evaluated.
-  False values are equivalent to missing indices. To treat a boolean series as a *normal* indexer series, as
-  described above, one can use `.aloc(usebool=False)[boolean pd.Series]`.
-
-
-*special [2D-indexer](#the-power-of-2d-indexer) are:*
-- `.aloc[boolean dios-like]` : works the same as `di[boolean dios-like]` (see there).
-  In brief: fully align, select items where the index is present and the value is True.
-- `.aloc[dios-like, ...]` (with Ellipsis) : Align in columns and rows, ignoring the values. Per common column,
-  the common indices are selected. The ellipsis forces `aloc` to ignore the values, so a boolean dios can be
-  treated as a non-boolean one. Alternatively `.aloc(usebool=False)[boolean dios-like]` could be used.[2]
-- `.aloc[nested list-like]` : The inner lists are used as `aloc`-*list*-row-indexer (see there) on all columns.
-  One list per column, which implies that the outer list has the same length as the number of columns.
-
-*special handling of 1D-**values***
-
-Values that are list- or array-like, which includes pd.Series, are set on all selected columns. pd.Series
-align like `s1.loc[:] = s2` does, as sketched below. See also the
-[cookbook](/docs/doc_cookbook.md#broadcast-array-likes-to-multiple-columns).
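-
-A minimal sketch of this vertical broadcasting, with arbitrary values; the row
-labels 2 and 3 are present in both columns of the example dios `di` below:
-
-```
->>> di.aloc[[2, 3], ['a', 'b']] = [11, 22]
-```
-
-The list `[11, 22]` is applied to column `a` and to column `b` separately, at the
-rows 2 and 3 of each column.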
-
-
-Aloc overview table
---------------------
-
-| example | type | on | like `.loc` | handling | conditions / hints | link |
-| ------- | ---- | --- | ----------- | -------- | ------------------ | ---- |
-|[Column indexer](#select-columns-gracefully)|
-| `.aloc[any, 'a']` | scalar | columns |no | select graceful | - | [cols](#select-columns-gracefully)|
-| `.aloc[any, 'b':'z']` | slice | columns |yes| slice | - | [cols](#select-columns-gracefully)|
-| `.aloc[any, ['a','c']]` | list-like | columns |no | filter graceful | - | [cols](#select-columns-gracefully)|
-| `.aloc[any, [True,False]]` | bool list-like | columns |yes| take `True`'s | length must match nr of columns | [cols](#select-columns-gracefully)|
-| `.aloc[any, s]` | Series | columns |no | like list | only `s.values` are evaluated | [cols](#select-columns-gracefully)|
-| `.aloc[any, bs]` | bool Series | columns |yes| like bool-list | see there | [cols](#select-columns-gracefully)|
-|[Row indexer](#selecting-rows-a-smart-way)|
-| `.aloc[7, any]` | scalar | rows |no | translate to `.loc[key:key]` | - | [rows](#selecting-rows-a-smart-way) |
-| `.aloc[3:42, any]` | slice | rows |yes| slice | - | |
-| `.aloc[[1,2,24], any]` | list-like | rows |no | filter graceful | - | [rows](#selecting-rows-a-smart-way) |
-| `.aloc[[True,False], any]` | bool list-like | rows |yes| take `True`'s | length must match nr of (all selected) columns | [blist](#boolean-array-likes-as-row-indexer)|
-| `.aloc[s, any]` | Series | rows |no | like `.loc[s.index]` | - | [ser](#pandasseries-and-boolean-pandasseries-as-row-indexer) |
-| `.aloc[bs, any]` | bool Series | rows |no | align + just take `True`'s | evaluate `usebool`-keyword | [ser](#pandasseries-and-boolean-pandasseries-as-row-indexer)|
-| `.aloc[[[s],[1,2,3]], any]` | nested list-like | both | ? 
| one row-indexer per column | outer length must match nr of (selected) columns | [nlist](#nested-lists-as-row-indexer) |
-|[2D-indexer](#the-power-of-2d-indexer)|
-| `.aloc[di]` | dios-like | both |no | full align | - | |
-| `.aloc[di, ...]` | dios-like | both |no | full align | ellipsis has no effect | |
-| `.aloc[di>5]` | bool dios-like | both |no | full align + take `True`'s | evaluate `usebool`-keyword | |
-| `.aloc[di>5, ...]` | (bool) dios-like | both |no | full align, **no** bool evaluation | - | |
-
-Example dios
-------------
-
-The Dios used in the examples, unless stated otherwise, looks like so:
-
-```
->>> dictofser
-    a |      b |      c |     d |
-===== | ====== | ====== | ===== |
-0   0 | 2    5 | 4    7 | 6   0 |
-1   7 | 3    6 | 5   17 | 7   1 |
-2  14 | 4    7 | 6   27 | 8   2 |
-3  21 | 5    8 | 7   37 | 9   3 |
-4  28 | 6    9 | 8   47 | 10  4 |
-5  35 | 7   10 | 9   57 | 11  5 |
-6  42 | 8   11 | 10  67 | 12  6 |
-7  49 | 9   12 | 11  77 | 13  7 |
-8  56 | 10  13 | 12  87 | 14  8 |
-```
-
-or the short version:
-
-```
->>> di
-    a |    b |     c |     d |
-===== | ==== | ===== | ===== |
-0   0 | 2  5 | 4   7 | 6   0 |
-1   7 | 3  6 | 5  17 | 7   1 |
-2  14 | 4  7 | 6  27 | 8   2 |
-3  21 | 5  8 | 7  37 | 9   3 |
-4  28 | 6  9 | 8  47 | 10  4 |
-```
-
-The example Dios can be obtained via a function:
-
-```
-from dios import example_DictOfSeries
-mydios = example_DictOfSeries()
-```
-
-or generated manually like so:
-
-```
->>> a = pd.Series(range(0, 70, 7))
->>> b = pd.Series(range(5, 15, 1))
->>> c = pd.Series(range(7, 107, 10))
->>> d = pd.Series(range(0, 10, 1))
->>> for i, s in enumerate([a,b,c,d]): s.index += i*2
->>> dictofser = DictOfSeries(dict(a=a, b=b, c=c, d=d))
->>> di = dictofser[:5]
-```
-
-
-Select columns, gracefully
----------------------------
-
-One can use `.aloc[:, key]` to select **single columns** gracefully.
-The underlying pandas.Series is returned if the key exists.
-Otherwise an empty pandas.Series with `dtype=object` is returned.
-
-```
->>> di.aloc[:, 'a']
-0     0
-1     7
-2    14
-3    21
-4    28
-Name: a, dtype: int64
-
->>> di.aloc[:, 'x']
-Series([], dtype: object)
-```
-
-
-**Multiple columns**
-
-Just like selecting *single columns gracefully*, but with an array-like indexer.
-A dios is returned, with a subset of the existing columns.
-If no key is present, an empty dios is returned.
-
-```
->>> di.aloc[:, ['c', 99, None, 'a', 'x', 'y']]
-    a |     c |
-===== | ===== |
-0   0 | 4   7 |
-1   7 | 5  17 |
-2  14 | 6  27 |
-3  21 | 7  37 |
-4  28 | 8  47 |
-
->>> di.aloc[:, ['x', 'y']]
-Empty DictOfSeries
-Columns: []
-
->>> s = pd.Series(dict(a='a', b='x', c='c', foo='d'))
->>> di.aloc[:, s]
-    a |     c |     d |
-===== | ===== | ===== |
-0   0 | 4   7 | 6   0 |
-1   7 | 5  17 | 7   1 |
-2  14 | 6  27 | 8   2 |
-3  21 | 7  37 | 9   3 |
-4  28 | 8  47 | 10  4 |
-```
-
-**Boolean indexing, indexing with pd.Series and slice indexer**
-
-A **boolean indexer**, for example `[True, False, True, False]`, must have the same length as the number
-of columns; then only the columns where the indexer holds a `True` value are selected.
-
-If the key is a **pandas.Series**, its *values* are used for indexing; in particular, the Series's index is
-ignored. If a series has boolean values, it's treated like a boolean indexer, otherwise it's treated as an
-array-like indexer.
-
-An easy way to select all columns is to use null-**slice**s, like `.aloc[:,:]` or even simpler `.aloc[:]`.
-This is just like one would do with `loc` or `iloc`. Of course slicing with boundaries also works,
-eg `.loc[:, 'a':'f']`.
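-
-A minimal sketch of the boolean variant on the four columns of `di` (output
-abbreviated to the selected column labels; a hypothetical session):
-
-```
->>> di.aloc[:, [True, False, True, False]].columns
-Index(['a', 'c'], dtype='object')
-```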
-
->**See also**
-> - [pandas slicing ranges](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#slicing-ranges)
-> - [pandas boolean indexing](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing)
-
-
-Selecting Rows a smart way
---------------------------
-
-For scalar and array-like indexers with label values, the keys are handled gracefully, just like with
-array-like column indexers.
-
-```
->>> di.aloc[1]
-   a |       b |       c |       d |
-==== | ======= | ======= | ======= |
-1  7 | no data | no data | no data |
-
->>> di.aloc[99]
-Empty DictOfSeries
-Columns: ['a', 'b', 'c', 'd']
-
->>> di.aloc[[3,6,7,18]]
-    a |    b |     c |    d |
-===== | ==== | ===== | ==== |
-3  21 | 3  6 | 6  27 | 6  0 |
-      | 6  9 | 7  37 | 7  1 |
-```
-
-The lengths of the columns can differ:
-```
->>> di.aloc[[3,6,7,18]].aloc[[3,6]]
-    a |    b |     c |    d |
-===== | ==== | ===== | ==== |
-3  21 | 3  6 | 6  27 | 6  0 |
-      | 6  9 |       |      |
-```
-
-Boolean array-likes as row indexer
----------------------------------
-
-For array-like indexers that hold boolean values, the length of the indexer and
-the length of all column(s) to index must match.
-```
->>> di.aloc[[True,False,False,True,False]]
-    a |    b |     c |    d |
-===== | ==== | ===== | ==== |
-0   0 | 2  5 | 4   7 | 6  0 |
-3  21 | 5  8 | 7  37 | 9  3 |
-```
-If the length does not match, an `IndexError` is raised:
-```
->>> di.aloc[[True,False,False]]
-Traceback (most recent call last):
-  ...
-  IndexError: failed for column a: Boolean index has wrong length: 3 instead of 5
-```
-
-This can be tricky, especially if columns have different lengths:
-```
->>> difflen
-    a |    b |     c |    d |
-===== | ==== | ===== | ==== |
-0   0 | 2  5 | 4   7 | 6  0 |
-1   7 | 3  6 | 6  27 | 7  1 |
-2  14 | 4  7 |       | 8  2 |
-
->>> difflen.aloc[[False,True,False]]
-Traceback (most recent call last):
-  ...
-  IndexError: Boolean index has wrong length: 3 instead of 2
-```
-
-pandas.Series and boolean pandas.Series as row indexer
-------------------------------------------------------
-
-When using a pandas.Series as row indexer with `aloc`, all its magic comes to light.
-The index of the given series aligns itself with the index of each column separately and is this way used as a filter.
-
-```
->>> s = di['b'] + 100
->>> s
-2    105
-3    106
-4    107
-5    108
-6    109
-Name: b, dtype: int64
-
->>> di.aloc[s]
-    a |    b |     c |    d |
-===== | ==== | ===== | ==== |
-2  14 | 2  5 | 4   7 | 6  0 |
-3  21 | 3  6 | 5  17 |      |
-4  28 | 4  7 | 6  27 |      |
-      | 5  8 |       |      |
-      | 6  9 |       |      |
-```
-
-As seen in the example above, the series' values are ignored completely. The functionality
-is similar to `s1.loc[s2.index]`, where `s1` and `s2` are pandas.Series, `s2` is the indexer, and `s1` is
-one column after the other.
-
-If the indexer series holds boolean values, these are **not** ignored.
-The series aligns the same way as explained above, but additionally only the `True` values are evaluated.
-Thus `False` values are treated like missing indices. The behavior here is analogous to `s1.loc[s2[s2].index]`.
-
-```
->>> boolseries = di['b'] > 6
->>> boolseries
-2    False
-3    False
-4     True
-5     True
-6     True
-Name: b, dtype: bool
-
->>> di.aloc[boolseries]
-    a |    b |     c |    d |
-===== | ==== | ===== | ==== |
-4  28 | 4  7 | 4   7 | 6  0 |
-      | 5  8 | 5  17 |      |
-      | 6  9 | 6  27 |      |
-```
-
-Evaluating boolean values is a very handy feature, as it can easily be used with multiple conditions
-and also fits nicely into one-liners:
-
-```
->>> di.aloc[di['b'] > 6]
-    a |    b |     c |    d |
-===== | ==== | ===== | ==== |
-4  28 | 4  7 | 4   7 | 6  0 |
-      | 5  8 | 5  17 |      |
-      | 6  9 | 6  27 |      |
-
->>> di.aloc[(di['a'] > 6) & (di['b'] > 6)]
-    a |    b |    c |       d |
-===== | ==== | ==== | ======= |
-4  28 | 4  7 | 4  7 | no data |
-```
-
-
->**Note:**
->
->Nevertheless, something like `di.aloc[di['a'] > di['b']]` does not work, because the comparison fails
->as long as the two series objects do not have the same index. But maybe one wants to check out
->[DictOfSeries.index_of()](https://dios.readthedocs.io/en/latest/_api/dios.DictOfSeries.html#dios.DictOfSeries.index_of).
-
-
-Nested-lists as row indexer
----------------------------
-
-It is possible to pass different array-like indexers to different columns by using nested lists as indexer.
-The outer list's length must match the number of columns of the dios. The items of the outer list must all be
-array-like and not further nested, for example lists, pandas.Series, boolean lists or Series, numpy.arrays...
-Every inner list-like item is applied as row indexer to the according column.
-
-```
->>> di
-    a |    b |     c |     d |
-===== | ==== | ===== | ===== |
-0   0 | 2  5 | 4   7 | 6   0 |
-1   7 | 3  6 | 5  17 | 7   1 |
-2  14 | 4  7 | 6  27 | 8   2 |
-3  21 | 5  8 | 7  37 | 9   3 |
-4  28 | 6  9 | 8  47 | 10  4 |
-
->>> di.aloc[ [di['a'], [True,False,True,False,False], [], [7,8,10]] ]
-    a |    b |       c |     d |
-===== | ==== | ======= | ===== |
-0   0 | 2  5 | no data | 7   1 |
-1   7 | 4  7 |         | 8   2 |
-2  14 |      |         | 10  4 |
-3  21 |      |         |       |
-4  28 |      |         |       |
-
->>> ar = np.array([2,3])
->>> di.aloc[[ar, ar+1, ar+2, ar+3]]
-    a |    b |     c |    d |
-===== | ==== | ===== | ==== |
-2  14 | 3  6 | 4   7 | 6  0 |
-3  21 | 4  7 | 5  17 |      |
-```
-
-Even though this looks like a 2D-indexer, which is explained in the next section, it is not.
-In contrast to the 2D-indexer, we can also provide a column key to pre-filter the columns.
-
-```
->>> di.aloc[[ar, ar+1, ar+3], ['a','b','d']]
-    a |    b |    d |
-===== | ==== | ==== |
-2  14 | 3  6 | 6  0 |
-3  21 | 4  7 |      |
-```
-
-
-
-The power of 2D-indexer
------------------------
-
-Overview:
-
-|  |  |
-| ------ | ------ |
-| `.aloc[bool-dios]` | 1. align columns, 2. align rows, 3. just take `True`'s -- [1] |
-| `.aloc[dios, ...]` (use Ellipsis) | 1. align columns, 2. align rows, (3.) ignore values -- [1] |
-[1] evaluate `usebool`-keyword
-
-
-**T_O_D_O**
-
diff --git a/dios/docs/doc_itype.md b/dios/docs/doc_itype.md
deleted file mode 100644
index 0420b2f4645bbbdd192ef998857a4b669286df8d..0000000000000000000000000000000000000000
--- a/dios/docs/doc_itype.md
+++ /dev/null
@@ -1,24 +0,0 @@
-<!--
-SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
-
-SPDX-License-Identifier: GPL-3.0-or-later
--->
-
-Itype
-=====
-
-DictOfSeries holds multiple series, and each series can have a different index length
-and index type. Differing index lengths are either solved by some aligning magic, or simply fail if
-aligning makes no sense (eg. assigning the very same list to series of different lengths; see `.aloc`).
-
-A bigger challenge is the type of the index. 
diff --git a/dios/docs/genindex.rst b/dios/docs/genindex.rst
deleted file mode 100644
index d8bfa27e507b94ecf176d9606856f717b5765859..0000000000000000000000000000000000000000
--- a/dios/docs/genindex.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-.. SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
-..
-.. SPDX-License-Identifier: GPL-3.0-or-later
-
-# dummy file to be able to link to index
-
-Index
-=====
\ No newline at end of file
diff --git a/dios/docs/index.rst b/dios/docs/index.rst
deleted file mode 100644
index 9546d94886c7b14ea7046203560018bbffb7fd44..0000000000000000000000000000000000000000
--- a/dios/docs/index.rst
+++ /dev/null
@@ -1,60 +0,0 @@
-.. SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
-..
-.. SPDX-License-Identifier: GPL-3.0-or-later
-
-.. dios documentation master file, created by
-   sphinx-quickstart on Sun Apr 19 02:36:37 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Dios Docs
-=========
-
-.. currentmodule:: dios
-
-The whole package :mod:`dios` is mainly a container for
-the class :class:`dios.DictOfSeries`. See
-
-.. toctree::
-
-   dios.DictOfSeries <_api/dios.DictOfSeries>
-
-.. toctree::
-   :hidden:
-
-   Repository <https://git.ufz.de/rdm/dios>
-   example DictOfSeries <_api/dios.example_DictOfSeries>
-
-
-Most magic happens in getting and setting elements.
-To select any combination of columns and rows,
-read the documentation about indexing:
-
-.. toctree::
-
-   doc_indexing
-
-.. toctree::
-
-   doc_cookbook
-
-For the idea behind the Itype concept and its usage, read:
-
-.. toctree::
-
-   doc_itype
-
-For the implemented methods and module functions,
-i.e. the full module API, see:
-
-.. toctree::
-   :maxdepth: 2
-
-   dios_api
-
-or browse the Index.
-
-.. toctree::
-
-   genindex
-
diff --git a/dios/docs/make.bat b/dios/docs/make.bat
deleted file mode 100644
index 11b2c629f136ded491dca051ac52ddc62ec68000..0000000000000000000000000000000000000000
--- a/dios/docs/make.bat
+++ /dev/null
@@ -1,39 +0,0 @@
-REM SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
-REM
-REM SPDX-License-Identifier: GPL-3.0-or-later
-
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-    set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-    echo.
-    echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-    echo.installed, then set the SPHINXBUILD environment variable to point
-    echo.to the full path of the 'sphinx-build' executable. Alternatively you
-    echo.may add the Sphinx directory to PATH.
-    echo.
- echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/dios/docs/requirements_sphinx.txt b/dios/docs/requirements_sphinx.txt deleted file mode 100644 index 3c074c1f0388441cd4660f900a7cda4d828cfd7f..0000000000000000000000000000000000000000 --- a/dios/docs/requirements_sphinx.txt +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -alabaster==0.7.12 -Babel==2.8.0 -certifi==2020.6.20 -chardet==3.0.4 -commonmark==0.9.1 -docutils==0.16 -idna==2.10 -imagesize==1.2.0 -importlib-metadata==1.7.0 -Jinja2==2.11.2 -Markdown==3.2.2 -MarkupSafe==1.1.1 -numpy==1.19.1 -packaging==20.4 -pandas==1.1.1 -Pygments==2.6.1 -pyparsing==2.4.7 -python-dateutil==2.8.1 -pytz==2020.1 -recommonmark==0.6.0 -requests==2.24.0 -six==1.15.0 -snowballstemmer==2.0.0 -Sphinx==3.2.1 -sphinx-automodapi==0.12 -sphinx-markdown-tables==0.0.15 -sphinxcontrib-applehelp==1.0.2 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-fulltoc==1.2.0 -sphinxcontrib-htmlhelp==1.0.3 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-serializinghtml==1.1.4 -urllib3==1.25.10 -zipp==3.1.0 diff --git a/dios/profiling/__init__.py b/dios/profiling/__init__.py deleted file mode 100644 index 609612eeb17be4a5bd61dd1ecf45cfab448dd546..0000000000000000000000000000000000000000 --- a/dios/profiling/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from .generate_testsets import * -from .performance import find_index_range, gen_random_timestamps diff --git a/dios/profiling/generate_testsets.py b/dios/profiling/generate_testsets.py deleted file mode 100644 index 42c24c97e7921aa7f7504a01af6862c0f7ae543a..0000000000000000000000000000000000000000 --- a/dios/profiling/generate_testsets.py +++ /dev/null @@ -1,126 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import datetime as dt -import os -import pickle -import time - -import numpy as np -import pandas as pd - -from ..dios import DictOfSeries - -var_prefix = "var" - - -def _gen_testset(rowsz, colsz, freq="1min", disalign=True, randstart=True): - df = pd.DataFrame() - dos = DictOfSeries() - start = dt.datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S") - times = pd.date_range(periods=rowsz, start=start, freq=freq) - - frequ = freq.strip("0123456789") - freqv = int(freq[: -len(frequ)]) - - for i in range(colsz): - if randstart: - # generate random startpoint for each series - r = str(np.random.randint(int(rowsz * 0.05), int(rowsz * 0.6) + 2)) + frequ - st = start + pd.Timedelta(r) - times = pd.date_range(periods=rowsz, start=st, freq=freq) - - if disalign: - if disalign == "random": - r = np.random.randint(1, i + 2) - else: - # total disalign - r = i - times += pd.Timedelta(f"{r}ns") - - d = np.random.randint(1, 9, rowsz) - v = f"var{i}" - tmp = pd.DataFrame(index=times, data=d, columns=[v]) - df = pd.merge(df, tmp, left_index=True, right_index=True, how="outer") - dos[v] = tmp.squeeze().copy() - - return df, dos - - -def get_random_df_and_dios(rowsz, colsz, freq="1min", disalign=True, randstart=True): - df, _, _, dios, *_ = get_testset( - rowsz, colsz, 
freq=freq, disalign=disalign, randstart=randstart - ) - return df, dios - - -def get_testset( - rows, - cols, - freq="1s", - disalign=True, - randstart=True, - storagedir=None, - noresult=False, -): - if storagedir is None: - storagedir = os.path.dirname(__file__) - storagedir = os.path.join(storagedir, "testsets") - - fname = f"set_f{freq}_d{disalign}_r{randstart}_dim{rows}x{cols}.pkl" - fpath = os.path.join(storagedir, fname) - - # try to get pickled data - try: - with open(fpath, "rb") as fh: - if noresult: - return - tup = pickle.load(fh) - - # file/data was present - return tup - except (pickle.UnpicklingError, FileNotFoundError): - pass - - # generate testset(s) - df, dios = _gen_testset( - rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart - ) - df = df.sort_index(axis=0, level=0) - df_type_a = df.copy().stack(dropna=False).sort_index(axis=0, level=0).copy() - df_type_b = df.copy().unstack().sort_index(axis=0, level=0).copy() - tup = df, df_type_a, df_type_b, dios - - # store testsets - with open(fpath, "wb") as fh: - pickle.dump(tup, fh) - - if noresult: - return - - return tup - - -def gen_all(rrange, crange): - for r in rrange: - for c in crange: - print(r, " x ", c) - t0 = time.time() - get_testset(r, c, noresult=True) - t1 = time.time() - print(t1 - t0) - - -if __name__ == "__main__": - # import time - # - # t0 = time.time() - # for i in range(7): - # get_testset(10**i, 10) - # t1 = time.time() - # print(t1-t0) - - rr = [10**r for r in range(1, 6)] - c = range(10, 60, 10) - gen_all(rr, c) diff --git a/dios/profiling/memory.py b/dios/profiling/memory.py deleted file mode 100644 index 3078c654936182266f1dd4be76300cb30eb661bd..0000000000000000000000000000000000000000 --- a/dios/profiling/memory.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import gc - -from .generate_testsets import get_random_df_and_dios - - -def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)): - if shifted: - idxsz = 8 * rows * cols - # additional nans are inserted exactly as many as variables - rowsz = rows * cols * dtypesz - else: - idxsz = 8 * rows - rowsz = rows * dtypesz - - return idxsz + rowsz * cols - - -def bytes2hread(bytes): - i = 0 - units = ["B", "kB", "MB", "GB", "TB"] - while bytes > 1000: - bytes /= 1024 - i += 1 - if i == 4: - break - return bytes, units[i] - - -def rows_by_time(nsec, mdays): - """calc the number of values for one value every n seconds in m days - :param nsec: n seconds a value occur - :param mdays: this many days of data - :return: rows thats needed - """ - return int((60 / nsec) * 60 * 24 * mdays) - - -if __name__ == "__main__": - # dios - linear in rows and colums, same size for r=10,c=100 or r=100,c=10 - do_real_check = True - cols = 10 - rows = 100000 - # rows = rows_by_time(nsec=600, mdays=365*2) - - mem = calc_mem(rows, cols, shifted=False) - memsh = calc_mem(rows, cols, shifted=True) - - df, dios = get_random_df_and_dios(rows, cols, disalign=False, randstart=True) - dios_mem = dios.memory_usage() - print(f"dios:\n-----------") - print("mem: ", *bytes2hread(dios_mem)) - print("entries:", sum([len(dios[e]) for e in dios])) - print() - - ratio = (1 / (memsh - mem)) * dios_mem - - mem = bytes2hread(mem) - memsh = bytes2hread(memsh) - - print("df - best case\n---------") - print("mem: ", *mem) - print("entries:", rows) - print() - print("df - worst case\n---------") - print("mem :", *memsh) - print("entries:", rows * cols) - - 
print() - print(f"dfbest, dios, dfworst: 0%, {round(ratio, 4)*100}%, 100% ") - - if not do_real_check: - exit(0) - - proveMeRight = False - - if proveMeRight: - # best case - print() - print("best case proove") - dfb, _ = get_random_df_and_dios(rows, cols, disalign=False, randstart=False) - dfb.info(memory_usage="deep", verbose=False) - - print() - print("rand start, same freq") - df.info(memory_usage="deep", verbose=False) - print("entries:", sum([len(df[e]) for e in df])) - - print() - print("rand start, rand freq") - df, _ = get_random_df_and_dios(rows, cols, disalign="random", randstart=True) - df.info(memory_usage="deep", verbose=False) - print("entries:", sum([len(df[e]) for e in df])) - - if proveMeRight: - # worst case - print() - print("worst case proove") - df, _ = get_random_df_and_dios(rows, cols, disalign=True, randstart=False) - df.info(memory_usage="deep", verbose=False) - - gc.collect() diff --git a/dios/profiling/performance.py b/dios/profiling/performance.py deleted file mode 100644 index 19e95c950ef918d81edc1bd25d47be3ba6ce24ee..0000000000000000000000000000000000000000 --- a/dios/profiling/performance.py +++ /dev/null @@ -1,210 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import time - -import numpy as np -import pandas as pd - -from .generate_testsets import get_testset, var_prefix - -profile_assignment = False - -idx = pd.IndexSlice -rows = 0 - -fir = ["var", "ts", "ass"] -sec = ["df", "a", "b", "dios"] -timingsdf = pd.DataFrame(columns=pd.MultiIndex.from_product([fir, sec])) - - -def df_timmings(df, t0, t1, v1, v2): - _t0 = time.time() - a = df.loc[t0:t1, :] - _t1 = time.time() - b = df.loc[:, v1] - _t2 = time.time() - if profile_assignment: - df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111 - _t3 = time.time() - - timingsdf.at[rows, ("ts", "df")] += _t1 - _t0 - timingsdf.at[rows, ("var", "df")] += _t2 - _t1 - timingsdf.at[rows, ("ass", "df")] += _t3 - _t2 - return a, b, df - - -def a_timings(df, t0, t1, v1, v2): - _t0 = time.time() - a = df.loc[t0:t1, :] - _t1 = time.time() - b = df.loc[:, v1] - _t2 = time.time() - if profile_assignment: - df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111 - _t3 = time.time() - - timingsdf.at[rows, ("ts", "a")] += _t1 - _t0 - timingsdf.at[rows, ("var", "a")] += _t2 - _t1 - timingsdf.at[rows, ("ass", "a")] += _t3 - _t2 - return a, b, df - - -def b_timings(df, t0, t1, v1, v2): - _t0 = time.time() - a = df.loc[:, t0:t1] - _t1 = time.time() - b = df.loc[v1, :] - _t2 = time.time() - if profile_assignment: - df.loc[v1, t0:t1] = df.loc[v1, t0:t1] * 1111 - _t3 = time.time() - - timingsdf.at[rows, ("ts", "b")] += _t1 - _t0 - timingsdf.at[rows, ("var", "b")] += _t2 - _t1 - timingsdf.at[rows, ("ass", "b")] += _t3 - _t2 - return a, b, df - - -def dios_timings(dios, t0, t1, v1, v2): - _t0 = time.time() - a = dios.loc[t0:t1, :] - _t1 = time.time() - b = dios.loc[:, v1] - _t2 = time.time() - if profile_assignment: - dios.loc[t0:t1, v1] = dios.loc[t0:t1, v1] * 1111 - _t3 = time.time() - - timingsdf.at[rows, ("ts", "dios")] += _t1 - _t0 - timingsdf.at[rows, ("var", "dios")] += _t2 - _t1 - timingsdf.at[rows, ("ass", "dios")] += _t3 - _t2 - return a, b, dios - - -def gen_random_timestamps(m, M): - r = (M - m) * (np.random.randint(10, 90) + np.random.random()) * 0.01 - a, b = m + r, M - r - return min(a, b), max(a, b) - - -def find_index_range(obj): - min_ = None - max_ = None - for r in obj: - m = obj[r].index.min() - M = obj[r].index.max() - try: - min_ = min(min_, m) 
- max_ = max(max_, M) - except TypeError: - min_ = m - max_ = M - return min_, max_ - - -if __name__ == "__main__": - import matplotlib.pyplot as plt - - # do not touch - rows = 1 - - # max increase of of rows - # 1 = 10 # 2 = 100 # .... # 5 = 100'000 - iterations = 5 - runs = 1 - cols = 10 - - profile_assignment = True - - # which to calc and plot - use_df = False - use_a = True - use_b = True - use_dios = True - - # plot options - normalize_to_df = True - plot_xlog = True - plot_ylog = True - - # ######################## - - v1 = "var1" - v2 = "var2" - for i in range(iterations): - rows *= 10 - - timingsdf.loc[rows] = (0,) * len(timingsdf.columns) - - df, a, b, dios = get_testset(rows, cols) - t0, t4 = find_index_range(df) - - if use_df or normalize_to_df: - for r in range(runs): - t1, t2 = gen_random_timestamps(t0, t4) - vr1 = var_prefix + str(np.random.randint(0, cols)) - df_timmings(df, t1, t2, vr1, None) - - if use_a: - for r in range(runs): - t1, t2 = gen_random_timestamps(t0, t4) - vr1 = var_prefix + str(np.random.randint(0, cols)) - a_timings(a, t1, t2, vr1, None) - - if use_b: - for r in range(runs): - t1, t2 = gen_random_timestamps(t0, t4) - vr1 = var_prefix + str(np.random.randint(0, cols)) - b_timings(b, t1, t2, vr1, None) - - if use_dios: - for r in range(runs): - t1, t2 = gen_random_timestamps(t0, t4) - vr1 = var_prefix + str(np.random.randint(0, cols)) - dios_timings(dios, t1, t2, vr1, None) - - # calc the average - timingsdf /= runs - - pd.set_option("display.max_columns", 100) - - df = timingsdf - if not profile_assignment: - df.drop(labels="ass", axis=1, level=0, inplace=True) - print("timings:") - print(df) - df = df.swaplevel(axis=1) - if normalize_to_df: - a = df.loc[:, "a"] / df.loc[:, "df"] - b = df.loc[:, "b"] / df.loc[:, "df"] - c = df.loc[:, "df"] / df.loc[:, "df"] - d = df.loc[:, "dios"] / df.loc[:, "df"] - df.loc[:, "a"] = a.values - df.loc[:, "b"] = b.values - df.loc[:, "df"] = c.values - df.loc[:, "dios"] = d.values - all = df.copy() - all.swaplevel(axis=1) - print("\n\ndiff:") - print(all) - - a = df.loc[:, ("a", slice(None))] - b = df.loc[:, ("b", slice(None))] - dios = df.loc[:, ("dios", slice(None))] - df = df.loc[:, ("df", slice(None))] - - ax = plt.gca() - ax.set_title(f"avg of: {runs} runs, columns: {cols}") - - if use_df: - df.plot(logy=plot_ylog, logx=plot_xlog, linestyle="-", ax=ax) - if use_a: - a.plot(logy=plot_ylog, logx=plot_xlog, linestyle="--", ax=ax) - if use_b: - b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=":", ax=ax) - if use_dios: - dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle="-.", ax=ax) - - plt.show() diff --git a/dios/profiling/testsets/.gitignore b/dios/profiling/testsets/.gitignore deleted file mode 100644 index 0bed86a4dfbb49bfbf843bc5b6485250dc8c93c5..0000000000000000000000000000000000000000 --- a/dios/profiling/testsets/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -# ignore all -* - -# except ourself, to ensure the `testsets`-dir isn't ignored -!.gitignore \ No newline at end of file diff --git a/dios/requirements.txt b/dios/requirements.txt deleted file mode 100644 index dfd11d04b95a848f21759d79f50b716ef03860ab..0000000000000000000000000000000000000000 --- a/dios/requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -numpy==1.21.2 -pandas==1.3.5 -python-dateutil==2.8.2 
-pytz==2022.7.1 -six==1.16.0 diff --git a/dios/setup.py b/dios/setup.py deleted file mode 100644 index 2213f2088fa537fd577181cbc2802172844163d3..0000000000000000000000000000000000000000 --- a/dios/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import subprocess - -from setuptools import find_packages, setup - -with open("Readme.md", "r") as fh: - long_description = fh.read() - -cmd = "git describe --tags --always --dirty" -version = ( - subprocess.run(cmd, shell=True, check=False, stdout=subprocess.PIPE) - .stdout.decode() - .strip() -) -print(f"git version: {version}") -# if '-dirty' in version: -# print("Do not make a version from a dirty repro. Exiting now") -# exit(1) -txt = "enter version\n>" -version = input(txt) - -setup( - name="dios", - version=version, - author="Bert Palm", - author_email="bert.palm@ufz.de", - description="Dictionary of Series - a kind of pandas extension", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://git.ufz.de/rdm/dios", - packages=["dios"], - install_requires=[ - "pandas", - ], - license="GPLv3", -) diff --git a/dios/test/__init__.py b/dios/test/__init__.py deleted file mode 100644 index 38bfac50f92ecb8595a96b7c3c51fad6292b3efb..0000000000000000000000000000000000000000 --- a/dios/test/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from .test_setup import * diff --git a/dios/test/run_dios.py b/dios/test/run_dios.py deleted file mode 100644 index 6fc299dd61783fa37767e6ae0ccf16a0d698a52b..0000000000000000000000000000000000000000 --- a/dios/test/run_dios.py +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from builtins import range - -import numpy as np - -from dios import * - -if __name__ == "__main__": - dios_options[OptsFields.mixed_itype_warn_policy] = Opts.itype_warn - print(dios_options) - - df = pd.DataFrame(columns=range(1000)) - pd.Series() - # print(df) - # exit(99) - - # dios_options[OptsFields.disp_max_cols] = 5 - # dios_options[OptsFields.disp_max_rows] = 100 - dios_options[OptsFields.disp_min_rows] = 50 - # dios_options[OptsFields.dios_repr] = Opts.repr_aligned - - n = 10 - d = DictOfSeries( - dict( - l=pd.Series(0, index=range(0, 30)), - # i123=pd.Series(dtype='O'), - a=pd.Series(1, index=range(0, n)), - nan=pd.Series(np.nan, index=range(3, n + 3)), - b=pd.Series(2, index=range(0, n * 2, 2)), - c=pd.Series(3, index=range(n, n * 2)), - d=pd.Series(4, index=range(-n // 2, n // 2)), - # z=pd.Series([1, 2, 3], index=list("abc")) - ) - ) - - def f(s): - sec = 10**9 - s.index = pd.to_datetime(s.index * sec) - return s - - dd = d.apply(f) - print(d) - - # print(d.to_df()) - # print(pd.options.display.max_rows) - # print(d.to_str(col_delim=' | ', col_space=20, header_delim='0123456789')) - # print(d.to_str(col_delim=' | ', col_space=20, max_cols=4 )) - di = DictOfSeries(columns=[]) - print(di) - # print(DictOfSeries(data=1, columns=['a'])) diff --git a/dios/test/test__ops__.py b/dios/test/test__ops__.py deleted file mode 100644 index dede30afba1dcfca3b5f39edebe28f12029f72ef..0000000000000000000000000000000000000000 --- a/dios/test/test__ops__.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python - -# SPDX-FileCopyrightText: 2021 
Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import pytest - -from .test_setup import * - -__author__ = "Bert Palm" -__email__ = "bert.palm@ufz.de" -__copyright__ = "Copyright 2018, Helmholtz-Centrum für Umweltforschung GmbH - UFC" - - -@pytest.mark.parametrize("left", diosFromMatr(DATA_ALIGNED)) -@pytest.mark.parametrize("right", diosFromMatr(DATA_ALIGNED)) -def test__eq__(left, right): - a, b = left, right - _test = a == b - for c in _test: - for i in _test[c].index: - res = (_test[c])[i] - e1 = a[c][i] - e2 = b[c][i] - exp = e1 == e2 - assert res == exp - - -@pytest.mark.filterwarnings( - "ignore: invalid value encountered in .*_scalars", category=RuntimeWarning -) -@pytest.mark.filterwarnings( - "ignore: divide by zero encountered in .*_scalars", category=RuntimeWarning -) -@pytest.mark.parametrize("left", diosFromMatr(DATA_ALIGNED)) -@pytest.mark.parametrize("right", diosFromMatr(DATA_ALIGNED)) -@pytest.mark.parametrize("op", OP2) -def test__op2__aligningops(left, right, op): - a, b = left, right - test = op(a, b) - for c in test: - for j in test[c].index: - exp = op(a[c][j], b[c][j]) - res = test[c][j] - if not np.isfinite(res): - print(f"\n\n{a[c][j]} {OP_MAP[op]} {b[c][j]}") - print(f"\nres: {res}, exp:{exp}, op: {OP_MAP[op]}") - pytest.skip("test not support non-finite values") - return - assert res == exp - - -@pytest.mark.filterwarnings( - "ignore: invalid value encountered in .*_scalars", category=RuntimeWarning -) -@pytest.mark.filterwarnings( - "ignore: divide by zero encountered in .*_scalars", category=RuntimeWarning -) -@pytest.mark.parametrize("left", diosFromMatr(DATA_UNALIGNED)) -@pytest.mark.parametrize("right", diosFromMatr(DATA_UNALIGNED)) -@pytest.mark.parametrize("op", OPNOCOMP) -def test__op2__UNaligningops(left, right, op): - try: - a, b = left, right - test = op(a, b) - for c in test: - for j in test[c].index: - exp = op(a[c][j], b[c][j]) - res = test[c][j] - if not np.isfinite(res): - print(f"\n\n{a[c][j]} {OP_MAP[op]} {b[c][j]}") - print(f"\nres: {res}, exp:{exp}, op: {OP_MAP[op]}") - pytest.skip("test not support non-finite values") - return - assert res == exp - except ZeroDivisionError: - pytest.skip("ZeroDivisionError") - - -@pytest.mark.parametrize("data", diosFromMatr(ALL)) -@pytest.mark.parametrize("op", OP1) -def test__op1__(data, op): - test = op(data) - res = [entry for col in test for entry in test[col]] - e = [entry for col in data for entry in data[col]] - for i in range(len(res)): - exp = op(e[i]) - assert res[i] == exp diff --git a/dios/test/test__setget__.py b/dios/test/test__setget__.py deleted file mode 100644 index 21e5d2175e5c3a178a92a58500495b5ab39eb15b..0000000000000000000000000000000000000000 --- a/dios/test/test__setget__.py +++ /dev/null @@ -1,61 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from pandas.core.dtypes.common import is_scalar - -from .test_setup import * - - -@pytest.mark.parametrize(("idxer", "exp"), [("a", s1), ("c", s3)]) -def test__getitem_single(dios_aligned, idxer, exp): - di = dios_aligned[idxer] - assert isinstance(di, pd.Series) - assert (di == exp).all() - - -@pytest.mark.parametrize( - "idxer", - [ - "x", - "2", - 1000, - None, - ], -) -def test__getitem_single_fail(dios_aligned, idxer): - with pytest.raises((KeyError, ValueError)): - di = dios_aligned[idxer] - - -@pytest.mark.parametrize("idxer", BASIC_INDEXER) -def test__getitem_(dios_aligned, idxer): - di = 
dios_aligned[idxer] - - assert isinstance(di, DictOfSeries) - - -@pytest.mark.parametrize("idxer", BASIC_INDEXER_FAIL) -def test__getitem_fail(dios_aligned, idxer): - with pytest.raises((ValueError, KeyError)): - dios_aligned[idxer] - - -@pytest.mark.parametrize( - ("idxer", "exp"), - [ - (slice(None), [s1 == s1, s2 == s2, s3 == s3, s4 == s4]), - (dios_aligned__() > 5, [s1 > 5, s2 > 5, s3 > 5, s4 > 5]), - ], -) -def test__setitem_single(dios_aligned, idxer, exp): - di = dios_aligned - di[idxer] = 99 - for i, c in enumerate(di): - assert ((di[c] == 99) == exp[i]).all() - - -@pytest.mark.parametrize("idxer", BASIC_INDEXER_FAIL) -def test__setitem__fail(dios_aligned, idxer): - with pytest.raises((ValueError, KeyError, IndexError)): - dios_aligned[idxer] = 99 diff --git a/dios/test/test__setget__aloc.py b/dios/test/test__setget__aloc.py deleted file mode 100644 index 0b9548d74ef5d8bc86c22c8f9b752628df216c51..0000000000000000000000000000000000000000 --- a/dios/test/test__setget__aloc.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from pandas.core.dtypes.common import is_scalar - -from .test_setup import * - -pytestmark = pytest.mark.skip - - -@pytest.mark.parametrize( - ("idxer", "exp"), [("a", s1), ("c", s3), ("x", pd.Series(dtype=float))] -) -def test__getitem_aloc_singleCol(dios_aligned, idxer, exp): - di = dios_aligned.aloc[:, idxer] - assert isinstance(di, pd.Series) - assert (di == exp).all() - - -@pytest.mark.parametrize(("idxer", "exp"), [((1, "a"), s1), ((3, "c"), s3)]) -def test__getitem_aloc_singleRow_singleCol(dios_aligned, idxer, exp): - di = dios_aligned.aloc[idxer] - assert is_scalar(di) - assert di == exp.loc[idxer[0]] - - -@pytest.mark.parametrize("idxerL", R_LOC_INDEXER) -@pytest.mark.parametrize("idxerR", C_LOC_INDEXER) -def test__getitem__aloc(dios_aligned, idxerL, idxerR): - di = dios_aligned.copy().aloc[idxerL, idxerR] - exp = dios_aligned.copy().loc[idxerL, idxerR] - assert isinstance(di, DictOfSeries) - assert (di == exp).all(None) - - -# ############################# -# __SETITEM__ - - -@pytest.mark.parametrize( - ("idxer", "exp"), - [ - (slice(None), [s1 == s1, s2 == s2, s3 == s3, s4 == s4]), - (C_BLIST, [s1 == s1, s2 != s2, s3 != s3, s4 == s4]), - ], -) -def test__setitem_aloc_singleCol(dios_aligned, idxer, exp): - di = dios_aligned.copy() - di.aloc[:, idxer] = 99 - for i, c in enumerate(di): - assert ((di[c] == 99) == exp[i]).all() - - -VALS = [ - 99, - pd.Series(range(4, 10), index=range(4, 10)), -] - - -@pytest.mark.parametrize("idxerL", R_LOC_INDEXER) -@pytest.mark.parametrize("idxerR", C_LOC_INDEXER) -@pytest.mark.parametrize("val", VALS) -def test__setitem__aloc(dios_aligned, idxerL, idxerR, val): - di = dios_aligned.copy() - di.aloc[idxerL, idxerR] = val - exp = dios_aligned.copy() - di.loc[idxerL, idxerR] = val - assert isinstance(di, DictOfSeries) - assert (di == exp).all(None) diff --git a/dios/test/test__setget__iloc.py b/dios/test/test__setget__iloc.py deleted file mode 100644 index 7fcad70fe51fe6efe656dc431c1eec645ca0ca80..0000000000000000000000000000000000000000 --- a/dios/test/test__setget__iloc.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from pandas.core.dtypes.common import is_scalar - -from .test_setup import * - - -@pytest.mark.parametrize( - ("idxer", "exp"), - [(0, s1), (1, s2), (2, s3), (3, s4), (-1, s4), (-2, 
s3), (-3, s2), (-4, s1)], -) -def test__getitem_single_iloc(dios_aligned, idxer, exp): - di = dios_aligned.iloc[:, idxer] - assert isinstance(di, pd.Series) - assert (di == exp).all() - - -@pytest.mark.parametrize( - ("idxer", "exp"), [((1, 0), s1), ((3, -2), s3), ((-1, -1), s4)] -) -def test__getitem_scalar_iloc(dios_aligned, idxer, exp): - di = dios_aligned.iloc[idxer] - assert is_scalar(di) - assert di == exp.iloc[idxer[0]] - - -@pytest.mark.parametrize( - "idxer", - [ - -5, - 99, - "a", - "2", - None, - ], -) -def test__getitem_single_iloc_fail(dios_aligned, idxer): - with pytest.raises((KeyError, IndexError, TypeError)): - di = dios_aligned.iloc[:, idxer] - - -# ############################# -# __SETITEM__ diff --git a/dios/test/test__setget__loc.py b/dios/test/test__setget__loc.py deleted file mode 100644 index 3f5646f9b06cf3cc30cb921188d0b9bfee450ae7..0000000000000000000000000000000000000000 --- a/dios/test/test__setget__loc.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from pandas.core.dtypes.common import is_scalar - -from .test_setup import * - - -@pytest.mark.parametrize(("idxer", "exp"), [("a", s1), ("c", s3)]) -def test__getitem_loc_singleCol(dios_aligned, idxer, exp): - di = dios_aligned.loc[:, idxer] - assert isinstance(di, pd.Series) - assert (di == exp).all() - - -@pytest.mark.parametrize(("idxer", "exp"), [((1, "a"), s1), ((3, "c"), s3)]) -def test__getitem_loc_singleRow_singleCol(dios_aligned, idxer, exp): - di = dios_aligned.loc[idxer] - assert is_scalar(di) - assert di == exp.loc[idxer[0]] - - -@pytest.mark.parametrize( - "idxer", - [ - "x", - "2", - 1, - None, - ], -) -def test__getitem_loc_singleCol_fail(dios_aligned, idxer): - with pytest.raises((KeyError, TypeError)): - di = dios_aligned.loc[:, idxer] - - -# ############################# -# __SETITEM__ - - -@pytest.mark.parametrize( - ("idxer", "exp"), - [ - (slice(None), [s1 == s1, s2 == s2, s3 == s3, s4 == s4]), - (C_BLIST, [s1 == s1, s2 != s2, s3 != s3, s4 == s4]), - ], -) -def test__setitem_loc_singleCol(dios_aligned, idxer, exp): - di = dios_aligned.copy() - di.loc[:, idxer] = 99 - for i, c in enumerate(di): - assert ((di[c] == 99) == exp[i]).all() - - -VALS = [ - 99, -] - - -@pytest.mark.parametrize("idxerL", R_LOC_INDEXER) -@pytest.mark.parametrize("idxerR", C_LOC_INDEXER) -@pytest.mark.parametrize("val", VALS) -def test__setitem__loc(dios_aligned, idxerL, idxerR, val): - di = dios_aligned.copy() - di.loc[idxerL, idxerR] = val - assert isinstance(di, DictOfSeries) diff --git a/dios/test/test__setitem__.py b/dios/test/test__setitem__.py deleted file mode 100644 index 1575768e321d6137da0edceb04c409949a188b5b..0000000000000000000000000000000000000000 --- a/dios/test/test__setitem__.py +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import pytest - -from .test_setup import * - -# -# s1 = pd.Series(range(10), index=range(10)) -# s2 = pd.Series(range(5, 10), index=range(5, 10)) -# s3 = pd.Series(range(1, 30, 2), index=range(1, 30, 2)) -# s4 = pd.Series(np.linspace(7, 13, 9), index=range(3, 12)) -# s1.name, s2.name, s3.name, s4.name = 'a', 'b', 'c', 'd' -# d1 = DictOfSeries(data=dict(a=s1.copy(), b=s2.copy(), c=s3.copy(), d=s4.copy())) -# -# blist = [True, False, False, True] -# b = pd.Series([True, False] * 5, index=[1, 2, 3, 4, 5] + [6, 8, 10, 12, 14]) -# B = d1 > 5 -# -# -# -# -# 
BLIST = [True, False, False, True] -# -# LISTIDXER = [['a'], ['a', 'c'], pd.Series(['a', 'c'])] -# BOOLIDXER = [pd.Series(BLIST), d1.copy() > 10] -# SLICEIDXER = [slice(None), slice(-3, -1), slice(-1, 3), slice(None, None, 3)] -# MULTIIDXER = [] # [d1 > 9, d1 != d1, d1 == d1] -# EMPTYIDEXER = [[], pd.Series(), slice(3, 3), slice(3, -1), DictOfSeries()] -# -# INDEXERS = LISTIDXER + BOOLIDXER + SLICEIDXER + MULTIIDXER + EMPTYIDEXER -# -# diff --git a/dios/test/test_dflike.py b/dios/test/test_dflike.py deleted file mode 100644 index 445b4e0785adb32cf3289c8c4d2441e0a4ff2f1a..0000000000000000000000000000000000000000 --- a/dios/test/test_dflike.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from copy import deepcopy - -import numpy as np -import pandas as pd -import pytest -from pandas.core.dtypes.common import is_dict_like, is_nested_list_like - -from .test_setup import * - -__author__ = "Bert Palm" -__email__ = "bert.palm@ufz.de" -__copyright__ = "Copyright 2018, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ" - - -arr = np.random.rand(8) -TESTDATA = [ - None, # empty # 0 - [1], # 1 - arr.copy(), # 2 - np.array([arr.copy(), arr.copy(), arr.copy()]), # 3 - nested list - range(4), # 4 - dict(a=arr.copy(), b=arr.copy()), # 5 dict - pd.DataFrame(dict(a=arr.copy(), b=arr.copy())), # 6 df -] - - -@pytest.mark.parametrize("data", TESTDATA) -@pytest.mark.parametrize("with_column_param", [False, True]) -def test_dios_create(data, with_column_param): - data_copy0 = deepcopy(data) - data_copy1 = deepcopy(data) - - # create columns list - if with_column_param: - df = pd.DataFrame(data=data_copy0) - col = [f"new_{c}" for c in df] - else: - col = None - - if is_nested_list_like(data): - # giving nested lists, work different between df and dios - data_copy1 = data_copy1.transpose() - - df = pd.DataFrame(data=data_copy0, columns=col) - dios = DictOfSeries(data=data_copy1, columns=col) - - assert dios.columns.equals(df.columns) - - eq, msg = dios_eq_df(dios, df, with_msg=True) - assert eq, msg diff --git a/dios/test/test_dflike__setget__.py b/dios/test/test_dflike__setget__.py deleted file mode 100644 index 7388d58e0dee1a5cd7f3861f371304b320d3311d..0000000000000000000000000000000000000000 --- a/dios/test/test_dflike__setget__.py +++ /dev/null @@ -1,97 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import pytest - -from .test_setup import * - - -def _test(res, exp): - if isinstance(exp, pd.DataFrame): - eq, msg = dios_eq_df(res, exp, with_msg=True) - assert eq, msg - - else: - assert type(exp) == type(res) - - if isinstance(exp, pd.Series): - eq, msg = diosSeries_eq_dfSeries(res, exp, with_msg=True) - assert eq, msg - - # scalars - else: - assert res == exp - - -@pytest.mark.parametrize("idxer", BASIC_INDEXER) -def test_dflike__get__(df_aligned, dios_aligned, idxer): - print(idxer) - exp = df_aligned[idxer] - res = dios_aligned[idxer] - _test(res, exp) - - -@pytest.mark.parametrize("locR", R_LOC_INDEXER) -@pytest.mark.parametrize("locC", C_LOC_INDEXER) -def test_dflike__get_loc__(df_aligned, dios_aligned, locR, locC): - print(locR) - print(locC) - exp = df_aligned.loc[locR, locC] - res = dios_aligned.loc[locR, locC] - _test(res, exp) - - -@pytest.mark.parametrize("ilocR", R_iLOC_INDEXER) -@pytest.mark.parametrize("ilocC", C_iLOC_INDEXER) -def 
test_dflike__get_iloc__(df_aligned, dios_aligned, ilocR, ilocC): - print(ilocR) - print(ilocC) - exp = df_aligned.iloc[ilocR, ilocC] - res = dios_aligned.iloc[ilocR, ilocC] - _test(res, exp) - - -VALS = [ - 99, -] - - -@pytest.mark.parametrize("idxer", BASIC_INDEXER) -@pytest.mark.parametrize("val", VALS) -def test_dflike__set__(df_aligned, dios_aligned, idxer, val): - print(idxer) - exp = df_aligned - res = dios_aligned - # NOTE: two test fail, pandas bul***it - # df[:2] -> select 2 rows - # df[:2]=99 -> set 3 rows, WTF ??? - exp[idxer] = val - res[idxer] = val - _test(res, exp) - - -@pytest.mark.parametrize("locR", R_LOC_INDEXER) -@pytest.mark.parametrize("locC", C_LOC_INDEXER) -@pytest.mark.parametrize("val", VALS) -def test_dflike__set_loc__(df_aligned, dios_aligned, locR, locC, val): - print(locR) - print(locC) - exp = df_aligned - res = dios_aligned - exp.loc[locR, locC] = val - res.loc[locR, locC] = val - _test(res, exp) - - -@pytest.mark.parametrize("ilocR", R_iLOC_INDEXER) -@pytest.mark.parametrize("ilocC", C_iLOC_INDEXER) -@pytest.mark.parametrize("val", VALS) -def test_dflike__set_iloc__(df_aligned, dios_aligned, ilocR, ilocC, val): - print(ilocR) - print(ilocC) - exp = df_aligned - res = dios_aligned - exp.iloc[ilocR, ilocC] = val - res.iloc[ilocR, ilocC] = val - _test(res, exp) diff --git a/dios/test/test_magic_methods.py b/dios/test/test_magic_methods.py deleted file mode 100644 index 33c80c714b28369f896e258cd94edd0ac3e4a79f..0000000000000000000000000000000000000000 --- a/dios/test/test_magic_methods.py +++ /dev/null @@ -1,20 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from .test_setup import * - - -def test__len__(datetime_series, maxlen=10): - dios = DictOfSeries() - assert len(dios) == 0 - - for i in range(maxlen): - dios[f"c{i}"] = datetime_series.copy() - assert len(dios) == i + 1 - - for i in reversed(range(maxlen)): - assert len(dios) == i + 1 - del dios[f"c{i}"] - - assert len(dios) == 0 diff --git a/dios/test/test_methods.py b/dios/test/test_methods.py deleted file mode 100644 index d1ebcd85b4050b6adb7e92318798b8161e55321d..0000000000000000000000000000000000000000 --- a/dios/test/test_methods.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -from .test_setup import * - - -def test_copy_copy_empty(dios_aligned): - di = dios_aligned - shallow = di.copy(deep=False) - deep = di.copy(deep=True) - empty_w_cols = di.copy_empty(columns=True) - empty_no_cols = di.copy_empty(columns=False) - - assert di is not shallow - assert di is not deep - assert di is not empty_w_cols - assert di is not empty_no_cols - - for attr in [ - "itype", - "_itype", - "_policy", - ]: - dios_attr = getattr(di, attr) - for cop in [shallow, deep, empty_w_cols, empty_no_cols]: - copy_attr = getattr(cop, attr) - assert dios_attr == copy_attr - - assert di.columns.equals(shallow.columns) - assert di.columns.equals(deep.columns) - assert di.columns.equals(empty_w_cols.columns) - assert not di.columns.equals(empty_no_cols.columns) - - for i in di: - assert di._data[i].index is shallow._data[i].index - assert di._data[i].index is not deep._data[i].index - di._data[i][0] = 999999 - assert di[i][0] == shallow[i][0] - assert di[i][0] != deep[i][0] - - -@pytest.mark.parametrize("left", diosFromMatr(DATA_UNALIGNED)) -# we use comp ops just to get some noise in the data -@pytest.mark.parametrize("op", 
OPCOMP) -def test_all(left, op): - a = left - ser = (op(a, a)).all() - assert isinstance(ser, pd.Series) - res = [e for e in ser] - exp = [op(a[col], a[col]) for col in a] - for i in range(len(res)): - assert isinstance(exp[i], pd.Series) - assert (res[i] == exp[i]).all() diff --git a/dios/test/test_setup.py b/dios/test/test_setup.py deleted file mode 100644 index 103afa1a4aede9e7a2b4c6962b799a4ee3384a66..0000000000000000000000000000000000000000 --- a/dios/test/test_setup.py +++ /dev/null @@ -1,346 +0,0 @@ -# SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import pytest -from numpy.random import randint - -from dios import * - -try: - from dios.operators import ( - _OP1_MAP, - _OP2_ARITH_MAP, - _OP2_BOOL_MAP, - _OP2_COMP_MAP, - _OP2_DIV_MAP, - OP_MAP, - ) -except ModuleNotFoundError: - from dios.dios.operators import ( - OP_MAP, - _OP1_MAP, - _OP2_DIV_MAP, - _OP2_ARITH_MAP, - _OP2_BOOL_MAP, - _OP2_COMP_MAP, - ) - -import numpy as np -import pandas as pd - -a = pd.Series(range(0, 70, 7), dtype=int) -b = pd.Series(range(5, 15, 1), dtype=int) -c = pd.Series(range(7, 107, 10), dtype=int) -d = pd.Series(range(0, 10, 1), dtype=int) - -s1, s2, s3, s4 = a, b, c, d - - -def df_aligned__(): - return pd.DataFrame( - dict( - a=a.copy(), - b=b.copy(), - c=c.copy(), - d=d.copy(), - ) - ) - - -def dios_aligned__(): - return DictOfSeries( - dict( - a=a.copy(), - b=b.copy(), - c=c.copy(), - d=d.copy(), - ) - ) - - -def dios_unaligned__(): - di = dios_aligned__().copy() - for i, s in enumerate(di._data): - s.index = s.index + i * 2 - return di - - -def df_unaligned__(): - return dios_unaligned__().to_df() - - -def dios_fuzzy__(nr_cols=None, mincol=0, maxcol=10, itype=None): - nr_of_cols = nr_cols if nr_cols else randint(mincol, maxcol + 1) - - ns = 10**9 - sec_per_year = 31536000 - - ITYPES = [IntItype, FloatItype, DtItype, ObjItype] - if itype is not None: - itype = get_itype(itype) - else: - itype = ITYPES[randint(0, len(ITYPES))] - - if itype == IntItype: - f = lambda i: pd.Int64Index(i.astype(int)).unique() - elif itype == FloatItype: - f = lambda i: pd.Float64Index(i).unique() - elif itype == ObjItype: - f = lambda i: pd.Index(i.astype(int)).unique().astype(str) + "_str" - else: # itype == DtItype: - f = lambda i: pd.to_datetime(i.astype(int) * ns) + pd.Timedelta("30Y") - - di = DictOfSeries(itype=itype) - for i in range(nr_of_cols): - start = randint(0, sec_per_year) - end = start + randint(0, sec_per_year) - if end > sec_per_year: - start, end = end - sec_per_year, start - - base = randint(0, 10 + 1) - exp = randint(1, int(np.log10(end - start + 100))) - periods = base ** randint(1, exp + 1) - index = np.linspace(start, end, periods) - index = f(index) - - arr = randint(0, 10, len(index)) - di[f"c{i}"] = pd.Series(data=arr, dtype=float, index=index) - - return di - - -@pytest.fixture -def dios_fuzzy(): - return dios_fuzzy__().copy() - - -@pytest.fixture -def df_aligned(): - return df_aligned__().copy() - - -@pytest.fixture -def dios_aligned(): - return dios_aligned__().copy() - - -@pytest.fixture -def df_unaligned(): - return df_unaligned__().copy() - - -@pytest.fixture -def dios_unaligned(): - return dios_unaligned__().copy() - - -def diosSeries_eq_dfSeries( - df_s, di_s, with_msg=False, df_s_name="di_s", di_s_name="df_s" -): - def fail(msg): - if with_msg: - return False, msg - return False - - assert isinstance(df_s, pd.Series) - assert isinstance(di_s, pd.Series) - - if df_s.empty and not di_s.empty: - return fail( - 
f"value mismatch: " f"{df_s_name} is missing, but " f"{di_s_name} == {di_s}" - ) - - idiff = di_s.index.difference(df_s.index) - if not idiff.empty: - return fail( - f"index mismatch: " - f"{di_s_name}.index: {di_s.index.to_list()}, " - f"{df_s_name}.index: {df_s.index.to_list()}, " - f"diff: {idiff.to_list()}" - ) - - # compare series - for i in df_s.index: - exp = df_s.loc[i] - - # Normally df-nans, from selecting are just not present values - # in a dios. But if a Nan was inserted in dios on purpose, it is - # a valid value, so we try to access the value first. - try: - val = di_s.loc[i] - except KeyError: - # nan in df, missing in dios -> OK - if np.isnan(exp): - continue - - # valid val in df, missing in dios -> FAIL - else: - return fail( - f"value mismatch: " - f"{di_s_name}.loc[{i}] == {exp}, but " - f"{df_s_name}.loc[{i}] does not exist" - ) - - # inf = np.isinf(exp) and np.isinf(val) - # sig = np.sign(exp) == np.sign(val) - # eq_nan = np.isnan(exp) and np.isnan(val) - # eq_inf = inf and sig - # eq_vals = exp == val - # eq = eq_nan or eq_inf or eq_vals - eq = np.equal(val, exp) - assert np.isscalar(eq) - - if not eq: - return fail( - f"value mismatch: " - f"{di_s_name}.loc[{i}] == {exp}, but " - f"{df_s_name}.loc[{i}] == {val}" - ) - - return True, "equal" if with_msg else True - - -def dios_eq_df(dios, df, dios_dropped_empty_colums=False, with_msg=False): - def fail(msg): - if with_msg: - return False, msg - return False - - assert isinstance(df, pd.DataFrame) - assert isinstance(dios, DictOfSeries) - - # check: dios has not more/other cols than df - notmore = [c for c in dios if c not in df] - if notmore: - return fail( - f"columns mismatch. " - f"dios: {dios.columns.to_list()}, " - f"df: {df.columns.to_list()}, " - f"diff: {notmore}" - ) - - # check: may df has empty cols and dios has no cols - # at this locations - miss = [c for c in df if c not in dios] - if miss: - if dios_dropped_empty_colums: - tmp = [] - for c in miss: - if not df[c].dropna().empty: - tmp += [c] - if tmp: - return fail(f"columns mismatch: " f"dios missing column(s): {tmp}") - else: - return fail(f"columns mismatch: " f"dios missing column(s): {miss}") - - cols = df.columns.intersection(dios.columns) - - for c in cols: - ok, m = diosSeries_eq_dfSeries( - df[c], dios[c], di_s_name=f"di[{c}]", df_s_name=f"df[{c}]", with_msg=True - ) - if not ok: - return fail(m) - - return True, "equal" if with_msg else True - - -# 0,1 -NICE_SLICE = [slice(None), slice(None, None, 3)] -R_BLIST = [True, False, False, False, True] * 2 -C_BLIST = [True, False, False, True] - -# 3,4 5 6 -R_LOC_SLICE = NICE_SLICE + [slice(2), slice(2, 8)] -R_LOC_LIST = [[1], [3, 4, 5], pd.Series([3, 7])] -# 7 8 9 -R_LOC_BLIST = [R_BLIST, pd.Series(R_BLIST), pd.Series(R_BLIST).values] - -# 0, 1, 2, -C_LOC_LIST = [["a"], ["a", "c"], pd.Series(["a", "c"])] -C_LOC_SLICE = NICE_SLICE + [slice("b"), slice("b", "c")] -C_LOC_BLIST = [ - C_BLIST, - pd.Series(C_BLIST, index=list("abcd")), - pd.Series(C_BLIST).values, -] - -# 0 1 2 3 4 -RC_iLOC_SLICE = NICE_SLICE + [slice(4), slice(-3, -1), slice(-1, 3)] -R_iLOC_LIST = [[7], [6, 8]] -R_iLOC_BLIST = [ - R_BLIST, - pd.Series(R_BLIST).values, -] # only list-likes allowed not series-likes -C_iLOC_LIST = [[0], [1, 3]] -C_iLOC_BLIST = [C_BLIST, pd.Series(C_BLIST).values] - -MULTIIDXER = [ - df_aligned__() > 9, - df_aligned__() != df_aligned__(), - df_aligned__() == df_aligned__(), - df_aligned__() % 3 == 0, -] -EMPTYIDEXER = [ - [], - pd.Series(dtype="O"), -] -EMPTY_DF = [pd.DataFrame()] - -BASIC_INDEXER = ( - 
C_LOC_LIST + R_LOC_SLICE + R_LOC_BLIST + MULTIIDXER + EMPTYIDEXER + EMPTY_DF -) -BASIC_INDEXER_FAIL = [ - ["z"], - ["a", "z"], - pd.Series(["a", "z"]), - pd.DataFrame(dict(a=[1, 2, 3])), -] - -R_LOC_INDEXER = R_LOC_SLICE + R_LOC_LIST + R_LOC_BLIST + EMPTYIDEXER -C_LOC_INDEXER = C_LOC_SLICE + C_LOC_LIST + C_LOC_BLIST + EMPTYIDEXER - -R_iLOC_INDEXER = RC_iLOC_SLICE + R_iLOC_LIST + R_iLOC_BLIST -C_iLOC_INDEXER = RC_iLOC_SLICE + C_iLOC_LIST + C_iLOC_BLIST - -O = [[0, 0, 0], [0, 0, 0]] -I = [[1, 1, 1], [1, 1, 1]] -A = [[1, 2, 3], [4, 5, 6]] -B = [[0, 2, 2], [5, 5, 5]] -C = [[3, 2, 0], [1, 0, 3]] -D = [[6, 5, 4], [3, 2, 1]] -DATA_ALIGNED = [O, I, A, B, C, D] - -# outer lists could have differnet length, but this would -# make the checks to complicated -EEE = [[], [], []] -O = [[0, 0], [0, 0, 0], [0, 0, 0, 0]] -I = [[1, 1, 1], [1, 1, 1], [1]] -A = [[1], [2, 3], [4, 5, 6]] -B = [[0, 2, 2], [5], [5, 5]] -C = [[3, 2, 0], [1, 0, 3], [0, 0, 0]] -D = [[6], [2], [9]] -DATA_UNALIGNED = [O, I, A, B, C, D, EEE] - -# only use if a single matrix is used -ALL = DATA_ALIGNED + DATA_UNALIGNED - -OPCOMP = list(_OP2_COMP_MAP) -OPNOCOMP = list(_OP2_ARITH_MAP) + list(_OP2_BOOL_MAP) + list(_OP2_DIV_MAP) -OP2 = OPCOMP + OPNOCOMP -OP1 = list(_OP1_MAP) - - -def diosFromMatr(mlist): - l = [] - for m in mlist: - l.append(DictOfSeries({i: li.copy() for i, li in enumerate(m)})) - return tuple(l) - - -@pytest.fixture() -def datetime_series(): - m = randint(2, 1000) - idx = pd.date_range("2000", "2010", m) - return pd.Series(range(m), idx) diff --git a/docs/cookbooks/MultivariateFlagging.rst b/docs/cookbooks/MultivariateFlagging.rst index 8d0c4d915b6d6dea8059aae7a9e20474c49066f4..b258de006052e5012b572c8301f194a94c299a75 100644 --- a/docs/cookbooks/MultivariateFlagging.rst +++ b/docs/cookbooks/MultivariateFlagging.rst @@ -68,7 +68,7 @@ We can check out the fields, the newly generated :py:class:`~saqc.SaQC` object c .. doctest:: exampleMV >>> qc.data.columns - Index(['sac254_raw', 'level_raw', 'water_temp_raw', 'maint'], dtype='object', name='columns') + Index(['sac254_raw', 'level_raw', 'water_temp_raw', 'maint'], dtype='object') The variables represent meassurements of *water level*, the *specific absorption coefficient* at 254 nm Wavelength, the *water temperature* and there is also a variable, *maint*, that refers to time periods, where the *sac254* sensor @@ -130,43 +130,41 @@ the desired variables as column names and have a look at the console output to g .. doctest:: exampleMV >>> qc.data[['sac254_raw', 'level_raw', 'water_temp_raw']] # doctest:+NORMALIZE_WHITESPACE - sac254_raw | level_raw | water_temp_raw | - ============================== | ============================= | ================================== | - Timestamp | Timestamp | Timestamp | - 2016-01-01 00:02:00 18.4500 | 2016-01-01 00:02:00 103.290 | 2016-01-01 00:02:00 4.84 | - 2016-01-01 00:17:00 18.6437 | 2016-01-01 00:17:00 103.285 | 2016-01-01 00:17:00 4.82 | - 2016-01-01 00:32:00 18.9887 | 2016-01-01 00:32:00 103.253 | 2016-01-01 00:32:00 4.81 | - 2016-01-01 00:47:00 18.8388 | 2016-01-01 00:47:00 103.210 | 2016-01-01 00:47:00 4.80 | - 2016-01-01 01:02:00 18.7438 | 2016-01-01 01:02:00 103.167 | 2016-01-01 01:02:00 4.78 | - ... ... | ... ... | ... ... 
|
-    2017-12-31 22:47:00    43.2275 |   2017-12-31 22:47:00   186.060 |        2017-12-31 22:47:00   5.49 |
-    2017-12-31 23:02:00    43.6937 |   2017-12-31 23:02:00   186.115 |        2017-12-31 23:02:00   5.49 |
-    2017-12-31 23:17:00    43.6012 |   2017-12-31 23:17:00   186.137 |        2017-12-31 23:17:00   5.50 |
-    2017-12-31 23:32:00    43.2237 |   2017-12-31 23:32:00   186.128 |        2017-12-31 23:32:00   5.51 |
-    [70163]                            [70163]                            [70163]
+                      sac254_raw |                    level_raw |            water_temp_raw |
+    ============================ | ============================ | ========================= |
+    2016-01-01 00:02:00  18.4500 | 2016-01-01 00:02:00  103.290 | 2016-01-01 00:02:00  4.84 |
+    2016-01-01 00:17:00  18.6437 | 2016-01-01 00:17:00  103.285 | 2016-01-01 00:17:00  4.82 |
+    2016-01-01 00:32:00  18.9887 | 2016-01-01 00:32:00  103.253 | 2016-01-01 00:32:00  4.81 |
+    2016-01-01 00:47:00  18.8388 | 2016-01-01 00:47:00  103.210 | 2016-01-01 00:47:00  4.80 |
+    2016-01-01 01:02:00  18.7438 | 2016-01-01 01:02:00  103.167 | 2016-01-01 01:02:00  4.78 |
+                    ...      ... |                 ...      ... |              ...      ... |
+    2017-12-31 22:47:00  43.2275 | 2017-12-31 22:47:00  186.060 | 2017-12-31 22:47:00  5.49 |
+    2017-12-31 23:02:00  43.6937 | 2017-12-31 23:02:00  186.115 | 2017-12-31 23:02:00  5.49 |
+    2017-12-31 23:17:00  43.6012 | 2017-12-31 23:17:00  186.137 | 2017-12-31 23:17:00  5.50 |
+    2017-12-31 23:32:00  43.2237 | 2017-12-31 23:32:00  186.128 | 2017-12-31 23:32:00  5.51 |
+    2017-12-31 23:47:00  43.7438 | 2017-12-31 23:47:00  186.130 | 2017-12-31 23:47:00  5.53 |
    <BLANKLINE>
-    max: [70163 rows x 3 columns]
 
The data seems to have a fairly regular sampling rate of *15* minutes at first glance. But checking
out values around *2017-10-29*, we notice that the sampling rate is not totally stable:
 
.. doctest:: exampleMV
 
-   >>> qc.data[['sac254_raw', 'level_raw', 'water_temp_raw']]['2017-10-29 07:00:00':'2017-10-29 09:00:00'] # doctest:+NORMALIZE_WHITESPACE
-                       sac254_raw |                     level_raw |                      water_temp_raw |
-   ============================== | ============================= | ================================== |
-   Timestamp                      | Timestamp                     | Timestamp                          |
-   2017-10-29 07:02:00    40.3050 | 2017-10-29 07:02:00   112.570 | 2017-10-29 07:02:00          10.91 |
-   2017-10-29 07:17:00    39.6287 | 2017-10-29 07:17:00   112.497 | 2017-10-29 07:17:00          10.90 |
-   2017-10-29 07:32:00    39.5800 | 2017-10-29 07:32:00   112.460 | 2017-10-29 07:32:00          10.88 |
-   2017-10-29 07:32:01    39.9750 | 2017-10-29 07:32:01   111.837 | 2017-10-29 07:32:01          10.70 |
-   2017-10-29 07:47:00    39.1350 | 2017-10-29 07:47:00   112.330 | 2017-10-29 07:47:00          10.84 |
-   2017-10-29 07:47:01    40.6937 | 2017-10-29 07:47:01   111.615 | 2017-10-29 07:47:01          10.68 |
-   2017-10-29 08:02:00    40.4938 | 2017-10-29 08:02:00   112.040 | 2017-10-29 08:02:00          10.77 |
-   2017-10-29 08:02:01    39.3337 | 2017-10-29 08:02:01   111.552 | 2017-10-29 08:02:01          10.68 |
-   2017-10-29 08:17:00    41.5238 | 2017-10-29 08:17:00   111.835 | 2017-10-29 08:17:00          10.72 |
-   2017-10-29 08:17:01    38.6963 | 2017-10-29 08:17:01   111.750 | 2017-10-29 08:17:01          10.69 |
-   2017-10-29 08:32:01    39.4337 | 2017-10-29 08:32:01   112.027 | 2017-10-29 08:32:01          10.66 |
+   >>> qc.data['sac254_raw']['2017-10-29 07:00:00':'2017-10-29 09:00:00'] # doctest:+NORMALIZE_WHITESPACE
+   Timestamp
+   2017-10-29 07:02:00    40.3050
+   2017-10-29 07:17:00    39.6287
+   2017-10-29 07:32:00    39.5800
+   2017-10-29 07:32:01    39.9750
+   2017-10-29 07:47:00    39.1350
+   2017-10-29 07:47:01    40.6937
+   2017-10-29 08:02:00    40.4938
+   2017-10-29 08:02:01    39.3337
+   2017-10-29 08:17:00    41.5238
+   2017-10-29 08:17:01    38.6963
+   2017-10-29 08:32:01    39.4337
+   2017-10-29 08:47:01    40.4987
+   dtype: float64
 
Those instabilities bias most
statistical evaluations, and it is common practice to apply some
:doc:`resampling functions <../funcs/resampling>` to the data to obtain a regularly spaced timestamp.
 
@@ -205,7 +203,7 @@ The resulting timeseries now has a regular timestamp.
        2017-12-31 23:30:00    43.274033
        2017-12-31 23:45:00    43.674453
        2018-01-01 00:00:00          NaN
-       Name: sac254_raw, Length: 70177, dtype: float64
+       Length: 70177, dtype: float64
 
Since points that were identified as malicious are excluded before the harmonization, the resulting
regularly sampled timeseries does not include them anymore:
diff --git a/docs/documentation/Customizations.rst b/docs/documentation/Customizations.rst
index 989e4d4e1770e1b82a3ae4e6326d04ca612e523a..505ddbaa1905c7ab3710f8ce36c7bc6a90b48665 100644
--- a/docs/documentation/Customizations.rst
+++ b/docs/documentation/Customizations.rst
@@ -56,7 +56,7 @@ Argument Descriptions
    * - Name
      - Description
    * - ``data``
-     - The actual dataset, an instance of ``dios.DictOfSeries``.
+     - The actual dataset, an instance of ``saqc.DictOfSeries``.
    * - ``field``
      - The field/column within ``data`` that the function is processing.
    * - ``flags``
diff --git a/docs/documentation/GenericFunctions.rst b/docs/documentation/GenericFunctions.rst
index bd017843aa675bc3698cc5e7543bcdd11393b348..74e7b2e8c4f72ca0a4145d29bf2e7f0da0bc1dcc 100644
--- a/docs/documentation/GenericFunctions.rst
+++ b/docs/documentation/GenericFunctions.rst
@@ -126,7 +126,7 @@ Simple constraints
    ...     ),
    ...     data
    ... )
-   >>> (tmp.flags == qc1.flags).all(axis=None) #doctest:+NORMALIZE_WHITESPACE
+   >>> tmp.flags == qc1.flags #doctest:+NORMALIZE_WHITESPACE
    True
 
 
@@ -187,7 +187,7 @@ Cross variable constraints
    ...     ),
    ...     data
    ... )
-   >>> (tmp.flags == qc2.flags).all(axis=None) #doctest:+NORMALIZE_WHITESPACE
+   >>> tmp.flags == qc2.flags #doctest:+NORMALIZE_WHITESPACE
    True
 
 
@@ -251,7 +251,7 @@ need to be put in parentheses.
    ...     ),
    ...     data
    ... )
-   >>> (tmp.flags == qc3.flags).all(axis=None) #doctest:+NORMALIZE_WHITESPACE
+   >>> tmp.flags == qc3.flags #doctest:+NORMALIZE_WHITESPACE
    True
 
 
@@ -303,7 +303,7 @@ Arithmetics
    ...     ),
    ...     data
    ... )
-   >>> (tmp.flags == qc4.flags).all(axis=None) #doctest:+NORMALIZE_WHITESPACE
+   >>> tmp.flags == qc4.flags #doctest:+NORMALIZE_WHITESPACE
    True
 
 
@@ -361,7 +361,7 @@ Special functions
    ...     ),
    ...     data
    ... )
-   >>> (tmp.flags == qc5.flags).all(axis=None) #doctest:+NORMALIZE_WHITESPACE
+   >>> tmp.flags == qc5.flags #doctest:+NORMALIZE_WHITESPACE
    True
 
 
@@ -413,7 +413,7 @@ Special functions
    ...     ),
    ...     data
    ... )
-   >>> (tmp.flags == qc6.flags).all(axis=None) #doctest:+NORMALIZE_WHITESPACE
+   >>> tmp.flags == qc6.flags #doctest:+NORMALIZE_WHITESPACE
    True
 
 
@@ -439,12 +439,12 @@ Let's consider the following dataset:
 
 ..
doctest:: python >>> qc.data #doctest:+NORMALIZE_WHITESPACE - meas | fan | volt | - ========================= | ======================= | ========================= | - 2018-06-01 12:00:00 3.56 | 2018-06-01 12:00:00 1 | 2018-06-01 12:00:00 12.1 | - 2018-06-01 12:10:00 4.70 | 2018-06-01 12:10:00 0 | 2018-06-01 12:10:00 12.0 | - 2018-06-01 12:20:00 0.10 | 2018-06-01 12:20:00 1 | 2018-06-01 12:20:00 11.5 | - 2018-06-01 12:30:00 3.62 | 2018-06-01 12:30:00 1 | 2018-06-01 12:30:00 12.1 | + meas | fan | volt | + ========================= | ====================== | ========================= | + 2018-06-01 12:00:00 3.56 | 2018-06-01 12:00:00 1 | 2018-06-01 12:00:00 12.1 | + 2018-06-01 12:10:00 4.70 | 2018-06-01 12:10:00 0 | 2018-06-01 12:10:00 12.0 | + 2018-06-01 12:20:00 0.10 | 2018-06-01 12:20:00 1 | 2018-06-01 12:20:00 11.5 | + 2018-06-01 12:30:00 3.62 | 2018-06-01 12:30:00 1 | 2018-06-01 12:30:00 12.1 | **Task**: Flag ``meas`` where ``fan`` equals 0 and ``volt`` is lower than ``12.0``. @@ -491,7 +491,7 @@ Let's consider the following dataset: ... ), ... data ... ) - >>> (tmp.flags == qc7.flags).all(axis=None) #doctest:+NORMALIZE_WHITESPACE + >>> tmp.flags == qc7.flags #doctest:+NORMALIZE_WHITESPACE True @@ -546,7 +546,7 @@ But we could also quality check our independent variables first and than leverag ... ), ... data ... ) - >>> (tmp.flags == qc8.flags).all(axis=None) #doctest:+NORMALIZE_WHITESPACE + >>> tmp.flags == qc8.flags #doctest:+NORMALIZE_WHITESPACE True @@ -644,7 +644,7 @@ variables in a given dataset. We start with dummy data again: ... ), ... data ... ) - >>> (tmp.data == qc1.data).all(axis=None) #doctest:+NORMALIZE_WHITESPACE + >>> tmp.data == qc1.data #doctest:+NORMALIZE_WHITESPACE True diff --git a/docs/gettingstarted/TutorialCLI.rst b/docs/gettingstarted/TutorialCLI.rst index e173f3c66139b29af4b3181e64df3279a9daf252..9074311430ff641d5b29eae197cddad803a5aff9 100644 --- a/docs/gettingstarted/TutorialCLI.rst +++ b/docs/gettingstarted/TutorialCLI.rst @@ -267,7 +267,7 @@ series. Also, you can write your own tests using a python-based import os qc = saqc.fromConfig(configpath('3'), data) - qc.data.to_csv(temppath('TutorialCLIHarmData.csv')) + qc.data.to_pandas().to_csv(temppath('TutorialCLIHarmData.csv')) The above executes an internal framework that aligns the timestamps of SM2 diff --git a/docs/howtodoc/HowToDoc.rst b/docs/howtodoc/HowToDoc.rst index 2b0c36afaa012a3692722c2d749f2695675f7fdb..162bcf9bfa1f623423df906898f7a3c39a1762c4 100644 --- a/docs/howtodoc/HowToDoc.rst +++ b/docs/howtodoc/HowToDoc.rst @@ -46,7 +46,7 @@ use this: def foo(data, field, flagger): """ - data : dios.DictOfSeries + data : saqc.DictOfSeries A saqc-data object. field : str @@ -239,8 +239,8 @@ This will be rendered as: >>> 1+1 2 -It can be a little tricky, to match complexer std_out strings, like dios or DataFrames. There are some -doctest flags that can mitigate frustration: +It can be a little tricky, to match complexer std_out strings, like DictOfSeries or +DataFrames. There are some doctest flags that can mitigate frustration: #. NORMALIZE_WHITESPACE will map all whitespace/tab combos onto a single whitespace. Use like: diff --git a/docs/modules/SaQCCore.rst b/docs/modules/SaQCCore.rst index 93569cdb1b17699415c1b5455ef15685a6bb9bcf..00bbdea82f827a04e562e316bc7238041143da67 100644 --- a/docs/modules/SaQCCore.rst +++ b/docs/modules/SaQCCore.rst @@ -4,15 +4,6 @@ SaQC ==== -.. currentmodule:: saqc - -.. HACK: add 'our' external imported objects to core, but dont make it show up here - .. 
-   .. autosummary::
-      :toctree: ../_api
-
-      saqc.core.to_dios
-      saqc.core.DictOfSeries
-
 .. automodapi:: saqc.core
    :include-all-objects:
diff --git a/requirements.txt b/requirements.txt
index 20eafea505c2213fb136a41f814321a419223fbd..83d3c93aafd3bd98ec50107ffcbcfb567e91d12c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ pandas==1.3.5
 scikit-learn==1.2.1
 scipy==1.10.0
 typing_extensions==4.4.0
+fancy-collections==0.1.3
diff --git a/saqc/__init__.py b/saqc/__init__.py
index 082baf5bc8a209dd8163b2d35eab33ae2aaf82a4..f6e1d7df304d579638f7c578b474290a912e0c94 100644
--- a/saqc/__init__.py
+++ b/saqc/__init__.py
@@ -8,7 +8,7 @@
 """The System for automated Quality Control package."""
 
 from saqc.constants import BAD, DOUBTFUL, FILTER_ALL, FILTER_NONE, GOOD, UNFLAGGED
-from saqc.core import Flags, SaQC
+from saqc.core import Flags, DictOfSeries, SaQC
 from saqc.core.translation import DmpScheme, FloatScheme, PositionalScheme, SimpleScheme
 from saqc.parsing.reader import fromConfig
 from saqc.version import __version__
diff --git a/saqc/__main__.py b/saqc/__main__.py
index eadfef4b6d53f149472919833a648bd79facae62..42953137eaff95db8794bfff9c206741c40f1e11 100644
--- a/saqc/__main__.py
+++ b/saqc/__main__.py
@@ -111,10 +111,10 @@ def main(config, data, scheme, outfile, nodata, log_level):
         scheme=TRANSLATION_SCHEMES[scheme or "simple"](),
     )
 
-    data_result = saqc.data.to_df()
+    data_result = saqc.data.to_pandas()
     flags_result = saqc.flags
     if isinstance(flags_result, DictOfSeries):
-        flags_result = flags_result.to_df()
+        flags_result = flags_result.to_pandas()
 
     if outfile:
         data_result.columns = pd.MultiIndex.from_product(
diff --git a/saqc/core/__init__.py b/saqc/core/__init__.py
index 775e7f23fda3afc45663371042f418fd924dde21..1fbe7e27cf16682bce68df38479da20179647b19 100644
--- a/saqc/core/__init__.py
+++ b/saqc/core/__init__.py
@@ -4,7 +4,7 @@
 # -*- coding: utf-8 -*-
 
 # isort: skip_file
-from saqc.core.frame import DictOfSeries, to_dios  # noqa
+from saqc.core.frame import DictOfSeries
 from saqc.core.history import History
 from saqc.core.flags import Flags, initFlagsLike
 from saqc.core.register import flagging, processing, register
diff --git a/saqc/core/core.py b/saqc/core/core.py
index c719375bc4e99bcd911e4b8961c63b8b71909663..f9eb861ee0436aac839b2c7a4fc4b194b2393f9d 100644
--- a/saqc/core/core.py
+++ b/saqc/core/core.py
@@ -16,7 +16,7 @@ import numpy as np
 import pandas as pd
 
 from saqc.core.flags import Flags, initFlagsLike
-from saqc.core.frame import DictOfSeries, concatDios, to_dios
+from saqc.core.frame import DictOfSeries
 from saqc.core.history import History
 from saqc.core.register import FUNC_MAP
 from saqc.core.translation import (
@@ -153,23 +153,34 @@ class SaQC(FunctionsMixin):
     def _initData(self, data) -> DictOfSeries:
         if data is None:
             return DictOfSeries()
-
         if isinstance(data, list):
-            results = []
+            result = DictOfSeries()
+            doubles = pd.Index([])
             for d in data:
-                results.append(self._castToDios(d))
-            return concatDios(results, warn=True, stacklevel=3)
-
-        if isinstance(data, (DictOfSeries, pd.DataFrame, pd.Series)):
-            return self._castToDios(data)
-
-        raise TypeError(
-            "'data' must be of type pandas.Series, "
-            "pandas.DataFrame or dios.DictOfSeries or "
-            "a list of those."
-        )
+                new = self._castData(d)
+                doubles = doubles.union(result.columns.intersection(new.columns))
+                result.update(new)
+            if not doubles.empty:
+                warnings.warn(
+                    f"Column(s) {doubles.tolist()} appeared multiple "
+                    f"times in input data. Some data was overwritten. "
" + f"Avoid duplicate columns names over all inputs.", + stacklevel=2, + ) + return result + try: + return self._castData(data) + except ValueError as e: + raise e from None + except TypeError as e: + raise TypeError( + "'data' must be of type pandas.Series, " + "pandas.DataFrame or saqc.DictOfSeries or " + "a list of those or a dict with string keys " + "and pandas.Series as values." + ) from e - def _castToDios(self, data): + def _castData(self, data) -> DictOfSeries: if isinstance(data, pd.Series): if not isinstance(data.name, str): raise ValueError(f"Cannot init from unnamed pd.Series") @@ -177,12 +188,12 @@ class SaQC(FunctionsMixin): if isinstance(data, pd.DataFrame): for idx in [data.index, data.columns]: if isinstance(idx, pd.MultiIndex): - raise TypeError("'data' should not have MultiIndex") - data = to_dios(data) # noop for DictOfSeries - for c in data.columns: - if not isinstance(c, str): - raise TypeError("columns labels must be of type string") - return data + raise ValueError("'data' should not have MultiIndex") + try: + # This ensures that values are pd.Series + return DictOfSeries(data) + except Exception: + raise TypeError(f"Cannot cast {type(data)} to DictOfSeries") from None def _initFlags(self, flags) -> Flags: if flags is None: diff --git a/saqc/core/flags.py b/saqc/core/flags.py index ebff8349b01683f05d297c54824d07a8f99c467f..3e7e7dcc0bb71ee4022673cdafb88d7a8c664d37 100644 --- a/saqc/core/flags.py +++ b/saqc/core/flags.py @@ -6,6 +6,8 @@ from __future__ import annotations +import typing +import warnings from typing import DefaultDict, Dict, Iterable, Mapping, Tuple, Type, Union import numpy as np @@ -80,7 +82,6 @@ class Flags: >>> flags = Flags() >>> flags Empty Flags - Columns: [] .. doctest:: exampleFlags @@ -257,6 +258,9 @@ class Flags: # ---------------------------------------------------------------------- # meta data + def keys(self) -> typing.KeysView: + return self._data.keys() + @property def columns(self) -> pd.Index: """ @@ -447,16 +451,19 @@ class Flags: """ Transform the flags container to a ``DictOfSeries``. + + .. deprecated:: 2.4 + use `saqc.DictOfSeries(obj)` instead. 
+
         Returns
         -------
         DictOfSeries
         """
-        di = DictOfSeries(columns=self.columns)
-
-        for k in self._data.keys():
-            di[k] = self[k]
-
-        return di.copy()
+        warnings.warn(
+            "toDios is deprecated, use `saqc.DictOfSeries(obj)` instead.",
+            category=DeprecationWarning,
+        )
+        return DictOfSeries(self).copy()
 
     def toFrame(self) -> pd.DataFrame:
         """
@@ -466,10 +473,10 @@ class Flags:
         -------
         pd.DataFrame
         """
-        return self.toDios().to_df()
+        return pd.DataFrame(dict(self))
 
     def __repr__(self) -> str:
-        return str(self.toDios()).replace("DictOfSeries", type(self).__name__)
+        return str(DictOfSeries(self)).replace("DictOfSeries", type(self).__name__)
 
 
 def initFlagsLike(
diff --git a/saqc/core/frame.py b/saqc/core/frame.py
index 225ec64144c1730f50750ccfdf7ff784468b233c..312277a4815b4eb8748ab88f5de5e9fa31ec7478 100644
--- a/saqc/core/frame.py
+++ b/saqc/core/frame.py
@@ -2,61 +2,200 @@
 # SPDX-FileCopyrightText: 2021 Helmholtz-Zentrum für Umweltforschung GmbH - UFZ
 # SPDX-License-Identifier: GPL-3.0-or-later
 # -*- coding: utf-8 -*-
+from __future__ import annotations
 
 import warnings
-from typing import List
-
-from dios import DictOfSeries, to_dios  # noqa
-
-
-def mergeDios(left: DictOfSeries, right: DictOfSeries, subset=None, join="merge"):
-    # use dios.merge() as soon as it implemented
-    # see https://git.ufz.de/rdm/dios/issues/15
-
-    merged = left.copy()
-    if subset is not None:
-        right_subset_cols = right.columns.intersection(subset)
-    else:
-        right_subset_cols = right.columns
-
-    shared_cols = left.columns.intersection(right_subset_cols)
-
-    for c in shared_cols:
-        l, r = left[c], right[c]
-        if join == "merge":
-            # NOTE:
-            # our merge behavior is nothing more than an
-            # outer join, where the right join argument
-            # overwrites the left at the shared indices,
-            # while on a normal outer join common indices
-            # hold the values from the left join argument
-            r, l = l.align(r, join="outer")
-        else:
-            l, r = l.align(r, join=join)
-        merged[c] = l.combine_first(r)
-
-    newcols = right_subset_cols.difference(left.columns)
-    for c in newcols:
-        merged[c] = right[c].copy()
-
-    return merged
-
-
-def concatDios(data: List[DictOfSeries], warn: bool = True, stacklevel: int = 2):
-    # fast path for most common case
-    if len(data) == 1 and data[0].columns.is_unique:
-        return data[0]
-
-    result = DictOfSeries()
-    for di in data:
-        for c in di.columns:
-            if c in result.columns:
-                if warn:
-                    warnings.warn(
-                        f"Column {c} already exist. Data is overwritten. "
-                        f"Avoid duplicate columns names over all inputs.",
-                        stacklevel=stacklevel,
-                    )
-            result[c] = di[c]
-
-    return result
+from typing import Any, Hashable, Mapping
+
+import numpy as np
+import pandas as pd
+from fancy_collections import DictOfPandas
+
+
+class DictOfSeries(DictOfPandas):
+    _key_types = (str, int, float)
+    _value_types = (pd.Series,)
+
+    def __init__(self, *args, **kwargs):
+        # 'data' is needed to prevent an
+        # AttributeError in repr during
+        # errors within __init__
+        self.data = {}
+        self._attrs = None
+        super().__init__(*args, **kwargs)
+
+    @property
+    def attrs(self) -> dict[Hashable, Any]:
+        """
+        Dictionary of global attributes of this dataset.
+        """
+        if self._attrs is None:
+            self._attrs = {}
+        return self._attrs
+
+    @attrs.setter
+    def attrs(self, value: Mapping[Hashable, Any]) -> None:
+        self._attrs = dict(value)
+
+    def to_df(self, how="outer") -> pd.DataFrame:
+        """
+        Transform DictOfSeries to a pandas.DataFrame.
+
+        .. deprecated:: 2.4
+           use `DictOfSeries.to_pandas()` instead.
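+
+        A quick before/after sketch (illustrative; ``di`` stands for any
+        ``DictOfSeries`` instance)::
+
+            df = di.to_df()      # deprecated
+            df = di.to_pandas()  # preferred replacement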
+        """
+        warnings.warn(
+            "`to_df()` is deprecated, use `to_pandas()` instead.",
+            category=DeprecationWarning,
+        )
+        return self.to_pandas(how)
+
+    def flatten(self, promote_index: bool = False) -> DictOfSeries:
+        """
+        Return a copy.
+        Present for DictOfPandas compatibility.
+        """
+        return self.copy()
+
+    def to_pandas(self, how="outer"):
+        # This is a future feature of fancy_collections.DictOfPandas
+        # which probably will come in the next version 0.1.4.
+        # We adopt it early, to prevent a second refactoring.
+        # The docstring will be different, so we keep the
+        # dynamic docstring assignment down below.
+        # Once the feature is present, we just need to delete the
+        # entire method here.
+        return self.to_dataframe(how)
+
+    def index_of(self, method="union") -> pd.Index:
+        """Return an index with indices from all columns.
+
+        .. deprecated:: 2.4
+           use `DictOfSeries.union_index()` and `DictOfSeries.shared_index()` instead.
+
+        Parameters
+        ----------
+        method : string, default 'union'
+            * 'union' : return the union of all indices from all columns
+            * 'shared' : return only indices that are present in every column
+            * 'all' : alias for 'union'
+            * 'intersection' : alias for 'shared'
+
+        See also
+        --------
+        DictOfSeries.to_pandas: convert a DictOfSeries to a pandas.DataFrame
+
+        Returns
+        -------
+        index: pd.Index
+            A duplicate-free index
+        """
+        if method in ["union", "all"]:
+            return self.union_index()
+        elif method in ["intersection", "shared"]:
+            return self.shared_index()
+        raise ValueError(
+            "method must be one of 'union', 'shared', 'all' or 'intersection'."
+        )
+
+
+DictOfSeries.empty.__doc__ = """
+Indicator whether DictOfSeries is empty.
+
+True if DictOfSeries is entirely empty (no items) or all
+items are empty themselves.
+
+Notes
+-----
+To only check if DictOfSeries has no items, use the ``len`` or ``bool``
+builtins.
+
+Examples
+--------
+>>> from saqc import DictOfSeries
+>>> di1 = DictOfSeries()
+>>> di1.empty
+True
+
+A DictOfSeries is also considered empty if all items within it are empty.
+
+>>> di2 = DictOfSeries(a=pd.Series(dtype=float), b=pd.Series(dtype='O'))
+>>> assert di2['a'].empty and di2['b'].empty
+>>> di2.empty
+True
+
+To differentiate between a DictOfSeries with no items and a
+DictOfSeries with empty items, use the builtin functions
+`len` or `bool`.
+
+>>> len(di1)
+0
+>>> bool(di1)
+False
+>>> len(di2)
+2
+>>> bool(di2)
+True
+
+Returns
+-------
+bool
+"""
+
+DictOfSeries.to_pandas.__doc__ = """
+Transform DictOfSeries to a pandas.DataFrame.
+
+Because a pandas.DataFrame cannot handle data of different
+length, but DictOfSeries can, the missing data is filled with
+NaNs or is dropped, depending on the keyword `how`.
+
+Parameters
+----------
+how : {'outer', 'inner'}, default 'outer'
+    Defines how the resulting DataFrame index is generated.
+
+    - ``outer`` : The resulting DataFrame index is the combination
+      of all indices merged together. If a column misses values at
+      new index locations, `NaN`s are filled in.
+    - ``inner`` : Only indices that are present in all columns are used
+      for the resulting index. Filling logic is not needed, but values
+      are dropped if a column has indices that are not known to all
+      other columns.
+
+Returns
+-------
+frame: pandas.DataFrame
+
+Examples
+--------
+Missing data locations are filled with NaNs
+
+>>> from saqc import DictOfSeries
+>>> a = pd.Series(11, index=range(2))
+>>> b = pd.Series(22, index=range(3))
+>>> c = pd.Series(33, index=range(1,9,3))
+>>> di = DictOfSeries(a=a, b=b, c=c)
+>>> di   # doctest: +NORMALIZE_WHITESPACE
+    a |     b |     c |
+===== | ===== | ===== |
+0  11 | 0  22 | 1  33 |
+1  11 | 1  22 | 4  33 |
+      | 2  22 | 7  33 |
+
+>>> di.to_pandas()   # doctest: +NORMALIZE_WHITESPACE
+      a     b     c
+0  11.0  22.0   NaN
+1  11.0  22.0  33.0
+2   NaN  22.0   NaN
+4   NaN   NaN  33.0
+7   NaN   NaN  33.0
+
+or are dropped if `how='inner'`
+
+>>> di.to_pandas(how='inner')   # doctest: +NORMALIZE_WHITESPACE
+      a     b     c
+1  11.0  22.0  33.0
+"""
+
+
+DictOfSeries.to_dataframe.__doc__ = DictOfSeries.to_pandas.__doc__.replace(
+    "to_pandas", "to_dataframe"
+)
diff --git a/saqc/core/register.py b/saqc/core/register.py
index 7f364ce5e72e36facf5648849131deba37a6732a..dac71ea792496d8ca8a44d37a163863ed685dcc8 100644
--- a/saqc/core/register.py
+++ b/saqc/core/register.py
@@ -170,19 +170,17 @@ def _maskData(
     mask : DictOfSeries
         dios holding iloc-data-pairs for every column in `data`
     """
-    mask = DictOfSeries(columns=columns)
+    mask = DictOfSeries()
 
     # we use numpy here because it is faster
     for c in columns:
-        col_mask = isflagged(flags[c], thresh)
+        col_mask = isflagged(flags[c].to_numpy(), thresh)
 
         if col_mask.any():
             col_data = data[c].to_numpy(dtype=np.float64)
-            mask[c] = pd.Series(col_data[col_mask], index=np.where(col_mask)[0])
-
             col_data[col_mask] = np.nan
-            data[c] = col_data
+            data[c] = pd.Series(col_data, index=data[c].index, dtype=data[c].dtype)
 
     return data, mask
diff --git a/saqc/core/translation/basescheme.py b/saqc/core/translation/basescheme.py
index d469e02b33edbb8467e7471a982d8fc3128611a8..9d2b8efa8bc3b0cae854a6166a32aff81191af0f 100644
--- a/saqc/core/translation/basescheme.py
+++ b/saqc/core/translation/basescheme.py
@@ -153,7 +153,8 @@ class MappingScheme(TranslationScheme):
         diff = pd.Index(out[field]).difference(expected)
         if not diff.empty:
             raise ValueError(
-                f"flags were not translated: {diff.drop_duplicates().to_list()}"
+                f"The following flag values could not be "
+                f"translated: {diff.drop_duplicates().to_list()}"
             )
         return out
 
@@ -240,6 +241,6 @@ class FloatScheme(TranslationScheme):
     )
 
     def toExternal(self, flags: Flags, attrs: dict | None = None) -> DictOfSeries:
-        out = flags.toDios()
+        out = DictOfSeries(flags)
         out.attrs = attrs or {}
         return out
diff --git a/saqc/funcs/drift.py b/saqc/funcs/drift.py
index 2545b7d125d2525930a05956dfc6bf0f3b4819a2..f7c80bffa27d15271d9ed49dd8fe7fce8735c80e 100644
--- a/saqc/funcs/drift.py
+++ b/saqc/funcs/drift.py
@@ -138,7 +138,7 @@ class DriftMixin:
         """
         fields = toSequence(field)
 
-        data = self._data[fields].to_df()
+        data = self._data[fields].to_pandas()
         data.dropna(inplace=True)
 
         segments = data.groupby(pd.Grouper(freq=freq))
@@ -223,7 +223,7 @@ class DriftMixin:
         if reference not in fields:
             fields.append(reference)
 
-        data = self._data[fields].to_df().dropna()
+        data = self._data[fields].to_pandas().dropna()
 
         segments = data.groupby(pd.Grouper(freq=freq))
         for segment in segments:
@@ -342,7 +342,7 @@ class DriftMixin:
         maint_data = self._data[maintenance_field].copy()
 
         to_correct_clean = to_correct.dropna()
-        d = {"drift_group": np.nan, to_correct.name: to_correct_clean.values}
+        d = {"drift_group": np.nan, field: to_correct_clean.values}
         drift_frame = pd.DataFrame(d, index=to_correct_clean.index)
 
         # group the drift frame
@@ -361,7 +361,7 @@ class DriftMixin:
         )
 
         for k, group in drift_grouper:
-            data_series = group[to_correct.name]
+            data_series = group[field]
             data_fit, data_shiftTarget = _driftFit(
                 data_series, shift_targets.loc[k, :][0], cal_range, model
             )
@@ -753,7 +753,7 @@ def _assignRegimeAnomaly(
 ) -> Tuple[DictOfSeries, Flags]:
     series = data[cluster_field]
     cluster = np.unique(series)
-    cluster_dios = DictOfSeries({i: data[field][series == i] for i in cluster})
+    cluster_dios = DictOfSeries({str(i): data[field][series == i] for i in cluster})
     plateaus = detectDeviants(cluster_dios, metric, spread, frac, method, "samples")
 
     if set_flags:
diff --git a/saqc/funcs/flagtools.py b/saqc/funcs/flagtools.py
index 208aecd667a1b67ded026150bd0025eb700e0db1..4c0bf85f62a9d7bb88253b4be68159882ce0e320 100644
--- a/saqc/funcs/flagtools.py
+++ b/saqc/funcs/flagtools.py
@@ -209,6 +209,7 @@ class FlagtoolsMixin:
 
     .. doctest:: ExampleFlagManual
 
+        >>> import saqc
        >>> mdata = pd.Series([1, 0, 1], index=pd.to_datetime(['2000-02-01', '2000-03-01', '2000-05-01']))
        >>> mdata
         2000-02-01    1
@@ -231,7 +232,7 @@ class FlagtoolsMixin:
         2000-02-02    False
         2000-03-01    False
         2000-05-01    True
-        Name: daily_data, dtype: bool
+        dtype: bool
 
         With the 'right-open' method, the mdata is forward filled:
 
@@ -244,7 +245,7 @@ class FlagtoolsMixin:
         2000-02-02    True
         2000-03-01    False
         2000-05-01    True
-        Name: daily_data, dtype: bool
+        dtype: bool
 
         With the 'left-open' method, backward filling is used:
 
@@ -257,7 +258,7 @@ class FlagtoolsMixin:
         2000-02-02    True
         2000-03-01    True
         2000-05-01    True
-        Name: daily_data, dtype: bool
+        dtype: bool
         """
         dat = self._data[field]
         # internal not-mflag-value -> can't go for np.nan
@@ -362,23 +363,24 @@ class FlagtoolsMixin:
 
     .. doctest:: exampleTransfer
 
+        >>> import saqc
         >>> data = pd.DataFrame({'a': [1, 2], 'b': [1, 2], 'c': [1, 2]})
         >>> qc = saqc.SaQC(data)
         >>> qc = qc.flagRange('a', max=1.5)
-        >>> qc.flags.to_df()
-        columns    a    b    c
-        0       -inf -inf -inf
-        1      255.0 -inf -inf
+        >>> qc.flags.to_pandas()
+              a    b    c
+        0  -inf -inf -inf
+        1 255.0 -inf -inf
 
     Now we can project the flag from `a` to `b` via
 
     .. doctest:: exampleTransfer
 
         >>> qc = qc.transferFlags('a', target='b')
-        >>> qc.flags.to_df()
-        columns    a     b    c
-        0       -inf  -inf -inf
-        1      255.0 255.0 -inf
+        >>> qc.flags.to_pandas()
+              a     b    c
+        0  -inf  -inf -inf
+        1 255.0 255.0 -inf
 
     You can skip the explicit target parameter designation:
 
@@ -392,10 +394,10 @@ class FlagtoolsMixin:
 
     .. doctest:: exampleTransfer
 
         >>> qc = qc.transferFlags(['a','a'], ['b', 'c'])
-        >>> qc.flags.to_df()
-        columns    a     b     c
-        0       -inf  -inf  -inf
-        1      255.0 255.0 255.0
+        >>> qc.flags.to_pandas()
+              a     b     c
+        0  -inf  -inf  -inf
+        1 255.0 255.0 255.0
         """
         import warnings
 
@@ -450,6 +452,7 @@ class FlagtoolsMixin:
 
     .. doctest:: propagateFlags
 
+        >>> import saqc
         >>> data = pd.DataFrame({"a": [-3, -2, -1, 0, 1, 2, 3]})
         >>> flags = pd.DataFrame({"a": [-np.inf, -np.inf, -np.inf, 255.0, -np.inf, -np.inf, -np.inf]})
         >>> qc = saqc.SaQC(data=data, flags=flags)
@@ -461,7 +464,7 @@ class FlagtoolsMixin:
         4   -inf
         5   -inf
         6   -inf
-        Name: a, dtype: float64
+        dtype: float64
 
         Now, to repeat the flag '255.0' two times in the direction of ascending indices, execute:
 
@@ -475,7 +478,7 @@ class FlagtoolsMixin:
         4    255.0
         5    255.0
         6   -inf
-        Name: a, dtype: float64
+        dtype: float64
 
         Choosing "bfill" will result in
 
@@ -489,7 +492,7 @@ class FlagtoolsMixin:
         4   -inf
         5   -inf
         6   -inf
-        Name: a, dtype: float64
+        dtype: float64
 
         If an explicit flag is passed, it will be used to fill the repetition window
 
@@ -503,7 +506,7 @@ class FlagtoolsMixin:
         4   -inf
         5   -inf
         6   -inf
-        Name: a, dtype: float64
+        dtype: float64
         """
 
         if method not in {"bfill", "ffill"}:
diff --git a/saqc/funcs/generic.py b/saqc/funcs/generic.py
index 803edfdaf8c481841db08731b854c0957f075381..1ec20feca0bec27b5427fc5a65f7ca91e4e42b09 100644
--- a/saqc/funcs/generic.py
+++ b/saqc/funcs/generic.py
@@ -16,7 +16,7 @@ from saqc import BAD, FILTER_ALL
 from saqc.core import DictOfSeries, Flags, History, register
 from saqc.core.register import _maskData
 from saqc.lib.tools import isAllBoolean, isflagged, toSequence
-from saqc.lib.types import GenericFunction, PandasLike
+from saqc.lib.types import GenericFunction
 from saqc.parsing.environ import ENVIRONMENT
 
 if TYPE_CHECKING:
@@ -58,7 +58,7 @@ def _prepare(
 
 def _execGeneric(
     flags: Flags,
-    data: PandasLike,
+    data: pd.DataFrame | pd.Series | DictOfSeries,
     func: GenericFunction,
     dfilter: float = FILTER_ALL,
 ) -> DictOfSeries:
@@ -74,14 +74,35 @@ def _execGeneric(
     if isinstance(data, pd.Series):
         data = data.to_frame()
 
-    out = func(*[data[c] for c in data.columns])
-    if pd.api.types.is_scalar(out):
-        raise ValueError(
+    # set series.name, because `isflagged` relies on it
+    cols = []
+    for c in data.columns:
+        data[c].name = c
+        cols.append(data[c])
+    return func(*cols)
+
+
+def _castResult(obj) -> DictOfSeries:
+    # Note: the actual keys, aka. column names, we
+    # use here to create a DictOfSeries are never
+    # used and only exist temporarily.
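+    #
+    # A rough mapping sketch (illustrative, mirroring the branches below):
+    #   pd.Series(...)                   -> DictOfSeries({"0": ...})
+    #   {"x": pd.Series(...)}            -> DictOfSeries({"x": ...})
+    #   [pd.Series(...), pd.Series(...)] -> DictOfSeries({"0": ..., "1": ...})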
+
+    if isinstance(obj, pd.Series):
+        return DictOfSeries({"0": obj})
+    if pd.api.types.is_dict_like(obj):
+        # everything with keys and __getitem__;
+        # this would also match pd.Series, which
+        # is already handled above
+        return DictOfSeries(obj)
+    if pd.api.types.is_list_like(obj):
+        # plain sequences; pd.Series and dict would
+        # also match, but are handled above
+        return DictOfSeries({str(i): val for i, val in enumerate(obj)})
+
+    if pd.api.types.is_scalar(obj):
+        raise TypeError(
             "generic function should return a sequence object, "
-            f"got '{type(out)}' instead"
+            f"got '{type(obj)}' instead"
         )
-
-    return DictOfSeries(out)
+    raise TypeError(f"unprocessable result type {type(obj)}.")
 
 
 class GenericMixin:
@@ -146,8 +167,8 @@ class GenericMixin:
     >>> from saqc import SaQC
     >>> qc = SaQC(pd.DataFrame({'rainfall':[1], 'snowfall':[2]}, index=pd.DatetimeIndex([0])))
     >>> qc = qc.processGeneric(field=["rainfall", "snowfall"], target="precipitation", func=lambda x, y: x + y)
-    >>> qc.data.to_df()
-    columns     rainfall  snowfall  precipitation
+    >>> qc.data.to_pandas()
+                rainfall  snowfall  precipitation
     1970-01-01         1         2              3
     """
 
@@ -156,6 +177,7 @@ class GenericMixin:
 
         dchunk, fchunk = _prepare(self._data, self._flags, fields, dfilter)
         result = _execGeneric(fchunk, dchunk, func, dfilter=dfilter)
+        result = _castResult(result)
 
         meta = {
             "func": "procGeneric",
@@ -266,10 +288,12 @@ class GenericMixin:
 
         dchunk, fchunk = _prepare(self._data, self._flags, fields, dfilter)
         result = _execGeneric(fchunk, dchunk, func, dfilter=dfilter)
+        result = _castResult(result)
 
         if len(targets) != len(result.columns):
             raise ValueError(
-                f"the generic function returned {len(result.columns)} field(s), but only {len(targets)} target(s) were given"
+                f"the generic function returned {len(result.columns)} field(s), "
+                f"but {len(targets)} target(s) were given"
             )
 
         if not result.empty and not isAllBoolean(result):
@@ -296,9 +320,14 @@ class GenericMixin:
 
             # dummy column to ensure consistency between flags and data
             if col not in self._data:
-                self._data[col] = pd.Series(np.nan, index=maskcol.index)
-
-            flagcol = maskcol.replace({False: np.nan, True: flag}).astype(float)
+                self._data[col] = pd.Series(np.nan, index=maskcol.index, dtype=float)
+
+            # Note: big speedup for Series, because `replace` works
+            # with a loop, while setting through a boolean mask is vectorized.
+            # old code:
+            # >>> flagcol = maskcol.replace({False: np.nan, True: flag}).astype(float)
+            flagcol = pd.Series(np.nan, index=maskcol.index, dtype=float)
+            flagcol[maskcol] = flag
 
             # we need equal indices to work on
             if not self._flags[col].index.equals(maskcol.index):
diff --git a/saqc/funcs/outliers.py b/saqc/funcs/outliers.py
index 03c50820f89542d883053d8a9894e1224592d8ad..03d2ea82e3c347138ab92901336b93582cd10b57 100644
--- a/saqc/funcs/outliers.py
+++ b/saqc/funcs/outliers.py
@@ -723,6 +723,7 @@ class OutliersMixin:
 
     .. doctest:: flagOffsetExample
 
+        >>> import saqc
         >>> data = pd.DataFrame({'data':np.array([5,5,8,16,17,7,4,4,4,1,1,4])}, index=pd.date_range('2000',freq='1H', periods=12))
         >>> data
                              data
@@ -747,7 +748,7 @@ class OutliersMixin:
 
         >>> qc = qc.flagOffset("data", thresh=2, tolerance=1.5, window='6H')
-        >>> qc.plot('data') # doctest:+SKIP
+        >>> qc.plot('data') # doctest: +SKIP
 
     .. plot::
        :context: close-figs
 
        >>> qc = saqc.SaQC(data)
        >>> qc = qc.flagOffset("data", thresh=2, tolerance=1.5, window='6H')
-       >>> qc.plot('data')
+       >>> qc.plot('data') # doctest: +SKIP
 
     Note that both negative and positive jumps are considered starting points of negative or positive offsets.
     If you want to impose the additional condition that the initial value jump must exceed *+90%* of the value level,
@@ -772,7 +773,7 @@ class OutliersMixin:
 
        >>> qc = saqc.SaQC(data)
        >>> qc = qc.flagOffset("data", thresh=2, thresh_relative=.9, tolerance=1.5, window='6H')
-       >>> qc.plot('data')
+       >>> qc.plot('data') # doctest: +SKIP
 
     Now, only positive jumps that exceed a value gain of *+90%* are considered starting points of offsets.
 
@@ -790,7 +791,7 @@ class OutliersMixin:
 
        >>> qc = saqc.SaQC(data)
        >>> qc = qc.flagOffset("data", thresh=2, thresh_relative=-.5, tolerance=1.5, window='6H')
-       >>> qc.plot('data')
+       >>> qc.plot('data') # doctest: +SKIP
 
 
     References
@@ -1036,7 +1037,7 @@ class OutliersMixin:
 
         fields = toSequence(field)
 
-        df = self._data[fields].to_df(how="inner")
+        df = self._data[fields].to_pandas(how="inner")
 
         if isinstance(method, str):
             if method == "modZscore":
@@ -1231,7 +1232,7 @@ def _evalStrayLabels(
     ----------
     [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35h.htm
     """
-    val_frame = data[target].to_df()
+    val_frame = data[target].to_pandas()
     stray_detects = flags[field] > UNFLAGGED
     stray_detects = stray_detects[stray_detects]
     to_flag_frame = pd.DataFrame(False, columns=target, index=stray_detects.index)
diff --git a/saqc/funcs/scores.py b/saqc/funcs/scores.py
index 9e3d0bc4ba3f3a6615681f0840e4e3eb25031f4c..42927d050fe0df7d7d4517dac8e66e1cec8943f7 100644
--- a/saqc/funcs/scores.py
+++ b/saqc/funcs/scores.py
@@ -180,7 +180,7 @@ class ScoresMixin:
         target = target[0]
 
         fields = toSequence(field)
-        val_frame = self._data[fields].copy().to_df(how="inner")
+        val_frame = self._data[fields].copy().to_pandas(how="inner")
         score_ser = pd.Series(np.nan, index=val_frame.index, name=target)
 
         val_frame.dropna(inplace=True)
diff --git a/saqc/lib/plotting.py b/saqc/lib/plotting.py
index 56a27e8f29828f7c6b65c207c800c2e18f8fb6ea..a8f9b054083489fc31601df5cd7b22a3b91e1e2e 100644
--- a/saqc/lib/plotting.py
+++ b/saqc/lib/plotting.py
@@ -16,9 +16,8 @@ import numpy as np
 import pandas as pd
 from typing_extensions import Literal
 
-from saqc.core import Flags
+from saqc.core import DictOfSeries, Flags
 from saqc.lib.tools import toSequence
-from saqc.lib.types import DiosLikeT
 
 STATSDICT = {
     "values total": lambda x, y, z: len(x),
@@ -55,7 +54,7 @@ SCATTER_KWARGS = {
 
 
 def makeFig(
-    data: DiosLikeT,
+    data: DictOfSeries,
     field: str,
     flags: Flags,
     level: float,
@@ -71,10 +70,10 @@ def makeFig(
 
     Parameters
     ----------
-    data : {pd.DataFrame, dios.DictOfSeries}
+    data : {pd.DataFrame, DictOfSeries}
         data
 
-    flags : {pd.DataFrame, dios.DictOfSeries, saqc.flagger}
+    flags : {pd.DataFrame, DictOfSeries, saqc.flagger}
         Flags or flagger object
 
     field : str
@@ -118,7 +117,8 @@ def makeFig(
     if ax_kwargs is None:
         ax_kwargs = {}
 
     # data retrieval
-    d = data[field]
+    d = data[field].copy(deep=False)
+    d.name = field
     # data slicing:
     xscope = xscope or slice(xscope)
     d = d[xscope]
diff --git a/saqc/lib/tools.py b/saqc/lib/tools.py
index 8bf96b839e1121b96ef300e153eee3e72e9d0b07..6c02070d760251da73c011c865fc8f3b13bfd1bf 100644
--- a/saqc/lib/tools.py
+++ b/saqc/lib/tools.py
@@ -342,12 +342,9 @@ def detectDeviants(
     In addition, only a group is considered "normal" if it contains more than `frac` percent of the
     variables in "fields".
 
-    Note, that the function also can be used to detect anormal regimes in a variable by assigning the different regimes
-    dios.DictOfSeries columns and passing this dios.
-
     Parameters
     ----------
-    data : {pandas.DataFrame, dios.DictOfSeries}
+    data : {pandas.DataFrame, DictOfSeries}
         Input data
     metric : Callable[[numpy.array, numpy.array], float]
         A metric function for calculating the dissimilarity between 2 variables.
@@ -365,8 +362,8 @@ def detectDeviants(
 
     Returns
     -------
-    deviants : List
-        A list containing the column positions of deviant variables in the input frame/dios.
+    deviants : list
+        A list containing the column positions of deviant variables in the input data.
     """
     var_num = len(data.columns)
@@ -394,7 +391,8 @@ def detectDeviants(
         pop_num = np.sum(list(counts.values()))
     else:
         raise ValueError(
-            "Not a valid normality criteria keyword passed. Pass either 'variables' or 'population'."
+            "Not a valid normality criterion keyword passed. "
+            "Pass either 'variables' or 'population'."
         )
 
     norm_cluster = -1
diff --git a/saqc/lib/types.py b/saqc/lib/types.py
index 4bdb7e4e2841dd3dbd6e8d01151de86c6e2961bc..0de6217f7047ce32dc3c2923fb9ba369bfd783e2 100644
--- a/saqc/lib/types.py
+++ b/saqc/lib/types.py
@@ -19,8 +19,6 @@ from saqc.core import DictOfSeries
 __all__ = [
     "T",
     "ArrayLike",
-    "PandasLike",
-    "DiosLikeT",
     "CurveFitter",
     "ExternalFlag",
     "OptionalNone",
 ]
@@ -28,8 +26,6 @@
 T = TypeVar("T")
 ArrayLike = TypeVar("ArrayLike", np.ndarray, pd.Series, pd.DataFrame)
-PandasLike = Union[pd.Series, pd.DataFrame, DictOfSeries]
-DiosLikeT = Union[DictOfSeries, pd.DataFrame]
 
 ExternalFlag = Union[str, float, int]
@@ -44,7 +40,7 @@ class GenericFunction(Protocol):
     __name__: str
     __globals__: Dict[str, Any]
 
-    def __call__(self, *args: pd.Series) -> PandasLike:
+    def __call__(self, *args: pd.Series) -> pd.Series | pd.DataFrame | DictOfSeries:
         ...  # pragma: no cover
diff --git a/tests/api/test_creation.py b/tests/api/test_creation.py
index 60ae95a4ab7f66305a1f7cf420a05a4cba55e3e9..70e0875f46c00bb80156e2f08ba7a65bb2fd5dcf 100644
--- a/tests/api/test_creation.py
+++ b/tests/api/test_creation.py
@@ -7,13 +7,9 @@
 import numpy as np
 import pandas as pd
 
-# directly import container class to avoid importing
-# saqc here.
-import dios
-
 
 def test_init():
-    from saqc import Flags, SaQC
+    from saqc import DictOfSeries, Flags, SaQC
 
     arr = np.array(
         [
@@ -26,4 +22,4 @@ def test_init():
 
     assert isinstance(qc, SaQC)
     assert isinstance(qc._flags, Flags)
-    assert isinstance(qc._data, dios.DictOfSeries)
+    assert isinstance(qc._data, DictOfSeries)
diff --git a/tests/core/test_core.py b/tests/core/test_core.py
index e0dedae95b498008aa52b4e345c68ff2d7c214c2..03bd456b4b374c17669b6b109f57a6e56772b87d 100644
--- a/tests/core/test_core.py
+++ b/tests/core/test_core.py
@@ -13,7 +13,7 @@ import pandas as pd
 import pytest
 
 from saqc import BAD, FILTER_ALL, FILTER_NONE, UNFLAGGED, SaQC
-from saqc.core import Flags, flagging, initFlagsLike, processing, register
+from saqc.core import DictOfSeries, Flags, flagging, initFlagsLike, processing, register
 from saqc.lib.types import OptionalNone
 from tests.common import initData
 
@@ -27,8 +27,9 @@ def data():
 
 @pytest.fixture
 def flags(data, optional):
-    if not optional:
-        return initFlagsLike(data[data.columns[::2]]).toDios()
+    if optional:
+        return None
+    return DictOfSeries(initFlagsLike(data[data.columns[::2]]))
 
 
 def test_errorHandling(data):
@@ -55,7 +56,7 @@ def test_dtypes(data, flags):
         return saqc
 
     flags = initFlagsLike(data)
-    flags_raw = flags.toDios()
+    flags_raw = DictOfSeries(flags)
     var1, var2 = data.columns[:2]
 
     pflags = SaQC(data, flags=flags_raw).flagAll(var1).flagAll(var2).flags
@@ -69,34 +70,39 @@ def test_new_call(data):
     qc = qc.flagRange("var1", max=5)
 
 
+def test_SaQC_attributes():
+    """Test if all instance attributes are in SaQC._attributes"""
+    qc = SaQC()
+    for name in [n for n in dir(qc) if not n.startswith("__")]:
+        if hasattr(SaQC, name):  # skip class attributes
+            continue
+        assert name in SaQC._attributes
+
+
 def test_copy(data):
     qc = SaQC(data)
-
     qc = qc.flagRange("var1").flagRange("var1", min=0, max=0)
 
     deep = qc.copy(deep=True)
     shallow = qc.copy(deep=False)
-
     for copy in [deep, shallow]:
         assert copy is not qc
-        assert copy._scheme is not qc._scheme
-        assert copy._attrs is not qc._attrs
+        for name in [n for n in dir(qc) if not n.startswith("__")]:
+            if hasattr(SaQC, name):  # skip class attributes
+                continue
+            qc_attr = getattr(qc, name)
+            other_attr = getattr(copy, name)
+            assert qc_attr is not other_attr
 
-        assert copy._data is not qc._data
-        assert copy._flags is not qc._flags
+    # History is always copied
+    assert deep._flags._data["var1"] is not qc._flags._data["var1"]
+    assert shallow._flags._data["var1"] is not qc._flags._data["var1"]
 
-        assert copy._data._data is not qc._data._data
-        assert copy._flags._data is not qc._flags._data
+    # underlying data NOT copied
+    assert shallow._data["var1"] is qc._data["var1"]
 
     # underlying data copied
-    assert deep._data._data.iloc[0] is not qc._data._data.iloc[0]
-    assert (
-        deep._flags._data["var1"].hist.index is not qc._flags._data["var1"].hist.index
-    )
-
-    # underling data NOT copied
-    assert shallow._data._data.iloc[0] is qc._data._data.iloc[0]
-    assert shallow._flags._data["var1"].hist.index is qc._flags._data["var1"].hist.index
+    assert deep._data["var1"] is not qc._data["var1"]
 
 
 def test_sourceTargetCopy():
@@ -364,3 +370,48 @@ def test_dfilterTranslation(data, user_flag, internal_flag):
     field = data.columns[0]
     qc = SaQC(data, scheme="simple")
     qc.flagFoo(field, dfilter=user_flag)
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        # 2c + 1c -> 3c
+        (
+            [
+                DictOfSeries(a=pd.Series([1]), b=pd.Series([2])),
+                DictOfSeries(c=pd.Series([3])),
+            ],
+            DictOfSeries(a=pd.Series([1]), b=pd.Series([2]), c=pd.Series([3])),
+        ),
+ # 1c + 1c + 1c -> 3c + ( + [ + DictOfSeries(a=pd.Series([1])), + DictOfSeries(b=pd.Series([2])), + DictOfSeries(c=pd.Series([3])), + ], + DictOfSeries(a=pd.Series([1]), b=pd.Series([2]), c=pd.Series([3])), + ), + ], +) +def test_concatDios(data, expected): + result = SaQC(data) + assert result.data == expected + + +@pytest.mark.parametrize( + "data,expected", + [ + ( + [ + DictOfSeries(a=pd.Series([1]), b=pd.Series([2])), + DictOfSeries(b=pd.Series([99])), + ], + DictOfSeries(a=pd.Series([1]), b=pd.Series([99])), + ) + ], +) +def test_concatDios_warning(data, expected): + with pytest.warns(UserWarning): + result = SaQC(data) + assert result.data == expected diff --git a/tests/core/test_flags.py b/tests/core/test_flags.py index fbb09e0d3c335401050535201ebea0f15568eabe..74e3f4d905b70b73dbd5602e4f8b1c48be5e6f65 100644 --- a/tests/core/test_flags.py +++ b/tests/core/test_flags.py @@ -290,14 +290,14 @@ def _validate_flags_equals_frame(flags, df): @pytest.mark.parametrize("data", testdata) def test_to_dios(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): flags = Flags(data) - df = flags.toDios() - - assert isinstance(df, DictOfSeries) - _validate_flags_equals_frame(flags, df) + with pytest.deprecated_call(): + result = flags.toDios() + assert isinstance(result, DictOfSeries) + _validate_flags_equals_frame(flags, result) @pytest.mark.parametrize("data", testdata) -def test_to_frame(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): +def test_toFrame(data: Union[pd.DataFrame, DictOfSeries, Dict[str, pd.Series]]): flags = Flags(data) df = flags.toFrame() diff --git a/tests/core/test_frame.py b/tests/core/test_frame.py index 267e522e0b5cb4f509572a5b7572a2c41a1d8549..2c4871e6c716fc280ecd636e52357fba3ed1569b 100644 --- a/tests/core/test_frame.py +++ b/tests/core/test_frame.py @@ -6,35 +6,6 @@ import pytest -from saqc.core.frame import DictOfSeries as DoS -from saqc.core.frame import concatDios +from saqc.core.frame import DictOfSeries - -@pytest.mark.parametrize( - "data, expected", - [ - # 2c + 1c -> 3c - ([DoS(dict(a=[1], b=[2])), DoS(dict(c=[3]))], DoS(dict(a=[1], b=[2], c=[3]))), - # 1c + 1c + 1c -> 3c - ( - [DoS(dict(a=[1])), DoS(dict(b=[1])), DoS(dict(c=[1]))], - DoS(dict(a=[1], b=[1], c=[1])), - ), - # 2c + 1c (overwrite) = 2c - ([DoS(dict(a=[1], b=[2])), DoS(dict(b=[22]))], DoS(dict(a=[1], b=[22]))), - # 1c + 1c + 1c (all overwrite) -> 1c - ( - [DoS(dict(a=[1])), DoS(dict(a=[11])), DoS(dict(a=[111]))], - DoS(dict(a=[111])), - ), - ], -) -def test_concatDios(data, expected): - result = concatDios(data, warn=False) - assert result == expected - - -@pytest.mark.parametrize("data", [[DoS(dict(a=[1], b=[2])), DoS(dict(b=[22]))]]) -def test_concatDios_warning(data): - with pytest.warns(UserWarning): - concatDios(data, warn=True, stacklevel=0) +# todo : test for DictOfSeries diff --git a/tests/core/test_translator.py b/tests/core/test_translator.py index 1da0075e90d378ba77465faa2f826fc8ee5f2158..63a7812c8bee7bd6244f910ab4663757aba18554 100644 --- a/tests/core/test_translator.py +++ b/tests/core/test_translator.py @@ -149,7 +149,7 @@ def test_positionalTranslatorIntegration(): saqc = saqc.flagMissing(col).flagRange(col, min=3, max=10, flag=DOUBTFUL) flags = saqc.flags - for field in flags.columns: + for field in flags.keys(): assert flags[field].astype(str).str.match("^9[012]*$").all() round_trip = scheme.toExternal(scheme.toInternal(flags)) diff --git a/tests/fixtures.py b/tests/fixtures.py index 
b8d05ceb5d12d07a41ab27a8e8cd7a67b00e222e..5763a6b711059d8240d6c69816246789add1096b 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -54,7 +54,7 @@ def course_1(char_dict): int(np.floor(len(t_index) / 2)) - 1 : int(np.floor(len(t_index) / 2)) + 1 ] - data = DictOfSeries(data=s, columns=[name]) + data = DictOfSeries({name: s}) return data, char_dict return fix_funk @@ -94,7 +94,7 @@ def course_2(char_dict): char_dict[kind] = data.index[int(np.floor(periods / 2))] char_dict["return"] = data.index[int(np.floor(len(t_index) / 2)) + 1] - data = DictOfSeries(data=data, columns=["data"]) + data = DictOfSeries(data=data) return data, char_dict return fix_funk @@ -120,7 +120,7 @@ def course_test(char_dict): data.iloc[2] = out_val data.iloc[3] = out_val - data = DictOfSeries(data=data, columns=["data"]) + data = DictOfSeries(data=data) return data, char_dict return fix_funk @@ -177,7 +177,7 @@ def course_3(char_dict): char_dict[kind] = anomaly_index char_dict["return"] = t_index[int(len(t_index) / 2) + 1] - data = DictOfSeries(data=data, columns=["data"]) + data = DictOfSeries(data=data) return data, char_dict return fix_funk @@ -207,7 +207,7 @@ def course_4(char_dict): char_dict["raise"] = t_index[int(len(t_index) / 2) :: 2] char_dict["return"] = t_index[int((len(t_index) / 2) + 1) :: 2] - data = DictOfSeries(data=data, columns=["data"]) + data = DictOfSeries(data=data) return data, char_dict return fix_funk @@ -239,7 +239,7 @@ def course_5(char_dict): s.iloc[nan_slice] = np.nan char_dict["missing"] = s.iloc[nan_slice].index - data = DictOfSeries(data=s, columns=["data"]) + data = DictOfSeries(data=s) return data, char_dict return fix_funk diff --git a/tests/funcs/test_functions.py b/tests/funcs/test_functions.py index 991481575dd465368e82f5164956b3854b018538..5ac707b351058d7dcbb6b5556fa9a4a8ddaf6336 100644 --- a/tests/funcs/test_functions.py +++ b/tests/funcs/test_functions.py @@ -32,7 +32,7 @@ def test_statPass(): noise = [-1, 1] * 10 data[100:120] = noise data[200:210] = noise[:10] - data = DictOfSeries(data) + data = DictOfSeries(data=data) flags = initFlagsLike(data) qc = SaQC(data, flags).flagByStatLowPass( "data", np.std, "20D", 0.999, "5D", 0.999, 0, flag=BAD diff --git a/tests/funcs/test_generic_api_functions.py b/tests/funcs/test_generic_api_functions.py index 18eb41240fb4a8aceb63ae99ac90af0548be38e9..4a1065c04fa5d324af7381d6c32dc32fdac9fa3c 100644 --- a/tests/funcs/test_generic_api_functions.py +++ b/tests/funcs/test_generic_api_functions.py @@ -159,7 +159,7 @@ def test_writeTargetProcGeneric(data, targets, func, expected_data): dfilter=dfilter, label="generic", ) - assert (expected_data == res.data).all(axis=None) + assert expected_data == res.data # check that new histories where created for target in targets: assert res._flags.history[target].hist.iloc[0].isna().all() @@ -202,7 +202,7 @@ def test_overwriteFieldProcGeneric(data, fields, func, expected_data): ) res = saqc.processGeneric(field=fields, func=func, dfilter=dfilter, label="generic") - assert (expected_data == res.data).all(axis=None) + assert expected_data == res.data # check that the histories got appended for field in fields: assert (res._flags.history[field].hist[0] == 127.0).all() @@ -223,7 +223,7 @@ def test_label(): qc = qc.flagGeneric( ["data1", "data3"], target="data2", - func=lambda x, y: isflagged(x, "out of range") | isflagged(y), + func=lambda x, y: isflagged(x, "out of range") | isflagged(y), # noqa ) assert list((qc.flags["data2"] > 0).values) == [False, False, True, False, False] diff --git 
a/tests/funcs/test_generic_config_functions.py b/tests/funcs/test_generic_config_functions.py index 194f62d88de4e9ac9612eeff4aa0ed829153d62e..6b29d9d6189bea639876d84fe9fa0bb12b06e890 100644 --- a/tests/funcs/test_generic_config_functions.py +++ b/tests/funcs/test_generic_config_functions.py @@ -33,9 +33,9 @@ def data_diff(): mid = len(col0) // 2 offset = len(col0) // 8 return DictOfSeries( - data={ - col0.name: col0.iloc[: mid + offset], - col1.name: col1.iloc[mid - offset :], + { + data.columns[0]: col0.iloc[: mid + offset], + data.columns[1]: col1.iloc[mid - offset :], } ) diff --git a/tests/funcs/test_outlier_detection.py b/tests/funcs/test_outlier_detection.py index 4fdc17ec0e310526281357d8249bb39836a48a10..ebcaa9e8858d522105fe3721638b4df93370bcc8 100644 --- a/tests/funcs/test_outlier_detection.py +++ b/tests/funcs/test_outlier_detection.py @@ -21,11 +21,11 @@ from tests.fixtures import char_dict, course_1, course_2, course_3, course_4 @pytest.fixture(scope="module") def spiky_data(): index = pd.date_range(start="2011-01-01", end="2011-01-05", freq="5min") - s = pd.Series(np.linspace(1, 2, index.size), index=index, name="spiky_data") + s = pd.Series(np.linspace(1, 2, index.size), index=index) s.iloc[100] = 100 s.iloc[1000] = -100 flag_assertion = [100, 1000] - return DictOfSeries(s), flag_assertion + return DictOfSeries(spiky_data=s), flag_assertion def test_flagMad(spiky_data): @@ -95,10 +95,10 @@ def test_flagMVScores(dat): periods=1000, initial_level=20, final_level=1, out_val=30 ) fields = ["field1", "field2"] - s1, s2 = data1.squeeze(), data2.squeeze() + s1, s2 = data1["data"], data2["data"] s1 = pd.Series(data=s1.values, index=s1.index) s2 = pd.Series(data=s2.values, index=s1.index) - data = DictOfSeries([s1, s2], columns=["field1", "field2"]) + data = DictOfSeries(field1=s1, field2=s2) flags = initFlagsLike(data) qc = SaQC(data, flags).flagMVScores( field=fields, @@ -131,10 +131,10 @@ def test_flagCrossStatistics(dat): data1, characteristics = dat(initial_level=0, final_level=0, out_val=0) data2, characteristics = dat(initial_level=0, final_level=0, out_val=10) fields = ["field1", "field2"] - s1, s2 = data1.squeeze(), data2.squeeze() + s1, s2 = data1["data"], data2["data"] s1 = pd.Series(data=s1.values, index=s1.index) s2 = pd.Series(data=s2.values, index=s1.index) - data = DictOfSeries([s1, s2], columns=["field1", "field2"]) + data = DictOfSeries(field1=s1, field2=s2) flags = initFlagsLike(data) qc = SaQC(data, flags).flagCrossStatistics( @@ -157,19 +157,19 @@ def test_flagZScores(): qc = saqc.SaQC(data) qc = qc.flagZScore("data", window=None) - assert (qc.flags.to_df().iloc[[5, 40, 80], 0] > 0).all() + assert (qc.flags.to_pandas().iloc[[5, 40, 80], 0] > 0).all() qc = saqc.SaQC(data) qc = qc.flagZScore("data", window=None, min_residuals=10) - assert (qc.flags.to_df()["data"] < 0).all() + assert (qc.flags.to_pandas()["data"] < 0).all() qc = saqc.SaQC(data) qc = qc.flagZScore("data", window="20D") - assert (qc.flags.to_df().iloc[[40, 80], 0] > 0).all() + assert (qc.flags.to_pandas().iloc[[40, 80], 0] > 0).all() qc = saqc.SaQC(data) qc = qc.flagZScore("data", window=20) - assert (qc.flags.to_df().iloc[[40, 80], 0] > 0).all() + assert (qc.flags.to_pandas().iloc[[40, 80], 0] > 0).all() diff --git a/tests/funcs/test_pattern_rec.py b/tests/funcs/test_pattern_rec.py index f885522c42e040e99837dab58c4c24569fa66789..6b96ee0ca5b6a44a962322eb98d7ffb54fdbf20f 100644 --- a/tests/funcs/test_pattern_rec.py +++ b/tests/funcs/test_pattern_rec.py @@ -31,7 +31,7 @@ def test_flagPattern_dtw(plot, 
normalize):
     data.iloc[10:18] = [0, 5, 6, 7, 6, 8, 5, 0]
     pattern = data.iloc[10:18]
 
-    data = DictOfSeries(dict(data=data, pattern_data=pattern))
+    data = DictOfSeries(data=data, pattern_data=pattern)
     flags = initFlagsLike(data, name="data")
     qc = SaQC(data, flags).flagPatternByDTW(
         "data",
diff --git a/tests/funcs/test_proc_functions.py b/tests/funcs/test_proc_functions.py
index 4843842f37c4e31c5991e0a14231630aba7cb319..1e4ddceba2c6b38ddb3328cba3caa8ae5e607a93 100644
--- a/tests/funcs/test_proc_functions.py
+++ b/tests/funcs/test_proc_functions.py
@@ -109,7 +109,7 @@ def test_resample(course_5):
 def test_interpolateGrid(course_5, course_3):
     data, _ = course_5()
     data_grid, _ = course_3()
-    data["grid"] = data_grid.to_df()
+    data["grid"] = data_grid["data"]
     flags = initFlagsLike(data)
     SaQC(data, flags).interpolateIndex(
         "data", "1h", "time", grid_field="grid", limit=10
     )
@@ -123,7 +123,7 @@ def test_offsetCorrecture():
     data.iloc[70:80] = 100
     flags = initFlagsLike(data)
     qc = SaQC(data, flags).correctOffset("dat", 40, 20, "3d", 1)
-    assert (qc.data == 0).all()[0]
+    assert (qc.data["dat"] == 0).all()
 
 
 # GL-333
diff --git a/tests/funcs/test_resampling.py b/tests/funcs/test_resampling.py
index 2eb7316afc7a5faca5dfd7796c12c486aef17add..2fc0623e23ed7f37bc10ffeaa23534482d949c8d 100644
--- a/tests/funcs/test_resampling.py
+++ b/tests/funcs/test_resampling.py
@@ -26,10 +26,10 @@ def data():
     index = index.insert(5, pd.Timestamp(2011, 1, 1, 0, 31, 0))
     index = index.insert(0, pd.Timestamp(2010, 12, 31, 23, 57, 0))
     index = index.drop(pd.Timestamp("2011-01-01 00:30:00"))
-    dat = pd.Series(np.linspace(-50, 50, index.size), index=index, name="data")
+    dat = pd.Series(np.linspace(-50, 50, index.size), index=index)
     # good to have some nan
     dat[-3] = np.nan
-    data = DictOfSeries(dat)
+    data = DictOfSeries(data=dat)
     return data
 
@@ -87,7 +87,7 @@ def test_gridInterpolation(data, method, fill_history):
     field = "data"
     data = data[field]
     data = pd.concat([data * np.sin(data), data.shift(1, "2h")]).shift(1, "3s")
-    data = DictOfSeries(data)
+    data = DictOfSeries(data=data)
     flags = initFlagsLike(data)
 
     if fill_history == "none":
diff --git a/tests/funcs/test_tools.py b/tests/funcs/test_tools.py
index 75e51273734a2dcfb0ca8b5253fff0616219bdec..7fc412e38563d616003a7ac60c4c9614e1ec823f 100644
--- a/tests/funcs/test_tools.py
+++ b/tests/funcs/test_tools.py
@@ -16,10 +16,9 @@ from saqc.core import DictOfSeries
 
 def test_makeFig(tmp_path):
     # just testing for no errors to occur...
data = DictOfSeries( - pd.Series( + data=pd.Series( np.linspace(0, 1000, 1000), pd.date_range("2000", "2001", periods=1000), - name="data", ) ) d_saqc = saqc.SaQC(data) diff --git a/tests/fuzzy/lib.py b/tests/fuzzy/lib.py index a1bf9ed274e8b8e069ab4945e49ff92f331f0bdd..3604bd472b9b4435ff3fb830e583c1d4aeedf4aa 100644 --- a/tests/fuzzy/lib.py +++ b/tests/fuzzy/lib.py @@ -152,11 +152,11 @@ def functionKwargs(draw, func): i64 = np.iinfo("int64") strategies = { - FreqString: frequencyStrings, - ColumnName: lambda _: sampled_from( - sorted(c for c in data.columns if c != field) - ), - IntegerWindow: lambda _: integers(min_value=1, max_value=len(data[field]) - 1), + # FreqString: frequencyStrings, + # ColumnName: lambda _: sampled_from( + # sorted(c for c in data.columns if c != field) + # ), + # IntegerWindow: lambda _: integers(min_value=1, max_value=len(data[field]) - 1), int: lambda _: integers(min_value=i64.min + 1, max_value=i64.max - 1), } diff --git a/tests/fuzzy/test_masking.py b/tests/fuzzy/test_masking.py index ef7d54e968688ca7a026bcd4261855821b11bfe0..b224efdc77bc0f45afefbe5863cc592cf77af886 100644 --- a/tests/fuzzy/test_masking.py +++ b/tests/fuzzy/test_masking.py @@ -10,7 +10,7 @@ import pandas as pd import pytest from hypothesis import given, settings -from saqc import BAD, UNFLAGGED +from saqc import BAD, UNFLAGGED, DictOfSeries from saqc.core.register import _maskData, _unmaskData from tests.fuzzy.lib import MAX_EXAMPLES, dataFieldFlags @@ -23,11 +23,17 @@ def test_maskingMasksData(data_field_flags): test if flagged values are replaced by np.nan """ data_in, field, flags = data_field_flags - data_masked, mask = _maskData( - data_in, flags, columns=[field], thresh=UNFLAGGED - ) # thresh UNFLAGGED | np.inf - assert data_masked[field].iloc[mask[field].index].isna().all() - assert (flags[field].iloc[mask[field].index] > UNFLAGGED).all() + data_masked, mask = _maskData(data_in, flags, columns=[field], thresh=UNFLAGGED) + assert isinstance(data_masked, DictOfSeries) + assert isinstance(mask, DictOfSeries) + assert field in data_masked.columns + if field in mask.columns: + assert data_masked[field].iloc[mask[field].index].isna().all() + assert (flags[field].iloc[mask[field].index] > UNFLAGGED).all() + else: + # if nothing gets masked in a column, + # the column does not appear in mask + assert (flags[field] == UNFLAGGED).all() @pytest.mark.slow @@ -38,9 +44,8 @@ def test_dataMutationPreventsUnmasking(data_field_flags): if `data` is mutated after `_maskData`, `_unmaskData` should be a no-op """ - filler = -9999 - data_in, field, flags = data_field_flags + filler = pd.Series(-9999.0, index=data_in[field].index, dtype=float) data_masked, mask = _maskData(data_in, flags, columns=[field], thresh=UNFLAGGED) data_masked[field] = filler @@ -104,7 +109,7 @@ def test_unmaskingInvertsMasking(data_field_flags): data_masked, mask = _maskData(data_in, flags, columns=[field], thresh=UNFLAGGED) data_out = _unmaskData(data_masked, mask) assert pd.DataFrame.equals( - data_out.to_df().astype(float), data_in.to_df().astype(float) + data_out.to_pandas().astype(float), data_in.to_pandas().astype(float) )