Commit e4b4ba31 authored by Bert Palm

merged develop

parents 2540ea87 342dcea4
Pipeline #47051 failed with stage
in 2 minutes and 17 seconds
# ===========================================================
# preparation
# ===========================================================
variables:
  GIT_SUBMODULE_STRATEGY: recursive

default:
  image: python:3.8
  before_script:
    - pip install --upgrade pip
    - pip install pytest
    - pip install -r requirements.txt

# ===========================================================
# normal jobs (non scheduled)
# ===========================================================

# test saqc with python 3.7
python37:
  stage: test
  except:
    - schedules
  image: python:3.7
  script:
    - pytest tests/core tests/funcs tests/integration dios/test
    - python -m saqc --config ressources/data/config_ci.csv --data ressources/data/data.csv --outfile /tmp/test.csv

# test saqc with python 3.8
python38:
  stage: test
  except:
    - schedules
  script:
    - pytest tests/core tests/funcs tests/integration dios/test
    - python -m saqc --config ressources/data/config_ci.csv --data ressources/data/data.csv --outfile /tmp/test.csv

# test saqc with python 3.9
python39:
  stage: test
  except:
    - schedules
  image: python:3.9
  script:
    - pytest tests/core tests/funcs tests/integration
    - python -m saqc --config ressources/data/config_ci.csv --data ressources/data/data.csv --outfile /tmp/test.csv

# check if everything is properly formatted
black:
  stage: test
  script:
    - pip install black
    - black --check .

# make (visual) coverage in gitlab merge request diffs
coverage:
  stage: test
  except:
    - schedules
  allow_failure: true
  script:
    - pip install pytest-cov coverage
    - pytest --cov=saqc tests/core tests/funcs
  after_script:
    - coverage xml
  # regex to find the coverage percentage in the job output
  coverage: '/^TOTAL.+?(\d+\%)$/'
  artifacts:
    when: always
    reports:
      cobertura: coverage.xml

# make html docu with sphinx
pages:
  stage: deploy
  only:
    - cookBux
  except:
    - schedules
  script:
    - cd sphinx-doc/
    - pip install -r requirements_sphinx.txt
    - make doc
    - cp -r _build/html ../public
  artifacts:
    paths:
      - public

# ===========================================================
# scheduled jobs
# ===========================================================

# fuzzy testing saqc
fuzzy:
  stage: test
  only:
    - schedules
  script:
    - pytest tests/fuzzy

# test lib saqc
testLib:
  stage: test
  only:
    - schedules
  script:
    - pytest tests/lib
@@ -79,10 +79,9 @@
## Breaking Changes
- register is now a decorator instead of a wrapper

# 1.5
coming soon ...

## Features
@@ -3,29 +3,105 @@ We recommend a virtual python environment for development. The setup process is
# Testing
SaQC comes with an extensive test suite based on [pytest](https://docs.pytest.org/en/latest/).
In order to run all tests execute `python -m pytest .`; for faster iteration a test run with
`python -m pytest --ignore test/lib test` is usually enough.
# Coding conventions

## Naming

### Code
We implement the following naming conventions (a short sketch follows the list):
- Classes: CamelCase
- Functions: camelCase
- Variables/Arguments: snake_case
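
A minimal, made-up sketch of these conventions:

```python
# made-up names, only to illustrate the naming conventions above
class RangeFlagger:                        # classes: CamelCase
    pass


def flagRange(data, lower, upper):         # functions: camelCase
    out_of_range = (data < lower) | (data > upper)   # variables/arguments: snake_case
    return out_of_range
```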
### Argument names in public function signatures

First, in contrast to variable names in code, argument names do not have to be
*talking*: to use and parameterize a function, one always has to read its
documentation anyway. Guessing the meaning of arguments from their names alone,
without reading the docs, will almost never work. That is why we are under no
obligation to make argument names (very) talkative.

Second, because the nature of a function is to provide a *simple* way to use
complex code, it is common to use simple and short names and to omit any
*irrelevant* information. For example, say we have a function that fits a
polynomial to some data and takes three arguments:
- the data input,
- a threshold that defines a cutoff point for a calculation on the polynomial and
- a third argument.

One could name the arguments `data, poly_cutoff_threshold, ...`, but much better
names would be `data, thresh, ...`, because the caller does not need the extra
information stuffed into the name. If the third argument is also some kind of
threshold, one can use `data, cutoff, thresh`: the *thresh-* part of the
`cutoff` parameter is not crucial, and the caller learns from the docstring that
it is a threshold (see the sketch below).

Third, underscores give a good hint that a name is getting wrong or overly
complex. No underscore is fine, one underscore is ok if the information is
*really necessary* (see above), but with two or more underscores one should
think of a better name or omit some information. Rarely it may be necessary to
use two underscores, but we consider it bad style. Using three or more
underscores is not allowed unless one writes up a reasoning and gets it signed
by at least as many core developers as underscores one wants to use.

In short, the name should give a *very, very rough idea* of the purpose of the
argument, but neither *explain* its usage nor its purpose.
It is not a shame to name a parameter just `n` or `alpha` etc. if, for example,
the algorithm (from the paper etc.) names it alike.
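
A minimal sketch of such a signature; `fitPolynomial` and its parameters are hypothetical and only illustrate the naming advice:

```python
# hypothetical function, only to illustrate the argument naming advice above
def fitPolynomial(data, cutoff, thresh):
    """
    Fit a polynomial to the given data.

    Parameters
    ----------
    data : dios.DictOfSeries
        The data to fit the polynomial to.
    cutoff : float
        Cutoff point for a calculation on the fitted polynomial.
    thresh : float
        A second threshold used by that calculation.
    """
    ...
```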
### Test Functions
- testnames: [testmodule_]flagTestName
## Formatting
We use [black](https://black.readthedocs.io/en/stable/) in its default settings.
Within the `saqc` root directory run `black .`.
## Imports
Only absolute imports are accepted.
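
For example (the exact module path is only illustrative):

```python
# accepted: absolute import
from saqc.funcs import flagRange

# not accepted: relative import
# from .funcs import flagRange
```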
# Development Workflow
## Repository Structure
- `master` - branch:
+ Stable and usually protected.
+ Regular merges from `develop` according to the [release cycle](#release-cycle). These merges get a tag, increasing at least the minor version.
+ Irregular merges from `develop` in case of critical bugs. Such merges increase at least the patch level.
+ Merges into `master` usually lead to a PyPI release
- `develop` - branch:
+ The main development branch, no hard stability requirements/guarantees
+ Merges into `develop` should mostly follow a Merge Request Workflow; minor changes can, however, be committed directly. Such minor changes include:
* Typos and white space changes
* Obvious bugs in features implemented by the committing developer
## Merge Request Workflow
- Most changes to `saqc` are integrated by merge requests from a feature branch into `develop`
- All merge requests need to be reviewed by at least one other core developer (currently @palmb, @luenensc and @schaefed).
- We implement the following Gitlab based review process:
+ The author assigns the Merge Request to one of the core developers. The reviewer should review the request within one week;
large requests may of course lead to longer review times.
+ Reviewer and Author discuss any issues using the Gitlab code review facilities:
* In case all concerns are resolved, the reviewer approves the Merge Request and assigns it back to the author.
* In case reviewer and author can't resolve their discussion, the Merge Request should be assigned to another reviewer.
The new reviewer is now in charge of coming to a decision by either approving, closing or going into another review iteration.
+ The author of an approved Merge Request:
* has the right and the duty to merge into the `develop` branch; any occurring conflicts need to be addressed by the author,
* is always highly encouraged to provide a summary of the changes introduced with the Merge Request in its description upon integration. This recommendation becomes an obligation in case of interface modifications or changes to supported and/or documented workflows.
## Release Cycle
- We employ a release cycle of roughly 4 weeks.
- To avoid the integration of untested and/or broken changes, the merge window closes one week before the intended
release date. Commits to `develop` after the merge window of a release closes need to be integrated during the subsequent release
cycle.
- The release cycle is organized by Gitlab Milestones; the expiration date of a certain milestone indicates the end of the
related merge window. The actual merge into `master` and the accompanying release are scheduled for the week after the
milestone's expiration date.
- Issues and Merge Requests can and should be associated with these milestones, as this helps in the organization of review activities.
@@ -58,12 +58,12 @@ dataset and the routines to inspect, quality control and/or process them.
The content of such a configuration could look like this:
```
varname    ; test
#----------;------------------------------------
SM2        ; shiftToFreq(freq="15Min")
SM2        ; flagMissing()
'SM(1|2)+' ; flagRange(min=10, max=60)
SM2        ; flagMad(window="30d", z=3.5)
```
As soon as the basic inputs, a dataset and the configuration file are As soon as the basic inputs, a dataset and the configuration file are
@@ -81,15 +81,16 @@ The following snippet implements the same configuration given above through
the Python-API:
```python
import numpy as np
from saqc import SaQC

saqc = (SaQC(data)
        .shiftToFreq("SM2", freq="15Min")
        .flagMissing("SM2")
        .flagRange("SM(1|2)+", regex=True, min=10, max=60)
        .flagMad("SM2", window="30d", z=3.5))

data, flags = saqc.getResult()
```
## Installation
__pycache__/
DictOfSeries
============
DictOfSeries is a pandas.Series of pandas.Series objects which aims to behave as similarly as possible to pandas.DataFrame.
Nomenclature
------------
- series/ser: instance of pandas.Series
- dios: instance of dios.DictOfSeries
- df: instance of pandas.DataFrame
- dios-like: a *dios* or a *df*
- alignable object: a *dios*, *df* or a *series*
Features
--------
* every *column* has its own index
* uses much less memory than a misaligned pandas.DataFrame
* behaves quite like a pandas.DataFrame
* additional align locator (`.aloc[]`), see the short sketch below
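
A short, hedged sketch of the align locator, assuming the example data from `dios.example_DictOfSeries()`: indexing with a boolean DictOfSeries keeps, per column, only the rows where the condition holds.

```
>>> from dios import example_DictOfSeries
>>> di = example_DictOfSeries()
>>> di.aloc[di > 9]   # per column, only the rows with values > 9 remain
```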
Install
-------
todo: PyPi
```
import dios
# Have fun :)
```
Documentation
-------------
The main documentation is on ReadTheDocs:
* [dios.rtfd.io](https://dios.rtfd.io)
but some docs are also available locally:
* [Indexing](/docs/doc_indexing.md)
* [Cookbook](/docs/doc_cookbook.md)
* [Itype](/docs/doc_itype.md)
TL;DR
-----
**get it**
```
>>> from dios import DictOfSeries
```
**empty**
```
>>> DictOfSeries()
Empty DictOfSeries
Columns: []
>>> DictOfSeries(columns=['x', 'y'])
Empty DictOfSeries
Columns: ['x', 'y']
>>> DictOfSeries(columns=['x', 'y'], index=[3,4,5])
x | y |
====== | ====== |
3 NaN | 3 NaN |
4 NaN | 4 NaN |
5 NaN | 5 NaN |
```
**with data**
```
>>> DictOfSeries([range(4), range(2), range(3)])
0 | 1 | 2 |
==== | ==== | ==== |
0 0 | 0 0 | 0 0 |
1 1 | 1 1 | 1 1 |
2 2 | | 2 2 |
3 3 | | |
>>> DictOfSeries(np.random.random([2,4]))
0 | 1 |
=========== | =========== |
0 0.112020 | 0 0.509881 |
1 0.108070 | 1 0.285779 |
2 0.851453 | 2 0.805933 |
3 0.138352 | 3 0.812339 |
>>> DictOfSeries(np.random.random([2,4]), columns=['a','b'], index=[11,12,13,14])
a | b |
============ | ============ |
11 0.394304 | 11 0.356206 |
12 0.943689 | 12 0.735356 |
13 0.791820 | 13 0.066947 |
14 0.759802 | 14 0.496321 |
>>> DictOfSeries(dict(today=['spam']*3, tomorrow=['spam']*2))
today | tomorrow |
======= | ========== |
0 spam | 0 spam |
1 spam | 1 spam |
2 spam | |
```
#!/usr/bin/env python
from .dios import *
from .lib import *
__all__ = [
"DictOfSeries",
"to_dios",
"pprint_dios",
"IntItype",
"FloatItype",
"NumItype",
"DtItype",
"ObjItype",
"ItypeWarning",
"ItypeCastWarning",
"ItypeCastError",
"is_itype",
"is_itype_subtype",
"is_itype_like",
"get_itype",
"cast_to_itype",
"CastPolicy",
"Opts",
"OptsFields",
"OptsFields",
"dios_options",
"example_DictOfSeries",
]
from . import pandas_bridge as pdextra
from .base import (
_DiosBase,
_is_dios_like,
_is_bool_dios_like,
)
import numpy as np
import pandas as pd
class _Indexer:
def __init__(self, obj: _DiosBase):
self.obj = obj
self._data = obj._data
def _unpack_key(self, key):
key = list(key) if pdextra.is_iterator(key) else key
if isinstance(key, tuple):
if len(key) > 2:
raise KeyError("To many indexers")
rowkey, colkey = key
else:
rowkey, colkey = key, slice(None)
if isinstance(rowkey, tuple) or isinstance(colkey, tuple):
raise KeyError(f"{key}. tuples are not allowed.")
rowkey = list(rowkey) if pdextra.is_iterator(rowkey) else rowkey
colkey = list(colkey) if pdextra.is_iterator(colkey) else colkey
return rowkey, colkey
def _set_value_multi_column(self, rowkey, colkey, value, xloc="loc"):
"""set value helper for loc and iloc"""
data = getattr(self._data, xloc)[colkey]
hashable_rkey = pdextra.is_hashable(rowkey)
dioslike_value = False
iter_value = False
if _is_dios_like(value):
dioslike_value = True
if hashable_rkey:
raise ValueError(f"Incompatible indexer with DictOfSeries")
elif pdextra.is_list_like(value):
value = value.values if isinstance(value, pd.Series) else value
iter_value = True
if len(value) != len(data):
raise ValueError(
f"shape mismatch: value array of shape (.., {len(value)}) could "
f"not be broadcast to indexing result of shape (.., {len(data)})"
)
c = "?"
try:
for i, c in enumerate(data.index):
dat = data.at[c]
dat_xloc = getattr(dat, xloc)
if dioslike_value:
# setting on an empty series fails, e.g. emptySer.loc[:] = [2, 1]
# len(scalar) would fail, but that cannot happen here, because
# the dios-like + hashable case was already rejected above
if len(dat_xloc[rowkey]) == 0:
continue
# unpack the value if necessary
if iter_value:
val = value[i]
elif dioslike_value:
val = value[c] if c in value else np.nan
else:
val = value
dat_xloc[rowkey] = val
except Exception as e:
raise type(e)(f"failed for column {c}: " + str(e)) from e
# #############################################################################
class _LocIndexer(_Indexer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def __getitem__(self, key):
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(rowkey) or _is_dios_like(colkey):
raise ValueError("Could not index with multidimensional key")
# simple optimisation
if pdextra.is_null_slice(rowkey) and pdextra.is_null_slice(colkey):
return self.obj.copy()
data = self._data.loc[colkey].copy()
# .loc[any, scalar] -> (a single) series
# .loc[scalar, scalar] -> (a single) value
if pdextra.is_hashable(colkey):
new = data.loc[rowkey]
# .loc[any, non-scalar]
else:
k = "?"
try:
for k in data.index:
data.at[k] = data.at[k].loc[rowkey]
except Exception as e:
raise type(e)(f"failed for column {k}: " + str(e)) from e
# .loc[scalar, non-scalar] -> column-indexed series
if pdextra.is_hashable(rowkey):
new = data
# .loc[non-scalar, non-scalar] -> dios
else:
new = self.obj.copy_empty(columns=False)
new._data = data
return new
def __setitem__(self, key, value):
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(rowkey) or _is_dios_like(colkey):
raise ValueError("Cannot index with multi-dimensional key")
# .loc[any, scalar] - set on single column
if pdextra.is_hashable(colkey):
# .loc[dont-care, new-scalar] = val
if colkey not in self.obj.columns:
self.obj._insert(colkey, value)
# .loc[any, scalar] = multi-dim
elif _is_dios_like(value) or pdextra.is_nested_list_like(value):
raise ValueError("Incompatible indexer with multi-dimensional value")
# .loc[any, scalar] = val
else:
self._data.at[colkey].loc[rowkey] = value
# .loc[any, non-scalar] = any
else:
self._set_value_multi_column(rowkey, colkey, value, xloc="loc")
# #############################################################################
class _iLocIndexer(_Indexer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def __getitem__(self, key):
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(rowkey) or _is_dios_like(colkey):
raise ValueError("Cannot index with multidimensional key")
# simple optimisation
if pdextra.is_null_slice(rowkey) and pdextra.is_null_slice(colkey):
return self.obj.copy()
data = self._data.iloc[colkey].copy()
# .iloc[any, int] -> single series
# .iloc[int, int] -> single value
if pdextra.is_integer(colkey):
new = data.iloc[rowkey]
# .iloc[any, non-int]
else:
k = "?"
try:
for k in data.index:
data.at[k] = data.at[k].iloc[rowkey]
except Exception as e:
raise type(e)(f"failed for column {k}: " + str(e)) from e
# .iloc[int, non-int] -> column-indexed series
if pdextra.is_integer(rowkey):
new = data
# .iloc[non-int, non-int] -> dios
else:
new = self.obj.copy_empty(columns=False)
new._data = data
return new
def __setitem__(self, key, value):
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(rowkey) or _is_dios_like(colkey):
raise ValueError("Cannot index with multidimensional key")
# .iloc[any, int] = Any
if pdextra.is_integer(colkey):
if _is_dios_like(value) or pdextra.is_nested_list_like(value):
raise ValueError("Incompatible indexer with multi-dimensional value")
self._data.iat[colkey].iloc[rowkey] = value
# .iloc[any, non-int] = Any
else:
self._set_value_multi_column(rowkey, colkey, value, xloc="iloc")
# #############################################################################
class _aLocIndexer(_Indexer):
"""align Indexer
Automatically align (alignable) indexer on all possible axis,
and handle indexing with non-existent or missing keys gracefully.
Also align (alignable) values before setting them with .loc
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._usebool = True
def __call__(self, usebool=True):
"""We are called if the user want to set `usebool=False', which make
boolean alignable indexer treat as non-boolean alignable indexer.
Explanation: A boolean dios indexer align its indices with the indices
of the receiving dios like a non-boolean dios indexer also would do.
Additionally all rows with False values are kicked too. To disable
that `usebool=False` can be given."""
self._usebool = usebool
return self
def __getitem__(self, key):
rowkeys, colkeys, lowdim = self._unpack_key_aloc(key)
data = pd.Series(dtype="O", index=colkeys)
kws = dict(itype=self.obj.itype, cast_policy=self.obj._policy)
c = "?"
try:
for i, c in enumerate(data.index):
data.at[c] = self._data.at[c].loc[rowkeys[i]]
except Exception as e:
raise type(e)(f"failed for column {c}: " + str(e)) from e
if lowdim:
return data.squeeze()
else:
return self.obj._constructor(data=data, fastpath=True, **kws)
def __setitem__(self, key, value):
rowkeys, colkeys, _ = self._unpack_key_aloc(key)
def iter_self(colkeys, position=False):
c = "?"
try:
for i, c in enumerate(colkeys):
dat = self._data.at[c]
rk = rowkeys[i]
if len(dat.loc[rk]) == 0:
continue
yield dat, rk, i if position else c
except Exception as e:
raise type(e)(f"failed for column {c}: " + str(e)) from e
# align columns, for rows use series.loc to align
if _is_dios_like(value):
colkeys = value.columns.intersection(colkeys)
for dat, rk, c in iter_self(colkeys):
dat.loc[rk] = value[c]
# no align, no merci
elif pdextra.is_nested_list_like(value):
if len(colkeys) != len(value):
raise ValueError(
f"shape mismatch: values array of shape "
f"(.., {len(value)}) could not "
f"be broadcast to indexing result of "
f"shape (.., {len(colkeys)})"
)
for dat, rk, i in iter_self(colkeys, position=True):
dat.loc[rk] = value[i]
# align rows by using series.loc
elif isinstance(value, pd.Series):
for dat, rk, _ in iter_self(colkeys):
dat.loc[rk] = value
# no align, no merci
else:
for dat, rk, _ in iter_self(colkeys):
dat.loc[rk] = value
def _unpack_key_aloc(self, key):
"""
Return a list of row indexers and a list of existing(!) column labels.
Both lists always have the same length and may both be empty.
Note:
The items of the row indexer list should be passed to pd.Series.loc[]
"""
# if a single column-key is given, the caller may
# want to return a single Series, instead of a dios
lowdim = False
def keys_from_bool_dios_like(key):
if not _is_bool_dios_like(key):
raise ValueError("Must pass dios-like key with boolean values only.")
colkey = self.obj.columns.intersection(key.columns)
rowkey = []
for c in colkey:
b = key[c]
rowkey += [self._data.at[c].index.intersection(b[b].index)]
return rowkey, colkey, lowdim
def keys_from_dios_like(key):
colkey = self.obj.columns.intersection(key.columns)
rowkey = [self._data.at[c].index.intersection(key[c].index) for c in colkey]
return rowkey, colkey, lowdim
rowkey, colkey = self._unpack_key(key)
if _is_dios_like(colkey) or pdextra.is_nested_list_like(colkey):
raise ValueError("Could not index with multi-dimensional column key.")
# giving the ellipsis as column key, is an alias
# for giving `usebool=False`. see self.__call__()
if colkey is Ellipsis:
self._usebool = False
colkey = slice(None)
# .aloc[dios]
if _is_dios_like(rowkey):
if not pdextra.is_null_slice(colkey):
raise ValueError(
f"Could not index with a dios-like indexer as rowkey,"
f"and a column key of that type {type(colkey)}"
)
if self._usebool:
return keys_from_bool_dios_like(rowkey)
else:
return keys_from_dios_like(rowkey)
# handle gracefully: scalar
elif pdextra.is_hashable(colkey):
colkey = [colkey] if colkey in self.obj.columns else []
lowdim = True
# column-alignable: list-like, filter only existing columns
elif pdextra.is_list_like(colkey) and not pdextra.is_bool_indexer(colkey):
colkey = colkey.values if isinstance(colkey, pd.Series) else colkey
colkey = self.obj.columns.intersection(colkey)
# handle gracefully (automatically)
# just a simple optimisation
elif pdextra.is_null_slice(colkey):
colkey = self.obj.columns
# not alignable, fall back to .loc (boolean list/series, slice(..), etc.
else:
colkey = self._data.loc[colkey].index
if len(colkey) == 0: # (!) `if not colkey:` fails for pd.Index
return [], [], lowdim
rowkey = self._get_rowkey(rowkey, colkey)
return rowkey, colkey, lowdim
def _get_rowkey(self, rowkey, colkey, depth=0):
if pdextra.is_nested_list_like(rowkey) and depth == 0:
rowkey = rowkey.values if isinstance(rowkey, pd.Series) else rowkey
if len(rowkey) != len(colkey):
raise ValueError(
"Nested arrays indexer must have same (outer) "
"length than the number of selected columns."
)
indexer = []
for i, c in enumerate(colkey):
# recurse to get the row indexer from inner element
indexer += self._get_rowkey(rowkey[i], [c], depth=depth + 1)
rowkey = indexer
# row-alignable: pd.Series(), align rows to every series in colkey (columns)
elif isinstance(rowkey, pd.Series):
if self._usebool and pdextra.is_bool_indexer(rowkey):
rowkey = [
self._data.at[c].index.intersection(rowkey[rowkey].index)
for c in colkey
]
else:
rowkey = [
self._data.at[c].index.intersection(rowkey.index) for c in colkey
]
# handle gracefully: scalar, transform to row-slice
elif pdextra.is_hashable(rowkey):
rowkey = [slice(rowkey, rowkey)] * len(colkey)
# handle gracefully: list-like, filter only existing rows
# NOTE: dios.aloc[series.index] is processed here
elif pdextra.is_list_like(rowkey) and not pdextra.is_bool_indexer(rowkey):
rowkey = [self._data.at[c].index.intersection(rowkey) for c in colkey]
# not alignable
# the rowkey is processed by .loc someway in
# the calling function - (eg. slice(..), boolean list-like, etc.)
else:
rowkey = [rowkey] * len(colkey)
return rowkey
# #############################################################################
class _AtIndexer(_Indexer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _check_key(self, key):
if not (
isinstance(key, tuple)
and len(key) == 2
and pdextra.is_hashable(key[0])
and pdextra.is_hashable(key[1])
):
raise KeyError(
f"{key}. `.at` takes exactly one scalar row-key "
"and one scalar column-key"
)
def __getitem__(self, key):
self._check_key(key)
return self._data.at[key[1]].at[key[0]]
def __setitem__(self, key, value):
self._check_key(key)
if _is_dios_like(value) or pdextra.is_nested_list_like(value):
raise TypeError(
".at[] cannot be used to set multi-dimensional values, use .aloc[] instead."
)
self._data.at[key[1]].at[key[0]] = value
# #############################################################################
class _iAtIndexer(_Indexer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def _check_key(self, key):
if not (
isinstance(key, tuple)
and len(key) == 2
and pdextra.is_integer(key[0])
and pdextra.is_integer(key[1])
):
raise KeyError(
f"{key} `.iat` takes exactly one integer positional "
f"row-key and one integer positional scalar column-key"
)
def __getitem__(self, key):
self._check_key(key)
return self._data.iat[key[1]].iat[key[0]]
def __setitem__(self, key, value):
self._check_key(key)
if _is_dios_like(value) or pdextra.is_nested_list_like(value):
raise TypeError(
".iat[] cannot be used to set multi-dimensional values, use .aloc[] instead."
)
self._data.iat[key[1]].iat[key[0]] = value
import pandas as pd
import warnings
class ItypeWarning(RuntimeWarning):
pass
class ItypeCastWarning(ItypeWarning):
pass
class ItypeCastError(RuntimeError):
pass
class __Itype:
def __init__(self):
raise RuntimeError("a Itype class does not allow instances of itself.")
class DtItype(__Itype):
name = "datetime"
unique = True
subtypes = (pd.DatetimeIndex,)
min_pdindex = pd.DatetimeIndex([])
class IntItype(__Itype):
name = "integer"
unique = True
subtypes = (pd.RangeIndex, pd.Int64Index, pd.UInt64Index, int)
min_pdindex = pd.Int64Index([])
class FloatItype(__Itype):
name = "float"
subtypes = (pd.Float64Index, float)
unique = True
min_pdindex = pd.Float64Index([])
# class MultiItype(__Itype):
# name = "multi"
# subtypes = (pd.MultiIndex, )
# unique = ??
class NumItype(__Itype):
name = "numeric"
_subitypes = (IntItype, FloatItype)
subtypes = _subitypes + IntItype.subtypes + FloatItype.subtypes
unique = False
min_pdindex = pd.Float64Index([])
class ObjItype(__Itype):
name = "object"
unique = False
_subitypes = (DtItype, IntItype, FloatItype, NumItype, str)
_otheritypes = (
pd.CategoricalIndex,
pd.IntervalIndex,
pd.PeriodIndex,
pd.TimedeltaIndex,
pd.Index,
)
subtypes = _subitypes + _otheritypes + DtItype.subtypes + NumItype.subtypes
min_pdindex = pd.Index([])
def is_itype(obj, itype):
"""Check if obj is a instance of the given itype or its str-alias was given"""
# todo: iter through itype as it could be a tuple, if called like ``is_itype(o, (t1,t2))``
# user gave a Itype, like ``DtItype``
if type(obj) == type and issubclass(obj, itype):
return True
# user gave a string, like 'datetime'
if isinstance(obj, str) and obj == itype.name:
return True
return False
def is_itype_subtype(obj, itype):
"""Check if obj is a subclass or a instance of a subclass of the given itype"""
# user gave a subtype, like ``pd.DatetimeIndex``
if type(obj) == type and issubclass(obj, itype.subtypes):
return True
# user gave a instance of a subtype, like ``pd.Series(..).index``
if isinstance(obj, itype.subtypes):
return True
return False
def is_itype_like(obj, itype):
"""Check if obj is a subclass or a instance of the given itype or any of its subtypes"""
return is_itype(obj, itype) or is_itype_subtype(obj, itype)
def get_itype(obj):
"""
Return the Itype corresponding to the given object.

Parameters
----------
obj : {itype string, Itype, pandas.Index class, instance of pd.Index}
    The object to determine the fitting itype for.
Examples
--------
>>> get_itype("datetime")
<class 'dios.lib.DtItype'>
>>> s = pd.Series(index=pd.to_datetime([]))
>>> get_itype(s.index)
<class 'dios.lib.DtItype'>
>>> get_itype(DtItype)
<class 'dios.lib.DtItype'>
>>> get_itype(pd.DatetimeIndex)
<class 'dios.lib.DtItype'>
"""
if type(obj) == type and issubclass(obj, __Itype):
return obj
# check if it is the actual type, not a subtype
types = [DtItype, IntItype, FloatItype, NumItype, ObjItype]
for t in types:
if is_itype(obj, t):
return t
for t in types:
if is_itype_subtype(obj, t):
return t
raise ValueError(
f"{obj} is not a itype, nor any known subtype of a itype, nor a itype string alias"
)
def _itype_eq(a, b):
return is_itype(a, b)
def _itype_lt(a, b):
return is_itype_subtype(a, b)
def _itype_le(a, b):
return is_itype_like(a, b)
def _find_least_common_itype(iterable_of_series):
itypes = [NumItype, FloatItype, IntItype, DtItype]
tlist = [get_itype(s.index) for s in iterable_of_series]
found = ObjItype
if tlist:
for itype in itypes:
for t in tlist:
if _itype_le(t, itype):
continue
break
else:
found = itype
return found
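# A sketch of what _find_least_common_itype returns (illustrative, not exhaustive):
# for an int-indexed and a float-indexed series the least common itype is NumItype;
# as soon as a datetime-indexed series is added, only ObjItype fits all of them.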
################################################################################
# Casting
class CastPolicy:
force = "force"
save = "save"
never = "never"
_CAST_POLICIES = [CastPolicy.force, CastPolicy.save, CastPolicy.never]
def cast_to_itype(series, itype, policy="save", err="raise", inplace=False):
"""Cast a series (more explicit the type of the index) to fit the itype of a dios.
Return the casted series if successful, None otherwise.
Note:
This is very basic number-casting, so in most cases, information from
the old index will be lost after the cast.
"""
if policy not in _CAST_POLICIES:
raise ValueError(f"policy={policy}")
if err not in ["raise", "ignore"]:
raise ValueError(f"err={err}")
if not inplace:
series = series.copy()
itype = get_itype(itype)
if series.empty:
return pd.Series(index=itype.min_pdindex, dtype=series.dtype)
series.itype = get_itype(series.index)
# up-casting isn't necessary because a dios with a higher
# itype can always take lower itypes.
# series can have dt/int/float/mixed
# dt -> dt -> mixed
# int -> int -> num -> mixed
# float -> float -> num -> mixed
# mixed -> mixed
if _itype_le(series.itype, itype): # a <= b
return series
e = f"A series index of type '{type(series.index)}' cannot be casted to Itype '{itype.name}'"
# cast any -> dt always fail.
if is_itype(itype, DtItype):
pass
else:
e += f", as forbidden by the cast-policy '{policy}'."
if policy == CastPolicy.never:
pass
elif policy == CastPolicy.force:
# cast any (dt/float/mixed) -> int
if is_itype(itype, IntItype): # a == b
series.index = pd.RangeIndex(len(series))
return series
# cast any (dt/int/mixed) -> float
# cast any (dt/float/mixed) -> num
if is_itype(itype, FloatItype) or is_itype(itype, NumItype): # a == b or a == c
series.index = pd.Float64Index(range(len(series)))
return series
elif policy == CastPolicy.save:
# cast int -> float
if is_itype(itype, IntItype) and is_itype(
series.itype, FloatItype
): # a == b and c == d
series.index = series.index.astype(float)
return series
# cast float -> int, maybe if unique
if is_itype(itype, FloatItype) and is_itype(
series.itype, IntItype
): # a == b and c == d
series.index = series.index.astype(int)
if series.index.is_unique:
return series
e = (
f"The cast with policy {policy} from series index type '{type(series.index)}' to "
f"itype {itype.name} resulted in a non-unique index."
)
# cast mixed -> int/float always fail
if err == "raise":
raise ItypeCastError(e)
else:
return None
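# Illustrative sketch of the policies above: with policy=CastPolicy.force a
# datetime-indexed series can be cast to IntItype; its index is simply replaced
# by a RangeIndex, so (as the docstring notes) the original index information is lost.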
################################################################################
# OPTIONS
class OptsFields:
"""storage class for the keys in `dios_options`
Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
See Also
--------
Opts: values for the options dict
dios_options: options dict for module
"""
mixed_itype_warn_policy = "mixed_itype_policy"
disp_max_rows = "disp_max_rows "
disp_min_rows = "disp_min_rows "
disp_max_cols = "disp_max_vars"
dios_repr = "dios_repr"
class Opts:
"""storage class for string values for `dios_options`
Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
See Also
--------
OptsFields: keys for the options dict
dios_options: options dict for module
"""
itype_warn = "warn"
itype_err = "err"
itype_ignore = "ignore"
repr_aligned = "aligned"
repr_indexed = "indexed"
class __DocDummy(dict):
pass
dios_options = __DocDummy()
dios_options.update(
**{
OptsFields.disp_max_rows: 60,
OptsFields.disp_min_rows: 10,
OptsFields.disp_max_cols: 10,
OptsFields.mixed_itype_warn_policy: Opts.itype_warn,
OptsFields.dios_repr: Opts.repr_indexed,
}
)
opdoc = f"""Options dictionary for module `dios`.
Use like so: ``dios_options[OptsFields.X] = Opts.Y``.
**Items**:
* {OptsFields.dios_repr}: {{'indexed', 'aligned'}} default: 'indexed'
dios default representation:
* `indexed`: show every column with its own index
* `aligned`: transform to a pandas.DataFrame with the indexes merged together.
* {OptsFields.disp_max_rows} : int
Maximum number of rows before the representation of a DictOfSeries
is truncated to `disp_min_rows`.
* {OptsFields.disp_min_rows} : int
Minimum number of rows to display if `disp_max_rows` is exceeded.
* {OptsFields.disp_max_cols} : int
Maximum number of columns before the representation is truncated.
* {OptsFields.mixed_itype_warn_policy} : {{'warn', 'err', 'ignore'}}
How to inform the user about a mixed Itype.
See Also
--------
OptsFields: keys for the options dict
Opts: values for the options dict
"""
dios_options.__doc__ = opdoc
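# Example usage (a sketch): the options are plain dictionary entries, e.g.
#   dios_options[OptsFields.disp_max_rows] = 120
#   dios_options[OptsFields.dios_repr] = Opts.repr_aligned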
def _throw_MixedItype_err_or_warn(itype):
msg = (
f"Using '{itype.name}' as itype is not recommend. "
f"As soon as series with different index types are inserted,\n"
f"indexing and slicing will almost always fail. "
)
if dios_options[OptsFields.mixed_itype_warn_policy] in [
"ignore",
Opts.itype_ignore,
]:
pass
elif dios_options[OptsFields.mixed_itype_warn_policy] in [
"error",
"err",
Opts.itype_err,
]:
msg += "Suppress this error by specifying an unitary 'itype' or giving an 'index' to DictOfSeries."
raise ItypeCastError(msg)
else:
msg += "Silence this warning by specifying an unitary 'itype' or giving an 'index' to DictOfSeries."
warnings.warn(msg, ItypeWarning)
return
def example_DictOfSeries():
"""Return a example dios.
Returns
-------
DictOfSeries: an example
Examples
--------
>>> from dios import example_DictOfSeries
>>> di = example_DictOfSeries()
>>> di
a | b | c | d |
===== | ====== | ====== | ===== |
0 0 | 2 5 | 4 7 | 6 0 |
1 7 | 3 6 | 5 17 | 7 1 |
2 14 | 4 7 | 6 27 | 8 2 |
3 21 | 5 8 | 7 37 | 9 3 |
4 28 | 6 9 | 8 47 | 10 4 |
5 35 | 7 10 | 9 57 | 11 5 |
6 42 | 8 11 | 10 67 | 12 6 |
7 49 | 9 12 | 11 77 | 13 7 |
8 56 | 10 13 | 12 87 | 14 8 |
9 63 | 11 14 | 13 97 | 15 9 |
"""
from dios import DictOfSeries
a = pd.Series(range(0, 70, 7))
b = pd.Series(range(5, 15, 1))
c = pd.Series(range(7, 107, 10))
d = pd.Series(range(0, 10, 1))
for i, s in enumerate([a, b, c, d]):
s.index += i * 2
di = DictOfSeries(dict(a=a, b=b, c=c, d=d))
return di.copy()
# do not import dios-stuff here
import operator as op
_OP1_MAP = {
op.inv: "~",
op.neg: "-",
op.abs: "abs()",
}
_OP2_COMP_MAP = {
op.eq: "==",
op.ne: "!=",
op.le: "<=",
op.ge: ">=",
op.gt: ">",
op.lt: "<",
}
_OP2_BOOL_MAP = {
op.and_: "&",
op.or_: "|",
op.xor: "^",
}
_OP2_ARITH_MAP = {
op.add: "+",
op.sub: "-",
op.mul: "*",
op.pow: "**",
}
_OP2_DIV_MAP = {
op.mod: "%",
op.truediv: "/",
op.floordiv: "//",
}
OP_MAP = _OP2_COMP_MAP.copy()
OP_MAP.update(_OP2_BOOL_MAP)
OP_MAP.update(_OP2_ARITH_MAP)
OP_MAP.update(_OP2_DIV_MAP)
OP_MAP.update(_OP1_MAP)
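# Example (a sketch): OP_MAP maps an operator function to its textual symbol,
# e.g. OP_MAP[op.add] == "+" and OP_MAP[op.inv] == "~", which lets callers
# render an operator in its usual notation.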
#!/usr/bin/env python
__author__ = "Bert Palm"
__email__ = "bert.palm@ufz.de"
__copyright__ = "Copyright 2020, Helmholtz-Zentrum für Umweltforschung GmbH - UFZ"
from pandas.core.common import (
is_bool_indexer,
is_null_slice,
)
from pandas.core.dtypes.common import (
is_nested_list_like,
)
from pandas.api.types import (
is_list_like,
is_hashable,
is_integer,
is_dict_like,
is_scalar,
# Note: unlike the example in the pandas docs suggests, is_iterator
# returns False for lists:
# >>> is_iterator([1, 2, 3])
# False
is_iterator,
)
# ignore everything
_api
_build
_static
*.automodsumm
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile clean
clean:
rm -rf _build _static _api
rm -f *.automodsumm
mkdir _static
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath(".."))
# -- Project information -----------------------------------------------------
project = "dios"
copyright = "2020, Bert Palm"
author = "Bert Palm"
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
# "sphinx.ext.doctest",
# "sphinx.ext.extlinks",
# "sphinx.ext.todo",
# "sphinx.ext.intersphinx",
# "sphinx.ext.coverage",
# "sphinx.ext.mathjax",
# "sphinx.ext.ifconfig",
"sphinx.ext.autosectionlabel",
# link source code
"sphinx.ext.viewcode",
# add support for NumPy style docstrings
"sphinx.ext.napoleon",
# doc the whole module
"sphinx_automodapi.automodapi",
"sphinxcontrib.fulltoc",
# markdown sources support
"recommonmark",
"sphinx_markdown_tables",
]
numpydoc_show_class_members = False
automodsumm_inherited_members = True
automodapi_inheritance_diagram = False
automodapi_toctreedirnm = "_api"
# automodsumm_writereprocessed = True
autosectionlabel_prefix_document = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
source_suffix = [".rst", ".md"]
# -- Options for HTML output -------------------------------------------------
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = "nature"
# use pandas theme
# html_theme = "pydata_sphinx_theme"
# html_theme_options = {
# }
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
API
====
.. automodapi:: dios
:include-all-objects:
:no-heading: