merged develop

e4b4ba31 · Bert Palm · 2540ea87 · 342dcea4 · e4b4ba31 · e4b4ba31
Commit e4b4ba31 authored 3 years ago by Bert Palm 🎇
--- a/dios/docs/doc_cookbook.md
+++ b/dios/docs/doc_cookbook.md
+Cookbook
+=========
+
+Recipes
+-------
+- select common rows from all columns
+- align columns to another column
+- align columns to a given index
+- align dios with dios
+- get/set values by condition
+- apply a value to multiple columns
+- [Broadcast array-likes to multiple columns](#broadcast-array-likes-to-multiple-columns)
+- apply an array-like value to multiple columns
+- nan-policy - mask vs. drop values when NaNs are inserted (mv to Readme ??)
+- itype - when to use, pitfalls and best practices
+- changing the index of series' in dios (one, some, all)
+- changing the dtype of series' in dios (one, some, all)
+- changing properties of series' in dios (one, some, all)
+
+**T_O_D_O**
+
+
+Broadcast array-likes to multiple columns
+-----------------------------------------
+**T_O_D_O**
--- a/dios/docs/doc_indexing.md
+++ b/dios/docs/doc_indexing.md
--- a/dios/docs/doc_itype.md
+++ b/dios/docs/doc_itype.md
+Itype
+=====
+
+A DictOfSeries holds multiple series, and each series can have a different index length
+and index type. Differing index lengths are either resolved by some aligning magic, or the operation simply fails if
+aligning makes no sense (e.g. assigning the very same list to series of different lengths; see `.aloc`).
+
+A bigger challenge is the type of the index. If one series has an alphabetical index, and another one
+a numeric index, selecting along columns can fail in every scenario. To keep track of the
+types of index or to prohibit the insertion of a *non-fitting* index type,
+we introduce the `itype`. It can be set on creation of a Dios and also changed during usage.
+On change of the itype, all indexes of all series in the dios are cast to a new fitting type,
+if possible. Different cast mechanisms are available.
+
+If an itype prohibits certain types of indexes and a series with a non-fitting index type is inserted,
+an implicit type cast is done (with or without a warning) or an error is raised. The warning/error policy
+can be adjusted via global options.
+
--- a/dios/docs/genindex.rst
+++ b/dios/docs/genindex.rst
+
+# dummy file to be able to link to index
+
+Index
+=====
\ No newline at end of file
--- a/dios/docs/index.rst
+++ b/dios/docs/index.rst
+.. dios documentation master file, created by
+   sphinx-quickstart on Sun Apr 19 02:36:37 2020.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Dios Docs
+=========
+
+.. currentmodule:: dios
+
+The whole package :mod:`dios` is mainly a container for
+the class :class:`dios.DictOfSeries`. See
+
+.. toctree::
+
+   dios.DictOfSeries <_api/dios.DictOfSeries>
+
+.. toctree::
+   :hidden:
+
+   Repository <https://git.ufz.de/rdm/dios>
+   example DictOfSeries <_api/dios.example_DictOfSeries>
+
+
+Most of the magic happens in getting and setting elements.
+To select any combination from columns and rows,
+read the documentation about indexing:
+
+.. toctree::
+
+   doc_indexing
+
+.. toctree::
+
+   doc_cookbook
+
+For the idea behind the Itype concept and its usage read:
+
+.. toctree::
+
+   doc_itype
+
+For implemented methods and module functions,
+respectively the full module api, see:
+
+.. toctree::
+   :maxdepth: 2
+
+   dios_api
+
+or browse the index:
+
+.. toctree::
+
+   genindex
+
--- a/dios/docs/make.bat
+++ b/dios/docs/make.bat
+@ECHO OFF
+
+REM Run from the directory this script lives in; the matching popd is at :end.
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+REM Fall back to the plain 'sphinx-build' command when SPHINXBUILD is not set.
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+REM Without a first argument, just show the Sphinx help.
+if "%1" == "" goto help
+
+REM Probe the command with all output discarded; errorlevel 9009 or higher is
+REM treated as "command not found" and reported with install hints below.
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+REM Forward the first argument to Sphinx via -M, plus any extra options
+REM supplied through SPHINXOPTS and O.
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/dios/docs/requirements_sphinx.txt
+++ b/dios/docs/requirements_sphinx.txt
+alabaster==0.7.12
+Babel==2.8.0
+certifi==2020.6.20
+chardet==3.0.4
+commonmark==0.9.1
+docutils==0.16
+idna==2.10
+imagesize==1.2.0
+importlib-metadata==1.7.0
+Jinja2==2.11.2
+Markdown==3.2.2
+MarkupSafe==1.1.1
+numpy==1.19.1
+packaging==20.4
+pandas==1.1.1
+Pygments==2.6.1
+pyparsing==2.4.7
+python-dateutil==2.8.1
+pytz==2020.1
+recommonmark==0.6.0
+requests==2.24.0
+six==1.15.0
+snowballstemmer==2.0.0
+Sphinx==3.2.1
+sphinx-automodapi==0.12
+sphinx-markdown-tables==0.0.15
+sphinxcontrib-applehelp==1.0.2
+sphinxcontrib-devhelp==1.0.2
+sphinxcontrib-fulltoc==1.2.0
+sphinxcontrib-htmlhelp==1.0.3
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.3
+sphinxcontrib-serializinghtml==1.1.4
+urllib3==1.25.10
+zipp==3.1.0
--- a/dios/profiling/__init__.py
+++ b/dios/profiling/__init__.py
+from .generate_testsets import *
+from .performance import find_index_range, gen_random_timestamps
--- a/dios/profiling/generate_testsets.py
+++ b/dios/profiling/generate_testsets.py
--- a/dios/profiling/memory.py
+++ b/dios/profiling/memory.py
+import gc
+from .generate_testsets import get_random_df_and_dios
+
+
+def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)):
+    if shifted:
+        idxsz = 8 * rows * cols
+        # additional nans are inserted exactly as many as variables
+        rowsz = rows * cols * dtypesz
+    else:
+        idxsz = 8 * rows
+        rowsz = rows * dtypesz
+
+    return idxsz + rowsz * cols
+
+
+def bytes2hread(bytes):
+    i = 0
+    units = ["B", "kB", "MB", "GB", "TB"]
+    while bytes > 1000:
+        bytes /= 1024
+        i += 1
+        if i == 4:
+            break
+    return bytes, units[i]
+
+
+def rows_by_time(nsec, mdays):
+    """calc the number of values for one value every n seconds in m days
+    :param nsec: n seconds a value occur
+    :param mdays: this many days of data
+    :return: rows thats needed
+    """
+    return int((60 / nsec) * 60 * 24 * mdays)
+
+
+if __name__ == "__main__":
+
+    # dios      - linear in rows and colums, same size for r=10,c=100 or r=100,c=10
+    do_real_check = True
+    cols = 10
+    rows = 100000
+    # rows = rows_by_time(nsec=600, mdays=365*2)
+
+    mem = calc_mem(rows, cols, shifted=False)
+    memsh = calc_mem(rows, cols, shifted=True)
+
+    df, dios = get_random_df_and_dios(rows, cols, disalign=False, randstart=True)
+    dios_mem = dios.memory_usage()
+    print(f"dios:\n-----------")
+    print("mem: ", *bytes2hread(dios_mem))
+    print("entries:", sum([len(dios[e]) for e in dios]))
+    print()
+
+    ratio = (1 / (memsh - mem)) * dios_mem
+
+    mem = bytes2hread(mem)
+    memsh = bytes2hread(memsh)
+
+    print("df - best case\n---------")
+    print("mem: ", *mem)
+    print("entries:", rows)
+    print()
+    print("df - worst case\n---------")
+    print("mem :", *memsh)
+    print("entries:", rows * cols)
+
+    print()
+    print(f"dfbest, dios, dfworst: 0%, {round(ratio, 4)*100}%, 100% ")
+
+    if not do_real_check:
+        exit(0)
+
+    proveMeRight = False
+
+    if proveMeRight:
+        # best case
+        print()
+        print("best case proove")
+        dfb, _ = get_random_df_and_dios(rows, cols, disalign=False, randstart=False)
+        dfb.info(memory_usage="deep", verbose=False)
+
+    print()
+    print("rand start, same freq")
+    df.info(memory_usage="deep", verbose=False)
+    print("entries:", sum([len(df[e]) for e in df]))
+
+    print()
+    print("rand start, rand freq")
+    df, _ = get_random_df_and_dios(rows, cols, disalign="random", randstart=True)
+    df.info(memory_usage="deep", verbose=False)
+    print("entries:", sum([len(df[e]) for e in df]))
+
+    if proveMeRight:
+        # worst case
+        print()
+        print("worst case proove")
+        df, _ = get_random_df_and_dios(rows, cols, disalign=True, randstart=False)
+        df.info(memory_usage="deep", verbose=False)
+
+    gc.collect()
--- a/dios/profiling/performance.py
+++ b/dios/profiling/performance.py
--- a/dios/profiling/testsets/.gitignore
+++ b/dios/profiling/testsets/.gitignore
--- a/dios/requirements.txt
+++ b/dios/requirements.txt
--- a/dios/setup.py
+++ b/dios/setup.py
--- a/dios/test/__init__.py
+++ b/dios/test/__init__.py
+from .test_setup import *
--- a/dios/test/run_dios.py
+++ b/dios/test/run_dios.py
--- a/dios/test/test__ops__.py
+++ b/dios/test/test__ops__.py
--- a/dios/test/test__setget__.py
+++ b/dios/test/test__setget__.py
--- a/dios/test/test__setget__aloc.py
+++ b/dios/test/test__setget__aloc.py
--- a/dios/test/test__setget__iloc.py
+++ b/dios/test/test__setget__iloc.py