From 200b7aec2503711619cd670d09ac214a88cb0ac8 Mon Sep 17 00:00:00 2001
From: Bert Palm <bert.palm@ufz.de>
Date: Thu, 13 Feb 2020 16:41:09 +0100
Subject: [PATCH] fixed some imports

---
 dios/__init__.py               |   2 +
 dios/lib.py                    |   1 +
 profiling/__init__.py          |   2 +
 profiling/generate_testsets.py | 103 +++++++++++++++++
 profiling/memory.py            | 101 ++++++++++++++++
 profiling/performance.py       | 204 +++++++++++++++++++++++++++++++++
 profiling/testsets/.gitignore  |   6 +
 tests/tests.py                 |   3 +
 8 files changed, 422 insertions(+)
 create mode 100644 profiling/__init__.py
 create mode 100644 profiling/generate_testsets.py
 create mode 100644 profiling/memory.py
 create mode 100644 profiling/performance.py
 create mode 100644 profiling/testsets/.gitignore

diff --git a/dios/__init__.py b/dios/__init__.py
index 2866f42..34cb18b 100644
--- a/dios/__init__.py
+++ b/dios/__init__.py
@@ -1,3 +1,5 @@
 
+from dios.lib import *
+from dios.options import *
 from dios.dios import *
 
diff --git a/dios/lib.py b/dios/lib.py
index a625ef7..38f5662 100644
--- a/dios/lib.py
+++ b/dios/lib.py
@@ -1,5 +1,6 @@
 from dios.itypes import *
 from dios.options import *
+
 import pandas as pd
 import warnings
 
diff --git a/profiling/__init__.py b/profiling/__init__.py
new file mode 100644
index 0000000..139597f
--- /dev/null
+++ b/profiling/__init__.py
@@ -0,0 +1,2 @@
+
+
diff --git a/profiling/generate_testsets.py b/profiling/generate_testsets.py
new file mode 100644
index 0000000..df2d97e
--- /dev/null
+++ b/profiling/generate_testsets.py
@@ -0,0 +1,103 @@
+import time
+
+import pandas as pd
+import numpy as np
+import datetime as dt
+from dios import dios
+import pickle
+import os
+
+var_prefix = 'var'
+
+
+def _gen_testset(rowsz, colsz, freq='1min', disalign=True, randstart=True):
+    df = pd.DataFrame()
+    dos = dios.DictOfSeries()
+    start = dt.datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
+    times = pd.date_range(periods=rowsz, start=start, freq=freq)
+
+    frequ = freq.strip('0123456789')
+    freqv = int(freq[:-len(frequ)])
+
+    for i in range(colsz):
+
+        if randstart:
+            # generate a random start point for each series
+            r = str(np.random.randint(int(rowsz * 0.05), int(rowsz * 0.6) + 2)) + frequ
+            st = start + pd.Timedelta(r)
+            times = pd.date_range(periods=rowsz, start=st, freq=freq)
+
+        if disalign:
+            if disalign == 'random':
+                r = np.random.randint(1, i + 2)
+            else:
+                # total disalign
+                r = i
+            times += pd.Timedelta(f'{r}ns')
+
+        d = np.random.randint(1, 9, rowsz)
+        v = f'var{i}'
+        tmp = pd.DataFrame(index=times, data=d, columns=[v])
+        df = pd.merge(df, tmp, left_index=True, right_index=True, how='outer')
+        dos[v] = tmp.squeeze().copy()
+
+    return df, dos
+
+
+def _gen_df(rowsz, colsz, freq='1min', disalign=True, randstart=True):
+    df, _ = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart)
+    return df
+
+
+def gen_dos(rowsz, colsz, freq='1min', disalign=True, randstart=True):
+    _, dos = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart)
+    return dos
+
+
+def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir='testsets', noresult=False):
+    fname = f'set_f{freq}_d{disalign}_r{randstart}_dim{rows}x{cols}.pkl'
+    fpath = os.path.join(storagedir, fname)
+    try:
+        with open(fpath, 'rb') as fh:
+            if noresult:
+                return
+            tup = pickle.load(fh)
+    except (pickle.UnpicklingError, FileNotFoundError):
+        df_, dos_ = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart)
+        df_ = df_.sort_index(axis=0, level=0)
+        a_ = df_.copy().stack(dropna=False).sort_index(axis=0, level=0).copy()
+        b_ = df_.copy().unstack().sort_index(axis=0, level=0).copy()
+        tup = df_, a_, b_, dos_
+        with open(fpath, 'wb') as fh:
+            pickle.dump(tup, fh)
+
+    if noresult:
+        return
+
+    return tup
+
+
+def gen_all(rrange, crange):
+    for r in rrange:
+        for c in crange:
+            print(r, ' x ', c)
+            t0 = time.time()
+            get_testset(r, c, noresult=True)
+            t1 = time.time()
+            print(t1-t0)
+
+
+if __name__ == '__main__':
+    # import time
+    #
+    # t0 = time.time()
+    # for i in range(7):
+    #     get_testset(10**i, 10)
+    # t1 = time.time()
+    # print(t1-t0)
+
+    rr = [10**r for r in range(1,6)]
+    c = range(10, 60, 10)
+    gen_all(rr, c)
+
+
diff --git a/profiling/memory.py b/profiling/memory.py
new file mode 100644
index 0000000..81f7f00
--- /dev/null
+++ b/profiling/memory.py
@@ -0,0 +1,101 @@
+import gc
+from profiling import get_testset, _gen_testset
+
+
+def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)):
+    if shifted:
+        idxsz = 8 * rows * cols
+        # exactly as many additional NaNs are inserted as there are variables
+        rowsz = rows * cols * dtypesz
+    else:
+        idxsz = 8 * rows
+        rowsz = rows * dtypesz
+
+    return idxsz + rowsz * cols
+
+
+def bytes2hread(bytes):
+    i = 0
+    units = ['B', 'kB', 'MB', 'GB', 'TB']
+    while (bytes > 1000):
+        bytes /= 1024
+        i += 1
+        if i == 4:
+            break
+    return bytes, units[i]
+
+
+def rows_by_time(nsec, mdays):
+    """ calc the number of values for one value every n seconds in m days
+    :param nsec: a value occurs every n seconds
+    :param mdays: this many days of data
+    :return: number of rows that is needed
+    """
+    return int((60 / nsec) * 60 * 24 * mdays)
+
+
+if __name__ == '__main__':
+
+    # dos      - linear in rows and columns, same size for r=10,c=100 or r=100,c=10
+    do_real_check = True
+    cols = 10
+    rows = 100000
+    # rows = rows_by_time(nsec=600, mdays=365*2)
+
+    mem = calc_mem(rows, cols, shifted=False)
+    memsh = calc_mem(rows, cols, shifted=True)
+
+    df, _, _, dos = get_testset(rows, cols, disalign=False, randstart=True)
+    dos_mem = dos.memory_usage()
+    print(f"dos:\n-----------")
+    print("mem: ", *bytes2hread(dos_mem))
+    print("entries:", sum([len(dos[e]) for e in dos]))
+    print()
+
+    ratio = (1 / (memsh - mem) ) * dos_mem
+
+    mem = bytes2hread(mem)
+    memsh = bytes2hread(memsh)
+
+    print('df - best case\n---------')
+    print("mem: ", *mem)
+    print("entries:", rows)
+    print()
+    print('df - worst case\n---------')
+    print("mem :", *memsh)
+    print("entries:", rows * cols)
+
+    print()
+    print(f"dfbest, dos, dfworst: 0%, {round(ratio, 4)*100}%, 100% ")
+
+    if not do_real_check:
+        exit(0)
+
+    proveMeRight = False
+
+    if proveMeRight:
+        # best case
+        print()
+        print('best case proove')
+        dfb, _ = _gen_testset(rows, cols, disalign=False, randstart=False)
+        dfb.info(memory_usage='deep', verbose=False)
+
+    print()
+    print('rand start, same freq')
+    df.info(memory_usage='deep', verbose=False)
+    print("entries:", sum([len(df[e]) for e in df]))
+
+    print()
+    print('rand start, rand freq')
+    df, _ = get_testset(rows, cols, disalign='random', randstart=True)
+    df.info(memory_usage='deep', verbose=False)
+    print("entries:", sum([len(df[e]) for e in df]))
+
+    if proveMeRight:
+        # worst case
+        print()
+        print('worst case proove')
+        df, _ = _gen_testset(rows, cols, disalign=True, randstart=False)
+        df.info(memory_usage='deep', verbose=False)
+
+    gc.collect()
diff --git a/profiling/performance.py b/profiling/performance.py
new file mode 100644
index 0000000..eb5c95a
--- /dev/null
+++ b/profiling/performance.py
@@ -0,0 +1,204 @@
+import pandas as pd
+import numpy as np
+import time
+from profiling import get_testset, var_prefix
+
+profile_assignment = False
+
+idx = pd.IndexSlice
+rows = 0
+
+fir = ['var', 'ts', 'ass']
+sec = ['df', 'a', 'b', 'dios']
+timingsdf = pd.DataFrame(columns=pd.MultiIndex.from_product([fir, sec]))
+
+
+def df_timmings(df, t0, t1, v1, v2):
+    _t0 = time.time()
+    a = df.loc[t0:t1, :]
+    _t1 = time.time()
+    b = df.loc[:, v1]
+    _t2 = time.time()
+    if profile_assignment:
+        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
+    _t3 = time.time()
+
+    timingsdf.at[rows, ('ts', 'df')] += _t1 - _t0
+    timingsdf.at[rows, ('var', 'df')] += _t2 - _t1
+    timingsdf.at[rows, ('ass', 'df')] += _t3 - _t2
+    return a, b, df
+
+
+def a_timings(df, t0, t1, v1, v2):
+    _t0 = time.time()
+    a = df.loc[t0:t1, :]
+    _t1 = time.time()
+    b = df.loc[:, v1]
+    _t2 = time.time()
+    if profile_assignment:
+        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
+    _t3 = time.time()
+
+    timingsdf.at[rows, ('ts', 'a')] += _t1 - _t0
+    timingsdf.at[rows, ('var', 'a')] += _t2 - _t1
+    timingsdf.at[rows, ('ass', 'a')] += _t3 - _t2
+    return a, b, df
+
+
+def b_timings(df, t0, t1, v1, v2):
+    _t0 = time.time()
+    a = df.loc[:, t0:t1]
+    _t1 = time.time()
+    b = df.loc[v1, :]
+    _t2 = time.time()
+    if profile_assignment:
+        df.loc[v1, t0:t1] = df.loc[v1, t0:t1] * 1111
+    _t3 = time.time()
+
+    timingsdf.at[rows, ('ts', 'b')] += _t1 - _t0
+    timingsdf.at[rows, ('var', 'b')] += _t2 - _t1
+    timingsdf.at[rows, ('ass', 'b')] += _t3 - _t2
+    return a, b, df
+
+
+def dos_timings(dos, t0, t1, v1, v2):
+    _t0 = time.time()
+    a = dos[t0:t1, :]
+    _t1 = time.time()
+    b = dos[:, v1]
+    _t2 = time.time()
+    if profile_assignment:
+        dos[t0:t1, v1] = dos[t0:t1, v1] * 1111
+    _t3 = time.time()
+
+    timingsdf.at[rows, ('ts', 'dios')] += _t1 - _t0
+    timingsdf.at[rows, ('var', 'dios')] += _t2 - _t1
+    timingsdf.at[rows, ('ass', 'dios')] += _t3 - _t2
+    return a, b, dos
+
+
+def gen_random_timestamps(m, M):
+    r = (M - m) * (np.random.randint(10,90) + np.random.random()) * 0.01
+    a , b = m + r, M - r
+    return min(a,b), max(a,b)
+
+
+def find_index_range(obj):
+    min_ = None
+    max_ = None
+    for r in obj:
+        m = obj[r].index.min()
+        M = obj[r].index.max()
+        try:
+            min_ = min(min_, m)
+            max_ = max(max_, M)
+        except TypeError:
+            min_ = m
+            max_ = M
+    return min_, max_
+
+
+if __name__ == '__main__':
+    import matplotlib.pyplot as plt
+
+    # do not touch
+    rows = 1
+
+    # max increase of rows
+    # 1 = 10 # 2 = 100 # .... # 5 = 100'000
+    iterations = 4
+    runs = 10
+    cols = 10
+
+    profile_assignment = True
+
+    # which to calc and plot
+    use_df = True
+    use_a = False
+    use_b = False
+    use_dos = True
+
+    # plot options
+    normalize_to_df = False
+    plot_xlog = True
+    plot_ylog = False
+
+    # ########################
+
+    v1 = 'var1'
+    v2 = 'var2'
+    for i in range(iterations):
+        rows *= 10
+
+        timingsdf.loc[rows] = (0,) * len(timingsdf.columns)
+
+        df, a, b, dos = get_testset(rows, cols)
+        t0, t4 = find_index_range(df)
+
+        if use_df or normalize_to_df:
+            for r in range(runs):
+                t1, t2 = gen_random_timestamps(t0, t4)
+                vr1 = var_prefix + str(np.random.randint(0, cols))
+                df_timmings(df, t1, t2, vr1, None)
+
+        if use_a:
+            for r in range(runs):
+                t1, t2 = gen_random_timestamps(t0, t4)
+                vr1 = var_prefix + str(np.random.randint(0, cols))
+                a_timings(a, t1, t2, vr1, None)
+
+        if use_b:
+            for r in range(runs):
+                t1, t2 = gen_random_timestamps(t0, t4)
+                vr1 = var_prefix + str(np.random.randint(0, cols))
+                b_timings(b, t1, t2, vr1, None)
+
+        if use_dos:
+            for r in range(runs):
+                t1, t2 = gen_random_timestamps(t0, t4)
+                vr1 = var_prefix + str(np.random.randint(0, cols))
+                dos_timings(dos, t1, t2, vr1, None)
+
+    # calc the average
+    timingsdf /= runs
+
+    pd.set_option('display.max_columns', 100)
+
+    df = timingsdf
+    if not profile_assignment:
+        df.drop(labels='ass', axis=1, level=0, inplace=True)
+    print('timings:')
+    print(df)
+    df = df.swaplevel(axis=1)
+    if normalize_to_df:
+        a = df.loc[:, 'a'] / df.loc[:, 'df']
+        b = df.loc[:, 'b'] / df.loc[:, 'df']
+        c = df.loc[:, 'df'] / df.loc[:, 'df']
+        d = df.loc[:, 'dios'] / df.loc[:, 'df']
+        df.loc[:, 'a'] = a.values
+        df.loc[:, 'b'] = b.values
+        df.loc[:, 'df'] = c.values
+        df.loc[:, 'dios'] = d.values
+        all = df.copy()
+        all.swaplevel(axis=1)
+        print('\n\ndiff:')
+        print(all)
+
+    a = df.loc[:, ('a', slice(None))]
+    b = df.loc[:, ('b', slice(None))]
+    dios = df.loc[:, ('dios', slice(None))]
+    df = df.loc[:, ('df', slice(None))]
+
+    ax = plt.gca()
+    ax.set_title(f"avg of: {runs} runs, columns: {cols}")
+
+    if use_df:
+        df.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-', ax=ax)
+    if use_a:
+        a.plot(logy=plot_ylog, logx=plot_xlog, linestyle='--', ax=ax)
+    if use_b:
+        b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=':', ax=ax)
+    if use_dos:
+        dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-.', ax=ax)
+
+    plt.show()
diff --git a/profiling/testsets/.gitignore b/profiling/testsets/.gitignore
new file mode 100644
index 0000000..aa8d4bb
--- /dev/null
+++ b/profiling/testsets/.gitignore
@@ -0,0 +1,6 @@
+
+# ignore all
+*
+
+# except ourself, to ensure the `testsets`-dir isn't ignored
+!.gitignore
\ No newline at end of file
diff --git a/tests/tests.py b/tests/tests.py
index 90040e0..be2407e 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,4 +1,7 @@
 from dios import *
+import pandas as pd
+import datetime as dt
+import numpy as np
 
 v0 = 'var0'
 v1 = 'var1'
-- 
GitLab