Skip to content
Snippets Groups Projects
Commit a1eb979b authored by Bert Palm's avatar Bert Palm 🎇
Browse files

del oldstuff

parent 99455238
No related branches found
No related tags found
2 merge requests!2Develop,!1complete rework
import time
import pandas as pd
import numpy as np
import datetime as dt
from dios import dios
import pickle
import os
var_prefix = 'var'
def _gen_testset(rowsz, colsz, freq='1min', disalign=True, randstart=True):
    """Generate a test DataFrame and a matching dios.DictOfSeries.

    :param rowsz: number of rows (timestamps) per generated series
    :param colsz: number of series / columns to generate
    :param freq: pandas frequency string for the base time index
    :param disalign: if truthy, shift every series by some nanoseconds so the
        indexes do not align; ``'random'`` picks a random shift per series,
        any other truthy value shifts series ``i`` by ``i`` ns
    :param randstart: if True, every series gets its own random start point
    :return: tuple ``(df, dos)`` -- the outer-merged DataFrame and the
        DictOfSeries holding the same data

    Fixes vs. original: removed the unused local ``freqv``; column names are
    built from the shared ``var_prefix`` constant so code that reconstructs
    names via ``var_prefix + str(i)`` is guaranteed to match.
    """
    df = pd.DataFrame()
    dos = dios.DictOfSeries()
    start = dt.datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
    times = pd.date_range(periods=rowsz, start=start, freq=freq)
    # frequency unit without the leading multiplier digits (e.g. '1min' -> 'min')
    frequ = freq.strip('0123456789')
    for i in range(colsz):
        if randstart:
            # generate random startpoint for each series
            r = str(np.random.randint(int(rowsz * 0.05), int(rowsz * 0.6) + 2)) + frequ
            st = start + pd.Timedelta(r)
            times = pd.date_range(periods=rowsz, start=st, freq=freq)
        if disalign:
            if disalign == 'random':
                r = np.random.randint(1, i + 2)
            else:
                # total disalign
                r = i
            # NOTE: without randstart this shift accumulates over the loop,
            # which still yields a unique offset per series
            times += pd.Timedelta(f'{r}ns')
        d = np.random.randint(1, 9, rowsz)
        v = f'{var_prefix}{i}'
        tmp = pd.DataFrame(index=times, data=d, columns=[v])
        df = pd.merge(df, tmp, left_index=True, right_index=True, how='outer')
        dos[v] = tmp.squeeze().copy()
    return df, dos
def _gen_df(rowsz, colsz, freq='1min', disalign=True, randstart=True):
    """Convenience wrapper around :func:`_gen_testset` returning only the DataFrame."""
    frame, _unused_dos = _gen_testset(
        rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart
    )
    return frame
def gen_dos(rowsz, colsz, freq='1min', disalign=True, randstart=True):
    """Convenience wrapper around :func:`_gen_testset` returning only the DictOfSeries."""
    _unused_frame, dict_of_series = _gen_testset(
        rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart
    )
    return dict_of_series
def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir='testsets', noresult=False):
    """Load a cached test set from disk, generating and caching it on a miss.

    :param rows: number of rows per series
    :param cols: number of series / columns
    :param freq: base frequency of the generated time index
    :param disalign: see :func:`_gen_testset`
    :param randstart: see :func:`_gen_testset`
    :param storagedir: directory holding the pickled test sets; created on
        demand if missing
    :param noresult: if True, only ensure the pickle exists and return None
        (avoids unpickling large files when merely pre-generating)
    :return: tuple ``(df, stacked, unstacked, dios)`` or None if ``noresult``
    """
    fname = f'set_f{freq}_d{disalign}_r{randstart}_dim{rows}x{cols}.pkl'
    fpath = os.path.join(storagedir, fname)
    try:
        with open(fpath, 'rb') as fh:
            if noresult:
                # cache file exists -- nothing to unpickle
                return
            # NOTE: pickle.load is acceptable here because we only ever read
            # files this module wrote itself; never feed it untrusted data
            tup = pickle.load(fh)
    except (pickle.UnpicklingError, FileNotFoundError):
        df_, dos_ = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart)
        df_ = df_.sort_index(axis=0, level=0)
        a_ = df_.copy().stack(dropna=False).sort_index(axis=0, level=0).copy()
        b_ = df_.copy().unstack().sort_index(axis=0, level=0).copy()
        tup = df_, a_, b_, dos_
        # BUG FIX: create the storage dir on demand; previously a missing
        # directory made the write below fail with FileNotFoundError
        os.makedirs(storagedir, exist_ok=True)
        with open(fpath, 'wb') as fh:
            pickle.dump(tup, fh)
        if noresult:
            return
    return tup
def gen_all(rrange, crange):
    """Pre-generate (and cache) a test set for every rows/cols combination,
    printing each dimension pair and its generation wall-clock time."""
    for rowcount in rrange:
        for colcount in crange:
            print(rowcount, ' x ', colcount)
            started = time.time()
            get_testset(rowcount, colcount, noresult=True)
            print(time.time() - started)
if __name__ == '__main__':
    # Pre-generate pickled test sets for row counts 10 .. 100_000
    # combined with column counts 10 .. 50.
    row_sizes = [10 ** exponent for exponent in range(1, 6)]
    col_sizes = range(10, 60, 10)
    gen_all(row_sizes, col_sizes)
import gc
from dios.profiling.generate_testsets import get_testset, _gen_testset
def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)):
    """Estimate the memory footprint (in bytes) of a DataFrame test set.

    :param rows: rows per series
    :param cols: number of columns
    :param shifted: if True, model the worst case where every column's index
        is disjoint, so each column carries ``rows * cols`` entries
        (the extra entries are the inserted NaNs); otherwise the best case
        with one shared index of ``rows`` entries
    :param dtypesz: bytes per data value (defaults to 8, i.e. 64-bit)
    :return: estimated total bytes (index + data)
    """
    entries_per_col = rows * cols if shifted else rows
    index_bytes = 8 * entries_per_col
    data_bytes = entries_per_col * dtypesz * cols
    return index_bytes + data_bytes
def bytes2hread(bytes):
    """Convert a byte count into a human-readable ``(value, unit)`` pair.

    :param bytes: number of bytes (int or float)
    :return: tuple of (scaled value, unit string), unit in B/kB/MB/GB/TB

    BUG FIX: the original loop divided by 1024 but tested against 1000, so
    values between 1000 and 1024 were reported as fractions of a kB.
    Threshold and divisor now agree on 1024.
    """
    i = 0
    units = ['B', 'kB', 'MB', 'GB', 'TB']
    # stop at the last unit (TB) even for absurdly large inputs
    while bytes > 1024 and i < len(units) - 1:
        bytes /= 1024
        i += 1
    return bytes, units[i]
def rows_by_time(nsec, mdays):
    """Calculate the number of rows for one value every n seconds over m days.

    :param nsec: a value occurs every this many seconds
    :param mdays: this many days of data
    :return: number of rows needed (int)
    """
    # keep the original operation order to preserve float rounding exactly
    values_per_minute = 60 / nsec
    values_per_day = values_per_minute * 60 * 24
    return int(values_per_day * mdays)
if __name__ == '__main__':
    # dos - linear in rows and colums, same size for r=10,c=100 or r=100,c=10
    do_real_check = True
    cols = 10
    rows = 100000
    # rows = rows_by_time(nsec=600, mdays=365*2)

    # theoretical footprint of the best case (shared index) and the worst
    # case (fully dis-aligned, shifted=True) DataFrame
    mem = calc_mem(rows, cols, shifted=False)
    memsh = calc_mem(rows, cols, shifted=True)

    df, _, _, dos = get_testset(rows, cols, disalign=False, randstart=True)
    dos_mem = dos.memory_usage()
    print(f"dos:\n-----------")
    print("mem: ", *bytes2hread(dos_mem))
    print("entries:", sum([len(dos[e]) for e in dos]))
    print()

    # where the dios sits between best (0%) and worst (100%) case
    ratio = (1 / (memsh - mem) ) * dos_mem

    mem = bytes2hread(mem)
    memsh = bytes2hread(memsh)
    print('df - best case\n---------')
    print("mem: ", *mem)
    print("entries:", rows)
    print()
    print('df - worst case\n---------')
    print("mem :", *memsh)
    print("entries:", rows * cols)
    print()
    print(f"dfbest, dos, dfworst: 0%, {round(ratio, 4)*100}%, 100% ")

    if not do_real_check:
        exit(0)

    proveMeRight = False

    if proveMeRight:
        # best case
        print()
        print('best case proove')
        dfb, _ = _gen_testset(rows, cols, disalign=False, randstart=False)
        dfb.info(memory_usage='deep', verbose=False)

        print()
        print('rand start, same freq')
        df.info(memory_usage='deep', verbose=False)
        print("entries:", sum([len(df[e]) for e in df]))

        print()
        print('rand start, rand freq')
        # BUG FIX: get_testset returns a 4-tuple (df, stacked, unstacked,
        # dios); the original two-name unpacking `df, _ = ...` raised a
        # ValueError whenever this branch was enabled
        df, *_ = get_testset(rows, cols, disalign='random', randstart=True)
        df.info(memory_usage='deep', verbose=False)
        print("entries:", sum([len(df[e]) for e in df]))

    if proveMeRight:
        # worst case
        print()
        print('worst case proove')
        df, _ = _gen_testset(rows, cols, disalign=True, randstart=False)
        df.info(memory_usage='deep', verbose=False)

    gc.collect()
import pandas as pd
import numpy as np
import time
from dios.profiling.generate_testsets import get_testset, var_prefix
# Toggle: when True the *_timings functions additionally measure label-based
# assignment; the __main__ driver below overwrites this value.
profile_assignment = False

# shorthand for building pandas MultiIndex slices
idx = pd.IndexSlice

# current row count under test; rebound by the __main__ driver loop and read
# as a module global by the *_timings functions
rows = 0

# result table: one column per (timed operation, data layout) pair, where
# 'var' = column selection, 'ts' = time-slice, 'ass' = assignment, and the
# layouts are plain df, stacked ('a'), unstacked ('b') and dios
fir = ['var', 'ts', 'ass']
sec = ['df', 'a', 'b', 'dios']
timingsdf = pd.DataFrame(columns=pd.MultiIndex.from_product([fir, sec]))
def df_timmings(df, t0, t1, v1, v2):
    """Time a time-slice, a column selection and (optionally) an assignment
    on a plain DataFrame; results accumulate into the global ``timingsdf``
    at the row given by the global ``rows``. ``v2`` is unused."""
    begin = time.time()
    ts_slice = df.loc[t0:t1, :]
    after_ts = time.time()
    var_slice = df.loc[:, v1]
    after_var = time.time()
    if profile_assignment:
        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
    after_ass = time.time()
    timingsdf.at[rows, ('ts', 'df')] += after_ts - begin
    timingsdf.at[rows, ('var', 'df')] += after_var - after_ts
    timingsdf.at[rows, ('ass', 'df')] += after_ass - after_var
    return ts_slice, var_slice, df
def a_timings(df, t0, t1, v1, v2):
    """Time a time-slice, a column selection and (optionally) an assignment
    on the stacked ('a') layout; results accumulate into the global
    ``timingsdf`` at the row given by the global ``rows``. ``v2`` is unused."""
    begin = time.time()
    ts_slice = df.loc[t0:t1, :]
    after_ts = time.time()
    var_slice = df.loc[:, v1]
    after_var = time.time()
    if profile_assignment:
        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
    after_ass = time.time()
    timingsdf.at[rows, ('ts', 'a')] += after_ts - begin
    timingsdf.at[rows, ('var', 'a')] += after_var - after_ts
    timingsdf.at[rows, ('ass', 'a')] += after_ass - after_var
    return ts_slice, var_slice, df
def b_timings(df, t0, t1, v1, v2):
    """Time selection and (optionally) assignment on the unstacked ('b')
    layout. Note the axes are transposed relative to the other layouts:
    the time range addresses columns and the variable addresses rows.
    Results accumulate into the global ``timingsdf``. ``v2`` is unused."""
    begin = time.time()
    ts_slice = df.loc[:, t0:t1]
    after_ts = time.time()
    var_slice = df.loc[v1, :]
    after_var = time.time()
    if profile_assignment:
        df.loc[v1, t0:t1] = df.loc[v1, t0:t1] * 1111
    after_ass = time.time()
    timingsdf.at[rows, ('ts', 'b')] += after_ts - begin
    timingsdf.at[rows, ('var', 'b')] += after_var - after_ts
    timingsdf.at[rows, ('ass', 'b')] += after_ass - after_var
    return ts_slice, var_slice, df
def dos_timings(dos, t0, t1, v1, v2):
    """Time a time-slice, a column selection and (optionally) an assignment
    on a DictOfSeries; results accumulate into the global ``timingsdf`` at
    the row given by the global ``rows``. ``v2`` is unused."""
    begin = time.time()
    ts_slice = dos[t0:t1, :]
    after_ts = time.time()
    var_slice = dos[:, v1]
    after_var = time.time()
    if profile_assignment:
        dos[t0:t1, v1] = dos[t0:t1, v1] * 1111
    after_ass = time.time()
    timingsdf.at[rows, ('ts', 'dios')] += after_ts - begin
    timingsdf.at[rows, ('var', 'dios')] += after_var - after_ts
    timingsdf.at[rows, ('ass', 'dios')] += after_ass - after_var
    return ts_slice, var_slice, dos
def gen_random_timestamps(m, M):
    """Pick a random sub-range within ``[m, M]``.

    The offset is a random 10%..91% fraction of the span, applied from both
    ends; the two resulting points are returned in ascending order.
    """
    fraction = (np.random.randint(10, 90) + np.random.random()) * 0.01
    offset = (M - m) * fraction
    lo, hi = m + offset, M - offset
    if lo > hi:
        lo, hi = hi, lo
    return lo, hi
def find_index_range(obj):
    """Return the overall (min, max) over the indexes of all columns/series
    in ``obj`` (anything whose iteration yields keys and whose items expose
    ``.index``, e.g. a DataFrame or a dict of Series).

    Returns ``(None, None)`` for an empty container.
    """
    overall_min = None
    overall_max = None
    for key in obj:
        lo = obj[key].index.min()
        hi = obj[key].index.max()
        if overall_min is None:
            # first column seen -- nothing to compare against yet
            overall_min, overall_max = lo, hi
        else:
            overall_min = min(overall_min, lo)
            overall_max = max(overall_max, hi)
    return overall_min, overall_max
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    # do not touch -- multiplied by 10 at the top of every iteration below
    rows = 1

    # max increase of of rows
    # 1 = 10 # 2 = 100 # .... # 5 = 100'000
    iterations = 4
    runs = 10
    cols = 10

    # also measure label-based assignment (read by the *_timings functions)
    profile_assignment = True

    # which to calc and plot
    use_df = True
    use_a = False
    use_b = False
    use_dos = True

    # plot options
    normalize_to_df = False
    plot_xlog = True
    plot_ylog = False
    # ########################

    v1 = 'var1'
    v2 = 'var2'
    for i in range(iterations):
        rows *= 10
        # add a fresh, zeroed result row for this size; the *_timings
        # functions accumulate into it via the module global `rows`
        timingsdf.loc[rows] = (0,) * len(timingsdf.columns)

        # df: plain frame, a: stacked, b: unstacked, dos: DictOfSeries
        df, a, b, dos = get_testset(rows, cols)
        t0, t4 = find_index_range(df)

        if use_df or normalize_to_df:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                df_timmings(df, t1, t2, vr1, None)

        if use_a:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                a_timings(a, t1, t2, vr1, None)

        if use_b:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                b_timings(b, t1, t2, vr1, None)

        if use_dos:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                dos_timings(dos, t1, t2, vr1, None)

    # calc the average
    timingsdf /= runs

    pd.set_option('display.max_columns', 100)

    df = timingsdf
    if not profile_assignment:
        # drop the (all-zero) assignment timings if they were not measured
        df.drop(labels='ass', axis=1, level=0, inplace=True)
    print('timings:')
    print(df)
    # columns become (layout, operation) so layouts can be selected on level 0
    df = df.swaplevel(axis=1)

    if normalize_to_df:
        # express every layout's timings relative to the plain-df timings
        # (df itself becomes 1.0 everywhere)
        a = df.loc[:, 'a'] / df.loc[:, 'df']
        b = df.loc[:, 'b'] / df.loc[:, 'df']
        c = df.loc[:, 'df'] / df.loc[:, 'df']
        d = df.loc[:, 'dios'] / df.loc[:, 'df']
        df.loc[:, 'a'] = a.values
        df.loc[:, 'b'] = b.values
        df.loc[:, 'df'] = c.values
        df.loc[:, 'dios'] = d.values

        all = df.copy()
        # NOTE(review): swaplevel returns a new object; this result is
        # discarded, so `all` prints with the same column order as `df` --
        # presumably `all = all.swaplevel(axis=1)` was intended. Also
        # shadows the builtin `all`. TODO confirm and fix.
        all.swaplevel(axis=1)
        print('\n\ndiff:')
        print(all)

    # per-layout slices used for plotting below
    a = df.loc[:, ('a', slice(None))]
    b = df.loc[:, ('b', slice(None))]
    dios = df.loc[:, ('dios', slice(None))]
    df = df.loc[:, ('df', slice(None))]

    ax = plt.gca()
    ax.set_title(f"avg of: {runs} runs, columns: {cols}")
    if use_df:
        df.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-', ax=ax)
    if use_a:
        a.plot(logy=plot_ylog, logx=plot_xlog, linestyle='--', ax=ax)
    if use_b:
        b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=':', ax=ax)
    if use_dos:
        dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-.', ax=ax)

    plt.show()
# ignore all
*
# except ourself, to ensure the `testsets`-dir isn't ignored
!.gitignore
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment