dos->dios

8144f3c4 · Bert Palm · 200b7aec · 8144f3c4 · 8144f3c4 · 8144f3c4
Commit 8144f3c4 authored 5 years ago by Bert Palm 🎇
--- a/profiling/generate_testsets.py
+++ b/profiling/generate_testsets.py
@@ -44,14 +44,9 @@ def _gen_testset(rowsz, colsz, freq='1min', disalign=True, randstart=True):
    return df, dos


-def _gen_df(rowsz, colsz, freq='1min', disalign=True, randstart=True):
-    df, _ = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart)
-    return df
-
-
-def gen_dos(rowsz, colsz, freq='1min', disalign=True, randstart=True):
-    _, dos = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart)
-    return dos
+def get_random_df_and_dios(rowsz, colsz, freq='1min', disalign=True, randstart=True):
+    df, _, _, dios, *_ = get_testset(rowsz, colsz, freq=freq, disalign=disalign, randstart=randstart)
+    return df, dios


 def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir='testsets', noresult=False):
@@ -63,11 +58,11 @@ def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir
                return
            tup = pickle.load(fh)
    except (pickle.UnpicklingError, FileNotFoundError):
-        df_, dos_ = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart)
-        df_ = df_.sort_index(axis=0, level=0)
-        a_ = df_.copy().stack(dropna=False).sort_index(axis=0, level=0).copy()
-        b_ = df_.copy().unstack().sort_index(axis=0, level=0).copy()
-        tup = df_, a_, b_, dos_
+        df, dios = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart)
+        df = df.sort_index(axis=0, level=0)
+        df_type_a = df.copy().stack(dropna=False).sort_index(axis=0, level=0).copy()
+        df_type_b = df.copy().unstack().sort_index(axis=0, level=0).copy()
+        tup = df, df_type_a, df_type_b, dios
        with open(fpath, 'wb') as fh:
            pickle.dump(tup, fh)


--- a/profiling/memory.py
+++ b/profiling/memory.py
 import gc
-from profiling import get_testset, _gen_testset
+from profiling.generate_testsets import get_random_df_and_dios


 def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)):
@@ -36,7 +36,7 @@ def rows_by_time(nsec, mdays):

 if __name__ == '__main__':

-    # dos      - linear in rows and colums, same size for r=10,c=100 or r=100,c=10
+    # dios      - linear in rows and columns, same size for r=10,c=100 or r=100,c=10
    do_real_check = True
    cols = 10
    rows = 100000
@@ -45,14 +45,14 @@ if __name__ == '__main__':
    mem = calc_mem(rows, cols, shifted=False)
    memsh = calc_mem(rows, cols, shifted=True)

-    df, _, _, dos = get_testset(rows, cols, disalign=False, randstart=True)
-    dos_mem = dos.memory_usage()
-    print(f"dos:\n-----------")
-    print("mem: ", *bytes2hread(dos_mem))
-    print("entries:", sum([len(dos[e]) for e in dos]))
+    df, dios = get_random_df_and_dios(rows, cols, disalign=False, randstart=True)
+    dios_mem = dios.memory_usage()
+    print(f"dios:\n-----------")
+    print("mem: ", *bytes2hread(dios_mem))
+    print("entries:", sum([len(dios[e]) for e in dios]))
    print()

-    ratio = (1 / (memsh - mem) ) * dos_mem
+    ratio = (1 / (memsh - mem) ) * dios_mem

    mem = bytes2hread(mem)
    memsh = bytes2hread(memsh)
@@ -66,7 +66,7 @@ if __name__ == '__main__':
    print("entries:", rows * cols)

    print()
-    print(f"dfbest, dos, dfworst: 0%, {round(ratio, 4)*100}%, 100% ")
+    print(f"dfbest, dios, dfworst: 0%, {round(ratio, 4)*100}%, 100% ")

    if not do_real_check:
        exit(0)
@@ -77,7 +77,7 @@ if __name__ == '__main__':
        # best case
        print()
        print('best case proove')
-        dfb, _ = _gen_testset(rows, cols, disalign=False, randstart=False)
+        dfb, _ = get_random_df_and_dios(rows, cols, disalign=False, randstart=False)
        dfb.info(memory_usage='deep', verbose=False)

    print()
@@ -87,7 +87,7 @@ if __name__ == '__main__':

    print()
    print('rand start, rand freq')
-    df, _ = get_testset(rows, cols, disalign='random', randstart=True)
+    df, _ = get_random_df_and_dios(rows, cols, disalign='random', randstart=True)
    df.info(memory_usage='deep', verbose=False)
    print("entries:", sum([len(df[e]) for e in df]))

@@ -95,7 +95,7 @@ if __name__ == '__main__':
        # worst case
        print()
        print('worst case proove')
-        df, _ = _gen_testset(rows, cols, disalign=True, randstart=False)
+        df, _ = get_random_df_and_dios(rows, cols, disalign=True, randstart=False)
        df.info(memory_usage='deep', verbose=False)

    gc.collect()
--- a/profiling/performance.py
+++ b/profiling/performance.py
 import pandas as pd
 import numpy as np
 import time
-from profiling import get_testset, var_prefix
+from profiling.generate_testsets import get_testset, var_prefix

 profile_assignment = False

@@ -61,20 +61,20 @@ def b_timings(df, t0, t1, v1, v2):
    return a, b, df


-def dos_timings(dos, t0, t1, v1, v2):
+def dios_timings(dios, t0, t1, v1, v2):
    _t0 = time.time()
-    a = dos[t0:t1, :]
+    a = dios[t0:t1, :]
    _t1 = time.time()
-    b = dos[:, v1]
+    b = dios[:, v1]
    _t2 = time.time()
    if profile_assignment:
-        dos[t0:t1, v1] = dos[t0:t1, v1] * 1111
+        dios[t0:t1, v1] = dios[t0:t1, v1] * 1111
    _t3 = time.time()

    timingsdf.at[rows, ('ts', 'dios')] += _t1 - _t0
    timingsdf.at[rows, ('var', 'dios')] += _t2 - _t1
    timingsdf.at[rows, ('ass', 'dios')] += _t3 - _t2
-    return a, b, dos
+    return a, b, dios


 def gen_random_timestamps(m, M):
@@ -116,7 +116,7 @@ if __name__ == '__main__':
    use_df = True
    use_a = False
    use_b = False
-    use_dos = True
+    use_dios = True

    # plot options
    normalize_to_df = False
@@ -132,7 +132,7 @@ if __name__ == '__main__':

        timingsdf.loc[rows] = (0,) * len(timingsdf.columns)

-        df, a, b, dos = get_testset(rows, cols)
+        df, a, b, dios = get_testset(rows, cols)
        t0, t4 = find_index_range(df)

        if use_df or normalize_to_df:
@@ -153,11 +153,11 @@ if __name__ == '__main__':
                vr1 = var_prefix + str(np.random.randint(0, cols))
                b_timings(b, t1, t2, vr1, None)

-        if use_dos:
+        if use_dios:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
-                dos_timings(dos, t1, t2, vr1, None)
+                dios_timings(dios, t1, t2, vr1, None)

    # calc the average
    timingsdf /= runs
@@ -198,7 +198,7 @@ if __name__ == '__main__':
        a.plot(logy=plot_ylog, logx=plot_xlog, linestyle='--', ax=ax)
    if use_b:
        b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=':', ax=ax)
-    if use_dos:
+    if use_dios:
        dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-.', ax=ax)

    plt.show()