From 8144f3c4dcf0f3ce7e3117467743593ad864292c Mon Sep 17 00:00:00 2001 From: Bert Palm <bert.palm@ufz.de> Date: Thu, 13 Feb 2020 16:56:55 +0100 Subject: [PATCH] dos->dios --- profiling/generate_testsets.py | 21 ++++++++------------- profiling/memory.py | 24 ++++++++++++------------ profiling/performance.py | 22 +++++++++++----------- 3 files changed, 31 insertions(+), 36 deletions(-) diff --git a/profiling/generate_testsets.py b/profiling/generate_testsets.py index df2d97e..9ec68ba 100644 --- a/profiling/generate_testsets.py +++ b/profiling/generate_testsets.py @@ -44,14 +44,9 @@ def _gen_testset(rowsz, colsz, freq='1min', disalign=True, randstart=True): return df, dos -def _gen_df(rowsz, colsz, freq='1min', disalign=True, randstart=True): - df, _ = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart) - return df - - -def gen_dos(rowsz, colsz, freq='1min', disalign=True, randstart=True): - _, dos = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart) - return dos +def get_random_df_and_dios(rowsz, colsz, freq='1min', disalign=True, randstart=True): + df, _, _, dios, *_ = get_testset(rowsz, colsz, freq=freq, disalign=disalign, randstart=randstart) + return df, dios def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir='testsets', noresult=False): @@ -63,11 +58,11 @@ def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir return tup = pickle.load(fh) except (pickle.UnpicklingError, FileNotFoundError): - df_, dos_ = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart) - df_ = df_.sort_index(axis=0, level=0) - a_ = df_.copy().stack(dropna=False).sort_index(axis=0, level=0).copy() - b_ = df_.copy().unstack().sort_index(axis=0, level=0).copy() - tup = df_, a_, b_, dos_ + df, dios = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart) + df = df.sort_index(axis=0, level=0) + 
df_type_a = df.copy().stack(dropna=False).sort_index(axis=0, level=0).copy() + df_type_b = df.copy().unstack().sort_index(axis=0, level=0).copy() + tup = df, df_type_a, df_type_b, dios with open(fpath, 'wb') as fh: pickle.dump(tup, fh) diff --git a/profiling/memory.py b/profiling/memory.py index 81f7f00..d577464 100644 --- a/profiling/memory.py +++ b/profiling/memory.py @@ -1,5 +1,5 @@ import gc -from profiling import get_testset, _gen_testset +from profiling.generate_testsets import get_random_df_and_dios def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)): @@ -36,7 +36,7 @@ def rows_by_time(nsec, mdays): if __name__ == '__main__': - # dos - linear in rows and colums, same size for r=10,c=100 or r=100,c=10 + # dios - linear in rows and columns, same size for r=10,c=100 or r=100,c=10 do_real_check = True cols = 10 rows = 100000 @@ -45,14 +45,14 @@ if __name__ == '__main__': mem = calc_mem(rows, cols, shifted=False) memsh = calc_mem(rows, cols, shifted=True) - df, _, _, dos = get_testset(rows, cols, disalign=False, randstart=True) - dos_mem = dos.memory_usage() - print(f"dos:\n-----------") - print("mem: ", *bytes2hread(dos_mem)) - print("entries:", sum([len(dos[e]) for e in dos])) + df, dios = get_random_df_and_dios(rows, cols, disalign=False, randstart=True) + dios_mem = dios.memory_usage() + print(f"dios:\n-----------") + print("mem: ", *bytes2hread(dios_mem)) + print("entries:", sum([len(dios[e]) for e in dios])) print() - ratio = (1 / (memsh - mem) ) * dos_mem + ratio = (1 / (memsh - mem) ) * dios_mem mem = bytes2hread(mem) memsh = bytes2hread(memsh) @@ -66,7 +66,7 @@ if __name__ == '__main__': print("entries:", rows * cols) print() - print(f"dfbest, dos, dfworst: 0%, {round(ratio, 4)*100}%, 100% ") + print(f"dfbest, dios, dfworst: 0%, {round(ratio, 4)*100}%, 100% ") if not do_real_check: exit(0) @@ -77,7 +77,7 @@ if __name__ == '__main__': # best case print() print('best case proove') - dfb, _ = _gen_testset(rows, cols, disalign=False, randstart=False) + 
dfb, _ = get_random_df_and_dios(rows, cols, disalign=False, randstart=False) dfb.info(memory_usage='deep', verbose=False) print() @@ -87,7 +87,7 @@ if __name__ == '__main__': print() print('rand start, rand freq') - df, _ = get_testset(rows, cols, disalign='random', randstart=True) + df, _ = get_random_df_and_dios(rows, cols, disalign='random', randstart=True) df.info(memory_usage='deep', verbose=False) print("entries:", sum([len(df[e]) for e in df])) @@ -95,7 +95,7 @@ if __name__ == '__main__': # worst case print() print('worst case proove') - df, _ = _gen_testset(rows, cols, disalign=True, randstart=False) + df, _ = get_random_df_and_dios(rows, cols, disalign=True, randstart=False) df.info(memory_usage='deep', verbose=False) gc.collect() diff --git a/profiling/performance.py b/profiling/performance.py index eb5c95a..1be82e8 100644 --- a/profiling/performance.py +++ b/profiling/performance.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np import time -from profiling import get_testset, var_prefix +from profiling.generate_testsets import get_testset, var_prefix profile_assignment = False @@ -61,20 +61,20 @@ def b_timings(df, t0, t1, v1, v2): return a, b, df -def dos_timings(dos, t0, t1, v1, v2): +def dios_timings(dios, t0, t1, v1, v2): _t0 = time.time() - a = dos[t0:t1, :] + a = dios[t0:t1, :] _t1 = time.time() - b = dos[:, v1] + b = dios[:, v1] _t2 = time.time() if profile_assignment: - dos[t0:t1, v1] = dos[t0:t1, v1] * 1111 + dios[t0:t1, v1] = dios[t0:t1, v1] * 1111 _t3 = time.time() timingsdf.at[rows, ('ts', 'dios')] += _t1 - _t0 timingsdf.at[rows, ('var', 'dios')] += _t2 - _t1 timingsdf.at[rows, ('ass', 'dios')] += _t3 - _t2 - return a, b, dos + return a, b, dios def gen_random_timestamps(m, M): @@ -116,7 +116,7 @@ if __name__ == '__main__': use_df = True use_a = False use_b = False - use_dos = True + use_dios = True # plot options normalize_to_df = False @@ -132,7 +132,7 @@ if __name__ == '__main__': timingsdf.loc[rows] = (0,) * len(timingsdf.columns) 
- df, a, b, dos = get_testset(rows, cols) + df, a, b, dios = get_testset(rows, cols) t0, t4 = find_index_range(df) if use_df or normalize_to_df: @@ -153,11 +153,11 @@ if __name__ == '__main__': vr1 = var_prefix + str(np.random.randint(0, cols)) b_timings(b, t1, t2, vr1, None) - if use_dos: + if use_dios: for r in range(runs): t1, t2 = gen_random_timestamps(t0, t4) vr1 = var_prefix + str(np.random.randint(0, cols)) - dos_timings(dos, t1, t2, vr1, None) + dios_timings(dios, t1, t2, vr1, None) # calc the average timingsdf /= runs @@ -198,7 +198,7 @@ if __name__ == '__main__': a.plot(logy=plot_ylog, logx=plot_xlog, linestyle='--', ax=ax) if use_b: b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=':', ax=ax) - if use_dos: + if use_dios: dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-.', ax=ax) plt.show() -- GitLab