From 200b7aec2503711619cd670d09ac214a88cb0ac8 Mon Sep 17 00:00:00 2001
From: Bert Palm <bert.palm@ufz.de>
Date: Thu, 13 Feb 2020 16:41:09 +0100
Subject: [PATCH] fixed some imports

---
 dios/__init__.py               |   2 +
 dios/lib.py                    |   1 +
 profiling/__init__.py          |   2 +
 profiling/generate_testsets.py | 103 +++++++++++++++++
 profiling/memory.py            | 101 ++++++++++++++++
 profiling/performance.py       | 204 +++++++++++++++++++++++++++++++++
 profiling/testsets/.gitignore  |   6 +
 tests/tests.py                 |   3 +
 8 files changed, 422 insertions(+)
 create mode 100644 profiling/__init__.py
 create mode 100644 profiling/generate_testsets.py
 create mode 100644 profiling/memory.py
 create mode 100644 profiling/performance.py
 create mode 100644 profiling/testsets/.gitignore

diff --git a/dios/__init__.py b/dios/__init__.py
index 2866f42..34cb18b 100644
--- a/dios/__init__.py
+++ b/dios/__init__.py
@@ -1,3 +1,5 @@
+from dios.lib import *
+from dios.options import *
 from dios.dios import *
 
 
diff --git a/dios/lib.py b/dios/lib.py
index a625ef7..38f5662 100644
--- a/dios/lib.py
+++ b/dios/lib.py
@@ -1,5 +1,6 @@
 from dios.itypes import *
 from dios.options import *
+
 import pandas as pd
 import warnings
 
diff --git a/profiling/__init__.py b/profiling/__init__.py
new file mode 100644
index 0000000..139597f
--- /dev/null
+++ b/profiling/__init__.py
@@ -0,0 +1,2 @@
+
+
diff --git a/profiling/generate_testsets.py b/profiling/generate_testsets.py
new file mode 100644
index 0000000..df2d97e
--- /dev/null
+++ b/profiling/generate_testsets.py
@@ -0,0 +1,103 @@
+import time
+
+import pandas as pd
+import numpy as np
+import datetime as dt
+from dios import dios
+import pickle
+import os
+
+var_prefix = 'var'
+
+
+def _gen_testset(rowsz, colsz, freq='1min', disalign=True, randstart=True):
+    df = pd.DataFrame()
+    dos = dios.DictOfSeries()
+    start = dt.datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
+    times = pd.date_range(periods=rowsz, start=start, freq=freq)
+
+    frequ = freq.strip('0123456789')
+    freqv = int(freq[:-len(frequ)])
+
+    for i in range(colsz):
+
+        if randstart:
+            # generate random startpoint for each series
+            r = str(np.random.randint(int(rowsz * 0.05), int(rowsz * 0.6) + 2)) + frequ
+            st = start + pd.Timedelta(r)
+            times = pd.date_range(periods=rowsz, start=st, freq=freq)
+
+        if disalign:
+            if disalign == 'random':
+                r = np.random.randint(1, i + 2)
+            else:
+                # total disalign
+                r = i
+            times += pd.Timedelta(f'{r}ns')
+
+        d = np.random.randint(1, 9, rowsz)
+        v = f'var{i}'
+        tmp = pd.DataFrame(index=times, data=d, columns=[v])
+        df = pd.merge(df, tmp, left_index=True, right_index=True, how='outer')
+        dos[v] = tmp.squeeze().copy()
+
+    return df, dos
+
+
+def _gen_df(rowsz, colsz, freq='1min', disalign=True, randstart=True):
+    df, _ = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart)
+    return df
+
+
+def gen_dos(rowsz, colsz, freq='1min', disalign=True, randstart=True):
+    _, dos = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart)
+    return dos
+
+
+def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir='testsets', noresult=False):
+    fname = f'set_f{freq}_d{disalign}_r{randstart}_dim{rows}x{cols}.pkl'
+    fpath = os.path.join(storagedir, fname)
+    try:
+        with open(fpath, 'rb') as fh:
+            if noresult:
+                return
+            tup = pickle.load(fh)
+    except (pickle.UnpicklingError, FileNotFoundError):
+        df_, dos_ = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart)
+        df_ = df_.sort_index(axis=0, level=0)
+        a_ = df_.copy().stack(dropna=False).sort_index(axis=0, level=0).copy()
+        b_ = df_.copy().unstack().sort_index(axis=0, level=0).copy()
+        tup = df_, a_, b_, dos_
+        with open(fpath, 'wb') as fh:
+            pickle.dump(tup, fh)
+
+    if noresult:
+        return
+
+    return tup
+
+
+def gen_all(rrange, crange):
+    for r in rrange:
+        for c in crange:
+            print(r, ' x ', c)
+            t0 = time.time()
+            get_testset(r, c, noresult=True)
+            t1 = time.time()
+            print(t1 - t0)
+
+
+if __name__ == '__main__':
+    # import time
+    #
+    # t0 = time.time()
+    # for i in range(7):
+    #     get_testset(10**i, 10)
+    # t1 = time.time()
+    # print(t1 - t0)
+
+    rr = [10**r for r in range(1, 6)]
+    c = range(10, 60, 10)
+    gen_all(rr, c)
+
+
diff --git a/profiling/memory.py b/profiling/memory.py
new file mode 100644
index 0000000..81f7f00
--- /dev/null
+++ b/profiling/memory.py
@@ -0,0 +1,101 @@
+import gc
+from profiling.generate_testsets import get_testset, _gen_testset
+
+
+def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)):
+    if shifted:
+        idxsz = 8 * rows * cols
+        # as many additional NaNs are inserted as there are variables
+        rowsz = rows * cols * dtypesz
+    else:
+        idxsz = 8 * rows
+        rowsz = rows * dtypesz
+
+    return idxsz + rowsz * cols
+
+
+def bytes2hread(bytes):
+    i = 0
+    units = ['B', 'kB', 'MB', 'GB', 'TB']
+    while bytes > 1000:
+        bytes /= 1024
+        i += 1
+        if i == 4:
+            break
+    return bytes, units[i]
+
+
+def rows_by_time(nsec, mdays):
+    """ calc the number of rows for one value every n seconds over m days
+    :param nsec: a value occurs every n seconds
+    :param mdays: this many days of data
+    :return: the number of rows needed
+    """
+    return int((60 / nsec) * 60 * 24 * mdays)
+
+
+if __name__ == '__main__':
+
+    # dos - linear in rows and columns, same size for r=10,c=100 or r=100,c=10
+    do_real_check = True
+    cols = 10
+    rows = 100000
+    # rows = rows_by_time(nsec=600, mdays=365*2)
+
+    mem = calc_mem(rows, cols, shifted=False)
+    memsh = calc_mem(rows, cols, shifted=True)
+
+    df, _, _, dos = get_testset(rows, cols, disalign=False, randstart=True)
+    dos_mem = dos.memory_usage()
+    print(f"dos:\n-----------")
+    print("mem: ", *bytes2hread(dos_mem))
+    print("entries:", sum([len(dos[e]) for e in dos]))
+    print()
+
+    ratio = (1 / (memsh - mem)) * dos_mem
+
+    mem = bytes2hread(mem)
+    memsh = bytes2hread(memsh)
+
+    print('df - best case\n---------')
+    print("mem: ", *mem)
+    print("entries:", rows)
+    print()
+    print('df - worst case\n---------')
+    print("mem :", *memsh)
+    print("entries:", rows * cols)
+
+    print()
+    print(f"dfbest, dos, dfworst: 0%, {round(ratio, 4)*100}%, 100% ")
+
+    if not do_real_check:
+        exit(0)
+
+    proveMeRight = False
+
+    if proveMeRight:
+        # best case
+        print()
+        print('best case proof')
+        dfb, _ = _gen_testset(rows, cols, disalign=False, randstart=False)
+        dfb.info(memory_usage='deep', verbose=False)
+
+    print()
+    print('rand start, same freq')
+    df.info(memory_usage='deep', verbose=False)
+    print("entries:", sum([len(df[e]) for e in df]))
+
+    print()
+    print('rand start, rand freq')
+    df, _, _, _ = get_testset(rows, cols, disalign='random', randstart=True)
+    df.info(memory_usage='deep', verbose=False)
+    print("entries:", sum([len(df[e]) for e in df]))
+
+    if proveMeRight:
+        # worst case
+        print()
+        print('worst case proof')
+        df, _ = _gen_testset(rows, cols, disalign=True, randstart=False)
+        df.info(memory_usage='deep', verbose=False)
+
+    gc.collect()
diff --git a/profiling/performance.py b/profiling/performance.py
new file mode 100644
index 0000000..eb5c95a
--- /dev/null
+++ b/profiling/performance.py
@@ -0,0 +1,204 @@
+import pandas as pd
+import numpy as np
+import time
+from profiling.generate_testsets import get_testset, var_prefix
+
+profile_assignment = False
+
+idx = pd.IndexSlice
+rows = 0
+
+fir = ['var', 'ts', 'ass']
+sec = ['df', 'a', 'b', 'dios']
+timingsdf = pd.DataFrame(columns=pd.MultiIndex.from_product([fir, sec]))
+
+
+def df_timmings(df, t0, t1, v1, v2):
+    _t0 = time.time()
+    a = df.loc[t0:t1, :]
+    _t1 = time.time()
+    b = df.loc[:, v1]
+    _t2 = time.time()
+    if profile_assignment:
+        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
+    _t3 = time.time()
+
+    timingsdf.at[rows, ('ts', 'df')] += _t1 - _t0
+    timingsdf.at[rows, ('var', 'df')] += _t2 - _t1
+    timingsdf.at[rows, ('ass', 'df')] += _t3 - _t2
+    return a, b, df
+
+
+def a_timings(df, t0, t1, v1, v2):
+    _t0 = time.time()
+    a = df.loc[t0:t1, :]
+    _t1 = time.time()
+    b = df.loc[:, v1]
+    _t2 = time.time()
+    if profile_assignment:
+        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
+    _t3 = time.time()
+
+    timingsdf.at[rows, ('ts', 'a')] += _t1 - _t0
+    timingsdf.at[rows, ('var', 'a')] += _t2 - _t1
+    timingsdf.at[rows, ('ass', 'a')] += _t3 - _t2
+    return a, b, df
+
+
+def b_timings(df, t0, t1, v1, v2):
+    _t0 = time.time()
+    a = df.loc[:, t0:t1]
+    _t1 = time.time()
+    b = df.loc[v1, :]
+    _t2 = time.time()
+    if profile_assignment:
+        df.loc[v1, t0:t1] = df.loc[v1, t0:t1] * 1111
+    _t3 = time.time()
+
+    timingsdf.at[rows, ('ts', 'b')] += _t1 - _t0
+    timingsdf.at[rows, ('var', 'b')] += _t2 - _t1
+    timingsdf.at[rows, ('ass', 'b')] += _t3 - _t2
+    return a, b, df
+
+
+def dos_timings(dos, t0, t1, v1, v2):
+    _t0 = time.time()
+    a = dos[t0:t1, :]
+    _t1 = time.time()
+    b = dos[:, v1]
+    _t2 = time.time()
+    if profile_assignment:
+        dos[t0:t1, v1] = dos[t0:t1, v1] * 1111
+    _t3 = time.time()
+
+    timingsdf.at[rows, ('ts', 'dios')] += _t1 - _t0
+    timingsdf.at[rows, ('var', 'dios')] += _t2 - _t1
+    timingsdf.at[rows, ('ass', 'dios')] += _t3 - _t2
+    return a, b, dos
+
+
+def gen_random_timestamps(m, M):
+    r = (M - m) * (np.random.randint(10, 90) + np.random.random()) * 0.01
+    a, b = m + r, M - r
+    return min(a, b), max(a, b)
+
+
+def find_index_range(obj):
+    min_ = None
+    max_ = None
+    for r in obj:
+        m = obj[r].index.min()
+        M = obj[r].index.max()
+        try:
+            min_ = min(min_, m)
+            max_ = max(max_, M)
+        except TypeError:
+            min_ = m
+            max_ = M
+    return min_, max_
+
+
+if __name__ == '__main__':
+    import matplotlib.pyplot as plt
+
+    # do not touch
+    rows = 1
+
+    # max increase of rows
+    # 1 = 10  # 2 = 100  # ....  # 5 = 100'000
+    iterations = 4
+    runs = 10
+    cols = 10
+
+    profile_assignment = True
+
+    # which to calc and plot
+    use_df = True
+    use_a = False
+    use_b = False
+    use_dos = True
+
+    # plot options
+    normalize_to_df = False
+    plot_xlog = True
+    plot_ylog = False
+
+    # ########################
+
+    v1 = 'var1'
+    v2 = 'var2'
+    for i in range(iterations):
+        rows *= 10
+
+        timingsdf.loc[rows] = (0,) * len(timingsdf.columns)
+
+        df, a, b, dos = get_testset(rows, cols)
+        t0, t4 = find_index_range(df)
+
+        if use_df or normalize_to_df:
+            for r in range(runs):
+                t1, t2 = gen_random_timestamps(t0, t4)
+                vr1 = var_prefix + str(np.random.randint(0, cols))
+                df_timmings(df, t1, t2, vr1, None)
+
+        if use_a:
+            for r in range(runs):
+                t1, t2 = gen_random_timestamps(t0, t4)
+                vr1 = var_prefix + str(np.random.randint(0, cols))
+                a_timings(a, t1, t2, vr1, None)
+
+        if use_b:
+            for r in range(runs):
+                t1, t2 = gen_random_timestamps(t0, t4)
+                vr1 = var_prefix + str(np.random.randint(0, cols))
+                b_timings(b, t1, t2, vr1, None)
+
+        if use_dos:
+            for r in range(runs):
+                t1, t2 = gen_random_timestamps(t0, t4)
+                vr1 = var_prefix + str(np.random.randint(0, cols))
+                dos_timings(dos, t1, t2, vr1, None)
+
+    # calc the average
+    timingsdf /= runs
+
+    pd.set_option('display.max_columns', 100)
+
+    df = timingsdf
+    if not profile_assignment:
+        df.drop(labels='ass', axis=1, level=0, inplace=True)
+    print('timings:')
+    print(df)
+    df = df.swaplevel(axis=1)
+    if normalize_to_df:
+        a = df.loc[:, 'a'] / df.loc[:, 'df']
+        b = df.loc[:, 'b'] / df.loc[:, 'df']
+        c = df.loc[:, 'df'] / df.loc[:, 'df']
+        d = df.loc[:, 'dios'] / df.loc[:, 'df']
+        df.loc[:, 'a'] = a.values
+        df.loc[:, 'b'] = b.values
+        df.loc[:, 'df'] = c.values
+        df.loc[:, 'dios'] = d.values
+        all = df.copy()
+        all = all.swaplevel(axis=1)
+        print('\n\ndiff:')
+        print(all)
+
+    a = df.loc[:, ('a', slice(None))]
+    b = df.loc[:, ('b', slice(None))]
+    dios = df.loc[:, ('dios', slice(None))]
+    df = df.loc[:, ('df', slice(None))]
+
+    ax = plt.gca()
+    ax.set_title(f"avg of: {runs} runs, columns: {cols}")
+
+    if use_df:
+        df.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-', ax=ax)
+    if use_a:
+        a.plot(logy=plot_ylog, logx=plot_xlog, linestyle='--', ax=ax)
+    if use_b:
+        b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=':', ax=ax)
+    if use_dos:
+        dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-.', ax=ax)
+
+    plt.show()
diff --git a/profiling/testsets/.gitignore b/profiling/testsets/.gitignore
new file mode 100644
index 0000000..aa8d4bb
--- /dev/null
+++ b/profiling/testsets/.gitignore
@@ -0,0 +1,6 @@
+
+# ignore all
+*
+
+# except this file, to ensure the `testsets` dir isn't ignored
+!.gitignore
\ No newline at end of file
diff --git a/tests/tests.py b/tests/tests.py
index 90040e0..be2407e 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,4 +1,7 @@
 from dios import *
+import pandas as pd
+import datetime as dt
+import numpy as np
 
 v0 = 'var0'
 v1 = 'var1'
--
GitLab
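
Usage sketch: a minimal example of how the profiling helpers introduced by this patch might be driven once it is applied. It assumes the modules are importable as a package from the repository root (e.g. via `python -m`) and that a `testsets` directory exists in the working directory for the pickled test sets; the sizes are illustrative only.

    from profiling.generate_testsets import get_testset
    from profiling.memory import calc_mem, bytes2hread

    # build (or load from the pickle cache) a 10000 x 10 test set;
    # get_testset returns the DataFrame, its stacked and unstacked
    # variants, and the DictOfSeries
    df, stacked, unstacked, dos = get_testset(10000, 10, disalign=False, randstart=True)

    # compare the measured DictOfSeries footprint with the estimated
    # best-case (aligned) and worst-case (fully shifted) DataFrame memory
    print("measured:  ", *bytes2hread(dos.memory_usage()))
    print("best case: ", *bytes2hread(calc_mem(10000, 10, shifted=False)))
    print("worst case:", *bytes2hread(calc_mem(10000, 10, shifted=True)))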