diff --git a/dios/profiling/__init__.py b/dios/profiling/__init__.py deleted file mode 100644 index 139597f9cb07c5d48bed18984ec4747f4b4f3438..0000000000000000000000000000000000000000 --- a/dios/profiling/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/dios/profiling/generate_testsets.py b/dios/profiling/generate_testsets.py deleted file mode 100644 index df2d97e8a2fe34392f400336ecaa001c27f667ca..0000000000000000000000000000000000000000 --- a/dios/profiling/generate_testsets.py +++ /dev/null @@ -1,103 +0,0 @@ -import time - -import pandas as pd -import numpy as np -import datetime as dt -from dios import dios -import pickle -import os - -var_prefix = 'var' - - -def _gen_testset(rowsz, colsz, freq='1min', disalign=True, randstart=True): - df = pd.DataFrame() - dos = dios.DictOfSeries() - start = dt.datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S") - times = pd.date_range(periods=rowsz, start=start, freq=freq) - - frequ = freq.strip('0123456789') - freqv = int(freq[:-len(frequ)]) - - for i in range(colsz): - - if randstart: - # generate random startpoint for each series - r = str(np.random.randint(int(rowsz * 0.05), int(rowsz * 0.6) + 2)) + frequ - st = start + pd.Timedelta(r) - times = pd.date_range(periods=rowsz, start=st, freq=freq) - - if disalign: - if disalign == 'random': - r = np.random.randint(1, i + 2) - else: - # total disalign - r = i - times += pd.Timedelta(f'{r}ns') - - d = np.random.randint(1, 9, rowsz) - v = f'var{i}' - tmp = pd.DataFrame(index=times, data=d, columns=[v]) - df = pd.merge(df, tmp, left_index=True, right_index=True, how='outer') - dos[v] = tmp.squeeze().copy() - - return df, dos - - -def _gen_df(rowsz, colsz, freq='1min', disalign=True, randstart=True): - df, _ = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart) - return df - - -def gen_dos(rowsz, colsz, freq='1min', disalign=True, randstart=True): - _, dos = _gen_testset(rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart) - return dos - - -def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir='testsets', noresult=False): - fname = f'set_f{freq}_d{disalign}_r{randstart}_dim{rows}x{cols}.pkl' - fpath = os.path.join(storagedir, fname) - try: - with open(fpath, 'rb') as fh: - if noresult: - return - tup = pickle.load(fh) - except (pickle.UnpicklingError, FileNotFoundError): - df_, dos_ = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart) - df_ = df_.sort_index(axis=0, level=0) - a_ = df_.copy().stack(dropna=False).sort_index(axis=0, level=0).copy() - b_ = df_.copy().unstack().sort_index(axis=0, level=0).copy() - tup = df_, a_, b_, dos_ - with open(fpath, 'wb') as fh: - pickle.dump(tup, fh) - - if noresult: - return - - return tup - - -def gen_all(rrange, crange): - for r in rrange: - for c in crange: - print(r, ' x ', c) - t0 = time.time() - get_testset(r, c, noresult=True) - t1 = time.time() - print(t1-t0) - - -if __name__ == '__main__': - # import time - # - # t0 = time.time() - # for i in range(7): - # get_testset(10**i, 10) - # t1 = time.time() - # print(t1-t0) - - rr = [10**r for r in range(1,6)] - c = range(10, 60, 10) - gen_all(rr, c) - - diff --git a/dios/profiling/memory.py b/dios/profiling/memory.py deleted file mode 100644 index 33539ee5839cbff631d24014410d86fa69d6b719..0000000000000000000000000000000000000000 --- a/dios/profiling/memory.py +++ /dev/null @@ -1,101 +0,0 @@ -import gc -from dios.profiling.generate_testsets import get_testset, _gen_testset - - -def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)): - if shifted: - idxsz = 8 * rows * cols - # additional nans are inserted exactly as many as variables - rowsz = rows * cols * dtypesz - else: - idxsz = 8 * rows - rowsz = rows * dtypesz - - return idxsz + rowsz * cols - - -def bytes2hread(bytes): - i = 0 - units = ['B', 'kB', 'MB', 'GB', 'TB'] - while (bytes > 1000): - bytes /= 1024 - i += 1 - if i == 4: - break - return bytes, units[i] - - -def rows_by_time(nsec, mdays): - """ calc the number of values for one value every n seconds in m days - :param nsec: n seconds a value occur - :param mdays: this many days of data - :return: rows thats needed - """ - return int((60 / nsec) * 60 * 24 * mdays) - - -if __name__ == '__main__': - - # dos - linear in rows and colums, same size for r=10,c=100 or r=100,c=10 - do_real_check = True - cols = 10 - rows = 100000 - # rows = rows_by_time(nsec=600, mdays=365*2) - - mem = calc_mem(rows, cols, shifted=False) - memsh = calc_mem(rows, cols, shifted=True) - - df, _, _, dos = get_testset(rows, cols, disalign=False, randstart=True) - dos_mem = dos.memory_usage() - print(f"dos:\n-----------") - print("mem: ", *bytes2hread(dos_mem)) - print("entries:", sum([len(dos[e]) for e in dos])) - print() - - ratio = (1 / (memsh - mem) ) * dos_mem - - mem = bytes2hread(mem) - memsh = bytes2hread(memsh) - - print('df - best case\n---------') - print("mem: ", *mem) - print("entries:", rows) - print() - print('df - worst case\n---------') - print("mem :", *memsh) - print("entries:", rows * cols) - - print() - print(f"dfbest, dos, dfworst: 0%, {round(ratio, 4)*100}%, 100% ") - - if not do_real_check: - exit(0) - - proveMeRight = False - - if proveMeRight: - # best case - print() - print('best case proove') - dfb, _ = _gen_testset(rows, cols, disalign=False, randstart=False) - dfb.info(memory_usage='deep', verbose=False) - - print() - print('rand start, same freq') - df.info(memory_usage='deep', verbose=False) - print("entries:", sum([len(df[e]) for e in df])) - - print() - print('rand start, rand freq') - df, _ = get_testset(rows, cols, disalign='random', randstart=True) - df.info(memory_usage='deep', verbose=False) - print("entries:", sum([len(df[e]) for e in df])) - - if proveMeRight: - # worst case - print() - print('worst case proove') - df, _ = _gen_testset(rows, cols, disalign=True, randstart=False) - df.info(memory_usage='deep', verbose=False) - - gc.collect() diff --git a/dios/profiling/performance.py b/dios/profiling/performance.py deleted file mode 100644 index ab9a3a91f681f56db25aaa36a34c9d429fa8b491..0000000000000000000000000000000000000000 --- a/dios/profiling/performance.py +++ /dev/null @@ -1,204 +0,0 @@ -import pandas as pd -import numpy as np -import time -from dios.profiling.generate_testsets import get_testset, var_prefix - -profile_assignment = False - -idx = pd.IndexSlice -rows = 0 - -fir = ['var', 'ts', 'ass'] -sec = ['df', 'a', 'b', 'dios'] -timingsdf = pd.DataFrame(columns=pd.MultiIndex.from_product([fir, sec])) - - -def df_timmings(df, t0, t1, v1, v2): - _t0 = time.time() - a = df.loc[t0:t1, :] - _t1 = time.time() - b = df.loc[:, v1] - _t2 = time.time() - if profile_assignment: - df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111 - _t3 = time.time() - - timingsdf.at[rows, ('ts', 'df')] += _t1 - _t0 - timingsdf.at[rows, ('var', 'df')] += _t2 - _t1 - timingsdf.at[rows, ('ass', 'df')] += _t3 - _t2 - return a, b, df - - -def a_timings(df, t0, t1, v1, v2): - _t0 = time.time() - a = df.loc[t0:t1, :] - _t1 = time.time() - b = df.loc[:, v1] - _t2 = time.time() - if profile_assignment: - df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111 - _t3 = time.time() - - timingsdf.at[rows, ('ts', 'a')] += _t1 - _t0 - timingsdf.at[rows, ('var', 'a')] += _t2 - _t1 - timingsdf.at[rows, ('ass', 'a')] += _t3 - _t2 - return a, b, df - - -def b_timings(df, t0, t1, v1, v2): - _t0 = time.time() - a = df.loc[:, t0:t1] - _t1 = time.time() - b = df.loc[v1, :] - _t2 = time.time() - if profile_assignment: - df.loc[v1, t0:t1] = df.loc[v1, t0:t1] * 1111 - _t3 = time.time() - - timingsdf.at[rows, ('ts', 'b')] += _t1 - _t0 - timingsdf.at[rows, ('var', 'b')] += _t2 - _t1 - timingsdf.at[rows, ('ass', 'b')] += _t3 - _t2 - return a, b, df - - -def dos_timings(dos, t0, t1, v1, v2): - _t0 = time.time() - a = dos[t0:t1, :] - _t1 = time.time() - b = dos[:, v1] - _t2 = time.time() - if profile_assignment: - dos[t0:t1, v1] = dos[t0:t1, v1] * 1111 - _t3 = time.time() - - timingsdf.at[rows, ('ts', 'dios')] += _t1 - _t0 - timingsdf.at[rows, ('var', 'dios')] += _t2 - _t1 - timingsdf.at[rows, ('ass', 'dios')] += _t3 - _t2 - return a, b, dos - - -def gen_random_timestamps(m, M): - r = (M - m) * (np.random.randint(10,90) + np.random.random()) * 0.01 - a , b = m + r, M - r - return min(a,b), max(a,b) - - -def find_index_range(obj): - min_ = None - max_ = None - for r in obj: - m = obj[r].index.min() - M = obj[r].index.max() - try: - min_ = min(min_, m) - max_ = max(max_, M) - except TypeError: - min_ = m - max_ = M - return min_, max_ - - -if __name__ == '__main__': - import matplotlib.pyplot as plt - - # do not touch - rows = 1 - - # max increase of of rows - # 1 = 10 # 2 = 100 # .... # 5 = 100'000 - iterations = 4 - runs = 10 - cols = 10 - - profile_assignment = True - - # which to calc and plot - use_df = True - use_a = False - use_b = False - use_dos = True - - # plot options - normalize_to_df = False - plot_xlog = True - plot_ylog = False - - # ######################## - - v1 = 'var1' - v2 = 'var2' - for i in range(iterations): - rows *= 10 - - timingsdf.loc[rows] = (0,) * len(timingsdf.columns) - - df, a, b, dos = get_testset(rows, cols) - t0, t4 = find_index_range(df) - - if use_df or normalize_to_df: - for r in range(runs): - t1, t2 = gen_random_timestamps(t0, t4) - vr1 = var_prefix + str(np.random.randint(0, cols)) - df_timmings(df, t1, t2, vr1, None) - - if use_a: - for r in range(runs): - t1, t2 = gen_random_timestamps(t0, t4) - vr1 = var_prefix + str(np.random.randint(0, cols)) - a_timings(a, t1, t2, vr1, None) - - if use_b: - for r in range(runs): - t1, t2 = gen_random_timestamps(t0, t4) - vr1 = var_prefix + str(np.random.randint(0, cols)) - b_timings(b, t1, t2, vr1, None) - - if use_dos: - for r in range(runs): - t1, t2 = gen_random_timestamps(t0, t4) - vr1 = var_prefix + str(np.random.randint(0, cols)) - dos_timings(dos, t1, t2, vr1, None) - - # calc the average - timingsdf /= runs - - pd.set_option('display.max_columns', 100) - - df = timingsdf - if not profile_assignment: - df.drop(labels='ass', axis=1, level=0, inplace=True) - print('timings:') - print(df) - df = df.swaplevel(axis=1) - if normalize_to_df: - a = df.loc[:, 'a'] / df.loc[:, 'df'] - b = df.loc[:, 'b'] / df.loc[:, 'df'] - c = df.loc[:, 'df'] / df.loc[:, 'df'] - d = df.loc[:, 'dios'] / df.loc[:, 'df'] - df.loc[:, 'a'] = a.values - df.loc[:, 'b'] = b.values - df.loc[:, 'df'] = c.values - df.loc[:, 'dios'] = d.values - all = df.copy() - all.swaplevel(axis=1) - print('\n\ndiff:') - print(all) - - a = df.loc[:, ('a', slice(None))] - b = df.loc[:, ('b', slice(None))] - dios = df.loc[:, ('dios', slice(None))] - df = df.loc[:, ('df', slice(None))] - - ax = plt.gca() - ax.set_title(f"avg of: {runs} runs, columns: {cols}") - - if use_df: - df.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-', ax=ax) - if use_a: - a.plot(logy=plot_ylog, logx=plot_xlog, linestyle='--', ax=ax) - if use_b: - b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=':', ax=ax) - if use_dos: - dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-.', ax=ax) - - plt.show() diff --git a/dios/profiling/testsets/.gitignore b/dios/profiling/testsets/.gitignore deleted file mode 100644 index aa8d4bb337f54e8ae3685732eb6a29dcf4a23364..0000000000000000000000000000000000000000 --- a/dios/profiling/testsets/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ - -# ignore all -* - -# except ourself, to ensure the `testsets`-dir isn't ignored -!.gitignore \ No newline at end of file