Skip to content
Snippets Groups Projects
Commit a1eb979b authored by Bert Palm's avatar Bert Palm 🎇
Browse files

del oldstuff

parent 99455238
No related branches found
No related tags found
2 merge requests!2Develop,!1complete rework
import time
import pandas as pd
import numpy as np
import datetime as dt
from dios import dios
import pickle
import os
var_prefix = 'var'
def _gen_testset(rowsz, colsz, freq='1min', disalign=True, randstart=True):
    """Generate a test DataFrame and a matching dios.DictOfSeries.

    :param rowsz: number of rows (timestamps) per generated series
    :param colsz: number of series / columns to generate
    :param freq: pandas frequency string for the base time index
    :param disalign: if truthy, shift every series by some nanoseconds so the
        indexes do not align; ``'random'`` picks a random shift per series,
        any other truthy value shifts series ``i`` by ``i`` ns
    :param randstart: if True, every series gets its own random start point
    :return: tuple ``(df, dos)`` -- the outer-merged DataFrame and the
        DictOfSeries holding the same data

    Fixes vs. original: removed the unused local ``freqv``; column names are
    built from the shared ``var_prefix`` constant so code that reconstructs
    names via ``var_prefix + str(i)`` is guaranteed to match.
    """
    df = pd.DataFrame()
    dos = dios.DictOfSeries()
    start = dt.datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
    times = pd.date_range(periods=rowsz, start=start, freq=freq)
    # frequency unit without the leading multiplier digits (e.g. '1min' -> 'min')
    frequ = freq.strip('0123456789')
    for i in range(colsz):
        if randstart:
            # generate random startpoint for each series
            r = str(np.random.randint(int(rowsz * 0.05), int(rowsz * 0.6) + 2)) + frequ
            st = start + pd.Timedelta(r)
            times = pd.date_range(periods=rowsz, start=st, freq=freq)
        if disalign:
            if disalign == 'random':
                r = np.random.randint(1, i + 2)
            else:
                # total disalign
                r = i
            # NOTE: without randstart this shift accumulates over the loop,
            # which still yields a unique offset per series
            times += pd.Timedelta(f'{r}ns')
        d = np.random.randint(1, 9, rowsz)
        v = f'{var_prefix}{i}'
        tmp = pd.DataFrame(index=times, data=d, columns=[v])
        df = pd.merge(df, tmp, left_index=True, right_index=True, how='outer')
        dos[v] = tmp.squeeze().copy()
    return df, dos
def _gen_df(rowsz, colsz, freq='1min', disalign=True, randstart=True):
    """Convenience wrapper around :func:`_gen_testset` returning only the DataFrame."""
    frame, _unused_dos = _gen_testset(
        rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart
    )
    return frame
def gen_dos(rowsz, colsz, freq='1min', disalign=True, randstart=True):
    """Convenience wrapper around :func:`_gen_testset` returning only the DictOfSeries."""
    _unused_frame, dict_of_series = _gen_testset(
        rowsz=rowsz, colsz=colsz, freq=freq, disalign=disalign, randstart=randstart
    )
    return dict_of_series
def get_testset(rows, cols, freq='1s', disalign=True, randstart=True, storagedir='testsets', noresult=False):
    """Load a cached test set from disk, generating and caching it on a miss.

    :param rows: number of rows per series
    :param cols: number of series / columns
    :param freq: base frequency of the generated time index
    :param disalign: see :func:`_gen_testset`
    :param randstart: see :func:`_gen_testset`
    :param storagedir: directory holding the pickled test sets; created on
        demand if missing
    :param noresult: if True, only ensure the pickle exists and return None
        (avoids unpickling large files when merely pre-generating)
    :return: tuple ``(df, stacked, unstacked, dios)`` or None if ``noresult``
    """
    fname = f'set_f{freq}_d{disalign}_r{randstart}_dim{rows}x{cols}.pkl'
    fpath = os.path.join(storagedir, fname)
    try:
        with open(fpath, 'rb') as fh:
            if noresult:
                # cache file exists -- nothing to unpickle
                return
            # NOTE: pickle.load is acceptable here because we only ever read
            # files this module wrote itself; never feed it untrusted data
            tup = pickle.load(fh)
    except (pickle.UnpicklingError, FileNotFoundError):
        df_, dos_ = _gen_testset(rowsz=rows, colsz=cols, freq=freq, disalign=disalign, randstart=randstart)
        df_ = df_.sort_index(axis=0, level=0)
        a_ = df_.copy().stack(dropna=False).sort_index(axis=0, level=0).copy()
        b_ = df_.copy().unstack().sort_index(axis=0, level=0).copy()
        tup = df_, a_, b_, dos_
        # BUG FIX: create the storage dir on demand; previously a missing
        # directory made the write below fail with FileNotFoundError
        os.makedirs(storagedir, exist_ok=True)
        with open(fpath, 'wb') as fh:
            pickle.dump(tup, fh)
        if noresult:
            return
    return tup
def gen_all(rrange, crange):
    """Pre-generate (and cache) a test set for every rows/cols combination,
    printing each dimension pair and its generation wall-clock time."""
    for rowcount in rrange:
        for colcount in crange:
            print(rowcount, ' x ', colcount)
            started = time.time()
            get_testset(rowcount, colcount, noresult=True)
            print(time.time() - started)
if __name__ == '__main__':
    # Pre-generate pickled test sets for row counts 10 .. 100_000
    # combined with column counts 10 .. 50.
    row_sizes = [10 ** exponent for exponent in range(1, 6)]
    col_sizes = range(10, 60, 10)
    gen_all(row_sizes, col_sizes)
import gc
from dios.profiling.generate_testsets import get_testset, _gen_testset
def calc_mem(rows, cols, shifted=False, dtypesz=(64 / 8)):
    """Estimate the memory footprint (in bytes) of a DataFrame test set.

    :param rows: rows per series
    :param cols: number of columns
    :param shifted: if True, model the worst case where every column's index
        is disjoint, so each column carries ``rows * cols`` entries
        (the extra entries are the inserted NaNs); otherwise the best case
        with one shared index of ``rows`` entries
    :param dtypesz: bytes per data value (defaults to 8, i.e. 64-bit)
    :return: estimated total bytes (index + data)
    """
    entries_per_col = rows * cols if shifted else rows
    index_bytes = 8 * entries_per_col
    data_bytes = entries_per_col * dtypesz * cols
    return index_bytes + data_bytes
def bytes2hread(bytes):
    """Convert a byte count into a human-readable ``(value, unit)`` pair.

    :param bytes: number of bytes (int or float)
    :return: tuple of (scaled value, unit string), unit in B/kB/MB/GB/TB

    BUG FIX: the original loop divided by 1024 but tested against 1000, so
    values between 1000 and 1024 were reported as fractions of a kB.
    Threshold and divisor now agree on 1024.
    """
    i = 0
    units = ['B', 'kB', 'MB', 'GB', 'TB']
    # stop at the last unit (TB) even for absurdly large inputs
    while bytes > 1024 and i < len(units) - 1:
        bytes /= 1024
        i += 1
    return bytes, units[i]
def rows_by_time(nsec, mdays):
    """Calculate the number of rows for one value every n seconds over m days.

    :param nsec: a value occurs every this many seconds
    :param mdays: this many days of data
    :return: number of rows needed (int)
    """
    # keep the original operation order to preserve float rounding exactly
    values_per_minute = 60 / nsec
    values_per_day = values_per_minute * 60 * 24
    return int(values_per_day * mdays)
if __name__ == '__main__':
    # dos - linear in rows and colums, same size for r=10,c=100 or r=100,c=10
    do_real_check = True
    cols = 10
    rows = 100000
    # rows = rows_by_time(nsec=600, mdays=365*2)

    # theoretical footprint of the best case (shared index) and the worst
    # case (fully dis-aligned, shifted=True) DataFrame
    mem = calc_mem(rows, cols, shifted=False)
    memsh = calc_mem(rows, cols, shifted=True)

    df, _, _, dos = get_testset(rows, cols, disalign=False, randstart=True)
    dos_mem = dos.memory_usage()
    print(f"dos:\n-----------")
    print("mem: ", *bytes2hread(dos_mem))
    print("entries:", sum([len(dos[e]) for e in dos]))
    print()

    # where the dios sits between best (0%) and worst (100%) case
    ratio = (1 / (memsh - mem) ) * dos_mem

    mem = bytes2hread(mem)
    memsh = bytes2hread(memsh)
    print('df - best case\n---------')
    print("mem: ", *mem)
    print("entries:", rows)
    print()
    print('df - worst case\n---------')
    print("mem :", *memsh)
    print("entries:", rows * cols)
    print()
    print(f"dfbest, dos, dfworst: 0%, {round(ratio, 4)*100}%, 100% ")

    if not do_real_check:
        exit(0)

    proveMeRight = False

    if proveMeRight:
        # best case
        print()
        print('best case proove')
        dfb, _ = _gen_testset(rows, cols, disalign=False, randstart=False)
        dfb.info(memory_usage='deep', verbose=False)

        print()
        print('rand start, same freq')
        df.info(memory_usage='deep', verbose=False)
        print("entries:", sum([len(df[e]) for e in df]))

        print()
        print('rand start, rand freq')
        # BUG FIX: get_testset returns a 4-tuple (df, stacked, unstacked,
        # dios); the original two-name unpacking `df, _ = ...` raised a
        # ValueError whenever this branch was enabled
        df, *_ = get_testset(rows, cols, disalign='random', randstart=True)
        df.info(memory_usage='deep', verbose=False)
        print("entries:", sum([len(df[e]) for e in df]))

    if proveMeRight:
        # worst case
        print()
        print('worst case proove')
        df, _ = _gen_testset(rows, cols, disalign=True, randstart=False)
        df.info(memory_usage='deep', verbose=False)

    gc.collect()
import pandas as pd
import numpy as np
import time
from dios.profiling.generate_testsets import get_testset, var_prefix
# Toggle: when True the *_timings functions additionally measure label-based
# assignment; the __main__ driver below overwrites this value.
profile_assignment = False

# shorthand for building pandas MultiIndex slices
idx = pd.IndexSlice

# current row count under test; rebound by the __main__ driver loop and read
# as a module global by the *_timings functions
rows = 0

# result table: one column per (timed operation, data layout) pair, where
# 'var' = column selection, 'ts' = time-slice, 'ass' = assignment, and the
# layouts are plain df, stacked ('a'), unstacked ('b') and dios
fir = ['var', 'ts', 'ass']
sec = ['df', 'a', 'b', 'dios']
timingsdf = pd.DataFrame(columns=pd.MultiIndex.from_product([fir, sec]))
def df_timmings(df, t0, t1, v1, v2):
    """Time a time-slice, a column selection and (optionally) an assignment
    on a plain DataFrame; results accumulate into the global ``timingsdf``
    at the row given by the global ``rows``. ``v2`` is unused."""
    begin = time.time()
    ts_slice = df.loc[t0:t1, :]
    after_ts = time.time()
    var_slice = df.loc[:, v1]
    after_var = time.time()
    if profile_assignment:
        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
    after_ass = time.time()
    timingsdf.at[rows, ('ts', 'df')] += after_ts - begin
    timingsdf.at[rows, ('var', 'df')] += after_var - after_ts
    timingsdf.at[rows, ('ass', 'df')] += after_ass - after_var
    return ts_slice, var_slice, df
def a_timings(df, t0, t1, v1, v2):
    """Time a time-slice, a column selection and (optionally) an assignment
    on the stacked ('a') layout; results accumulate into the global
    ``timingsdf`` at the row given by the global ``rows``. ``v2`` is unused."""
    begin = time.time()
    ts_slice = df.loc[t0:t1, :]
    after_ts = time.time()
    var_slice = df.loc[:, v1]
    after_var = time.time()
    if profile_assignment:
        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
    after_ass = time.time()
    timingsdf.at[rows, ('ts', 'a')] += after_ts - begin
    timingsdf.at[rows, ('var', 'a')] += after_var - after_ts
    timingsdf.at[rows, ('ass', 'a')] += after_ass - after_var
    return ts_slice, var_slice, df
def b_timings(df, t0, t1, v1, v2):
    """Time selection and (optionally) assignment on the unstacked ('b')
    layout. Note the axes are transposed relative to the other layouts:
    the time range addresses columns and the variable addresses rows.
    Results accumulate into the global ``timingsdf``. ``v2`` is unused."""
    begin = time.time()
    ts_slice = df.loc[:, t0:t1]
    after_ts = time.time()
    var_slice = df.loc[v1, :]
    after_var = time.time()
    if profile_assignment:
        df.loc[v1, t0:t1] = df.loc[v1, t0:t1] * 1111
    after_ass = time.time()
    timingsdf.at[rows, ('ts', 'b')] += after_ts - begin
    timingsdf.at[rows, ('var', 'b')] += after_var - after_ts
    timingsdf.at[rows, ('ass', 'b')] += after_ass - after_var
    return ts_slice, var_slice, df
def dos_timings(dos, t0, t1, v1, v2):
    """Time a time-slice, a column selection and (optionally) an assignment
    on a DictOfSeries; results accumulate into the global ``timingsdf`` at
    the row given by the global ``rows``. ``v2`` is unused."""
    begin = time.time()
    ts_slice = dos[t0:t1, :]
    after_ts = time.time()
    var_slice = dos[:, v1]
    after_var = time.time()
    if profile_assignment:
        dos[t0:t1, v1] = dos[t0:t1, v1] * 1111
    after_ass = time.time()
    timingsdf.at[rows, ('ts', 'dios')] += after_ts - begin
    timingsdf.at[rows, ('var', 'dios')] += after_var - after_ts
    timingsdf.at[rows, ('ass', 'dios')] += after_ass - after_var
    return ts_slice, var_slice, dos
def gen_random_timestamps(m, M):
    """Pick a random sub-range within ``[m, M]``.

    The offset is a random 10%..91% fraction of the span, applied from both
    ends; the two resulting points are returned in ascending order.
    """
    fraction = (np.random.randint(10, 90) + np.random.random()) * 0.01
    offset = (M - m) * fraction
    lo, hi = m + offset, M - offset
    if lo > hi:
        lo, hi = hi, lo
    return lo, hi
def find_index_range(obj):
    """Return the overall (min, max) over the indexes of all columns/series
    in ``obj`` (anything whose iteration yields keys and whose items expose
    ``.index``, e.g. a DataFrame or a dict of Series).

    Returns ``(None, None)`` for an empty container.
    """
    overall_min = None
    overall_max = None
    for key in obj:
        lo = obj[key].index.min()
        hi = obj[key].index.max()
        if overall_min is None:
            # first column seen -- nothing to compare against yet
            overall_min, overall_max = lo, hi
        else:
            overall_min = min(overall_min, lo)
            overall_max = max(overall_max, hi)
    return overall_min, overall_max
if __name__ == '__main__':
    import matplotlib.pyplot as plt

    # do not touch -- multiplied by 10 at the top of every iteration below
    rows = 1

    # max increase of of rows
    # 1 = 10 # 2 = 100 # .... # 5 = 100'000
    iterations = 4
    runs = 10
    cols = 10

    # also measure label-based assignment (read by the *_timings functions)
    profile_assignment = True

    # which to calc and plot
    use_df = True
    use_a = False
    use_b = False
    use_dos = True

    # plot options
    normalize_to_df = False
    plot_xlog = True
    plot_ylog = False
    # ########################

    v1 = 'var1'
    v2 = 'var2'
    for i in range(iterations):
        rows *= 10
        # add a fresh, zeroed result row for this size; the *_timings
        # functions accumulate into it via the module global `rows`
        timingsdf.loc[rows] = (0,) * len(timingsdf.columns)

        # df: plain frame, a: stacked, b: unstacked, dos: DictOfSeries
        df, a, b, dos = get_testset(rows, cols)
        t0, t4 = find_index_range(df)

        if use_df or normalize_to_df:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                df_timmings(df, t1, t2, vr1, None)

        if use_a:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                a_timings(a, t1, t2, vr1, None)

        if use_b:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                b_timings(b, t1, t2, vr1, None)

        if use_dos:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                dos_timings(dos, t1, t2, vr1, None)

    # calc the average
    timingsdf /= runs

    pd.set_option('display.max_columns', 100)

    df = timingsdf
    if not profile_assignment:
        # drop the (all-zero) assignment timings if they were not measured
        df.drop(labels='ass', axis=1, level=0, inplace=True)
    print('timings:')
    print(df)
    # columns become (layout, operation) so layouts can be selected on level 0
    df = df.swaplevel(axis=1)

    if normalize_to_df:
        # express every layout's timings relative to the plain-df timings
        # (df itself becomes 1.0 everywhere)
        a = df.loc[:, 'a'] / df.loc[:, 'df']
        b = df.loc[:, 'b'] / df.loc[:, 'df']
        c = df.loc[:, 'df'] / df.loc[:, 'df']
        d = df.loc[:, 'dios'] / df.loc[:, 'df']
        df.loc[:, 'a'] = a.values
        df.loc[:, 'b'] = b.values
        df.loc[:, 'df'] = c.values
        df.loc[:, 'dios'] = d.values

        all = df.copy()
        # NOTE(review): swaplevel returns a new object; this result is
        # discarded, so `all` prints with the same column order as `df` --
        # presumably `all = all.swaplevel(axis=1)` was intended. Also
        # shadows the builtin `all`. TODO confirm and fix.
        all.swaplevel(axis=1)
        print('\n\ndiff:')
        print(all)

    # per-layout slices used for plotting below
    a = df.loc[:, ('a', slice(None))]
    b = df.loc[:, ('b', slice(None))]
    dios = df.loc[:, ('dios', slice(None))]
    df = df.loc[:, ('df', slice(None))]

    ax = plt.gca()
    ax.set_title(f"avg of: {runs} runs, columns: {cols}")
    if use_df:
        df.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-', ax=ax)
    if use_a:
        a.plot(logy=plot_ylog, logx=plot_xlog, linestyle='--', ax=ax)
    if use_b:
        b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=':', ax=ax)
    if use_dos:
        dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle='-.', ax=ax)

    plt.show()
# ignore all
*
# except ourself, to ensure the `testsets`-dir isn't ignored
!.gitignore
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment