performance.py

import pandas as pd
import numpy as np
import time
from .generate_testsets import get_testset, var_prefix

profile_assignment = False

idx = pd.IndexSlice
rows = 0

fir = ["var", "ts", "ass"]
sec = ["df", "a", "b", "dios"]
timingsdf = pd.DataFrame(columns=pd.MultiIndex.from_product([fir, sec]))


def df_timmings(df, t0, t1, v1, v2):
    _t0 = time.time()
    a = df.loc[t0:t1, :]
    _t1 = time.time()
    b = df.loc[:, v1]
    _t2 = time.time()
    if profile_assignment:
        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
    _t3 = time.time()

    timingsdf.at[rows, ("ts", "df")] += _t1 - _t0
    timingsdf.at[rows, ("var", "df")] += _t2 - _t1
    timingsdf.at[rows, ("ass", "df")] += _t3 - _t2
    return a, b, df


def a_timings(df, t0, t1, v1, v2):
    _t0 = time.time()
    a = df.loc[t0:t1, :]
    _t1 = time.time()
    b = df.loc[:, v1]
    _t2 = time.time()
    if profile_assignment:
        df.loc[t0:t1, v1] = df.loc[t0:t1, v1] * 1111
    _t3 = time.time()

    timingsdf.at[rows, ("ts", "a")] += _t1 - _t0
    timingsdf.at[rows, ("var", "a")] += _t2 - _t1
    timingsdf.at[rows, ("ass", "a")] += _t3 - _t2
    return a, b, df


def b_timings(df, t0, t1, v1, v2):
    _t0 = time.time()
    a = df.loc[:, t0:t1]
    _t1 = time.time()
    b = df.loc[v1, :]
    _t2 = time.time()
    if profile_assignment:
        df.loc[v1, t0:t1] = df.loc[v1, t0:t1] * 1111
    _t3 = time.time()

    timingsdf.at[rows, ("ts", "b")] += _t1 - _t0
    timingsdf.at[rows, ("var", "b")] += _t2 - _t1
    timingsdf.at[rows, ("ass", "b")] += _t3 - _t2
    return a, b, df


def dios_timings(dios, t0, t1, v1, v2):
    _t0 = time.time()
    a = dios.loc[t0:t1, :]
    _t1 = time.time()
    b = dios.loc[:, v1]
    _t2 = time.time()
    if profile_assignment:
        dios.loc[t0:t1, v1] = dios.loc[t0:t1, v1] * 1111
    _t3 = time.time()

    timingsdf.at[rows, ("ts", "dios")] += _t1 - _t0
    timingsdf.at[rows, ("var", "dios")] += _t2 - _t1
    timingsdf.at[rows, ("ass", "dios")] += _t3 - _t2
    return a, b, dios


def gen_random_timestamps(m, M):
    r = (M - m) * (np.random.randint(10, 90) + np.random.random()) * 0.01
    a, b = m + r, M - r
    return min(a, b), max(a, b)


def find_index_range(obj):
    min_ = None
    max_ = None
    for r in obj:
        m = obj[r].index.min()
        M = obj[r].index.max()
        try:
            min_ = min(min_, m)
            max_ = max(max_, M)
        except TypeError:
            min_ = m
            max_ = M
    return min_, max_


if __name__ == "__main__":
    import matplotlib.pyplot as plt

    # do not touch
    rows = 1

    # max increase of of rows
    # 1 = 10 # 2 = 100 # .... # 5 = 100'000
    iterations = 5
    runs = 1
    cols = 10

    profile_assignment = True

    # which to calc and plot
    use_df = False
    use_a = True
    use_b = True
    use_dios = True

    # plot options
    normalize_to_df = True
    plot_xlog = True
    plot_ylog = True

    # ########################

    v1 = "var1"
    v2 = "var2"
    for i in range(iterations):
        rows *= 10

        timingsdf.loc[rows] = (0,) * len(timingsdf.columns)

        df, a, b, dios = get_testset(rows, cols)
        t0, t4 = find_index_range(df)

        if use_df or normalize_to_df:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                df_timmings(df, t1, t2, vr1, None)

        if use_a:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                a_timings(a, t1, t2, vr1, None)

        if use_b:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                b_timings(b, t1, t2, vr1, None)

        if use_dios:
            for r in range(runs):
                t1, t2 = gen_random_timestamps(t0, t4)
                vr1 = var_prefix + str(np.random.randint(0, cols))
                dios_timings(dios, t1, t2, vr1, None)

    # calc the average
    timingsdf /= runs

    pd.set_option("display.max_columns", 100)

    df = timingsdf
    if not profile_assignment:
        df.drop(labels="ass", axis=1, level=0, inplace=True)
    print("timings:")
    print(df)
    df = df.swaplevel(axis=1)
    if normalize_to_df:
        a = df.loc[:, "a"] / df.loc[:, "df"]
        b = df.loc[:, "b"] / df.loc[:, "df"]
        c = df.loc[:, "df"] / df.loc[:, "df"]
        d = df.loc[:, "dios"] / df.loc[:, "df"]
        df.loc[:, "a"] = a.values
        df.loc[:, "b"] = b.values
        df.loc[:, "df"] = c.values
        df.loc[:, "dios"] = d.values
        all = df.copy()
        all.swaplevel(axis=1)
        print("\n\ndiff:")
        print(all)

    a = df.loc[:, ("a", slice(None))]
    b = df.loc[:, ("b", slice(None))]
    dios = df.loc[:, ("dios", slice(None))]
    df = df.loc[:, ("df", slice(None))]

    ax = plt.gca()
    ax.set_title(f"avg of: {runs} runs, columns: {cols}")

    if use_df:
        df.plot(logy=plot_ylog, logx=plot_xlog, linestyle="-", ax=ax)
    if use_a:
        a.plot(logy=plot_ylog, logx=plot_xlog, linestyle="--", ax=ax)
    if use_b:
        b.plot(logy=plot_ylog, logx=plot_xlog, linestyle=":", ax=ax)
    if use_dios:
        dios.plot(logy=plot_ylog, logx=plot_xlog, linestyle="-.", ax=ax)

    plt.show()