blocks-import-script: add --csv flag
siddarthkay committed Jan 3, 2025
1 parent e33ab7d commit 326666c
Showing 1 changed file with 121 additions and 101 deletions.
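For orientation, a usage sketch that is not part of the commit: the option the diff below adds is --csv-output, which takes a path for the summary CSV. The interpreter name and the baseline/contender stats-file names here are assumptions.

# Usage sketch only -- runs the updated script with the new flag.
# "baseline.csv" and "contender.csv" stand in for two block-import stats files;
# the flag name and its meaning come from the diff below.
import subprocess

subprocess.run(
    [
        "python",
        "scripts/block-import-stats.py",
        "baseline.csv",
        "contender.csv",
        "--csv-output",
        "stats-summary.csv",
    ],
    check=True,
)
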
222 changes: 121 additions & 101 deletions scripts/block-import-stats.py
@@ -2,7 +2,7 @@
import numpy as np
import matplotlib.pyplot as plt
import os
-
+import csv
import argparse

plt.rcParams["figure.figsize"] = [40, 30]
@@ -52,110 +52,130 @@ def formatBins(df: pd.DataFrame, bins: int):
    return df


-parser = argparse.ArgumentParser()
-parser.add_argument("baseline")
-parser.add_argument("contender")
-parser.add_argument("--plot", action="store_true")
-parser.add_argument(
-    "--bins",
-    default=10,
-    type=int,
-    help="Number of bins to group block ranges into in overview, 0=all rows",
-)
-parser.add_argument(
-    "--min-block-number",
-    default=500000,
-    type=int,
-    help="Skip blocks below the given number",
-)
-args = parser.parse_args()
-min_block_number = args.min_block_number

-baseline = readStats(args.baseline)
-contender = readStats(args.contender)

-start = max(min(baseline.index), min(contender.index))
-end = min(max(baseline.index), max(contender.index))

-# Check if there's any overlap in the time ranges
-if start > max(max(baseline.index), max(contender.index)) or end < min(min(baseline.index), min(contender.index)):
-    print("Error: No overlapping time ranges between baseline and contender datasets")
-    print(f"Baseline range: {min(baseline.index)} to {max(baseline.index)}")
-    print(f"Contender range: {min(contender.index)} to {max(contender.index)}")
-    exit(1)

-baseline = baseline.loc[(baseline.index >= start) & (baseline.index <= end)]
-contender = contender.loc[(contender.index >= start) & (contender.index <= end)]

-# Join the two frames then interpolate - this helps deal with runs that
-# haven't been using the same chunking and/or max-blocks
-df = baseline.merge(contender, on=("block_number", "blocks"), how="outer")
-df = df.interpolate(method="index").reindex(contender.index)
-df.reset_index(inplace=True)

-if df.block_number.iloc[-1] > min_block_number + df.block_number.iloc[0]:
-    cutoff = min(
-        df.block_number.iloc[-1] - min_block_number,
-        min_block_number,
+def write_csv_output(df_stats, csv_path):
+    """Write statistics to a CSV file"""
+    with open(csv_path, 'w', newline='') as csvfile:
+        csv_writer = csv.writer(csvfile)

+        csv_writer.writerow(['block_range', 'bps_x', 'bps_y', 'tps_x', 'tps_y',
+                             'time_x', 'time_y', 'bpsd', 'tpsd', 'timed'])

+        for idx, row in df_stats.iterrows():
+            csv_writer.writerow([
+                str(idx),  # block range
+                f"{row['bps_x']:.2f}",
+                f"{row['bps_y']:.2f}",
+                f"{row['tps_x']:.2f}",
+                f"{row['tps_y']:.2f}",
+                prettySecs(row['time_x']),
+                prettySecs(row['time_y']),
+                f"{row['bpsd']:.2%}",
+                f"{row['tpsd']:.2%}",
+                f"{row['timed']:.2%}"
+            ])


+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("baseline")
+    parser.add_argument("contender")
+    parser.add_argument("--plot", action="store_true")
+    parser.add_argument("--csv-output", type=str, help="Path to output CSV file")
+    parser.add_argument(
+        "--bins",
+        default=10,
+        type=int,
+        help="Number of bins to group block ranges into in overview, 0=all rows",
+    )
+    parser.add_argument(
+        "--min-block-number",
+        default=500000,
+        type=int,
+        help="Skip blocks below the given number",
+    )
+    args = parser.parse_args()
+    min_block_number = args.min_block_number

+    baseline = readStats(args.baseline)
+    contender = readStats(args.contender)

+    start = max(min(baseline.index), min(contender.index))
+    end = min(max(baseline.index), max(contender.index))

+    # Check if there's any overlap in the time ranges
+    if start > max(max(baseline.index), max(contender.index)) or end < min(min(baseline.index), min(contender.index)):
+        print("Error: No overlapping time ranges between baseline and contender datasets")
+        print(f"Baseline range: {min(baseline.index)} to {max(baseline.index)}")
+        print(f"Contender range: {min(contender.index)} to {max(contender.index)}")
+        exit(1)

+    baseline = baseline.loc[(baseline.index >= start) & (baseline.index <= end)]
+    contender = contender.loc[(contender.index >= start) & (contender.index <= end)]

+    # Join the two frames then interpolate - this helps deal with runs that
+    # haven't been using the same chunking and/or max-blocks
+    df = baseline.merge(contender, on=("block_number", "blocks"), how="outer")
+    df = df.interpolate(method="index").reindex(contender.index)
+    df.reset_index(inplace=True)

+    if df.block_number.iloc[-1] > min_block_number + df.block_number.iloc[0]:
+        cutoff = min(
+            df.block_number.iloc[-1] - min_block_number,
+            min_block_number,
+        )
+        df = df[df.block_number >= cutoff]

+    df["bpsd"] = (df.bps_y - df.bps_x) / df.bps_x
+    df["tpsd"] = (df.tps_y - df.tps_x) / df.tps_x.replace(0, 1)
+    df["timed"] = (df.time_y - df.time_x) / df.time_x

+    if args.plot:
+        plt.rcParams["axes.grid"] = True
+        fig = plt.figure()
+        bps = fig.add_subplot(2, 2, 1, title="Blocks per second (more is better)")
+        bpsd = fig.add_subplot(2, 2, 2, title="Difference (>0 is better)")
+        tps = fig.add_subplot(2, 2, 3, title="Transactions per second (more is better)")
+        tpsd = fig.add_subplot(2, 2, 4, title="Difference (>0 is better)")

+        bps.plot(df.block_number, df.bps_x.rolling(3).mean(), label="baseline")
+        bps.plot(df.block_number, df.bps_y.rolling(3).mean(), label="contender")
+        bpsd.plot(df.block_number, df.bpsd.rolling(3).mean())
+        tps.plot(df.block_number, df.tps_x.rolling(3).mean(), label="baseline")
+        tps.plot(df.block_number, df.tps_y.rolling(3).mean(), label="contender")
+        tpsd.plot(df.block_number, df.tpsd.rolling(3).mean())

+        bps.legend()
+        tps.legend()

+        fig.subplots_adjust(bottom=0.05, right=0.95, top=0.95, left=0.05)
+        plt.show()

+    stats_df = formatBins(df, args.bins).agg(
+        dict.fromkeys(["bps_x", "bps_y", "tps_x", "tps_y"], "mean")
+        | dict.fromkeys(["time_x", "time_y"], "sum")
+        | dict.fromkeys(["bpsd", "tpsd", "timed"], "mean")
+    )
-    )
-    df = df[df.block_number >= cutoff]

-df["bpsd"] = (df.bps_y - df.bps_x) / df.bps_x
-df["tpsd"] = (df.tps_y - df.tps_x) / df.tps_x.replace(0, 1)
-df["timed"] = (df.time_y - df.time_x) / df.time_x

-if args.plot:
-    plt.rcParams["axes.grid"] = True

-    fig = plt.figure()
-    bps = fig.add_subplot(2, 2, 1, title="Blocks per second (more is better)")
-    bpsd = fig.add_subplot(2, 2, 2, title="Difference (>0 is better)")
-    tps = fig.add_subplot(2, 2, 3, title="Transactions per second (more is better)")
-    tpsd = fig.add_subplot(2, 2, 4, title="Difference (>0 is better)")

-    bps.plot(df.block_number, df.bps_x.rolling(3).mean(), label="baseline")
-    bps.plot(df.block_number, df.bps_y.rolling(3).mean(), label="contender")

-    bpsd.plot(df.block_number, df.bpsd.rolling(3).mean())

-    tps.plot(df.block_number, df.tps_x.rolling(3).mean(), label="baseline")
-    tps.plot(df.block_number, df.tps_y.rolling(3).mean(), label="contender")
+    if args.csv_output:
+        write_csv_output(stats_df, args.csv_output)

-    tpsd.plot(df.block_number, df.tpsd.rolling(3).mean())
+    print(f"{os.path.basename(args.baseline)} vs {os.path.basename(args.contender)}")
+    print(stats_df.to_string(
+        formatters=dict.fromkeys(["bpsd", "tpsd", "timed"], "{:,.2%}".format)
+        | dict.fromkeys(["bps_x", "bps_y", "tps_x", "tps_y"], "{:,.2f}".format)
+        | dict.fromkeys(["time_x", "time_y"], prettySecs),
+    ))

-    bps.legend()
-    tps.legend()
+    total_blocks = df.block_number.max() - df.block_number.min()
+    time_xt = df.time_x.sum()
+    time_yt = df.time_y.sum()
+    timet = time_yt - time_xt

-    fig.subplots_adjust(bottom=0.05, right=0.95, top=0.95, left=0.05)
-    plt.show()
+    print(f"\nblocks: {total_blocks}, baseline: {prettySecs(time_xt)}, contender: {prettySecs(time_yt)}")
+    print(f"Time (total): {prettySecs(timet)}, {(timet/time_xt):.2%}")
+    print("\nbpsd = blocks per sec diff (+), tpsd = txs per sec diff, timed = time to process diff (-)")
+    print("+ = more is better, - = less is better")


-print(f"{os.path.basename(args.baseline)} vs {os.path.basename(args.contender)}")
-print(
-    formatBins(df, args.bins)
-    .agg(
-        dict.fromkeys(["bps_x", "bps_y", "tps_x", "tps_y"], "mean")
-        | dict.fromkeys(["time_x", "time_y"], "sum")
-        | dict.fromkeys(["bpsd", "tpsd", "timed"], "mean")
-    )
-    .to_string(
-        formatters=dict.fromkeys(["bpsd", "tpsd", "timed"], "{:,.2%}".format)
-        | dict.fromkeys(["bps_x", "bps_y", "tps_x", "tps_y"], "{:,.2f}".format)
-        | dict.fromkeys(["time_x", "time_y"], prettySecs),
-    )
-)

-print(
-    f"\nblocks: {df.block_number.max() - df.block_number.min()}, baseline: {prettySecs(df.time_x.sum())}, contender: {prettySecs(df.time_y.sum())}"
-)
-time_xt = df.time_x.sum()
-time_yt = df.time_y.sum()

-timet = time_yt - df.time_x.sum()
-print(f"Time (total): {prettySecs(timet)}, {(timet/time_xt):.2%}")

-print()
-print(
-    "bpsd = blocks per sec diff (+), tpsd = txs per sec diff, timed = time to process diff (-)"
-)
-print("+ = more is better, - = less is better")
+if __name__ == "__main__":
+    main()
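
A follow-up sketch, also not part of the commit: the file written via --csv-output can be read back with the standard csv module. The column names match the header row written by write_csv_output above; the file name is an assumption.

# Sketch: consuming the summary CSV produced by --csv-output.
# "stats-summary.csv" is a placeholder path; the columns mirror the header
# written by write_csv_output ('block_range', 'bps_x', ..., 'timed').
import csv

with open("stats-summary.csv", newline="") as f:
    for row in csv.DictReader(f):
        # bpsd/tpsd/timed are stored as pre-formatted percentage strings, e.g. "1.23%"
        print(row["block_range"], row["bps_x"], row["bps_y"], row["bpsd"])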
