import json
import sys
import os
def load_report(path):
    """Load a benchmark report from a JSON file.

    Args:
        path: Filesystem path of the JSON report to read.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces for the
        file's top-level value).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so reports with non-ASCII text load the same everywhere.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def _resolve_report_paths():
    """Return the ``(a_path, b_path)`` pair of report files to compare.

    With at least two command-line arguments, the first two are used as-is.
    Otherwise the two most recently modified ``*.json`` files in the
    ``benchmark_results`` directory next to this script are picked (older
    one first). If neither source yields two reports, the usage message is
    printed and the process exits with status 1.
    """
    if len(sys.argv) >= 3:
        return sys.argv[1], sys.argv[2]

    results_dir = os.path.join(os.path.dirname(__file__), "benchmark_results")
    if os.path.isdir(results_dir):
        reports = sorted(
            (
                os.path.join(results_dir, name)
                for name in os.listdir(results_dir)
                if name.endswith(".json")
            ),
            key=os.path.getmtime,
        )
        if len(reports) >= 2:
            return reports[-2], reports[-1]

    print("Usage: compare_benchmarks.py <report_a.json> <report_b.json>")
    sys.exit(1)


def _format_delta(va, vb, lower_better=True):
    """Describe the relative change from ``va`` to ``vb`` as a short string.

    Args:
        va: Value from report A (the reference).
        vb: Value from report B.
        lower_better: True when a decrease counts as an improvement.

    Returns:
        ``" ="`` when both values are equal, ``" N/A"`` when the reference
        is zero (no percentage can be computed), otherwise a signed
        percentage followed by ``(better)`` or ``(worse)``.
    """
    # Equal values are "no change"; without this check an unchanged non-zero
    # metric would be tagged "worse" or "better" depending on lower_better.
    if va == vb:
        return " ="
    if va == 0:
        return " N/A"
    pct = (vb - va) / abs(va) * 100
    direction = "better" if (pct < 0) == lower_better else "worse"
    return f"{pct:+.1f}% ({direction})"


def main():
    """Compare two benchmark reports and print a per-profile and overall diff.

    The report paths come from the command line or, failing that, from the
    two newest JSON files in the ``benchmark_results`` directory beside the
    script. Each report is read for its ``"label"`` and its ``"profiles"``
    list; profiles are matched by their ``"profile"`` value. For every
    profile present in both reports a table of metrics is printed, followed
    by totals across those profiles and a short verdict.

    Raises:
        KeyError: If a report lacks ``"label"`` or ``"profiles"``, or a
            profile entry lacks ``"profile"``.
        SystemExit: With status 1 when two report paths cannot be found.
    """
    a_path, b_path = _resolve_report_paths()
    a = load_report(a_path)
    b = load_report(b_path)
    label_a = a["label"]
    label_b = b["label"]

    print(f"Comparing: {label_a} vs {label_b}")
    print(f" A: {os.path.basename(a_path)}")
    print(f" B: {os.path.basename(b_path)}")
    print()

    a_profiles = {p["profile"]: p for p in a["profiles"]}
    b_profiles = {p["profile"]: p for p in b["profiles"]}
    # dict.fromkeys de-duplicates while keeping first-seen order:
    # report A's profiles first, then any that only appear in report B.
    all_profiles = list(dict.fromkeys([*a_profiles, *b_profiles]))

    header = f"{'Profile':<20} {'Metric':<15} {label_a:>12} {label_b:>12} {'Delta':>20}"
    print(header)
    print("-" * len(header))

    total_keys = (
        "switches",
        "switch_time_s",
        "requests",
        "wall_time_s",
        "avg_wait_weighted",
        "wait_count",
    )
    totals_a = dict.fromkeys(total_keys, 0)
    totals_b = dict.fromkeys(total_keys, 0)

    # (report key, row label, whether a lower value is an improvement)
    profile_metrics = [
        ("switches", "Switches", True),
        ("switch_time_s", "Switch Time", True),
        ("gpu_serving_pct", "GPU Serving %", False),
        ("avg_wait_s", "Avg Wait", True),
        ("requests", "Requests", False),
    ]

    for profile in all_profiles:
        pa = a_profiles.get(profile)
        pb = b_profiles.get(profile)
        if not pa or not pb:
            print(f"{profile:<20} (missing in one report)")
            continue

        for key, label, lower in profile_metrics:
            va = float(pa.get(key, 0))
            vb = float(pb.get(key, 0))
            d = _format_delta(va, vb, lower_better=lower)
            # Show the profile name only on the first row of its group.
            pfx = profile if key == "switches" else ""
            print(f"{pfx:<20} {label:<15} {va:>12.1f} {vb:>12.1f} {d:>20}")

        # Missing metrics count as 0 here, matching the table rows above,
        # so an incomplete profile cannot abort the run after printing.
        for totals, prof in ((totals_a, pa), (totals_b, pb)):
            requests = prof.get("requests", 0)
            totals["switches"] += prof.get("switches", 0)
            totals["switch_time_s"] += prof.get("switch_time_s", 0)
            totals["requests"] += requests
            totals["wall_time_s"] += prof.get("wall_time_s", 0)
            # Weight each profile's average wait by its request count so the
            # overall average is per-request rather than per-profile.
            totals["avg_wait_weighted"] += prof.get("avg_wait_s", 0) * requests
            totals["wait_count"] += requests
        print()

    print("=" * len(header))
    print("OVERALL SUMMARY")
    print()

    for metric, key, lower in [
        ("Total Switches", "switches", True),
        ("Total Switch Time", "switch_time_s", True),
        ("Total Requests", "requests", False),
    ]:
        va = totals_a[key]
        vb = totals_b[key]
        d = _format_delta(va, vb, lower_better=lower)
        print(f" {metric:<25} {va:>10.1f} {vb:>10.1f} {d:>20}")

    # Overall GPU serving share: the fraction of wall time not spent switching.
    gpu_a = (1 - totals_a["switch_time_s"] / totals_a["wall_time_s"]) * 100 if totals_a["wall_time_s"] > 0 else 0
    gpu_b = (1 - totals_b["switch_time_s"] / totals_b["wall_time_s"]) * 100 if totals_b["wall_time_s"] > 0 else 0
    d = _format_delta(gpu_a, gpu_b, lower_better=False)
    print(f" {'Overall GPU Serving %':<25} {gpu_a:>10.1f} {gpu_b:>10.1f} {d:>20}")

    avg_a = totals_a["avg_wait_weighted"] / totals_a["wait_count"] if totals_a["wait_count"] > 0 else 0
    avg_b = totals_b["avg_wait_weighted"] / totals_b["wait_count"] if totals_b["wait_count"] > 0 else 0
    d = _format_delta(avg_a, avg_b, lower_better=True)
    print(f" {'Overall Avg Wait':<25} {avg_a:>10.2f} {avg_b:>10.2f} {d:>20}")
    print()

    # Positive numbers below mean report B improved on report A.
    switch_improvement = (
        (totals_a["switches"] - totals_b["switches"]) / totals_a["switches"] * 100
        if totals_a["switches"] > 0
        else 0
    )
    gpu_improvement = gpu_b - gpu_a
    wait_improvement = (avg_a - avg_b) / avg_a * 100 if avg_a > 0 else 0

    print("VERDICT:")
    print(f" Switch reduction: {switch_improvement:+.1f}%")
    print(f" GPU serving change: {gpu_improvement:+.1f} percentage points")
    print(f" Wait time change: {wait_improvement:+.1f}%")
    if gpu_improvement > 0 and switch_improvement > 0:
        print(f" >> {label_b} is BETTER than {label_a}")
    elif gpu_improvement < -5:
        # Only a drop of more than 5 percentage points is called out as worse.
        print(f" >> {label_b} is WORSE than {label_a}")
    else:
        print(" >> Results are MIXED — review per-profile data")
# Script entry point: run the comparison only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()