import argparse
from pathlib import Path
import numpy as np
import pandas as pd
from plotnine import (
ggplot,
aes,
geom_line,
geom_point,
geom_ribbon,
geom_hline,
geom_vline,
geom_errorbar,
geom_bar,
geom_density,
geom_segment,
geom_text,
geom_abline,
labs,
theme_minimal,
theme,
element_text,
element_line,
element_rect,
element_blank,
scale_x_continuous,
scale_x_log10,
scale_y_continuous,
scale_color_manual,
scale_fill_manual,
coord_cartesian,
annotate,
facet_wrap,
position_dodge,
after_stat,
)
# Shared colour palette (hex strings) referenced by the plotting functions
# and the theme in this module.
COLORS = {
    "primary": "#2563eb",
    "secondary": "#0d9488",
    "accent": "#f97316",
    "error": "#ef4444",
    "text": "#374151",
    "muted": "#9ca3af",
    "light": "#e5e7eb",
    "background": "#ffffff",
}
def theme_timing_oracle():
    """Return the plotnine theme shared by the plots in this module.

    The result is ``theme_minimal()`` combined with overrides for fonts,
    grid lines, backgrounds, a bottom legend, and a default 8x5 figure at
    150 dpi. All colours come from ``COLORS``.
    """
    text_color = COLORS["text"]
    canvas = COLORS["background"]

    overrides = theme(
        text=element_text(family="sans-serif", color=text_color),
        plot_title=element_text(size=14, weight="bold", margin={"b": 12}),
        plot_subtitle=element_text(size=10, color=COLORS["muted"], margin={"b": 8}),
        axis_title=element_text(size=10, margin={"t": 8, "r": 8}),
        axis_text=element_text(size=9, color=text_color),
        legend_title=element_text(size=9, weight="bold"),
        legend_text=element_text(size=8),
        panel_grid_major=element_line(color=COLORS["light"], size=0.5),
        panel_grid_minor=element_blank(),
        panel_background=element_rect(fill=canvas),
        plot_background=element_rect(fill=canvas),
        legend_position="bottom",
        legend_background=element_rect(fill=canvas, color=None),
        figure_size=(8, 5),
        dpi=150,
    )
    return theme_minimal() + overrides
def wilson_ci(successes: int, trials: int, confidence: float = 0.95) -> tuple[float, float]:
    """Return a two-sided confidence interval for a binomial proportion.

    Args:
        successes: Number of observed successes.
        trials: Number of trials.
        confidence: Two-sided confidence level, strictly between 0 and 1.

    Returns:
        ``(lower, upper)`` bounds, each within ``[0, 1]``.

        * ``trials == 0`` yields the uninformative interval ``(0.0, 1.0)``.
        * When every trial failed or every trial succeeded, the open end is
          taken from the closed-form expression ``(alpha / 2) ** (1 / n)``
          instead of the Wilson formula.
        * Otherwise the Wilson score interval is used.

    Raises:
        ValueError: If ``confidence`` is not strictly between 0 and 1.
    """
    if not 0.0 < confidence < 1.0:
        raise ValueError(f"confidence must be in (0, 1), got {confidence}")
    if trials == 0:
        return (0.0, 1.0)

    n = trials
    tail = (1.0 - confidence) / 2.0

    if successes == 0:
        return (0.0, 1.0 - tail ** (1.0 / n))
    if successes == trials:
        return (tail ** (1.0 / n), 1.0)

    # Keep the historical rounded constants for the two common levels so
    # existing results are unchanged; every other level gets its real
    # normal quantile (previously anything but 95% silently used 1.645).
    if abs(confidence - 0.95) < 0.001:
        z = 1.96
    elif abs(confidence - 0.90) < 0.001:
        z = 1.645
    else:
        from statistics import NormalDist

        z = NormalDist().inv_cdf(1.0 - tail)

    p_hat = successes / n
    z2 = z * z
    denom = 1.0 + z2 / n
    center = (p_hat + z2 / (2.0 * n)) / denom
    margin = z * np.sqrt((p_hat * (1.0 - p_hat) + z2 / (4.0 * n)) / n) / denom

    # float() so callers always receive plain Python floats, not numpy scalars.
    lower = float(max(0.0, center - margin))
    upper = float(min(1.0, center + margin))
    return (lower, upper)
def infer_effect_from_test_name(test_name: str) -> float:
    """Guess the injected effect size (ns) encoded in a test name.

    Matching is case-insensitive. A base value ``theta`` is chosen from
    substrings of the name (adjacent network: 100, research: 50, remote
    network: 50000, anything else: 100). Then:

    * ``<number>x_theta`` returns ``theta * number``;
    * otherwise ``<number>ns`` returns ``number``;
    * otherwise ``0.0`` is returned.

    ``<number>`` may contain a decimal part (``1.5x_theta``); integer-only
    matching would read that as ``5x_theta``.

    Args:
        test_name: Name of the test case.

    Returns:
        The inferred effect in nanoseconds, or ``0.0`` when nothing matches.
    """
    import re

    name = test_name.lower()

    if "adjacent_network" in name or "adjacentnetwork" in name:
        theta = 100.0
    elif "research" in name:
        theta = 50.0
    elif "remote_network" in name or "remotenetwork" in name:
        theta = 50000.0
    else:
        theta = 100.0

    number = r'(\d+(?:\.\d+)?)'

    match = re.search(number + r'x_theta', name)
    if match:
        return theta * float(match.group(1))

    match = re.search(number + r'ns', name)
    if match:
        return float(match.group(1))

    return 0.0
def plot_power_curve(df: pd.DataFrame, output_path: Path):
    """Plot detection rate against injected effect size as ``power_curve.png``.

    Uses rows with ``test_type == "power"``. If every such row has an
    injected effect of 0, effect sizes are inferred from the test names.
    Rows are grouped by effect size; a trial counts as detected when its
    ``decision`` is ``"fail"``. The curve is drawn on a log10 x-axis with a
    Wilson interval ribbon and reference lines at 70% and 90%.

    Args:
        df: Trial records with ``test_type``, ``test_name``, ``decision``
            and ``injected_effect_ns`` columns.
        output_path: Directory the PNG is written to.
    """
    effect_col = "injected_effect_ns"

    trials = df[df["test_type"] == "power"].copy()
    if trials.empty:
        print(" No power test data found, skipping power curve")
        return

    if (trials[effect_col] == 0).all():
        trials[effect_col] = trials["test_name"].apply(infer_effect_from_test_name)
        print(" Inferred effect sizes from test names")

    trials = trials[trials[effect_col] > 0]
    if trials.empty:
        print(" Could not determine effect sizes, skipping power curve")
        return

    agg = trials.groupby(effect_col).agg(
        detected=("decision", lambda x: (x == "fail").sum()),
        total=("decision", "count"),
    ).reset_index()
    agg["power"] = agg["detected"] / agg["total"]

    # One Wilson interval per effect size, split into its two bounds.
    bounds = [
        wilson_ci(int(hits), int(count))
        for hits, count in zip(agg["detected"], agg["total"])
    ]
    agg["ci_low"] = [low for low, _ in bounds]
    agg["ci_high"] = [high for _, high in bounds]

    def as_percent(values):
        return [f"{v:.0%}" for v in values]

    blue = COLORS["primary"]
    grey = COLORS["muted"]
    label_x = agg[effect_col].max() * 0.7

    p = ggplot(agg, aes(x="injected_effect_ns", y="power"))
    p = (
        p
        + geom_ribbon(aes(ymin="ci_low", ymax="ci_high"), fill=blue, alpha=0.2)
        + geom_line(color=blue, size=1.2)
        + geom_point(color=blue, size=3)
        + geom_hline(yintercept=0.70, linetype="dashed", color=grey, size=0.7)
        + geom_hline(yintercept=0.90, linetype="dotted", color=grey, size=0.7)
    )
    p = p + labs(
        title="Power Curve: Detection Rate vs Effect Size",
        subtitle="Shaded region shows 95% Wilson confidence interval",
        x="Injected Effect Size (ns)",
        y="Detection Rate",
    )
    p = (
        p
        + scale_y_continuous(labels=as_percent, limits=(0, 1.05))
        + scale_x_log10()
        + theme_timing_oracle()
        + annotate("text", x=label_x, y=0.72,
                   label="70% min @ 2\u03b8", size=8, color=grey)
        + annotate("text", x=label_x, y=0.92,
                   label="90% min @ 5\u03b8", size=8, color=grey)
    )

    destination = output_path / "power_curve.png"
    p.save(destination, width=8, height=5, dpi=200)
    print(f" Saved: {destination}")
def plot_fpr_calibration(df: pd.DataFrame, output_path: Path):
    """Plot the false positive rate per FPR test as ``fpr_calibration.png``.

    Uses rows with ``test_type == "fpr"``, grouped by ``test_name``. A trial
    is a false positive when its ``decision`` is ``"fail"``. Each test gets a
    point with a Wilson interval error bar; reference lines mark 5% and 10%.

    Args:
        df: Trial records with ``test_type``, ``test_name`` and ``decision``
            columns.
        output_path: Directory the PNG is written to.
    """
    null_trials = df[df["test_type"] == "fpr"].copy()
    if null_trials.empty:
        print(" No FPR test data found, skipping FPR calibration plot")
        return

    agg = null_trials.groupby("test_name").agg(
        false_positives=("decision", lambda x: (x == "fail").sum()),
        total=("decision", "count"),
    ).reset_index()
    agg["fpr"] = agg["false_positives"] / agg["total"]

    bounds = [
        wilson_ci(int(hits), int(count))
        for hits, count in zip(agg["false_positives"], agg["total"])
    ]
    agg["ci_low"] = [low for low, _ in bounds]
    agg["ci_high"] = [high for _, high in bounds]

    # Axis label: test name without the common prefix, underscores as spaces.
    short_names = agg["test_name"].str.replace("fpr_quick_", "")
    agg["label"] = short_names.str.replace("_", " ")

    def as_percent(values):
        return [f"{v:.0%}" for v in values]

    ink = COLORS["text"]
    blue = COLORS["primary"]
    orange = COLORS["accent"]

    p = ggplot(agg, aes(x="label", y="fpr"))
    p = (
        p
        + geom_errorbar(aes(ymin="ci_low", ymax="ci_high"), width=0.2, color=ink, size=0.8)
        + geom_point(color=ink, size=4)
        + geom_hline(yintercept=0.05, linetype="dashed", color=blue, size=0.8)
        + geom_hline(yintercept=0.10, linetype="dotted", color=orange, size=0.7)
    )
    p = p + labs(
        title="FPR Calibration: False Positive Rate Under Null",
        subtitle="Error bars show 95% Wilson confidence interval",
        x="Test Configuration",
        y="False Positive Rate",
    )
    p = (
        p
        + scale_y_continuous(labels=as_percent, limits=(0, 0.20))
        + theme_timing_oracle()
        + theme(axis_text_x=element_text(angle=15, hjust=1))
        + annotate("text", x=0.5, y=0.055, label="\u03b1 = 5%", size=8, color=blue, ha="left")
        + annotate("text", x=0.5, y=0.105, label="max = 10%", size=8, color=orange, ha="left")
    )

    destination = output_path / "fpr_calibration.png"
    p.save(destination, width=7, height=5, dpi=200)
    print(f" Saved: {destination}")
def plot_coverage_calibration(df: pd.DataFrame, output_path: Path):
    """Plot CI coverage per effect size as ``coverage_calibration.png``.

    Uses rows with ``test_type == "coverage"`` that have both ``ci_low_ns``
    and ``ci_high_ns``. A trial is covered when the injected effect lies
    within ``[ci_low_ns, ci_high_ns]``. Coverage per injected effect is drawn
    as bars with Wilson interval error bars and reference lines at 95% and
    85%.

    Args:
        df: Trial records with ``test_type``, ``injected_effect_ns``,
            ``ci_low_ns`` and ``ci_high_ns`` columns.
        output_path: Directory the PNG is written to.
    """
    trials = df[df["test_type"] == "coverage"].copy()
    if trials.empty:
        print(" No coverage test data found, skipping coverage plot")
        return

    trials = trials.dropna(subset=["ci_low_ns", "ci_high_ns"])
    if trials.empty:
        print(" No valid coverage data with CIs, skipping coverage plot")
        return

    truth = trials["injected_effect_ns"]
    above_lower = trials["ci_low_ns"] <= truth
    below_upper = truth <= trials["ci_high_ns"]
    trials["covered"] = above_lower & below_upper

    agg = trials.groupby("injected_effect_ns").agg(
        covered_count=("covered", "sum"),
        total=("covered", "count"),
    ).reset_index()
    agg["coverage"] = agg["covered_count"] / agg["total"]

    bounds = [
        wilson_ci(int(hits), int(count))
        for hits, count in zip(agg["covered_count"], agg["total"])
    ]
    agg["ci_low"] = [low for low, _ in bounds]
    agg["ci_high"] = [high for _, high in bounds]

    def as_percent(values):
        return [f"{v:.0%}" for v in values]

    blue = COLORS["primary"]
    label_x = agg["injected_effect_ns"].min() * 0.9

    p = ggplot(agg, aes(x="injected_effect_ns", y="coverage"))
    p = (
        p
        + geom_bar(stat="identity", fill=COLORS["secondary"], alpha=0.8, width=40)
        + geom_errorbar(aes(ymin="ci_low", ymax="ci_high"), width=20, color=COLORS["text"], size=0.7)
        + geom_hline(yintercept=0.95, linetype="dashed", color=blue, size=0.8)
        + geom_hline(yintercept=0.85, linetype="dotted", color=COLORS["muted"], size=0.7)
    )
    p = p + labs(
        title="Coverage Calibration: 95% CI Contains True Value",
        subtitle="Nominal coverage = 95%, minimum acceptable = 85%",
        x="Injected Effect Size (ns)",
        y="Coverage Rate",
    )
    p = (
        p
        + scale_y_continuous(labels=as_percent, limits=(0.75, 1.02))
        + theme_timing_oracle()
        + annotate("text", x=label_x, y=0.96,
                   label="95% nominal", size=8, color=blue, ha="left")
    )

    destination = output_path / "coverage_calibration.png"
    p.save(destination, width=7, height=5, dpi=200)
    print(f" Saved: {destination}")
def plot_effect_estimation(df: pd.DataFrame, output_path: Path):
    """Scatter estimated against injected effect as ``effect_estimation.png``.

    Uses every row with a non-null ``shift_ns`` and a positive
    ``injected_effect_ns``. Points and their CI whiskers are coloured by
    whether ``[ci_low_ns, ci_high_ns]`` contains the injected effect; a
    dashed y = x line marks perfect estimation.

    Args:
        df: Trial records with ``shift_ns``, ``injected_effect_ns``,
            ``ci_low_ns`` and ``ci_high_ns`` columns.
        output_path: Directory the PNG is written to.
    """
    has_estimate = df["shift_ns"].notna()
    has_effect = df["injected_effect_ns"] > 0
    est_df = df[has_estimate & has_effect].copy()
    if est_df.empty:
        print(" No effect estimation data found, skipping scatter plot")
        return

    truth = est_df["injected_effect_ns"]
    above_lower = est_df["ci_low_ns"] <= truth
    below_upper = truth <= est_df["ci_high_ns"]
    est_df["covered"] = above_lower & below_upper

    legend_names = {True: "CI covers true", False: "CI misses true"}
    est_df["covered_label"] = est_df["covered"].map(legend_names)

    # Shared axis limit: 10% beyond the largest true or estimated value.
    max_val = max(est_df["injected_effect_ns"].max(), est_df["shift_ns"].max()) * 1.1

    palette = {
        "CI covers true": COLORS["primary"],
        "CI misses true": COLORS["error"],
    }

    p = ggplot(est_df, aes(x="injected_effect_ns", y="shift_ns", color="covered_label"))
    p = (
        p
        + geom_abline(intercept=0, slope=1, linetype="dashed", color=COLORS["muted"], size=0.8)
        + geom_errorbar(aes(ymin="ci_low_ns", ymax="ci_high_ns"), width=0, alpha=0.4, size=0.5)
        + geom_point(size=2, alpha=0.7)
        + scale_color_manual(values=palette)
    )
    p = p + labs(
        title="Effect Estimation Accuracy",
        subtitle="Dashed line shows perfect estimation (y = x)",
        x="True Injected Effect (ns)",
        y="Estimated Effect (ns)",
        color="",
    )
    p = (
        p
        + coord_cartesian(xlim=(0, max_val), ylim=(0, max_val))
        + theme_timing_oracle()
        + theme(legend_position="bottom")
    )

    destination = output_path / "effect_estimation.png"
    p.save(destination, width=6, height=6, dpi=200)
    print(f" Saved: {destination}")
def plot_bayesian_calibration(df: pd.DataFrame, output_path: Path):
    """Plot stated P(leak) against empirical rate as ``bayesian_calibration.png``.

    Uses every row with a non-null ``leak_probability``. Probabilities are cut
    into ten equal-width bins labelled by their midpoints; within each bin
    the empirical rate is the share of trials with a positive
    ``injected_effect_ns``. A dashed y = x line marks perfect calibration and
    the ribbon shows a Wilson interval per bin.

    Args:
        df: Trial records with ``leak_probability`` and
            ``injected_effect_ns`` columns.
        output_path: Directory the PNG is written to.
    """
    scored = df[df["leak_probability"].notna()].copy()
    if scored.empty:
        print(" No Bayesian calibration data found, skipping calibration curve")
        return

    scored["is_true_positive"] = scored["injected_effect_ns"] > 0

    bin_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    bin_centres = [0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95]
    scored["prob_bin"] = pd.cut(
        scored["leak_probability"],
        bins=bin_edges,
        labels=bin_centres,
        include_lowest=True,
    )

    agg = scored.groupby("prob_bin", observed=True).agg(
        true_positives=("is_true_positive", "sum"),
        total=("is_true_positive", "count"),
    ).reset_index()
    # The bin labels are categorical; convert them to numbers for plotting.
    agg["prob_bin"] = agg["prob_bin"].astype(float)
    agg["empirical_rate"] = agg["true_positives"] / agg["total"]

    bounds = [
        wilson_ci(int(hits), int(count))
        for hits, count in zip(agg["true_positives"], agg["total"])
    ]
    agg["ci_low"] = [low for low, _ in bounds]
    agg["ci_high"] = [high for _, high in bounds]

    def as_percent(values):
        return [f"{v:.0%}" for v in values]

    blue = COLORS["primary"]

    p = ggplot(agg, aes(x="prob_bin", y="empirical_rate"))
    p = (
        p
        + geom_abline(intercept=0, slope=1, linetype="dashed", color=COLORS["muted"], size=0.8)
        + geom_ribbon(aes(ymin="ci_low", ymax="ci_high"), fill=blue, alpha=0.2)
        + geom_line(color=blue, size=1)
        + geom_point(color=blue, size=3)
    )
    p = p + labs(
        title="Bayesian Calibration Curve",
        subtitle="Stated P(leak) vs empirical true positive rate (dashed = perfect calibration)",
        x="Stated P(leak)",
        y="Empirical True Positive Rate",
    )
    p = (
        p
        + scale_x_continuous(
            labels=as_percent,
            breaks=[0.05, 0.25, 0.50, 0.75, 0.95],
            limits=(0, 1),
        )
        + scale_y_continuous(labels=as_percent, limits=(0, 1))
        + coord_cartesian(xlim=(0, 1), ylim=(0, 1))
        + theme_timing_oracle()
    )

    destination = output_path / "bayesian_calibration.png"
    p.save(destination, width=6, height=6, dpi=200)
    print(f" Saved: {destination}")
def plot_power_curves_faceted(df: pd.DataFrame, output_path: Path):
    """Plot one power curve per attacker model as ``power_curves_faceted.png``.

    Uses rows with ``test_type == "power"``. The attacker model is derived
    from keywords in ``test_name``; if every row has an injected effect of 0,
    effect sizes are inferred from the test names. Nothing is drawn unless at
    least two distinct models are present.

    Args:
        df: Trial records with ``test_type``, ``test_name``, ``decision``
            and ``injected_effect_ns`` columns.
        output_path: Directory the PNG is written to.
    """
    trials = df[df["test_type"] == "power"].copy()
    if trials.empty:
        print(" No power test data found, skipping faceted power curves")
        return

    # Checked in order; the first rule with a matching keyword wins.
    model_rules = (
        (("adjacent",), "AdjacentNetwork"),
        (("remote",), "RemoteNetwork"),
        (("research",), "Research"),
        (("shared",), "SharedHardware"),
        (("pq", "quantum"), "PostQuantumSentinel"),
    )

    def extract_model(name: str) -> str:
        lowered = name.lower()
        for keywords, model in model_rules:
            if any(keyword in lowered for keyword in keywords):
                return model
        return "Unknown"

    trials["attacker_model"] = trials["test_name"].apply(extract_model)

    effect_col = "injected_effect_ns"
    if (trials[effect_col] == 0).all():
        trials[effect_col] = trials["test_name"].apply(infer_effect_from_test_name)

    trials = trials[trials[effect_col] > 0]
    if trials.empty:
        print(" Could not determine effect sizes, skipping faceted power curves")
        return

    if trials["attacker_model"].nunique() <= 1:
        print(" Only one attacker model found, skipping faceted plot")
        return

    agg = trials.groupby(["attacker_model", "injected_effect_ns"]).agg(
        detected=("decision", lambda x: (x == "fail").sum()),
        total=("decision", "count"),
    ).reset_index()
    agg["power"] = agg["detected"] / agg["total"]

    bounds = [
        wilson_ci(int(hits), int(count))
        for hits, count in zip(agg["detected"], agg["total"])
    ]
    agg["ci_low"] = [low for low, _ in bounds]
    agg["ci_high"] = [high for _, high in bounds]

    def as_percent(values):
        return [f"{v:.0%}" for v in values]

    blue = COLORS["primary"]
    grey = COLORS["muted"]

    p = ggplot(agg, aes(x="injected_effect_ns", y="power"))
    p = (
        p
        + geom_ribbon(aes(ymin="ci_low", ymax="ci_high"), fill=blue, alpha=0.2)
        + geom_line(color=blue, size=1)
        + geom_point(color=blue, size=2.5)
        + geom_hline(yintercept=0.70, linetype="dashed", color=grey, size=0.5)
        + geom_hline(yintercept=0.90, linetype="dotted", color=grey, size=0.5)
        + facet_wrap("~ attacker_model", scales="free_x", ncol=2)
    )
    p = p + labs(
        title="Power Curves by Attacker Model",
        subtitle="Detection rate vs effect size (horizontal lines: 70%, 90% targets)",
        x="Injected Effect Size (ns)",
        y="Detection Rate",
    )
    p = (
        p
        + scale_y_continuous(labels=as_percent, limits=(0, 1.05))
        + scale_x_log10()
        + theme_timing_oracle()
        + theme(figure_size=(10, 8))
    )

    destination = output_path / "power_curves_faceted.png"
    p.save(destination, width=10, height=8, dpi=200)
    print(f" Saved: {destination}")
def plot_estimation_bias(df: pd.DataFrame, output_path: Path):
    """Plot relative estimation bias per effect size as ``estimation_bias.png``.

    Uses every row with a non-null ``shift_ns`` and a positive
    ``injected_effect_ns``. For each injected effect the mean percentage bias
    is drawn with a ``mean +/- 1.96 * std / sqrt(n)`` error bar and labelled
    with the RMSE as a percentage of the true effect. Dashed lines mark
    +/-20%.

    Args:
        df: Trial records with ``shift_ns`` and ``injected_effect_ns``
            columns.
        output_path: Directory the PNG is written to.
    """
    has_estimate = df["shift_ns"].notna()
    has_effect = df["injected_effect_ns"] > 0
    est_df = df[has_estimate & has_effect].copy()
    if est_df.empty:
        print(" No effect estimation data found, skipping bias plot")
        return

    est_df["bias_ns"] = est_df["shift_ns"] - est_df["injected_effect_ns"]
    est_df["bias_pct"] = est_df["bias_ns"] / est_df["injected_effect_ns"] * 100

    agg = est_df.groupby("injected_effect_ns").agg(
        mean_bias_pct=("bias_pct", "mean"),
        std_bias_pct=("bias_pct", "std"),
        mean_estimate=("shift_ns", "mean"),
        count=("shift_ns", "count"),
    ).reset_index()

    def compute_rmse(group):
        # ``group.name`` is the group key, i.e. the true injected effect.
        squared_error = (group["shift_ns"] - group.name) ** 2
        return np.sqrt(squared_error.mean())

    rmse_df = (
        est_df.groupby("injected_effect_ns")
        .apply(compute_rmse, include_groups=False)
        .reset_index()
    )
    rmse_df.columns = ["injected_effect_ns", "rmse"]
    agg = agg.merge(rmse_df, on="injected_effect_ns")
    agg["rmse_pct"] = agg["rmse"] / agg["injected_effect_ns"] * 100

    half_width = 1.96 * agg["std_bias_pct"] / np.sqrt(agg["count"])
    agg["ci_low"] = agg["mean_bias_pct"] - half_width
    agg["ci_high"] = agg["mean_bias_pct"] + half_width

    def with_percent_sign(values):
        return [f"{v:.0f}%" for v in values]

    orange = COLORS["accent"]
    grey = COLORS["muted"]

    p = ggplot(agg, aes(x="injected_effect_ns", y="mean_bias_pct"))
    p = (
        p
        + geom_hline(yintercept=0, linetype="solid", color=grey, size=0.8)
        + geom_hline(yintercept=20, linetype="dashed", color=orange, size=0.5)
        + geom_hline(yintercept=-20, linetype="dashed", color=orange, size=0.5)
        + geom_errorbar(aes(ymin="ci_low", ymax="ci_high"), width=0.05, color=COLORS["text"], size=0.7)
        + geom_point(color=COLORS["primary"], size=4)
    )
    p = p + labs(
        title="Estimation Bias by Effect Size",
        subtitle="Dashed lines show \u00b120% bias threshold",
        x="True Effect Size (ns)",
        y="Bias ((estimate - true) / true \u00d7 100%)",
    )
    p = (
        p
        + scale_x_log10()
        + scale_y_continuous(labels=with_percent_sign)
        + theme_timing_oracle()
        + geom_text(aes(label="rmse_pct"), format_string="{:.0f}% RMSE", nudge_y=10, size=7, color=grey)
    )

    destination = output_path / "estimation_bias.png"
    p.save(destination, width=8, height=5, dpi=200)
    print(f" Saved: {destination}")
def plot_compact_dashboard(df: pd.DataFrame, output_path: Path):
    """Draw a one-page matplotlib dashboard and save it as ``dashboard.png``.

    The figure has four areas:

    * top left: false positive rate per FPR test (horizontal bars);
    * top right: detection rate per injected effect size (horizontal bars);
    * middle: scatter of estimated against true effect, one point per trial;
    * bottom: a text table with overall FPR, power and mean absolute bias.

    A panel with no usable data shows a placeholder message instead.

    Args:
        df: Trial records. Columns read here: ``test_type``, ``test_name``,
            ``decision``, ``injected_effect_ns``, ``shift_ns`` and, when
            present, ``ci_low_ns`` / ``ci_high_ns``.
        output_path: Directory the PNG is written to.
    """
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    from matplotlib.gridspec import GridSpec

    fig = plt.figure(figsize=(12, 10), facecolor='white')
    gs = GridSpec(3, 2, figure=fig, height_ratios=[1, 2, 1.5], hspace=0.3, wspace=0.3)

    PASS_COLOR = '#22c55e'
    FAIL_COLOR = '#ef4444'
    PRIMARY = '#2563eb'
    MUTED = '#9ca3af'

    # ---- Panel 1: false positive rate per test -------------------------
    ax_fpr = fig.add_subplot(gs[0, 0])
    fpr_df = df[df["test_type"] == "fpr"].copy()
    if not fpr_df.empty:
        fpr_groups = fpr_df.groupby("test_name").agg(
            failures=("decision", lambda x: (x == "fail").sum()),
            total=("decision", "count"),
        ).reset_index()
        fpr_groups["fpr"] = fpr_groups["failures"] / fpr_groups["total"]
        # Short label; names matching neither pattern fall back to their
        # first 20 characters.
        fpr_groups["label"] = fpr_groups["test_name"].apply(
            lambda x: "fixed vs fixed" if "fixed_vs_fixed" in x else ("random vs random" if "random" in x else x[:20])
        )

        y_pos = range(len(fpr_groups))
        colors = [PASS_COLOR if fpr <= 0.10 else FAIL_COLOR for fpr in fpr_groups["fpr"]]
        bars = ax_fpr.barh(y_pos, fpr_groups["fpr"] * 100, color=colors, height=0.6, alpha=0.8)
        ax_fpr.set_yticks(y_pos)
        ax_fpr.set_yticklabels(fpr_groups["label"])
        ax_fpr.set_xlabel("FPR (%)")
        ax_fpr.set_title("False Positive Rate (target ≤10%)", fontweight='bold', fontsize=11)
        ax_fpr.axvline(x=10, color=FAIL_COLOR, linestyle='--', alpha=0.5, label='max=10%')
        ax_fpr.axvline(x=5, color=MUTED, linestyle=':', alpha=0.5, label='α=5%')
        ax_fpr.set_xlim(0, max(15, fpr_groups["fpr"].max() * 100 + 5))

        # Annotate each bar with its rate, Wilson interval and sample size.
        for bar, row in zip(bars, fpr_groups.itertuples()):
            width = bar.get_width()
            ci_low, ci_high = wilson_ci(int(row.failures), int(row.total))
            ax_fpr.text(width + 0.5, bar.get_y() + bar.get_height()/2,
                        f'{row.fpr*100:.1f}% [{ci_low*100:.0f}-{ci_high*100:.0f}%] n={row.total}',
                        va='center', fontsize=9)
    else:
        ax_fpr.text(0.5, 0.5, "No FPR data", ha='center', va='center', transform=ax_fpr.transAxes)
        ax_fpr.set_title("False Positive Rate", fontweight='bold', fontsize=11)
    ax_fpr.spines['top'].set_visible(False)
    ax_fpr.spines['right'].set_visible(False)

    # ---- Panel 2: detection rate per effect size -----------------------
    ax_power = fig.add_subplot(gs[0, 1])
    power_df = df[df["test_type"] == "power"].copy()
    if not power_df.empty:
        if (power_df["injected_effect_ns"] == 0).all():
            power_df["injected_effect_ns"] = power_df["test_name"].apply(infer_effect_from_test_name)
        power_df = power_df[power_df["injected_effect_ns"] > 0]

    if not power_df.empty:
        power_groups = power_df.groupby("injected_effect_ns").agg(
            detections=("decision", lambda x: (x == "fail").sum()),
            total=("decision", "count"),
        ).reset_index()
        power_groups["power"] = power_groups["detections"] / power_groups["total"]

        y_pos = range(len(power_groups))
        colors = [PASS_COLOR if p >= 0.90 else (MUTED if p >= 0.70 else FAIL_COLOR) for p in power_groups["power"]]
        bars = ax_power.barh(y_pos, power_groups["power"] * 100, color=colors, height=0.6, alpha=0.8)
        ax_power.set_yticks(y_pos)
        ax_power.set_yticklabels([f'{int(e)}ns' for e in power_groups["injected_effect_ns"]])
        ax_power.set_xlabel("Detection Rate (%)")
        ax_power.set_title("Statistical Power (target ≥90%)", fontweight='bold', fontsize=11)
        ax_power.axvline(x=90, color=PASS_COLOR, linestyle='--', alpha=0.5)
        ax_power.axvline(x=70, color=MUTED, linestyle=':', alpha=0.5)
        ax_power.set_xlim(0, 105)

        for bar, row in zip(bars, power_groups.itertuples()):
            width = bar.get_width()
            ax_power.text(width + 1, bar.get_y() + bar.get_height()/2,
                          f'{row.power*100:.0f}% n={row.total}',
                          va='center', fontsize=9)
    else:
        ax_power.text(0.5, 0.5, "No power data", ha='center', va='center', transform=ax_power.transAxes)
        ax_power.set_title("Statistical Power", fontweight='bold', fontsize=11)
    ax_power.spines['top'].set_visible(False)
    ax_power.spines['right'].set_visible(False)

    # ---- Panel 3: estimated vs true effect -----------------------------
    ax_est = fig.add_subplot(gs[1, :])
    est_df = df[df["test_type"] == "estimation"].copy()
    if est_df.empty:
        # Fall back to tests whose name mentions "estimation".
        est_df = df[df["test_name"].str.contains("estimation", case=False, na=False)].copy()

    if not est_df.empty and "shift_ns" in est_df.columns:
        est_df = est_df[est_df["shift_ns"].notna() & (est_df["injected_effect_ns"] > 0)].copy()

    if not est_df.empty:
        est_df["estimated"] = est_df["shift_ns"]
        est_df["true"] = est_df["injected_effect_ns"]
        est_df["bias"] = est_df["estimated"] - est_df["true"]
        est_df["bias_pct"] = est_df["bias"] / est_df["true"] * 100

        # Both CI columns are required before anything CI-related is drawn.
        has_ci = "ci_low_ns" in est_df.columns and "ci_high_ns" in est_df.columns
        if has_ci:
            est_df["covers"] = (est_df["ci_low_ns"] <= est_df["true"]) & (est_df["true"] <= est_df["ci_high_ns"])
        else:
            est_df["covers"] = False

        max_val = max(est_df["true"].max(), est_df["estimated"].max()) * 1.1

        ax_est.plot([0, max_val], [0, max_val], '--', color=MUTED, alpha=0.7, label='Perfect (y=x)')
        x_range = np.linspace(0, max_val, 100)
        ax_est.fill_between(x_range, x_range * 0.8, x_range * 1.2, alpha=0.1, color=PASS_COLOR, label='±20% bounds')

        for true_val in est_df["true"].unique():
            subset = est_df[est_df["true"] == true_val]

            if has_ci:
                # One vertical whisker per trial spanning its CI.
                for _, row in subset.iterrows():
                    color = PRIMARY if row["covers"] else FAIL_COLOR
                    ax_est.plot([row["true"], row["true"]], [row["ci_low_ns"], row["ci_high_ns"]],
                                color=color, alpha=0.3, linewidth=1)

            colors = [PRIMARY if c else FAIL_COLOR for c in subset["covers"]]
            ax_est.scatter(subset["true"], subset["estimated"], c=colors, alpha=0.6, s=30, edgecolors='white', linewidth=0.5)

            mean_est = subset["estimated"].mean()
            mean_bias = subset["bias_pct"].mean()
            n = len(subset)
            coverage = subset["covers"].mean() * 100
            ax_est.annotate(
                f'μ={mean_est:.0f}ns\nbias={mean_bias:+.1f}%\nCI cov={coverage:.0f}%\nn={n}',
                xy=(true_val, mean_est),
                xytext=(true_val + max_val*0.08, mean_est),
                fontsize=8,
                bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor=MUTED, alpha=0.8),
                arrowprops=dict(arrowstyle='->', color=MUTED, alpha=0.5)
            )

        ax_est.set_xlabel("True Injected Effect (ns)", fontsize=10)
        ax_est.set_ylabel("Estimated Effect (ns)", fontsize=10)
        ax_est.set_title("Effect Estimation Accuracy (each point = one trial)", fontweight='bold', fontsize=11)
        ax_est.set_xlim(0, max_val)
        ax_est.set_ylim(0, max_val)

        handles = [
            mpatches.Patch(color=PRIMARY, alpha=0.6, label='CI covers true'),
            mpatches.Patch(color=FAIL_COLOR, alpha=0.6, label='CI misses true'),
            plt.Line2D([0], [0], linestyle='--', color=MUTED, label='Perfect estimation'),
        ]
        ax_est.legend(handles=handles, loc='upper left', fontsize=9)
    else:
        ax_est.text(0.5, 0.5, "No estimation data", ha='center', va='center', transform=ax_est.transAxes)
        ax_est.set_title("Effect Estimation Accuracy", fontweight='bold', fontsize=11)
    ax_est.spines['top'].set_visible(False)
    ax_est.spines['right'].set_visible(False)

    # ---- Panel 4: summary table ----------------------------------------
    ax_table = fig.add_subplot(gs[2, :])
    ax_table.axis('off')

    total_trials = len(df)
    completed = len(df[df["decision"].isin(["pass", "fail", "inconclusive"])])

    if not fpr_df.empty:
        fpr_failures = (fpr_df["decision"] == "fail").sum()
        fpr_total = len(fpr_df)
        fpr_rate = fpr_failures / fpr_total
        fpr_low, fpr_high = wilson_ci(fpr_failures, fpr_total)
        # Pass requires the whole interval, not just the point estimate,
        # to sit at or below 10%.
        fpr_status = "✓ PASS" if fpr_high <= 0.10 else "✗ FAIL"
    else:
        fpr_rate, fpr_total, fpr_status = 0, 0, "N/A"

    if not power_df.empty:
        power_detections = (power_df["decision"] == "fail").sum()
        power_total = len(power_df)
        power_rate = power_detections / power_total
        power_status = "✓ PASS" if power_rate >= 0.90 else "✗ FAIL"
    else:
        power_rate, power_total, power_status = 0, 0, "N/A"

    if not est_df.empty and "bias_pct" in est_df.columns:
        mean_bias = est_df["bias_pct"].abs().mean()
        est_status = "✓ PASS" if mean_bias <= 25 else "✗ FAIL"
    else:
        mean_bias, est_status = 0, "N/A"

    table_data = [
        ["Metric", "Value", "Target", "Status"],
        ["─" * 20, "─" * 25, "─" * 15, "─" * 10],
        ["Total Trials", f"{total_trials}", "", ""],
        ["Completed", f"{completed} ({completed/max(1,total_trials)*100:.0f}%)", "", ""],
        ["", "", "", ""],
        ["False Positive Rate", f"{fpr_rate*100:.1f}% (n={fpr_total})", "≤ 10%", fpr_status],
        ["Statistical Power", f"{power_rate*100:.0f}% (n={power_total})", "≥ 90%", power_status],
        ["Mean |Bias|", f"{mean_bias:.1f}%", "≤ 25%", est_status],
    ]

    y_start = 0.85
    row_height = 0.11
    col_starts = [0.02, 0.27, 0.62, 0.82]

    for i, row in enumerate(table_data):
        y = y_start - i * row_height
        weight = 'bold' if i == 0 else 'normal'  # header row
        for cell, col_start in zip(row, col_starts):
            color = PASS_COLOR if '✓' in cell else (FAIL_COLOR if '✗' in cell else 'black')
            ax_table.text(col_start, y, cell, transform=ax_table.transAxes, fontsize=11,
                          fontweight=weight, color=color, family='monospace')

    ax_table.set_title("Summary Statistics", fontweight='bold', fontsize=11, pad=10)

    # Act on this figure explicitly rather than on pyplot's current figure.
    fig.tight_layout()
    fig.savefig(output_path / "dashboard.png", dpi=150, bbox_inches='tight', facecolor='white')
    plt.close(fig)
    print(f" Saved: {output_path / 'dashboard.png'}")
def plot_summary_dashboard(df: pd.DataFrame, output_path: Path):
    """Write a plain-text calibration report to ``summary.txt`` and print it.

    The report covers trial counts, the overall false positive rate, the
    detection rate per injected effect size, CI coverage, and the Bayesian
    calibration error (mean and maximum absolute gap between each
    probability bin's midpoint and its empirical rate).

    Args:
        df: Trial records. Columns read here: ``decision``, ``test_type``,
            ``injected_effect_ns``, ``ci_low_ns``, ``ci_high_ns`` and
            ``leak_probability``.
        output_path: Directory ``summary.txt`` is written to.
    """
    total_trials = len(df)
    completed = len(df[df["decision"] != "unmeasurable"])
    unmeasurable = len(df[df["decision"] == "unmeasurable"])
    # Denominator for the percentages; avoids dividing by zero when the
    # input has no rows.
    pct_base = max(1, total_trials)

    power_df = df[df["test_type"] == "power"]
    fpr_df = df[df["test_type"] == "fpr"]
    coverage_df = df[df["test_type"] == "coverage"]
    bayesian_df = df[df["leak_probability"].notna()]

    # ---- False positive rate -------------------------------------------
    if not fpr_df.empty:
        fpr_rate = (fpr_df["decision"] == "fail").mean()
        fpr_trials = len(fpr_df)
        fpr_low, fpr_high = wilson_ci(int((fpr_df["decision"] == "fail").sum()), fpr_trials)
    else:
        fpr_rate, fpr_trials, fpr_low, fpr_high = 0, 0, 0, 0

    # ---- Power per effect size -----------------------------------------
    power_stats = []
    if not power_df.empty:
        for effect in power_df["injected_effect_ns"].unique():
            if effect > 0:
                subset = power_df[power_df["injected_effect_ns"] == effect]
                detected = (subset["decision"] == "fail").sum()
                total = len(subset)
                rate = detected / total if total > 0 else 0
                low, high = wilson_ci(detected, total)
                power_stats.append((effect, rate, low, high, total))

    # ---- CI coverage ---------------------------------------------------
    if not coverage_df.empty:
        coverage_df = coverage_df.dropna(subset=["ci_low_ns", "ci_high_ns"])

    if not coverage_df.empty:
        covered = (
            (coverage_df["ci_low_ns"] <= coverage_df["injected_effect_ns"]) &
            (coverage_df["injected_effect_ns"] <= coverage_df["ci_high_ns"])
        ).sum()
        coverage_total = len(coverage_df)
        coverage_rate = covered / coverage_total if coverage_total > 0 else 0
        coverage_low, coverage_high = wilson_ci(covered, coverage_total)
    else:
        coverage_rate, coverage_total, coverage_low, coverage_high = 0, 0, 0, 0

    # ---- Bayesian calibration ------------------------------------------
    if not bayesian_df.empty:
        calib_df = bayesian_df.copy()
        calib_df["is_true_positive"] = calib_df["injected_effect_ns"] > 0
        # Ten bins of width 0.1, keyed by midpoint. A probability of exactly
        # 1.0 is clipped into the last bin (0.95) instead of opening a
        # non-existent bin centred at 1.05.
        bin_index = (calib_df["leak_probability"] * 10).astype(int).clip(upper=9)
        calib_df["prob_bin"] = bin_index / 10 + 0.05
        bin_agg = calib_df.groupby("prob_bin").agg(
            tp=("is_true_positive", "sum"),
            total=("is_true_positive", "count"),
        )
        bin_agg["empirical"] = bin_agg["tp"] / bin_agg["total"]
        bin_agg["deviation"] = (bin_agg.index - bin_agg["empirical"]).abs()
        mean_calibration_error = bin_agg["deviation"].mean()
        max_calibration_error = bin_agg["deviation"].max()
    else:
        mean_calibration_error, max_calibration_error = 0, 0

    summary_text = f"""
================================================================================
CALIBRATION SUMMARY REPORT
================================================================================
OVERVIEW
--------
Total Trials: {total_trials:>8}
Completed: {completed:>8} ({completed/pct_base*100:.1f}%)
Unmeasurable: {unmeasurable:>8} ({unmeasurable/pct_base*100:.1f}%)
FALSE POSITIVE RATE (FPR)
-------------------------
FPR: {fpr_rate*100:>7.1f}% [95% CI: {fpr_low*100:.1f}% - {fpr_high*100:.1f}%]
Trials: {fpr_trials:>8}
Target: ≤ 5% (α = 0.05), acceptable ≤ 10%
Status: {'✓ PASS' if fpr_high <= 0.10 else '✗ FAIL'}
STATISTICAL POWER
-----------------"""

    # NOTE(review): every effect size is marked against the same 70% floor.
    for effect, rate, low, high, n in sorted(power_stats):
        status = "✓" if rate >= 0.70 else "✗"
        summary_text += f"""
{effect:>6.0f}ns effect: {rate*100:>5.1f}% [95% CI: {low*100:.1f}% - {high*100:.1f}%] (n={n}) {status}"""

    summary_text += f"""
CI COVERAGE
-----------
Coverage: {coverage_rate*100:>7.1f}% [95% CI: {coverage_low*100:.1f}% - {coverage_high*100:.1f}%]
Trials: {coverage_total:>8}
Target: ≥ 85% (nominal 95%)
Status: {'✓ PASS' if coverage_rate >= 0.85 else '✗ FAIL'}
BAYESIAN CALIBRATION
--------------------
Mean Calibration Error: {mean_calibration_error*100:>5.1f}%
Max Deviation: {max_calibration_error*100:>5.1f}%
Target: ≤ 15% mean, ≤ 25% max
Status: {'✓ PASS' if mean_calibration_error <= 0.15 and max_calibration_error <= 0.25 else '✗ FAIL'}
================================================================================
"""

    # The report contains non-ASCII symbols, so do not rely on the locale's
    # default encoding when writing it.
    with open(output_path / "summary.txt", "w", encoding="utf-8") as f:
        f.write(summary_text)
    print(f" Saved: {output_path / 'summary.txt'}")
    print(summary_text)
def load_data(data_dir: Path) -> pd.DataFrame:
    """Load every ``*.csv`` directly inside ``data_dir`` into one DataFrame.

    Files are read in sorted path order so the row order of the result does
    not depend on the filesystem's listing order. A file that cannot be
    read is reported with a warning and skipped.

    Args:
        data_dir: Directory to search (non-recursive).

    Returns:
        All successfully loaded files concatenated with a fresh index.

    Raises:
        ValueError: If the directory has no CSV files, or none could be
            loaded.
    """
    csv_files = sorted(data_dir.glob("*.csv"))
    if not csv_files:
        raise ValueError(f"No CSV files found in {data_dir}")

    dfs = []
    for f in csv_files:
        try:
            df = pd.read_csv(f)
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort the run.
            print(f" Warning: Failed to load {f.name}: {e}")
            continue
        dfs.append(df)
        print(f" Loaded: {f.name} ({len(df)} records)")

    if not dfs:
        raise ValueError("No valid CSV files could be loaded")
    return pd.concat(dfs, ignore_index=True)
def main():
    """Command-line entry point: load CSV results and generate every plot.

    Positional argument ``data_dir`` is the directory containing the CSV
    files; ``--output`` / ``-o`` is the directory for the generated files
    (default ``./plots``), created if missing.

    Returns:
        ``0`` on success, ``1`` when the data directory is missing or no
        data could be loaded.
    """
    import sys

    parser = argparse.ArgumentParser(description="Generate calibration plots for timing-oracle")
    parser.add_argument("data_dir", type=Path, help="Directory containing CSV data files")
    parser.add_argument("--output", "-o", type=Path, default=Path("./plots"), help="Output directory for plots")
    args = parser.parse_args()

    # is_dir() also rejects a path that exists but is a regular file.
    if not args.data_dir.is_dir():
        print(f"Error: Data directory not found: {args.data_dir}", file=sys.stderr)
        return 1

    print(f"\nLoading data from {args.data_dir}...")
    try:
        df = load_data(args.data_dir)
    except ValueError as e:
        # load_data signals "nothing to plot" this way; report it like the
        # other usage error instead of letting a traceback escape.
        print(f"Error: {e}", file=sys.stderr)
        return 1
    print(f"Total records: {len(df)}")

    args.output.mkdir(parents=True, exist_ok=True)
    print(f"\nGenerating plots to {args.output}...")

    plot_compact_dashboard(df, args.output)
    plot_power_curve(df, args.output)
    plot_power_curves_faceted(df, args.output)
    plot_fpr_calibration(df, args.output)
    plot_bayesian_calibration(df, args.output)
    plot_coverage_calibration(df, args.output)
    plot_effect_estimation(df, args.output)
    plot_estimation_bias(df, args.output)
    plot_summary_dashboard(df, args.output)

    print("\nDone!")
    return 0
if __name__ == "__main__":
    # ``exit`` is injected by the ``site`` module for interactive use and is
    # not guaranteed to exist (e.g. under ``python -S``); raising SystemExit
    # passes main()'s return code to the shell without relying on it.
    raise SystemExit(main())