pub mod ab;
pub mod swebench;
pub mod templates;
use serde::{Deserialize, Serialize};
/// Outcome of a single benchmark run, serializable through serde.
///
/// Runs that could not be executed at all are represented by the sentinel
/// built in [`BenchResult::errored`].
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BenchResult {
    /// Identifier of the run; error sentinels use a `bench-error-` prefix.
    pub run_id: String,
    /// Name of the benchmark template the run belongs to.
    pub template: String,
    /// Label of the experimental condition (the unit tests use `control`).
    pub condition: String,
    /// Model identifier for the run; left empty on error sentinels.
    pub brain_model: String,
    /// Score reported by the verifier. The scale is not defined in this
    /// module; error sentinels carry `0`.
    pub verifier_score: u8,
    /// Whether the verifier counted the run as a pass.
    pub verifier_pass: bool,
    /// Elapsed wall-clock time of the run, in milliseconds.
    pub wall_clock_ms: u64,
    /// Number of rounds recorded for the run.
    pub rounds: u32,
    /// Whether the "best round restore" fired during the run.
    /// NOTE(review): semantics are owned by the runner and not visible here.
    pub best_round_restore_fired: bool,
    /// Failure classification, if any; error sentinels use `harness_error`.
    pub failure_category: Option<String>,
    /// Free-form verifier feedback; error sentinels store the failure reason
    /// here.
    pub verifier_feedback: String,
}
impl BenchResult {
    /// Builds the sentinel result used when a run could not be carried out.
    ///
    /// The sentinel has a zero score, `verifier_pass == false`, zero
    /// wall-clock time and rounds, an empty `brain_model`, a
    /// `failure_category` of `harness_error`, and `reason` stored as the
    /// verifier feedback.
    ///
    /// # Arguments
    /// * `template` - name of the template the failed run belonged to.
    /// * `condition` - condition label of the failed run.
    /// * `reason` - human-readable description of what went wrong.
    ///
    /// # Run id
    /// The id has the form `bench-error-<unix millis>-<sequence>`. A
    /// second-resolution timestamp alone is not unique: two sentinels created
    /// within the same second (for example when control and variant both fail
    /// immediately) would share an id. The millisecond timestamp separates
    /// processes and the per-process sequence number separates sentinels
    /// created back to back.
    #[must_use]
    pub fn errored(template: &str, condition: &str, reason: &str) -> Self {
        use std::sync::atomic::{AtomicUsize, Ordering};

        // Process-wide counter; only uniqueness matters, so relaxed ordering
        // is sufficient.
        static ERROR_SEQ: AtomicUsize = AtomicUsize::new(0);

        let seq = ERROR_SEQ.fetch_add(1, Ordering::Relaxed);
        let millis = chrono::Utc::now().timestamp_millis();

        Self {
            run_id: format!("bench-error-{millis}-{seq}"),
            template: template.to_string(),
            condition: condition.to_string(),
            brain_model: String::new(),
            verifier_score: 0,
            verifier_pass: false,
            wall_clock_ms: 0,
            rounds: 0,
            best_round_restore_fired: false,
            failure_category: Some("harness_error".to_string()),
            verifier_feedback: reason.to_string(),
        }
    }
}
/// A control run and a variant run of the same template, together with the
/// differences between them.
///
/// Both deltas are expressed as "variant minus control".
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct AbComparison {
    /// Template name, copied from the control run.
    pub template: String,
    /// Result of the control run.
    pub control: BenchResult,
    /// Result of the variant run.
    pub variant: BenchResult,
    /// Variant verifier score minus control verifier score.
    pub score_delta: i16,
    /// Variant wall-clock time minus control wall-clock time, in
    /// milliseconds.
    pub wall_clock_delta_ms: i64,
}
impl AbComparison {
    /// Pairs a control run with a variant run and precomputes their deltas.
    ///
    /// Both deltas are "variant minus control". The score delta always fits
    /// in `i16` because both inputs are `u8`. The wall-clock delta is computed
    /// exactly and only then clamped to the `i64` range, so it is correct
    /// whenever the true difference is representable, even if the individual
    /// timings exceed `i64::MAX`.
    ///
    /// The template name is taken from `control`.
    ///
    /// # Panics
    /// In debug builds only, panics when the two results name different
    /// templates. Release builds do not check this.
    #[must_use]
    pub fn new(control: BenchResult, variant: BenchResult) -> Self {
        debug_assert_eq!(
            control.template, variant.template,
            "A/B comparison requires matching templates"
        );

        let score_delta = i16::from(variant.verifier_score) - i16::from(control.verifier_score);

        // Subtract in i128, where any difference of two u64 values is exact,
        // then saturate once. Clamping each operand to i64::MAX first would
        // distort the result for very large timings.
        let exact_ms = i128::from(variant.wall_clock_ms) - i128::from(control.wall_clock_ms);
        let wall_clock_delta_ms =
            i64::try_from(exact_ms).unwrap_or(if exact_ms < 0 { i64::MIN } else { i64::MAX });

        Self {
            // Cloned before `control` is moved into the struct below.
            template: control.template.clone(),
            control,
            variant,
            score_delta,
            wall_clock_delta_ms,
        }
    }

    /// Renders a one-line summary: the template name left-aligned and padded
    /// to 20 columns, followed by both deltas with an explicit sign.
    #[must_use]
    pub fn summary_line(&self) -> String {
        format!(
            "{:<20} Δscore={:+} Δms={:+}",
            self.template, self.score_delta, self.wall_clock_delta_ms
        )
    }
}
/// Resolves the directory that holds benchmark runs.
///
/// Resolution order, where an unset or empty variable is skipped:
/// 1. `CLAUDETTE_BENCH_DIR` gives `<dir>/runs`
/// 2. `USERPROFILE`, then `HOME`, gives `<home>/.claudette/bench/runs`
/// 3. otherwise `./.claudette/bench/runs`
///
/// Environment values are read as OS strings, so paths that are not valid
/// UTF-8 are honored instead of being silently ignored.
///
/// This function only builds the path; it does not touch the filesystem.
#[must_use]
pub fn bench_runs_dir() -> std::path::PathBuf {
    use std::path::PathBuf;

    // Treat "set but empty" the same as "unset" for every variable consulted.
    let non_empty = |key: &str| std::env::var_os(key).filter(|value| !value.is_empty());

    if let Some(root) = non_empty("CLAUDETTE_BENCH_DIR") {
        return PathBuf::from(root).join("runs");
    }

    let home = non_empty("USERPROFILE")
        .or_else(|| non_empty("HOME"))
        .map_or_else(|| PathBuf::from("."), PathBuf::from);

    home.join(".claudette").join("bench").join("runs")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `control` result for `template` with the given score and
    /// wall-clock time; the fixture marks scores of 8 and above as passing.
    fn fixture(template: &str, score: u8, wall_ms: u64) -> BenchResult {
        let verifier_pass = score >= 8;
        BenchResult {
            run_id: format!("test-{template}-{score}"),
            template: String::from(template),
            condition: String::from("control"),
            brain_model: String::from("qwen3.6-35b-a3b"),
            verifier_score: score,
            verifier_pass,
            wall_clock_ms: wall_ms,
            rounds: 1,
            best_round_restore_fired: false,
            failure_category: None,
            verifier_feedback: String::new(),
        }
    }

    /// A better, slower variant yields positive deltas.
    #[test]
    fn ab_comparison_computes_positive_score_delta() {
        let comparison = AbComparison::new(
            fixture("csv-analytics", 6, 30_000),
            fixture("csv-analytics", 9, 45_000),
        );
        assert_eq!(comparison.score_delta, 3);
        assert_eq!(comparison.wall_clock_delta_ms, 15_000);
    }

    /// A worse, faster variant yields negative deltas.
    #[test]
    fn ab_comparison_computes_negative_deltas() {
        let comparison = AbComparison::new(
            fixture("dns-parser", 9, 40_000),
            fixture("dns-parser", 5, 28_000),
        );
        assert_eq!(comparison.score_delta, -4);
        assert_eq!(comparison.wall_clock_delta_ms, -12_000);
    }

    /// The summary mentions the template and both signed deltas.
    #[test]
    fn ab_comparison_summary_includes_template_and_deltas() {
        let baseline = fixture("storefront", 7, 10_000);
        let candidate = fixture("storefront", 8, 12_500);
        let line = AbComparison::new(baseline, candidate).summary_line();
        for needle in ["storefront", "+1", "+2500"] {
            assert!(line.contains(needle));
        }
    }

    /// `CLAUDETTE_BENCH_DIR` overrides the default location; the previous
    /// value of the variable is put back afterwards.
    #[test]
    fn bench_runs_dir_honors_env_var() {
        const KEY: &str = "CLAUDETTE_BENCH_DIR";

        let saved = std::env::var(KEY).ok();
        std::env::set_var(KEY, "/tmp/test-bench");

        let resolved = bench_runs_dir();
        assert!(resolved.to_string_lossy().contains("test-bench"));
        assert!(resolved.ends_with("runs"));

        if let Some(original) = saved {
            std::env::set_var(KEY, original);
        } else {
            std::env::remove_var(KEY);
        }
    }

    /// The error sentinel keeps the caller's labels and is tagged as a
    /// harness error.
    #[test]
    fn bench_result_errored_sentinel_has_failure_category() {
        let sentinel = BenchResult::errored("portfolio", "control", "mission_start failed");
        assert_eq!(sentinel.template, "portfolio");
        assert_eq!(sentinel.condition, "control");
        assert_eq!(sentinel.failure_category.as_deref(), Some("harness_error"));
        assert_eq!(sentinel.verifier_feedback, "mission_start failed");
        assert!(!sentinel.verifier_pass);
    }

    /// Serializing to JSON and back reproduces the original value.
    #[test]
    fn bench_result_round_trips_through_json() {
        let original = fixture("rms-scheduler", 8, 22_500);
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: BenchResult = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}