#[path = "eval/mod.rs"]
mod eval;
use claude_hippo::surprise::SurpriseWeights;
use eval::*;
use serde::Serialize;
/// Topic names; each topic forms one cluster containing a single Decision item
/// plus `CHATS_PER_CLUSTER` chat items.
const TOPICS: &[&str] = &["auth", "db", "billing", "ui", "infra"];
/// Number of chat ("Observation") items generated for every topic.
const CHATS_PER_CLUSTER: usize = 19;
/// Size of the generated corpus: one Decision plus `CHATS_PER_CLUSTER` chats per
/// topic (5 * (1 + 19) = 100 with the current values). Derived rather than
/// hard-coded so it cannot drift from the two constants above.
const TOTAL_ITEMS: usize = TOPICS.len() * (1 + CHATS_PER_CLUSTER);
/// Cut-off `k` handed to the evaluation config.
const K: usize = 5;
/// Query templates used to probe each topic's Decision item.
///
/// The literal `{topic}` marker is substituted with the topic name via
/// `str::replace` in `build_config`; every template is instantiated once per
/// topic, so the query set has `TOPICS.len() * PARAPHRASES.len()` entries.
const PARAPHRASES: &[&str] = &[
"what was our decision about {topic}?",
"remind me of the {topic} choice we made",
"the {topic} call we settled on",
"decision summary for {topic}",
"what did we decide on {topic}",
];
/// Serializable Bench A report: the benchmark setup plus one ablation result
/// per oversample setting. The test passes it to `write_result_json`.
#[derive(Serialize)]
struct BenchAReport {
/// Description of the corpus and query set the results were measured on.
setup: SetupSummary,
/// Result of the run configured with `DEFAULT_OVERSAMPLE_FACTOR`.
default_oversample: AblationResult,
/// Result of the run configured with an oversample factor of 20.
full_oversample: AblationResult,
}
/// Serializable summary of the Bench A fixture, embedded in `BenchAReport`.
/// The per-field notes describe how the test in this file populates each value.
#[derive(Serialize)]
struct SetupSummary {
/// Total number of corpus items (`TOTAL_ITEMS`).
total_items: usize,
/// Number of Decision items (`TOPICS.len()`, one per topic).
decisions: usize,
/// Number of chat items (`TOPICS.len() * CHATS_PER_CLUSTER`).
chats: usize,
/// Number of topic clusters (`TOPICS.len()`).
clusters: usize,
/// Number of queries (`TOPICS.len() * PARAPHRASES.len()`).
queries: usize,
/// Cut-off `k` used for the run (`K`).
k: usize,
/// Free-form, human-readable description of the benchmark layout.
note: &'static str,
}
fn build_config(oversample_factor: usize) -> EvalConfig {
let mut items = Vec::with_capacity(TOTAL_ITEMS);
let mut queries = Vec::with_capacity(TOPICS.len() * PARAPHRASES.len());
for (cluster_idx, topic) in TOPICS.iter().enumerate() {
let decision_id = format!("decision-{topic}");
items.push(EvalItem {
id: decision_id.clone(),
content: format!(
"We chose to use approach X for the {topic} module after reviewing options Y and Z. \
The decision was driven by latency targets and on-call ergonomics. \
Implementation lives in service-{topic} and is owned by the platform team. \
Tradeoffs: option Y was rejected because it did not satisfy the audit requirements. \
Revisit by 2026-Q4 if traffic crosses 50k QPS or compliance scope expands."
),
tags: decision_tags(topic),
memory_type: Some("Decision".into()),
importance: Some(1.0),
cluster: cluster_idx,
age_days: 0.0,
});
for i in 0..CHATS_PER_CLUSTER {
items.push(EvalItem {
id: format!("chat-{topic}-{i:02}"),
content: format!("{topic} chat {i:02}: small note"),
tags: chat_tags(topic),
memory_type: Some("Observation".into()),
importance: None,
cluster: cluster_idx,
age_days: 0.0,
});
}
for tmpl in PARAPHRASES {
queries.push(EvalQuery {
query: tmpl.replace("{topic}", topic),
cluster: cluster_idx,
relevant_ids: vec![decision_id.clone()],
});
}
}
assert_eq!(items.len(), TOTAL_ITEMS);
assert_eq!(queries.len(), TOPICS.len() * PARAPHRASES.len());
EvalConfig {
items,
queries,
k: K,
weights: SurpriseWeights::default(),
num_clusters: TOPICS.len() + 1,
fallback_cluster: TOPICS.len(),
noise_scale: 0.05,
oversample_factor,
prediction_loss: None,
embedder_override: None,
}
}
/// Bench A: runs the ablation twice over the fixture from `build_config` —
/// once with the crate's default oversample factor and once with a factor of
/// 20 — writes both results as JSON, prints a summary, and asserts that the
/// surprise-reranked P@1 and MRR are exactly 1.0 in both runs.
#[tokio::test]
async fn bench_a_long_session_noise() {
    use claude_hippo::server::DEFAULT_OVERSAMPLE_FACTOR;

    // Single source of truth for the "full" run; used for both the config and
    // the log line. 20 * K (5) equals TOTAL_ITEMS (100) — presumably chosen so
    // the candidate pool can span the whole corpus; confirm against `run_ablation`.
    const FULL_OVERSAMPLE_FACTOR: usize = 20;

    let default_oversample = run_ablation(build_config(DEFAULT_OVERSAMPLE_FACTOR))
        .await
        .expect("bench A default-oversample run");
    let full_oversample = run_ablation(build_config(FULL_OVERSAMPLE_FACTOR))
        .await
        .expect("bench A full-oversample run");

    // The results are moved into the report (no clones) and read back through it.
    let report = BenchAReport {
        setup: SetupSummary {
            total_items: TOTAL_ITEMS,
            decisions: TOPICS.len(),
            chats: TOPICS.len() * CHATS_PER_CLUSTER,
            clusters: TOPICS.len(),
            queries: TOPICS.len() * PARAPHRASES.len(),
            k: K,
            note: "1 Decision (importance=1.0) + 19 Observation chats per topic, 5 topics. \
                   5 paraphrases per topic = 25 queries total. \
                   Each query asks for the cluster's Decision; relevant set = {that one Decision}.",
        },
        default_oversample,
        full_oversample,
    };

    // Written before any assertion runs, so the JSON exists even when one fails.
    write_result_json("bench_a_long_session", &report).expect("write bench A result");

    let BenchAReport {
        default_oversample,
        full_oversample,
        ..
    } = &report;

    println!("\nBench A — Long-session noise (v0.3)");
    println!(
        " default oversample ({}): baseline P@1={:.3} MRR={:.3} surprise P@1={:.3} MRR={:.3}",
        DEFAULT_OVERSAMPLE_FACTOR,
        default_oversample.baseline.precision_at_1,
        default_oversample.baseline.mrr,
        default_oversample.with_surprise.precision_at_1,
        default_oversample.with_surprise.mrr,
    );
    println!(
        " full oversample ({}): baseline P@1={:.3} MRR={:.3} surprise P@1={:.3} MRR={:.3}",
        FULL_OVERSAMPLE_FACTOR,
        full_oversample.baseline.precision_at_1,
        full_oversample.baseline.mrr,
        full_oversample.with_surprise.precision_at_1,
        full_oversample.with_surprise.mrr,
    );

    assert_eq!(
        full_oversample.with_surprise.precision_at_1, 1.0,
        "with full oversample, surprise rerank must place Decision at rank 1 every time"
    );
    assert_eq!(
        full_oversample.with_surprise.mrr, 1.0,
        "MRR must be 1.0 when every Decision is rank 1"
    );
    assert_eq!(
        default_oversample.with_surprise.precision_at_1,
        1.0,
        "v0.3: with default oversample={}, Bench A must achieve P@1=1.0 (was 0.72 in v0.2 \
         when default was 3). If this fails, the default bump regressed.",
        DEFAULT_OVERSAMPLE_FACTOR,
    );
    assert_eq!(
        default_oversample.with_surprise.mrr, 1.0,
        "v0.3: with default oversample, Bench A MRR must = 1.0"
    );
}