//! Evaluation harness walkthrough (GH-454): scoring primitives, a mock
//! multiple-choice task, perplexity scoring, and a multi-task report.
use aprender::online::eval_harness::{
    compute_accuracy, compute_perplexity, mock_hellaswag, run_harness, EvalExample, EvalTask,
    HarnessConfig, TaskType,
};
fn main() {
println!("=== Evaluation Harness (GH-454) ===\n");
println!("── 1. Scoring Primitives ──");
    let ppl = compute_perplexity(-23.0, 10);
    println!(" Perplexity (LL=-23, N=10): {:.2}", ppl);
    let preds = vec![0, 1, 2, 0];
    let gold = vec![0, 1, 0, 0];
    let acc = compute_accuracy(&preds, &gold);
    let correct = preds.iter().zip(&gold).filter(|(p, g)| p == g).count();
    println!(" Accuracy: {:.1}% ({}/{})", acc * 100.0, correct, gold.len());
println!("\n── 2. Multiple-Choice (HellaSwag mock) ──");
let task = mock_hellaswag();
println!(" Task: {} ({} examples)", task.name, task.len());
let config = HarnessConfig {
tasks: vec![task],
length_normalize: false,
max_examples: 0,
};
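    // Stand-in scorer: returns a pseudo log-likelihood for each (context, completion)
    // pair. A length penalty is offset by a +5.0 bonus for keyword-matching completions.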
let report = run_harness(&config, |ctx, completion| {
let base = -(completion.len() as f64) * 0.1;
if (ctx.contains("sandwich") && completion.contains("butter"))
|| (ctx.contains("cat") && completion.contains("purred"))
{
base + 5.0
} else {
base
}
})
.expect("run_harness failed for multiple-choice task");
let mc = &report.tasks[0];
println!(" Accuracy: {:.1}%", mc.accuracy.unwrap_or(0.0) * 100.0);
println!(" Predictions: {:?}", mc.predictions);
println!("\n── 3. Perplexity Evaluation ──");
let mut ppl_task = EvalTask::new("wikitext_sample", TaskType::Perplexity);
ppl_task.add_example(EvalExample {
context: String::new(),
choices: vec![],
gold_idx: None,
reference: Some("The Mona Lisa is a half-length portrait painting by the Italian artist Leonardo da Vinci".to_string()),
});
ppl_task.add_example(EvalExample {
context: String::new(),
choices: vec![],
gold_idx: None,
reference: Some(
"Machine learning is a subset of artificial intelligence that focuses on algorithms"
.to_string(),
),
});
    let ppl_config = HarnessConfig {
        tasks: vec![ppl_task],
        length_normalize: false,
        max_examples: 0,
    };
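    // Mock language model: assign -0.3 log-likelihood per whitespace-separated token.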
let ppl_report = run_harness(&ppl_config, |_, text| {
-(text.split_whitespace().count() as f64) * 0.3
})
.expect("run_harness failed for perplexity task");
let ppl_metrics = &ppl_report.tasks[0];
println!(
" Perplexity: {:.2}",
ppl_metrics.perplexity.unwrap_or(f64::NAN)
);
println!(
" Avg LL: {:.4}",
ppl_metrics.avg_log_likelihood.unwrap_or(f64::NAN)
);
println!("\n── 4. Multi-Task Report ──");
let mc_task = mock_hellaswag();
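    // Hand-built two-example sentiment classification task to pair with the HellaSwag mock.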
let mut cls_task = EvalTask::new("sentiment", TaskType::Classification);
cls_task.add_example(EvalExample {
context: "This movie was great".to_string(),
choices: vec![" positive".to_string(), " negative".to_string()],
gold_idx: Some(0),
reference: None,
});
cls_task.add_example(EvalExample {
context: "Terrible experience".to_string(),
choices: vec![" positive".to_string(), " negative".to_string()],
gold_idx: Some(1),
reference: None,
});
    let multi_config = HarnessConfig {
        tasks: vec![mc_task, cls_task],
        length_normalize: false,
        max_examples: 0,
    };
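    // A single scorer serves both tasks: keyword matches score 0.0, anything else -5.0.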
let multi_report = run_harness(&multi_config, |ctx, completion| {
let is_good_match = (ctx.contains("sandwich") && completion.contains("butter"))
|| (ctx.contains("cat") && completion.contains("purred"))
|| (ctx.contains("great") && completion.contains("positive"))
|| (ctx.contains("Terrible") && completion.contains("negative"));
if is_good_match {
0.0
} else {
-5.0
}
})
.expect("run_harness failed for multi-task evaluation");
println!(" Tasks evaluated: {}", multi_report.num_tasks);
println!(" Total examples: {}", multi_report.total_examples);
println!(
" Macro accuracy: {:.1}%",
multi_report.macro_accuracy * 100.0
);
for t in &multi_report.tasks {
println!(
" {}: acc={:.1}%",
t.task_name,
t.accuracy.unwrap_or(0.0) * 100.0
);
}
println!("\n=== Eval harness verified ===");
}