use super::*;
#[test]
fn test_score_multiple_choice_basic() {
let scores = vec![-1.0, -2.0, -3.0, -4.0];
let (idx, score) = score_multiple_choice(&scores, false);
assert_eq!(idx, 0);
assert!((score - (-1.0)).abs() < 1e-10);
}
#[test]
fn test_score_multiple_choice_second_best() {
let scores = vec![-5.0, -1.0, -3.0, -4.0];
let (idx, _) = score_multiple_choice(&scores, false);
assert_eq!(idx, 1);
}
#[test]
fn test_score_multiple_choice_empty() {
let (idx, score) = score_multiple_choice(&[], false);
assert_eq!(idx, 0);
assert!(score == f64::NEG_INFINITY);
}
#[test]
fn test_score_multiple_choice_single() {
let scores = vec![-2.5];
let (idx, score) = score_multiple_choice(&scores, false);
assert_eq!(idx, 0);
assert!((score - (-2.5)).abs() < 1e-10);
}
#[test]
fn test_score_multiple_choice_equal() {
let scores = vec![-2.0, -2.0, -2.0];
let (idx, _) = score_multiple_choice(&scores, false);
assert!(idx < 3);
}
#[test]
fn test_compute_perplexity_basic() {
let ppl = compute_perplexity(-10.0, 10);
assert!((ppl - 1.0_f64.exp()).abs() < 1e-10);
}
#[test]
fn test_compute_perplexity_zero_tokens() {
let ppl = compute_perplexity(-5.0, 0);
assert!(ppl == f64::INFINITY);
}
#[test]
fn test_compute_perplexity_zero_ll() {
let ppl = compute_perplexity(0.0, 10);
assert!((ppl - 1.0).abs() < 1e-10);
}
#[test]
fn test_compute_perplexity_large_negative() {
let ppl = compute_perplexity(-100.0, 10);
assert!(ppl > 20000.0);
assert!(ppl.is_finite());
}
#[test]
fn test_compute_accuracy_perfect() {
let preds = vec![0, 1, 2, 3];
let gold = vec![0, 1, 2, 3];
assert!((compute_accuracy(&preds, &gold) - 1.0).abs() < 1e-10);
}
#[test]
fn test_compute_accuracy_zero() {
let preds = vec![1, 2, 3, 0];
let gold = vec![0, 1, 2, 3];
assert!((compute_accuracy(&preds, &gold) - 0.0).abs() < 1e-10);
}
#[test]
fn test_compute_accuracy_half() {
let preds = vec![0, 1, 0, 0];
let gold = vec![0, 1, 2, 3];
assert!((compute_accuracy(&preds, &gold) - 0.5).abs() < 1e-10);
}
#[test]
fn test_compute_accuracy_empty() {
assert!((compute_accuracy(&[], &[]) - 0.0).abs() < 1e-10);
}
#[test]
fn test_compute_accuracy_mismatched_len() {
let preds = vec![0, 1];
let gold = vec![0, 1, 2];
assert!((compute_accuracy(&preds, &gold) - 0.0).abs() < 1e-10);
}
#[test]
fn test_eval_task_new() {
let task = EvalTask::new("test_task", TaskType::MultipleChoice);
assert_eq!(task.name, "test_task");
assert_eq!(task.task_type, TaskType::MultipleChoice);
assert!(task.is_empty());
assert_eq!(task.len(), 0);
assert_eq!(task.num_fewshot, 0);
}
#[test]
fn test_eval_task_add_example() {
let mut task = EvalTask::new("t", TaskType::Classification);
task.add_example(EvalExample {
context: "ctx".to_string(),
choices: vec!["a".to_string(), "b".to_string()],
gold_idx: Some(0),
reference: None,
});
assert_eq!(task.len(), 1);
assert!(!task.is_empty());
}
#[test]
fn test_eval_task_with_fewshot() {
let task = EvalTask::new("t", TaskType::MultipleChoice).with_fewshot(5);
assert_eq!(task.num_fewshot, 5);
}
#[test]
fn test_mock_hellaswag() {
let task = mock_hellaswag();
assert_eq!(task.name, "hellaswag");
assert_eq!(task.task_type, TaskType::MultipleChoice);
assert_eq!(task.len(), 2);
assert_eq!(task.examples[0].choices.len(), 4);
assert_eq!(task.examples[0].gold_idx, Some(0));
assert_eq!(task.examples[1].gold_idx, Some(1));
}
#[test]
fn test_eval_report_from_tasks() {
let metrics = vec![
TaskMetrics {
task_name: "task_a".to_string(),
task_type: TaskType::MultipleChoice,
num_examples: 10,
accuracy: Some(0.8),
perplexity: None,
avg_log_likelihood: None,
predictions: vec![],
},
TaskMetrics {
task_name: "task_b".to_string(),
task_type: TaskType::MultipleChoice,
num_examples: 20,
accuracy: Some(0.6),
perplexity: None,
avg_log_likelihood: None,
predictions: vec![],
},
];
let report = EvalReport::from_tasks(metrics);
assert_eq!(report.num_tasks, 2);
assert_eq!(report.total_examples, 30);
assert!((report.macro_accuracy - 0.7).abs() < 1e-10);
}
#[test]
fn test_eval_report_empty() {
let report = EvalReport::from_tasks(vec![]);
assert_eq!(report.num_tasks, 0);
assert_eq!(report.total_examples, 0);
assert!((report.macro_accuracy - 0.0).abs() < 1e-10);
}
#[test]
fn test_eval_report_perplexity_only() {
let metrics = vec![TaskMetrics {
task_name: "ppl_task".to_string(),
task_type: TaskType::Perplexity,
num_examples: 5,
accuracy: None,
perplexity: Some(15.3),
avg_log_likelihood: Some(-2.5),
predictions: vec![],
}];
let report = EvalReport::from_tasks(metrics);
assert_eq!(report.num_tasks, 1);
assert!((report.macro_accuracy - 0.0).abs() < 1e-10);
}
#[test]
fn test_harness_config_default() {
let cfg = HarnessConfig::default();
assert!(cfg.tasks.is_empty());
assert!(!cfg.length_normalize);
assert_eq!(cfg.max_examples, 0);
}
#[test]
fn test_run_harness_empty_tasks() {
let config = HarnessConfig::default();
let result = run_harness(&config, |_: &str, _: &str| 0.0);
assert!(result.is_err());
}
#[test]
fn test_run_harness_multiple_choice() {
let task = mock_hellaswag();
let config = HarnessConfig {
tasks: vec![task],
length_normalize: false,
max_examples: 0,
};
let report = run_harness(&config, |_ctx: &str, completion: &str| {
-(completion.len() as f64)
})
.unwrap();
assert_eq!(report.num_tasks, 1);
assert_eq!(report.total_examples, 2);
assert!(report.tasks[0].accuracy.is_some());
}
#[test]
fn test_run_harness_perfect_scorer() {
let task = mock_hellaswag();
let config = HarnessConfig {
tasks: vec![task],
length_normalize: false,
max_examples: 0,
};
let report = run_harness(&config, |ctx: &str, completion: &str| {
if (ctx.contains("sandwich") && completion.contains("butter"))
|| (ctx.contains("cat") && completion.contains("purred"))
{
0.0 } else {
-10.0
}
})
.unwrap();
assert!((report.tasks[0].accuracy.unwrap() - 1.0).abs() < 1e-10);
}
#[test]
fn test_run_harness_max_examples() {
let mut task = mock_hellaswag();
for i in 0..10 {
task.add_example(EvalExample {
context: format!("Context {}", i),
choices: vec!["a".to_string(), "b".to_string()],
gold_idx: Some(0),
reference: None,
});
}
let config = HarnessConfig {
tasks: vec![task],
length_normalize: false,
max_examples: 3,
};
let report = run_harness(&config, |_: &str, _: &str| -1.0).unwrap();
assert_eq!(report.tasks[0].num_examples, 3);
}
#[test]
fn test_run_harness_perplexity() {
let mut task = EvalTask::new("ppl_test", TaskType::Perplexity);
task.add_example(EvalExample {
context: "The quick brown fox".to_string(),
choices: vec![],
gold_idx: None,
reference: Some("The quick brown fox jumps over the lazy dog".to_string()),
});
task.add_example(EvalExample {
context: "Hello world".to_string(),
choices: vec![],
gold_idx: None,
reference: None, });
let config = HarnessConfig {
tasks: vec![task],
length_normalize: false,
max_examples: 0,
};
let report = run_harness(&config, |_: &str, text: &str| {
-(text.split_whitespace().count() as f64) * 0.5
})
.unwrap();
assert!(report.tasks[0].perplexity.is_some());
let ppl = report.tasks[0].perplexity.unwrap();
assert!(ppl > 0.0);
assert!(ppl.is_finite());
assert!(report.tasks[0].avg_log_likelihood.is_some());
}
#[test]
fn test_run_harness_generation() {
let mut task = EvalTask::new("gen_test", TaskType::Generation);
task.add_example(EvalExample {
context: "Translate: hello".to_string(),
choices: vec![],
gold_idx: None,
reference: Some("hola".to_string()),
});
let config = HarnessConfig {
tasks: vec![task],
length_normalize: false,
max_examples: 0,
};
let report = run_harness(&config, |_: &str, _: &str| 0.0).unwrap();
assert!(report.tasks[0].accuracy.is_none());
assert!(report.tasks[0].perplexity.is_none());
}
#[test]
fn test_run_harness_multi_task() {
let mc_task = mock_hellaswag();
let mut ppl_task = EvalTask::new("ppl", TaskType::Perplexity);
ppl_task.add_example(EvalExample {
context: "test text".to_string(),
choices: vec![],
gold_idx: None,
reference: None,
});
let config = HarnessConfig {
tasks: vec![mc_task, ppl_task],
length_normalize: false,
max_examples: 0,
};
let report = run_harness(&config, |_: &str, _: &str| -1.0).unwrap();
assert_eq!(report.num_tasks, 2);
}
#[test]
fn falsify_eval_001_perplexity_positive() {
for ll in [-100.0, -10.0, -1.0, 0.0, 1.0] {
for tokens in [1, 5, 10, 100, 1000] {
let ppl = compute_perplexity(ll, tokens);
assert!(
ppl > 0.0,
"PPL must be > 0, got {} for ll={}, tokens={}",
ppl,
ll,
tokens
);
}
}
}
#[test]
fn falsify_eval_002_accuracy_bounded() {
let test_cases: Vec<(Vec<usize>, Vec<usize>)> = vec![
(vec![0, 0, 0, 0], vec![0, 0, 0, 0]),
(vec![0, 1, 2, 3], vec![3, 2, 1, 0]),
(vec![0, 1, 0, 1], vec![0, 0, 1, 1]),
(vec![0], vec![0]),
];
for (preds, gold) in &test_cases {
let acc = compute_accuracy(preds, gold);
assert!(
(0.0..=1.0).contains(&acc),
"Accuracy must be in [0,1], got {} for {:?} vs {:?}",
acc,
preds,
gold
);
}
}
#[test]
fn falsify_eval_003_report_consistency() {
let metrics = vec![
TaskMetrics {
task_name: "a".to_string(),
task_type: TaskType::MultipleChoice,
num_examples: 100,
accuracy: Some(0.9),
perplexity: None,
avg_log_likelihood: None,
predictions: vec![],
},
TaskMetrics {
task_name: "b".to_string(),
task_type: TaskType::Classification,
num_examples: 50,
accuracy: Some(0.7),
perplexity: None,
avg_log_likelihood: None,
predictions: vec![],
},
TaskMetrics {
task_name: "c".to_string(),
task_type: TaskType::Perplexity,
num_examples: 25,
accuracy: None,
perplexity: Some(10.0),
avg_log_likelihood: Some(-2.3),
predictions: vec![],
},
];
let report = EvalReport::from_tasks(metrics);
assert_eq!(report.total_examples, 175);
assert_eq!(report.num_tasks, 3);
let expected_macro = (0.9 + 0.7) / 2.0;
assert!(
(report.macro_accuracy - expected_macro).abs() < 1e-10,
"Macro accuracy should be mean of accuracy tasks only, got {}",
report.macro_accuracy
);
}