use crate::error::{AprenderError, Result};
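
/// The kind of evaluation task, which determines how examples are scored.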
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskType {
    /// Pick the most likely continuation from a fixed set of choices.
    MultipleChoice,
    /// Score the log-likelihood/perplexity of a reference text.
    Perplexity,
    /// Free-form generation; not yet scored by this harness.
    Generation,
    /// Single-label classification, scored like multiple choice.
    Classification,
}
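
/// One evaluation example: a context, candidate choices, and optionally
/// a gold answer index and/or a reference text.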
#[derive(Debug, Clone)]
pub struct EvalExample {
    /// Prompt or prefix presented to the model.
    pub context: String,
    /// Candidate continuations (may be empty for perplexity tasks).
    pub choices: Vec<String>,
    /// Index of the correct choice, when known.
    pub gold_idx: Option<usize>,
    /// Reference text, preferred over the context by perplexity tasks.
    pub reference: Option<String>,
}
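
/// A named set of examples evaluated under one [`TaskType`].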
#[derive(Debug, Clone)]
pub struct EvalTask {
    /// Task identifier, e.g. `"hellaswag"`.
    pub name: String,
    pub task_type: TaskType,
    pub examples: Vec<EvalExample>,
    /// Requested few-shot exemplar count; stored for prompt construction
    /// by callers, not applied by this module.
    pub num_fewshot: usize,
}
impl EvalTask {
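    /// Creates an empty task with the given name and type.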
#[must_use]
pub fn new(name: impl Into<String>, task_type: TaskType) -> Self {
Self {
name: name.into(),
task_type,
examples: Vec::new(),
num_fewshot: 0,
}
}
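
    /// Appends an example to the task.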
pub fn add_example(&mut self, example: EvalExample) {
self.examples.push(example);
}
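
    /// Sets the few-shot exemplar count, builder style.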
#[must_use]
pub fn with_fewshot(mut self, n: usize) -> Self {
self.num_fewshot = n;
self
}
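
    /// Number of examples in the task.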
#[must_use]
pub fn len(&self) -> usize {
self.examples.len()
}
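
    /// Returns `true` if the task contains no examples.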
#[must_use]
pub fn is_empty(&self) -> bool {
self.examples.is_empty()
}
}
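
/// Per-choice log-likelihoods: `scores` holds raw values and
/// `normalized_scores` holds length-normalized values, index-aligned
/// with the choices they score.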
#[derive(Debug, Clone)]
pub struct LogLikelihoodScores {
pub scores: Vec<f64>,
pub normalized_scores: Vec<f64>,
}
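
/// Argmax over per-choice log-likelihood scores, returning
/// `(best index, best score)`, or `(0, f64::NEG_INFINITY)` on empty input.
///
/// The `_normalize` flag does not transform the scores here: pass
/// already length-normalized values (e.g.
/// [`LogLikelihoodScores::normalized_scores`]) when normalized selection
/// is wanted.
///
/// ```ignore
/// // Illustrative path; adjust to where this module lives in the crate.
/// use aprender::eval::score_multiple_choice;
///
/// let scores = [-2.3, -0.5, -4.1];
/// let (idx, best) = score_multiple_choice(&scores, false);
/// assert_eq!(idx, 1);
/// assert!((best - (-0.5)).abs() < f64::EPSILON);
/// ```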
#[must_use]
pub fn score_multiple_choice(scores: &[f64], _normalize: bool) -> (usize, f64) {
    if scores.is_empty() {
        return (0, f64::NEG_INFINITY);
    }
    scores
        .iter()
        .enumerate()
        // Treat NaN as equal so the comparison is total and never panics.
        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
        .map_or((0, f64::NEG_INFINITY), |(i, &s)| (i, s))
}
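
/// Perplexity from a total log-likelihood: `exp(-log_likelihood / num_tokens)`.
/// Returns `f64::INFINITY` when `num_tokens` is zero.
///
/// ```ignore
/// // Illustrative path; adjust to where this module lives in the crate.
/// use aprender::eval::compute_perplexity;
///
/// // exp(-(-6.0) / 3) = e^2, roughly 7.389
/// let ppl = compute_perplexity(-6.0, 3);
/// assert!((ppl - 2.0_f64.exp()).abs() < 1e-12);
/// ```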
#[must_use]
pub fn compute_perplexity(log_likelihood: f64, num_tokens: usize) -> f64 {
if num_tokens == 0 {
return f64::INFINITY;
}
(-log_likelihood / num_tokens as f64).exp()
}
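
/// Fraction of positions where `predictions[i] == gold[i]`.
/// Returns `0.0` for empty or length-mismatched inputs.
///
/// ```ignore
/// // Illustrative path; adjust to where this module lives in the crate.
/// use aprender::eval::compute_accuracy;
///
/// let acc = compute_accuracy(&[0, 1, 2, 2], &[0, 1, 1, 2]);
/// assert!((acc - 0.75).abs() < f64::EPSILON);
/// ```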
#[must_use]
pub fn compute_accuracy(predictions: &[usize], gold: &[usize]) -> f64 {
if predictions.is_empty() || predictions.len() != gold.len() {
return 0.0;
}
let correct = predictions
.iter()
.zip(gold.iter())
.filter(|(&p, &g)| p == g)
.count();
correct as f64 / predictions.len() as f64
}
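
/// Per-task results; metrics that do not apply to the task type are `None`.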
#[derive(Debug, Clone)]
pub struct TaskMetrics {
    pub task_name: String,
    pub task_type: TaskType,
    pub num_examples: usize,
    /// Choice accuracy; `None` when any example lacks a gold label.
    pub accuracy: Option<f64>,
    /// Corpus perplexity (perplexity tasks only).
    pub perplexity: Option<f64>,
    /// Mean per-example log-likelihood (perplexity tasks only).
    pub avg_log_likelihood: Option<f64>,
    /// Predicted choice index per example; empty for non-choice tasks.
    pub predictions: Vec<usize>,
}
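
/// Aggregate results across all evaluated tasks. `macro_accuracy` is the
/// unweighted mean of the per-task accuracies that are present.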
#[derive(Debug, Clone)]
pub struct EvalReport {
pub tasks: Vec<TaskMetrics>,
pub macro_accuracy: f64,
pub num_tasks: usize,
pub total_examples: usize,
}
impl EvalReport {
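    /// Builds a report from per-task metrics, macro-averaging accuracy
    /// over the tasks that report one.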
#[must_use]
pub fn from_tasks(tasks: Vec<TaskMetrics>) -> Self {
let num_tasks = tasks.len();
let total_examples: usize = tasks.iter().map(|t| t.num_examples).sum();
let accs: Vec<f64> = tasks.iter().filter_map(|t| t.accuracy).collect();
let macro_accuracy = if accs.is_empty() {
0.0
} else {
accs.iter().sum::<f64>() / accs.len() as f64
};
Self {
tasks,
macro_accuracy,
num_tasks,
total_examples,
}
}
}
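
/// Configuration for [`run_harness`].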
#[derive(Debug, Clone, Default)]
pub struct HarnessConfig {
    /// Tasks to evaluate.
    pub tasks: Vec<EvalTask>,
    /// Length-normalize each choice's log-likelihood before the argmax.
    pub length_normalize: bool,
    /// Per-task cap on examples; `0` means no cap.
    pub max_examples: usize,
}
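
/// Runs every configured task through `score_fn` and aggregates a report.
///
/// `score_fn(context, continuation)` must return the log-likelihood of the
/// continuation given the context; higher means more likely.
///
/// # Errors
///
/// Returns an error if `config.tasks` is empty.
///
/// A minimal sketch with a toy scorer (module path illustrative):
///
/// ```ignore
/// use aprender::eval::{mock_hellaswag, run_harness, HarnessConfig};
///
/// let config = HarnessConfig {
///     tasks: vec![mock_hellaswag()],
///     ..HarnessConfig::default()
/// };
/// // Toy scorer: pretend longer continuations are more likely.
/// let report = run_harness(&config, |_ctx, cont| cont.len() as f64)
///     .expect("task list is non-empty");
/// assert_eq!(report.num_tasks, 1);
/// assert_eq!(report.total_examples, 2);
/// ```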
pub fn run_harness<F>(config: &HarnessConfig, score_fn: F) -> Result<EvalReport>
where
F: Fn(&str, &str) -> f64,
{
if config.tasks.is_empty() {
return Err(AprenderError::FormatError {
message: "No tasks to evaluate".to_string(),
});
}
let mut all_metrics = Vec::new();
for task in &config.tasks {
let metrics = evaluate_task(task, &score_fn, config)?;
all_metrics.push(metrics);
}
Ok(EvalReport::from_tasks(all_metrics))
}
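
/// Evaluates one task, truncating to `config.max_examples` examples when
/// that cap is non-zero.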
fn evaluate_task<F>(task: &EvalTask, score_fn: &F, config: &HarnessConfig) -> Result<TaskMetrics>
where
F: Fn(&str, &str) -> f64,
{
let examples = if config.max_examples > 0 && config.max_examples < task.examples.len() {
&task.examples[..config.max_examples]
} else {
&task.examples
};
match task.task_type {
TaskType::MultipleChoice | TaskType::Classification => {
evaluate_multiple_choice(task, examples, score_fn, config.length_normalize)
}
TaskType::Perplexity => evaluate_perplexity(task, examples, score_fn),
        // Generation scoring is not implemented; emit empty metrics so the
        // report still counts the task and its examples.
        TaskType::Generation => Ok(TaskMetrics {
            task_name: task.name.clone(),
            task_type: task.task_type,
            num_examples: examples.len(),
            accuracy: None,
            perplexity: None,
            avg_log_likelihood: None,
            predictions: Vec::new(),
        }),
}
}
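
/// Scores every choice of every example with `score_fn` and takes the
/// argmax as the prediction, optionally length-normalizing first.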
fn evaluate_multiple_choice<F>(
    task: &EvalTask,
    examples: &[EvalExample],
    score_fn: &F,
    length_normalize: bool,
) -> Result<TaskMetrics>
where
    F: Fn(&str, &str) -> f64,
{
    let mut predictions = Vec::with_capacity(examples.len());
    let mut gold_labels = Vec::with_capacity(examples.len());
    for example in examples {
        let scores: Vec<f64> = example
            .choices
            .iter()
            .map(|choice| {
                let ll = score_fn(&example.context, choice);
                if length_normalize {
                    // Byte-length normalization: dividing by the choice's
                    // length keeps longer continuations from being penalized
                    // for accumulating more log-probability mass.
                    ll / choice.len().max(1) as f64
                } else {
                    ll
                }
            })
            .collect();
        let (pred_idx, _) = score_multiple_choice(&scores, length_normalize);
        predictions.push(pred_idx);
        if let Some(gold) = example.gold_idx {
            gold_labels.push(gold);
        }
    }
    // Accuracy is reported only when every example carried a gold label;
    // otherwise the prediction and gold vectors are misaligned.
    let accuracy = if gold_labels.len() == predictions.len() {
        Some(compute_accuracy(&predictions, &gold_labels))
    } else {
        None
    };
Ok(TaskMetrics {
task_name: task.name.clone(),
task_type: task.task_type,
num_examples: examples.len(),
accuracy,
perplexity: None,
avg_log_likelihood: None,
predictions,
})
}
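
/// Sums log-likelihood over each example's reference text (falling back
/// to its context) and reports corpus perplexity over a whitespace
/// token-count proxy.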
fn evaluate_perplexity<F>(
task: &EvalTask,
examples: &[EvalExample],
score_fn: &F,
) -> Result<TaskMetrics>
where
F: Fn(&str, &str) -> f64,
{
    let mut total_ll = 0.0;
    let mut total_tokens = 0usize;
    for example in examples {
        // Score the reference text when present, else the context itself.
        let text = example.reference.as_deref().unwrap_or(&example.context);
        // Empty prefix: the scorer sees the full text unconditionally.
        let ll = score_fn("", text);
        // Whitespace splitting is a crude token-count proxy; max(1)
        // guards the perplexity denominator against empty strings.
        let tokens = text.split_whitespace().count().max(1);
        total_ll += ll;
        total_tokens += tokens;
    }
let ppl = compute_perplexity(total_ll, total_tokens);
let avg_ll = if examples.is_empty() {
0.0
} else {
total_ll / examples.len() as f64
};
Ok(TaskMetrics {
task_name: task.name.clone(),
task_type: task.task_type,
num_examples: examples.len(),
accuracy: None,
perplexity: Some(ppl),
avg_log_likelihood: Some(avg_ll),
predictions: Vec::new(),
})
}
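
/// Tiny hand-written HellaSwag-style task (two examples) for smoke tests
/// and doc examples.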
#[must_use]
pub fn mock_hellaswag() -> EvalTask {
let mut task = EvalTask::new("hellaswag", TaskType::MultipleChoice);
task.add_example(EvalExample {
context: "A person is making a sandwich. They".to_string(),
choices: vec![
" spread butter on the bread.".to_string(),
" flew into space.".to_string(),
" turned into a tree.".to_string(),
" dissolved into nothing.".to_string(),
],
gold_idx: Some(0),
reference: None,
});
task.add_example(EvalExample {
context: "The cat sat on the".to_string(),
choices: vec![
" ceiling fan.".to_string(),
" mat and purred.".to_string(),
" quantum vacuum.".to_string(),
" surface of the sun.".to_string(),
],
gold_idx: Some(1),
reference: None,
});
task
}
#[cfg(test)]
#[path = "eval_harness_tests.rs"]
mod tests;