use crate::error::{AprenderError, Result};
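
/// The kind of evaluation task, which determines how examples are scored.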
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TaskType {
    /// Pick the most likely continuation from a fixed set of choices.
    MultipleChoice,
    /// Score the log-likelihood/perplexity of a reference text.
    Perplexity,
    /// Free-form generation; not yet scored by this harness.
    Generation,
    /// Single-label classification, scored like multiple choice.
    Classification,
}
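
/// One evaluation example: a context, candidate choices, and optionally
/// a gold answer index and/or a reference text.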
#[derive(Debug, Clone)]
pub struct EvalExample {
    /// Prompt or prefix presented to the model.
    pub context: String,
    /// Candidate continuations (may be empty for perplexity tasks).
    pub choices: Vec<String>,
    /// Index of the correct choice, when known.
    pub gold_idx: Option<usize>,
    /// Reference text, preferred over the context by perplexity tasks.
    pub reference: Option<String>,
}
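
/// A named set of examples evaluated under one [`TaskType`].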
#[derive(Debug, Clone)]
pub struct EvalTask {
    /// Task identifier, e.g. `"hellaswag"`.
    pub name: String,
    pub task_type: TaskType,
    pub examples: Vec<EvalExample>,
    /// Requested few-shot exemplar count; stored for prompt construction
    /// by callers, not applied by this module.
    pub num_fewshot: usize,
}
impl EvalTask {
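    /// Creates an empty task with the given name and type.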
#[must_use]
pub fn new(name: impl Into<String>, task_type: TaskType) -> Self {
Self {
name: name.into(),
task_type,
examples: Vec::new(),
num_fewshot: 0,
}
}
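
    /// Appends an example to the task.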
pub fn add_example(&mut self, example: EvalExample) {
self.examples.push(example);
}
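
    /// Sets the few-shot exemplar count, builder style.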
#[must_use]
pub fn with_fewshot(mut self, n: usize) -> Self {
self.num_fewshot = n;
self
}
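
    /// Number of examples in the task.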
#[must_use]
pub fn len(&self) -> usize {
self.examples.len()
}
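
    /// Returns `true` if the task contains no examples.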
#[must_use]
pub fn is_empty(&self) -> bool {
self.examples.is_empty()
}
}
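
/// Per-choice log-likelihoods: `scores` holds raw values and
/// `normalized_scores` holds length-normalized values, index-aligned
/// with the choices they score.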
#[derive(Debug, Clone)]
pub struct LogLikelihoodScores {
pub scores: Vec<f64>,
pub normalized_scores: Vec<f64>,
}
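
/// Argmax over per-choice log-likelihood scores, returning
/// `(best index, best score)`, or `(0, f64::NEG_INFINITY)` on empty input.
///
/// The `_normalize` flag does not transform the scores here: pass
/// already length-normalized values (e.g.
/// [`LogLikelihoodScores::normalized_scores`]) when normalized selection
/// is wanted.
///
/// ```ignore
/// // Illustrative path; adjust to where this module lives in the crate.
/// use aprender::eval::score_multiple_choice;
///
/// let scores = [-2.3, -0.5, -4.1];
/// let (idx, best) = score_multiple_choice(&scores, false);
/// assert_eq!(idx, 1);
/// assert!((best - (-0.5)).abs() < f64::EPSILON);
/// ```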
#[must_use]
pub fn score_multiple_choice(scores: &[f64], _normalize: bool) -> (usize, f64) {
    if scores.is_empty() {
        return (0, f64::NEG_INFINITY);
    }
    scores
        .iter()
        .enumerate()
        // Treat NaN as equal so the comparison is total and never panics.
        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
        .map_or((0, f64::NEG_INFINITY), |(i, &s)| (i, s))
}
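
/// Perplexity from a total log-likelihood: `exp(-log_likelihood / num_tokens)`.
/// Returns `f64::INFINITY` when `num_tokens` is zero.
///
/// ```ignore
/// // Illustrative path; adjust to where this module lives in the crate.
/// use aprender::eval::compute_perplexity;
///
/// // exp(-(-6.0) / 3) = e^2, roughly 7.389
/// let ppl = compute_perplexity(-6.0, 3);
/// assert!((ppl - 2.0_f64.exp()).abs() < 1e-12);
/// ```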
#[must_use]
pub fn compute_perplexity(log_likelihood: f64, num_tokens: usize) -> f64 {
if num_tokens == 0 {
return f64::INFINITY;
}
(-log_likelihood / num_tokens as f64).exp()
}
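
/// Fraction of positions where `predictions[i] == gold[i]`.
/// Returns `0.0` for empty or length-mismatched inputs.
///
/// ```ignore
/// // Illustrative path; adjust to where this module lives in the crate.
/// use aprender::eval::compute_accuracy;
///
/// let acc = compute_accuracy(&[0, 1, 2, 2], &[0, 1, 1, 2]);
/// assert!((acc - 0.75).abs() < f64::EPSILON);
/// ```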
#[must_use]
pub fn compute_accuracy(predictions: &[usize], gold: &[usize]) -> f64 {
if predictions.is_empty() || predictions.len() != gold.len() {
return 0.0;
}
let correct = predictions
.iter()
.zip(gold.iter())
.filter(|(&p, &g)| p == g)
.count();
correct as f64 / predictions.len() as f64
}
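
/// Per-task results; metrics that do not apply to the task type are `None`.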
#[derive(Debug, Clone)]
pub struct TaskMetrics {
    pub task_name: String,
    pub task_type: TaskType,
    pub num_examples: usize,
    /// Choice accuracy; `None` when any example lacks a gold label.
    pub accuracy: Option<f64>,
    /// Corpus perplexity (perplexity tasks only).
    pub perplexity: Option<f64>,
    /// Mean per-example log-likelihood (perplexity tasks only).
    pub avg_log_likelihood: Option<f64>,
    /// Predicted choice index per example; empty for non-choice tasks.
    pub predictions: Vec<usize>,
}
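
/// Aggregate results across all evaluated tasks. `macro_accuracy` is the
/// unweighted mean of the per-task accuracies that are present.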
#[derive(Debug, Clone)]
pub struct EvalReport {
pub tasks: Vec<TaskMetrics>,
pub macro_accuracy: f64,
pub num_tasks: usize,
pub total_examples: usize,
}
impl EvalReport {
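    /// Builds a report from per-task metrics, macro-averaging accuracy
    /// over the tasks that report one.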
#[must_use]
pub fn from_tasks(tasks: Vec<TaskMetrics>) -> Self {
let num_tasks = tasks.len();
let total_examples: usize = tasks.iter().map(|t| t.num_examples).sum();
let accs: Vec<f64> = tasks.iter().filter_map(|t| t.accuracy).collect();
let macro_accuracy = if accs.is_empty() {
0.0
} else {
accs.iter().sum::<f64>() / accs.len() as f64
};
Self {
tasks,
macro_accuracy,
num_tasks,
total_examples,
}
}
}
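
/// Configuration for [`run_harness`].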
#[derive(Debug, Clone, Default)]
pub struct HarnessConfig {
    /// Tasks to evaluate.
    pub tasks: Vec<EvalTask>,
    /// Length-normalize each choice's log-likelihood before the argmax.
    pub length_normalize: bool,
    /// Per-task cap on examples; `0` means no cap.
    pub max_examples: usize,
}
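
/// Runs every configured task through `score_fn` and aggregates a report.
///
/// `score_fn(context, continuation)` must return the log-likelihood of the
/// continuation given the context; higher means more likely.
///
/// # Errors
///
/// Returns an error if `config.tasks` is empty.
///
/// A minimal sketch with a toy scorer (module path illustrative):
///
/// ```ignore
/// use aprender::eval::{mock_hellaswag, run_harness, HarnessConfig};
///
/// let config = HarnessConfig {
///     tasks: vec![mock_hellaswag()],
///     ..HarnessConfig::default()
/// };
/// // Toy scorer: pretend longer continuations are more likely.
/// let report = run_harness(&config, |_ctx, cont| cont.len() as f64)
///     .expect("task list is non-empty");
/// assert_eq!(report.num_tasks, 1);
/// assert_eq!(report.total_examples, 2);
/// ```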
pub fn run_harness<F>(config: &HarnessConfig, score_fn: F) -> Result<EvalReport>
where
F: Fn(&str, &str) -> f64,
{
if config.tasks.is_empty() {
return Err(AprenderError::FormatError {
message: "No tasks to evaluate".to_string(),
});
}
let mut all_metrics = Vec::new();
for task in &config.tasks {
let metrics = evaluate_task(task, &score_fn, config)?;
all_metrics.push(metrics);
}
Ok(EvalReport::from_tasks(all_metrics))
}
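
/// Evaluates one task, truncating to `config.max_examples` examples when
/// that cap is non-zero.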
fn evaluate_task<F>(task: &EvalTask, score_fn: &F, config: &HarnessConfig) -> Result<TaskMetrics>
where
F: Fn(&str, &str) -> f64,
{
let examples = if config.max_examples > 0 && config.max_examples < task.examples.len() {
&task.examples[..config.max_examples]
} else {
&task.examples
};
match task.task_type {
TaskType::MultipleChoice | TaskType::Classification => {
evaluate_multiple_choice(task, examples, score_fn, config.length_normalize)
}
TaskType::Perplexity => evaluate_perplexity(task, examples, score_fn),
        // Generation scoring is not implemented; emit empty metrics so the
        // report still counts the task and its examples.
        TaskType::Generation => Ok(TaskMetrics {
            task_name: task.name.clone(),
            task_type: task.task_type,
            num_examples: examples.len(),
            accuracy: None,
            perplexity: None,
            avg_log_likelihood: None,
            predictions: Vec::new(),
        }),
}
}
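
/// Scores every choice of every example with `score_fn` and takes the
/// argmax as the prediction, optionally length-normalizing first.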
fn evaluate_multiple_choice<F>(
    task: &EvalTask,
    examples: &[EvalExample],
    score_fn: &F,
    length_normalize: bool,
) -> Result<TaskMetrics>
where
    F: Fn(&str, &str) -> f64,
{
    let mut predictions = Vec::with_capacity(examples.len());
    let mut gold_labels = Vec::with_capacity(examples.len());
    for example in examples {
        let scores: Vec<f64> = example
            .choices
            .iter()
            .map(|choice| {
                let ll = score_fn(&example.context, choice);
                if length_normalize {
                    // Byte-length normalization: dividing by the choice's
                    // length keeps longer continuations from being penalized
                    // for accumulating more log-probability mass.
                    ll / choice.len().max(1) as f64
                } else {
                    ll
                }
            })
            .collect();
        let (pred_idx, _) = score_multiple_choice(&scores, length_normalize);
        predictions.push(pred_idx);
        if let Some(gold) = example.gold_idx {
            gold_labels.push(gold);
        }
    }
    // Accuracy is reported only when every example carried a gold label;
    // otherwise the prediction and gold vectors are misaligned.
    let accuracy = if gold_labels.len() == predictions.len() {
        Some(compute_accuracy(&predictions, &gold_labels))
    } else {
        None
    };
Ok(TaskMetrics {
task_name: task.name.clone(),
task_type: task.task_type,
num_examples: examples.len(),
accuracy,
perplexity: None,
avg_log_likelihood: None,
predictions,
})
}
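
/// Sums log-likelihood over each example's reference text (falling back
/// to its context) and reports corpus perplexity over a whitespace
/// token-count proxy.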
fn evaluate_perplexity<F>(
task: &EvalTask,
examples: &[EvalExample],
score_fn: &F,
) -> Result<TaskMetrics>
where
F: Fn(&str, &str) -> f64,
{
    let mut total_ll = 0.0;
    let mut total_tokens = 0usize;
    for example in examples {
        // Score the reference text when present, else the context itself.
        let text = example.reference.as_deref().unwrap_or(&example.context);
        // Empty prefix: the scorer sees the full text unconditionally.
        let ll = score_fn("", text);
        // Whitespace splitting is a crude token-count proxy; max(1)
        // guards the perplexity denominator against empty strings.
        let tokens = text.split_whitespace().count().max(1);
        total_ll += ll;
        total_tokens += tokens;
    }
let ppl = compute_perplexity(total_ll, total_tokens);
let avg_ll = if examples.is_empty() {
0.0
} else {
total_ll / examples.len() as f64
};
Ok(TaskMetrics {
task_name: task.name.clone(),
task_type: task.task_type,
num_examples: examples.len(),
accuracy: None,
perplexity: Some(ppl),
avg_log_likelihood: Some(avg_ll),
predictions: Vec::new(),
})
}
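
/// Tiny hand-written HellaSwag-style task (two examples) for smoke tests
/// and doc examples.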
#[must_use]
pub fn mock_hellaswag() -> EvalTask {
let mut task = EvalTask::new("hellaswag", TaskType::MultipleChoice);
task.add_example(EvalExample {
context: "A person is making a sandwich. They".to_string(),
choices: vec![
" spread butter on the bread.".to_string(),
" flew into space.".to_string(),
" turned into a tree.".to_string(),
" dissolved into nothing.".to_string(),
],
gold_idx: Some(0),
reference: None,
});
task.add_example(EvalExample {
context: "The cat sat on the".to_string(),
choices: vec![
" ceiling fan.".to_string(),
" mat and purred.".to_string(),
" quantum vacuum.".to_string(),
" surface of the sun.".to_string(),
],
gold_idx: Some(1),
reference: None,
});
task
}
#[cfg(test)]
#[path = "eval_harness_tests.rs"]
mod tests;