// aprender-core 0.50.0

use super::*;

// ============================================================================
// Scoring tests
// ============================================================================

/// The best raw score sits at index 0; both the index and the score come back.
#[test]
fn test_score_multiple_choice_basic() {
    let log_likelihoods: [f64; 4] = [-1.0, -2.0, -3.0, -4.0];
    let (best_idx, best_score) = score_multiple_choice(&log_likelihoods, false);

    assert_eq!(best_idx, 0);
    let deviation = (best_score - (-1.0)).abs();
    assert!(deviation < 1e-10);
}

/// The winner is located even when it is not the first entry.
#[test]
fn test_score_multiple_choice_second_best() {
    let candidates: [f64; 4] = [-5.0, -1.0, -3.0, -4.0];
    let winner = score_multiple_choice(&candidates, false).0;
    assert_eq!(winner, 1);
}

/// An empty score slice yields index 0 paired with negative infinity.
#[test]
fn test_score_multiple_choice_empty() {
    let outcome = score_multiple_choice(&[], false);
    assert_eq!(outcome.0, 0);
    assert_eq!(outcome.1, f64::NEG_INFINITY);
}

/// A lone candidate is trivially the winner and its score is passed through.
#[test]
fn test_score_multiple_choice_single() {
    let only: [f64; 1] = [-2.5];
    let (picked, picked_score) = score_multiple_choice(&only, false);

    assert_eq!(picked, 0);
    let deviation = (picked_score - (-2.5)).abs();
    assert!(deviation < 1e-10);
}

/// With a three-way tie the tie-break is unspecified here; the returned index
/// only has to point at one of the candidates.
#[test]
fn test_score_multiple_choice_equal() {
    let tied: [f64; 3] = [-2.0, -2.0, -2.0];
    let picked = score_multiple_choice(&tied, false).0;
    assert!(picked < tied.len());
}

// ============================================================================
// Perplexity tests
// ============================================================================

/// PPL = exp(-LL / N): a log-likelihood of -10 over 10 tokens gives exp(1).
#[test]
fn test_compute_perplexity_basic() {
    let expected = 1.0_f64.exp();
    let actual = compute_perplexity(-10.0, 10);
    assert!((actual - expected).abs() < 1e-10);
}

/// A token count of zero is reported as infinite perplexity.
#[test]
fn test_compute_perplexity_zero_tokens() {
    assert_eq!(compute_perplexity(-5.0, 0), f64::INFINITY);
}

/// A log-likelihood of zero gives exp(0) = 1.0 regardless of the token count.
#[test]
fn test_compute_perplexity_zero_ll() {
    let actual = compute_perplexity(0.0, 10);
    let deviation = (actual - 1.0).abs();
    assert!(deviation < 1e-10);
}

/// LL = -100 over 10 tokens corresponds to exp(10) ≈ 22026: large but finite.
#[test]
fn test_compute_perplexity_large_negative() {
    let actual = compute_perplexity(-100.0, 10);
    assert!(actual.is_finite());
    assert!(actual > 20000.0);
}

// ============================================================================
// Accuracy tests
// ============================================================================

/// Predictions identical to the gold labels score 1.0.
#[test]
fn test_compute_accuracy_perfect() {
    let labels: [usize; 4] = [0, 1, 2, 3];
    let acc = compute_accuracy(&labels, &labels);
    assert!((acc - 1.0).abs() < 1e-10);
}

/// Every prediction disagrees with its gold label, so accuracy is 0.0.
#[test]
fn test_compute_accuracy_zero() {
    let gold: [usize; 4] = [0, 1, 2, 3];
    let predicted: [usize; 4] = [1, 2, 3, 0];
    let acc = compute_accuracy(&predicted, &gold);
    assert!(acc.abs() < 1e-10);
}

/// Two of the four predictions match their gold label, so accuracy is 0.5.
#[test]
fn test_compute_accuracy_half() {
    let gold: [usize; 4] = [0, 1, 2, 3];
    let predicted: [usize; 4] = [0, 1, 0, 0];
    let acc = compute_accuracy(&predicted, &gold);
    assert!((acc - 0.5).abs() < 1e-10);
}

/// Empty inputs produce an accuracy of 0.0.
#[test]
fn test_compute_accuracy_empty() {
    let acc = compute_accuracy(&[], &[]);
    assert!(acc.abs() < 1e-10);
}

/// Slices of different lengths produce an accuracy of 0.0, even though the
/// overlapping prefix agrees.
#[test]
fn test_compute_accuracy_mismatched_len() {
    let predicted: [usize; 2] = [0, 1];
    let gold: [usize; 3] = [0, 1, 2];
    let acc = compute_accuracy(&predicted, &gold);
    assert!(acc.abs() < 1e-10);
}

// ============================================================================
// EvalTask tests
// ============================================================================

/// A freshly constructed task keeps its name and type, holds no examples and
/// uses zero few-shot examples.
#[test]
fn test_eval_task_new() {
    let fresh = EvalTask::new("test_task", TaskType::MultipleChoice);

    // Identity fields.
    assert_eq!(fresh.name, "test_task");
    assert_eq!(fresh.task_type, TaskType::MultipleChoice);

    // No examples yet.
    assert_eq!(fresh.len(), 0);
    assert!(fresh.is_empty());

    // Few-shot count starts at zero.
    assert_eq!(fresh.num_fewshot, 0);
}

/// Adding one example makes the task non-empty with a length of 1.
#[test]
fn test_eval_task_add_example() {
    let example = EvalExample {
        context: String::from("ctx"),
        choices: vec![String::from("a"), String::from("b")],
        gold_idx: Some(0),
        reference: None,
    };

    let mut task = EvalTask::new("t", TaskType::Classification);
    task.add_example(example);

    assert!(!task.is_empty());
    assert_eq!(task.len(), 1);
}

/// The `with_fewshot` builder stores the requested few-shot count.
#[test]
fn test_eval_task_with_fewshot() {
    let base = EvalTask::new("t", TaskType::MultipleChoice);
    let configured = base.with_fewshot(5);
    assert_eq!(configured.num_fewshot, 5);
}

// ============================================================================
// Mock data tests
// ============================================================================

/// The HellaSwag mock is a two-example multiple-choice task; the first example
/// has four choices and the gold indices are 0 and 1 respectively.
#[test]
fn test_mock_hellaswag() {
    let mock = mock_hellaswag();

    assert_eq!(mock.name, "hellaswag");
    assert_eq!(mock.task_type, TaskType::MultipleChoice);
    assert_eq!(mock.len(), 2);

    // Both indices are valid once the length check above has passed.
    let (first, second) = (&mock.examples[0], &mock.examples[1]);
    assert_eq!(first.choices.len(), 4);
    assert_eq!(first.gold_idx, Some(0));
    assert_eq!(second.gold_idx, Some(1));
}

// ============================================================================
// EvalReport tests
// ============================================================================

/// Two accuracy-bearing tasks: example counts are summed and the macro accuracy
/// is the unweighted mean of the two accuracies.
#[test]
fn test_eval_report_from_tasks() {
    let task_a = TaskMetrics {
        task_name: String::from("task_a"),
        task_type: TaskType::MultipleChoice,
        num_examples: 10,
        accuracy: Some(0.8),
        perplexity: None,
        avg_log_likelihood: None,
        predictions: Vec::new(),
    };
    let task_b = TaskMetrics {
        task_name: String::from("task_b"),
        task_type: TaskType::MultipleChoice,
        num_examples: 20,
        accuracy: Some(0.6),
        perplexity: None,
        avg_log_likelihood: None,
        predictions: Vec::new(),
    };

    let report = EvalReport::from_tasks(vec![task_a, task_b]);

    assert_eq!(report.num_tasks, 2);
    assert_eq!(report.total_examples, 30);
    // (0.8 + 0.6) / 2 — not weighted by the 10 / 20 example counts.
    let deviation = (report.macro_accuracy - 0.7).abs();
    assert!(deviation < 1e-10);
}

/// A report built from no tasks has zero counts and a macro accuracy of 0.0.
#[test]
fn test_eval_report_empty() {
    let no_metrics = Vec::new();
    let report = EvalReport::from_tasks(no_metrics);

    assert_eq!(report.total_examples, 0);
    assert_eq!(report.num_tasks, 0);
    assert!(report.macro_accuracy.abs() < 1e-10);
}

/// A report containing only a perplexity task (accuracy `None`) still counts
/// the task, and its macro accuracy is 0.0.
#[test]
fn test_eval_report_perplexity_only() {
    let ppl_only = TaskMetrics {
        task_name: String::from("ppl_task"),
        task_type: TaskType::Perplexity,
        num_examples: 5,
        accuracy: None,
        perplexity: Some(15.3),
        avg_log_likelihood: Some(-2.5),
        predictions: Vec::new(),
    };

    let report = EvalReport::from_tasks(vec![ppl_only]);

    assert_eq!(report.num_tasks, 1);
    // No task carries an accuracy, so macro_accuracy is 0.0.
    assert!(report.macro_accuracy.abs() < 1e-10);
}

// ============================================================================
// Harness config tests
// ============================================================================

/// The default configuration has no tasks, length normalization switched off
/// and `max_examples` set to 0.
#[test]
fn test_harness_config_default() {
    let HarnessConfig {
        tasks,
        length_normalize,
        max_examples,
    } = HarnessConfig::default();

    assert!(tasks.is_empty());
    assert!(!length_normalize);
    assert_eq!(max_examples, 0);
}

// ============================================================================
// run_harness tests
// ============================================================================

/// Running the harness with the default (task-less) configuration is an error.
#[test]
fn test_run_harness_empty_tasks() {
    let constant_scorer = |_: &str, _: &str| 0.0;
    let empty_config = HarnessConfig::default();
    assert!(run_harness(&empty_config, constant_scorer).is_err());
}

/// Smoke test: a multiple-choice task runs end to end and the resulting report
/// carries an accuracy for it.
#[test]
fn test_run_harness_multiple_choice() {
    let task = mock_hellaswag();
    let config = HarnessConfig {
        tasks: vec![task],
        length_normalize: false,
        max_examples: 0,
    };

    // Score function: the negated byte length of the completion, so SHORTER
    // completions get higher (less negative) scores. Which choice wins does not
    // matter here; only the shape of the report is asserted below.
    let report = run_harness(&config, |_ctx: &str, completion: &str| {
        -(completion.len() as f64)
    })
    .unwrap();

    assert_eq!(report.num_tasks, 1);
    assert_eq!(report.total_examples, 2);
    assert!(report.tasks[0].accuracy.is_some());
}

/// A scorer that recognizes both mock answers by keyword reaches accuracy 1.0.
#[test]
fn test_run_harness_perfect_scorer() {
    // Oracle: 0.0 (the highest, least negative score) for the keyword pairing
    // this test treats as correct, -10.0 for everything else.
    let oracle = |ctx: &str, completion: &str| -> f64 {
        let sandwich_hit = ctx.contains("sandwich") && completion.contains("butter");
        let cat_hit = ctx.contains("cat") && completion.contains("purred");
        if sandwich_hit || cat_hit {
            0.0
        } else {
            -10.0
        }
    };

    let config = HarnessConfig {
        tasks: vec![mock_hellaswag()],
        length_normalize: false,
        max_examples: 0,
    };
    let report = run_harness(&config, oracle).unwrap();

    let accuracy = report.tasks[0].accuracy.unwrap();
    assert!((accuracy - 1.0).abs() < 1e-10);
}

/// `max_examples` caps how many examples of a task are evaluated.
#[test]
fn test_run_harness_max_examples() {
    // Grow the mock task by ten two-choice examples so the cap of 3 is well
    // below the number available.
    let extras = (0..10).map(|i| EvalExample {
        context: format!("Context {}", i),
        choices: vec![String::from("a"), String::from("b")],
        gold_idx: Some(0),
        reference: None,
    });
    let mut task = mock_hellaswag();
    for example in extras {
        task.add_example(example);
    }

    let config = HarnessConfig {
        tasks: vec![task],
        length_normalize: false,
        max_examples: 3,
    };
    let constant_scorer = |_: &str, _: &str| -1.0;
    let report = run_harness(&config, constant_scorer).unwrap();

    assert_eq!(report.tasks[0].num_examples, 3);
}

/// A perplexity task yields a positive, finite perplexity together with an
/// average log-likelihood.
#[test]
fn test_run_harness_perplexity() {
    /// Fake log-likelihood: -0.5 per whitespace-delimited token of `text`.
    fn half_per_token(_ctx: &str, text: &str) -> f64 {
        -0.5 * text.split_whitespace().count() as f64
    }

    let with_reference = EvalExample {
        context: String::from("The quick brown fox"),
        choices: Vec::new(),
        gold_idx: None,
        reference: Some(String::from("The quick brown fox jumps over the lazy dog")),
    };
    // No reference text: per the original author's note the harness falls back
    // to `context` in this case (not verifiable from this test alone).
    let without_reference = EvalExample {
        context: String::from("Hello world"),
        choices: Vec::new(),
        gold_idx: None,
        reference: None,
    };

    let mut task = EvalTask::new("ppl_test", TaskType::Perplexity);
    task.add_example(with_reference);
    task.add_example(without_reference);

    let config = HarnessConfig {
        tasks: vec![task],
        length_normalize: false,
        max_examples: 0,
    };
    let report = run_harness(&config, half_per_token).unwrap();
    let metrics = &report.tasks[0];

    assert!(metrics.perplexity.is_some());
    let perplexity = metrics.perplexity.unwrap();
    assert!(perplexity > 0.0);
    assert!(perplexity.is_finite());
    assert!(metrics.avg_log_likelihood.is_some());
}

/// Generation tasks run without error but report neither accuracy nor
/// perplexity (placeholder metrics).
#[test]
fn test_run_harness_generation() {
    let prompt = EvalExample {
        context: String::from("Translate: hello"),
        choices: Vec::new(),
        gold_idx: None,
        reference: Some(String::from("hola")),
    };
    let mut task = EvalTask::new("gen_test", TaskType::Generation);
    task.add_example(prompt);

    let config = HarnessConfig {
        tasks: vec![task],
        length_normalize: false,
        max_examples: 0,
    };
    let constant_scorer = |_: &str, _: &str| 0.0;
    let report = run_harness(&config, constant_scorer).unwrap();

    let metrics = &report.tasks[0];
    assert!(metrics.accuracy.is_none());
    assert!(metrics.perplexity.is_none());
}

/// A multiple-choice task and a perplexity task can be evaluated in one run,
/// and both show up in the report's task count.
#[test]
fn test_run_harness_multi_task() {
    let mut perplexity_task = EvalTask::new("ppl", TaskType::Perplexity);
    perplexity_task.add_example(EvalExample {
        context: String::from("test text"),
        choices: Vec::new(),
        gold_idx: None,
        reference: None,
    });
    let choice_task = mock_hellaswag();

    let config = HarnessConfig {
        tasks: vec![choice_task, perplexity_task],
        length_normalize: false,
        max_examples: 0,
    };
    let constant_scorer = |_: &str, _: &str| -1.0;
    let report = run_harness(&config, constant_scorer).unwrap();

    assert_eq!(report.num_tasks, 2);
}

// ============================================================================
// Falsification tests
// ============================================================================

/// FALSIFY-EVAL-001: perplexity is strictly positive.
///
/// Sweeps a grid of log-likelihoods (including a positive one) against a range
/// of non-zero token counts.
#[test]
fn falsify_eval_001_perplexity_positive() {
    let log_likelihoods = [-100.0, -10.0, -1.0, 0.0, 1.0];
    let token_counts = [1, 5, 10, 100, 1000];

    for &ll in &log_likelihoods {
        for &tokens in &token_counts {
            let ppl = compute_perplexity(ll, tokens);
            let is_positive = ppl > 0.0;
            assert!(
                is_positive,
                "PPL must be > 0, got {} for ll={}, tokens={}",
                ppl,
                ll,
                tokens
            );
        }
    }
}

/// FALSIFY-EVAL-002: accuracy always lies in the closed interval [0.0, 1.0].
///
/// The fixtures cover all-correct, all-wrong, half-correct and single-element
/// prediction/gold pairs.
#[test]
fn falsify_eval_002_accuracy_bounded() {
    let fixtures: Vec<(Vec<usize>, Vec<usize>)> = vec![
        (vec![0, 0, 0, 0], vec![0, 0, 0, 0]),
        (vec![0, 1, 2, 3], vec![3, 2, 1, 0]),
        (vec![0, 1, 0, 1], vec![0, 0, 1, 1]),
        (vec![0], vec![0]),
    ];
    let unit_interval = 0.0..=1.0;

    for (preds, gold) in fixtures.iter() {
        let acc = compute_accuracy(preds, gold);
        let in_bounds = unit_interval.contains(&acc);
        assert!(
            in_bounds,
            "Accuracy must be in [0,1], got {} for {:?} vs {:?}",
            acc,
            preds,
            gold
        );
    }
}

/// FALSIFY-EVAL-003: report aggregation is consistent.
///
/// Mixes two accuracy-bearing tasks with one perplexity-only task and checks
/// that example counts are summed over all three, that all three are counted,
/// and that the macro accuracy averages only the tasks that have an accuracy.
#[test]
fn falsify_eval_003_report_consistency() {
    let multiple_choice = TaskMetrics {
        task_name: String::from("a"),
        task_type: TaskType::MultipleChoice,
        num_examples: 100,
        accuracy: Some(0.9),
        perplexity: None,
        avg_log_likelihood: None,
        predictions: Vec::new(),
    };
    let classification = TaskMetrics {
        task_name: String::from("b"),
        task_type: TaskType::Classification,
        num_examples: 50,
        accuracy: Some(0.7),
        perplexity: None,
        avg_log_likelihood: None,
        predictions: Vec::new(),
    };
    let perplexity_only = TaskMetrics {
        task_name: String::from("c"),
        task_type: TaskType::Perplexity,
        num_examples: 25,
        accuracy: None,
        perplexity: Some(10.0),
        avg_log_likelihood: Some(-2.3),
        predictions: Vec::new(),
    };

    let report = EvalReport::from_tasks(vec![multiple_choice, classification, perplexity_only]);

    // 100 + 50 + 25 examples across the three tasks.
    assert_eq!(report.total_examples, 175);
    // Every task is counted, with or without an accuracy.
    assert_eq!(report.num_tasks, 3);

    // Only "a" and "b" contribute to the mean; "c" has no accuracy.
    let expected_macro = (0.9 + 0.7) / 2.0;
    let deviation = (report.macro_accuracy - expected_macro).abs();
    assert!(
        deviation < 1e-10,
        "Macro accuracy should be mean of accuracy tasks only, got {}",
        report.macro_accuracy
    );
}

/// FALSIFY-EVAL-004: the `normalize` flag really switches to acc_norm
/// (length-normalized scoring), and doing so can FLIP the winner.
///
/// This is the acc vs acc_norm distinction used by HellaSwag / ARC / MMLU.
/// The fixture is built so that the SHORT completion wins on the raw summed
/// log-likelihood while the LONG completion wins per token:
///
/// - choice 0 (2 tokens): raw = -1.0, per-token = -0.50
/// - choice 1 (8 tokens): raw = -2.0, per-token = -0.25
///
/// Raw acc must therefore pick choice 0 and acc_norm must pick choice 1. If
/// `normalize` were ignored, both calls would return choice 0 and the
/// assertions below could not all hold.
#[test]
fn falsify_eval_004_normalize_flips_winner() {
    let raw_ll: [f64; 2] = [-1.0, -2.0]; // summed completion log-likelihoods
    let token_counts: [usize; 2] = [2, 8]; // completion token counts

    // Sanity: the fixture actually satisfies the flip preconditions.
    let per_token = |i: usize| raw_ll[i] / token_counts[i] as f64;
    assert!(raw_ll[0] > raw_ll[1], "short must have higher RAW total");
    assert!(
        per_token(1) > per_token(0),
        "long must have higher PER-TOKEN log-prob"
    );

    // acc (raw): the short completion wins on raw summed log-likelihood.
    let (acc_idx, acc_score) = score_multiple_choice_with_lengths(&raw_ll, &token_counts, false);
    assert_eq!(
        acc_idx, 0,
        "raw acc must pick the short completion (choice 0)"
    );
    let acc_deviation = (acc_score - (-1.0)).abs();
    assert!(
        acc_deviation < 1e-12,
        "returned score is always the RAW log-likelihood of the winner"
    );

    // acc_norm (length-normalized): the longer, higher-per-token choice wins.
    let (norm_idx, norm_score) = score_multiple_choice_with_lengths(&raw_ll, &token_counts, true);
    assert_eq!(
        norm_idx, 1,
        "acc_norm must pick the long completion (choice 1) after dividing by length"
    );
    // The score handed back is still the raw value, not the per-token one.
    let norm_deviation = (norm_score - (-2.0)).abs();
    assert!(
        norm_deviation < 1e-12,
        "acc_norm returns the winner's RAW score (-2.0), not its per-token (-0.25)"
    );

    // The core falsifiable claim: normalization changed the prediction.
    assert_ne!(
        acc_idx, norm_idx,
        "length-normalization MUST be able to flip the winner (acc vs acc_norm)"
    );
}

/// FALSIFY-EVAL-005: the raw acc path (`normalize == false`) has not regressed.
///
/// For each fixture, `score_multiple_choice(.., false)` and
/// `score_multiple_choice_with_lengths(.., false)` must both agree with a plain
/// argmax over the raw scores, whatever lengths are supplied.
#[test]
fn falsify_eval_005_raw_acc_no_regression() {
    use std::cmp::Ordering;

    let fixtures: [&[f64]; 5] = [
        &[-1.0, -2.0, -3.0, -4.0],
        &[-5.0, -1.0, -3.0, -4.0],
        &[-2.5],
        &[-2.0, -2.0, -2.0],
        &[-10.0, -0.5],
    ];

    // Incomparable pairs are treated as equal.
    let by_raw_score = |a: &f64, b: &f64| a.partial_cmp(b).unwrap_or(Ordering::Equal);

    for scores in fixtures {
        // Arbitrary, varied lengths (1, 2, 3, ...) that the raw path must ignore.
        let lengths: Vec<usize> = (1..=scores.len()).collect();

        // Reference argmax; `max_by` keeps the LAST of several equal maxima.
        let raw_argmax = scores
            .iter()
            .enumerate()
            .max_by(|lhs, rhs| by_raw_score(lhs.1, rhs.1))
            .map_or(0, |(i, _)| i);

        let wrapper_idx = score_multiple_choice(scores, false).0;
        let (lenfn_idx, lenfn_score) = score_multiple_choice_with_lengths(scores, &lengths, false);

        assert_eq!(
            wrapper_idx, raw_argmax,
            "score_multiple_choice(.., false) must equal raw argmax for {scores:?}"
        );
        assert_eq!(
            lenfn_idx, raw_argmax,
            "score_multiple_choice_with_lengths(.., false) must equal raw argmax for {scores:?}, \
             ignoring lengths"
        );
        // The returned score is the raw value stored at the winning index.
        let deviation = (lenfn_score - scores[raw_argmax]).abs();
        assert!(deviation < 1e-12);
    }
}

/// Through the length-less wrapper, `normalize == true` picks the same index
/// as `normalize == false`.
#[test]
fn test_score_multiple_choice_normalize_unit_lengths_is_identity() {
    let raw_ll: [f64; 3] = [-1.0, -2.0, -3.0];
    // The wrapper receives no lengths; per the original note it falls back to
    // unit lengths, which makes normalization a no-op.
    let normalized_pick = score_multiple_choice(&raw_ll, true).0;
    assert_eq!(normalized_pick, 0);
    let raw_pick = score_multiple_choice(&raw_ll, false).0;
    assert_eq!(raw_pick, 0);
}

/// Empty input gives `(0, -inf)` whether or not normalization is requested.
#[test]
fn test_score_multiple_choice_with_lengths_empty() {
    let normalized = score_multiple_choice_with_lengths(&[], &[], true);
    let raw = score_multiple_choice_with_lengths(&[], &[], false);

    assert_eq!(normalized.0, 0);
    assert_eq!(raw.0, 0);
    assert_eq!(normalized.1, f64::NEG_INFINITY);
    assert_eq!(raw.1, f64::NEG_INFINITY);
}

/// A zero-length completion must not derail normalization: the winner is
/// still choice 0 and the returned score is finite.
#[test]
fn test_score_multiple_choice_with_lengths_zero_len_guard() {
    let raw_ll: [f64; 2] = [-1.0, -2.0];
    // Choice 0 has length 0; the expectation below matches treating it as 1.
    let token_counts: [usize; 2] = [0, 1];

    let (picked, picked_score) = score_multiple_choice_with_lengths(&raw_ll, &token_counts, true);

    // With a length of 1 for both: -1.0 beats -2.0, so choice 0 wins.
    assert_eq!(picked, 0);
    assert!(picked_score.is_finite());
}

/// acc_norm at the harness level: switching `length_normalize` on flips the
/// prediction, and with it the accuracy, for the same task and scorer.
#[test]
fn test_run_harness_length_normalize_changes_prediction() {
    /// Fixed raw log-likelihood keyed on the completion's whitespace-token
    /// count: 2 tokens give -1.0 (-0.50/tok), 8 tokens give -2.0 (-0.25/tok).
    /// The short choice therefore wins raw and the long one wins per token.
    fn fixed_scorer(_ctx: &str, completion: &str) -> f64 {
        let tokens = completion.split_whitespace().count();
        match tokens {
            2 => -1.0,
            8 => -2.0,
            _ => f64::NEG_INFINITY,
        }
    }

    // A single multiple-choice example whose long completion is the gold answer.
    let example = EvalExample {
        context: String::from("ctx"),
        choices: vec![
            String::from("aa bb"),                   // 2 whitespace tokens
            String::from("cc dd ee ff gg hh ii jj"), // 8 whitespace tokens
        ],
        gold_idx: Some(1),
        reference: None,
    };
    let mut task = EvalTask::new("acc_norm_flip", TaskType::MultipleChoice);
    task.add_example(example);

    // Raw acc picks the short (wrong) completion, so accuracy is 0.0.
    let raw_cfg = HarnessConfig {
        tasks: vec![task.clone()],
        length_normalize: false,
        max_examples: 0,
    };
    let raw_report = run_harness(&raw_cfg, fixed_scorer).unwrap();
    let raw_metrics = &raw_report.tasks[0];
    assert_eq!(raw_metrics.predictions[0], 0);
    assert!(raw_metrics.accuracy.unwrap().abs() < 1e-12);

    // acc_norm picks the long (correct) completion, so accuracy is 1.0.
    let norm_cfg = HarnessConfig {
        tasks: vec![task],
        length_normalize: true,
        max_examples: 0,
    };
    let norm_report = run_harness(&norm_cfg, fixed_scorer).unwrap();
    let norm_metrics = &norm_report.tasks[0];
    assert_eq!(norm_metrics.predictions[0], 1);
    assert!((norm_metrics.accuracy.unwrap() - 1.0).abs() < 1e-12);
}