rusty-llm-jury 0.1.0

//! Bias correction functionality for LLM judge evaluation.

use crate::error::{JudgyError, Result};
use rand::prelude::*;
use serde::{Deserialize, Serialize};

/// Result of bias-corrected estimation.
///
/// Returned by `estimate_success_rate`. Bundles the corrected point estimate,
/// its bootstrap confidence interval, and the diagnostics used to derive them.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct EstimationResult {
    /// Bias-corrected point estimate of the true success rate, clamped to `[0, 1]`
    pub theta_hat: f64,
    /// Lower bound of the bootstrap confidence interval for `theta_hat`
    pub lower_bound: f64,
    /// Upper bound of the bootstrap confidence interval for `theta_hat`
    pub upper_bound: f64,
    /// Confidence level the interval was requested at (strictly between 0 and 1)
    pub confidence_level: f64,
    /// Number of bootstrap iterations requested; resamples that cannot be
    /// evaluated are skipped, so fewer may contribute to the interval
    pub bootstrap_iterations: usize,
    /// Judge performance metrics measured on the labeled test set
    pub judge_metrics: JudgeMetrics,
    /// Uncorrected pass rate: the mean of the judge's 0/1 predictions on the
    /// unlabeled data
    pub raw_pass_rate: f64,
}

/// Judge performance metrics.
///
/// All rates are fractions in `[0, 1]` derived from the confusion matrix of
/// judge predictions against human labels (1 = Pass, 0 = Fail).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct JudgeMetrics {
    /// True Positive Rate (Sensitivity): TP / (TP + FN)
    pub tpr: f64,
    /// True Negative Rate (Specificity): TN / (TN + FP)
    pub tnr: f64,
    /// False Positive Rate: FP / (TN + FP)
    pub fpr: f64,
    /// False Negative Rate: FN / (TP + FN)
    pub fnr: f64,
    /// Overall accuracy: (TP + TN) / total number of examples
    pub accuracy: f64,
}

impl JudgeMetrics {
    /// Calculate judge metrics from labeled test data.
    ///
    /// Builds the confusion matrix of `test_preds` (judge output) against
    /// `test_labels` (ground truth) and derives the rates from it.
    ///
    /// # Arguments
    ///
    /// * `test_labels` - Human labels (1 = Pass, 0 = Fail)
    /// * `test_preds` - Judge predictions for the same examples (1 = Pass, 0 = Fail)
    ///
    /// # Errors
    ///
    /// Returns an input-validation error when:
    /// * the two slices differ in length,
    /// * any label or prediction is not 0 or 1,
    /// * the labels do not contain at least one positive and one negative
    ///   example (TPR or TNR would be undefined).
    pub fn from_test_data(test_labels: &[u8], test_preds: &[u8]) -> Result<Self> {
        if test_labels.len() != test_preds.len() {
            return Err(JudgyError::input_validation(format!(
                "test_labels and test_preds must have the same length. Got {} and {}",
                test_labels.len(),
                test_preds.len()
            )));
        }

        // Counters are explicitly `usize`: untyped literals would infer to
        // `i32`, which can overflow when counting a very large slice.
        let mut tp: usize = 0; // True Positives
        let mut fp: usize = 0; // False Positives
        let mut tn: usize = 0; // True Negatives
        let mut fn_: usize = 0; // False Negatives

        for (&label, &pred) in test_labels.iter().zip(test_preds.iter()) {
            match (label, pred) {
                (1, 1) => tp += 1,
                (0, 1) => fp += 1,
                (0, 0) => tn += 1,
                (1, 0) => fn_ += 1,
                _ => {
                    return Err(JudgyError::input_validation(
                        "Labels and predictions must be 0 or 1".to_string(),
                    ))
                }
            }
        }

        let positive_count = tp + fn_;
        let negative_count = tn + fp;

        // Both classes are required; otherwise a rate below divides by zero.
        // This also rejects empty input.
        if positive_count == 0 || negative_count == 0 {
            return Err(JudgyError::input_validation(
                "test_labels must contain both positive and negative examples".to_string(),
            ));
        }

        let tpr = tp as f64 / positive_count as f64;
        let tnr = tn as f64 / negative_count as f64;
        let fpr = fp as f64 / negative_count as f64;
        let fnr = fn_ as f64 / positive_count as f64;
        let accuracy = (tp + tn) as f64 / test_labels.len() as f64;

        Ok(JudgeMetrics {
            tpr,
            tnr,
            fpr,
            fnr,
            accuracy,
        })
    }
}

/// Estimate the true success rate from biased LLM-judge verdicts.
///
/// The judge's sensitivity (TPR) and specificity (TNR) are measured on a
/// human-labeled test set. The pass rate the judge reports on unlabeled data is
/// then adjusted with the Rogan-Gladen correction,
/// `(observed + TNR - 1) / (TPR + TNR - 1)`, and clamped to `[0, 1]`.
/// A bootstrap over the test set supplies the confidence interval.
///
/// # Arguments
///
/// * `test_labels` - Human labels on test set (1 = Pass, 0 = Fail)
/// * `test_preds` - Judge predictions on test set (1 = Pass, 0 = Fail)
/// * `unlabeled_preds` - Judge predictions on unlabeled data (1 = Pass, 0 = Fail)
/// * `bootstrap_iterations` - Number of bootstrap iterations used for the interval
/// * `confidence_level` - Confidence level for the interval (strictly between 0 and 1)
///
/// # Returns
///
/// An `EstimationResult` holding the corrected point estimate, the interval
/// bounds, the judge metrics, and the raw observed pass rate.
///
/// # Errors
///
/// * `JudgyError::InputValidation` - If inputs are empty, mismatched in length,
///   non-binary, or the numeric parameters are out of range
/// * `JudgyError::JudgeAccuracyTooLow` - If the judge is no better than chance (TPR + TNR <= 1)
/// * `JudgyError::Bootstrap` - If no bootstrap resample yields a usable estimate
///
/// # Example
///
/// ```rust
/// use llmjury::estimate_success_rate;
///
/// let test_labels = vec![1, 1, 0, 0, 1, 0];
/// let test_preds = vec![1, 0, 0, 1, 1, 0];
/// let unlabeled_preds = vec![1, 1, 0, 1, 0];
///
/// let result = estimate_success_rate(
///     &test_labels,
///     &test_preds,
///     &unlabeled_preds,
///     1000,
///     0.95
/// ).unwrap();
///
/// println!("Estimated pass rate: {:.3}", result.theta_hat);
/// println!("95% CI: [{:.3}, {:.3}]", result.lower_bound, result.upper_bound);
/// ```
pub fn estimate_success_rate(
    test_labels: &[u8],
    test_preds: &[u8],
    unlabeled_preds: &[u8],
    bootstrap_iterations: usize,
    confidence_level: f64,
) -> Result<EstimationResult> {
    // Reject malformed input before doing any arithmetic.
    validate_inputs(
        test_labels,
        test_preds,
        unlabeled_preds,
        confidence_level,
        bootstrap_iterations,
    )?;

    // Measure how the judge performs against the human labels.
    let metrics = JudgeMetrics::from_test_data(test_labels, test_preds)?;

    // The correction divides by (TPR + TNR - 1), so that must be positive.
    let sensitivity_plus_specificity = metrics.tpr + metrics.tnr;
    if sensitivity_plus_specificity <= 1.0 {
        return Err(JudgyError::JudgeAccuracyTooLow {
            tpr_plus_tnr: sensitivity_plus_specificity,
        });
    }

    // Share of unlabeled items the judge marked as passing.
    let pass_total: f64 = unlabeled_preds.iter().copied().map(f64::from).sum();
    let observed = pass_total / unlabeled_preds.len() as f64;

    // Rogan-Gladen corrected point estimate, kept inside the valid range.
    let correction_scale = sensitivity_plus_specificity - 1.0;
    let corrected = ((observed + metrics.tnr - 1.0) / correction_scale).clamp(0.0, 1.0);

    // Interval from resampling the labeled test set.
    let (ci_low, ci_high) = bootstrap_confidence_interval(
        test_labels,
        test_preds,
        observed,
        bootstrap_iterations,
        confidence_level,
    )?;

    Ok(EstimationResult {
        theta_hat: corrected,
        lower_bound: ci_low,
        upper_bound: ci_high,
        confidence_level,
        bootstrap_iterations,
        judge_metrics: metrics,
        raw_pass_rate: observed,
    })
}

/// Check every argument of `estimate_success_rate` before any computation runs.
///
/// Checks run in a fixed order and the first failure is reported:
/// matching test-set lengths, non-empty test set, non-empty unlabeled set,
/// `confidence_level` strictly inside `(0, 1)` (NaN is rejected),
/// a positive `bootstrap_iterations`, and finally that `test_labels`,
/// `test_preds` and `unlabeled_preds` (in that order) contain only 0 or 1.
///
/// # Errors
///
/// Returns an input-validation error describing the first violated rule.
fn validate_inputs(
    test_labels: &[u8],
    test_preds: &[u8],
    unlabeled_preds: &[u8],
    confidence_level: f64,
    bootstrap_iterations: usize,
) -> Result<()> {
    /// Position and value of the first entry that is neither 0 nor 1, if any.
    fn first_non_binary(values: &[u8]) -> Option<(usize, u8)> {
        values
            .iter()
            .copied()
            .enumerate()
            .find(|&(_, value)| value > 1)
    }

    let (label_count, pred_count) = (test_labels.len(), test_preds.len());
    if label_count != pred_count {
        return Err(JudgyError::input_validation(format!(
            "test_labels and test_preds must have the same length. Got {} and {}",
            label_count, pred_count
        )));
    }

    if label_count == 0 {
        return Err(JudgyError::input_validation(String::from(
            "test_labels cannot be empty",
        )));
    }

    if unlabeled_preds.is_empty() {
        return Err(JudgyError::input_validation(String::from(
            "unlabeled_preds cannot be empty",
        )));
    }

    // Written as a positive test and then negated so that NaN fails it.
    let level_is_valid = confidence_level > 0.0 && confidence_level < 1.0;
    if !level_is_valid {
        return Err(JudgyError::input_validation(String::from(
            "confidence_level must be between 0 and 1 (exclusive)",
        )));
    }

    if bootstrap_iterations < 1 {
        return Err(JudgyError::input_validation(String::from(
            "bootstrap_iterations must be positive",
        )));
    }

    // Binary check, one input at a time, reporting the first offending entry.
    let named_inputs = [
        ("test_labels", test_labels),
        ("test_preds", test_preds),
        ("unlabeled_preds", unlabeled_preds),
    ];
    for (name, values) in named_inputs.iter() {
        if let Some((index, value)) = first_non_binary(values) {
            return Err(JudgyError::input_validation(format!(
                "{}[{}] = {} is not 0 or 1",
                name, index, value
            )));
        }
    }

    Ok(())
}

/// Compute bootstrap confidence interval
fn bootstrap_confidence_interval(
    test_labels: &[u8],
    test_preds: &[u8],
    observed_pass_rate: f64,
    bootstrap_iterations: usize,
    confidence_level: f64,
) -> Result<(f64, f64)> {
    let mut rng = StdRng::from_entropy();
    let test_size = test_labels.len();
    let indices: Vec<usize> = (0..test_size).collect();
    let mut bootstrap_samples = Vec::new();

    for _ in 0..bootstrap_iterations {
        // Bootstrap resample test data
        let bootstrap_indices: Vec<usize> = indices
            .choose_multiple(&mut rng, test_size)
            .cloned()
            .collect();

        let bootstrap_labels: Vec<u8> = bootstrap_indices.iter().map(|&i| test_labels[i]).collect();

        let bootstrap_preds: Vec<u8> = bootstrap_indices.iter().map(|&i| test_preds[i]).collect();

        // Calculate metrics for this bootstrap sample
        match JudgeMetrics::from_test_data(&bootstrap_labels, &bootstrap_preds) {
            Ok(metrics) => {
                let tpr_plus_tnr = metrics.tpr + metrics.tnr;
                if tpr_plus_tnr > 1.0 {
                    let denominator = tpr_plus_tnr - 1.0;
                    let theta_bootstrap =
                        ((observed_pass_rate + metrics.tnr - 1.0) / denominator).clamp(0.0, 1.0);
                    bootstrap_samples.push(theta_bootstrap);
                }
            }
            Err(_) => {
                // Skip this bootstrap sample if it fails
                continue;
            }
        }
    }

    if bootstrap_samples.is_empty() {
        return Err(JudgyError::bootstrap(
            "No valid bootstrap samples generated. This may indicate insufficient test data or very poor judge performance.".to_string(),
        ));
    }

    // Calculate confidence interval
    bootstrap_samples.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let alpha = 1.0 - confidence_level;
    let lower_percentile = alpha / 2.0;
    let upper_percentile = 1.0 - alpha / 2.0;

    let lower_idx = ((bootstrap_samples.len() as f64 * lower_percentile) as usize)
        .min(bootstrap_samples.len() - 1);
    let upper_idx = ((bootstrap_samples.len() as f64 * upper_percentile) as usize)
        .min(bootstrap_samples.len() - 1);

    let lower_bound = bootstrap_samples[lower_idx];
    let upper_bound = bootstrap_samples[upper_idx];

    Ok((lower_bound, upper_bound))
}

// Unit tests for judge-metric calculation, input validation, and the
// end-to-end bias-corrected estimate.
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    /// A judge that agrees with every label has TPR = TNR = accuracy = 1.
    #[test]
    fn test_judge_metrics_perfect_judge() {
        let test_labels = vec![1, 1, 1, 0, 0, 0];
        let test_preds = vec![1, 1, 1, 0, 0, 0]; // Perfect predictions

        let metrics = JudgeMetrics::from_test_data(&test_labels, &test_preds).unwrap();

        assert_relative_eq!(metrics.tpr, 1.0);
        assert_relative_eq!(metrics.tnr, 1.0);
        assert_relative_eq!(metrics.fpr, 0.0);
        assert_relative_eq!(metrics.fnr, 0.0);
        assert_relative_eq!(metrics.accuracy, 1.0);
    }

    /// Despite the name, this data models a fully inverted judge (every
    /// prediction is wrong), not a random one: TPR = TNR = 0.
    #[test]
    fn test_judge_metrics_random_judge() {
        let test_labels = vec![1, 1, 1, 1, 0, 0, 0, 0];
        let test_preds = vec![0, 0, 0, 0, 1, 1, 1, 1]; // Always wrong

        let metrics = JudgeMetrics::from_test_data(&test_labels, &test_preds).unwrap();

        assert_relative_eq!(metrics.tpr, 0.0);
        assert_relative_eq!(metrics.tnr, 0.0);
        assert_relative_eq!(metrics.fpr, 1.0);
        assert_relative_eq!(metrics.fnr, 1.0);
        assert_relative_eq!(metrics.accuracy, 0.0);
    }

    /// End-to-end run with an imperfect judge (TPR = TNR = 0.75 on the test set).
    #[test]
    fn test_estimate_success_rate_basic() {
        let test_labels = vec![1, 1, 0, 0, 1, 0, 1, 0];
        let test_preds = vec![1, 0, 0, 1, 1, 0, 1, 0];
        let unlabeled_preds = vec![1, 1, 0, 1, 0, 1, 0, 1, 1, 0];

        let result =
            estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 0.95).unwrap();

        // Basic sanity checks: everything is a probability, the point estimate
        // lies inside the interval, and the parameters are echoed back.
        assert!(result.theta_hat >= 0.0 && result.theta_hat <= 1.0);
        assert!(result.lower_bound >= 0.0 && result.lower_bound <= 1.0);
        assert!(result.upper_bound >= 0.0 && result.upper_bound <= 1.0);
        assert!(result.lower_bound <= result.theta_hat);
        assert!(result.theta_hat <= result.upper_bound);
        assert_relative_eq!(result.confidence_level, 0.95);
        assert_eq!(result.bootstrap_iterations, 100);
    }

    /// With TPR = TNR = 1 the correction is the identity on the observed rate.
    #[test]
    fn test_estimate_success_rate_perfect_judge() {
        let test_labels = vec![1, 1, 1, 0, 0, 0];
        let test_preds = vec![1, 1, 1, 0, 0, 0]; // Perfect predictions
        let unlabeled_preds = vec![1, 1, 0, 0]; // 50% pass rate

        let result =
            estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 0.95).unwrap();

        // With perfect judge, estimate should be close to observed rate
        let observed_rate = 0.5;
        assert!((result.theta_hat - observed_rate).abs() < 0.1);
    }

    /// Test labels and predictions of different lengths are rejected.
    #[test]
    fn test_input_validation_mismatched_lengths() {
        let test_labels = vec![1, 0];
        let test_preds = vec![1, 0, 1];
        let unlabeled_preds = vec![1, 0, 1];

        let result = estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 0.95);
        assert!(matches!(result, Err(JudgyError::InputValidation(_))));
    }

    /// An empty test set and an empty unlabeled set are each rejected.
    #[test]
    fn test_input_validation_empty_arrays() {
        let empty: Vec<u8> = vec![];
        let valid = vec![1, 0, 1, 0];

        // Empty test set
        let result = estimate_success_rate(&empty, &empty, &valid, 100, 0.95);
        assert!(matches!(result, Err(JudgyError::InputValidation(_))));

        // Empty unlabeled set
        let result = estimate_success_rate(&valid, &valid, &empty, 100, 0.95);
        assert!(matches!(result, Err(JudgyError::InputValidation(_))));
    }

    /// Values other than 0 or 1 are rejected.
    #[test]
    fn test_input_validation_non_binary() {
        let test_labels = vec![1, 2, 0, 1]; // Contains 2
        let test_preds = vec![1, 0, 1, 1];
        let unlabeled_preds = vec![1, 0, 1];

        let result = estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 0.95);
        assert!(matches!(result, Err(JudgyError::InputValidation(_))));
    }

    /// Confidence levels outside the open interval (0, 1) are rejected.
    #[test]
    fn test_input_validation_invalid_confidence_level() {
        let test_labels = vec![1, 0, 1, 0];
        let test_preds = vec![1, 0, 1, 1];
        let unlabeled_preds = vec![1, 0, 1];

        // Above the range
        let result = estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 1.5);
        assert!(matches!(result, Err(JudgyError::InputValidation(_))));

        // Boundary value 0.0 is excluded
        let result = estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 0.0);
        assert!(matches!(result, Err(JudgyError::InputValidation(_))));
    }

    /// A judge with TPR + TNR <= 1 cannot be corrected and yields a dedicated error.
    #[test]
    fn test_judge_accuracy_too_low() {
        let test_labels = vec![1, 1, 1, 1, 0, 0, 0, 0];
        let test_preds = vec![0, 0, 0, 0, 1, 1, 1, 1]; // Always wrong
        let unlabeled_preds = vec![1, 0, 1, 0];

        let result = estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 0.95);
        assert!(matches!(
            result,
            Err(JudgyError::JudgeAccuracyTooLow { .. })
        ));
    }

    /// TPR is undefined without positive labels, so the input is rejected.
    #[test]
    fn test_no_positive_examples() {
        let test_labels = vec![0, 0, 0]; // All negative
        let test_preds = vec![1, 0, 1];
        let unlabeled_preds = vec![1, 0, 1];

        let result = estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 0.95);
        assert!(matches!(result, Err(JudgyError::InputValidation(_))));
    }

    /// TNR is undefined without negative labels, so the input is rejected.
    #[test]
    fn test_no_negative_examples() {
        let test_labels = vec![1, 1, 1]; // All positive
        let test_preds = vec![1, 0, 1];
        let unlabeled_preds = vec![1, 0, 1];

        let result = estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 100, 0.95);
        assert!(matches!(result, Err(JudgyError::InputValidation(_))));
    }

    /// Runs the estimator at two confidence levels on the same data.
    #[test]
    fn test_different_confidence_levels() {
        let test_labels = vec![1, 1, 0, 0, 1, 0, 1, 0];
        let test_preds = vec![1, 0, 0, 1, 1, 0, 1, 0];
        let unlabeled_preds = vec![1, 1, 0, 1, 0, 1, 1, 0, 1, 0]; // Larger sample

        // Test 90% confidence interval
        let result_90 =
            estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 200, 0.90).unwrap();

        // Test 99% confidence interval
        let result_99 =
            estimate_success_rate(&test_labels, &test_preds, &unlabeled_preds, 200, 0.99).unwrap();

        // The point estimate is computed directly from the inputs, without any
        // resampling, so both runs should agree; the tolerance is generous.
        assert!((result_90.theta_hat - result_99.theta_hat).abs() < 0.1);

        // A 99% CI is normally at least as wide as a 90% CI, but the two runs
        // use independent random resamples, so the widths are not compared.
        let width_90 = result_90.upper_bound - result_90.lower_bound;
        let width_99 = result_99.upper_bound - result_99.lower_bound;

        // Only check that each interval is well-formed (upper >= lower)
        assert!(width_90 >= 0.0);
        assert!(width_99 >= 0.0);
    }
}