// reasonkit-core 0.1.8
//
// The Reasoning Engine — Auditable Reasoning for Production AI | Rust-Native | Turn Prompts into Protocols
//! # Confidence Scorer
//!
//! Quantify verification confidence for mathematical proofs using multiple metrics.

use std::fmt;

use crate::error::Result;
use crate::verification::types::*;

/// Confidence scorer for mathematical proofs
pub struct ConfidenceScorer {
    /// Thresholds (auto-accept / review) and settings that drive scoring decisions.
    config: VerificationConfig,
}

impl ConfidenceScorer {
    /// Weight of the AIME-based accuracy component (40%).
    const AIME_WEIGHT: f64 = 0.40;
    /// Weight of the step-consistency component (30%).
    const STEP_CONSISTENCY_WEIGHT: f64 = 0.30;
    /// Weight of the theorem-coverage component (20%).
    const THEOREM_COVERAGE_WEIGHT: f64 = 0.20;
    /// Weight of the cross-model-agreement component (10%).
    const CROSS_MODEL_WEIGHT: f64 = 0.10;

    /// Create new confidence scorer with default configuration
    pub fn new() -> Self {
        Self {
            config: VerificationConfig::default(),
        }
    }

    /// Create new confidence scorer with custom configuration
    pub fn with_config(config: VerificationConfig) -> Self {
        Self { config }
    }

    /// Calculate overall confidence score for verification result.
    ///
    /// Weighted blend of four metrics (40% AIME accuracy, 30% step
    /// consistency, 20% theorem coverage, 10% cross-model agreement),
    /// clamped into `[0.0, 1.0]`.
    pub fn score(&self, verification: &VerificationResult) -> f64 {
        let step_consistency = self.calculate_step_consistency(verification);
        let theorem_coverage = verification.details.theorem_coverage.coverage_score;
        let cross_model_agreement =
            self.extract_cross_model_agreement(&verification.details.cross_model_validation);

        let confidence = Self::AIME_WEIGHT * verification.confidence
            + Self::STEP_CONSISTENCY_WEIGHT * step_consistency
            + Self::THEOREM_COVERAGE_WEIGHT * theorem_coverage
            + Self::CROSS_MODEL_WEIGHT * cross_model_agreement;

        // Guard against components slightly outside [0, 1] pushing the blend
        // out of range.
        confidence.clamp(0.0, 1.0)
    }

    /// Calculate step consistency score: fraction of proof steps that are
    /// valid. Returns 0.0 when there are no steps at all (avoids 0/0).
    fn calculate_step_consistency(&self, verification: &VerificationResult) -> f64 {
        let total_steps = verification.details.valid_steps + verification.details.steps_with_issues;
        if total_steps == 0 {
            return 0.0;
        }
        verification.details.valid_steps as f64 / total_steps as f64
    }

    /// Extract cross-model agreement from validation details.
    ///
    /// Defaults to a neutral 0.5 when no cross-model validation was run.
    fn extract_cross_model_agreement(&self, validation: &Option<CrossModelValidation>) -> f64 {
        validation.as_ref().map(|v| v.agreement).unwrap_or(0.5)
    }

    /// Determine recommended action based on confidence score.
    ///
    /// - `>= auto_accept_threshold` → `Accept`
    /// - `>= review_threshold` → `AcceptWithAnnotation`
    /// - otherwise → `ReviewRequired`
    pub fn recommend(&self, confidence: f64) -> ValidationRecommendation {
        if confidence >= self.config.auto_accept_threshold {
            ValidationRecommendation::Accept
        } else if confidence >= self.config.review_threshold {
            ValidationRecommendation::AcceptWithAnnotation {
                // `{:.2}` instead of `{:?}`: Debug formatting of f64 can emit
                // noise like "82.19999999999999" in user-facing reasons.
                reason: format!(
                    "Confidence {:.2}% below auto-accept threshold of {:.2}%",
                    confidence * 100.0,
                    self.config.auto_accept_threshold * 100.0
                ),
            }
        } else {
            ValidationRecommendation::ReviewRequired {
                reasons: vec![format!(
                    "Confidence {:.2}% below review threshold of {:.2}%",
                    confidence * 100.0,
                    self.config.review_threshold * 100.0
                )],
            }
        }
    }

    /// Calculate detailed confidence breakdown.
    ///
    /// Computes the overall score once and reuses it for the recommendation
    /// and threshold flags (the original recomputed it four times).
    pub fn score_breakdown(&self, verification: &VerificationResult) -> ConfidenceBreakdown {
        let overall = self.score(verification);

        ConfidenceBreakdown {
            aime_accuracy: verification.confidence,
            step_consistency: self.calculate_step_consistency(verification),
            theorem_coverage: verification.details.theorem_coverage.coverage_score,
            cross_model_agreement: self
                .extract_cross_model_agreement(&verification.details.cross_model_validation),
            overall_confidence: overall,
            recommendation: self.recommend(overall),
            meets_auto_accept: overall >= self.config.auto_accept_threshold,
            meets_annotation_threshold: overall >= self.config.review_threshold,
        }
    }
}

/// Detailed confidence breakdown
///
/// Per-component scores (all in `[0.0, 1.0]`), the blended overall score,
/// and the resulting recommendation/threshold flags.
#[derive(Debug, Clone)]
pub struct ConfidenceBreakdown {
    /// AIME-based accuracy weight (40%)
    pub aime_accuracy: f64,

    /// Step consistency score (30%)
    pub step_consistency: f64,

    /// Theorem coverage score (20%)
    pub theorem_coverage: f64,

    /// Cross-model agreement score (10%)
    pub cross_model_agreement: f64,

    /// Overall confidence score (weighted blend of the four components)
    pub overall_confidence: f64,

    /// Recommended action
    pub recommendation: ValidationRecommendation,

    /// Whether confidence meets auto-accept threshold
    pub meets_auto_accept: bool,

    /// Whether confidence meets review threshold
    pub meets_annotation_threshold: bool,
}

impl ConfidenceBreakdown {
    /// Format as human-readable string
    pub fn to_string(&self) -> String {
        format!(
            "\
Confidence Breakdown:
===================
AIME Accuracy (40%):     {:.2}%
Step Consistency (30%):  {:.2}%
Theorem Coverage (20%):  {:.2}%
Cross-Model Agreement (10%): {:.2}%

Overall Confidence:      {:.2}%
Recommendation: {}

Thresholds:
- Auto-accept: {:.2}% (met: {})
- Annotation: {:.2}% (met: {})
- Review:      {:.2}% (met: {})\n",
            self.aime_accuracy * 100.0,
            self.step_consistency * 100.0,
            self.theorem_coverage * 100.0,
            self.cross_model_agreement * 100.0,
            self.overall_confidence * 100.0,
            self.recommendation,
            90.0,
            self.meets_auto_accept,
            75.0,
            self.meets_annotation_threshold,
            75.0,
            !self.meets_annotation_threshold
        )
    }
}

impl Default for ConfidenceScorer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed scorer carries the default auto-accept threshold.
    #[test]
    fn test_confidence_scorer_creation() {
        let threshold = ConfidenceScorer::new().config.auto_accept_threshold;
        assert_eq!(threshold, 0.90);
    }

    /// The formatted breakdown surfaces component percentages and the
    /// recommendation text.
    #[test]
    fn test_confidence_breakdown_formatting() {
        let sample = ConfidenceBreakdown {
            aime_accuracy: 0.91,
            step_consistency: 0.95,
            theorem_coverage: 0.88,
            cross_model_agreement: 0.80,
            overall_confidence: 0.90,
            recommendation: ValidationRecommendation::Accept,
            meets_auto_accept: true,
            meets_annotation_threshold: true,
        };

        let rendered = sample.to_string();
        for needle in ["91.00%", "90.00%", "ACCEPT"] {
            assert!(rendered.contains(needle), "missing {needle:?}");
        }
    }

    /// Scores at or above the auto-accept threshold map to `Accept`.
    #[test]
    fn test_recommend_accept() {
        let recommendation = ConfidenceScorer::new().recommend(0.95);
        assert!(matches!(recommendation, ValidationRecommendation::Accept));
    }

    /// Scores between review and auto-accept thresholds map to
    /// `AcceptWithAnnotation`.
    #[test]
    fn test_recommend_annotation() {
        let recommendation = ConfidenceScorer::new().recommend(0.82);
        assert!(matches!(
            recommendation,
            ValidationRecommendation::AcceptWithAnnotation { .. }
        ));
    }

    /// Scores below the review threshold map to `ReviewRequired`.
    #[test]
    fn test_recommend_review() {
        let recommendation = ConfidenceScorer::new().recommend(0.65);
        assert!(matches!(
            recommendation,
            ValidationRecommendation::ReviewRequired { .. }
        ));
    }
}