reasonkit/evaluation/reasoning.rs

//! # Reasoning Quality Metrics
//!
//! Core metrics for evaluating AI thinking improvement with ReasonKit.
//!
//! This is the heart of ReasonKit evaluation: measuring whether
//! ThinkTool protocols actually improve AI reasoning.

use std::collections::HashMap;

/// ReasonKit profiles (ThinkTool chains)
///
/// Profiles define which ThinkTools are used and in what order.
/// See `thinktool::profiles::ReasoningProfile` for the full chain configuration
/// including conditional execution and validation passes.
///
/// # Confidence Thresholds (per ORCHESTRATOR.md spec)
///
/// | Profile  | Min Confidence | Modules |
/// |----------|----------------|---------|
/// | Quick    | 70%            | gt, ll  |
/// | Balanced | 80%            | gt, ll, br, pg |
/// | Deep     | 85%            | All 5   |
/// | Paranoid | 95%            | All 5 + validation |
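///
/// # Example
///
/// A minimal usage sketch (marked `ignore` because it assumes this module is
/// exported as `reasonkit::evaluation::reasoning`):
///
/// ```ignore
/// use reasonkit::evaluation::reasoning::Profile;
///
/// let profile = Profile::from_id("balanced").expect("known profile id");
/// assert_eq!(profile.min_confidence(), 0.80);
/// assert_eq!(profile.chain_length(), 4);
/// ```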
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Profile {
    /// No ThinkTools (baseline for A/B comparison)
    None,
    /// Quick: GigaThink -> LaserLogic (70% confidence target)
    /// Fast 2-step analysis for rapid insights
    Quick,
    /// Balanced: GigaThink -> LaserLogic -> BedRock -> ProofGuard (80% confidence target)
    /// Standard 4-module chain for thorough but efficient analysis
    Balanced,
    /// Deep: All 5 ThinkTools (85% confidence target)
    /// GigaThink -> LaserLogic -> BedRock -> ProofGuard -> BrutalHonesty (conditional)
    /// BrutalHonesty runs if confidence < 85%
    Deep,
    /// Paranoid: All 5 ThinkTools + validation pass (95% confidence target)
    /// GigaThink -> LaserLogic -> BedRock -> ProofGuard -> BrutalHonesty -> ProofGuard
    /// Maximum rigor with adversarial critique and second verification pass
    Paranoid,
}

impl Profile {
    /// Get the list of ThinkTools for this profile
    ///
    /// Note: This returns the *unique* tools used, not the full execution chain.
    /// For the actual execution chain (including conditional steps and validation passes),
    /// see `thinktool::profiles::ReasoningProfile`.
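    ///
    /// A minimal sketch of the distinction (module path assumed, as above):
    ///
    /// ```ignore
    /// use reasonkit::evaluation::reasoning::Profile;
    ///
    /// assert_eq!(Profile::Paranoid.thinktools().len(), 5); // unique tools
    /// assert_eq!(Profile::Paranoid.chain_length(), 6); // full chain, incl. 2nd ProofGuard
    /// ```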
    pub fn thinktools(&self) -> Vec<ThinkTool> {
        match self {
            Profile::None => vec![],
            Profile::Quick => vec![ThinkTool::GigaThink, ThinkTool::LaserLogic],
            Profile::Balanced => vec![
                ThinkTool::GigaThink,
                ThinkTool::LaserLogic,
                ThinkTool::BedRock,
                ThinkTool::ProofGuard,
            ],
            Profile::Deep => vec![
                ThinkTool::GigaThink,
                ThinkTool::LaserLogic,
                ThinkTool::BedRock,
                ThinkTool::ProofGuard,
                ThinkTool::BrutalHonesty,
            ],
            Profile::Paranoid => vec![
                ThinkTool::GigaThink,
                ThinkTool::LaserLogic,
                ThinkTool::BedRock,
                ThinkTool::ProofGuard,
                ThinkTool::BrutalHonesty,
            ],
        }
    }

    /// Get the minimum confidence threshold for this profile
    ///
    /// Returns the confidence level required for the profile to be considered successful.
    /// Per ORCHESTRATOR.md specification:
    /// - Quick: 70%
    /// - Balanced: 80%
    /// - Deep: 85%
    /// - Paranoid: 95%
    pub fn min_confidence(&self) -> f64 {
        match self {
            Profile::None => 0.0,
            Profile::Quick => 0.70,
            Profile::Balanced => 0.80,
            Profile::Deep => 0.85,
            Profile::Paranoid => 0.95,
        }
    }

    /// Get the number of steps in the execution chain
    ///
    /// Note: Paranoid has 6 steps (includes 2nd ProofGuard validation pass)
    pub fn chain_length(&self) -> usize {
        match self {
            Profile::None => 0,
            Profile::Quick => 2,
            Profile::Balanced => 4,
            Profile::Deep => 5,
            Profile::Paranoid => 6, // Includes 2nd ProofGuard pass
        }
    }

    /// Convert from profile ID string
    pub fn from_id(id: &str) -> Option<Self> {
        match id.to_lowercase().as_str() {
            "none" | "baseline" => Some(Profile::None),
            "quick" => Some(Profile::Quick),
            "balanced" => Some(Profile::Balanced),
            "deep" => Some(Profile::Deep),
            "paranoid" => Some(Profile::Paranoid),
            _ => None,
        }
    }

    /// Get the profile ID string
    pub fn id(&self) -> &'static str {
        match self {
            Profile::None => "none",
            Profile::Quick => "quick",
            Profile::Balanced => "balanced",
            Profile::Deep => "deep",
            Profile::Paranoid => "paranoid",
        }
    }
}

/// Individual ThinkTools
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ThinkTool {
    /// Multi-perspective expansion (10+ viewpoints)
    GigaThink,
    /// Precision deductive reasoning
    LaserLogic,
    /// First principles decomposition
    BedRock,
    /// Multi-source verification
    ProofGuard,
    /// Adversarial self-critique
    BrutalHonesty,
}

/// Result from running a benchmark
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Benchmark name (e.g., "gsm8k", "arc_challenge")
    pub benchmark: String,
    /// Profile used
    pub profile: Profile,
    /// Accuracy (0.0-1.0)
    pub accuracy: f64,
    /// Number of correct answers
    pub correct: usize,
    /// Total questions
    pub total: usize,
    /// Per-question results
    pub question_results: Vec<QuestionResult>,
}

impl BenchmarkResult {
    /// Calculate improvement over a baseline
    pub fn improvement_over(&self, baseline: &BenchmarkResult) -> f64 {
        self.accuracy - baseline.accuracy
    }
}

/// Result for a single question
#[derive(Debug, Clone)]
pub struct QuestionResult {
    /// Question ID
    pub id: String,
    /// Whether answer was correct
    pub correct: bool,
    /// Confidence (0.0-1.0) if available
    pub confidence: Option<f64>,
    /// Answer given
    pub answer: String,
    /// Correct answer
    pub expected: String,
    /// Reasoning chain if captured
    pub reasoning: Option<String>,
}

/// Aggregated reasoning metrics
#[derive(Debug, Clone)]
pub struct ReasoningMetrics {
    /// Accuracy on benchmark
    pub accuracy: f64,
    /// Improvement over no-protocol baseline
    pub improvement: f64,
    /// Self-consistency metrics
    pub consistency: ConsistencyMetrics,
    /// Calibration metrics
    pub calibration: CalibrationMetrics,
    /// Per-ThinkTool effectiveness
    pub thinktool_metrics: HashMap<ThinkTool, ThinkToolMetrics>,
}
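
// Illustrative assembly of the aggregate report; `treatment`, `baseline`, and
// `runs` are hypothetical values, not part of this module:
//
//     let metrics = ReasoningMetrics {
//         accuracy: treatment.accuracy,
//         improvement: treatment.improvement_over(&baseline),
//         consistency: ConsistencyMetrics::from_runs(&runs),
//         calibration: CalibrationMetrics::from_results(&treatment.question_results),
//         thinktool_metrics: HashMap::new(),
//     };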

/// Self-consistency metrics
#[derive(Debug, Clone, Default)]
pub struct ConsistencyMetrics {
    /// Same answer across multiple runs (0.0-1.0)
    pub answer_agreement: f64,
    /// Same reasoning path across runs (0.0-1.0)
    pub reasoning_agreement: f64,
    /// Confidence variance (lower is better)
    pub confidence_variance: f64,
    /// Number of runs used to calculate
    pub num_runs: usize,
}

impl ConsistencyMetrics {
    /// Calculate from multiple runs of the same questions
    ///
    /// Runs are compared position-by-position, so every run should cover the
    /// same questions in the same order.
    pub fn from_runs(runs: &[Vec<QuestionResult>]) -> Self {
        let num_runs = runs.len();
        // Guard against ragged input: only compare indices present in every run.
        let num_questions = runs.iter().map(|run| run.len()).min().unwrap_or(0);
        if num_questions == 0 {
            return Self::default();
        }

        let mut answer_agreements = 0;
        let mut confidence_sum = 0.0;
        let mut confidence_sq_sum = 0.0;
        let mut confidence_count = 0;

        for q_idx in 0..num_questions {
            // Check if all runs agree on this question
            let first_answer = &runs[0][q_idx].answer;
            let all_agree = runs.iter().all(|run| &run[q_idx].answer == first_answer);
            if all_agree {
                answer_agreements += 1;
            }

            // Collect confidences
            for run in runs {
                if let Some(conf) = run[q_idx].confidence {
                    confidence_sum += conf;
                    confidence_sq_sum += conf * conf;
                    confidence_count += 1;
                }
            }
        }

        let answer_agreement = answer_agreements as f64 / num_questions as f64;

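        // Population variance via E[X^2] - E[X]^2 over all confidence values
        // pooled across runs and questions.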
        let confidence_variance = if confidence_count > 1 {
            let mean = confidence_sum / confidence_count as f64;
            // Clamped at zero: floating-point error can push the result
            // slightly negative when all values are (nearly) identical.
            ((confidence_sq_sum / confidence_count as f64) - (mean * mean)).max(0.0)
        } else {
            0.0
        };

        Self {
            answer_agreement,
            reasoning_agreement: 0.0, // Not computed here; requires semantic comparison
            confidence_variance,
            num_runs,
        }
    }
}

/// Calibration metrics (confidence vs accuracy)
#[derive(Debug, Clone, Default)]
pub struct CalibrationMetrics {
    /// Expected Calibration Error (lower is better)
    pub ece: f64,
    /// Overconfidence rate (high confidence + wrong)
    pub overconfidence_rate: f64,
    /// Underconfidence rate (low confidence + right)
    pub underconfidence_rate: f64,
    /// Brier score (lower is better)
    pub brier_score: f64,
}

impl CalibrationMetrics {
    /// Calculate calibration from results with confidence scores
    pub fn from_results(results: &[QuestionResult]) -> Self {
        let with_confidence: Vec<_> = results.iter().filter(|r| r.confidence.is_some()).collect();

        if with_confidence.is_empty() {
            return Self::default();
        }

        // Bin results by confidence for ECE
        let num_bins = 10;
        let mut bins: Vec<Vec<(f64, bool)>> = vec![vec![]; num_bins];

        for result in &with_confidence {
            let conf = result.confidence.unwrap();
            let bin_idx = ((conf * num_bins as f64) as usize).min(num_bins - 1);
            bins[bin_idx].push((conf, result.correct));
        }

        // Calculate ECE
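        // ECE = sum over bins b of (|b| / n) * |mean_confidence(b) - accuracy(b)|;
        // a well-calibrated model has mean confidence ~= accuracy in every bin.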
        let n = with_confidence.len() as f64;
        let mut ece = 0.0;
        for bin in &bins {
            if !bin.is_empty() {
                let bin_size = bin.len() as f64;
                let avg_confidence: f64 = bin.iter().map(|(c, _)| c).sum::<f64>() / bin_size;
                let accuracy: f64 =
                    bin.iter().filter(|(_, correct)| *correct).count() as f64 / bin_size;
                ece += (bin_size / n) * (avg_confidence - accuracy).abs();
            }
        }

        // Overconfidence: confidence > 0.8 but wrong
        let overconfident = with_confidence
            .iter()
            .filter(|r| r.confidence.unwrap() > 0.8 && !r.correct)
            .count();
        let overconfidence_rate = overconfident as f64 / with_confidence.len() as f64;

        // Underconfidence: confidence < 0.5 but correct
        let underconfident = with_confidence
            .iter()
            .filter(|r| r.confidence.unwrap() < 0.5 && r.correct)
            .count();
        let underconfidence_rate = underconfident as f64 / with_confidence.len() as f64;

        // Brier score
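        // Mean squared error between stated confidence and the 0/1 outcome.
        // 0.0 is perfect; a constant 0.5 forecast scores 0.25 regardless of outcomes.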
        let brier_score: f64 = with_confidence
            .iter()
            .map(|r| {
                let conf = r.confidence.unwrap();
                let outcome = if r.correct { 1.0 } else { 0.0 };
                (conf - outcome).powi(2)
            })
            .sum::<f64>()
            / with_confidence.len() as f64;

        Self {
            ece,
            overconfidence_rate,
            underconfidence_rate,
            brier_score,
        }
    }
}

/// Generic ThinkTool effectiveness metrics
#[derive(Debug, Clone, Default)]
pub struct ThinkToolMetrics {
    /// Improvement delta when this tool is added
    pub improvement_delta: f64,
    /// Is this tool worth the latency cost?
    pub cost_effective: bool,
    /// Latency added (ms)
    pub latency_ms: f64,
}

/// GigaThink-specific metrics
#[derive(Debug, Clone, Default)]
pub struct GigaThinkMetrics {
    /// Number of distinct perspectives generated
    pub perspective_count: usize,
    /// Coverage of relevant angles (0.0-1.0)
    pub coverage_score: f64,
    /// Proportion of non-obvious perspectives (0.0-1.0)
    pub novelty_rate: f64,
    /// How well perspectives integrate (0.0-1.0)
    pub integration_quality: f64,
}

/// LaserLogic-specific metrics
#[derive(Debug, Clone, Default)]
pub struct LaserLogicMetrics {
    /// Proportion of valid deductions (0.0-1.0)
    pub validity_rate: f64,
    /// Rate of detecting inserted fallacies (0.0-1.0)
    pub fallacy_detection_rate: f64,
    /// Avoidance of irrelevant premises (0.0-1.0)
    pub precision: f64,
    /// Valid deductions from true premises (0.0-1.0)
    pub soundness: f64,
}

/// BedRock-specific metrics
#[derive(Debug, Clone, Default)]
pub struct BedRockMetrics {
    /// Levels of first-principles breakdown
    pub decomposition_depth: usize,
    /// Proportion of truly fundamental axioms (0.0-1.0)
    pub axiom_validity: f64,
    /// Can rebuild conclusion from axioms? (0.0-1.0)
    pub reconstruction_rate: f64,
    /// Hidden assumptions made explicit (0.0-1.0)
    pub assumption_surfacing: f64,
}

/// ProofGuard-specific metrics
#[derive(Debug, Clone, Default)]
pub struct ProofGuardMetrics {
    /// Proportion of claims with 3+ sources (0.0-1.0)
    pub triangulation_rate: f64,
    /// Rate of detecting conflicting sources (0.0-1.0)
    pub contradiction_detection: f64,
    /// Tier 1 source priority adherence (0.0-1.0)
    pub source_quality_score: f64,
    /// Correct attribution rate (0.0-1.0)
    pub citation_accuracy: f64,
}

/// BrutalHonesty-specific metrics
#[derive(Debug, Clone, Default)]
pub struct BrutalHonestyMetrics {
    /// Proportion of real flaws identified (0.0-1.0)
    pub flaw_detection_rate: f64,
    /// Proportion of non-flaws flagged (lower is better)
    pub false_positive_rate: f64,
    /// Average actionable suggestions per flaw
    pub suggestions_per_flaw: f64,
    /// Correct severity prioritization (0.0-1.0)
    pub severity_calibration: f64,
}

/// Calculate improvement delta for a ThinkTool
pub fn calculate_thinktool_delta(without: &BenchmarkResult, with: &BenchmarkResult) -> f64 {
    with.accuracy - without.accuracy
}

/// Statistical significance test (simplified)
///
/// Approximate two-sided z-test against the worst-case standard error for a
/// proportion, sqrt(0.25 / n). For proper testing, use a bootstrap or
/// permutation test; a minimal permutation sketch follows this function.
pub fn is_significant(delta: f64, n: usize, alpha: f64) -> bool {
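    // Example: delta = 0.10 with n = 1000 gives se = sqrt(0.25 / 1000) ~= 0.0158,
    // so z ~= 6.3, well above the 1.96 critical value for alpha = 0.05.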
    let se = (0.25 / n as f64).sqrt(); // Worst-case SE for proportions
    let z = delta / se;
    let critical = if alpha <= 0.01 {
        2.576
    } else if alpha <= 0.05 {
        1.96
    } else {
        1.645
    };
    z.abs() > critical
}
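
/// Paired permutation test: a minimal sketch of the "proper testing"
/// alternative mentioned in `is_significant`. Illustrative only; it uses a
/// small inline LCG instead of a real RNG crate to stay dependency-free.
/// `baseline` and `treatment` hold per-question correctness for the same
/// questions in the same order.
pub fn permutation_significant(
    baseline: &[bool],
    treatment: &[bool],
    iterations: usize,
    alpha: f64,
) -> bool {
    assert_eq!(baseline.len(), treatment.len());
    let n = baseline.len() as f64;
    let count = |xs: &[bool]| xs.iter().filter(|&&c| c).count() as f64;
    let observed = (count(treatment) - count(baseline)) / n;

    let mut state: u64 = 0x9E37_79B9_7F4A_7C15;
    let mut extreme = 0usize;
    for _ in 0..iterations {
        let mut diff = 0i64;
        for i in 0..baseline.len() {
            // Under the null hypothesis the labels are exchangeable within a
            // pair, so swap baseline/treatment with probability 1/2.
            state = state
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            let swap = (state >> 63) == 1;
            let (b, t) = if swap {
                (treatment[i], baseline[i])
            } else {
                (baseline[i], treatment[i])
            };
            diff += t as i64 - b as i64;
        }
        if (diff as f64 / n).abs() >= observed.abs() {
            extreme += 1;
        }
    }
    // p-value: fraction of permuted deltas at least as extreme as observed.
    (extreme as f64) / (iterations as f64) < alpha
}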

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_profile_thinktools() {
        assert!(Profile::None.thinktools().is_empty());
        assert_eq!(Profile::Quick.thinktools().len(), 2);
        assert_eq!(Profile::Balanced.thinktools().len(), 4);
        assert_eq!(Profile::Deep.thinktools().len(), 5);
        assert_eq!(Profile::Paranoid.thinktools().len(), 5); // Unique tools (not chain length)
    }

    #[test]
    fn test_profile_min_confidence() {
        assert_eq!(Profile::None.min_confidence(), 0.0);
        assert_eq!(Profile::Quick.min_confidence(), 0.70);
        assert_eq!(Profile::Balanced.min_confidence(), 0.80);
        assert_eq!(Profile::Deep.min_confidence(), 0.85);
        assert_eq!(Profile::Paranoid.min_confidence(), 0.95);
    }

    #[test]
    fn test_profile_chain_length() {
        assert_eq!(Profile::None.chain_length(), 0);
        assert_eq!(Profile::Quick.chain_length(), 2);
        assert_eq!(Profile::Balanced.chain_length(), 4);
        assert_eq!(Profile::Deep.chain_length(), 5);
        assert_eq!(Profile::Paranoid.chain_length(), 6); // Includes 2nd ProofGuard pass
    }

    #[test]
    fn test_profile_from_id() {
        assert_eq!(Profile::from_id("quick"), Some(Profile::Quick));
        assert_eq!(Profile::from_id("BALANCED"), Some(Profile::Balanced));
        assert_eq!(Profile::from_id("paranoid"), Some(Profile::Paranoid));
        assert_eq!(Profile::from_id("baseline"), Some(Profile::None));
        assert_eq!(Profile::from_id("invalid"), None);
    }

    #[test]
    fn test_profile_id() {
        assert_eq!(Profile::Quick.id(), "quick");
        assert_eq!(Profile::Balanced.id(), "balanced");
        assert_eq!(Profile::Deep.id(), "deep");
        assert_eq!(Profile::Paranoid.id(), "paranoid");
    }

    #[test]
    fn test_improvement_calculation() {
        let baseline = BenchmarkResult {
            benchmark: "gsm8k".into(),
            profile: Profile::None,
            accuracy: 0.57,
            correct: 57,
            total: 100,
            question_results: vec![],
        };

        let treatment = BenchmarkResult {
            benchmark: "gsm8k".into(),
            profile: Profile::Balanced,
            accuracy: 0.78,
            correct: 78,
            total: 100,
            question_results: vec![],
        };

        let improvement = treatment.improvement_over(&baseline);
        assert!((improvement - 0.21).abs() < 0.001);
    }

    #[test]
    fn test_consistency_from_runs() {
        let runs = vec![
            vec![QuestionResult {
                id: "q1".into(),
                correct: true,
                confidence: Some(0.9),
                answer: "42".into(),
                expected: "42".into(),
                reasoning: None,
            }],
            vec![QuestionResult {
                id: "q1".into(),
                correct: true,
                confidence: Some(0.85),
                answer: "42".into(),
                expected: "42".into(),
                reasoning: None,
            }],
        ];

        let consistency = ConsistencyMetrics::from_runs(&runs);
        assert_eq!(consistency.answer_agreement, 1.0);
        assert_eq!(consistency.num_runs, 2);
    }

    #[test]
    fn test_calibration_ece() {
        // Perfect calibration: 80% confident, 80% correct
        let results: Vec<QuestionResult> = (0..100)
            .map(|i| QuestionResult {
                id: format!("q{}", i),
                correct: i < 80, // 80 correct
                confidence: Some(0.8),
                answer: "x".into(),
                expected: if i < 80 { "x" } else { "y" }.into(),
                reasoning: None,
            })
            .collect();

        let calibration = CalibrationMetrics::from_results(&results);
        // ECE should be low for well-calibrated predictions
        assert!(calibration.ece < 0.1);
    }

    #[test]
    fn test_significance() {
        // Large improvement with large N should be significant
        assert!(is_significant(0.10, 1000, 0.05));
        // Small improvement with small N should not be
        assert!(!is_significant(0.02, 50, 0.05));
    }
}