use std::collections::HashMap;

10#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
25pub enum Profile {
26 None,
28 Quick,
31 Balanced,
34 Deep,
38 Paranoid,
42}
43
44impl Profile {
45 pub fn thinktools(&self) -> Vec<ThinkTool> {
51 match self {
52 Profile::None => vec![],
53 Profile::Quick => vec![ThinkTool::GigaThink, ThinkTool::LaserLogic],
54 Profile::Balanced => vec![
55 ThinkTool::GigaThink,
56 ThinkTool::LaserLogic,
57 ThinkTool::BedRock,
58 ThinkTool::ProofGuard,
59 ],
60 Profile::Deep => vec![
61 ThinkTool::GigaThink,
62 ThinkTool::LaserLogic,
63 ThinkTool::BedRock,
64 ThinkTool::ProofGuard,
65 ThinkTool::BrutalHonesty,
66 ],
67 Profile::Paranoid => vec![
68 ThinkTool::GigaThink,
69 ThinkTool::LaserLogic,
70 ThinkTool::BedRock,
71 ThinkTool::ProofGuard,
72 ThinkTool::BrutalHonesty,
73 ],
74 }
75 }
76
77 pub fn min_confidence(&self) -> f64 {
86 match self {
87 Profile::None => 0.0,
88 Profile::Quick => 0.70,
89 Profile::Balanced => 0.80,
90 Profile::Deep => 0.85,
91 Profile::Paranoid => 0.95,
92 }
93 }
94
95 pub fn chain_length(&self) -> usize {
99 match self {
100 Profile::None => 0,
101 Profile::Quick => 2,
102 Profile::Balanced => 4,
103 Profile::Deep => 5,
104 Profile::Paranoid => 6, }
106 }
107
108 pub fn from_id(id: &str) -> Option<Self> {
110 match id.to_lowercase().as_str() {
111 "none" | "baseline" => Some(Profile::None),
112 "quick" => Some(Profile::Quick),
113 "balanced" => Some(Profile::Balanced),
114 "deep" => Some(Profile::Deep),
115 "paranoid" => Some(Profile::Paranoid),
116 _ => None,
117 }
118 }
119
120 pub fn id(&self) -> &'static str {
122 match self {
123 Profile::None => "none",
124 Profile::Quick => "quick",
125 Profile::Balanced => "balanced",
126 Profile::Deep => "deep",
127 Profile::Paranoid => "paranoid",
128 }
129 }
130}
131
/// Identifier of a single reasoning tool.
///
/// The type is `Copy` and hashable so it can be used directly as a map key
/// (see `ReasoningMetrics::thinktool_metrics`). Variants are declared in the
/// same order in which `Profile::thinktools` lists them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ThinkTool {
    /// First tool in every non-empty profile.
    GigaThink,
    /// Second tool; part of every non-empty profile.
    LaserLogic,
    /// Third tool; included from `Profile::Balanced` upwards.
    BedRock,
    /// Fourth tool; included from `Profile::Balanced` upwards.
    ProofGuard,
    /// Fifth tool; only included by `Profile::Deep` and `Profile::Paranoid`.
    BrutalHonesty,
}
146
147#[derive(Debug, Clone)]
149pub struct BenchmarkResult {
150 pub benchmark: String,
152 pub profile: Profile,
154 pub accuracy: f64,
156 pub correct: usize,
158 pub total: usize,
160 pub question_results: Vec<QuestionResult>,
162}
163
164impl BenchmarkResult {
165 pub fn improvement_over(&self, baseline: &BenchmarkResult) -> f64 {
167 self.accuracy - baseline.accuracy
168 }
169}
170
/// Result recorded for a single benchmark question.
#[derive(Debug, Clone)]
pub struct QuestionResult {
    /// Question identifier.
    pub id: String,
    /// Whether the given answer was judged correct.
    pub correct: bool,
    /// Self-reported confidence, if any. The metrics in this file treat it as
    /// a probability and skip results where it is `None`.
    pub confidence: Option<f64>,
    /// Answer that was produced.
    pub answer: String,
    /// Reference answer.
    pub expected: String,
    /// Optional free-form reasoning text that accompanied the answer.
    pub reasoning: Option<String>,
}
187
188#[derive(Debug, Clone)]
190pub struct ReasoningMetrics {
191 pub accuracy: f64,
193 pub improvement: f64,
195 pub consistency: ConsistencyMetrics,
197 pub calibration: CalibrationMetrics,
199 pub thinktool_metrics: HashMap<ThinkTool, ThinkToolMetrics>,
201}
202
203#[derive(Debug, Clone, Default)]
205pub struct ConsistencyMetrics {
206 pub answer_agreement: f64,
208 pub reasoning_agreement: f64,
210 pub confidence_variance: f64,
212 pub num_runs: usize,
214}
215
216impl ConsistencyMetrics {
217 pub fn from_runs(runs: &[Vec<QuestionResult>]) -> Self {
219 if runs.is_empty() || runs[0].is_empty() {
220 return Self::default();
221 }
222
223 let num_runs = runs.len();
224 let num_questions = runs[0].len();
225 let mut answer_agreements = 0;
226 let mut confidence_sum = 0.0;
227 let mut confidence_sq_sum = 0.0;
228 let mut confidence_count = 0;
229
230 for q_idx in 0..num_questions {
231 let first_answer = &runs[0][q_idx].answer;
233 let all_agree = runs.iter().all(|run| &run[q_idx].answer == first_answer);
234 if all_agree {
235 answer_agreements += 1;
236 }
237
238 for run in runs {
240 if let Some(conf) = run[q_idx].confidence {
241 confidence_sum += conf;
242 confidence_sq_sum += conf * conf;
243 confidence_count += 1;
244 }
245 }
246 }
247
248 let answer_agreement = answer_agreements as f64 / num_questions as f64;
249
250 let confidence_variance = if confidence_count > 1 {
251 let mean = confidence_sum / confidence_count as f64;
252 (confidence_sq_sum / confidence_count as f64) - (mean * mean)
253 } else {
254 0.0
255 };
256
257 Self {
258 answer_agreement,
259 reasoning_agreement: 0.0, confidence_variance,
261 num_runs,
262 }
263 }
264}
265
266#[derive(Debug, Clone, Default)]
268pub struct CalibrationMetrics {
269 pub ece: f64,
271 pub overconfidence_rate: f64,
273 pub underconfidence_rate: f64,
275 pub brier_score: f64,
277}
278
279impl CalibrationMetrics {
280 pub fn from_results(results: &[QuestionResult]) -> Self {
282 let with_confidence: Vec<_> = results.iter().filter(|r| r.confidence.is_some()).collect();
283
284 if with_confidence.is_empty() {
285 return Self::default();
286 }
287
288 let num_bins = 10;
290 let mut bins: Vec<Vec<(f64, bool)>> = vec![vec![]; num_bins];
291
292 for result in &with_confidence {
293 let conf = result.confidence.unwrap();
294 let bin_idx = ((conf * num_bins as f64) as usize).min(num_bins - 1);
295 bins[bin_idx].push((conf, result.correct));
296 }
297
298 let n = with_confidence.len() as f64;
300 let mut ece = 0.0;
301 for bin in &bins {
302 if !bin.is_empty() {
303 let bin_size = bin.len() as f64;
304 let avg_confidence: f64 = bin.iter().map(|(c, _)| c).sum::<f64>() / bin_size;
305 let accuracy: f64 =
306 bin.iter().filter(|(_, correct)| *correct).count() as f64 / bin_size;
307 ece += (bin_size / n) * (avg_confidence - accuracy).abs();
308 }
309 }
310
311 let overconfident = with_confidence
313 .iter()
314 .filter(|r| r.confidence.unwrap() > 0.8 && !r.correct)
315 .count();
316 let overconfidence_rate = overconfident as f64 / with_confidence.len() as f64;
317
318 let underconfident = with_confidence
320 .iter()
321 .filter(|r| r.confidence.unwrap() < 0.5 && r.correct)
322 .count();
323 let underconfidence_rate = underconfident as f64 / with_confidence.len() as f64;
324
325 let brier_score: f64 = with_confidence
327 .iter()
328 .map(|r| {
329 let conf = r.confidence.unwrap();
330 let outcome = if r.correct { 1.0 } else { 0.0 };
331 (conf - outcome).powi(2)
332 })
333 .sum::<f64>()
334 / with_confidence.len() as f64;
335
336 Self {
337 ece,
338 overconfidence_rate,
339 underconfidence_rate,
340 brier_score,
341 }
342 }
343}
344
/// Generic per-tool measurements; `Default` yields all-zero / `false` values.
///
/// Plain data container: no code visible in this file fills it in.
#[derive(Debug, Clone, Default)]
pub struct ThinkToolMetrics {
    /// Accuracy change attributed to the tool (presumably "with" minus
    /// "without", as in `calculate_thinktool_delta` — confirm with the producer).
    pub improvement_delta: f64,
    /// Whether the tool was judged worth its cost; the criterion is not defined here.
    pub cost_effective: bool,
    /// Latency in milliseconds.
    pub latency_ms: f64,
}
355
/// Measurements specific to the GigaThink tool; `Default` yields all zeros.
///
/// Plain data container: no code visible in this file fills it in, so the
/// value ranges below are assumptions to confirm with the producer.
#[derive(Debug, Clone, Default)]
pub struct GigaThinkMetrics {
    /// Number of perspectives.
    pub perspective_count: usize,
    /// Coverage score (presumably in `[0, 1]`).
    pub coverage_score: f64,
    /// Novelty rate (presumably in `[0, 1]`).
    pub novelty_rate: f64,
    /// Integration quality score (presumably in `[0, 1]`).
    pub integration_quality: f64,
}
368
/// Measurements specific to the LaserLogic tool; `Default` yields all zeros.
///
/// Plain data container: no code visible in this file fills it in, so the
/// value ranges below are assumptions to confirm with the producer.
#[derive(Debug, Clone, Default)]
pub struct LaserLogicMetrics {
    /// Validity rate (presumably in `[0, 1]`).
    pub validity_rate: f64,
    /// Fallacy detection rate (presumably in `[0, 1]`).
    pub fallacy_detection_rate: f64,
    /// Precision (presumably in `[0, 1]`).
    pub precision: f64,
    /// Soundness score (presumably in `[0, 1]`).
    pub soundness: f64,
}
381
/// Measurements specific to the BedRock tool; `Default` yields all zeros.
///
/// Plain data container: no code visible in this file fills it in, so the
/// value ranges below are assumptions to confirm with the producer.
#[derive(Debug, Clone, Default)]
pub struct BedRockMetrics {
    /// Decomposition depth (a count).
    pub decomposition_depth: usize,
    /// Axiom validity score (presumably in `[0, 1]`).
    pub axiom_validity: f64,
    /// Reconstruction rate (presumably in `[0, 1]`).
    pub reconstruction_rate: f64,
    /// Assumption-surfacing score (presumably in `[0, 1]`).
    pub assumption_surfacing: f64,
}
394
/// Measurements specific to the ProofGuard tool; `Default` yields all zeros.
///
/// Plain data container: no code visible in this file fills it in, so the
/// value ranges below are assumptions to confirm with the producer.
#[derive(Debug, Clone, Default)]
pub struct ProofGuardMetrics {
    /// Triangulation rate (presumably in `[0, 1]`).
    pub triangulation_rate: f64,
    /// Contradiction detection score (presumably in `[0, 1]`).
    pub contradiction_detection: f64,
    /// Source quality score (scale not defined here).
    pub source_quality_score: f64,
    /// Citation accuracy (presumably in `[0, 1]`).
    pub citation_accuracy: f64,
}
407
/// Measurements specific to the BrutalHonesty tool; `Default` yields all zeros.
///
/// Plain data container: no code visible in this file fills it in, so the
/// value ranges below are assumptions to confirm with the producer.
#[derive(Debug, Clone, Default)]
pub struct BrutalHonestyMetrics {
    /// Flaw detection rate (presumably in `[0, 1]`).
    pub flaw_detection_rate: f64,
    /// False positive rate (presumably in `[0, 1]`).
    pub false_positive_rate: f64,
    /// Average number of suggestions per flaw.
    pub suggestions_per_flaw: f64,
    /// Severity calibration score (scale not defined here).
    pub severity_calibration: f64,
}
420
421pub fn calculate_thinktool_delta(without: &BenchmarkResult, with: &BenchmarkResult) -> f64 {
423 with.accuracy - without.accuracy
424}
425
/// Reports whether an accuracy difference is statistically significant under
/// a simplified two-sided z-test.
///
/// The standard error is approximated as `sqrt(0.25 / n)`, i.e. the variance
/// of a proportion at `p = 0.5`, which is the largest it can be.
///
/// * `delta` – accuracy difference, in the same units as the accuracies it
///   was computed from (fractions such as `0.10` elsewhere in this file).
/// * `n` – sample size. With `n == 0` nothing can be significant and the
///   function returns `false`.
/// * `alpha` – significance level. Only three critical values are used:
///   `alpha <= 0.01` → 2.576, `alpha <= 0.05` → 1.96, anything else → 1.645.
///
/// Returns `true` when `|delta / standard_error|` exceeds the critical value.
pub fn is_significant(delta: f64, n: usize, alpha: f64) -> bool {
    const WORST_CASE_VARIANCE: f64 = 0.25;
    const CRITICAL_Z_ALPHA_01: f64 = 2.576;
    const CRITICAL_Z_ALPHA_05: f64 = 1.96;
    const CRITICAL_Z_ALPHA_10: f64 = 1.645;

    // Without samples the standard error is infinite; state the outcome
    // explicitly instead of relying on division by infinity.
    if n == 0 {
        return false;
    }

    let standard_error = (WORST_CASE_VARIANCE / n as f64).sqrt();
    let z_score = delta / standard_error;

    let critical_value = if alpha <= 0.01 {
        CRITICAL_Z_ALPHA_01
    } else if alpha <= 0.05 {
        CRITICAL_Z_ALPHA_05
    } else {
        CRITICAL_Z_ALPHA_10
    };

    z_score.abs() > critical_value
}
441
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a question result whose answer is "42" with the given confidence.
    fn answered_42(confidence: f64) -> QuestionResult {
        QuestionResult {
            id: "q1".into(),
            correct: true,
            confidence: Some(confidence),
            answer: "42".into(),
            expected: "42".into(),
            reasoning: None,
        }
    }

    /// Builds an aggregate-only gsm8k result over 100 questions.
    fn gsm8k_result(profile: Profile, accuracy: f64, correct: usize) -> BenchmarkResult {
        BenchmarkResult {
            benchmark: "gsm8k".into(),
            profile,
            accuracy,
            correct,
            total: 100,
            question_results: vec![],
        }
    }

    #[test]
    fn test_profile_thinktools() {
        assert!(Profile::None.thinktools().is_empty());
        assert_eq!(Profile::Quick.thinktools().len(), 2);
        assert_eq!(Profile::Balanced.thinktools().len(), 4);
        assert_eq!(Profile::Deep.thinktools().len(), 5);
        // Paranoid lists the same five tools as Deep.
        assert_eq!(Profile::Paranoid.thinktools().len(), 5);
    }

    #[test]
    fn test_profile_min_confidence() {
        assert_eq!(Profile::None.min_confidence(), 0.0);
        assert_eq!(Profile::Quick.min_confidence(), 0.70);
        assert_eq!(Profile::Balanced.min_confidence(), 0.80);
        assert_eq!(Profile::Deep.min_confidence(), 0.85);
        assert_eq!(Profile::Paranoid.min_confidence(), 0.95);
    }

    #[test]
    fn test_profile_chain_length() {
        assert_eq!(Profile::None.chain_length(), 0);
        assert_eq!(Profile::Quick.chain_length(), 2);
        assert_eq!(Profile::Balanced.chain_length(), 4);
        assert_eq!(Profile::Deep.chain_length(), 5);
        // One more than the number of tools Paranoid lists.
        assert_eq!(Profile::Paranoid.chain_length(), 6);
    }

    #[test]
    fn test_profile_from_id() {
        assert_eq!(Profile::from_id("quick"), Some(Profile::Quick));
        assert_eq!(Profile::from_id("BALANCED"), Some(Profile::Balanced));
        assert_eq!(Profile::from_id("paranoid"), Some(Profile::Paranoid));
        assert_eq!(Profile::from_id("baseline"), Some(Profile::None));
        assert_eq!(Profile::from_id("invalid"), None);
    }

    #[test]
    fn test_profile_id() {
        assert_eq!(Profile::None.id(), "none");
        assert_eq!(Profile::Quick.id(), "quick");
        assert_eq!(Profile::Balanced.id(), "balanced");
        assert_eq!(Profile::Deep.id(), "deep");
        assert_eq!(Profile::Paranoid.id(), "paranoid");

        // Every canonical id must parse back to the profile it came from.
        let profiles = [
            Profile::None,
            Profile::Quick,
            Profile::Balanced,
            Profile::Deep,
            Profile::Paranoid,
        ];
        for profile in profiles.iter() {
            assert_eq!(Profile::from_id(profile.id()), Some(*profile));
        }
    }

    #[test]
    fn test_improvement_calculation() {
        let baseline = gsm8k_result(Profile::None, 0.57, 57);
        let treatment = gsm8k_result(Profile::Balanced, 0.78, 78);

        let improvement = treatment.improvement_over(&baseline);
        assert!((improvement - 0.21).abs() < 0.001);
    }

    #[test]
    fn test_consistency_from_runs() {
        let runs = vec![vec![answered_42(0.9)], vec![answered_42(0.85)]];

        let consistency = ConsistencyMetrics::from_runs(&runs);
        assert_eq!(consistency.answer_agreement, 1.0);
        assert_eq!(consistency.num_runs, 2);
    }

    #[test]
    fn test_calibration_ece() {
        // 80 of 100 answers are correct and every answer claims 0.8
        // confidence, so confidence matches accuracy.
        let results: Vec<QuestionResult> = (0..100)
            .map(|i| QuestionResult {
                id: format!("q{}", i),
                correct: i < 80,
                confidence: Some(0.8),
                answer: "x".into(),
                expected: if i < 80 { "x" } else { "y" }.into(),
                reasoning: None,
            })
            .collect();

        let calibration = CalibrationMetrics::from_results(&results);
        assert!(calibration.ece < 0.1);
    }

    #[test]
    fn test_significance() {
        // Large delta on a large sample: z is about 6.3.
        assert!(is_significant(0.10, 1000, 0.05));
        // Small delta on a small sample: z is about 0.28.
        assert!(!is_significant(0.02, 50, 0.05));
    }
}