use std::collections::HashMap;
/// Reasoning intensity profiles, ordered from no augmentation to maximal rigor.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Profile {
    /// Baseline: no ThinkTools applied.
    None,
    Quick,
    Balanced,
    Deep,
    Paranoid,
}

impl Profile {
    /// Returns the set of ThinkTools enabled for this profile.
    ///
    /// Deep and Paranoid currently enable the same five tools; Paranoid
    /// differs only via a stricter confidence floor and a longer chain.
    pub fn thinktools(&self) -> Vec<ThinkTool> {
        let tools: &[ThinkTool] = match self {
            Profile::None => &[],
            Profile::Quick => &[ThinkTool::GigaThink, ThinkTool::LaserLogic],
            Profile::Balanced => &[
                ThinkTool::GigaThink,
                ThinkTool::LaserLogic,
                ThinkTool::BedRock,
                ThinkTool::ProofGuard,
            ],
            Profile::Deep | Profile::Paranoid => &[
                ThinkTool::GigaThink,
                ThinkTool::LaserLogic,
                ThinkTool::BedRock,
                ThinkTool::ProofGuard,
                ThinkTool::BrutalHonesty,
            ],
        };
        tools.to_vec()
    }

    /// Minimum confidence threshold associated with this profile.
    pub fn min_confidence(&self) -> f64 {
        match self {
            Profile::None => 0.0,
            Profile::Quick => 0.70,
            Profile::Balanced => 0.80,
            Profile::Deep => 0.85,
            Profile::Paranoid => 0.95,
        }
    }

    /// Length of the reasoning chain for this profile.
    pub fn chain_length(&self) -> usize {
        match self {
            Profile::None => 0,
            Profile::Quick => 2,
            Profile::Balanced => 4,
            Profile::Deep => 5,
            Profile::Paranoid => 6,
        }
    }

    /// Parses a case-insensitive identifier into a profile.
    /// `"baseline"` is accepted as an alias for [`Profile::None`].
    pub fn from_id(id: &str) -> Option<Self> {
        let normalized = id.to_lowercase();
        match normalized.as_str() {
            "none" | "baseline" => Some(Profile::None),
            "quick" => Some(Profile::Quick),
            "balanced" => Some(Profile::Balanced),
            "deep" => Some(Profile::Deep),
            "paranoid" => Some(Profile::Paranoid),
            _ => None,
        }
    }

    /// Canonical lowercase identifier; inverse of [`Profile::from_id`].
    pub fn id(&self) -> &'static str {
        match self {
            Profile::None => "none",
            Profile::Quick => "quick",
            Profile::Balanced => "balanced",
            Profile::Deep => "deep",
            Profile::Paranoid => "paranoid",
        }
    }
}

/// Individual reasoning tools that profiles compose into chains.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ThinkTool {
    GigaThink,
    LaserLogic,
    BedRock,
    ProofGuard,
    BrutalHonesty,
}
/// Outcome of running one benchmark under a single reasoning profile.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Benchmark identifier (e.g. "gsm8k").
    pub benchmark: String,
    /// Profile the benchmark was run with.
    pub profile: Profile,
    /// Fraction of questions answered correctly.
    pub accuracy: f64,
    /// Number of correctly answered questions.
    pub correct: usize,
    /// Total number of questions attempted.
    pub total: usize,
    /// Per-question details for this run.
    pub question_results: Vec<QuestionResult>,
}
impl BenchmarkResult {
    /// Accuracy delta relative to `baseline` (positive means this run
    /// scored higher than the baseline).
    pub fn improvement_over(&self, baseline: &BenchmarkResult) -> f64 {
        let (ours, theirs) = (self.accuracy, baseline.accuracy);
        ours - theirs
    }
}
/// Result of a single benchmark question.
#[derive(Debug, Clone)]
pub struct QuestionResult {
    /// Question identifier.
    pub id: String,
    /// Whether the produced answer matched the expected one.
    pub correct: bool,
    /// Self-reported confidence, when available.
    /// NOTE(review): calibration binning assumes values in [0.0, 1.0] —
    /// confirm producers respect that range.
    pub confidence: Option<f64>,
    /// Answer actually produced.
    pub answer: String,
    /// Expected (gold) answer.
    pub expected: String,
    /// Optional reasoning trace accompanying the answer.
    pub reasoning: Option<String>,
}
/// Top-level metrics summarizing reasoning quality for a profile.
#[derive(Debug, Clone)]
pub struct ReasoningMetrics {
    /// Overall accuracy (fraction correct).
    pub accuracy: f64,
    /// Accuracy delta versus the baseline.
    pub improvement: f64,
    /// Run-to-run consistency statistics.
    pub consistency: ConsistencyMetrics,
    /// Confidence-calibration statistics.
    pub calibration: CalibrationMetrics,
    /// Per-ThinkTool impact metrics.
    pub thinktool_metrics: HashMap<ThinkTool, ThinkToolMetrics>,
}
/// Agreement statistics across repeated runs over the same questions.
#[derive(Debug, Clone, Default)]
pub struct ConsistencyMetrics {
    /// Fraction of questions where every run produced the same answer.
    pub answer_agreement: f64,
    /// Agreement of reasoning traces — currently always 0.0 (not implemented).
    pub reasoning_agreement: f64,
    /// Population variance of all reported confidences across runs.
    pub confidence_variance: f64,
    /// Number of runs aggregated.
    pub num_runs: usize,
}
impl ConsistencyMetrics {
    /// Aggregates agreement and confidence statistics across repeated runs.
    ///
    /// `runs` is a list of runs, each holding per-question results; questions
    /// are matched by position, with the first run defining the question set.
    /// Runs shorter than the first run are treated as disagreeing on the
    /// missing positions (the previous implementation indexed `run[q_idx]`
    /// directly and would panic on ragged input).
    ///
    /// Returns `Self::default()` when there are no runs or no questions.
    /// `reasoning_agreement` is not yet computed and is reported as 0.0.
    pub fn from_runs(runs: &[Vec<QuestionResult>]) -> Self {
        if runs.is_empty() || runs[0].is_empty() {
            return Self::default();
        }
        let num_runs = runs.len();
        let num_questions = runs[0].len();
        let mut answer_agreements = 0usize;
        let mut confidence_sum = 0.0;
        let mut confidence_sq_sum = 0.0;
        let mut confidence_count = 0usize;
        for q_idx in 0..num_questions {
            let first_answer = &runs[0][q_idx].answer;
            // A question "agrees" only when every run covers it and gives the
            // same answer as the first run; a missing entry counts as disagreement.
            let all_agree = runs
                .iter()
                .all(|run| run.get(q_idx).map_or(false, |r| &r.answer == first_answer));
            if all_agree {
                answer_agreements += 1;
            }
            // Accumulate confidences for the variance computation below.
            for run in runs {
                if let Some(conf) = run.get(q_idx).and_then(|r| r.confidence) {
                    confidence_sum += conf;
                    confidence_sq_sum += conf * conf;
                    confidence_count += 1;
                }
            }
        }
        let answer_agreement = answer_agreements as f64 / num_questions as f64;
        // Population variance via E[X^2] - E[X]^2; needs at least two samples.
        let confidence_variance = if confidence_count > 1 {
            let mean = confidence_sum / confidence_count as f64;
            (confidence_sq_sum / confidence_count as f64) - (mean * mean)
        } else {
            0.0
        };
        Self {
            answer_agreement,
            // Reasoning-trace agreement is not yet implemented.
            reasoning_agreement: 0.0,
            confidence_variance,
            num_runs,
        }
    }
}
/// Confidence-calibration statistics derived from per-question results.
#[derive(Debug, Clone, Default)]
pub struct CalibrationMetrics {
    /// Expected Calibration Error over 10 equal-width confidence bins.
    pub ece: f64,
    /// Fraction of results with confidence > 0.8 that were wrong.
    pub overconfidence_rate: f64,
    /// Fraction of results with confidence < 0.5 that were right.
    pub underconfidence_rate: f64,
    /// Mean squared error between confidence and the 0/1 correctness outcome.
    pub brier_score: f64,
}
impl CalibrationMetrics {
    /// Computes calibration statistics from per-question results.
    ///
    /// Only results that report a confidence participate; returns
    /// `Self::default()` when none do.
    pub fn from_results(results: &[QuestionResult]) -> Self {
        // Flatten to (confidence, correct) pairs once up front.
        let scored: Vec<(f64, bool)> = results
            .iter()
            .filter_map(|r| r.confidence.map(|c| (c, r.correct)))
            .collect();
        if scored.is_empty() {
            return Self::default();
        }
        let n = scored.len() as f64;

        // Expected Calibration Error over 10 equal-width confidence bins:
        // sum over bins of (bin weight) * |avg confidence - accuracy|.
        const NUM_BINS: usize = 10;
        let mut bins: Vec<Vec<(f64, bool)>> = vec![Vec::new(); NUM_BINS];
        for &(conf, correct) in &scored {
            // Clamp so conf == 1.0 lands in the top bin.
            let bin_idx = ((conf * NUM_BINS as f64) as usize).min(NUM_BINS - 1);
            bins[bin_idx].push((conf, correct));
        }
        let ece: f64 = bins
            .iter()
            .filter(|bin| !bin.is_empty())
            .map(|bin| {
                let bin_size = bin.len() as f64;
                let avg_confidence = bin.iter().map(|&(c, _)| c).sum::<f64>() / bin_size;
                let accuracy = bin.iter().filter(|&&(_, ok)| ok).count() as f64 / bin_size;
                (bin_size / n) * (avg_confidence - accuracy).abs()
            })
            .sum();

        // Confident (> 0.8) yet wrong.
        let overconfidence_rate =
            scored.iter().filter(|&&(c, ok)| c > 0.8 && !ok).count() as f64 / n;
        // Unconfident (< 0.5) yet right.
        let underconfidence_rate =
            scored.iter().filter(|&&(c, ok)| c < 0.5 && ok).count() as f64 / n;

        // Brier score: mean squared gap between confidence and outcome.
        let brier_score = scored
            .iter()
            .map(|&(c, ok)| {
                let outcome = if ok { 1.0 } else { 0.0 };
                (c - outcome).powi(2)
            })
            .sum::<f64>()
            / n;

        Self {
            ece,
            overconfidence_rate,
            underconfidence_rate,
            brier_score,
        }
    }
}
/// Per-tool impact metrics.
#[derive(Debug, Clone, Default)]
pub struct ThinkToolMetrics {
    /// Accuracy delta attributable to enabling the tool.
    pub improvement_delta: f64,
    /// Whether the tool is judged cost-effective.
    pub cost_effective: bool,
    /// Latency attributed to the tool, in milliseconds.
    pub latency_ms: f64,
}
/// Quality metrics specific to the GigaThink tool.
#[derive(Debug, Clone, Default)]
pub struct GigaThinkMetrics {
    pub perspective_count: usize,
    pub coverage_score: f64,
    pub novelty_rate: f64,
    pub integration_quality: f64,
}
/// Quality metrics specific to the LaserLogic tool.
#[derive(Debug, Clone, Default)]
pub struct LaserLogicMetrics {
    pub validity_rate: f64,
    pub fallacy_detection_rate: f64,
    pub precision: f64,
    pub soundness: f64,
}
/// Quality metrics specific to the BedRock tool.
#[derive(Debug, Clone, Default)]
pub struct BedRockMetrics {
    pub decomposition_depth: usize,
    pub axiom_validity: f64,
    pub reconstruction_rate: f64,
    pub assumption_surfacing: f64,
}
/// Quality metrics specific to the ProofGuard tool.
#[derive(Debug, Clone, Default)]
pub struct ProofGuardMetrics {
    pub triangulation_rate: f64,
    pub contradiction_detection: f64,
    pub source_quality_score: f64,
    pub citation_accuracy: f64,
}
/// Quality metrics specific to the BrutalHonesty tool.
#[derive(Debug, Clone, Default)]
pub struct BrutalHonestyMetrics {
    pub flaw_detection_rate: f64,
    pub false_positive_rate: f64,
    pub suggestions_per_flaw: f64,
    pub severity_calibration: f64,
}
/// Accuracy gain attributable to a ThinkTool: the score with the tool
/// enabled minus the score without it (negative when the tool hurts).
pub fn calculate_thinktool_delta(without: &BenchmarkResult, with: &BenchmarkResult) -> f64 {
    let (enhanced, baseline) = (with.accuracy, without.accuracy);
    enhanced - baseline
}
/// Two-sided z-test for an accuracy delta using the most conservative
/// binomial variance bound (p(1-p) <= 0.25).
///
/// `n` is the sample size; `alpha` selects the critical value
/// (<= 0.01 -> 2.576, <= 0.05 -> 1.96, otherwise 1.645).
/// NOTE: a normal approximation, not an exact test.
pub fn is_significant(delta: f64, n: usize, alpha: f64) -> bool {
    let standard_error = (0.25 / n as f64).sqrt();
    let z_score = (delta / standard_error).abs();
    let critical = match alpha {
        a if a <= 0.01 => 2.576,
        a if a <= 0.05 => 1.96,
        _ => 1.645,
    };
    z_score > critical
}
#[cfg(test)]
mod tests {
    use super::*;

    // Each profile maps to a fixed tool set.
    // NOTE(review): Paranoid yields the same 5 tools as Deep, yet
    // chain_length() reports 6 — confirm whether this asymmetry is intended.
    #[test]
    fn test_profile_thinktools() {
        assert!(Profile::None.thinktools().is_empty());
        assert_eq!(Profile::Quick.thinktools().len(), 2);
        assert_eq!(Profile::Balanced.thinktools().len(), 4);
        assert_eq!(Profile::Deep.thinktools().len(), 5);
        assert_eq!(Profile::Paranoid.thinktools().len(), 5);
    }

    // Confidence floors rise monotonically with profile strictness.
    #[test]
    fn test_profile_min_confidence() {
        assert_eq!(Profile::None.min_confidence(), 0.0);
        assert_eq!(Profile::Quick.min_confidence(), 0.70);
        assert_eq!(Profile::Balanced.min_confidence(), 0.80);
        assert_eq!(Profile::Deep.min_confidence(), 0.85);
        assert_eq!(Profile::Paranoid.min_confidence(), 0.95);
    }

    #[test]
    fn test_profile_chain_length() {
        assert_eq!(Profile::None.chain_length(), 0);
        assert_eq!(Profile::Quick.chain_length(), 2);
        assert_eq!(Profile::Balanced.chain_length(), 4);
        assert_eq!(Profile::Deep.chain_length(), 5);
        assert_eq!(Profile::Paranoid.chain_length(), 6);
    }

    // Parsing is case-insensitive and accepts "baseline" as an alias for None.
    #[test]
    fn test_profile_from_id() {
        assert_eq!(Profile::from_id("quick"), Some(Profile::Quick));
        assert_eq!(Profile::from_id("BALANCED"), Some(Profile::Balanced));
        assert_eq!(Profile::from_id("paranoid"), Some(Profile::Paranoid));
        assert_eq!(Profile::from_id("baseline"), Some(Profile::None));
        assert_eq!(Profile::from_id("invalid"), None);
    }

    #[test]
    fn test_profile_id() {
        assert_eq!(Profile::Quick.id(), "quick");
        assert_eq!(Profile::Balanced.id(), "balanced");
        assert_eq!(Profile::Deep.id(), "deep");
        assert_eq!(Profile::Paranoid.id(), "paranoid");
    }

    // improvement_over is a plain accuracy delta: 0.78 - 0.57 = 0.21.
    #[test]
    fn test_improvement_calculation() {
        let baseline = BenchmarkResult {
            benchmark: "gsm8k".into(),
            profile: Profile::None,
            accuracy: 0.57,
            correct: 57,
            total: 100,
            question_results: vec![],
        };
        let treatment = BenchmarkResult {
            benchmark: "gsm8k".into(),
            profile: Profile::Balanced,
            accuracy: 0.78,
            correct: 78,
            total: 100,
            question_results: vec![],
        };
        let improvement = treatment.improvement_over(&baseline);
        assert!((improvement - 0.21).abs() < 0.001);
    }

    // Two runs giving the same answer to the one question -> full agreement.
    #[test]
    fn test_consistency_from_runs() {
        let runs = vec![
            vec![QuestionResult {
                id: "q1".into(),
                correct: true,
                confidence: Some(0.9),
                answer: "42".into(),
                expected: "42".into(),
                reasoning: None,
            }],
            vec![QuestionResult {
                id: "q1".into(),
                correct: true,
                confidence: Some(0.85),
                answer: "42".into(),
                expected: "42".into(),
                reasoning: None,
            }],
        ];
        let consistency = ConsistencyMetrics::from_runs(&runs);
        assert_eq!(consistency.answer_agreement, 1.0);
        assert_eq!(consistency.num_runs, 2);
    }

    // 100 results all at confidence 0.8 with 80% actually correct are
    // perfectly calibrated, so the ECE should be near zero.
    #[test]
    fn test_calibration_ece() {
        let results: Vec<QuestionResult> = (0..100)
            .map(|i| QuestionResult {
                id: format!("q{}", i),
                correct: i < 80,
                confidence: Some(0.8),
                answer: "x".into(),
                expected: if i < 80 { "x" } else { "y" }.into(),
                reasoning: None,
            })
            .collect();
        let calibration = CalibrationMetrics::from_results(&results);
        assert!(calibration.ece < 0.1);
    }

    // Sanity checks for the conservative z-test approximation.
    #[test]
    fn test_significance() {
        assert!(is_significant(0.10, 1000, 0.05));
        assert!(!is_significant(0.02, 50, 0.05));
    }
}