pub mod metrics;
pub mod reasoning;
pub use metrics::{
average_precision, mean_average_precision, mean_reciprocal_rank, ndcg_at_k, precision_at_k,
recall_at_k, EvaluationResult, QueryResult, RetrievalMetrics,
};
pub use reasoning::{
BedRockMetrics, BenchmarkResult, BrutalHonestyMetrics, CalibrationMetrics, ConsistencyMetrics,
GigaThinkMetrics, LaserLogicMetrics, Profile, ProofGuardMetrics, ReasoningMetrics,
ThinkToolMetrics,
};
/// Options for a reasoning-evaluation run.
///
/// Within this file only `benchmarks` is read (by `evaluate_reasoning`); the
/// remaining fields are presumably consumed by the evaluation driver that
/// lives elsewhere.
#[derive(Debug, Clone)]
pub struct ReasoningEvalConfig {
    /// Number of consistency runs (default `5`).
    /// NOTE(review): presumably the number of repeated runs per item; confirm
    /// against the code that reads it.
    pub consistency_runs: usize,
    /// Reasoning profile to evaluate (default `Profile::Balanced`).
    pub profile: Profile,
    /// Names of the benchmarks tracked by the evaluation summary.
    pub benchmarks: Vec<String>,
    /// Flag for calibration measurement (default `true`).
    pub measure_calibration: bool,
    /// Flag for ThinkTool-effectiveness measurement (default `true`).
    pub measure_thinktool_effectiveness: bool,
}

impl Default for ReasoningEvalConfig {
    /// Returns the default configuration: five consistency runs, the balanced
    /// profile, the `gsm8k` and `arc_challenge` benchmarks, and both
    /// measurement flags switched on.
    fn default() -> Self {
        let benchmarks: Vec<String> = ["gsm8k", "arc_challenge"]
            .iter()
            .map(|name| name.to_string())
            .collect();

        Self {
            consistency_runs: 5,
            profile: Profile::Balanced,
            benchmarks,
            measure_calibration: true,
            measure_thinktool_effectiveness: true,
        }
    }
}
/// Options selecting which retrieval metrics are computed.
///
/// None of these fields is read in this file.
/// NOTE(review): presumably they drive the retrieval metric functions
/// re-exported from `metrics`; confirm against the caller.
#[derive(Debug, Clone)]
pub struct RetrievalEvalConfig {
    /// Cut-off values `k` (default `[5, 10, 20]`).
    pub k_values: Vec<usize>,
    /// Flag for mean reciprocal rank (default `true`).
    pub compute_mrr: bool,
    /// Flag for mean average precision (default `true`).
    pub compute_map: bool,
}

impl Default for RetrievalEvalConfig {
    /// Returns the default configuration: cut-offs 5, 10 and 20 with both the
    /// MRR and MAP flags enabled.
    fn default() -> Self {
        let default_cutoffs = [5, 10, 20];

        Self {
            k_values: default_cutoffs.to_vec(),
            compute_mrr: true,
            compute_map: true,
        }
    }
}
/// Aggregates benchmark results into a finalized `ReasoningEvalSummary`.
///
/// A summary is created for `config.benchmarks`, every entry of `results` is
/// fed into it in slice order, and the finalized summary is returned. Only the
/// `benchmarks` field of `config` is read here.
pub fn evaluate_reasoning(
    results: &[BenchmarkResult],
    config: &ReasoningEvalConfig,
) -> ReasoningEvalSummary {
    let empty = ReasoningEvalSummary::new(&config.benchmarks);

    results
        .iter()
        .fold(empty, |mut acc, item| {
            acc.add_result(item);
            acc
        })
        .finalize()
}
/// Aggregated outcome of a reasoning evaluation.
///
/// Created by [`ReasoningEvalSummary::new`], filled one benchmark result at a
/// time, then finalized so that `accuracy` holds per-benchmark means. The
/// `improvement`, `self_consistency`, `calibration_ece` and `thinktool_scores`
/// fields are only initialised here; nothing in this type computes them, so
/// callers are expected to set them before calling
/// [`ReasoningEvalSummary::check_targets`].
#[derive(Debug, Clone)]
pub struct ReasoningEvalSummary {
    /// Length of the benchmark list passed to `new` (repeated names count
    /// each time they appear).
    pub num_benchmarks: usize,
    /// Mean accuracy per benchmark; stays `0.0` for a benchmark that received
    /// no results.
    pub accuracy: std::collections::HashMap<String, f64>,
    /// Improvement per benchmark as a fraction (`0.15` is reported as 15%);
    /// initialised to `0.0`.
    pub improvement: std::collections::HashMap<String, f64>,
    /// Self-consistency score as a fraction; initialised to `0.0`.
    pub self_consistency: f64,
    /// Calibration ECE value (lower passes); initialised to `0.0`.
    pub calibration_ece: f64,
    /// ThinkTool scores keyed by name; starts empty.
    pub thinktool_scores: std::collections::HashMap<String, f64>,
    /// Running `(sum of accuracies, number of results)` per tracked benchmark.
    accuracy_sums: std::collections::HashMap<String, (f64, usize)>,
}

impl ReasoningEvalSummary {
    /// Creates an empty summary tracking the given benchmark names.
    ///
    /// Each name is registered with an accuracy and improvement of `0.0` and
    /// an empty running accumulator.
    pub fn new(benchmarks: &[String]) -> Self {
        use std::collections::HashMap;

        let tracked = benchmarks.len();
        let mut accuracy = HashMap::with_capacity(tracked);
        let mut improvement = HashMap::with_capacity(tracked);
        let mut accuracy_sums = HashMap::with_capacity(tracked);
        for name in benchmarks {
            accuracy.insert(name.clone(), 0.0);
            improvement.insert(name.clone(), 0.0);
            accuracy_sums.insert(name.clone(), (0.0, 0));
        }

        Self {
            num_benchmarks: tracked,
            accuracy,
            improvement,
            self_consistency: 0.0,
            calibration_ece: 0.0,
            thinktool_scores: HashMap::new(),
            accuracy_sums,
        }
    }

    /// Adds one result to the running accuracy total of its benchmark.
    ///
    /// Results for a benchmark that was not registered in `new` are ignored.
    fn add_result(&mut self, result: &BenchmarkResult) {
        if let Some(entry) = self.accuracy_sums.get_mut(&result.benchmark) {
            entry.0 += result.accuracy;
            entry.1 += 1;
        }
    }

    /// Converts the running totals into mean accuracies and returns the
    /// summary.
    ///
    /// Benchmarks without any result keep their initial accuracy of `0.0`,
    /// which also avoids a division by zero.
    fn finalize(mut self) -> Self {
        for (name, &(sum, count)) in &self.accuracy_sums {
            if count > 0 {
                self.accuracy.insert(name.clone(), sum / count as f64);
            }
        }
        self
    }

    /// Compares the summary against `targets` and reports every missed target.
    ///
    /// Checks, in this order:
    /// 1. the GSM8K, ARC-C and LogiQA improvements (must be at least the target),
    /// 2. self-consistency (must be at least the target),
    /// 3. calibration ECE (must not exceed the target).
    ///
    /// A target set to `None` is skipped, and so is an improvement target
    /// whose benchmark has no entry in `self.improvement`. The returned
    /// `TargetResult` has `passed == true` exactly when `failures` is empty.
    pub fn check_targets(&self, targets: &ReasoningTargets) -> TargetResult {
        // (label used in the message, key in `self.improvement`, required minimum)
        //
        // NOTE(review): "arc_challenge" mirrors the benchmark name used by
        // `ReasoningEvalConfig::default()`; "logiqa" is assumed by analogy with
        // "gsm8k" — confirm against the benchmark runner.
        let improvement_checks = [
            ("GSM8K", "gsm8k", targets.gsm8k_improvement),
            ("ARC-C", "arc_challenge", targets.arc_c_improvement),
            ("LogiQA", "logiqa", targets.logiqa_improvement),
        ];

        let mut failures = Vec::new();

        for &(label, key, minimum) in &improvement_checks {
            let (minimum, actual) = match (minimum, self.improvement.get(key)) {
                (Some(minimum), Some(&actual)) => (minimum, actual),
                // No target configured, or no improvement recorded: nothing to check.
                _ => continue,
            };
            if actual < minimum {
                failures.push(format!(
                    "{} improvement: {:.1}% < {:.1}%",
                    label,
                    actual * 100.0,
                    minimum * 100.0
                ));
            }
        }

        if let Some(minimum) = targets.self_consistency {
            if self.self_consistency < minimum {
                failures.push(format!(
                    "Self-consistency: {:.1}% < {:.1}%",
                    self.self_consistency * 100.0,
                    minimum * 100.0
                ));
            }
        }

        if let Some(maximum) = targets.calibration_ece_max {
            if self.calibration_ece > maximum {
                failures.push(format!(
                    "Calibration ECE: {:.3} > {:.3}",
                    self.calibration_ece, maximum
                ));
            }
        }

        TargetResult {
            passed: failures.is_empty(),
            failures,
        }
    }
}
/// Pass/fail thresholds for a reasoning evaluation.
///
/// Every field is optional; the derived `Default` leaves all of them `None`.
/// Values are fractions: the target check multiplies improvement and
/// self-consistency values by 100 when reporting them as percentages.
#[derive(Debug, Clone, Default)]
pub struct ReasoningTargets {
    /// Minimum improvement on GSM8K.
    pub gsm8k_improvement: Option<f64>,
    /// Minimum improvement on ARC-C.
    pub arc_c_improvement: Option<f64>,
    /// Minimum improvement on LogiQA.
    pub logiqa_improvement: Option<f64>,
    /// Minimum self-consistency score.
    pub self_consistency: Option<f64>,
    /// Maximum acceptable calibration ECE.
    pub calibration_ece_max: Option<f64>,
}

impl ReasoningTargets {
    /// Returns the `v1` target set: +15% GSM8K, +8% ARC-C, no LogiQA target,
    /// 85% self-consistency and a calibration ECE of at most 0.10.
    pub fn v1_targets() -> Self {
        Self {
            gsm8k_improvement: Some(0.15),
            arc_c_improvement: Some(0.08),
            self_consistency: Some(0.85),
            calibration_ece_max: Some(0.10),
            // LogiQA has no v1 target; `Default` leaves it as `None`.
            ..Self::default()
        }
    }

    /// Returns the `v1.5` target set: +20% GSM8K, +10% ARC-C, +20% LogiQA,
    /// 90% self-consistency and a calibration ECE of at most 0.08.
    pub fn v1_5_targets() -> Self {
        let (gsm8k, arc_c, logiqa) = (0.20, 0.10, 0.20);
        let (consistency, ece_ceiling) = (0.90, 0.08);

        Self {
            gsm8k_improvement: Some(gsm8k),
            arc_c_improvement: Some(arc_c),
            logiqa_improvement: Some(logiqa),
            self_consistency: Some(consistency),
            calibration_ece_max: Some(ece_ceiling),
        }
    }
}
/// Outcome of comparing an evaluation summary against a set of targets.
#[derive(Debug, Clone)]
pub struct TargetResult {
    /// `true` when no target was missed.
    pub passed: bool,
    /// One human-readable message per missed target; empty on success.
    pub failures: Vec<String>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a summary that tracks only `gsm8k`, pre-filled with the given
    /// observed metrics.
    fn gsm8k_summary(
        improvement: f64,
        self_consistency: f64,
        calibration_ece: f64,
    ) -> ReasoningEvalSummary {
        let mut summary = ReasoningEvalSummary::new(&[String::from("gsm8k")]);
        summary
            .improvement
            .insert(String::from("gsm8k"), improvement);
        summary.self_consistency = self_consistency;
        summary.calibration_ece = calibration_ece;
        summary
    }

    #[test]
    fn test_reasoning_config_default() {
        let ReasoningEvalConfig {
            consistency_runs,
            measure_calibration,
            ..
        } = ReasoningEvalConfig::default();

        assert_eq!(consistency_runs, 5);
        assert!(measure_calibration);
    }

    #[test]
    fn test_v1_targets() {
        let v1 = ReasoningTargets::v1_targets();

        assert_eq!(v1.gsm8k_improvement, Some(0.15));
        assert_eq!(v1.self_consistency, Some(0.85));
    }

    #[test]
    fn test_target_check_pass() {
        // Every observed metric is on the passing side of its v1 threshold.
        let observed = gsm8k_summary(0.20, 0.90, 0.05);

        let outcome = observed.check_targets(&ReasoningTargets::v1_targets());

        assert!(outcome.passed);
        assert!(outcome.failures.is_empty());
    }

    #[test]
    fn test_target_check_fail() {
        // GSM8K improvement, self-consistency and calibration all miss v1.
        let observed = gsm8k_summary(0.10, 0.75, 0.15);

        let outcome = observed.check_targets(&ReasoningTargets::v1_targets());

        assert!(!outcome.passed);
        assert_eq!(outcome.failures.len(), 3);
    }
}