use serde::{Deserialize, Serialize};
use crate::run::EvalRun;
/// Summary statistics for a sample of `f64` values, as computed by
/// `Statistics::from_values`.
///
/// The `Default` value (every field zero) is what `from_values` returns for an
/// empty sample.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Statistics {
/// Number of values in the sample.
pub n: usize,
/// Arithmetic mean of the sample.
pub mean: f64,
/// Sample standard deviation (divides by `n - 1`); `0.0` when `n == 1`.
pub std_dev: f64,
/// Lower bound of the 95% confidence interval for the mean
/// (`mean - t * std_dev / sqrt(n)`); equals `mean` when `n == 1`.
pub ci_95_lower: f64,
/// Upper bound of the 95% confidence interval for the mean
/// (`mean + t * std_dev / sqrt(n)`); equals `mean` when `n == 1`.
pub ci_95_upper: f64,
/// Smallest value in the sample.
pub min: f64,
/// Largest value in the sample.
pub max: f64,
}
impl Statistics {
    /// Summarizes `values` as count, mean, sample standard deviation, a 95%
    /// confidence interval for the mean, and the observed minimum and maximum.
    ///
    /// * An empty slice yields `Statistics::default()`.
    /// * A single value yields a zero standard deviation and a confidence
    ///   interval collapsed onto that value.
    /// * Otherwise the standard deviation uses the `n - 1` denominator and the
    ///   interval half-width is `t * std_dev / sqrt(n)`, with `t` taken from
    ///   `t_value_95(n - 1)`.
    pub fn from_values(values: &[f64]) -> Self {
        let count = values.len();
        if count == 0 {
            return Self::default();
        }

        let mean = values.iter().sum::<f64>() / count as f64;

        // One pass tracks both extremes, seeded with the identity of each fold.
        let (min, max) = values.iter().fold(
            (f64::INFINITY, f64::NEG_INFINITY),
            |(lowest, highest), &value| (lowest.min(value), highest.max(value)),
        );

        // A single observation carries no spread information, so the interval
        // is reported as the point itself rather than computed from a margin.
        let (std_dev, ci_95_lower, ci_95_upper) = if count == 1 {
            (0.0, mean, mean)
        } else {
            let squared_deviations: f64 = values.iter().map(|v| (v - mean).powi(2)).sum();
            let std_dev = (squared_deviations / (count - 1) as f64).sqrt();
            let margin = t_value_95(count - 1) * std_dev / (count as f64).sqrt();
            (std_dev, mean - margin, mean + margin)
        };

        Self {
            n: count,
            mean,
            std_dev,
            ci_95_lower,
            ci_95_upper,
            min,
            max,
        }
    }
}
/// Returns the two-sided 95% critical value of Student's t-distribution for
/// `df` degrees of freedom.
///
/// * `df == 0`: the distribution is undefined and the critical value diverges
///   as `df` approaches zero, so `f64::INFINITY` is returned. Without this
///   guard the interpolation below would underflow `usize`.
/// * `df` present in the table: the tabulated value is returned.
/// * `df` between two table rows: the value is linearly interpolated in `df`
///   between the neighbouring rows.
/// * `df` beyond the last table row: the normal approximation `1.96` is used.
fn t_value_95(df: usize) -> f64 {
    // (degrees of freedom, critical value), sorted by degrees of freedom.
    const T_TABLE: &[(usize, f64)] = &[
        (1, 12.706),
        (2, 4.303),
        (3, 3.182),
        (4, 2.776),
        (5, 2.571),
        (6, 2.447),
        (7, 2.365),
        (8, 2.306),
        (9, 2.262),
        (10, 2.228),
        (11, 2.201),
        (12, 2.179),
        (13, 2.160),
        (14, 2.145),
        (15, 2.131),
        (16, 2.120),
        (17, 2.110),
        (18, 2.101),
        (19, 2.093),
        (20, 2.086),
        (25, 2.060),
        (30, 2.042),
        (40, 2.021),
        (50, 2.009),
        (60, 2.000),
        (80, 1.990),
        (100, 1.984),
        (120, 1.980),
    ];
    // Critical value of the standard normal distribution, used once the
    // table runs out.
    const NORMAL_APPROX_95: f64 = 1.96;

    if df == 0 {
        return f64::INFINITY;
    }

    let (max_table_df, _) = T_TABLE[T_TABLE.len() - 1];
    if df > max_table_df {
        return NORMAL_APPROX_95;
    }

    match T_TABLE.binary_search_by_key(&df, |&(table_df, _)| table_df) {
        Ok(index) => T_TABLE[index].1,
        Err(insert_at) => {
            // Here 1 < df < max_table_df and df is not tabulated, so the
            // insertion point has a tabulated neighbour on both sides.
            let (df_low, t_low) = T_TABLE[insert_at - 1];
            let (df_high, t_high) = T_TABLE[insert_at];
            let ratio = (df - df_low) as f64 / (df_high - df_low) as f64;
            t_low + (t_high - t_low) * ratio
        }
    }
}
/// Top-level summary of a batch of evaluation runs, as produced by
/// `Aggregator::aggregate`.
///
/// The `Default` value is what `aggregate` returns for an empty batch.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AggregatedResults {
/// Number of runs that were aggregated.
pub total_runs: usize,
/// Number of runs whose `success` flag was set.
pub successful_runs: usize,
/// `successful_runs / total_runs`.
pub success_rate: f64,
/// pass@1; `aggregate` sets this equal to `success_rate`.
pub pass_at_1: f64,
/// pass@5 estimate. It is computed even when fewer than 5 runs exist, in
/// which case it is `1.0` if any run succeeded and `0.0` otherwise.
pub pass_at_5: f64,
/// pass@10 estimate; `None` when fewer than 10 runs were aggregated.
pub pass_at_10: Option<f64>,
/// Per-metric statistics across the runs.
pub statistics: AggregatedStatistics,
}
/// Per-metric [`Statistics`] computed across runs by `Aggregator::aggregate`,
/// plus batch-wide LLM totals. Each `Statistics` field summarizes one value
/// per run.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AggregatedStatistics {
/// Statistics over each run's `metrics.task.success_rate`.
pub success_rate: Statistics,
/// Statistics over each run's `metrics.task.total_ticks`.
pub total_ticks: Statistics,
/// Statistics over each run's `metrics.performance.tick_latency_p95_ms`.
pub tick_latency_p95_ms: Statistics,
/// Statistics over each run's `metrics.performance.tick_latency_p99_ms`.
pub tick_latency_p99_ms: Statistics,
/// Statistics over each run's `metrics.performance.tick_miss_rate`.
pub tick_miss_rate: Statistics,
/// Statistics over each run's `metrics.performance.tick_jitter`.
pub tick_jitter: Statistics,
/// Statistics over each run's `metrics.coordination.manager_intervention_rate`.
pub manager_intervention_rate: Statistics,
/// Statistics over each run's `metrics.performance.raw_throughput_per_sec`.
pub raw_throughput_per_sec: Statistics,
/// Statistics over each run's `metrics.performance.effective_throughput_per_sec`.
pub effective_throughput_per_sec: Statistics,
/// Statistics over each run's `metrics.performance.llm_invocations`.
pub llm_invocations: Statistics,
/// Statistics over each run's `metrics.performance.llm_invoke_errors`.
pub llm_invoke_errors: Statistics,
/// Statistics over each run's `metrics.performance.llm_error_rate`.
pub llm_error_rate: Statistics,
/// Sum of `metrics.performance.llm_invocations` over all runs.
pub total_llm_invocations: u64,
/// Sum of `metrics.performance.llm_invoke_errors` over all runs.
pub total_llm_errors: u64,
}
/// Field-less type whose associated functions aggregate evaluation runs;
/// see `Aggregator::aggregate`.
pub struct Aggregator;
impl Aggregator {
pub fn aggregate(runs: &[EvalRun]) -> AggregatedResults {
let total_runs = runs.len();
if total_runs == 0 {
return AggregatedResults::default();
}
let successful_runs = runs.iter().filter(|r| r.success).count();
let success_rate = successful_runs as f64 / total_runs as f64;
let success_rates: Vec<f64> = runs.iter().map(|r| r.metrics.task.success_rate).collect();
let total_ticks: Vec<f64> = runs
.iter()
.map(|r| r.metrics.task.total_ticks as f64)
.collect();
let tick_latency_p95: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.tick_latency_p95_ms)
.collect();
let tick_latency_p99: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.tick_latency_p99_ms)
.collect();
let tick_miss_rates: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.tick_miss_rate)
.collect();
let tick_jitters: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.tick_jitter)
.collect();
let manager_rates: Vec<f64> = runs
.iter()
.map(|r| r.metrics.coordination.manager_intervention_rate)
.collect();
let raw_throughputs: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.raw_throughput_per_sec)
.collect();
let effective_throughputs: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.effective_throughput_per_sec)
.collect();
let llm_invocations: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.llm_invocations as f64)
.collect();
let llm_errors: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.llm_invoke_errors as f64)
.collect();
let llm_error_rates: Vec<f64> = runs
.iter()
.map(|r| r.metrics.performance.llm_error_rate)
.collect();
let total_llm_invocations: u64 = runs
.iter()
.map(|r| r.metrics.performance.llm_invocations)
.sum();
let total_llm_errors: u64 = runs
.iter()
.map(|r| r.metrics.performance.llm_invoke_errors)
.sum();
let pass_at_1 = success_rate;
let pass_at_5 = Self::calculate_pass_at_k(total_runs, successful_runs, 5);
let pass_at_10 = if total_runs >= 10 {
Some(Self::calculate_pass_at_k(total_runs, successful_runs, 10))
} else {
None
};
AggregatedResults {
total_runs,
successful_runs,
success_rate,
pass_at_1,
pass_at_5,
pass_at_10,
statistics: AggregatedStatistics {
success_rate: Statistics::from_values(&success_rates),
total_ticks: Statistics::from_values(&total_ticks),
tick_latency_p95_ms: Statistics::from_values(&tick_latency_p95),
tick_latency_p99_ms: Statistics::from_values(&tick_latency_p99),
tick_miss_rate: Statistics::from_values(&tick_miss_rates),
tick_jitter: Statistics::from_values(&tick_jitters),
manager_intervention_rate: Statistics::from_values(&manager_rates),
raw_throughput_per_sec: Statistics::from_values(&raw_throughputs),
effective_throughput_per_sec: Statistics::from_values(&effective_throughputs),
llm_invocations: Statistics::from_values(&llm_invocations),
llm_invoke_errors: Statistics::from_values(&llm_errors),
llm_error_rate: Statistics::from_values(&llm_error_rates),
total_llm_invocations,
total_llm_errors,
},
}
}
fn calculate_pass_at_k(n: usize, c: usize, k: usize) -> f64 {
if k > n {
return if c > 0 { 1.0 } else { 0.0 };
}
if c >= n {
return 1.0;
}
if c == 0 {
return 0.0;
}
if n - c < k {
return 1.0; }
let mut ratio = 1.0;
for i in 0..k {
ratio *= (n - c - i) as f64 / (n - i) as f64;
}
1.0 - ratio
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `actual` differs from `expected` by strictly less than
    /// `tolerance`.
    fn is_close(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    /// An empty sample produces the zeroed default.
    #[test]
    fn test_statistics_empty() {
        let summary = Statistics::from_values(&[]);
        assert_eq!(summary.n, 0);
        assert_eq!(summary.mean, 0.0);
    }

    /// A single observation has a mean but no spread.
    #[test]
    fn test_statistics_single() {
        let summary = Statistics::from_values(&[0.8]);
        assert_eq!(summary.n, 1);
        assert!(is_close(summary.mean, 0.8, 0.001));
        assert_eq!(summary.std_dev, 0.0);
    }

    /// Several observations give a positive spread and an interval that
    /// brackets the mean.
    #[test]
    fn test_statistics_multiple() {
        let sample = [0.7, 0.8, 0.9];
        let summary = Statistics::from_values(&sample);
        assert_eq!(summary.n, 3);
        assert!(is_close(summary.mean, 0.8, 0.001));
        assert!(summary.std_dev > 0.0);
        assert!(summary.ci_95_lower < summary.mean);
        assert!(summary.ci_95_upper > summary.mean);
    }

    /// pass@k grows with k and saturates at the all-pass / all-fail extremes.
    #[test]
    fn test_pass_at_k() {
        let at_one = Aggregator::calculate_pass_at_k(30, 24, 1);
        assert!(is_close(at_one, 0.8, 0.01));

        let at_five = Aggregator::calculate_pass_at_k(30, 24, 5);
        assert!(at_five > at_one);

        let all_passed = Aggregator::calculate_pass_at_k(30, 30, 5);
        assert!(is_close(all_passed, 1.0, 0.01));

        let none_passed = Aggregator::calculate_pass_at_k(30, 0, 5);
        assert!(is_close(none_passed, 0.0, 0.01));
    }

    /// Degrees of freedom present in the table return the tabulated value.
    #[test]
    fn test_t_value_exact_matches() {
        let tabulated = [
            (1, 12.706),
            (10, 2.228),
            (15, 2.131),
            (20, 2.086),
            (30, 2.042),
            (100, 1.984),
        ];
        for &(df, expected) in tabulated.iter() {
            assert!(is_close(t_value_95(df), expected, 0.001));
        }
    }

    /// Degrees of freedom between table rows are interpolated and decrease
    /// monotonically.
    #[test]
    fn test_t_value_interpolation() {
        let at_21 = t_value_95(21);
        assert!(2.060 < at_21 && at_21 < 2.086);
        assert!(is_close(at_21, 2.080, 0.01));

        let at_35 = t_value_95(35);
        assert!(2.021 < at_35 && at_35 < 2.042);

        let between_rows: Vec<f64> = (22..=24).map(t_value_95).collect();
        assert!(between_rows.windows(2).all(|pair| pair[0] > pair[1]));
    }

    /// Beyond the table the normal approximation is used.
    #[test]
    fn test_t_value_large_df() {
        for &df in [200, 1000].iter() {
            assert!(is_close(t_value_95(df), 1.96, 0.001));
        }
    }
}