impl ModelComparison {
#[must_use]
pub fn new(task_id: impl Into<String>) -> Self {
Self {
task_id: task_id.into(),
results: Vec::new(),
pareto_frontier: Vec::new(),
recommendations: Vec::new(),
}
}
pub fn add_result(&mut self, result: EvalResult) {
self.results.push(result);
}
#[must_use]
pub fn smallest_meeting_threshold(&self, min_success: f64) -> Option<&EvalResult> {
self.results
.iter()
.filter(|r| r.overall_success_rate >= min_success)
.min_by_key(|r| r.model_size_bytes)
}
#[must_use]
pub fn fastest_meeting_threshold(&self, min_success: f64) -> Option<&EvalResult> {
self.results
.iter()
.filter(|r| r.overall_success_rate >= min_success)
.min_by(|a, b| {
a.avg_turns_to_success
.partial_cmp(&b.avg_turns_to_success)
.unwrap_or(std::cmp::Ordering::Equal)
})
}
pub fn compute_pareto_frontier(&mut self) {
self.pareto_frontier = pareto::compute_pareto_frontier(&self.results);
}
pub fn generate_recommendations(&mut self) {
self.recommendations.clear();
if let Some(smallest) = self.results.iter().min_by_key(|r| r.model_size_bytes) {
self.recommendations.push(Recommendation {
scenario: "Minimum footprint".to_string(),
model_id: smallest.model_id.clone(),
rationale: format!(
"Smallest model at {} bytes, {:.1}% success",
smallest.model_size_bytes,
smallest.overall_success_rate * 100.0
),
});
}
if let Some(best) = self.results.iter().max_by(|a, b| {
a.overall_success_rate
.partial_cmp(&b.overall_success_rate)
.unwrap_or(std::cmp::Ordering::Equal)
}) {
self.recommendations.push(Recommendation {
scenario: "Maximum accuracy".to_string(),
model_id: best.model_id.clone(),
rationale: format!(
"Highest success rate at {:.1}%",
best.overall_success_rate * 100.0
),
});
}
if let Some(best) = self.pareto_frontier.iter().max_by(|a, b| {
let score_a = a.success_rate - (a.size_bytes as f64 / 1e9);
let score_b = b.success_rate - (b.size_bytes as f64 / 1e9);
score_a
.partial_cmp(&score_b)
.unwrap_or(std::cmp::Ordering::Equal)
}) {
self.recommendations.push(Recommendation {
scenario: "Best balance".to_string(),
model_id: best.model_id.clone(),
rationale: format!(
"Pareto optimal: {:.1}% success at {} bytes",
best.success_rate * 100.0,
best.size_bytes
),
});
}
}
#[must_use]
pub fn stratified_by_difficulty(&self) -> HashMap<String, HashMap<Difficulty, f64>> {
let mut result = HashMap::new();
for eval in &self.results {
let mut by_difficulty: HashMap<Difficulty, (usize, usize)> = HashMap::new();
for example in &eval.example_results {
let entry = by_difficulty.entry(example.difficulty).or_insert((0, 0));
entry.1 += 1; if example.solved_at_turn.is_some() {
entry.0 += 1; }
}
let rates: HashMap<Difficulty, f64> = by_difficulty
.into_iter()
.map(|(d, (solved, total))| {
let rate = if total > 0 {
solved as f64 / total as f64
} else {
0.0
};
(d, rate)
})
.collect();
result.insert(eval.model_id.clone(), rates);
}
result
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParetoPoint {
pub model_id: String,
pub size_bytes: u64,
pub success_rate: f64,
pub avg_turns: f64,
pub is_pareto_optimal: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
pub scenario: String,
pub model_id: String,
pub rationale: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalSuiteConfig {
pub id: String,
pub description: String,
pub max_turns: u32,
pub turn_timeout_secs: u64,
pub examples_path: PathBuf,
pub success_thresholds: Vec<f64>,
}
impl Default for EvalSuiteConfig {
fn default() -> Self {
Self {
id: "default".to_string(),
description: "Default evaluation suite".to_string(),
max_turns: 5,
turn_timeout_secs: 60,
examples_path: PathBuf::from("./examples"),
success_thresholds: vec![0.80, 0.90, 0.95],
}
}
}