use super::correctness::{CorrectnessMetrics, TaskResult, VerificationLevel};
use super::diff_quality::{DiffAnalyzer, DiffQualityMetrics};
use super::economics::{CostTracker, EconomicsMetrics, LatencyDistribution};
use crate::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{Duration, Instant};
/// Selects which optional pipeline stages are exercised during an evaluation run.
///
/// The harness runs the retrieval step and the adapter-load step only for the
/// modes that include them; generation always runs.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum AblationMode {
/// Neither the retrieval step nor the adapter-load step is run.
Baseline,
/// Only the retrieval step is run before generation.
RetrievalOnly,
/// Only the adapter-load step is run before generation.
AdaptersOnly,
/// Both the retrieval step and the adapter-load step are run.
RetrievalPlusAdapters,
/// Retrieval and adapter loading both run; labelled "Full (R+A+SONA)" by `name`.
Full,
}
impl AblationMode {
pub fn all() -> Vec<AblationMode> {
vec![
Self::Baseline,
Self::RetrievalOnly,
Self::AdaptersOnly,
Self::RetrievalPlusAdapters,
Self::Full,
]
}
pub fn name(&self) -> &'static str {
match self {
Self::Baseline => "Baseline",
Self::RetrievalOnly => "Retrieval Only",
Self::AdaptersOnly => "Adapters Only",
Self::RetrievalPlusAdapters => "Retrieval + Adapters",
Self::Full => "Full (R+A+SONA)",
}
}
}
/// Settings that control an evaluation run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalConfig {
/// Upper bound on how many tasks are taken from the input slice for each mode.
pub task_count: usize,
/// Seeds to evaluate; every selected task is run once per seed in each mode.
pub seeds: Vec<u64>,
/// Ablation modes to evaluate, processed in this order.
pub ablation_modes: Vec<AblationMode>,
/// Per-task timeout. NOTE(review): not read by the code in this file — confirm where it is enforced.
pub task_timeout: Duration,
/// Whether tasks may run in parallel. NOTE(review): not read by the code in this file; runs here are sequential.
pub parallel: bool,
/// Presumably the parallelism cap when `parallel` is set — not read by the code in this file; confirm.
pub max_parallel: usize,
/// Minimum combined diff-quality score a run needs (in addition to succeeding) to count as accepted.
pub quality_threshold: f64,
/// Cost target. NOTE(review): unit and usage are not visible in this file — confirm.
pub cost_target: f64,
/// Presumably toggles edit-similarity computation — not read by the code in this file; confirm.
pub compute_edit_similarity: bool,
/// Presumably toggles human verification — not read by the code in this file; confirm.
pub human_verification: bool,
}
impl Default for EvalConfig {
fn default() -> Self {
Self {
task_count: 100,
seeds: vec![42, 123, 456],
ablation_modes: AblationMode::all(),
task_timeout: Duration::from_secs(300),
parallel: true,
max_parallel: 4,
quality_threshold: 0.7,
cost_target: 1.0, compute_edit_similarity: true,
human_verification: false,
}
}
}
/// A single task to evaluate.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalTask {
/// Task identifier; copied into each run and its correctness result.
pub id: String,
/// Repository the task belongs to; copied into the correctness result.
pub repo: String,
/// Optional issue reference; copied into the correctness result as `issue_id`.
pub issue: Option<String>,
/// Free-form description of the task.
pub description: String,
/// Optional reference patch; passed to the diff analyzer alongside the generated patch.
pub reference_patch: Option<String>,
/// Test command for the task. NOTE(review): not executed by the code in this file — confirm where it is used.
pub test_command: String,
/// Files the task is expected to touch; its length is reported as `files_changed`
/// and a length above one marks the result as multi-file.
pub expected_files: Vec<String>,
/// Verification level copied into the correctness result.
pub verification_level: VerificationLevel,
/// Free-form tags. Not read by the code in this file.
pub tags: Vec<String>,
}
/// Outcome of running one task under one ablation mode with one seed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalRun {
/// Identifier of the task that was run.
pub task_id: String,
/// Ablation mode the task was run under.
pub mode: AblationMode,
/// Seed passed to generation for this run.
pub seed: u64,
/// The generated patch, or `None` when generation produced nothing.
pub generated_patch: Option<String>,
/// Correctness record for this run.
pub correctness: TaskResult,
/// Diff-quality analysis of the generated patch; `None` when no patch was generated.
pub diff_quality: Option<DiffQualityMetrics>,
/// Cost accumulated during this run.
pub cost: CostTracker,
/// Per-stage timing for this run.
pub latency: LatencyBreakdown,
/// True when the correctness result succeeded and the combined diff-quality
/// score reached the configured quality threshold.
pub accepted: bool,
/// Error message for the run; the harness in this file always stores `None`.
pub error: Option<String>,
}
/// Per-stage wall-clock timings for a single run, in milliseconds.
///
/// Stages that are not executed keep their default value of `0.0`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct LatencyBreakdown {
/// Time spent in the routing step.
pub routing_ms: f64,
/// Time spent in the retrieval step (only measured for modes that include retrieval).
pub retrieval_ms: f64,
/// Time spent loading adapters (only measured for modes that include adapters).
pub adapter_load_ms: f64,
/// Time spent generating the patch.
pub generation_ms: f64,
/// Time spent executing tests. NOTE(review): never set by the code in this file.
pub test_execution_ms: f64,
/// Time from the start of the run until generation finished.
pub total_ms: f64,
}
/// Runs evaluation tasks across ablation modes and aggregates the results into a report.
pub struct EvaluationHarness {
// Settings that drive task selection, seeds, modes and the acceptance threshold.
config: EvalConfig,
// Analyzer used to score each generated patch against the task's reference patch.
diff_analyzer: DiffAnalyzer,
// Runs collected so far, keyed by the ablation mode they were executed under.
results: HashMap<AblationMode, Vec<EvalRun>>,
}
impl EvaluationHarness {
pub fn new(config: EvalConfig) -> Self {
Self {
config,
diff_analyzer: DiffAnalyzer::default(),
results: HashMap::new(),
}
}
/// Runs `tasks` under every configured ablation mode and returns the aggregated report.
///
/// Results for each mode are stored on the harness, replacing any earlier
/// results recorded for that mode.
///
/// # Errors
///
/// Returns the first error produced while running a mode; later modes are not run.
pub async fn run_evaluation(&mut self, tasks: &[EvalTask]) -> Result<EvalReport> {
    let started_at = Instant::now();

    // Snapshot the mode list so `self` can be borrowed mutably inside the loop.
    let modes = self.config.ablation_modes.clone();
    for mode in modes {
        let runs = self.run_mode(mode, tasks).await?;
        self.results.insert(mode, runs);
    }

    Ok(self.generate_report(started_at.elapsed()))
}
/// Runs the first `task_count` tasks (or all of them, if fewer) once per
/// configured seed under `mode`, returning the runs in execution order.
///
/// # Errors
///
/// Stops at and returns the first error from an individual run.
async fn run_mode(&mut self, mode: AblationMode, tasks: &[EvalTask]) -> Result<Vec<EvalRun>> {
    let limit = self.config.task_count.min(tasks.len());
    let mut collected = Vec::new();
    for task in &tasks[..limit] {
        for seed in self.config.seeds.iter().copied() {
            collected.push(self.run_single_task(mode, task, seed).await?);
        }
    }
    Ok(collected)
}
async fn run_single_task(
&self,
mode: AblationMode,
task: &EvalTask,
seed: u64,
) -> Result<EvalRun> {
let start = Instant::now();
let mut latency = LatencyBreakdown::default();
let mut cost = CostTracker::with_claude_pricing();
let route_start = Instant::now();
let _routing_result = self.simulate_routing(mode, task);
latency.routing_ms = route_start.elapsed().as_secs_f64() * 1000.0;
if matches!(
mode,
AblationMode::RetrievalOnly | AblationMode::RetrievalPlusAdapters | AblationMode::Full
) {
let retrieval_start = Instant::now();
let _patterns = self.simulate_retrieval(task);
latency.retrieval_ms = retrieval_start.elapsed().as_secs_f64() * 1000.0;
}
if matches!(
mode,
AblationMode::AdaptersOnly | AblationMode::RetrievalPlusAdapters | AblationMode::Full
) {
let adapter_start = Instant::now();
self.simulate_adapter_load(task);
latency.adapter_load_ms = adapter_start.elapsed().as_secs_f64() * 1000.0;
}
let gen_start = Instant::now();
let (patch, gen_cost) = self.simulate_generation(mode, task, seed);
latency.generation_ms = gen_start.elapsed().as_secs_f64() * 1000.0;
cost.add(&gen_cost);
latency.total_ms = start.elapsed().as_secs_f64() * 1000.0;
let diff_quality = patch.as_ref().map(|p| {
self.diff_analyzer
.analyze(p, task.reference_patch.as_deref())
});
let correctness = TaskResult {
task_id: task.id.clone(),
repo: task.repo.clone(),
issue_id: task.issue.clone(),
patch_generated: patch.is_some(),
patch_applies: patch.is_some(), test_results: None, verification_level: task.verification_level,
human_verified: None,
files_changed: task.expected_files.len(),
lines_changed: patch.as_ref().map_or(0, |p| p.lines().count()),
is_multi_file: task.expected_files.len() > 1,
coupling_score: 0.3,
generation_time: Duration::from_millis(latency.generation_ms as u64),
retries: 0,
error: None,
};
let accepted = correctness.succeeded()
&& diff_quality.as_ref().map_or(false, |dq| {
dq.combined_score >= self.config.quality_threshold
});
Ok(EvalRun {
task_id: task.id.clone(),
mode,
seed,
generated_patch: patch,
correctness,
diff_quality,
cost,
latency,
accepted,
error: None,
})
}
/// Stand-in for model routing: ignores its arguments and always returns `"sonnet"`.
fn simulate_routing(&self, _mode: AblationMode, _task: &EvalTask) -> String {
    String::from("sonnet")
}
/// Stand-in for pattern retrieval: ignores the task and returns two fixed placeholder patterns.
fn simulate_retrieval(&self, _task: &EvalTask) -> Vec<String> {
    ["pattern1", "pattern2"]
        .iter()
        .map(|pattern| pattern.to_string())
        .collect()
}
/// Stand-in for adapter loading; the body is currently empty, so calling it has no effect.
fn simulate_adapter_load(&self, _task: &EvalTask) {
}
/// Stand-in for patch generation.
///
/// Maps `mode` to a fixed success rate and asks `rand_success` whether the
/// attempt succeeds. Returns a fixed two-line patch on success and `None`
/// otherwise, with a cost tracker charged 5000 input and 1000 output tokens.
/// The task and seed are currently unused.
fn simulate_generation(
    &self,
    mode: AblationMode,
    _task: &EvalTask,
    _seed: u64,
) -> (Option<String>, CostTracker) {
    let success_rate = match mode {
        AblationMode::Baseline => 0.3,
        AblationMode::RetrievalOnly => 0.45,
        AblationMode::AdaptersOnly => 0.50,
        AblationMode::RetrievalPlusAdapters => 0.65,
        AblationMode::Full => 0.75,
    };

    // Every attempt is charged the same token usage, whether or not it succeeds.
    let mut usage = CostTracker::with_claude_pricing();
    usage.input_tokens = 5000;
    usage.output_tokens = 1000;

    let succeeded = rand_success(success_rate);
    let patch = if succeeded {
        Some(String::from("+// Fixed\n-// Old code"))
    } else {
        None
    };

    (patch, usage)
}
/// Aggregates the recorded runs of every mode into an `EvalReport`.
///
/// For each mode this accumulates correctness results, cost, the count of
/// accepted runs, and routing and end-to-end latency samples (converted from
/// milliseconds to seconds), then averages the combined quality score over
/// runs that have a diff-quality result (`0.0` when there are none).
fn generate_report(&self, duration: Duration) -> EvalReport {
    let mode_metrics: HashMap<AblationMode, ModeMetrics> = self
        .results
        .iter()
        .map(|(&mode, runs)| {
            let mut correctness = CorrectnessMetrics::new();
            let mut economics = EconomicsMetrics::new();

            for run in runs {
                correctness.add_result(&run.correctness);
                economics.cost.add(&run.cost);
                if run.accepted {
                    economics.successful_tasks += 1;
                }

                let routing_secs = run.latency.routing_ms / 1000.0;
                let total_secs = run.latency.total_ms / 1000.0;
                economics.latency.routing.add_secs(routing_secs);
                economics.latency.end_to_end.add_secs(total_secs);
            }
            economics.recalculate();

            // Only runs that actually produced a diff contribute to the average.
            let scores: Vec<f64> = runs
                .iter()
                .filter_map(|run| run.diff_quality.as_ref())
                .map(|dq| dq.combined_score)
                .collect();
            let avg_quality_score = match scores.len() {
                0 => 0.0,
                n => scores.iter().sum::<f64>() / n as f64,
            };

            let metrics = ModeMetrics {
                mode,
                correctness,
                economics,
                avg_quality_score,
                total_runs: runs.len(),
            };
            (mode, metrics)
        })
        .collect();

    EvalReport {
        config: self.config.clone(),
        mode_metrics,
        total_duration: duration,
        timestamp: chrono::Utc::now(),
    }
}
}
/// Decides whether a simulated attempt succeeds.
///
/// Despite the name, no randomness is involved: the result is `true` exactly
/// when `rate` is strictly greater than 0.5 (and `false` for NaN).
fn rand_success(rate: f64) -> bool {
    const THRESHOLD: f64 = 0.5;
    THRESHOLD < rate
}
/// Aggregated metrics for all runs executed under a single ablation mode.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModeMetrics {
/// The ablation mode these metrics describe.
pub mode: AblationMode,
/// Correctness results accumulated over every run of the mode.
pub correctness: CorrectnessMetrics,
/// Cost, accepted-run count and latency samples accumulated over every run of the mode.
pub economics: EconomicsMetrics,
/// Mean combined diff-quality score over runs that produced a diff-quality result; `0.0` when none did.
pub avg_quality_score: f64,
/// Number of runs recorded for the mode.
pub total_runs: usize,
}
/// Result of a complete evaluation: the configuration used plus per-mode metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalReport {
/// Copy of the configuration the evaluation ran with.
pub config: EvalConfig,
/// Aggregated metrics keyed by ablation mode.
pub mode_metrics: HashMap<AblationMode, ModeMetrics>,
/// Wall-clock time spent running all modes.
pub total_duration: Duration,
/// UTC time at which the report was generated.
pub timestamp: chrono::DateTime<chrono::Utc>,
}
impl EvalReport {
pub fn to_leaderboard(&self) -> String {
let mut output = String::new();
output.push_str(
"╔════════════════════════════════════════════════════════════════════════════╗\n",
);
output.push_str(
"║ RuvLLM Evaluation Report ║\n",
);
output.push_str(
"╠════════════════════════════════════════════════════════════════════════════╣\n",
);
output.push_str(&format!(
"║ Tasks: {} × {} seeds × {} modes = {} runs ║\n",
self.config.task_count,
self.config.seeds.len(),
self.config.ablation_modes.len(),
self.config.task_count * self.config.seeds.len() * self.config.ablation_modes.len()
));
output.push_str(&format!(
"║ Duration: {:.1}s | Quality threshold: {:.0}% ║\n",
self.total_duration.as_secs_f64(),
self.config.quality_threshold * 100.0
));
output.push_str(
"╠════════════════════════════════════════════════════════════════════════════╣\n",
);
output.push_str(
"║ Mode │ Success% │ Verified% │ Quality │ $/patch │ p95 lat ║\n",
);
output.push_str(
"╠════════════════════════════════════════════════════════════════════════════╣\n",
);
let mut modes: Vec<_> = self.mode_metrics.values().collect();
modes.sort_by(|a, b| {
b.correctness
.task_success_rate()
.partial_cmp(&a.correctness.task_success_rate())
.unwrap()
});
for metrics in modes {
output.push_str(&format!(
"║ {:18} │ {:7.1}% │ {:8.1}% │ {:7.2} │ ${:6.4} │ {:7.1}ms ║\n",
metrics.mode.name(),
metrics.correctness.task_success_rate() * 100.0,
metrics.correctness.verified_success_rate() * 100.0,
metrics.avg_quality_score,
metrics.economics.cost_per_accepted_patch,
metrics.economics.latency.end_to_end.p95() * 1000.0,
));
}
output.push_str(
"╚════════════════════════════════════════════════════════════════════════════╝\n",
);
output
}
pub fn best_mode(&self) -> Option<AblationMode> {
self.mode_metrics
.values()
.max_by(|a, b| {
a.correctness
.task_success_rate()
.partial_cmp(&b.correctness.task_success_rate())
.unwrap()
})
.map(|m| m.mode)
}
/// Returns the relative change in task success rate of `mode` versus
/// `Baseline`, expressed as a percentage.
///
/// Returns `None` when either mode is missing from the report or when the
/// baseline success rate is exactly zero (the ratio would be undefined).
pub fn improvement_over_baseline(&self, mode: AblationMode) -> Option<f64> {
    let reference = self.mode_metrics.get(&AblationMode::Baseline)?;
    let candidate = self.mode_metrics.get(&mode)?;

    let reference_rate = reference.correctness.task_success_rate();
    if reference_rate != 0.0 {
        let delta = candidate.correctness.task_success_rate() - reference_rate;
        Some(delta / reference_rate * 100.0)
    } else {
        None
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `AblationMode::all` lists five modes, from `Baseline` to `Full`.
    #[test]
    fn test_ablation_modes() {
        let every_mode = AblationMode::all();
        assert_eq!(every_mode.len(), 5);
        assert_eq!(every_mode.first().copied(), Some(AblationMode::Baseline));
        assert_eq!(every_mode.last().copied(), Some(AblationMode::Full));
    }

    /// The default configuration uses 100 tasks, three seeds and all five modes.
    #[test]
    fn test_eval_config_default() {
        let EvalConfig {
            task_count,
            seeds,
            ablation_modes,
            ..
        } = EvalConfig::default();
        assert_eq!(task_count, 100);
        assert_eq!(seeds.len(), 3);
        assert_eq!(ablation_modes.len(), 5);
    }

    /// A freshly constructed harness has no recorded results.
    #[tokio::test]
    async fn test_harness_creation() {
        let small_config = EvalConfig {
            task_count: 2,
            seeds: vec![42],
            ablation_modes: vec![AblationMode::Baseline, AblationMode::Full],
            ..Default::default()
        };
        let fresh = EvaluationHarness::new(small_config);
        assert!(fresh.results.is_empty());
    }
}