use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::Duration;
/// How a task's outcome was confirmed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum VerificationLevel {
    /// Checked only by automated tooling (the stub evaluator uses this).
    Automated,
    /// Confirmed by a human reviewer.
    HumanVerified,
    /// Presumably confirmed by agreement among multiple independent
    /// verifiers — semantics not visible in this module; confirm with the
    /// producer of these values.
    ConsensusVerified,
}
/// Aggregated outcome of one test-suite run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TestSuiteResult {
    /// Total number of tests in the suite.
    pub total_tests: usize,
    /// Count of tests that passed.
    pub passed: usize,
    /// Count of tests that failed.
    pub failed: usize,
    /// Count of tests that were skipped.
    pub skipped: usize,
    /// Wall-clock duration of the run.
    pub duration: Duration,
    /// Failure details keyed by test name; the value looks like the failure
    /// message (see the test fixtures below) — confirm with the producer.
    pub failures: HashMap<String, String>,
    /// Names of regressed tests. Any entry here makes `all_passed` false,
    /// even when `failed == 0`.
    pub regressions: Vec<String>,
}
impl TestSuiteResult {
    /// Fraction of the suite that passed, in `[0.0, 1.0]`.
    /// An empty suite yields 0.0 rather than dividing by zero.
    pub fn pass_rate(&self) -> f64 {
        match self.total_tests {
            0 => 0.0,
            total => self.passed as f64 / total as f64,
        }
    }

    /// True when nothing failed and no regressions were recorded.
    /// Skipped tests do not count against this.
    pub fn all_passed(&self) -> bool {
        let no_failures = self.failed == 0;
        let no_regressions = self.regressions.is_empty();
        no_failures && no_regressions
    }

    /// Stricter than [`Self::all_passed`]: additionally requires zero skips.
    pub fn is_clean(&self) -> bool {
        self.all_passed() && self.skipped == 0
    }
}
/// Outcome of evaluating a single patch-generation task.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TaskResult {
    /// Identifier of the task.
    pub task_id: String,
    /// Repository the task targets; used as the per-repo metrics key.
    pub repo: String,
    /// Issue/ticket the task addresses, if any.
    pub issue_id: Option<String>,
    /// Whether a patch was produced at all (stub evaluator sets this to
    /// "patch string is non-empty").
    pub patch_generated: bool,
    /// Whether the patch applied cleanly.
    pub patch_applies: bool,
    /// Test-suite outcome after applying the patch; `None` when tests did
    /// not run — `succeeded` treats that as failure.
    pub test_results: Option<TestSuiteResult>,
    /// How this result was verified.
    pub verification_level: VerificationLevel,
    /// Human sign-off: only `Some(true)` counts toward `verified_success`;
    /// `Some(false)` and `None` both count as unverified.
    pub human_verified: Option<bool>,
    /// Number of files the patch touches.
    pub files_changed: usize,
    /// Number of lines the patch touches.
    pub lines_changed: usize,
    /// Whether the patch spans more than one file.
    pub is_multi_file: bool,
    /// Coupling score; > 0.5 together with `is_multi_file` classifies the
    /// task as long-horizon. Scale and derivation are not visible here —
    /// confirm with the producer.
    pub coupling_score: f64,
    /// Wall-clock time spent generating the patch.
    pub generation_time: Duration,
    /// Number of generation retries performed.
    pub retries: usize,
    /// Error message when something went wrong, if any.
    pub error: Option<String>,
}
impl TaskResult {
    /// A task succeeds when a patch was produced, applied cleanly, and the
    /// recorded test suite passed. Missing test results count as failure.
    pub fn succeeded(&self) -> bool {
        if !self.patch_generated || !self.patch_applies {
            return false;
        }
        match &self.test_results {
            Some(suite) => suite.all_passed(),
            None => false,
        }
    }

    /// [`Self::succeeded`] plus an explicit human sign-off
    /// (`human_verified == Some(true)`).
    pub fn verified_success(&self) -> bool {
        matches!(self.human_verified, Some(true)) && self.succeeded()
    }

    /// Heuristic for long-horizon work: multi-file with coupling above 0.5.
    pub fn is_long_horizon(&self) -> bool {
        self.coupling_score > 0.5 && self.is_multi_file
    }
}
/// Aggregate correctness counters, accumulated one `TaskResult` at a time
/// via `add_result`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CorrectnessMetrics {
    /// Total task results recorded.
    pub total_tasks: usize,
    /// Tasks for which a patch was generated.
    pub patches_generated: usize,
    /// Tasks whose patch applied cleanly.
    pub patches_applied: usize,
    /// Tasks that fully succeeded. NOTE(review): despite the name, this
    /// counts `TaskResult::succeeded` (generated + applied + tests passed),
    /// not raw test passes.
    pub tests_passed: usize,
    /// Successes that also carry a human sign-off.
    pub verified_successes: usize,
    /// Long-horizon tasks that succeeded.
    pub long_horizon_successes: usize,
    /// Total long-horizon tasks seen (multi-file, coupling > 0.5).
    pub long_horizon_total: usize,
    /// Per-repository breakdown, keyed by `TaskResult::repo`.
    pub per_repo: HashMap<String, RepoMetrics>,
    /// Failure counts bucketed by the first failing stage:
    /// "patch_generation_failed", "patch_apply_failed", or "tests_failed".
    pub failure_reasons: HashMap<String, usize>,
}
/// Per-repository success counters.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct RepoMetrics {
    /// Tasks recorded for this repo.
    pub total: usize,
    /// Tasks that fully succeeded.
    pub succeeded: usize,
    /// Succeeded tasks that were also human-verified.
    pub verified: usize,
}
impl CorrectnessMetrics {
    /// Creates an empty accumulator: all counters zero, maps empty.
    pub fn new() -> Self {
        Self {
            total_tasks: 0,
            patches_generated: 0,
            patches_applied: 0,
            tests_passed: 0,
            verified_successes: 0,
            long_horizon_successes: 0,
            long_horizon_total: 0,
            per_repo: HashMap::new(),
            failure_reasons: HashMap::new(),
        }
    }

    /// Folds one task result into the aggregate and per-repo counters.
    ///
    /// Failures are bucketed by the first pipeline stage that broke:
    /// generation, then application, then tests.
    pub fn add_result(&mut self, result: &TaskResult) {
        // Evaluate the derived predicates once. The original recomputed
        // `succeeded()` four times and `verified_success()` twice; both are
        // pure, so hoisting them is behavior-preserving and cheaper.
        let succeeded = result.succeeded();
        let verified = result.verified_success();

        self.total_tasks += 1;
        if result.patch_generated {
            self.patches_generated += 1;
        }
        if result.patch_applies {
            self.patches_applied += 1;
        }
        if succeeded {
            self.tests_passed += 1;
        }
        if verified {
            self.verified_successes += 1;
        }
        if result.is_long_horizon() {
            self.long_horizon_total += 1;
            if succeeded {
                self.long_horizon_successes += 1;
            }
        }

        let repo_metrics = self.per_repo.entry(result.repo.clone()).or_default();
        repo_metrics.total += 1;
        if succeeded {
            repo_metrics.succeeded += 1;
        }
        if verified {
            repo_metrics.verified += 1;
        }

        if !succeeded {
            // Attribute the failure to the earliest stage that failed.
            let reason = if !result.patch_generated {
                "patch_generation_failed"
            } else if !result.patch_applies {
                "patch_apply_failed"
            } else {
                "tests_failed"
            };
            *self.failure_reasons.entry(reason.to_string()).or_insert(0) += 1;
        }
    }

    /// Shared zero-safe ratio: 0.0 when the denominator is zero.
    fn ratio(numerator: usize, denominator: usize) -> f64 {
        if denominator == 0 {
            0.0
        } else {
            numerator as f64 / denominator as f64
        }
    }

    /// Fraction of all tasks that fully succeeded.
    pub fn task_success_rate(&self) -> f64 {
        Self::ratio(self.tests_passed, self.total_tasks)
    }

    /// Fraction of all tasks that succeeded with human verification.
    pub fn verified_success_rate(&self) -> f64 {
        Self::ratio(self.verified_successes, self.total_tasks)
    }

    /// Fraction of long-horizon tasks that succeeded.
    pub fn long_horizon_success_rate(&self) -> f64 {
        Self::ratio(self.long_horizon_successes, self.long_horizon_total)
    }

    /// Fraction of all tasks for which a patch was generated.
    pub fn generation_rate(&self) -> f64 {
        Self::ratio(self.patches_generated, self.total_tasks)
    }

    /// Fraction of generated patches that applied cleanly
    /// (denominator is generated patches, not total tasks).
    pub fn application_rate(&self) -> f64 {
        Self::ratio(self.patches_applied, self.patches_generated)
    }
}
impl Default for CorrectnessMetrics {
    /// Same as [`CorrectnessMetrics::new`]: empty accumulator.
    fn default() -> Self {
        Self::new()
    }
}
/// Configuration for running correctness evaluations of generated patches.
/// NOTE(review): `evaluate_task` below is a stub, so `test_timeout`,
/// `isolated`, and `clone_depth` are not yet consumed anywhere visible here.
pub struct CorrectnessEvaluator {
    /// Maximum wall-clock time allowed for a test run (default 300s).
    pub test_timeout: Duration,
    /// Whether to evaluate in an isolated environment; the isolation
    /// mechanism is not visible in this module — confirm in the real
    /// evaluator implementation.
    pub isolated: bool,
    /// Clone depth for checking out the repo; `None` presumably means a
    /// full clone — confirm against the cloning code.
    pub clone_depth: Option<usize>,
}
impl Default for CorrectnessEvaluator {
    /// Defaults: five-minute test timeout, isolated execution, and
    /// shallow (depth-1) clones.
    fn default() -> Self {
        let five_minutes = Duration::from_secs(300);
        Self {
            test_timeout: five_minutes,
            isolated: true,
            clone_depth: Some(1),
        }
    }
}
impl CorrectnessEvaluator {
    /// Stub evaluator: records whether a non-empty patch was supplied and
    /// returns a result flagged as not implemented. The test command and
    /// the evaluator's own configuration are currently ignored; the patch
    /// is never applied and no tests run.
    pub async fn evaluate_task(
        &self,
        task_id: &str,
        repo: &str,
        patch: &str,
        _test_command: &str,
    ) -> TaskResult {
        let has_patch = !patch.is_empty();
        TaskResult {
            task_id: task_id.to_owned(),
            repo: repo.to_owned(),
            issue_id: None,
            patch_generated: has_patch,
            // Nothing is actually applied or executed by this stub.
            patch_applies: false,
            test_results: None,
            verification_level: VerificationLevel::Automated,
            human_verified: None,
            files_changed: 0,
            lines_changed: 0,
            is_multi_file: false,
            coupling_score: 0.0,
            generation_time: Duration::from_secs(0),
            retries: 0,
            error: Some(String::from("Not implemented - stub evaluator")),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Template: a fully-passing, single-file task in "test/repo" whose
    /// `total_tests` tests are all green. The original tests repeated this
    /// ~20-line literal three times; tests now override only the fields
    /// they care about via struct-update syntax.
    fn passing_result(task_id: &str, total_tests: usize) -> TaskResult {
        TaskResult {
            task_id: task_id.into(),
            repo: "test/repo".into(),
            issue_id: None,
            patch_generated: true,
            patch_applies: true,
            test_results: Some(TestSuiteResult {
                total_tests,
                passed: total_tests,
                failed: 0,
                skipped: 0,
                duration: Duration::from_secs(1),
                failures: HashMap::new(),
                regressions: vec![],
            }),
            verification_level: VerificationLevel::Automated,
            human_verified: None,
            files_changed: 1,
            lines_changed: 10,
            is_multi_file: false,
            coupling_score: 0.2,
            generation_time: Duration::from_secs(5),
            retries: 0,
            error: None,
        }
    }

    #[test]
    fn test_task_success_rate_empty() {
        // With no tasks recorded the rate must be 0.0, not NaN.
        let metrics = CorrectnessMetrics::new();
        assert_eq!(metrics.task_success_rate(), 0.0);
    }

    #[test]
    fn test_task_success_rate_calculation() {
        let mut metrics = CorrectnessMetrics::new();

        // One human-verified success...
        let success = TaskResult {
            human_verified: Some(true),
            ..passing_result("1", 10)
        };
        metrics.add_result(&success);

        // ...and one multi-file task whose tests partially failed.
        let failure = TaskResult {
            test_results: Some(TestSuiteResult {
                total_tests: 10,
                passed: 8,
                failed: 2,
                skipped: 0,
                duration: Duration::from_secs(1),
                failures: HashMap::from([("test1".into(), "assertion failed".into())]),
                regressions: vec![],
            }),
            files_changed: 2,
            lines_changed: 50,
            is_multi_file: true,
            coupling_score: 0.7,
            generation_time: Duration::from_secs(10),
            retries: 2,
            ..passing_result("2", 10)
        };
        metrics.add_result(&failure);

        assert_eq!(metrics.total_tasks, 2);
        assert_eq!(metrics.tests_passed, 1);
        assert_eq!(metrics.task_success_rate(), 0.5);
        assert_eq!(metrics.verified_success_rate(), 0.5);
    }

    #[test]
    fn test_long_horizon_tracking() {
        let mut metrics = CorrectnessMetrics::new();

        // Multi-file, high-coupling task with all 20 tests passing:
        // must count toward both long-horizon totals and successes.
        let lh_success = TaskResult {
            test_results: Some(TestSuiteResult {
                total_tests: 20,
                passed: 20,
                failed: 0,
                skipped: 0,
                duration: Duration::from_secs(5),
                failures: HashMap::new(),
                regressions: vec![],
            }),
            files_changed: 5,
            lines_changed: 200,
            is_multi_file: true,
            coupling_score: 0.8,
            generation_time: Duration::from_secs(30),
            retries: 1,
            ..passing_result("lh1", 20)
        };
        metrics.add_result(&lh_success);

        assert_eq!(metrics.long_horizon_total, 1);
        assert_eq!(metrics.long_horizon_successes, 1);
        assert_eq!(metrics.long_horizon_success_rate(), 1.0);
    }
}