use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio::process::Command;
use tracing::{debug, info, warn};
use crate::agent::{Task, TaskResult};
use crate::identity::AgentRole;
/// Scoring configuration used when judging an agent's output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityRubric {
/// Weight per quality dimension; used as the fallback table when a role has
/// no entry in `role_weights`.
pub dimensions: HashMap<String, f64>,
/// Per-dimension threshold values.
// NOTE(review): nothing in this file reads `thresholds`; presumably these are
// minimum acceptable scores per dimension - confirm against callers.
pub thresholds: HashMap<String, f64>,
/// Role name -> (dimension -> weight) overrides, looked up by role name.
pub role_weights: HashMap<String, HashMap<String, f64>>,
}
impl Default for QualityRubric {
    /// Builds the stock rubric: six weighted dimensions, one threshold per
    /// dimension, and the built-in per-role weight tables.
    fn default() -> Self {
        // Turns a static `(name, value)` table into an owned map.
        fn table(entries: &[(&str, f64)]) -> HashMap<String, f64> {
            entries
                .iter()
                .map(|&(name, value)| (name.to_string(), value))
                .collect()
        }

        let dimensions = table(&[
            ("correctness", 0.3),
            ("maintainability", 0.2),
            ("test_quality", 0.2),
            ("security", 0.15),
            ("performance", 0.1),
            ("documentation", 0.05),
        ]);
        let thresholds = table(&[
            ("correctness", 0.9),
            ("maintainability", 0.8),
            ("test_quality", 0.85),
            ("security", 0.9),
            ("performance", 0.7),
            ("documentation", 0.7),
        ]);

        Self {
            dimensions,
            thresholds,
            role_weights: Self::default_role_weights(),
        }
    }
}
impl QualityRubric {
    /// Returns the built-in weight tables for the `Frontend`, `Backend` and
    /// `DevOps` roles.
    fn default_role_weights() -> HashMap<String, HashMap<String, f64>> {
        const FRONTEND: &[(&str, f64)] = &[
            ("correctness", 0.25),
            ("maintainability", 0.2),
            ("test_quality", 0.15),
            ("security", 0.1),
            ("performance", 0.15),
            ("documentation", 0.05),
            ("accessibility", 0.1),
        ];
        const BACKEND: &[(&str, f64)] = &[
            ("correctness", 0.3),
            ("maintainability", 0.15),
            ("test_quality", 0.25),
            ("security", 0.2),
            ("performance", 0.1),
            ("documentation", 0.0),
        ];
        const DEVOPS: &[(&str, f64)] = &[
            ("correctness", 0.25),
            ("maintainability", 0.15),
            ("test_quality", 0.1),
            ("security", 0.3),
            ("performance", 0.15),
            ("documentation", 0.05),
        ];

        [("Frontend", FRONTEND), ("Backend", BACKEND), ("DevOps", DEVOPS)]
            .iter()
            .map(|&(role, weights)| {
                let per_dimension = weights
                    .iter()
                    .map(|&(dimension, weight)| (dimension.to_string(), weight))
                    .collect::<HashMap<String, f64>>();
                (role.to_string(), per_dimension)
            })
            .collect()
    }

    /// Returns the weight table for `role`, or the generic `dimensions` table
    /// when the role has no dedicated entry.
    pub fn get_role_weights(&self, role: &str) -> &HashMap<String, f64> {
        match self.role_weights.get(role) {
            Some(weights) => weights,
            None => &self.dimensions,
        }
    }
}
/// Outcome of judging one task result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityEvaluation {
/// Aggregate score for the evaluated content.
pub overall_score: f64,
/// Score per quality dimension, keyed by dimension name.
pub dimension_scores: HashMap<String, f64>,
/// Individual problems found during the evaluation.
pub issues: Vec<QualityIssue>,
/// Free-text summary of the evaluation.
pub feedback: String,
/// Whether the evaluated content is considered acceptable.
pub passes_standards: bool,
/// How much the evaluator trusts its own verdict.
pub confidence: f64,
/// Timestamp taken when the evaluation value was constructed.
pub evaluated_at: DateTime<Utc>,
/// Bookkeeping about how the evaluation was produced.
pub metadata: EvaluationMetadata,
}
/// A single problem reported by the quality judge.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
/// How urgent the issue is.
pub severity: IssueSeverity,
/// Which area of quality the issue belongs to.
pub category: IssueCategory,
/// Human-readable description of the problem.
pub description: String,
/// Suggested way to resolve the problem.
pub suggested_fix: String,
/// Names of the affected areas (may be empty).
pub affected_areas: Vec<String>,
/// Estimated effort to fix; rendered as minutes in fix instructions.
pub fix_effort: u32,
}
/// Severity level of a [`QualityIssue`].
///
/// The hand-written `Ord` implementation for this type ranks
/// `Critical > High > Medium > Low`, i.e. the reverse of declaration order.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum IssueSeverity {
Critical,
High,
Medium,
Low,
}
impl PartialOrd for IssueSeverity {
/// Delegates to the total order defined by `Ord::cmp`, so the result is
/// always `Some`.
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for IssueSeverity {
    /// Orders severities by urgency: `Low < Medium < High < Critical`.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // Map each variant to a numeric rank; a larger rank is more severe.
        fn rank(severity: &IssueSeverity) -> u8 {
            match severity {
                IssueSeverity::Low => 0,
                IssueSeverity::Medium => 1,
                IssueSeverity::High => 2,
                IssueSeverity::Critical => 3,
            }
        }

        rank(self).cmp(&rank(other))
    }
}
/// Area of quality a [`QualityIssue`] belongs to.
///
/// The variant names double as the category labels offered to the judge in
/// the evaluation prompt and matched again when issues are parsed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum IssueCategory {
Security,
Performance,
TestCoverage,
CodeComplexity,
Documentation,
ErrorHandling,
Architecture,
BestPractices,
Accessibility,
TypeSafety,
}
/// Bookkeeping attached to every [`QualityEvaluation`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvaluationMetadata {
/// Number of files the evaluation covered.
pub files_analyzed: usize,
/// Line count recorded for the evaluation.
pub lines_of_code: usize,
/// Evaluation duration in milliseconds.
pub duration_ms: u64,
/// Names of the evaluators that produced the result (e.g. "Claude", "Heuristics").
pub tools_used: Vec<String>,
/// Name of the agent role whose output was judged.
pub agent_role: String,
}
/// Judges task results either by shelling out to the `claude` CLI or with
/// built-in heuristics, caching evaluations in memory.
pub struct LLMQualityJudge {
// Weights used for scoring and for building the evaluation prompt.
rubric: QualityRubric,
// Evaluations already computed, keyed by a string built from the task id and
// the task output.
evaluation_cache: HashMap<String, QualityEvaluation>,
/// When `true`, evaluations go through the `claude` CLI; otherwise the
/// heuristic evaluator is used directly.
pub use_claude: bool,
}
impl Default for LLMQualityJudge {
    /// Creates a judge with the default rubric, an empty cache and Claude
    /// evaluation enabled.
    fn default() -> Self {
        Self::with_rubric(QualityRubric::default())
    }
}
impl LLMQualityJudge {
pub fn new() -> Self {
Self::default()
}
/// Creates a judge that scores with the supplied `rubric`, starting with an
/// empty cache and Claude evaluation enabled.
pub fn with_rubric(rubric: QualityRubric) -> Self {
    let evaluation_cache = HashMap::new();
    let use_claude = true;
    Self {
        rubric,
        evaluation_cache,
        use_claude,
    }
}
/// Evaluates the quality of `result` for `task` and returns the verdict.
///
/// Results are cached in memory under a key built from the task id and the
/// task output; a cache hit returns a clone of the stored evaluation without
/// re-running any evaluator. On a miss, the content is judged via the `claude`
/// CLI when `use_claude` is set, otherwise with the heuristic evaluator.
///
/// The returned (and cached) evaluation carries the measured wall-clock
/// duration and the line count of the evaluated content in its metadata.
///
/// # Errors
///
/// Propagates errors from content extraction and from the selected evaluator.
pub async fn evaluate_task(
    &mut self,
    task: &Task,
    result: &TaskResult,
    agent_role: &AgentRole,
    workspace_path: &str,
) -> Result<QualityEvaluation> {
    let start_time = std::time::Instant::now();
    let cache_key = format!("{}-{}", task.id, result.output);
    if let Some(cached) = self.evaluation_cache.get(&cache_key) {
        debug!("Using cached evaluation for task {}", task.id);
        return Ok(cached.clone());
    }
    info!(
        "Evaluating quality for task {} by {} agent",
        task.id,
        agent_role.name()
    );
    let content = self.extract_content_from_result(result)?;
    let mut evaluation = if self.use_claude {
        self.evaluate_with_claude(&content, task, agent_role, workspace_path)
            .await?
    } else {
        self.evaluate_with_heuristics(&content, task, agent_role)?
    };

    // The evaluators cannot know the end-to-end duration and leave it at 0,
    // so record the measured value here before the result is cached.
    let duration_ms = start_time.elapsed().as_millis() as u64;
    evaluation.metadata.duration_ms = duration_ms;
    // Always report the size of the evaluated content; the Claude path would
    // otherwise report the line count of the model's JSON response.
    evaluation.metadata.lines_of_code = content.lines().count();

    self.evaluation_cache.insert(cache_key, evaluation.clone());
    info!(
        "Quality evaluation completed in {}ms: score={:.2}, passed={}",
        duration_ms, evaluation.overall_score, evaluation.passes_standards
    );
    Ok(evaluation)
}
/// Pulls the text to evaluate out of a task result.
///
/// Uses the `"response"` entry of the output when present (an empty string if
/// that entry is not a string); otherwise the whole output rendered as text.
fn extract_content_from_result(&self, result: &TaskResult) -> Result<String> {
    let content = match result.output.get("response") {
        Some(response) => response.as_str().unwrap_or("").to_string(),
        None => result.output.to_string(),
    };
    Ok(content)
}
/// Runs the `claude` CLI in `workspace_path` with the evaluation prompt and
/// parses its standard output into a [`QualityEvaluation`].
///
/// If the process exits unsuccessfully, the exit status and stderr are logged
/// and the heuristic evaluator is used instead.
///
/// # Errors
///
/// Returns an error when the process cannot be started or when its output
/// cannot be parsed as an evaluation.
async fn evaluate_with_claude(
    &self,
    content: &str,
    task: &Task,
    agent_role: &AgentRole,
    workspace_path: &str,
) -> Result<QualityEvaluation> {
    let prompt = self.generate_evaluation_prompt(content, task, agent_role);

    // NOTE(review): confirm that the installed CLI accepts `--json`; if the
    // flag is rejected, every call exits non-zero and takes the fallback below.
    // NOTE(review): the whole prompt travels as one argv entry, so very large
    // content may hit the OS argument-length limit - TODO confirm / use stdin.
    let output = Command::new("claude")
        .current_dir(workspace_path)
        .arg("-p")
        .arg(&prompt)
        .arg("--json")
        .output()
        .await
        .context("Failed to execute Claude for quality evaluation")?;

    if !output.status.success() {
        // Surface why the CLI failed; without this the fallback is undiagnosable.
        let stderr = String::from_utf8_lossy(&output.stderr);
        warn!(
            "Claude evaluation failed, falling back to heuristics (status: {}): {}",
            output.status,
            stderr.trim()
        );
        return self.evaluate_with_heuristics(content, task, agent_role);
    }

    let response = String::from_utf8_lossy(&output.stdout);
    self.parse_claude_evaluation(&response, agent_role)
}
/// Builds the prompt sent to the judge model.
///
/// The prompt names the agent role, embeds the task description and the
/// content to evaluate, lists the rubric dimensions that apply to the role,
/// and specifies the JSON structure the model must answer with.
///
/// Dimensions are listed in sorted order so the same inputs always yield the
/// same prompt (map iteration order is not stable between runs).
fn generate_evaluation_prompt(
    &self,
    content: &str,
    task: &Task,
    agent_role: &AgentRole,
) -> String {
    let role_weights = self.rubric.get_role_weights(agent_role.name());
    let mut dimensions: Vec<&str> = role_weights.keys().map(String::as_str).collect();
    dimensions.sort_unstable();
    format!(
        r#"You are a code quality judge evaluating the output of a {} agent.
## Task Description
{}
## Code/Content to Evaluate
```
{}
```
## Evaluation Rubric
Please evaluate the code on these dimensions (0.0 to 1.0 scale):
{}
## Required Output Format
Provide your evaluation as a JSON object with this structure:
{{
"overall_score": 0.0-1.0,
"dimensions": {{
"correctness": 0.0-1.0,
"maintainability": 0.0-1.0,
"test_quality": 0.0-1.0,
"security": 0.0-1.0,
"performance": 0.0-1.0,
"documentation": 0.0-1.0,
"architecture": 0.0-1.0,
"error_handling": 0.0-1.0
}},
"issues": [
{{
"severity": "critical|high|medium|low",
"category": "Security|Performance|TestCoverage|CodeComplexity|Documentation|ErrorHandling|Architecture|BestPractices|Accessibility|TypeSafety",
"description": "Clear description of the issue",
"suggested_fix": "Specific suggestion for fixing the issue",
"affected_areas": ["file.js", "component.tsx"],
"fix_effort": 30
}}
],
"feedback": "Overall assessment and recommendations",
"passes_standards": true|false,
"confidence": 0.0-1.0
}}
Focus on {} agent-specific concerns. Be thorough but constructive."#,
        agent_role.name(),
        task.description,
        content,
        dimensions.join("\n"),
        agent_role.name()
    )
}
fn parse_claude_evaluation(
&self,
response: &str,
agent_role: &AgentRole,
) -> Result<QualityEvaluation> {
let json_result: serde_json::Value = serde_json::from_str(response)
.context("Failed to parse Claude's evaluation response")?;
let overall_score = json_result["overall_score"].as_f64().unwrap_or(0.5);
let mut dimension_scores = HashMap::new();
if let Some(dimensions) = json_result["dimensions"].as_object() {
for (key, value) in dimensions {
dimension_scores.insert(key.clone(), value.as_f64().unwrap_or(0.5));
}
}
let mut issues = Vec::new();
if let Some(issues_array) = json_result["issues"].as_array() {
for issue_json in issues_array {
if let Ok(issue) = self.parse_quality_issue(issue_json) {
issues.push(issue);
}
}
}
let feedback = json_result["feedback"]
.as_str()
.unwrap_or("No detailed feedback provided")
.to_string();
let passes_standards = json_result["passes_standards"]
.as_bool()
.unwrap_or(overall_score >= 0.85);
let confidence = json_result["confidence"].as_f64().unwrap_or(0.8);
Ok(QualityEvaluation {
overall_score,
dimension_scores,
issues,
feedback,
passes_standards,
confidence,
evaluated_at: Utc::now(),
metadata: EvaluationMetadata {
files_analyzed: 1,
lines_of_code: response.lines().count(),
duration_ms: 0, tools_used: vec!["Claude".to_string()],
agent_role: agent_role.name().to_string(),
},
})
}
/// Converts one JSON issue object into a [`QualityIssue`].
///
/// Severity and category labels are matched case-insensitively and with
/// surrounding whitespace ignored, so `"Critical"` or `"security"` are
/// recognised. Unknown or missing labels fall back to `Medium` severity and
/// the `BestPractices` category. Missing text fields become empty strings,
/// `affected_areas` keeps only string entries, and `fix_effort` defaults to
/// 30 and saturates at `u32::MAX` instead of wrapping.
///
/// This function currently always returns `Ok`.
pub fn parse_quality_issue(&self, json: &serde_json::Value) -> Result<QualityIssue> {
    let severity_label = json["severity"]
        .as_str()
        .unwrap_or("medium")
        .trim()
        .to_ascii_lowercase();
    let severity = match severity_label.as_str() {
        "critical" => IssueSeverity::Critical,
        "high" => IssueSeverity::High,
        "low" => IssueSeverity::Low,
        _ => IssueSeverity::Medium,
    };

    let category_label = json["category"]
        .as_str()
        .unwrap_or("BestPractices")
        .trim()
        .to_ascii_lowercase();
    let category = match category_label.as_str() {
        "security" => IssueCategory::Security,
        "performance" => IssueCategory::Performance,
        "testcoverage" => IssueCategory::TestCoverage,
        "codecomplexity" => IssueCategory::CodeComplexity,
        "documentation" => IssueCategory::Documentation,
        "errorhandling" => IssueCategory::ErrorHandling,
        "architecture" => IssueCategory::Architecture,
        "accessibility" => IssueCategory::Accessibility,
        "typesafety" => IssueCategory::TypeSafety,
        _ => IssueCategory::BestPractices,
    };

    let affected_areas = json["affected_areas"]
        .as_array()
        .map(|areas| {
            areas
                .iter()
                .filter_map(|area| area.as_str().map(String::from))
                .collect()
        })
        .unwrap_or_default();

    // Saturate rather than truncate: a plain `as u32` would wrap huge values
    // into small, plausible-looking efforts.
    let fix_effort = json["fix_effort"]
        .as_u64()
        .unwrap_or(30)
        .min(u64::from(u32::MAX)) as u32;

    Ok(QualityIssue {
        severity,
        category,
        description: json["description"].as_str().unwrap_or("").to_string(),
        suggested_fix: json["suggested_fix"].as_str().unwrap_or("").to_string(),
        affected_areas,
        fix_effort,
    })
}
/// Scores `content` with cheap textual heuristics; used when Claude is
/// disabled or its invocation fails.
///
/// Dimension scores come from fixed values plus three substring checks
/// (tests, error handling, comments) and the line count. The overall score is
/// the role-weighted sum of the dimension scores divided by the total weight
/// of the role's table, bounded to `[0.0, 1.0]`. When the role's weights do
/// not sum to a positive value, the plain mean of the dimension scores is
/// used instead, so the score is always finite.
///
/// The result passes only if the overall score is at least 0.85 and no
/// critical or high severity issue was raised. Confidence is fixed at 0.6.
fn evaluate_with_heuristics(
    &self,
    content: &str,
    _task: &Task,
    agent_role: &AgentRole,
) -> Result<QualityEvaluation> {
    let lines = content.lines().count();
    let has_tests = content.contains("test") || content.contains("spec");
    let has_error_handling = content.contains("try")
        || content.contains("catch")
        || content.contains("Result")
        || content.contains("Option");
    let has_comments =
        content.contains("//") || content.contains("/*") || content.contains("#");

    let mut dimension_scores: HashMap<String, f64> = HashMap::new();
    dimension_scores.insert("correctness".to_string(), 0.8);
    dimension_scores.insert(
        "maintainability".to_string(),
        if lines < 200 { 0.9 } else { 0.7 },
    );
    dimension_scores.insert(
        "test_quality".to_string(),
        if has_tests { 0.8 } else { 0.3 },
    );
    dimension_scores.insert("security".to_string(), 0.7);
    dimension_scores.insert("performance".to_string(), 0.7);
    dimension_scores.insert(
        "documentation".to_string(),
        if has_comments { 0.7 } else { 0.4 },
    );
    dimension_scores.insert(
        "error_handling".to_string(),
        if has_error_handling { 0.8 } else { 0.5 },
    );
    dimension_scores.insert("architecture".to_string(), 0.7);

    let mut issues = Vec::new();
    if !has_tests {
        issues.push(QualityIssue {
            severity: IssueSeverity::High,
            category: IssueCategory::TestCoverage,
            description: "No tests found for the implementation".to_string(),
            suggested_fix: "Add unit tests covering main functionality and edge cases"
                .to_string(),
            affected_areas: vec![],
            fix_effort: 60,
        });
    }
    if !has_error_handling {
        issues.push(QualityIssue {
            severity: IssueSeverity::Medium,
            category: IssueCategory::ErrorHandling,
            description: "Limited error handling detected".to_string(),
            suggested_fix: "Add proper error handling for edge cases and failures".to_string(),
            affected_areas: vec![],
            fix_effort: 30,
        });
    }

    let weights = self.rubric.get_role_weights(agent_role.name());
    // NOTE(review): dimensions missing from the role table contribute with a
    // default weight of 0.1, but the divisor only sums the table's own
    // weights, so this is not a true weighted mean. Left as-is because the
    // 0.85 pass threshold is calibrated against the current formula.
    let weighted_sum: f64 = dimension_scores
        .iter()
        .map(|(dim, score)| weights.get(dim).unwrap_or(&0.1) * score)
        .sum();
    let weight_total: f64 = weights.values().sum();
    let raw_score = if weight_total > 0.0 {
        weighted_sum / weight_total
    } else {
        // A rubric whose weights sum to zero would otherwise divide by zero
        // and yield an infinite score that passes the threshold check.
        dimension_scores.values().sum::<f64>() / dimension_scores.len() as f64
    };
    let overall_score = raw_score.clamp(0.0, 1.0);

    let has_blocking_issue = issues
        .iter()
        .any(|i| matches!(i.severity, IssueSeverity::Critical | IssueSeverity::High));
    let passes_standards = overall_score >= 0.85 && !has_blocking_issue;

    // Built before `issues` is moved into the result, avoiding a clone.
    let feedback = format!(
        "Heuristic evaluation for {} agent: {} issues found. {}",
        agent_role.name(),
        issues.len(),
        if passes_standards {
            "Meets standards."
        } else {
            "Needs improvement."
        }
    );

    Ok(QualityEvaluation {
        overall_score,
        dimension_scores,
        issues,
        feedback,
        passes_standards,
        confidence: 0.6,
        evaluated_at: Utc::now(),
        metadata: EvaluationMetadata {
            files_analyzed: 1,
            lines_of_code: lines,
            duration_ms: 0,
            tools_used: vec!["Heuristics".to_string()],
            agent_role: agent_role.name().to_string(),
        },
    })
}
pub fn evaluation_to_issues(&self, evaluation: &QualityEvaluation) -> Vec<String> {
evaluation
.issues
.iter()
.map(|issue| {
format!(
"[{:?}] {:?}: {}",
issue.severity, issue.category, issue.description
)
})
.collect()
}
/// Renders `issues` as Markdown fix instructions addressed to `agent_role`.
///
/// Issues are grouped under one heading per severity, from critical down to
/// low; severities without issues are omitted. Critical and high entries
/// include the estimated effort per issue. The document ends with the total
/// estimated fix time across all issues. An empty `issues` slice yields a
/// short "no issues" message instead.
pub fn generate_fix_instructions(&self, issues: &[QualityIssue], agent_role: &str) -> String {
    if issues.is_empty() {
        return "No issues found. Great work!".to_string();
    }
    let mut instructions = format!(
        "## Quality Review - Fix Instructions for {} Agent\n\n",
        agent_role
    );

    // (severity, section heading, whether each entry shows its effort)
    let sections = [
        (
            IssueSeverity::Critical,
            "### 🚨 CRITICAL Issues (Fix Immediately)\n",
            true,
        ),
        (IssueSeverity::High, "### ⚠️ HIGH Priority Issues\n", true),
        (IssueSeverity::Medium, "### 📝 MEDIUM Priority Issues\n", false),
        // Low-severity issues count towards the total below, so they must be
        // listed as well rather than silently dropped.
        (IssueSeverity::Low, "### 💡 LOW Priority Issues\n", false),
    ];

    for (severity, heading, show_effort) in &sections {
        let mut matching = issues
            .iter()
            .filter(|issue| issue.severity == *severity)
            .peekable();
        if matching.peek().is_none() {
            continue;
        }
        instructions.push_str(heading);
        for issue in matching {
            let entry = if *show_effort {
                format!(
                    "- **{:?}**: {}\n - Fix: {}\n - Effort: {} minutes\n\n",
                    issue.category, issue.description, issue.suggested_fix, issue.fix_effort
                )
            } else {
                format!(
                    "- **{:?}**: {}\n - Fix: {}\n\n",
                    issue.category, issue.description, issue.suggested_fix
                )
            };
            instructions.push_str(&entry);
        }
    }

    // Sum in u64 so many large estimates cannot overflow the u32 effort type.
    let total_minutes: u64 = issues.iter().map(|i| u64::from(i.fix_effort)).sum();
    instructions.push_str(&format!(
        "\n### Estimated Total Fix Time: {} minutes\n",
        total_minutes
    ));
    instructions
}
}