//! Quality rubrics — structured evaluation criteria.
//!
//! Rubrics define measurable quality standards that agents can evaluate against.
//! Inspired by metaswarm's 8 review rubrics (code, architecture, security, etc.).
//!
//! A rubric has criteria, each with score levels and a pass threshold.
//! Evaluations produce per-criterion scores that can be stored as facts for learning.
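//!
//! A minimal construction-and-evaluation sketch (the `docs_review` rubric
//! below is illustrative, not a built-in; doctest ignored because import
//! paths are assumed):
//!
//! ```ignore
//! let rubric = Rubric {
//!     id: "docs_review".into(),
//!     name: "Docs Review".into(),
//!     description: "Evaluates documentation quality.".into(),
//!     pass_threshold: 0.7,
//!     criteria: vec![RubricCriterion {
//!         name: "completeness".into(),
//!         description: "Are all public items documented?".into(),
//!         weight: 1.0,
//!         levels: vec![ScoreLevel {
//!             min_score: 0.0,
//!             max_score: 1.0,
//!             label: "any".into(),
//!             description: "Full range.".into(),
//!         }],
//!     }],
//! };
//! let eval = rubric.evaluate(vec![CriterionScore {
//!     criterion: "completeness".into(),
//!     score: 0.8,
//!     explanation: "Most public items documented".into(),
//!     level_label: "any".into(),
//! }]);
//! assert!(eval.passed); // 0.8 >= 0.7
//! ```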

use serde::{Deserialize, Serialize};

/// A quality rubric with named criteria.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Rubric {
    /// Rubric identifier (e.g., "code_review", "security_audit").
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// What this rubric evaluates.
    pub description: String,
    /// The criteria that make up this rubric.
    pub criteria: Vec<RubricCriterion>,
    /// Overall pass threshold (0.0-1.0). The weighted average score must meet or exceed this.
    pub pass_threshold: f64,
}

/// A single criterion within a rubric.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RubricCriterion {
    /// Criterion name (e.g., "error_handling", "test_coverage").
    pub name: String,
    /// What this criterion measures.
    pub description: String,
    /// Weight relative to other criteria (default 1.0).
    #[serde(default = "default_weight")]
    pub weight: f64,
    /// Score level definitions — what each score range means.
    pub levels: Vec<ScoreLevel>,
}

fn default_weight() -> f64 { 1.0 }

/// A score level within a criterion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScoreLevel {
    /// Score range lower bound (inclusive).
    pub min_score: f64,
    /// Score range upper bound (inclusive).
    pub max_score: f64,
    /// Label for this level (e.g., "excellent", "adequate", "failing").
    pub label: String,
    /// Description of what this level looks like.
    pub description: String,
}

/// Result of evaluating work against a rubric.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RubricEvaluation {
    /// Which rubric was used.
    pub rubric_id: String,
    /// Per-criterion scores.
    pub scores: Vec<CriterionScore>,
    /// Weighted average score (0.0-1.0).
    pub overall_score: f64,
    /// Whether the evaluation passes the rubric's threshold.
    pub passed: bool,
}

/// Score for a single criterion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriterionScore {
    /// Name of the criterion this score applies to.
    pub criterion: String,
    /// Score assigned (0.0-1.0).
    pub score: f64,
    /// Why this score was given.
    pub explanation: String,
    /// Label of the matching score level (e.g., "excellent").
    pub level_label: String,
}

impl Rubric {
    /// Evaluate a set of criterion scores against this rubric.
    ///
    /// The overall score is the weighted average of the supplied scores,
    /// normalized by the total weight of the rubric's criteria. Scores whose
    /// criterion name is not defined in the rubric are ignored, and rubric
    /// criteria without a matching score contribute zero.
    pub fn evaluate(&self, scores: Vec<CriterionScore>) -> RubricEvaluation {
        let total_weight: f64 = self.criteria.iter().map(|c| c.weight).sum();
        if total_weight == 0.0 {
            return RubricEvaluation {
                rubric_id: self.id.clone(),
                scores,
                overall_score: 0.0,
                passed: false,
            };
        }

        let weighted_sum: f64 = scores.iter()
            .map(|s| {
                // A score whose criterion is not defined in the rubric gets
                // zero weight, so stray names cannot inflate the average
                // above the documented 0.0-1.0 range.
                let weight = self.criteria.iter()
                    .find(|c| c.name == s.criterion)
                    .map(|c| c.weight)
                    .unwrap_or(0.0);
                s.score * weight
            })
            .sum();

        let overall = weighted_sum / total_weight;
        let passed = overall >= self.pass_threshold;

        RubricEvaluation {
            rubric_id: self.id.clone(),
            scores,
            overall_score: overall,
            passed,
        }
    }

    /// Build a prompt for LLM-based evaluation against this rubric.
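    ///
    /// The prompt instructs the model to reply with a bare JSON object whose
    /// `scores` array deserializes into [`CriterionScore`] values. A
    /// hypothetical round-trip sketch (`call_llm` is not part of this crate,
    /// and `serde_json` is assumed; a robust caller would also strip Markdown
    /// code fences from the reply):
    ///
    /// ```ignore
    /// #[derive(serde::Deserialize)]
    /// struct EvaluationResponse { scores: Vec<CriterionScore> }
    ///
    /// let reply = call_llm(&rubric.evaluation_prompt(&work)); // hypothetical LLM client
    /// let parsed: EvaluationResponse = serde_json::from_str(reply.trim())?;
    /// let eval = rubric.evaluate(parsed.scores);
    /// ```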
    pub fn evaluation_prompt(&self, work_output: &str) -> String {
        let criteria_text: Vec<String> = self.criteria.iter()
            .map(|c| {
                let levels_text: Vec<String> = c.levels.iter()
                    .map(|l| format!("  - **{:.1}-{:.1}** ({}): {}", l.min_score, l.max_score, l.label, l.description))
                    .collect();
                format!("### {} (weight: {:.1})\n{}\n{}", c.name, c.weight, c.description, levels_text.join("\n"))
            })
            .collect();

        format!(
            r#"Evaluate the following work output against this quality rubric.

## Rubric: {name}
{description}

## Criteria
{criteria}

## Work Output
{work}

## Instructions
Score each criterion from 0.0 to 1.0. Be strict and objective.

Respond with ONLY a JSON object:
```json
{{
  "scores": [
    {{"criterion": "criterion_name", "score": 0.0, "explanation": "reason", "level_label": "level"}}
  ]
}}
```"#,
            name = self.name,
            description = self.description,
            criteria = criteria_text.join("\n\n"),
            work = work_output,
        )
    }
}

// --- Built-in rubrics ---

/// Code review rubric — evaluates code quality.
pub fn code_review_rubric() -> Rubric {
    Rubric {
        id: "code_review".into(),
        name: "Code Review".into(),
        description: "Evaluates code quality, correctness, and maintainability.".into(),
        pass_threshold: 0.7,
        criteria: vec![
            RubricCriterion {
                name: "correctness".into(),
                description: "Does the code correctly implement the requirements?".into(),
                weight: 2.0,
                levels: vec![
                    ScoreLevel { min_score: 0.9, max_score: 1.0, label: "excellent".into(), description: "Fully correct, handles edge cases.".into() },
                    ScoreLevel { min_score: 0.7, max_score: 0.89, label: "good".into(), description: "Correct for main cases, minor edge case gaps.".into() },
                    ScoreLevel { min_score: 0.4, max_score: 0.69, label: "partial".into(), description: "Some functionality works, significant gaps.".into() },
                    ScoreLevel { min_score: 0.0, max_score: 0.39, label: "failing".into(), description: "Fundamentally broken or incomplete.".into() },
                ],
            },
            RubricCriterion {
                name: "error_handling".into(),
                description: "Are errors handled gracefully? No panics, clear error messages.".into(),
                weight: 1.5,
                levels: vec![
                    ScoreLevel { min_score: 0.8, max_score: 1.0, label: "robust".into(), description: "All error paths handled, typed errors.".into() },
                    ScoreLevel { min_score: 0.5, max_score: 0.79, label: "adequate".into(), description: "Main errors handled, some unwrap/expect.".into() },
                    ScoreLevel { min_score: 0.0, max_score: 0.49, label: "fragile".into(), description: "Panics on errors, missing error handling.".into() },
                ],
            },
            RubricCriterion {
                name: "readability".into(),
                description: "Is the code clear, well-named, and easy to follow?".into(),
                weight: 1.0,
                levels: vec![
                    ScoreLevel { min_score: 0.8, max_score: 1.0, label: "clear".into(), description: "Self-documenting, good naming, logical structure.".into() },
                    ScoreLevel { min_score: 0.5, max_score: 0.79, label: "acceptable".into(), description: "Generally readable, minor clarity issues.".into() },
                    ScoreLevel { min_score: 0.0, max_score: 0.49, label: "confusing".into(), description: "Hard to follow, poor naming, tangled logic.".into() },
                ],
            },
            RubricCriterion {
                name: "test_coverage".into(),
                description: "Are there tests? Do they cover important paths?".into(),
                weight: 1.5,
                levels: vec![
                    ScoreLevel { min_score: 0.8, max_score: 1.0, label: "thorough".into(), description: "Comprehensive tests, edge cases covered.".into() },
                    ScoreLevel { min_score: 0.5, max_score: 0.79, label: "basic".into(), description: "Happy path tested, some edge cases.".into() },
                    ScoreLevel { min_score: 0.0, max_score: 0.49, label: "missing".into(), description: "No tests or trivial-only tests.".into() },
                ],
            },
        ],
    }
}

/// Security review rubric.
pub fn security_review_rubric() -> Rubric {
    Rubric {
        id: "security_review".into(),
        name: "Security Review".into(),
        description: "Evaluates security posture of code changes.".into(),
        pass_threshold: 0.8,
        criteria: vec![
            RubricCriterion {
                name: "input_validation".into(),
                description: "Are all external inputs validated and sanitized?".into(),
                weight: 2.0,
                levels: vec![
                    ScoreLevel { min_score: 0.8, max_score: 1.0, label: "secure".into(), description: "All inputs validated, parameterized queries, no injection.".into() },
                    ScoreLevel { min_score: 0.5, max_score: 0.79, label: "partial".into(), description: "Some validation, minor gaps.".into() },
                    ScoreLevel { min_score: 0.0, max_score: 0.49, label: "vulnerable".into(), description: "Missing validation, injection possible.".into() },
                ],
            },
            RubricCriterion {
                name: "secrets_management".into(),
                description: "Are secrets handled properly? No hardcoded keys.".into(),
                weight: 2.0,
                levels: vec![
                    ScoreLevel { min_score: 0.8, max_score: 1.0, label: "secure".into(), description: "Env vars only, no secrets in code or logs.".into() },
                    ScoreLevel { min_score: 0.0, max_score: 0.79, label: "risky".into(), description: "Hardcoded values or logged secrets.".into() },
                ],
            },
            RubricCriterion {
                name: "auth_boundaries".into(),
                description: "Are trust boundaries enforced? Proper auth checks.".into(),
                weight: 1.5,
                levels: vec![
                    ScoreLevel { min_score: 0.8, max_score: 1.0, label: "enforced".into(), description: "All endpoints protected, RBAC applied.".into() },
                    ScoreLevel { min_score: 0.5, max_score: 0.79, label: "partial".into(), description: "Main endpoints protected, some gaps.".into() },
                    ScoreLevel { min_score: 0.0, max_score: 0.49, label: "missing".into(), description: "Open endpoints, no auth checks.".into() },
                ],
            },
        ],
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rubric_evaluate_passes() {
        let rubric = code_review_rubric();
        let scores = vec![
            CriterionScore { criterion: "correctness".into(), score: 0.9, explanation: "Good".into(), level_label: "excellent".into() },
            CriterionScore { criterion: "error_handling".into(), score: 0.8, explanation: "Ok".into(), level_label: "robust".into() },
            CriterionScore { criterion: "readability".into(), score: 0.7, explanation: "Clear".into(), level_label: "acceptable".into() },
            CriterionScore { criterion: "test_coverage".into(), score: 0.8, explanation: "Good".into(), level_label: "thorough".into() },
        ];
        let eval = rubric.evaluate(scores);
        assert!(eval.passed);
        assert!(eval.overall_score > 0.7);
    }

    #[test]
    fn rubric_evaluate_fails() {
        let rubric = code_review_rubric();
        let scores = vec![
            CriterionScore { criterion: "correctness".into(), score: 0.3, explanation: "Broken".into(), level_label: "failing".into() },
            CriterionScore { criterion: "error_handling".into(), score: 0.2, explanation: "Panics".into(), level_label: "fragile".into() },
            CriterionScore { criterion: "readability".into(), score: 0.4, explanation: "Messy".into(), level_label: "confusing".into() },
            CriterionScore { criterion: "test_coverage".into(), score: 0.1, explanation: "None".into(), level_label: "missing".into() },
        ];
        let eval = rubric.evaluate(scores);
        assert!(!eval.passed);
        assert!(eval.overall_score < 0.7);
    }

    #[test]
    fn evaluation_prompt_includes_all_criteria() {
        let rubric = code_review_rubric();
        let prompt = rubric.evaluation_prompt("fn main() {}");
        assert!(prompt.contains("correctness"));
        assert!(prompt.contains("error_handling"));
        assert!(prompt.contains("readability"));
        assert!(prompt.contains("test_coverage"));
        assert!(prompt.contains("fn main()"));
    }

    #[test]
    fn security_rubric_higher_threshold() {
        let rubric = security_review_rubric();
        assert_eq!(rubric.pass_threshold, 0.8);
        assert!(rubric.criteria.iter().any(|c| c.name == "secrets_management"));
    }
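
    // Guards the behavior documented on `evaluate`: a score whose criterion
    // name is not in the rubric carries zero weight.
    #[test]
    fn unknown_criterion_is_ignored() {
        let rubric = code_review_rubric();
        let scores = vec![
            CriterionScore { criterion: "correctness".into(), score: 1.0, explanation: "".into(), level_label: "".into() },
            CriterionScore { criterion: "not_a_criterion".into(), score: 1.0, explanation: "".into(), level_label: "".into() },
        ];
        let eval = rubric.evaluate(scores);
        // Only correctness (weight 2.0) counts: 2.0 / 6.0 ≈ 0.333.
        assert!((eval.overall_score - 2.0 / 6.0).abs() < 0.01);
    }

    // Serialization round-trip sketch; assumes `serde_json` is available as a
    // dev-dependency (not verified against this crate's manifest).
    #[test]
    fn rubric_serde_round_trip() {
        let rubric = code_review_rubric();
        let json = serde_json::to_string(&rubric).expect("serialize");
        let back: Rubric = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(back.id, rubric.id);
        assert_eq!(back.criteria.len(), rubric.criteria.len());
    }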

    #[test]
    fn weighted_scoring() {
        let rubric = code_review_rubric();
        // correctness (weight 2.0) = 1.0, everything else = 0.0
        let scores = vec![
            CriterionScore { criterion: "correctness".into(), score: 1.0, explanation: "".into(), level_label: "".into() },
            CriterionScore { criterion: "error_handling".into(), score: 0.0, explanation: "".into(), level_label: "".into() },
            CriterionScore { criterion: "readability".into(), score: 0.0, explanation: "".into(), level_label: "".into() },
            CriterionScore { criterion: "test_coverage".into(), score: 0.0, explanation: "".into(), level_label: "".into() },
        ];
        let eval = rubric.evaluate(scores);
        // total_weight = 2.0 + 1.5 + 1.0 + 1.5 = 6.0
        // weighted_sum = 1.0 * 2.0 = 2.0
        // overall = 2.0 / 6.0 ≈ 0.333
        assert!((eval.overall_score - 2.0/6.0).abs() < 0.01);
        assert!(!eval.passed); // below 0.7 threshold
    }
}
}