// car-ir 0.12.0 - Docs.rs

//! Quality rubrics — structured evaluation criteria.
//!
//! Rubrics define measurable quality standards that agents can evaluate against.
//! Inspired by metaswarm's 8 review rubrics (code, architecture, security, etc.).
//!
//! A rubric has criteria, each with score levels and a pass threshold.
//! Evaluations produce per-criterion scores that can be stored as facts for learning.

use serde::{Deserialize, Serialize};

/// A quality rubric: a named set of weighted criteria plus a pass threshold.
///
/// [`Rubric::evaluate`] combines per-criterion scores into a
/// [`RubricEvaluation`], and [`Rubric::evaluation_prompt`] renders the rubric
/// as a prompt for an LLM-based evaluator.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Rubric {
    /// Rubric identifier (e.g., "code_review", "security_audit").
    /// Copied into [`RubricEvaluation::rubric_id`] by [`Rubric::evaluate`].
    pub id: String,
    /// Human-readable name; used as the rubric heading in the evaluation prompt.
    pub name: String,
    /// What this rubric evaluates; included verbatim in the evaluation prompt.
    pub description: String,
    /// The criteria that make up this rubric.
    pub criteria: Vec<RubricCriterion>,
    /// Overall pass threshold (0.0-1.0). An evaluation passes when its
    /// weighted average score is greater than or equal to this value.
    pub pass_threshold: f64,
}

/// A single criterion within a rubric.
///
/// Criteria are identified by [`name`](RubricCriterion::name); that is the key
/// [`Rubric::evaluate`] uses to pair a submitted score with its weight.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RubricCriterion {
    /// Criterion name (e.g., "error_handling", "test_coverage").
    pub name: String,
    /// What this criterion measures; shown under the criterion heading in the
    /// evaluation prompt.
    pub description: String,
    /// Weight relative to other criteria. Falls back to 1.0 when the field is
    /// absent from deserialized input.
    #[serde(default = "default_weight")]
    pub weight: f64,
    /// Score level definitions — what each score range means. Rendered into
    /// the evaluation prompt as guidance for the evaluator.
    pub levels: Vec<ScoreLevel>,
}

/// Serde fallback for [`RubricCriterion::weight`] when the field is omitted.
fn default_weight() -> f64 {
    // A weight of one makes an unweighted criterion count the same as any
    // other criterion that also relies on the default.
    const NEUTRAL_WEIGHT: f64 = 1.0;
    NEUTRAL_WEIGHT
}

/// A score level within a criterion: a labelled, inclusive score band.
///
/// In the evaluation prompt each level is rendered as one bullet of the form
/// `min-max (label): description`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScoreLevel {
    /// Score range lower bound (inclusive).
    pub min_score: f64,
    /// Score range upper bound (inclusive).
    pub max_score: f64,
    /// Label for this level (e.g., "excellent", "adequate", "failing").
    pub label: String,
    /// Description of what this level looks like.
    pub description: String,
}

/// Result of evaluating work against a rubric, as produced by
/// [`Rubric::evaluate`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RubricEvaluation {
    /// Which rubric was used (the rubric's `id`).
    pub rubric_id: String,
    /// Per-criterion scores, exactly as they were handed to the evaluation.
    pub scores: Vec<CriterionScore>,
    /// Weighted average score (0.0-1.0): the weighted sum of the criterion
    /// scores divided by the rubric's total criterion weight.
    pub overall_score: f64,
    /// Whether the evaluation passes the rubric's threshold
    /// (`overall_score >= pass_threshold`).
    pub passed: bool,
}

/// Score for a single criterion.
///
/// The field names match the JSON object shape that
/// [`Rubric::evaluation_prompt`] asks the evaluator to respond with.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CriterionScore {
    /// Name of the criterion being scored; matched against
    /// [`RubricCriterion::name`] during evaluation.
    pub criterion: String,
    /// Score awarded for the criterion. The evaluation prompt requests a
    /// value from 0.0 to 1.0.
    pub score: f64,
    /// Evaluator's justification for the score. Carried through to the
    /// result; it does not influence the overall score.
    pub explanation: String,
    /// Label of the score level the evaluator chose (e.g., "excellent").
    /// Carried through to the result; it does not influence the overall score.
    pub level_label: String,
}

impl Rubric {
    /// Evaluate a set of criterion scores against this rubric.
    ///
    /// The overall score is the weighted average over the rubric's own
    /// criteria: `sum(score * weight) / sum(weight)`, where the denominator
    /// always covers every criterion of the rubric.
    ///
    /// Handling of imperfect input (typical for LLM-produced scores):
    ///
    /// - A rubric criterion with no entry in `scores` contributes `0.0`.
    /// - An entry naming a criterion that is not part of this rubric does not
    ///   affect the overall score, so it can never push the average above
    ///   what the rubric's own criteria justify.
    /// - If several entries name the same criterion, only the first counts.
    /// - If the total weight is not strictly positive (or is NaN), the
    ///   evaluation fails with an overall score of `0.0`.
    ///
    /// All submitted `scores` are returned unchanged in the result, including
    /// entries that were ignored for the computation.
    pub fn evaluate(&self, scores: Vec<CriterionScore>) -> RubricEvaluation {
        let total_weight: f64 = self.criteria.iter().map(|c| c.weight).sum();
        // A non-positive or NaN denominator cannot yield a meaningful average.
        if total_weight.is_nan() || total_weight <= 0.0 {
            return RubricEvaluation {
                rubric_id: self.id.clone(),
                scores,
                overall_score: 0.0,
                passed: false,
            };
        }

        // Drive the sum from the rubric's criteria (not from the submitted
        // scores) so unknown and duplicate entries cannot inflate the result.
        let weighted_sum: f64 = self
            .criteria
            .iter()
            .filter_map(|c| {
                scores
                    .iter()
                    .find(|s| s.criterion == c.name)
                    .map(|s| s.score * c.weight)
            })
            .sum();

        let overall = weighted_sum / total_weight;
        let passed = overall >= self.pass_threshold;

        RubricEvaluation {
            rubric_id: self.id.clone(),
            scores,
            overall_score: overall,
            passed,
        }
    }

    /// Build a prompt for LLM-based evaluation against this rubric.
    ///
    /// The prompt lists every criterion with its weight, description and
    /// score levels, followed by `work_output` and instructions to answer
    /// with a JSON object whose `scores` entries mirror [`CriterionScore`].
    ///
    /// Level bounds are printed with two decimals so that adjacent bands such
    /// as `0.70-0.89` and `0.90-1.00` stay distinguishable.
    ///
    /// `work_output` is interpolated verbatim; no escaping is applied.
    pub fn evaluation_prompt(&self, work_output: &str) -> String {
        let criteria_text: Vec<String> = self
            .criteria
            .iter()
            .map(|c| {
                let levels_text: Vec<String> = c
                    .levels
                    .iter()
                    .map(|l| {
                        format!(
                            "  - **{:.2}-{:.2}** ({}): {}",
                            l.min_score, l.max_score, l.label, l.description
                        )
                    })
                    .collect();
                format!(
                    "### {} (weight: {:.1})\n{}\n{}",
                    c.name,
                    c.weight,
                    c.description,
                    levels_text.join("\n")
                )
            })
            .collect();

        format!(
            r#"Evaluate the following work output against this quality rubric.

## Rubric: {name}
{description}

## Criteria
{criteria}

## Work Output
{work}

## Instructions
Score each criterion from 0.0 to 1.0. Be strict and objective.

Respond with ONLY a JSON object:
```json
{{
  "scores": [
    {{"criterion": "criterion_name", "score": 0.0, "explanation": "reason", "level_label": "level"}}
  ]
}}
```"#,
            name = self.name,
            description = self.description,
            criteria = criteria_text.join("\n\n"),
            work = work_output,
        )
    }
}

// --- Built-in rubrics ---

/// Code review rubric — evaluates code quality.
pub fn code_review_rubric() -> Rubric {
    Rubric {
        id: "code_review".into(),
        name: "Code Review".into(),
        description: "Evaluates code quality, correctness, and maintainability.".into(),
        pass_threshold: 0.7,
        criteria: vec![
            RubricCriterion {
                name: "correctness".into(),
                description: "Does the code correctly implement the requirements?".into(),
                weight: 2.0,
                levels: vec![
                    ScoreLevel {
                        min_score: 0.9,
                        max_score: 1.0,
                        label: "excellent".into(),
                        description: "Fully correct, handles edge cases.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.7,
                        max_score: 0.89,
                        label: "good".into(),
                        description: "Correct for main cases, minor edge case gaps.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.4,
                        max_score: 0.69,
                        label: "partial".into(),
                        description: "Some functionality works, significant gaps.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.0,
                        max_score: 0.39,
                        label: "failing".into(),
                        description: "Fundamentally broken or incomplete.".into(),
                    },
                ],
            },
            RubricCriterion {
                name: "error_handling".into(),
                description: "Are errors handled gracefully? No panics, clear error messages."
                    .into(),
                weight: 1.5,
                levels: vec![
                    ScoreLevel {
                        min_score: 0.8,
                        max_score: 1.0,
                        label: "robust".into(),
                        description: "All error paths handled, typed errors.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.5,
                        max_score: 0.79,
                        label: "adequate".into(),
                        description: "Main errors handled, some unwrap/expect.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.0,
                        max_score: 0.49,
                        label: "fragile".into(),
                        description: "Panics on errors, missing error handling.".into(),
                    },
                ],
            },
            RubricCriterion {
                name: "readability".into(),
                description: "Is the code clear, well-named, and easy to follow?".into(),
                weight: 1.0,
                levels: vec![
                    ScoreLevel {
                        min_score: 0.8,
                        max_score: 1.0,
                        label: "clear".into(),
                        description: "Self-documenting, good naming, logical structure.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.5,
                        max_score: 0.79,
                        label: "acceptable".into(),
                        description: "Generally readable, minor clarity issues.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.0,
                        max_score: 0.49,
                        label: "confusing".into(),
                        description: "Hard to follow, poor naming, tangled logic.".into(),
                    },
                ],
            },
            RubricCriterion {
                name: "test_coverage".into(),
                description: "Are there tests? Do they cover important paths?".into(),
                weight: 1.5,
                levels: vec![
                    ScoreLevel {
                        min_score: 0.8,
                        max_score: 1.0,
                        label: "thorough".into(),
                        description: "Comprehensive tests, edge cases covered.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.5,
                        max_score: 0.79,
                        label: "basic".into(),
                        description: "Happy path tested, some edge cases.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.0,
                        max_score: 0.49,
                        label: "missing".into(),
                        description: "No tests or trivial-only tests.".into(),
                    },
                ],
            },
        ],
    }
}

/// Security review rubric.
pub fn security_review_rubric() -> Rubric {
    Rubric {
        id: "security_review".into(),
        name: "Security Review".into(),
        description: "Evaluates security posture of code changes.".into(),
        pass_threshold: 0.8,
        criteria: vec![
            RubricCriterion {
                name: "input_validation".into(),
                description: "Are all external inputs validated and sanitized?".into(),
                weight: 2.0,
                levels: vec![
                    ScoreLevel {
                        min_score: 0.8,
                        max_score: 1.0,
                        label: "secure".into(),
                        description: "All inputs validated, parameterized queries, no injection."
                            .into(),
                    },
                    ScoreLevel {
                        min_score: 0.5,
                        max_score: 0.79,
                        label: "partial".into(),
                        description: "Some validation, minor gaps.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.0,
                        max_score: 0.49,
                        label: "vulnerable".into(),
                        description: "Missing validation, injection possible.".into(),
                    },
                ],
            },
            RubricCriterion {
                name: "secrets_management".into(),
                description: "Are secrets handled properly? No hardcoded keys.".into(),
                weight: 2.0,
                levels: vec![
                    ScoreLevel {
                        min_score: 0.8,
                        max_score: 1.0,
                        label: "secure".into(),
                        description: "Env vars only, no secrets in code or logs.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.0,
                        max_score: 0.79,
                        label: "risky".into(),
                        description: "Hardcoded values or logged secrets.".into(),
                    },
                ],
            },
            RubricCriterion {
                name: "auth_boundaries".into(),
                description: "Are trust boundaries enforced? Proper auth checks.".into(),
                weight: 1.5,
                levels: vec![
                    ScoreLevel {
                        min_score: 0.8,
                        max_score: 1.0,
                        label: "enforced".into(),
                        description: "All endpoints protected, RBAC applied.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.5,
                        max_score: 0.79,
                        label: "partial".into(),
                        description: "Main endpoints protected, some gaps.".into(),
                    },
                    ScoreLevel {
                        min_score: 0.0,
                        max_score: 0.49,
                        label: "missing".into(),
                        description: "Open endpoints, no auth checks.".into(),
                    },
                ],
            },
        ],
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Expand `(criterion, score, explanation, level_label)` rows into scores.
    fn scores_from(rows: &[(&str, f64, &str, &str)]) -> Vec<CriterionScore> {
        rows.iter()
            .map(
                |&(criterion, score, explanation, level_label)| CriterionScore {
                    criterion: criterion.into(),
                    score,
                    explanation: explanation.into(),
                    level_label: level_label.into(),
                },
            )
            .collect()
    }

    #[test]
    fn rubric_evaluate_passes() {
        let submitted = scores_from(&[
            ("correctness", 0.9, "Good", "excellent"),
            ("error_handling", 0.8, "Ok", "robust"),
            ("readability", 0.7, "Clear", "acceptable"),
            ("test_coverage", 0.8, "Good", "thorough"),
        ]);

        let outcome = code_review_rubric().evaluate(submitted);

        assert!(outcome.passed);
        assert!(outcome.overall_score > 0.7);
    }

    #[test]
    fn rubric_evaluate_fails() {
        let submitted = scores_from(&[
            ("correctness", 0.3, "Broken", "failing"),
            ("error_handling", 0.2, "Panics", "fragile"),
            ("readability", 0.4, "Messy", "confusing"),
            ("test_coverage", 0.1, "None", "missing"),
        ]);

        let outcome = code_review_rubric().evaluate(submitted);

        assert!(!outcome.passed);
        assert!(outcome.overall_score < 0.7);
    }

    #[test]
    fn evaluation_prompt_includes_all_criteria() {
        let prompt = code_review_rubric().evaluation_prompt("fn main() {}");

        // Every criterion name and the submitted work must appear in the prompt.
        let expected_fragments = [
            "correctness",
            "error_handling",
            "readability",
            "test_coverage",
            "fn main()",
        ];
        for fragment in expected_fragments.iter() {
            assert!(prompt.contains(fragment), "prompt is missing {:?}", fragment);
        }
    }

    #[test]
    fn security_rubric_higher_threshold() {
        let rubric = security_review_rubric();
        assert_eq!(rubric.pass_threshold, 0.8);

        let has_secrets_criterion = rubric
            .criteria
            .iter()
            .map(|c| c.name.as_str())
            .any(|name| name == "secrets_management");
        assert!(has_secrets_criterion);
    }

    #[test]
    fn weighted_scoring() {
        // Only correctness (weight 2.0) scores 1.0; every other criterion scores 0.0.
        let submitted = scores_from(&[
            ("correctness", 1.0, "", ""),
            ("error_handling", 0.0, "", ""),
            ("readability", 0.0, "", ""),
            ("test_coverage", 0.0, "", ""),
        ]);

        let outcome = code_review_rubric().evaluate(submitted);

        // total weight = 2.0 + 1.5 + 1.0 + 1.5 = 6.0
        // weighted sum = 1.0 * 2.0 = 2.0
        // overall      = 2.0 / 6.0 ≈ 0.333
        let expected = 2.0 / 6.0;
        assert!((outcome.overall_score - expected).abs() < 0.01);
        // 0.333 is below the 0.7 threshold.
        assert!(!outcome.passed);
    }
}