// agentcarousel 0.2.2
//
// Evaluate agents and skills with YAML fixtures, run cases (mock or live), and keep run rows in SQLite for reports and evidence export.
// Documentation
use agentcarousel_core::{
    Case, CaseResult, EvalScores, ExecutionTrace, RubricScore, StepKind, ToolOrder,
};
use serde_json::Value;

use super::assertions::check_output;
use super::trait_def::{Evaluator, EvaluatorError, EvaluatorKind};

/// Outcome of running the rule-based checks for a single case.
///
/// Produced by [`evaluate_case`], which sets `passed` to `true` exactly when
/// `failures` is empty.
#[derive(Debug, Clone)]
pub struct RuleEvaluation {
    /// Whether every rule check succeeded.
    pub passed: bool,
    /// One human-readable message per failed check, in the order the checks ran.
    pub failures: Vec<String>,
}

/// Stateless evaluator that scores a case using only rule checks
/// (tool-sequence expectations, output assertions, and rubric auto-checks).
///
/// See its [`Evaluator`] implementation for how scores are derived.
#[derive(Debug, Default)]
pub struct RulesEvaluator;

/// Runs the rule checks declared on `case` against the recorded `trace`.
///
/// Tool-sequence expectations are checked first (only when a non-empty
/// sequence is declared), followed by each output assertion in declaration
/// order. Every failed check contributes one message to the result.
///
/// Returns a [`RuleEvaluation`] whose `passed` flag is `true` when no check
/// failed.
pub fn evaluate_case(case: &Case, trace: &ExecutionTrace) -> RuleEvaluation {
    let expectations = &case.expected;

    // Tool-sequence failures come first so message order stays stable.
    let mut failures = match expectations.tool_sequence.as_ref() {
        Some(sequence) if !sequence.is_empty() => check_tool_sequence(sequence, trace),
        _ => Vec::new(),
    };

    if let Some(assertions) = expectations.output.as_ref() {
        failures.extend(
            assertions
                .iter()
                .filter_map(|assertion| check_output(assertion, trace).err()),
        );
    }

    let passed = failures.is_empty();
    RuleEvaluation { passed, failures }
}

impl Evaluator for RulesEvaluator {
    fn id(&self) -> &'static str {
        EvaluatorKind::Rules.as_str()
    }

    fn evaluate(&self, case: &Case, result: &CaseResult) -> Result<EvalScores, EvaluatorError> {
        let evaluation = evaluate_case(case, &result.trace);
        let mut rubric_scores = score_rubric(case, &result.trace);

        if !evaluation.passed {
            rubric_scores.push(RubricScore {
                rubric_id: "rules".to_string(),
                score: 0.0,
                weight: 1.0,
                rationale: Some(evaluation.failures.join("; ")),
            });
        }

        let effectiveness_score = if !evaluation.passed {
            0.0
        } else if rubric_scores.is_empty() {
            1.0
        } else {
            weighted_average(&rubric_scores)
        };

        let passed = if rubric_scores.is_empty() {
            evaluation.passed
        } else {
            evaluation.passed && rubric_scores.iter().all(|score| score.score >= 1.0)
        };

        Ok(EvalScores {
            evaluator: self.id().to_string(),
            rubric_scores,
            effectiveness_score,
            passed,
            judge_rationale: None,
        })
    }
}

/// Compares the tool calls recorded in `trace` with the expectations in
/// `sequence` and returns one message per mismatch (empty when all hold).
///
/// Only steps whose kind is [`StepKind::ToolCall`] are considered.
///
/// * **Strict** expectations, when present, must match the tool-call steps
///   one-for-one and in order. In that case the function returns right after
///   the strict check; subsequence and unordered expectations in the same
///   `sequence` are not evaluated.
/// * **Subsequence** expectations must appear in order among the tool-call
///   steps, not necessarily adjacent.
/// * Each **Unordered** expectation must match at least one tool-call step.
fn check_tool_sequence(
    sequence: &[agentcarousel_core::ToolCallExpectation],
    trace: &ExecutionTrace,
) -> Vec<String> {
    let actual_steps: Vec<&agentcarousel_core::TraceStep> = trace
        .steps
        .iter()
        .filter(|step| step.kind == StepKind::ToolCall)
        .collect();

    // Display names used in failure messages only. A tool-call step with no
    // tool name is kept (as a placeholder) so this list stays aligned with
    // `actual_steps`, which is what the checks below actually compare.
    let actual_tools: Vec<String> = actual_steps
        .iter()
        .map(|step| {
            step.tool
                .clone()
                .unwrap_or_else(|| "<unnamed>".to_string())
        })
        .collect();

    let mut failures = Vec::new();
    let strict_expected: Vec<&agentcarousel_core::ToolCallExpectation> = sequence
        .iter()
        .filter(|expectation| expectation.order == ToolOrder::Strict)
        .collect();

    if !strict_expected.is_empty() {
        // The length check must use the same list that is zipped below;
        // otherwise an extra nameless tool-call step would go unnoticed
        // because `zip` stops at the shorter side.
        let strict_ok = actual_steps.len() == strict_expected.len()
            && strict_expected
                .iter()
                .zip(actual_steps.iter())
                .all(|(expected, actual)| tool_matches(expected, actual));
        if !strict_ok {
            failures.push(format!(
                "tool sequence mismatch: expected {:?}, got {:?}",
                strict_expected
                    .iter()
                    .map(|expectation| expectation.tool.clone())
                    .collect::<Vec<_>>(),
                actual_tools
            ));
        }
        // Strict expectations take precedence over the other orders.
        return failures;
    }

    let subsequence_expected: Vec<&agentcarousel_core::ToolCallExpectation> = sequence
        .iter()
        .filter(|expectation| expectation.order == ToolOrder::Subsequence)
        .collect();
    if !subsequence_expected.is_empty() && !is_subsequence(&subsequence_expected, &actual_steps) {
        failures.push(format!(
            "tool subsequence mismatch: expected {:?} within {:?}",
            subsequence_expected
                .iter()
                .map(|expectation| expectation.tool.clone())
                .collect::<Vec<_>>(),
            actual_tools
        ));
    }

    // Unordered expectations are independent: each needs at least one match.
    let missing_unordered = sequence
        .iter()
        .filter(|expectation| expectation.order == ToolOrder::Unordered)
        .filter(|expected| {
            !actual_steps
                .iter()
                .any(|actual| tool_matches(expected, actual))
        })
        .map(|expected| format!("missing unordered tool call: {}", expected.tool));
    failures.extend(missing_unordered);

    failures
}

/// Returns `true` when every expectation in `expected` is matched, in order,
/// by some step in `actual` (steps in between may be skipped).
///
/// Matching is greedy: each expectation consumes steps up to and including
/// the first one that satisfies it. An empty `expected` list always matches.
fn is_subsequence(
    expected: &[&agentcarousel_core::ToolCallExpectation],
    actual: &[&agentcarousel_core::TraceStep],
) -> bool {
    // A single shared cursor over the steps ensures later expectations can
    // only match steps that come after the ones already consumed.
    let mut remaining_steps = actual.iter();
    expected
        .iter()
        .all(|expectation| remaining_steps.any(|step| tool_matches(expectation, step)))
}

/// Tells whether the trace step `actual` satisfies the expectation `expected`.
///
/// The tool names must be equal. If the expectation declares `args_match`,
/// the step must also carry arguments of which `args_match` is a subset (see
/// [`is_subset`]); a step without arguments never satisfies an `args_match`.
fn tool_matches(
    expected: &agentcarousel_core::ToolCallExpectation,
    actual: &agentcarousel_core::TraceStep,
) -> bool {
    if actual.tool.as_deref() != Some(&expected.tool) {
        return false;
    }

    match expected.args_match.as_ref() {
        // No argument constraint: the name match alone is enough.
        None => true,
        Some(wanted_args) => actual
            .args
            .as_ref()
            .map_or(false, |given_args| is_subset(wanted_args, given_args)),
    }
}

/// Checks whether the JSON value `expected` is contained in `actual`.
///
/// For two objects, every key of `expected` must exist in `actual` with a
/// value that is itself a subset (checked recursively); extra keys in
/// `actual` are allowed. Any other pairing, arrays included, must be
/// exactly equal.
fn is_subset(expected: &Value, actual: &Value) -> bool {
    if let (Value::Object(wanted), Value::Object(given)) = (expected, actual) {
        return wanted.iter().all(|(key, wanted_value)| {
            given
                .get(key)
                .map_or(false, |given_value| is_subset(wanted_value, given_value))
        });
    }

    // Arrays and scalars (and mismatched kinds) fall back to plain equality.
    expected == actual
}

/// Builds one [`RubricScore`] per rubric item declared on `case`.
///
/// Items with an `auto_check` score `1.0` when the check passes against
/// `trace` and `0.0` (with the check's error as rationale) when it fails.
/// Items without an `auto_check` score `0.0` with a rationale noting that a
/// judge or manual review is required. Returns an empty list when the case
/// has no rubric.
fn score_rubric(case: &Case, trace: &ExecutionTrace) -> Vec<RubricScore> {
    match case.expected.rubric.as_ref() {
        None => Vec::new(),
        Some(rubric) => rubric
            .iter()
            .map(|item| {
                let (score, rationale) = match item.auto_check.as_ref() {
                    None => (0.0, Some("requires judge or manual review".to_string())),
                    Some(auto_check) => match check_output(auto_check, trace) {
                        Ok(()) => (1.0, None),
                        Err(err) => (0.0, Some(err)),
                    },
                };
                RubricScore {
                    rubric_id: item.id.clone(),
                    score,
                    weight: item.weight,
                    rationale,
                }
            })
            .collect(),
    }
}

/// Weight-normalised mean of the rubric scores.
///
/// Returns `0.0` when the summed weight is at most `f32::EPSILON`, which
/// covers an empty slice and avoids dividing by a (near-)zero total.
fn weighted_average(scores: &[RubricScore]) -> f32 {
    let weight_total = scores.iter().map(|entry| entry.weight).sum::<f32>();

    if weight_total <= f32::EPSILON {
        0.0
    } else {
        let weighted_total = scores
            .iter()
            .map(|entry| entry.score * entry.weight)
            .sum::<f32>();
        weighted_total / weight_total
    }
}