use tokio_util::sync::CancellationToken;
use tracing::{info, warn};
use swink_agent::{
Agent, AssistantMessage, ContentBlock, Cost, ModelSpec, StopReason, Usage, UserMessage,
};
use crate::error::EvalError;
use crate::evaluator::EvaluatorRegistry;
use crate::score::{Score, Verdict};
use crate::trajectory::{BudgetGuard, TrajectoryCollector};
use crate::types::{
EvalCase, EvalCaseResult, EvalMetricResult, EvalSet, EvalSetResult, EvalSummary, Invocation,
TurnRecord,
};
/// Builds the agent used to execute an eval case.
///
/// Implementations must be `Send + Sync`. [`EvalRunner::run_case`] calls
/// [`AgentFactory::create_agent`] once per case.
pub trait AgentFactory: Send + Sync {
/// Creates the [`Agent`] for `case` together with a [`CancellationToken`].
///
/// The runner hands the returned token to `BudgetGuard::from_case`.
/// NOTE(review): presumably the token must be the one the agent observes, so
/// that the guard can stop the run when a budget is exceeded — confirm.
///
/// # Errors
///
/// Returns an [`EvalError`] when the agent cannot be constructed.
fn create_agent(&self, case: &EvalCase) -> Result<(Agent, CancellationToken), EvalError>;
}
/// Runs eval cases through an agent and scores the resulting invocations
/// with an [`EvaluatorRegistry`].
pub struct EvalRunner {
/// Evaluators applied to every collected invocation.
registry: EvaluatorRegistry,
}
impl EvalRunner {
#[must_use]
pub const fn new(registry: EvaluatorRegistry) -> Self {
Self { registry }
}
#[must_use]
pub fn with_defaults() -> Self {
Self::new(EvaluatorRegistry::with_defaults())
}
pub async fn run_case(
&self,
case: &EvalCase,
factory: &dyn AgentFactory,
) -> Result<EvalCaseResult, EvalError> {
info!(case_id = %case.id, case_name = %case.name, "running eval case");
let (mut agent, cancel) = factory.create_agent(case)?;
let messages: Vec<_> = case
.user_messages
.iter()
.map(|text| {
swink_agent::AgentMessage::Llm(swink_agent::LlmMessage::User(UserMessage {
content: vec![ContentBlock::Text { text: text.clone() }],
timestamp: swink_agent::now_timestamp(),
cache_hint: None,
}))
})
.collect();
let stream = agent.prompt_stream(messages)?;
let guard = BudgetGuard::from_case(case, cancel);
let invocation = TrajectoryCollector::collect_with_guard(stream, guard).await;
let metric_results = self.registry.evaluate(case, &invocation);
let verdict = if metric_results.iter().all(|r| r.score.verdict().is_pass()) {
Verdict::Pass
} else {
Verdict::Fail
};
info!(
case_id = %case.id,
verdict = ?verdict,
metrics = metric_results.len(),
"eval case complete"
);
Ok(EvalCaseResult {
case_id: case.id.clone(),
invocation,
metric_results,
verdict,
})
}
pub async fn run_set(
&self,
eval_set: &EvalSet,
factory: &dyn AgentFactory,
) -> Result<EvalSetResult, EvalError> {
info!(
set_id = %eval_set.id,
cases = eval_set.cases.len(),
"running eval set"
);
let mut case_results = Vec::with_capacity(eval_set.cases.len());
let mut total_cost = Cost::default();
let mut total_usage = Usage::default();
let mut total_duration = std::time::Duration::ZERO;
let mut passed = 0usize;
let mut failed = 0usize;
for case in &eval_set.cases {
match self.run_case(case, factory).await {
Ok(result) => {
total_cost += result.invocation.total_cost.clone();
total_usage += result.invocation.total_usage.clone();
total_duration += result.invocation.total_duration;
if result.verdict.is_pass() {
passed += 1;
} else {
failed += 1;
}
case_results.push(result);
}
Err(e) => {
warn!(case_id = %case.id, error = %e, "eval case failed with error — recording failure and continuing");
failed += 1;
case_results.push(EvalCaseResult {
case_id: case.id.clone(),
invocation: Invocation {
turns: vec![TurnRecord {
turn_index: 0,
assistant_message: AssistantMessage {
content: vec![],
provider: String::new(),
model_id: String::new(),
usage: Usage::default(),
cost: Cost::default(),
stop_reason: StopReason::Error,
error_message: Some(e.to_string()),
error_kind: None,
timestamp: swink_agent::now_timestamp(),
cache_hint: None,
},
tool_calls: vec![],
tool_results: vec![],
duration: std::time::Duration::ZERO,
}],
total_usage: Usage::default(),
total_cost: Cost::default(),
total_duration: std::time::Duration::ZERO,
final_response: None,
stop_reason: StopReason::Error,
model: ModelSpec::new("unknown", "unknown"),
},
metric_results: vec![EvalMetricResult {
evaluator_name: "error".to_string(),
score: Score::fail(),
details: Some(e.to_string()),
}],
verdict: Verdict::Fail,
});
}
}
}
let summary = EvalSummary {
total_cases: eval_set.cases.len(),
passed,
failed,
total_cost,
total_usage,
total_duration,
};
info!(
set_id = %eval_set.id,
passed = summary.passed,
failed = summary.failed,
total = summary.total_cases,
"eval set complete"
);
Ok(EvalSetResult {
eval_set_id: eval_set.id.clone(),
case_results,
summary,
timestamp: swink_agent::now_timestamp(),
})
}
}