use crate::criteria::EvaluationCriteria;
use crate::error::Result;
use crate::llm_judge::LlmJudge;
use crate::report::{EvaluationReport, EvaluationResult, Failure, TurnResult};
use crate::schema::{EvalCase, TestFile, ToolUse, Turn};
use crate::scoring::{ResponseScorer, ToolTrajectoryScorer};
use adk_core::{Agent, Content, Event, Llm};
use async_trait::async_trait;
use futures::StreamExt;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;
use std::path::Path;
use std::sync::Arc;
use std::time::{Duration, Instant};
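/// Configuration for an evaluation run: which criteria to score, whether to
/// keep going after a failing case, and how much per-turn detail to collect.
///
/// A minimal construction sketch (it mirrors the unit tests at the bottom of
/// this file; the builder methods come from [`EvaluationCriteria`]):
///
/// ```ignore
/// let config = EvaluationConfig::with_criteria(
///     EvaluationCriteria::exact_tools().with_response_similarity(0.8),
/// );
/// ```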
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct EvaluationConfig {
    #[serde(default)]
    pub criteria: EvaluationCriteria,
    #[serde(default)]
    pub continue_on_failure: bool,
    #[serde(default)]
    pub timeout_per_case: Option<Duration>,
    #[serde(default)]
    pub retries: usize,
    #[serde(default = "default_true")]
    pub collect_turn_details: bool,
}
fn default_true() -> bool {
    true
}
impl EvaluationConfig {
    pub fn with_criteria(criteria: EvaluationCriteria) -> Self {
        Self { criteria, ..Default::default() }
    }
}
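/// Drives an [`Agent`] through recorded eval cases and scores the results.
///
/// Deterministic criteria (tool trajectory, response similarity) are always
/// available; LLM-judged criteria (semantic match, rubric quality, safety,
/// hallucination) require a judge model attached via [`Evaluator::with_llm_judge`]
/// or [`Evaluator::set_llm_judge`] and are skipped otherwise.
///
/// A usage sketch, assuming `my_agent` is an `Arc<dyn Agent>` you have built
/// elsewhere and the path points at a real test file (both hypothetical):
///
/// ```ignore
/// let evaluator = Evaluator::new(EvaluationConfig::default());
/// let report = evaluator.evaluate_file(my_agent, "cases/greeting.test.json").await?;
/// ```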
pub struct Evaluator {
    config: EvaluationConfig,
    tool_scorer: ToolTrajectoryScorer,
    response_scorer: ResponseScorer,
    llm_judge: Option<LlmJudge>,
}
impl Evaluator {
    pub fn new(config: EvaluationConfig) -> Self {
        let tool_scorer = if let Some(tc) = &config.criteria.tool_trajectory_config {
            ToolTrajectoryScorer::with_config(tc.clone())
        } else {
            ToolTrajectoryScorer::new()
        };
        let response_scorer = if let Some(rc) = &config.criteria.response_match_config {
            ResponseScorer::with_config(rc.clone())
        } else {
            ResponseScorer::new()
        };
        Self { config, tool_scorer, response_scorer, llm_judge: None }
    }

    pub fn with_llm_judge(config: EvaluationConfig, judge_model: Arc<dyn Llm>) -> Self {
        // Reuse `new` for scorer construction instead of duplicating it.
        let mut evaluator = Self::new(config);
        evaluator.llm_judge = Some(LlmJudge::new(judge_model));
        evaluator
    }

    pub fn set_llm_judge(&mut self, judge_model: Arc<dyn Llm>) {
        self.llm_judge = Some(LlmJudge::new(judge_model));
    }

    pub fn has_llm_judge(&self) -> bool {
        self.llm_judge.is_some()
    }
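    /// Loads the test file at `path` and evaluates every case in it.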
    pub async fn evaluate_file(
        &self,
        agent: Arc<dyn Agent>,
        path: impl AsRef<Path>,
    ) -> Result<EvaluationReport> {
        let test_file = TestFile::load(path)?;
        self.evaluate_test_file(agent, &test_file).await
    }
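    /// Evaluates every case in an already-loaded [`TestFile`]. Execution errors
    /// are recorded as failed results rather than propagated, and the run stops
    /// at the first failure unless `continue_on_failure` is set.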
    pub async fn evaluate_test_file(
        &self,
        agent: Arc<dyn Agent>,
        test_file: &TestFile,
    ) -> Result<EvaluationReport> {
        let started_at = chrono::Utc::now();
        let run_id = format!("{}_{}", test_file.eval_set_id, uuid::Uuid::new_v4());
        let mut results = Vec::new();
        for eval_case in &test_file.eval_cases {
            let result = self.evaluate_case(agent.clone(), eval_case).await;
            match result {
                Ok(r) => {
                    let passed = r.passed;
                    results.push(r);
                    if !passed && !self.config.continue_on_failure {
                        break;
                    }
                }
                Err(e) => {
                    results.push(EvaluationResult::failed(
                        &eval_case.eval_id,
                        HashMap::new(),
                        vec![Failure::new(
                            "execution",
                            Value::Null,
                            Value::String(e.to_string()),
                            0.0,
                            1.0,
                        )],
                        Duration::from_secs(0),
                    ));
                    if !self.config.continue_on_failure {
                        break;
                    }
                }
            }
        }
        Ok(EvaluationReport::new(&run_id, results, started_at))
    }
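    /// Runs a single case: executes each conversation turn, scores it, and
    /// aggregates per-criterion scores as a mean across turns.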
    pub async fn evaluate_case(
        &self,
        agent: Arc<dyn Agent>,
        eval_case: &EvalCase,
    ) -> Result<EvaluationResult> {
        let start = Instant::now();
        // Accumulate (sum, count) per criterion so multi-turn cases report a
        // true mean; a running pairwise average would overweight later turns.
        let mut score_sums: HashMap<String, (f64, usize)> = HashMap::new();
        let mut all_failures: Vec<Failure> = Vec::new();
        let mut turn_results: Vec<TurnResult> = Vec::new();
        for turn in &eval_case.conversation {
            let turn_result = self.execute_turn(agent.clone(), turn).await?;
            let (scores, failures) = self.score_turn(turn, &turn_result).await;
            for (criterion, score) in &scores {
                let entry = score_sums.entry(criterion.clone()).or_insert((0.0, 0));
                entry.0 += *score;
                entry.1 += 1;
            }
            all_failures.extend(failures);
            if self.config.collect_turn_details {
                turn_results.push(turn_result);
            }
        }
        let all_scores: HashMap<String, f64> = score_sums
            .into_iter()
            .map(|(criterion, (sum, count))| (criterion, sum / count as f64))
            .collect();
        let duration = start.elapsed();
        let passed = all_failures.is_empty();
        let mut result = if passed {
            EvaluationResult::passed(&eval_case.eval_id, all_scores, duration)
        } else {
            EvaluationResult::failed(&eval_case.eval_id, all_scores, all_failures, duration)
        };
        if self.config.collect_turn_details {
            result = result.with_turn_results(turn_results);
        }
        Ok(result)
    }
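    /// Executes one conversation turn and pairs the agent's actual response and
    /// tool calls with the expectations recorded in the test file.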
    async fn execute_turn(&self, agent: Arc<dyn Agent>, turn: &Turn) -> Result<TurnResult> {
        let input_content = turn.user_content.to_adk_content();
        let events = self.run_agent(agent, input_content).await?;
        let (actual_response, actual_tool_calls) = self.extract_from_events(&events);
        let expected_response = turn.final_response.as_ref().map(|c| c.get_text());
        let expected_tool_calls =
            turn.intermediate_data.as_ref().map(|d| d.tool_uses.clone()).unwrap_or_default();
        Ok(TurnResult {
            invocation_id: turn.invocation_id.clone(),
            actual_response,
            expected_response,
            actual_tool_calls,
            expected_tool_calls,
            scores: HashMap::new(),
        })
    }
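    /// Runs the agent once with a fresh eval invocation context. Note that
    /// `Err` events are dropped from the stream; only `Ok` events are collected.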
    async fn run_agent(&self, agent: Arc<dyn Agent>, input: Content) -> Result<Vec<Event>> {
        let invocation_id = uuid::Uuid::new_v4().to_string();
        let ctx = Arc::new(EvalInvocationContext::new(invocation_id, input, agent.clone()));
        let stream = agent.run(ctx).await.map_err(|e| {
            crate::error::EvalError::ExecutionError(format!("Agent run failed: {}", e))
        })?;
        let events: Vec<Event> = stream.filter_map(|r| async { r.ok() }).collect().await;
        Ok(events)
    }
    fn extract_from_events(&self, events: &[Event]) -> (Option<String>, Vec<ToolUse>) {
        let mut response_text = String::new();
        let mut tool_calls = Vec::new();
        for event in events {
            if let Some(content) = event.content() {
                for part in &content.parts {
                    if let Some(text) = part.text() {
                        response_text.push_str(text);
                    }
                    if let adk_core::Part::FunctionCall { name, args, .. } = part {
                        tool_calls.push(ToolUse {
                            name: name.clone(),
                            args: args.clone(),
                            expected_response: None,
                        });
                    }
                }
            }
        }
        let response = if response_text.is_empty() { None } else { Some(response_text) };
        (response, tool_calls)
    }
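    /// Scores one turn against every criterion enabled in the config, returning
    /// per-criterion scores plus a [`Failure`] for each score below its threshold.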
    async fn score_turn(
        &self,
        turn: &Turn,
        result: &TurnResult,
    ) -> (HashMap<String, f64>, Vec<Failure>) {
        let mut scores = HashMap::new();
        let mut failures = Vec::new();
        if let Some(threshold) = self.config.criteria.tool_trajectory_score {
            let score =
                self.tool_scorer.score(&result.expected_tool_calls, &result.actual_tool_calls);
            scores.insert("tool_trajectory".to_string(), score);
            if score < threshold {
                failures.push(
                    Failure::new(
                        "tool_trajectory",
                        serde_json::to_value(&result.expected_tool_calls).unwrap_or_default(),
                        serde_json::to_value(&result.actual_tool_calls).unwrap_or_default(),
                        score,
                        threshold,
                    )
                    .with_details(&format!(
                        "Expected {} tool calls, got {}",
                        result.expected_tool_calls.len(),
                        result.actual_tool_calls.len()
                    )),
                );
            }
        }
        if let Some(threshold) = self.config.criteria.response_similarity {
            if let (Some(expected), Some(actual)) =
                (&result.expected_response, &result.actual_response)
            {
                let score = self.response_scorer.score(expected, actual);
                scores.insert("response_similarity".to_string(), score);
                if score < threshold {
                    failures.push(
                        Failure::new(
                            "response_similarity",
                            Value::String(expected.clone()),
                            Value::String(actual.clone()),
                            score,
                            threshold,
                        )
                        .with_details("Response text differs from expected"),
                    );
                }
            } else if result.expected_response.is_some() && result.actual_response.is_none() {
                scores.insert("response_similarity".to_string(), 0.0);
                failures.push(
                    Failure::new(
                        "response_similarity",
                        Value::String(result.expected_response.clone().unwrap_or_default()),
                        Value::Null,
                        0.0,
                        threshold,
                    )
                    .with_details("No response received"),
                );
            }
        }
        if let Some(threshold) = self.config.criteria.semantic_match_score {
            if let Some(judge) = &self.llm_judge {
                if let (Some(expected), Some(actual)) =
                    (&result.expected_response, &result.actual_response)
                {
                    match judge
                        .semantic_match(
                            expected,
                            actual,
                            self.config.criteria.semantic_match_config.as_ref(),
                        )
                        .await
                    {
                        Ok(semantic_result) => {
                            scores.insert("semantic_match".to_string(), semantic_result.score);
                            if semantic_result.score < threshold {
                                failures.push(
                                    Failure::new(
                                        "semantic_match",
                                        Value::String(expected.clone()),
                                        Value::String(actual.clone()),
                                        semantic_result.score,
                                        threshold,
                                    )
                                    .with_details(&semantic_result.reasoning),
                                );
                            }
                        }
                        Err(e) => {
                            failures.push(
                                Failure::new(
                                    "semantic_match",
                                    Value::String(expected.clone()),
                                    Value::String(actual.clone()),
                                    0.0,
                                    threshold,
                                )
                                .with_details(&format!("LLM judge error: {}", e)),
                            );
                        }
                    }
                }
            }
        }
        if let Some(threshold) = self.config.criteria.rubric_quality_score {
            if let Some(judge) = &self.llm_judge {
                if let Some(rubric_config) = &self.config.criteria.rubric_config {
                    if let Some(actual) = &result.actual_response {
                        let context = turn.user_content.get_text();
                        match judge.evaluate_rubrics(actual, &context, rubric_config).await {
                            Ok(rubric_result) => {
                                scores.insert(
                                    "rubric_quality".to_string(),
                                    rubric_result.overall_score,
                                );
                                for rs in &rubric_result.rubric_scores {
                                    scores.insert(format!("rubric_{}", rs.name), rs.score);
                                }
                                if rubric_result.overall_score < threshold {
                                    let details = rubric_result
                                        .rubric_scores
                                        .iter()
                                        .map(|rs| {
                                            format!(
                                                "{}: {:.2} - {}",
                                                rs.name, rs.score, rs.reasoning
                                            )
                                        })
                                        .collect::<Vec<_>>()
                                        .join("; ");
                                    failures.push(
                                        Failure::new(
                                            "rubric_quality",
                                            Value::Number(
                                                serde_json::Number::from_f64(threshold)
                                                    .unwrap_or(serde_json::Number::from(0)),
                                            ),
                                            Value::Number(
                                                serde_json::Number::from_f64(
                                                    rubric_result.overall_score,
                                                )
                                                .unwrap_or(serde_json::Number::from(0)),
                                            ),
                                            rubric_result.overall_score,
                                            threshold,
                                        )
                                        .with_details(&details),
                                    );
                                }
                            }
                            Err(e) => {
                                failures.push(
                                    Failure::new(
                                        "rubric_quality",
                                        Value::Null,
                                        Value::Null,
                                        0.0,
                                        threshold,
                                    )
                                    .with_details(&format!("LLM judge error: {}", e)),
                                );
                            }
                        }
                    }
                }
            }
        }
        if let Some(threshold) = self.config.criteria.safety_score {
            if let Some(judge) = &self.llm_judge {
                if let Some(actual) = &result.actual_response {
                    match judge.evaluate_safety(actual).await {
                        Ok(safety_result) => {
                            scores.insert("safety".to_string(), safety_result.score);
                            if safety_result.score < threshold {
                                failures.push(
                                    Failure::new(
                                        "safety",
                                        Value::Number(
                                            serde_json::Number::from_f64(threshold)
                                                .unwrap_or(serde_json::Number::from(0)),
                                        ),
                                        Value::Number(
                                            serde_json::Number::from_f64(safety_result.score)
                                                .unwrap_or(serde_json::Number::from(0)),
                                        ),
                                        safety_result.score,
                                        threshold,
                                    )
                                    .with_details(&format!(
                                        "Safety issues: {}",
                                        safety_result.issues.join(", ")
                                    )),
                                );
                            }
                        }
                        Err(e) => {
                            failures.push(
                                Failure::new("safety", Value::Null, Value::Null, 0.0, threshold)
                                    .with_details(&format!("LLM judge error: {}", e)),
                            );
                        }
                    }
                }
            }
        }
        if let Some(threshold) = self.config.criteria.hallucination_score {
            if let Some(judge) = &self.llm_judge {
                if let Some(actual) = &result.actual_response {
                    let context = turn.user_content.get_text();
                    let ground_truth = result.expected_response.as_deref();
                    match judge.detect_hallucinations(actual, &context, ground_truth).await {
                        Ok(hallucination_result) => {
                            scores.insert("hallucination".to_string(), hallucination_result.score);
                            if hallucination_result.score < threshold {
                                failures.push(
                                    Failure::new(
                                        "hallucination",
                                        Value::Number(
                                            serde_json::Number::from_f64(threshold)
                                                .unwrap_or(serde_json::Number::from(0)),
                                        ),
                                        Value::Number(
                                            serde_json::Number::from_f64(
                                                hallucination_result.score,
                                            )
                                            .unwrap_or(serde_json::Number::from(0)),
                                        ),
                                        hallucination_result.score,
                                        threshold,
                                    )
                                    .with_details(&format!(
                                        "Hallucinations detected: {}",
                                        hallucination_result.issues.join(", ")
                                    )),
                                );
                            }
                        }
                        Err(e) => {
                            failures.push(
                                Failure::new(
                                    "hallucination",
                                    Value::Null,
                                    Value::Null,
                                    0.0,
                                    threshold,
                                )
                                .with_details(&format!("LLM judge error: {}", e)),
                            );
                        }
                    }
                }
            }
        }
        (scores, failures)
    }
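    /// Evaluates up to `concurrency` cases at a time. Because the underlying
    /// stream is `buffer_unordered`, results arrive in completion order, not
    /// in the order of `cases`.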
    pub async fn evaluate_cases_parallel(
        &self,
        agent: Arc<dyn Agent>,
        cases: &[EvalCase],
        concurrency: usize,
    ) -> Vec<Result<EvaluationResult>> {
        // `StreamExt` is already imported at module level; only `stream` is needed here.
        use futures::stream;

        stream::iter(cases)
            .map(|case| {
                let agent = agent.clone();
                async move { self.evaluate_case(agent, case).await }
            })
            .buffer_unordered(concurrency)
            .collect()
            .await
    }
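    /// Evaluates every `*.test.json` file in `dir` (non-recursive), producing
    /// one report per file.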
    pub async fn evaluate_directory(
        &self,
        agent: Arc<dyn Agent>,
        dir: impl AsRef<Path>,
    ) -> Result<Vec<EvaluationReport>> {
        let mut reports = Vec::new();
        for entry in std::fs::read_dir(dir)? {
            let path = entry?.path();
            // The ".test.json" suffix check subsumes the separate ".json" extension check.
            let is_test_file = path
                .file_name()
                .and_then(|n| n.to_str())
                .is_some_and(|name| name.ends_with(".test.json"));
            if is_test_file {
                reports.push(self.evaluate_file(agent.clone(), &path).await?);
            }
        }
        Ok(reports)
    }
}
impl Default for Evaluator {
    fn default() -> Self {
        Self::new(EvaluationConfig::default())
    }
}
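/// Minimal [`adk_core::InvocationContext`] used to drive agents during
/// evaluation, backed by a throwaway in-memory session.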
struct EvalInvocationContext {
    invocation_id: String,
    user_content: Content,
    agent: Arc<dyn Agent>,
    session: EvalSession,
    run_config: adk_core::RunConfig,
    ended: std::sync::atomic::AtomicBool,
}
impl EvalInvocationContext {
    fn new(invocation_id: String, user_content: Content, agent: Arc<dyn Agent>) -> Self {
        let session_id = format!("eval-session-{}", uuid::Uuid::new_v4());
        Self {
            invocation_id,
            user_content,
            agent,
            session: EvalSession::new(session_id),
            run_config: adk_core::RunConfig::default(),
            ended: std::sync::atomic::AtomicBool::new(false),
        }
    }
}
impl adk_core::ReadonlyContext for EvalInvocationContext {
    fn invocation_id(&self) -> &str {
        &self.invocation_id
    }

    fn agent_name(&self) -> &str {
        self.agent.name()
    }

    fn user_id(&self) -> &str {
        "eval_user"
    }

    fn app_name(&self) -> &str {
        "eval_app"
    }

    fn session_id(&self) -> &str {
        &self.session.id
    }

    fn branch(&self) -> &str {
        "main"
    }

    fn user_content(&self) -> &Content {
        &self.user_content
    }
}
#[async_trait]
impl adk_core::CallbackContext for EvalInvocationContext {
    fn artifacts(&self) -> Option<Arc<dyn adk_core::Artifacts>> {
        None
    }
}
#[async_trait]
impl adk_core::InvocationContext for EvalInvocationContext {
    fn agent(&self) -> Arc<dyn Agent> {
        self.agent.clone()
    }

    fn memory(&self) -> Option<Arc<dyn adk_core::Memory>> {
        None
    }

    fn session(&self) -> &dyn adk_core::Session {
        &self.session
    }

    fn run_config(&self) -> &adk_core::RunConfig {
        &self.run_config
    }

    fn end_invocation(&self) {
        self.ended.store(true, std::sync::atomic::Ordering::SeqCst);
    }

    fn ended(&self) -> bool {
        self.ended.load(std::sync::atomic::Ordering::SeqCst)
    }
}
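/// In-memory session stub; evaluation runs never persist conversation history.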
struct EvalSession {
    id: String,
    state: EvalState,
}

impl EvalSession {
    fn new(id: String) -> Self {
        Self { id, state: EvalState::new() }
    }
}
impl adk_core::Session for EvalSession {
    fn id(&self) -> &str {
        &self.id
    }

    fn app_name(&self) -> &str {
        "eval_app"
    }

    fn user_id(&self) -> &str {
        "eval_user"
    }

    fn state(&self) -> &dyn adk_core::State {
        &self.state
    }

    fn conversation_history(&self) -> Vec<Content> {
        vec![]
    }
}
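/// Thread-safe in-memory state map guarded by an `RwLock`.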
struct EvalState {
    data: std::sync::RwLock<HashMap<String, serde_json::Value>>,
}

impl EvalState {
    fn new() -> Self {
        Self { data: std::sync::RwLock::new(HashMap::new()) }
    }
}
impl adk_core::State for EvalState {
    fn get(&self, key: &str) -> Option<serde_json::Value> {
        self.data.read().ok()?.get(key).cloned()
    }

    fn set(&mut self, key: String, value: serde_json::Value) {
        if let Err(msg) = adk_core::validate_state_key(&key) {
            tracing::warn!(key = %key, "rejecting invalid state key: {msg}");
            return;
        }
        if let Ok(mut data) = self.data.write() {
            data.insert(key, value);
        }
    }

    fn all(&self) -> HashMap<String, serde_json::Value> {
        self.data.read().ok().map(|d| d.clone()).unwrap_or_default()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_evaluator_creation() {
        let config = EvaluationConfig::with_criteria(
            EvaluationCriteria::exact_tools().with_response_similarity(0.8),
        );
        let evaluator = Evaluator::new(config);
        assert!(evaluator.config.criteria.tool_trajectory_score.is_some());
        assert!(evaluator.config.criteria.response_similarity.is_some());
    }

    #[tokio::test]
    async fn test_turn_scoring() {
        let config = EvaluationConfig::with_criteria(EvaluationCriteria {
            tool_trajectory_score: Some(1.0),
            response_similarity: Some(0.8),
            ..Default::default()
        });
        let evaluator = Evaluator::new(config);
        let turn = Turn {
            invocation_id: "test".to_string(),
            user_content: crate::schema::ContentData::text("Hello"),
            final_response: Some(crate::schema::ContentData::model_response("Hi there!")),
            intermediate_data: Some(crate::schema::IntermediateData {
                tool_uses: vec![ToolUse::new("greet")],
                ..Default::default()
            }),
        };
        let result = TurnResult {
            invocation_id: "test".to_string(),
            actual_response: Some("Hi there!".to_string()),
            expected_response: Some("Hi there!".to_string()),
            actual_tool_calls: vec![ToolUse::new("greet")],
            expected_tool_calls: vec![ToolUse::new("greet")],
            scores: HashMap::new(),
        };
        let (scores, failures) = evaluator.score_turn(&turn, &result).await;
        assert!(failures.is_empty());
        assert_eq!(scores.get("tool_trajectory"), Some(&1.0));
        assert_eq!(scores.get("response_similarity"), Some(&1.0));
    }
}