use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex, MutexGuard, OnceLock};
use std::time::Instant;
use ai_agents_observability::ObservabilityConfig;
use ai_agents_observability::config::ExportFormat;
use ai_agents_runtime::spec::{AgentSpec, LLMConfigOrSelector, StorageConfig};
use ai_agents_runtime::{Agent, AgentBuilder, RuntimeAgent, StreamChunk};
use futures::{StreamExt, stream};
use serde_json::{Value, json};
use tokio::time::{Duration, timeout};
use crate::assertion::{AssertionEvalContext, AssertionOutcome, evaluate_assertion};
use crate::compatibility::suite_from_jsonl;
use crate::evidence::{collect_turn_evidence, relationship_snapshot};
use crate::fixtures::{
LlmFixtureMode, RecordingToolLog, build_llm_registry, build_tool_registry,
resolve_fixture_context, start_mock_server,
};
use crate::judge::{JudgeConfig, JudgeResolver};
use crate::metrics::compute_metrics;
use crate::redaction::{redact_text, redact_value};
use crate::suite::{
AttemptResult, EvalResult, EvalSuite, FailureCategory, IsolationMode, ResetStepConfig,
Scenario, ScenarioResult, ScenarioStatus, ScenarioStep, Turn, TurnResult,
};
use crate::{EvalError, Result};
/// Caller-supplied overrides layered on top of a loaded evaluation suite.
///
/// `None` values, `false` flags and empty lists leave the corresponding suite
/// setting (or scenario selection) untouched.
#[derive(Debug, Clone, Default)]
pub struct EvalRunnerOptions {
/// Agent spec path. When set it replaces the suite's `agent` entry and is
/// used exactly as given (it is not joined onto the suite directory).
pub agent: Option<PathBuf>,
/// Not read anywhere in this file.
/// NOTE(review): presumably a scenarios-file override consumed by the caller - confirm.
pub scenarios: Option<PathBuf>,
/// Output directory. In this file it is only used to derive the default
/// observability export path (`<output>/observability`).
pub output: PathBuf,
/// Scenario ids to run; empty means "do not filter by id".
pub ids: Vec<String>,
/// Tags to select scenarios by; empty means "do not filter by tag".
pub tags: Vec<String>,
/// When `true` a scenario must carry every tag in `tags`; otherwise any one suffices.
pub tag_mode_all: bool,
/// Languages to select scenarios by; scenarios without a language are
/// excluded whenever this list is non-empty.
pub languages: Vec<String>,
/// Overrides the suite's `settings.retries`.
pub retries: Option<u32>,
/// Overrides the suite's `settings.timeout_per_turn_ms`.
pub timeout_ms: Option<u64>,
/// Overrides concurrency: `settings.parallel` becomes `parallel > 1` and
/// `settings.max_concurrent` becomes `parallel.max(1)`.
pub parallel: Option<usize>,
/// When `true`, forces `settings.fail_fast` on; `false` keeps the suite value.
pub fail_fast: bool,
/// When `true` and the suite has no observability section, a default
/// JSON-exporting observability config is generated.
pub observability: bool,
/// Overrides the suite's LLM fixture mode.
pub llm_mode: Option<LlmFixtureMode>,
/// Overrides the suite's LLM fixture cassette path.
pub cassette: Option<PathBuf>,
}
/// Executes the scenarios of one evaluation suite against an agent spec.
///
/// Built with `EvalRunner::from_file`, which loads the suite and merges the
/// supplied options into it.
pub struct EvalRunner {
// Path the suite was loaded from; its parent directory is the base for
// resolving relative paths at run time.
suite_path: PathBuf,
// The loaded suite with option overrides already applied.
suite: EvalSuite,
// The original options, still consulted for filters, the agent path and
// the default observability configuration.
options: EvalRunnerOptions,
}
impl EvalRunner {
/// Loads an evaluation suite from `path` and merges `options` into it.
///
/// Files with a `.jsonl` extension go through the JSONL compatibility
/// loader (the suite is named after the file stem, falling back to
/// `"eval"`); every other file is parsed as a YAML `EvalSuite`. The merged
/// suite is validated before the runner is returned.
///
/// # Errors
/// Fails when the file cannot be read, cannot be parsed, or the merged
/// suite does not validate.
pub fn from_file(path: impl AsRef<Path>, options: EvalRunnerOptions) -> Result<Self> {
    let suite_path = path.as_ref().to_path_buf();
    let raw = std::fs::read_to_string(&suite_path)?;

    let is_jsonl = matches!(
        suite_path.extension().and_then(|ext| ext.to_str()),
        Some("jsonl")
    );
    let mut suite = if is_jsonl {
        let suite_name = suite_path
            .file_stem()
            .and_then(|stem| stem.to_str())
            .unwrap_or("eval")
            .to_string();
        suite_from_jsonl(suite_name, &raw)?
    } else {
        serde_yaml::from_str::<EvalSuite>(&raw)?
    };

    // An explicit agent path always wins over the one named in the suite.
    if options.agent.is_some() {
        suite.agent = options.agent.clone();
    }

    let settings = &mut suite.settings;
    if let Some(retries) = options.retries {
        settings.retries = retries;
    }
    if let Some(timeout_ms) = options.timeout_ms {
        settings.timeout_per_turn_ms = timeout_ms;
    }
    if let Some(parallel) = options.parallel {
        settings.parallel = parallel > 1;
        settings.max_concurrent = parallel.max(1);
    }
    // The flag can only switch fail-fast on, never off.
    settings.fail_fast |= options.fail_fast;

    let llm_fixture = &mut suite.fixtures.llm;
    if let Some(mode) = options.llm_mode {
        llm_fixture.mode = mode;
    }
    if let Some(cassette) = options.cassette.as_ref() {
        llm_fixture.cassette = Some(cassette.clone());
    }

    suite.validate(options.agent.as_ref())?;
    Ok(Self {
        suite_path,
        suite,
        options,
    })
}
/// Runs every selected scenario and aggregates the outcome into an `EvalResult`.
///
/// Scenarios run concurrently only when the suite enables `parallel` and
/// `fail_fast` is off; otherwise they run one after another.
///
/// # Errors
/// Fails when no agent path can be resolved. Per-scenario failures are
/// reported inside the result rather than as an `Err`.
pub async fn run(&self) -> Result<EvalResult> {
    let started = Instant::now();
    let base_dir = match self.suite_path.parent() {
        Some(parent) => parent,
        None => Path::new("."),
    };
    let agent_path = self.resolve_agent_path(base_dir)?;
    let selected = self.filtered_scenarios();

    let settings = &self.suite.settings;
    let run_concurrently = settings.parallel && !settings.fail_fast;
    let results = if run_concurrently {
        self.run_scenarios_parallel(&agent_path, base_dir, selected)
            .await
    } else {
        self.run_scenarios_serial(&agent_path, base_dir, selected)
            .await
    };

    // Tally all three buckets in a single pass over the results.
    let (mut passed, mut failed, mut skipped) = (0usize, 0usize, 0usize);
    for outcome in &results {
        if outcome.status.is_passed() {
            passed += 1;
        }
        if outcome.status.is_failed() || outcome.status.is_error() {
            failed += 1;
        }
        if matches!(outcome.status, ScenarioStatus::Skipped { .. }) {
            skipped += 1;
        }
    }

    let total = results.len();
    let metrics = compute_metrics(&results);
    let observability = final_observability_report(&results);
    Ok(EvalResult {
        schema_version: 1,
        suite: self.suite.name.clone(),
        agent: agent_path.display().to_string(),
        total,
        passed,
        failed,
        skipped,
        duration_ms: started.elapsed().as_millis() as u64,
        scenarios: results,
        metrics,
        observability,
    })
}
async fn run_scenarios_serial(
&self,
agent_path: &Path,
base_dir: &Path,
scenarios: Vec<&Scenario>,
) -> Vec<ScenarioResult> {
let mut results = Vec::new();
for scenario in scenarios {
let result = self.run_scenario(agent_path, base_dir, scenario).await;
match result {
Ok(result) => {
let stop = self.suite.settings.fail_fast
&& (result.status.is_failed() || result.status.is_error());
results.push(result);
if stop {
break;
}
}
Err(error) => {
results.push(error_result(scenario, error, FailureCategory::RuntimeError));
if self.suite.settings.fail_fast {
break;
}
}
}
}
results
}
async fn run_scenarios_parallel(
&self,
agent_path: &Path,
base_dir: &Path,
scenarios: Vec<&Scenario>,
) -> Vec<ScenarioResult> {
let max_concurrent = self.suite.settings.max_concurrent.max(1);
let mut indexed = stream::iter(scenarios.into_iter().enumerate())
.map(|(idx, scenario)| async move {
let result = self.run_scenario(agent_path, base_dir, scenario).await;
let result = result.unwrap_or_else(|error| {
error_result(scenario, error, FailureCategory::RuntimeError)
});
(idx, result)
})
.buffer_unordered(max_concurrent)
.collect::<Vec<_>>()
.await;
indexed.sort_by_key(|(idx, _)| *idx);
indexed.into_iter().map(|(_, result)| result).collect()
}
/// Determines which agent spec file to load.
///
/// An agent path given in the options is returned unchanged. Otherwise the
/// suite's agent path is used, joined onto `base_dir` when it is relative.
///
/// # Errors
/// Returns a configuration error when neither source names an agent.
fn resolve_agent_path(&self, base_dir: &Path) -> Result<PathBuf> {
    if let Some(from_options) = self.options.agent.as_ref() {
        return Ok(from_options.clone());
    }
    match self.suite.agent.clone() {
        None => Err(EvalError::Config(
            "agent path is required in suite or CLI".into(),
        )),
        Some(from_suite) if from_suite.is_absolute() => Ok(from_suite),
        Some(from_suite) => Ok(base_dir.join(from_suite)),
    }
}
/// Returns the suite's scenarios that satisfy the id, language and tag
/// filters from the options, in suite order.
///
/// An empty filter list disables that filter. With a language filter
/// active, scenarios that declare no language are dropped. Tags must all
/// match when `tag_mode_all` is set, otherwise a single match is enough.
fn filtered_scenarios(&self) -> Vec<&Scenario> {
    let wanted_ids: HashSet<_> = self.options.ids.iter().collect();
    let wanted_tags: HashSet<_> = self.options.tags.iter().collect();
    let wanted_languages: HashSet<_> = self.options.languages.iter().collect();
    let require_every_tag = self.options.tag_mode_all;

    let mut selected = Vec::new();
    for scenario in self.suite.scenarios.iter() {
        let id_matches = wanted_ids.is_empty() || wanted_ids.contains(&scenario.id);
        if !id_matches {
            continue;
        }

        let language_matches = wanted_languages.is_empty()
            || match &scenario.language {
                Some(language) => wanted_languages.contains(language),
                None => false,
            };
        if !language_matches {
            continue;
        }

        if !wanted_tags.is_empty() {
            let own_tags: HashSet<_> = scenario.tags.iter().collect();
            let tags_match = if require_every_tag {
                wanted_tags.iter().all(|tag| own_tags.contains(*tag))
            } else {
                wanted_tags.iter().any(|tag| own_tags.contains(*tag))
            };
            if !tags_match {
                continue;
            }
        }

        selected.push(scenario);
    }
    selected
}
/// Runs one scenario, retrying failed or errored attempts up to
/// `settings.retries` additional times.
///
/// * A skipped scenario returns immediately with a `Skipped` status and no attempts.
/// * Each attempt is optionally bounded by `settings.timeout_per_scenario_ms`;
///   a timeout is recorded as an errored attempt.
/// * The loop stops at the first passing attempt. A pass on any attempt
///   after the first is categorised as `FlakyPass` and sets `flaky`.
/// * Between attempts the runner sleeps for `settings.retry_delay_ms`.
///
/// Attempt-level errors never surface as `Err`; they are folded into the
/// returned `ScenarioResult`.
async fn run_scenario(
    &self,
    agent_path: &Path,
    base_dir: &Path,
    scenario: &Scenario,
) -> Result<ScenarioResult> {
    let start = Instant::now();
    if scenario.skip.is_skipped() {
        return Ok(ScenarioResult {
            id: scenario.id.clone(),
            name: scenario.name.clone(),
            tags: scenario.tags.clone(),
            language: scenario.language.clone(),
            status: ScenarioStatus::Skipped {
                reason: scenario.skip.reason(),
            },
            failure_category: None,
            flaky: false,
            attempts: Vec::new(),
            duration_ms: 0,
            retries_used: 0,
        });
    }
    let mut attempts = Vec::new();
    let mut final_status = ScenarioStatus::Failed {
        reason: "not run".to_string(),
    };
    let mut category = Some(FailureCategory::AssertionFailed);
    // Saturate instead of `+ 1`: with `retries == u32::MAX` a plain addition
    // panics in debug builds and wraps to zero attempts in release builds.
    let max_attempt = self.suite.settings.retries.saturating_add(1);
    for attempt_idx in 0..max_attempt {
        // Timed here so that attempts ending in an error or timeout still
        // report how long they actually ran.
        let attempt_start = Instant::now();
        let attempt_future = self.run_attempt(agent_path, base_dir, scenario, attempt_idx);
        let attempt = if let Some(timeout_ms) = self.suite.settings.timeout_per_scenario_ms {
            match timeout(Duration::from_millis(timeout_ms), attempt_future).await {
                Ok(result) => result,
                Err(_) => Err(EvalError::Runtime(format!(
                    "scenario '{}' attempt {} timed out after {}ms",
                    scenario.id, attempt_idx, timeout_ms
                ))),
            }
        } else {
            attempt_future.await
        };
        match attempt {
            Ok(attempt_result) => {
                final_status = attempt_result.status.clone();
                if final_status.is_passed() {
                    attempts.push(attempt_result);
                    category = if attempt_idx > 0 {
                        Some(FailureCategory::FlakyPass)
                    } else {
                        None
                    };
                    break;
                }
                category = Some(if final_status.is_error() {
                    FailureCategory::RuntimeError
                } else {
                    failure_category_for_attempt(&attempt_result)
                });
                attempts.push(attempt_result);
            }
            Err(error) => {
                final_status = ScenarioStatus::Error {
                    message: error.to_string(),
                };
                category = Some(FailureCategory::RuntimeError);
                attempts.push(AttemptResult {
                    attempt: attempt_idx,
                    turns: Vec::new(),
                    status: final_status.clone(),
                    duration_ms: attempt_start.elapsed().as_millis() as u64,
                });
            }
        }
        // Only pause when another attempt will follow.
        if attempt_idx + 1 < max_attempt {
            tokio::time::sleep(Duration::from_millis(self.suite.settings.retry_delay_ms)).await;
        }
    }
    let flaky = final_status.is_passed() && attempts.len() > 1;
    Ok(ScenarioResult {
        id: scenario.id.clone(),
        name: scenario.name.clone(),
        tags: scenario.tags.clone(),
        language: scenario.language.clone(),
        status: final_status,
        failure_category: category,
        flaky,
        duration_ms: start.elapsed().as_millis() as u64,
        retries_used: attempts.len().saturating_sub(1) as u32,
        attempts,
    })
}
/// Executes a single attempt of `scenario` against a freshly built agent.
///
/// The attempt gets its own uniquely named workspace directory under the
/// system temp dir, applies the scenario's environment overrides, starts the
/// optional mock server, then runs the scenario's `turns` followed by its
/// `steps`. The first turn with a failing assertion marks the attempt as
/// `Failed` and stops further turns and steps.
///
/// # Errors
/// Any error from setup, a turn, or a step aborts the attempt and is
/// returned to the caller instead of being folded into the `AttemptResult`.
async fn run_attempt(
&self,
agent_path: &Path,
base_dir: &Path,
scenario: &Scenario,
attempt: u32,
) -> Result<AttemptResult> {
let start = Instant::now();
// Unique per scenario/attempt/run so that storage never leaks between attempts.
// NOTE(review): nothing in this function removes the workspace when the
// attempt ends (only the ResetAgent step deletes and recreates it), so the
// directories accumulate under the temp dir - confirm whether that is intended.
let workspace = std::env::temp_dir().join(format!(
"ai_agents_eval_{}_{}_{}",
scenario.id,
attempt,
uuid::Uuid::new_v4()
));
std::fs::create_dir_all(&workspace)?;
// Held until the function returns, i.e. across every `.await` below. The
// guard wraps a `std::sync::MutexGuard`, so attempts are serialized on it.
// NOTE(review): when several attempts are polled from one task (see
// `run_scenarios_parallel`), a second blocking `lock()` while the first
// attempt is suspended at an await point would stall that thread - confirm.
let _env_guard = EnvGuard::apply(&scenario.env)?;
let mock_server = start_mock_server(self.suite.fixtures.mock_server.as_ref()).await?;
let tool_log = RecordingToolLog::new();
let mut agent = self
.build_agent(agent_path, base_dir, &workspace, tool_log.clone())
.await?;
apply_base_context(
&agent,
&self.suite,
base_dir,
scenario,
mock_server.as_ref(),
)?;
let mut turns = Vec::new();
let mut status = ScenarioStatus::Passed;
// Phase 1: the scenario's flat list of turns.
if !scenario.turns.is_empty() {
for (idx, turn) in scenario.turns.iter().enumerate() {
let turn_result = self
.run_turn(&agent, scenario, turn, idx, &tool_log)
.await?;
let turn_failed = turn_result.assertion_results.iter().any(|r| !r.passed);
// The failing turn is still recorded so its evidence shows up in the report.
turns.push(turn_result);
if turn_failed {
status = ScenarioStatus::Failed {
reason: format!("turn {} assertion failed", idx + 1),
};
break;
}
// Turn isolation: reset the agent and re-apply the base context between
// turns, but not after the last one.
if self.suite.settings.isolation == IsolationMode::Turn
&& idx + 1 < scenario.turns.len()
{
agent.reset().await?;
apply_base_context(
&agent,
&self.suite,
base_dir,
scenario,
mock_server.as_ref(),
)?;
}
}
}
// Phase 2: scripted steps; skipped entirely once the attempt has failed.
for step in &scenario.steps {
if !status.is_passed() {
break;
}
match step {
ScenarioStep::Run(run) => {
for turn in &run.turns {
// Turn indices continue from the turns already recorded.
let idx = turns.len();
let turn_result = self
.run_turn(&agent, scenario, turn, idx, &tool_log)
.await?;
let turn_failed = turn_result.assertion_results.iter().any(|r| !r.passed);
turns.push(turn_result);
if turn_failed {
status = ScenarioStatus::Failed {
reason: format!("turn {} assertion failed", idx + 1),
};
break;
}
}
// NOTE(review): the `break` above only leaves the inner loop, so the
// session is saved even when a turn of this run step failed - confirm.
if let Some(session) = &run.save_session {
agent.save_session(session).await?;
}
}
ScenarioStep::ResetAgent(reset) => {
// `reset_options` yields `None` for an explicit `false`, making the step a no-op.
if let Some(options) = reset_options(reset) {
if options.delete_persistence || !options.preserve_storage {
// Best-effort wipe; a removal failure is ignored, recreation is not.
let _ = std::fs::remove_dir_all(&workspace);
std::fs::create_dir_all(&workspace)?;
}
// Capture the actor before the agent is reset or replaced.
let preserved_actor = options
.preserve_actor_id
.then(|| agent.actor_id())
.flatten();
// A conversation-profile reset that keeps persistence reuses the agent;
// every other combination rebuilds it from the spec.
if matches!(options.profile, crate::reset::ResetProfile::Conversation)
&& !options.delete_persistence
{
agent.reset().await?;
} else {
agent = self
.build_agent(agent_path, base_dir, &workspace, tool_log.clone())
.await?;
}
if options.preserve_host_context {
apply_base_context(
&agent,
&self.suite,
base_dir,
scenario,
mock_server.as_ref(),
)?;
}
// Prefer the preserved actor, falling back to the scenario's actor.
if let Some(actor) = preserved_actor.or_else(|| scenario.actor.clone()) {
agent.set_actor_id(&actor)?;
agent.load_actor_memory().await?;
agent.load_actor_relationship().await?;
}
}
}
ScenarioStep::SaveSession(name) => {
agent.save_session(name).await?;
}
ScenarioStep::LoadSession(name) => {
// The returned value is discarded; only errors matter here.
let _ = agent.load_session(name).await?;
}
ScenarioStep::SetContext { values } => {
apply_context_value(&agent, values)?;
}
ScenarioStep::SetActor { actor } => {
agent.set_actor_id(actor)?;
agent.load_actor_memory().await?;
agent.load_actor_relationship().await?;
}
ScenarioStep::CleanupExpired => {
// The returned value is discarded; only errors matter here.
let _ = agent.cleanup_expired_sessions().await?;
}
}
}
Ok(AttemptResult {
attempt,
turns,
status,
duration_ms: start.elapsed().as_millis() as u64,
})
}
/// Builds a `RuntimeAgent` from the spec at `agent_path`, wired to the
/// suite's LLM/tool fixtures and to storage isolated inside `workspace`.
///
/// # Errors
/// Spec validation failures and builder failures are reported as
/// `EvalError::Config`; file, YAML and storage-initialisation errors are
/// propagated through `?`.
async fn build_agent(
&self,
agent_path: &Path,
base_dir: &Path,
workspace: &Path,
tool_log: RecordingToolLog,
) -> Result<RuntimeAgent> {
// The spec is parsed here for validation and to derive the fixture
// registries and storage override; the builder below loads the file again.
let content = std::fs::read_to_string(agent_path)?;
let mut spec: AgentSpec = serde_yaml::from_str(&content)?;
// NOTE(review): these temperature/seed overrides mutate only the local
// `spec`; they reach the agent only through whatever `build_llm_registry`
// derives from it, since the builder is created from the file - confirm.
apply_eval_llm_settings(&mut spec, &self.suite.settings);
spec.validate()
.map_err(|error| EvalError::Config(error.to_string()))?;
// The second tuple element (`_judge_llm`) is not used in this function.
let (llm_registry, _judge_llm) =
build_llm_registry(&spec, &self.suite.fixtures.llm, base_dir)?;
let tool_registry = build_tool_registry(&self.suite.fixtures, tool_log)?;
let mut builder = AgentBuilder::from_yaml_file(agent_path)
.map_err(|error| EvalError::Config(error.to_string()))?
.llm_registry(llm_registry)
.tools(tool_registry)
.auto_configure_features()
.map_err(|error| EvalError::Config(error.to_string()))?
.auto_configure_mcp()
.await
.map_err(|error| EvalError::Config(error.to_string()))?;
// Redirect Sqlite/File storage into the per-attempt workspace; other
// backends yield `None` from `isolated_storage_config` and are left as configured.
let storage_override = isolated_storage_config(&spec, workspace);
if let Some(storage) = storage_override {
builder = builder.storage_config(storage);
}
// Observability is attached only when configured and enabled.
if let Some(observability) = self.observability_config(base_dir)? {
let manager = ai_agents_observability::ObservabilityManager::new(observability);
builder = builder.observability(manager);
}
builder = builder
.auto_configure_spawner()
.await
.map_err(|error| EvalError::Config(error.to_string()))?;
let agent = builder
.build()
.map_err(|error| EvalError::Config(error.to_string()))?;
agent.init_storage().await?;
Ok(agent)
}
/// Resolves the observability configuration for agents built by this runner.
///
/// The suite's own observability section takes precedence. Without one, the
/// `observability` option produces a default configuration that writes a
/// JSON report under `<output>/observability`. Returns `Ok(None)` when
/// neither source applies or the chosen configuration is disabled.
///
/// # Errors
/// Pricing-file loading and validation failures are reported as
/// `EvalError::Config`.
fn observability_config(&self, base_dir: &Path) -> Result<Option<ObservabilityConfig>> {
    let mut config = match self.suite.observability.clone() {
        Some(from_suite) => from_suite,
        None if self.options.observability => {
            let export_dir = self.options.output.join("observability");
            let mut generated = ObservabilityConfig::default();
            generated.enabled = true;
            generated.export.formats = vec![ExportFormat::Json];
            generated.export.path = export_dir.display().to_string();
            generated.export.write_report = true;
            generated
        }
        None => return Ok(None),
    };
    if !config.enabled {
        return Ok(None);
    }
    let as_config_error = |message: String| EvalError::Config(message);
    config = config
        .with_pricing_file_loaded(Some(base_dir))
        .map_err(|error| as_config_error(error.to_string()))?;
    config
        .validate()
        .map_err(|error| as_config_error(error.to_string()))?;
    Ok(Some(config))
}
/// Sends one turn to the agent, gathers evidence, and evaluates the turn's assertions.
///
/// The per-turn timeout is the turn's own `timeout_ms`, falling back to
/// `settings.timeout_per_turn_ms`. Streaming turns collect the streamed
/// text; non-streaming turns use `agent.chat`.
///
/// # Errors
/// Returns an error when the turn times out, the agent call fails, flushing
/// background tasks fails, or assertion evaluation itself errors. Failed
/// assertions are reported in the `TurnResult`, not as an `Err`.
async fn run_turn(
&self,
agent: &RuntimeAgent,
scenario: &Scenario,
turn: &Turn,
index: usize,
tool_log: &RecordingToolLog,
) -> Result<TurnResult> {
apply_context_value(agent, &turn.context)?;
if let Some(actor) = &turn.actor {
agent.set_actor_id(actor)?;
}
// Captured before the call so evidence collection can compare against it.
let before_relationship = relationship_snapshot(agent);
// Position in the tool log before this turn starts.
let tool_start = tool_log.len();
let start = Instant::now();
let timeout_ms = turn
.timeout_ms
.unwrap_or(self.suite.settings.timeout_per_turn_ms);
// `??` unwraps first the timeout result, then the agent call's own result.
let (response_content, response_metadata) = if turn.stream.unwrap_or(false) {
timeout(
Duration::from_millis(timeout_ms),
collect_stream_response(agent, &turn.input),
)
.await
.map_err(|_| EvalError::Runtime(format!("turn timed out after {}ms", timeout_ms)))??
} else {
let response = timeout(Duration::from_millis(timeout_ms), agent.chat(&turn.input))
.await
.map_err(|_| {
EvalError::Runtime(format!("turn timed out after {}ms", timeout_ms))
})??;
(response.content, response.metadata)
};
// Background work is flushed before evidence is collected; the recorded
// latency therefore includes the flush time.
agent.flush_background_tasks().await?;
let latency_ms = start.elapsed().as_millis() as u64;
let evidence = collect_turn_evidence(
agent,
response_metadata.clone(),
tool_log,
tool_start,
before_relationship,
);
let judge = self.build_judge(agent);
let mut assertion_results = if let Some(assertion) = &turn.assertions {
match evaluate_assertion(
assertion,
AssertionEvalContext {
evidence: &evidence,
response: &response_content,
user_input: Some(&turn.input),
scenario_id: Some(&scenario.id),
language: scenario.language.as_deref(),
judge_resolver: Some(&judge),
},
)
.await
{
// Passed and failed outcomes both carry details; the caller decides
// pass/fail from the per-detail `passed` flags.
AssertionOutcome::Passed(details) | AssertionOutcome::Failed(details) => details,
AssertionOutcome::Error(message) => return Err(EvalError::Assertion(message)),
}
} else {
Vec::new()
};
if self.suite.settings.redact_outputs {
redact_assertion_details(&mut assertion_results);
}
let observability_span_id = evidence
.observability
.as_ref()
.and_then(|obs| obs.span_ids.last().cloned());
// NOTE(review): `state` and `evidence` are stored as collected; no redaction
// call is applied to them here even when `redact_outputs` is set - confirm.
Ok(TurnResult {
index,
input: redact_text(&turn.input, self.suite.settings.redact_outputs, 0),
response: redact_text(&response_content, self.suite.settings.redact_outputs, 0),
state: evidence.state.clone(),
// Metadata is dropped entirely when outputs are redacted; otherwise it is
// serialized, and a serialization failure silently becomes `None`.
metadata: if self.suite.settings.redact_outputs {
None
} else {
response_metadata.and_then(|m| serde_json::to_value(m).ok())
},
evidence,
assertion_results,
latency_ms,
observability_span_id,
})
}
/// Creates a judge resolver that shares the agent's LLM registry and uses
/// the default judge configuration.
fn build_judge(&self, agent: &RuntimeAgent) -> JudgeResolver {
    let shared_registry = Arc::clone(agent.llm_registry());
    let judge_config = JudgeConfig::default();
    JudgeResolver::new(shared_registry, judge_config)
}
}
async fn collect_stream_response(
agent: &RuntimeAgent,
input: &str,
) -> Result<(String, Option<HashMap<String, Value>>)> {
let mut stream = agent.chat_stream(input).await?;
let mut content = String::new();
while let Some(chunk) = stream.next().await {
match chunk {
StreamChunk::Content { text } => content.push_str(&text),
StreamChunk::Done {} => break,
StreamChunk::Error { message } => return Err(EvalError::Runtime(message)),
_ => {}
}
}
Ok((content, None))
}
/// Produces a storage override that keeps an attempt's persistence inside `workspace`.
///
/// Sqlite storage is redirected to `<workspace>/sessions.db` and file
/// storage to `<workspace>/sessions`. Redis storage and specs without
/// storage get no override (`None`).
fn isolated_storage_config(spec: &AgentSpec, workspace: &Path) -> Option<StorageConfig> {
    if spec.storage.is_none() {
        return None;
    }
    let inside_workspace = |leaf: &str| workspace.join(leaf).display().to_string();
    match &spec.storage {
        StorageConfig::Sqlite(_) => Some(StorageConfig::sqlite(inside_workspace("sessions.db"))),
        StorageConfig::File(_) => Some(StorageConfig::file(inside_workspace("sessions"))),
        StorageConfig::Redis(_) | StorageConfig::None => None,
    }
}
/// Sets every entry of `values` on the agent's context, stopping at the first error.
fn apply_context_map(agent: &RuntimeAgent, values: HashMap<String, Value>) -> Result<()> {
    values
        .into_iter()
        .try_for_each(|(name, entry)| -> Result<()> {
            agent.set_context(&name, entry)?;
            Ok(())
        })
}
/// Sets each member of a JSON object on the agent's context.
///
/// Any non-object value (including `null`) is silently ignored.
fn apply_context_value(agent: &RuntimeAgent, value: &Value) -> Result<()> {
    if let Value::Object(entries) = value {
        for (name, entry) in entries {
            agent.set_context(name, entry.clone())?;
        }
    }
    Ok(())
}
/// Applies the baseline context for a scenario to `agent`.
///
/// Layers are written in this order, so later layers overwrite earlier ones
/// on key collisions: mock-server context, fixture context, scenario
/// context. Finally the scenario's actor, if any, is set on the agent.
fn apply_base_context(
    agent: &RuntimeAgent,
    suite: &EvalSuite,
    base_dir: &Path,
    scenario: &Scenario,
    mock_server: Option<&crate::fixtures::MockServerHandle>,
) -> Result<()> {
    if let Some(server) = mock_server {
        let server_context = server.context();
        apply_context_map(agent, server_context)?;
    }

    let fixture_context = resolve_fixture_context(&suite.fixtures, base_dir)?;
    apply_context_map(agent, fixture_context)?;

    apply_context_value(agent, &scenario.context)?;

    match &scenario.actor {
        Some(actor) => {
            agent.set_actor_id(actor)?;
        }
        None => {}
    }
    Ok(())
}
/// Translates a reset step's configuration into concrete reset options.
///
/// `false` disables the reset (`None`), `true` selects the default options,
/// and an explicit options object is used as given.
fn reset_options(config: &ResetStepConfig) -> Option<crate::reset::ResetOptions> {
    match config {
        ResetStepConfig::Options(custom) => Some(custom.clone()),
        ResetStepConfig::Bool(enabled) => {
            if *enabled {
                Some(crate::reset::ResetOptions::default())
            } else {
                None
            }
        }
    }
}
/// Replaces the `actual` and `expected` values of every detail with their
/// redacted form, in place.
fn redact_assertion_details(details: &mut [crate::assertion::AssertionResultDetail]) {
    details.iter_mut().for_each(|detail| {
        let actual = std::mem::take(&mut detail.actual);
        detail.actual = redact_value(actual, true, 0);
        let expected = std::mem::take(&mut detail.expected);
        detail.expected = redact_value(expected, true, 0);
    });
}
fn error_result(
scenario: &Scenario,
error: EvalError,
category: FailureCategory,
) -> ScenarioResult {
ScenarioResult {
id: scenario.id.clone(),
name: scenario.name.clone(),
tags: scenario.tags.clone(),
language: scenario.language.clone(),
status: ScenarioStatus::Error {
message: error.to_string(),
},
failure_category: Some(category),
flaky: false,
attempts: Vec::new(),
duration_ms: 0,
retries_used: 0,
}
}
/// Classifies a failed attempt.
///
/// If any turn has a failed assertion detail named `"judge"` the attempt is
/// a `JudgeError`; otherwise it is an `AssertionFailed`.
fn failure_category_for_attempt(attempt: &AttemptResult) -> FailureCategory {
    let mut failed_details = attempt
        .turns
        .iter()
        .flat_map(|turn| turn.assertion_results.iter())
        .filter(|detail| !detail.passed);
    if failed_details.any(|detail| detail.assertion == "judge") {
        FailureCategory::JudgeError
    } else {
        FailureCategory::AssertionFailed
    }
}
/// Returns the most recent observability report found in the results.
///
/// Scenarios, their attempts and their turns are each scanned from last to
/// first; the first turn whose evidence carries a report wins.
fn final_observability_report(
    results: &[ScenarioResult],
) -> Option<ai_agents_observability::ObservabilityReport> {
    for scenario in results.iter().rev() {
        for attempt in scenario.attempts.iter().rev() {
            for turn in attempt.turns.iter().rev() {
                let report = turn
                    .evidence
                    .observability
                    .as_ref()
                    .and_then(|obs| obs.report.clone());
                if report.is_some() {
                    return report;
                }
            }
        }
    }
    None
}
/// Applies the suite's LLM overrides to the spec's primary LLM (when it is
/// an inline config rather than a selector) and to every named LLM config.
fn apply_eval_llm_settings(spec: &mut AgentSpec, settings: &crate::suite::EvalSettings) {
    if let LLMConfigOrSelector::Config(primary) = &mut spec.llm {
        apply_llm_config_settings(primary, settings);
    }
    spec.llms
        .values_mut()
        .for_each(|named| apply_llm_config_settings(named, settings));
}
/// Overrides a single LLM config with the suite's settings.
///
/// A configured temperature replaces the config's temperature; a configured
/// seed is stored under the `"seed"` key of the config's `extra` map.
/// Settings that are `None` leave the config untouched.
fn apply_llm_config_settings(
    config: &mut ai_agents_runtime::spec::LLMConfig,
    settings: &crate::suite::EvalSettings,
) {
    if let Some(seed) = settings.seed {
        let seed_key = "seed".to_string();
        config.extra.insert(seed_key, json!(seed));
    }
    if let Some(temperature) = settings.temperature {
        config.temperature = temperature;
    }
}
/// Scoped override of process environment variables.
///
/// `apply` records the current value of every key it is about to set,
/// installs the overrides, and holds a process-wide lock for as long as the
/// guard lives. Dropping the guard restores (or removes) each variable and
/// then releases the lock.
///
/// The lock is a blocking `std::sync::Mutex`: a second `apply` waits until
/// the first guard is dropped.
struct EnvGuard {
    // Values observed before the override, as raw OS strings so that
    // non-UTF-8 values are restored rather than lost. `None` = was unset.
    previous: Vec<(String, Option<std::ffi::OsString>)>,
    // Declared after `previous`; `Drop::drop` runs before either field is
    // dropped, so the environment is restored while the lock is still held.
    _guard: MutexGuard<'static, ()>,
}
impl EnvGuard {
    /// Installs `values` into the process environment and returns the guard
    /// that undoes the change.
    ///
    /// # Errors
    /// * `EvalError::Config` when a key is empty or contains `=` or NUL, or a
    ///   value contains NUL. Such entries would make `std::env::set_var`
    ///   panic, so they are rejected before anything is modified.
    /// * `EvalError::Runtime` when the environment lock is poisoned.
    fn apply(values: &HashMap<String, String>) -> Result<Self> {
        static ENV_LOCK: OnceLock<Mutex<()>> = OnceLock::new();

        // Validate everything up front so a bad entry can neither panic while
        // the lock is held nor leave the environment half-modified.
        let invalid = values.iter().find(|(key, value)| {
            key.is_empty()
                || key.contains(|c: char| c == '=' || c == '\0')
                || value.contains('\0')
        });
        if let Some((key, _)) = invalid {
            return Err(EvalError::Config(format!(
                "invalid environment override for key {:?}: keys must be non-empty and free of '=' and NUL, values must be free of NUL",
                key
            )));
        }

        let guard = ENV_LOCK
            .get_or_init(|| Mutex::new(()))
            .lock()
            .map_err(|_| EvalError::Runtime("failed to lock eval environment guard".to_string()))?;
        let mut previous = Vec::with_capacity(values.len());
        for (key, value) in values {
            // `var_os` (not `var`) so a non-UTF-8 previous value is not
            // mistaken for "unset" and deleted on restore.
            previous.push((key.clone(), std::env::var_os(key)));
            // SAFETY: all environment mutation done through `EnvGuard` is
            // serialized by `ENV_LOCK`, which is held here. This does not
            // protect against other threads touching the environment without
            // going through `EnvGuard`.
            unsafe {
                std::env::set_var(key, value);
            }
        }
        Ok(Self {
            previous,
            _guard: guard,
        })
    }
}
impl Drop for EnvGuard {
    /// Restores every overridden variable, newest first, to the value it had
    /// before `apply`; variables that were unset are removed again.
    fn drop(&mut self) {
        for (key, value) in self.previous.drain(..).rev() {
            // SAFETY: `self._guard` is still alive for the whole of `drop`, so
            // this mutation is serialized with every other `EnvGuard` user.
            unsafe {
                match value {
                    Some(original) => std::env::set_var(key, original),
                    None => std::env::remove_var(key),
                }
            }
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
/// End-to-end smoke test: runs a one-scenario suite against a mocked LLM and
/// checks that the scenario passes while the reported input/response text is
/// redacted and absent from the serialized result.
#[tokio::test]
async fn runner_executes_mocked_suite_and_redacts_outputs() {
// Unique scratch directory so parallel test runs cannot collide.
let dir = std::env::temp_dir().join(format!(
"ai_agents_eval_runner_test_{}",
uuid::Uuid::new_v4()
));
std::fs::create_dir_all(&dir).unwrap();
// Minimal agent spec; the provider is never contacted because the suite
// below switches the LLM fixture to mock mode.
let agent_path = dir.join("agent.yaml");
std::fs::write(
&agent_path,
r#"
name: TestAgent
system_prompt: "You are helpful."
llm:
provider: openai
model: gpt-4.1-nano
"#,
)
.unwrap();
// The agent path is relative, so it is resolved against the suite's directory.
let suite_path = dir.join("suite.yaml");
std::fs::write(
&suite_path,
r#"
name: Runner Suite
agent: agent.yaml
fixtures:
llm:
mode: mock
responses:
- "Hello from mock"
scenarios:
- id: smoke
turns:
- input: Hello
assert:
response_contains: "Hello"
"#,
)
.unwrap();
let options = EvalRunnerOptions {
output: dir.join("out"),
..Default::default()
};
let runner = EvalRunner::from_file(&suite_path, options).unwrap();
let result = runner.run().await.unwrap();
assert_eq!(result.passed, 1);
// The suite sets no redaction option, so these assertions rely on the
// default settings redacting outputs.
// NOTE(review): presumably `redact_outputs` defaults to true - confirm.
let turn = &result.scenarios[0].attempts[0].turns[0];
assert_eq!(turn.input.value, "[redacted]");
assert_eq!(turn.response.value, "[redacted]");
// The raw mock response must not leak anywhere in the serialized result.
let json = serde_json::to_string(&result).unwrap();
assert!(!json.contains("Hello from mock"));
// Best-effort cleanup; a failure to remove the directory is ignored.
let _ = std::fs::remove_dir_all(dir);
}
}