1use std::collections::HashMap;
2use std::path::PathBuf;
3
4use ai_agents_observability::ObservabilityConfig;
5use ai_agents_observability::ObservabilityReport;
6use serde::{Deserialize, Serialize};
7use serde_json::Value;
8
9use crate::assertion::{Assertion, AssertionResultDetail};
10use crate::evidence::TurnEvidence;
11use crate::fixtures::FixturesConfig;
12use crate::redaction::RedactedString;
13use crate::reset::ResetOptions;
14use crate::{EvalError, Result};
15
16#[derive(Debug, Clone, Deserialize)]
18pub struct EvalSuite {
19 pub name: String,
21 #[serde(default)]
23 pub agent: Option<PathBuf>,
24 #[serde(default)]
26 pub settings: EvalSettings,
27 #[serde(default)]
29 pub observability: Option<ObservabilityConfig>,
30 #[serde(default)]
32 pub fixtures: FixturesConfig,
33 #[serde(default)]
35 pub scenarios: Vec<Scenario>,
36}
37
38impl EvalSuite {
39 pub fn validate(&self, cli_agent: Option<&PathBuf>) -> Result<()> {
40 if self.name.trim().is_empty() {
41 return Err(EvalError::Config(
42 "eval suite name must not be empty".into(),
43 ));
44 }
45 if cli_agent.is_none() && self.agent.is_none() {
46 return Err(EvalError::Config(
47 "agent path is required in suite or CLI".into(),
48 ));
49 }
50 if self.settings.max_concurrent == 0 {
51 return Err(EvalError::Config(
52 "settings.max_concurrent must be greater than zero".into(),
53 ));
54 }
55 if self.settings.timeout_per_turn_ms == 0 {
56 return Err(EvalError::Config(
57 "settings.timeout_per_turn_ms must be greater than zero".into(),
58 ));
59 }
60 if matches!(
61 self.settings.isolation,
62 IsolationMode::Suite | IsolationMode::None
63 ) {
64 return Err(EvalError::Config(
65 "settings.isolation currently supports scenario or turn".into(),
66 ));
67 }
68 if self.settings.parallel && self.settings.isolation != IsolationMode::Scenario {
69 return Err(EvalError::Config(
70 "settings.parallel currently requires isolation: scenario".into(),
71 ));
72 }
73 if self.settings.parallel
74 && self
75 .scenarios
76 .iter()
77 .any(|scenario| !scenario.env.is_empty())
78 {
79 return Err(EvalError::Config(
80 "scenario.env cannot be used with parallel execution".into(),
81 ));
82 }
83 let mut ids = std::collections::HashSet::new();
84 for scenario in &self.scenarios {
85 if scenario.id.trim().is_empty() {
86 return Err(EvalError::Config("scenario id must not be empty".into()));
87 }
88 if !ids.insert(scenario.id.clone()) {
89 return Err(EvalError::Config(format!(
90 "duplicate scenario id: {}",
91 scenario.id
92 )));
93 }
94 if !scenario.skip.is_skipped() && scenario.turns.is_empty() && scenario.steps.is_empty()
95 {
96 return Err(EvalError::Config(format!(
97 "scenario '{}' must define turns or steps",
98 scenario.id
99 )));
100 }
101 }
102 Ok(())
103 }
104}
105
106#[derive(Debug, Clone, Deserialize)]
108pub struct EvalSettings {
109 #[serde(default)]
111 pub temperature: Option<f32>,
112 #[serde(default)]
114 pub seed: Option<u64>,
115 #[serde(default = "default_turn_timeout")]
117 pub timeout_per_turn_ms: u64,
118 #[serde(default)]
120 pub timeout_per_scenario_ms: Option<u64>,
121 #[serde(default)]
123 pub retries: u32,
124 #[serde(default = "default_retry_delay")]
126 pub retry_delay_ms: u64,
127 #[serde(default)]
129 pub isolation: IsolationMode,
130 #[serde(default)]
132 pub parallel: bool,
133 #[serde(default = "default_max_concurrent")]
135 pub max_concurrent: usize,
136 #[serde(default)]
138 pub fail_fast: bool,
139 #[serde(default = "default_true")]
141 pub redact_outputs: bool,
142}
143
144impl Default for EvalSettings {
145 fn default() -> Self {
146 Self {
147 temperature: None,
148 seed: None,
149 timeout_per_turn_ms: default_turn_timeout(),
150 timeout_per_scenario_ms: None,
151 retries: 0,
152 retry_delay_ms: default_retry_delay(),
153 isolation: IsolationMode::Scenario,
154 parallel: false,
155 max_concurrent: default_max_concurrent(),
156 fail_fast: false,
157 redact_outputs: true,
158 }
159 }
160}
161
162#[derive(Debug, Clone, Deserialize, Serialize, Default, PartialEq, Eq)]
164#[serde(rename_all = "snake_case")]
165pub enum IsolationMode {
166 Turn,
167 #[default]
168 Scenario,
169 Suite,
170 None,
171}
172
173#[derive(Debug, Clone, Deserialize)]
175pub struct Scenario {
176 pub id: String,
178 #[serde(default)]
180 pub name: Option<String>,
181 #[serde(default)]
183 pub tags: Vec<String>,
184 #[serde(default)]
186 pub language: Option<String>,
187 #[serde(default)]
189 pub actor: Option<String>,
190 #[serde(default)]
192 pub context: Value,
193 #[serde(default)]
195 pub env: HashMap<String, String>,
196 #[serde(default)]
198 pub skip: SkipConfig,
199 #[serde(default)]
201 pub turns: Vec<Turn>,
202 #[serde(default)]
204 pub steps: Vec<ScenarioStep>,
205}
206
207#[derive(Debug, Clone, Deserialize)]
209pub struct Turn {
210 pub input: String,
212 #[serde(default)]
214 pub actor: Option<String>,
215 #[serde(default)]
217 pub context: Value,
218 #[serde(default)]
220 pub stream: Option<bool>,
221 #[serde(default)]
223 pub timeout_ms: Option<u64>,
224 #[serde(default, rename = "assert")]
226 pub assertions: Option<Assertion>,
227}
228
229#[derive(Debug, Clone, Deserialize)]
231#[serde(untagged)]
232pub enum SkipConfig {
233 Bool(bool),
234 Reason(String),
235}
236
237impl Default for SkipConfig {
238 fn default() -> Self {
239 Self::Bool(false)
240 }
241}
242
243impl SkipConfig {
244 pub fn is_skipped(&self) -> bool {
245 match self {
246 Self::Bool(value) => *value,
247 Self::Reason(_) => true,
248 }
249 }
250
251 pub fn reason(&self) -> Option<String> {
252 match self {
253 Self::Bool(_) => None,
254 Self::Reason(reason) => Some(reason.clone()),
255 }
256 }
257}
258
259#[derive(Debug, Clone, Deserialize)]
261#[serde(rename_all = "snake_case")]
262pub enum ScenarioStep {
263 Run(RunStep),
264 ResetAgent(ResetStepConfig),
265 SaveSession(String),
266 LoadSession(String),
267 SetContext { values: Value },
268 SetActor { actor: String },
269 CleanupExpired,
270}
271
272#[derive(Debug, Clone, Deserialize)]
274pub struct RunStep {
275 #[serde(default)]
277 pub turns: Vec<Turn>,
278 #[serde(default)]
280 pub save_session: Option<String>,
281}
282
283#[derive(Debug, Clone, Deserialize)]
285#[serde(untagged)]
286pub enum ResetStepConfig {
287 Bool(bool),
288 Options(ResetOptions),
289}
290
291#[derive(Debug, Clone, Serialize)]
293pub struct EvalResult {
294 pub schema_version: u32,
296 pub suite: String,
298 pub agent: String,
300 pub total: usize,
302 pub passed: usize,
304 pub failed: usize,
306 pub skipped: usize,
308 pub duration_ms: u64,
310 pub scenarios: Vec<ScenarioResult>,
312 pub metrics: crate::metrics::EvalMetrics,
314 #[serde(skip_serializing_if = "Option::is_none")]
316 pub observability: Option<ObservabilityReport>,
317}
318
319#[derive(Debug, Clone, Serialize)]
321pub struct ScenarioResult {
322 pub id: String,
324 pub name: Option<String>,
326 pub tags: Vec<String>,
328 pub language: Option<String>,
330 pub status: ScenarioStatus,
332 pub failure_category: Option<FailureCategory>,
334 pub flaky: bool,
336 pub attempts: Vec<AttemptResult>,
338 pub duration_ms: u64,
340 pub retries_used: u32,
342}
343
344#[derive(Debug, Clone, Serialize)]
346pub struct AttemptResult {
347 pub attempt: u32,
349 pub turns: Vec<TurnResult>,
351 pub status: ScenarioStatus,
353 pub duration_ms: u64,
355}
356
357#[derive(Debug, Clone, Serialize)]
359#[serde(rename_all = "snake_case")]
360pub enum ScenarioStatus {
361 Passed,
362 Failed { reason: String },
363 Skipped { reason: Option<String> },
364 Error { message: String },
365}
366
367impl ScenarioStatus {
368 pub fn is_passed(&self) -> bool {
369 matches!(self, Self::Passed)
370 }
371
372 pub fn is_failed(&self) -> bool {
373 matches!(self, Self::Failed { .. })
374 }
375
376 pub fn is_error(&self) -> bool {
377 matches!(self, Self::Error { .. })
378 }
379}
380
381#[derive(Debug, Clone, Serialize, PartialEq, Eq, Hash)]
383#[serde(rename_all = "snake_case")]
384pub enum FailureCategory {
385 ConfigError,
386 RuntimeError,
387 AssertionFailed,
388 JudgeError,
389 FlakyPass,
390}
391
392#[derive(Debug, Clone, Serialize)]
394pub struct TurnResult {
395 pub index: usize,
397 pub input: RedactedString,
399 pub response: RedactedString,
401 pub state: Option<String>,
403 #[serde(skip_serializing_if = "Option::is_none")]
405 pub metadata: Option<Value>,
406 #[serde(skip_serializing)]
408 pub evidence: TurnEvidence,
409 pub assertion_results: Vec<AssertionResultDetail>,
411 pub latency_ms: u64,
413 #[serde(skip_serializing_if = "Option::is_none")]
415 pub observability_span_id: Option<String>,
416}
417
/// Serde default for `EvalSettings::timeout_per_turn_ms`: 30 000 ms.
fn default_turn_timeout() -> u64 {
    const DEFAULT_TURN_TIMEOUT_MS: u64 = 30_000;
    DEFAULT_TURN_TIMEOUT_MS
}
421
/// Serde default for `EvalSettings::retry_delay_ms`: 1 000 ms.
fn default_retry_delay() -> u64 {
    const DEFAULT_RETRY_DELAY_MS: u64 = 1_000;
    DEFAULT_RETRY_DELAY_MS
}
425
/// Serde default for `EvalSettings::max_concurrent`: 4.
fn default_max_concurrent() -> usize {
    const DEFAULT_MAX_CONCURRENT: usize = 4;
    DEFAULT_MAX_CONCURRENT
}
429
/// Serde default for boolean fields that are on unless disabled
/// (used by `EvalSettings::redact_outputs`).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
433
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the smallest suite that passes validation: a named suite with
    /// an agent path, default settings, and one scenario with one turn.
    fn minimal_suite() -> EvalSuite {
        let greeting = Turn {
            input: String::from("hello"),
            actor: None,
            context: Value::Null,
            stream: None,
            timeout_ms: None,
            assertions: None,
        };
        let scenario = Scenario {
            id: String::from("scenario-1"),
            name: None,
            tags: vec![String::from("smoke")],
            language: Some(String::from("en")),
            actor: None,
            context: Value::Null,
            env: HashMap::new(),
            skip: SkipConfig::default(),
            turns: vec![greeting],
            steps: Vec::new(),
        };
        EvalSuite {
            name: String::from("suite"),
            agent: Some(PathBuf::from("agent.yaml")),
            settings: EvalSettings::default(),
            observability: None,
            fixtures: FixturesConfig::default(),
            scenarios: vec![scenario],
        }
    }

    #[test]
    fn validation_accepts_minimal_suite() {
        let outcome = minimal_suite().validate(None);
        assert!(outcome.is_ok());
    }

    #[test]
    fn validation_rejects_duplicate_ids() {
        let mut suite = minimal_suite();
        // Appending a copy of the only scenario yields two identical ids.
        let duplicate = suite.scenarios[0].clone();
        suite.scenarios.push(duplicate);

        let message = suite.validate(None).unwrap_err().to_string();
        assert!(message.contains("duplicate scenario id"));
    }

    #[test]
    fn validation_rejects_parallel_env() {
        let mut suite = minimal_suite();
        suite.settings.parallel = true;
        // Any scenario-level env entry conflicts with parallel execution.
        let first = &mut suite.scenarios[0];
        first.env.insert("TOKEN".to_string(), "secret".to_string());

        let message = suite.validate(None).unwrap_err().to_string();
        assert!(message.contains("scenario.env"));
    }
}