Skip to main content

ai_agents_eval/
suite.rs

1use std::collections::HashMap;
2use std::path::PathBuf;
3
4use ai_agents_observability::ObservabilityConfig;
5use ai_agents_observability::ObservabilityReport;
6use serde::{Deserialize, Serialize};
7use serde_json::Value;
8
9use crate::assertion::{Assertion, AssertionResultDetail};
10use crate::evidence::TurnEvidence;
11use crate::fixtures::FixturesConfig;
12use crate::redaction::RedactedString;
13use crate::reset::ResetOptions;
14use crate::{EvalError, Result};
15
/// Top-level evaluation suite loaded from YAML or JSONL.
///
/// Only `name` is required when deserializing; every other field is marked
/// `#[serde(default)]`. Deserializing does not enforce the suite's rules —
/// those are checked separately by [`EvalSuite::validate`].
#[derive(Debug, Clone, Deserialize)]
pub struct EvalSuite {
    /// Suite name. `validate` rejects a name that is empty or whitespace-only.
    pub name: String,
    /// Agent YAML path used for this run. May be omitted when an agent path is
    /// passed to `validate` as `cli_agent`; at least one of the two must exist.
    #[serde(default)]
    pub agent: Option<PathBuf>,
    /// Execution settings for the suite; `EvalSettings::default()` when omitted.
    #[serde(default)]
    pub settings: EvalSettings,
    /// Optional observability configuration for the run.
    #[serde(default)]
    pub observability: Option<ObservabilityConfig>,
    /// Fixtures applied while building and running agents.
    #[serde(default)]
    pub fixtures: FixturesConfig,
    /// Scenario test cases in this suite. `validate` requires every id to be
    /// non-blank and unique.
    #[serde(default)]
    pub scenarios: Vec<Scenario>,
}
37
38impl EvalSuite {
39    pub fn validate(&self, cli_agent: Option<&PathBuf>) -> Result<()> {
40        if self.name.trim().is_empty() {
41            return Err(EvalError::Config(
42                "eval suite name must not be empty".into(),
43            ));
44        }
45        if cli_agent.is_none() && self.agent.is_none() {
46            return Err(EvalError::Config(
47                "agent path is required in suite or CLI".into(),
48            ));
49        }
50        if self.settings.max_concurrent == 0 {
51            return Err(EvalError::Config(
52                "settings.max_concurrent must be greater than zero".into(),
53            ));
54        }
55        if self.settings.timeout_per_turn_ms == 0 {
56            return Err(EvalError::Config(
57                "settings.timeout_per_turn_ms must be greater than zero".into(),
58            ));
59        }
60        if matches!(
61            self.settings.isolation,
62            IsolationMode::Suite | IsolationMode::None
63        ) {
64            return Err(EvalError::Config(
65                "settings.isolation currently supports scenario or turn".into(),
66            ));
67        }
68        if self.settings.parallel && self.settings.isolation != IsolationMode::Scenario {
69            return Err(EvalError::Config(
70                "settings.parallel currently requires isolation: scenario".into(),
71            ));
72        }
73        if self.settings.parallel
74            && self
75                .scenarios
76                .iter()
77                .any(|scenario| !scenario.env.is_empty())
78        {
79            return Err(EvalError::Config(
80                "scenario.env cannot be used with parallel execution".into(),
81            ));
82        }
83        let mut ids = std::collections::HashSet::new();
84        for scenario in &self.scenarios {
85            if scenario.id.trim().is_empty() {
86                return Err(EvalError::Config("scenario id must not be empty".into()));
87            }
88            if !ids.insert(scenario.id.clone()) {
89                return Err(EvalError::Config(format!(
90                    "duplicate scenario id: {}",
91                    scenario.id
92                )));
93            }
94            if !scenario.skip.is_skipped() && scenario.turns.is_empty() && scenario.steps.is_empty()
95            {
96                return Err(EvalError::Config(format!(
97                    "scenario '{}' must define turns or steps",
98                    scenario.id
99                )));
100            }
101        }
102        Ok(())
103    }
104}
105
/// Execution policy for an evaluation suite.
///
/// Every field has a serde default, so an empty mapping deserializes to the
/// same values as `EvalSettings::default()`.
#[derive(Debug, Clone, Deserialize)]
pub struct EvalSettings {
    /// Optional temperature override for eval LLMs.
    #[serde(default)]
    pub temperature: Option<f32>,
    /// Optional provider seed stored in LLM extra config.
    #[serde(default)]
    pub seed: Option<u64>,
    /// Default timeout for one turn in milliseconds. Defaults to 30 000;
    /// `EvalSuite::validate` rejects zero.
    #[serde(default = "default_turn_timeout")]
    pub timeout_per_turn_ms: u64,
    /// Optional timeout for one scenario attempt in milliseconds; unset by default.
    #[serde(default)]
    pub timeout_per_scenario_ms: Option<u64>,
    /// Retry count; defaults to 0 when omitted.
    /// NOTE(review): presumably the number of retries allowed per scenario
    /// (compare `ScenarioResult::retries_used`) — confirm against the runner.
    #[serde(default)]
    pub retries: u32,
    /// Delay between retry attempts in milliseconds. Defaults to 1 000.
    #[serde(default = "default_retry_delay")]
    pub retry_delay_ms: u64,
    /// Runtime isolation mode for scenarios or turns. Defaults to `Scenario`.
    #[serde(default)]
    pub isolation: IsolationMode,
    /// Enables parallel execution; `false` by default. When enabled,
    /// `EvalSuite::validate` requires `isolation: scenario` and rejects any
    /// scenario that has `env` entries.
    #[serde(default)]
    pub parallel: bool,
    /// Maximum concurrently running scenarios. Defaults to 4;
    /// `EvalSuite::validate` rejects zero.
    #[serde(default = "default_max_concurrent")]
    pub max_concurrent: usize,
    /// Stop after the first failed or errored scenario. Defaults to `false`.
    #[serde(default)]
    pub fail_fast: bool,
    /// Whether output artifacts should redact sensitive strings. Defaults to `true`.
    #[serde(default = "default_true")]
    pub redact_outputs: bool,
}
143
144impl Default for EvalSettings {
145    fn default() -> Self {
146        Self {
147            temperature: None,
148            seed: None,
149            timeout_per_turn_ms: default_turn_timeout(),
150            timeout_per_scenario_ms: None,
151            retries: 0,
152            retry_delay_ms: default_retry_delay(),
153            isolation: IsolationMode::Scenario,
154            parallel: false,
155            max_concurrent: default_max_concurrent(),
156            fail_fast: false,
157            redact_outputs: true,
158        }
159    }
160}
161
/// Runtime isolation mode requested by a suite.
///
/// Variant names are (de)serialized in snake_case: `turn`, `scenario`,
/// `suite`, `none`. `Scenario` is the default. `EvalSuite::validate`
/// currently accepts only `Turn` and `Scenario`.
#[derive(Debug, Clone, Deserialize, Serialize, Default, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum IsolationMode {
    /// Isolation at turn granularity. Not accepted together with `settings.parallel`.
    Turn,
    /// Isolation at scenario granularity; the default, and the only mode
    /// accepted when `settings.parallel` is enabled.
    #[default]
    Scenario,
    /// Suite-level isolation; parsed but currently rejected by `EvalSuite::validate`.
    Suite,
    /// No isolation; parsed but currently rejected by `EvalSuite::validate`.
    None,
}
172
/// One test case inside an evaluation suite.
///
/// Only `id` is required when deserializing. `EvalSuite::validate` additionally
/// requires a scenario that is not skipped to define at least one turn or step.
#[derive(Debug, Clone, Deserialize)]
pub struct Scenario {
    /// Stable identifier; must be non-blank and unique within the suite.
    pub id: String,
    /// Optional human-readable name.
    #[serde(default)]
    pub name: Option<String>,
    /// Tags used by filters and grouped metrics.
    #[serde(default)]
    pub tags: Vec<String>,
    /// Optional language label for filtering, metrics, and judge context.
    #[serde(default)]
    pub language: Option<String>,
    /// Optional actor ID for this scenario.
    #[serde(default)]
    pub actor: Option<String>,
    /// Runtime or fixture context value; `Value::Null` when omitted.
    #[serde(default)]
    pub context: Value,
    /// String key/value pairs attached to this scenario. Must be empty when
    /// `settings.parallel` is enabled (enforced by `EvalSuite::validate`).
    /// NOTE(review): presumably environment variables applied while the
    /// scenario runs — confirm against the runner.
    #[serde(default)]
    pub env: HashMap<String, String>,
    /// Skip marker: a boolean or a reason string. Defaults to not skipped.
    #[serde(default)]
    pub skip: SkipConfig,
    /// Turns executed directly by this scenario.
    #[serde(default)]
    pub turns: Vec<Turn>,
    /// Advanced steps executed after direct turns.
    #[serde(default)]
    pub steps: Vec<ScenarioStep>,
}
206
/// One user input and assertion block inside a scenario.
///
/// Only `input` is required when deserializing.
#[derive(Debug, Clone, Deserialize)]
pub struct Turn {
    /// User input sent to the runtime.
    pub input: String,
    /// Optional actor ID for this turn.
    #[serde(default)]
    pub actor: Option<String>,
    /// Runtime or fixture context value; `Value::Null` when omitted.
    #[serde(default)]
    pub context: Value,
    /// Whether to use streaming chat for this turn; `None` when not specified.
    #[serde(default)]
    pub stream: Option<bool>,
    /// Optional timeout override for this turn, in milliseconds.
    #[serde(default)]
    pub timeout_ms: Option<u64>,
    /// Assertions evaluated after this turn. Read from the `assert` key in
    /// the suite file.
    #[serde(default, rename = "assert")]
    pub assertions: Option<Assertion>,
}
228
/// Boolean or reason-string skip configuration.
///
/// Untagged: a boolean value deserializes to `Bool`, a string to `Reason`.
#[derive(Debug, Clone, Deserialize)]
#[serde(untagged)]
pub enum SkipConfig {
    /// Explicit flag; `true` means the scenario is skipped.
    Bool(bool),
    /// Skip with the given reason text; always counts as skipped.
    Reason(String),
}
236
237impl Default for SkipConfig {
238    fn default() -> Self {
239        Self::Bool(false)
240    }
241}
242
243impl SkipConfig {
244    pub fn is_skipped(&self) -> bool {
245        match self {
246            Self::Bool(value) => *value,
247            Self::Reason(_) => true,
248        }
249    }
250
251    pub fn reason(&self) -> Option<String> {
252        match self {
253            Self::Bool(_) => None,
254            Self::Reason(reason) => Some(reason.clone()),
255        }
256    }
257}
258
/// Advanced scenario action used outside direct turn lists.
///
/// Variant names are deserialized in snake_case: `run`, `reset_agent`,
/// `save_session`, `load_session`, `set_context`, `set_actor`,
/// `cleanup_expired`.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ScenarioStep {
    /// Runs the turns described by a `RunStep`.
    Run(RunStep),
    /// Reset-agent step, given as a boolean or a `ResetOptions` object.
    ResetAgent(ResetStepConfig),
    /// Save-session step carrying one string.
    /// NOTE(review): presumably the session name, as in
    /// `RunStep::save_session` — confirm against the runner.
    SaveSession(String),
    /// Load-session step carrying one string.
    /// NOTE(review): presumably the name of a previously saved session — confirm.
    LoadSession(String),
    /// Set-context step carrying the context `values`.
    SetContext { values: Value },
    /// Set-actor step carrying the `actor` ID.
    SetActor { actor: String },
    /// Payload-free step; what is cleaned up is decided by the runner and is
    /// not visible in this file.
    CleanupExpired,
}
271
/// Advanced step that runs turns and can save a session.
///
/// Both fields are optional when deserializing.
#[derive(Debug, Clone, Deserialize)]
pub struct RunStep {
    /// Turns executed by this step; empty when omitted.
    #[serde(default)]
    pub turns: Vec<Turn>,
    /// Optional session name saved after a run step.
    #[serde(default)]
    pub save_session: Option<String>,
}
282
/// Boolean or object form for reset-agent steps.
///
/// Untagged: a boolean value deserializes to `Bool`; anything that parses as
/// `ResetOptions` deserializes to `Options`.
#[derive(Debug, Clone, Deserialize)]
#[serde(untagged)]
pub enum ResetStepConfig {
    /// Plain on/off form.
    Bool(bool),
    /// Detailed form carrying explicit reset options.
    Options(ResetOptions),
}
290
/// Top-level result returned by an eval suite run.
#[derive(Debug, Clone, Serialize)]
pub struct EvalResult {
    /// Machine-readable output schema version.
    pub schema_version: u32,
    /// Suite this result belongs to, as a string.
    /// NOTE(review): presumably `EvalSuite::name` — confirm against the runner.
    pub suite: String,
    /// Agent YAML path used for this run, as a string.
    pub agent: String,
    /// Total scenario count for this result.
    pub total: usize,
    /// Number of scenarios that passed.
    pub passed: usize,
    /// Failed or errored count for this result.
    pub failed: usize,
    /// Number of scenarios that were skipped.
    pub skipped: usize,
    /// Duration in milliseconds.
    pub duration_ms: u64,
    /// Per-scenario results.
    pub scenarios: Vec<ScenarioResult>,
    /// Aggregated metrics for the run.
    pub metrics: crate::metrics::EvalMetrics,
    /// Observability report; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub observability: Option<ObservabilityReport>,
}
318
/// Result for one evaluated scenario.
#[derive(Debug, Clone, Serialize)]
pub struct ScenarioResult {
    /// Stable identifier of the scenario.
    pub id: String,
    /// Optional human-readable scenario name.
    pub name: Option<String>,
    /// Tags used by filters and grouped metrics.
    pub tags: Vec<String>,
    /// Optional language label for filtering, metrics, and judge context.
    pub language: Option<String>,
    /// Final status of the scenario.
    pub status: ScenarioStatus,
    /// High-level failure category for metrics, when one applies.
    pub failure_category: Option<FailureCategory>,
    /// Whether the scenario is flagged as flaky.
    /// NOTE(review): presumably set when it passed only after a retry — confirm
    /// against the runner.
    pub flaky: bool,
    /// Attempt results in execution order.
    pub attempts: Vec<AttemptResult>,
    /// Duration in milliseconds.
    pub duration_ms: u64,
    /// Number of retries consumed by this scenario.
    pub retries_used: u32,
}
343
/// Result for one scenario attempt.
#[derive(Debug, Clone, Serialize)]
pub struct AttemptResult {
    /// Zero-based attempt index.
    pub attempt: u32,
    /// Results of the turns recorded for this attempt.
    pub turns: Vec<TurnResult>,
    /// Status of this attempt.
    pub status: ScenarioStatus,
    /// Duration in milliseconds.
    pub duration_ms: u64,
}
356
/// Final status for a scenario result.
///
/// Variant names are serialized in snake_case (`passed`, `failed`, `skipped`,
/// `error`).
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ScenarioStatus {
    /// The scenario passed.
    Passed,
    /// The scenario failed; `reason` describes why.
    Failed { reason: String },
    /// The scenario was skipped, optionally with a reason.
    Skipped { reason: Option<String> },
    /// The scenario ended in an error; `message` carries the error text.
    Error { message: String },
}
366
367impl ScenarioStatus {
368    pub fn is_passed(&self) -> bool {
369        matches!(self, Self::Passed)
370    }
371
372    pub fn is_failed(&self) -> bool {
373        matches!(self, Self::Failed { .. })
374    }
375
376    pub fn is_error(&self) -> bool {
377        matches!(self, Self::Error { .. })
378    }
379}
380
/// High-level failure category used by metrics and reports.
///
/// Variant names are serialized in snake_case. Which category is assigned to a
/// result is decided outside this file.
#[derive(Debug, Clone, Serialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "snake_case")]
pub enum FailureCategory {
    /// Failure attributed to configuration.
    ConfigError,
    /// Failure attributed to the runtime.
    RuntimeError,
    /// One or more assertions did not hold.
    AssertionFailed,
    /// Failure attributed to the judge.
    JudgeError,
    /// NOTE(review): presumably a pass that needed a retry (see
    /// `ScenarioResult::flaky`) — confirm against the runner.
    FlakyPass,
}
391
/// Result for one evaluated turn.
#[derive(Debug, Clone, Serialize)]
pub struct TurnResult {
    /// Zero-based turn index within the scenario.
    pub index: usize,
    /// User input sent to the runtime.
    pub input: RedactedString,
    /// Assistant response text or redacted output value.
    pub response: RedactedString,
    /// State name recorded for this turn, if any.
    pub state: Option<String>,
    /// Optional response or tool metadata; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<Value>,
    /// Full assertion-time evidence for this turn. Never serialized.
    #[serde(skip_serializing)]
    pub evidence: TurnEvidence,
    /// Assertion details produced for this turn.
    pub assertion_results: Vec<AssertionResultDetail>,
    /// Latency of this turn in milliseconds.
    pub latency_ms: u64,
    /// Optional observability span ID; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub observability_span_id: Option<String>,
}
417
/// Serde default for `EvalSettings::timeout_per_turn_ms`: 30 seconds, in milliseconds.
fn default_turn_timeout() -> u64 {
    const TURN_TIMEOUT_MS: u64 = 30 * 1_000;
    TURN_TIMEOUT_MS
}
421
/// Serde default for `EvalSettings::retry_delay_ms`: one second, in milliseconds.
fn default_retry_delay() -> u64 {
    const RETRY_DELAY_MS: u64 = 1_000;
    RETRY_DELAY_MS
}
425
/// Serde default for `EvalSettings::max_concurrent`.
fn default_max_concurrent() -> usize {
    const MAX_CONCURRENT_SCENARIOS: usize = 4;
    MAX_CONCURRENT_SCENARIOS
}
429
/// Serde default helper for boolean fields that are enabled unless turned off
/// (used by `EvalSettings::redact_outputs`).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
433
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the smallest suite that passes validation: default settings, an
    /// agent path, and one scenario holding one turn.
    fn minimal_suite() -> EvalSuite {
        let greeting = Turn {
            input: String::from("hello"),
            actor: None,
            context: Value::Null,
            stream: None,
            timeout_ms: None,
            assertions: None,
        };
        let scenario = Scenario {
            id: String::from("scenario-1"),
            name: None,
            tags: vec![String::from("smoke")],
            language: Some(String::from("en")),
            actor: None,
            context: Value::Null,
            env: HashMap::new(),
            skip: SkipConfig::default(),
            turns: vec![greeting],
            steps: Vec::new(),
        };
        EvalSuite {
            name: String::from("suite"),
            agent: Some(PathBuf::from("agent.yaml")),
            settings: EvalSettings::default(),
            observability: None,
            fixtures: FixturesConfig::default(),
            scenarios: vec![scenario],
        }
    }

    /// Validates without a CLI agent and returns the error text; panics if
    /// validation unexpectedly succeeds.
    fn validation_error(candidate: &EvalSuite) -> String {
        candidate.validate(None).unwrap_err().to_string()
    }

    #[test]
    fn validation_accepts_minimal_suite() {
        let candidate = minimal_suite();
        assert!(candidate.validate(None).is_ok());
    }

    #[test]
    fn validation_rejects_duplicate_ids() {
        let mut candidate = minimal_suite();
        let duplicate = candidate.scenarios[0].clone();
        candidate.scenarios.push(duplicate);

        let message = validation_error(&candidate);
        assert!(message.contains("duplicate scenario id"));
    }

    #[test]
    fn validation_rejects_parallel_env() {
        let mut candidate = minimal_suite();
        candidate.settings.parallel = true;
        let env = &mut candidate.scenarios[0].env;
        env.insert("TOKEN".to_string(), "secret".to_string());

        let message = validation_error(&candidate);
        assert!(message.contains("scenario.env"));
    }
}
490}