1use std::path::PathBuf;
2
3use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
6#[serde(rename_all = "camelCase")]
7pub struct EvalFixture {
8 pub id: String,
9 pub title: String,
10 pub prompt: String,
11 #[serde(default)]
12 pub tags: Vec<String>,
13 #[serde(default)]
14 pub workspace: EvalWorkspaceSetup,
15 #[serde(default, skip_serializing_if = "Option::is_none")]
16 pub timeout_ms: Option<u64>,
17 #[serde(default)]
18 pub expected: EvalExpectedEvidence,
19 #[serde(default)]
20 pub constraints: Vec<String>,
21 #[serde(default, skip_serializing_if = "Option::is_none")]
22 pub lazy_discovery: Option<EvalLazyDiscoveryFixture>,
23}
24
25#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
26#[serde(rename_all = "camelCase")]
27pub struct EvalWorkspaceSetup {
28 #[serde(default)]
29 pub files: Vec<EvalWorkspaceFile>,
30 #[serde(default)]
31 pub commands: Vec<String>,
32}
33
34#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
35#[serde(rename_all = "camelCase")]
36pub struct EvalWorkspaceFile {
37 pub path: PathBuf,
38 pub contents: String,
39}
40
41#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
42#[serde(rename_all = "camelCase")]
43pub struct EvalExpectedEvidence {
44 #[serde(default)]
45 pub final_answer_contains: Vec<String>,
46 #[serde(default)]
47 pub files: Vec<EvalExpectedFile>,
48 #[serde(default)]
49 pub command_checks: Vec<EvalExpectedCommand>,
50 #[serde(default)]
51 pub verification_required: bool,
52 #[serde(default)]
53 pub task_ledger_required: bool,
54}
55
56#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
57#[serde(rename_all = "camelCase")]
58pub struct EvalExpectedFile {
59 pub path: PathBuf,
60 #[serde(default = "default_true")]
61 pub exists: bool,
62 #[serde(default)]
63 pub contains: Vec<String>,
64 #[serde(default, skip_serializing_if = "Option::is_none")]
65 pub exact_contents: Option<String>,
66 #[serde(default, skip_serializing_if = "Option::is_none")]
67 pub max_bytes: Option<u64>,
68 #[serde(default, skip_serializing_if = "Option::is_none")]
69 pub allowed_chars: Option<String>,
70 #[serde(default)]
71 pub json_array_fields: Vec<String>,
72}
73
74#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
75#[serde(rename_all = "camelCase")]
76pub struct EvalExpectedCommand {
77 pub command: String,
78 #[serde(default)]
79 pub expected_exit_code: i32,
80 #[serde(default)]
81 pub stdout_contains: Vec<String>,
82 #[serde(default)]
83 pub stderr_contains: Vec<String>,
84}
85
86#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
87#[serde(rename_all = "camelCase")]
88pub struct EvalLazyDiscoveryFixture {
89 pub hidden_deferred_capabilities: u64,
90 #[serde(default)]
91 pub catalog_shape: EvalLazyDiscoveryCatalogShape,
92 #[serde(default)]
93 pub compact_index_contains: Vec<String>,
94 #[serde(default, skip_serializing_if = "Option::is_none")]
95 pub expected_discovery_query: Option<String>,
96 #[serde(default, skip_serializing_if = "Option::is_none")]
97 pub expected_promotion: Option<String>,
98 #[serde(default, skip_serializing_if = "Option::is_none")]
99 pub secondary_expected_promotion: Option<String>,
100 #[serde(default, skip_serializing_if = "Option::is_none")]
101 pub expected_tool_call: Option<String>,
102 #[serde(default)]
103 pub metrics: EvalLazyDiscoveryExpectedMetrics,
104}
105
106#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
107#[serde(rename_all = "camelCase")]
108pub struct EvalLazyDiscoveryCatalogShape {
109 #[serde(default)]
110 pub internal_tools: u64,
111 #[serde(default)]
112 pub mcp_tools: u64,
113 #[serde(default)]
114 pub skills: u64,
115 #[serde(default)]
116 pub plugins: u64,
117}
118
119#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
120#[serde(rename_all = "camelCase")]
121pub struct EvalLazyDiscoveryExpectedMetrics {
122 pub baseline_schema_tokens: u64,
123 pub deferred_prompt_tokens: u64,
124 #[serde(default)]
125 pub expected_promotion_count: u64,
126 #[serde(default)]
127 pub expected_warm_cache_hits: u64,
128 #[serde(default)]
129 pub max_wrong_tool_calls: u64,
130 #[serde(default)]
131 pub max_unknown_tool_calls: u64,
132 #[serde(default)]
133 pub max_calls_before_promotion: u64,
134}
135
/// Serde `default` hook that yields `true`.
///
/// Referenced by name from `#[serde(default = "default_true")]`, so a missing
/// boolean key deserializes as `true` instead of `bool::default()`.
fn default_true() -> bool {
    true
}
139
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fixture with every section populated so the tests can share it.
    fn fully_populated_fixture() -> EvalFixture {
        EvalFixture {
            id: "edit-config".to_string(),
            title: "Edit config".to_string(),
            prompt: "Set retries to 3 and verify tests.".to_string(),
            tags: vec!["tool-calls".to_string()],
            workspace: EvalWorkspaceSetup {
                files: vec![EvalWorkspaceFile {
                    path: PathBuf::from("config.toml"),
                    contents: "retries = 1\n".to_string(),
                }],
                commands: vec!["cargo test".to_string()],
            },
            timeout_ms: Some(30_000),
            expected: EvalExpectedEvidence {
                final_answer_contains: vec!["verified".to_string()],
                files: vec![EvalExpectedFile {
                    path: PathBuf::from("config.toml"),
                    exists: true,
                    contains: vec!["retries = 3".to_string()],
                    exact_contents: None,
                    max_bytes: None,
                    allowed_chars: None,
                    json_array_fields: Vec::new(),
                }],
                command_checks: vec![EvalExpectedCommand {
                    command: "cargo test".to_string(),
                    expected_exit_code: 0,
                    stdout_contains: vec!["test result: ok".to_string()],
                    stderr_contains: Vec::new(),
                }],
                verification_required: true,
                task_ledger_required: true,
            },
            constraints: vec!["do not ask the user".to_string()],
            lazy_discovery: Some(EvalLazyDiscoveryFixture {
                hidden_deferred_capabilities: 32,
                catalog_shape: EvalLazyDiscoveryCatalogShape {
                    internal_tools: 4,
                    mcp_tools: 24,
                    skills: 4,
                    plugins: 0,
                },
                compact_index_contains: vec!["github.issue.search".to_string()],
                expected_discovery_query: Some("github issue".to_string()),
                expected_promotion: Some("github.issue.search".to_string()),
                secondary_expected_promotion: None,
                expected_tool_call: Some("github.issue.search".to_string()),
                metrics: EvalLazyDiscoveryExpectedMetrics {
                    baseline_schema_tokens: 4_600,
                    deferred_prompt_tokens: 780,
                    expected_promotion_count: 1,
                    expected_warm_cache_hits: 0,
                    max_wrong_tool_calls: 0,
                    max_unknown_tool_calls: 0,
                    max_calls_before_promotion: 0,
                },
            }),
        }
    }

    #[test]
    fn eval_fixture_round_trips_workspace_and_expected_checks() {
        let fixture = fully_populated_fixture();

        let json = serde_json::to_string(&fixture).unwrap();
        let round_trip: EvalFixture = serde_json::from_str(&json).unwrap();

        assert_eq!(round_trip, fixture);
        assert_eq!(
            round_trip.workspace.files[0].path,
            PathBuf::from("config.toml")
        );
        assert!(round_trip.expected.verification_required);
        assert_eq!(
            round_trip
                .lazy_discovery
                .as_ref()
                .unwrap()
                .hidden_deferred_capabilities,
            32
        );
    }

    #[test]
    fn eval_fixture_fills_defaults_for_omitted_fields() {
        // Only the three required keys are supplied; everything else must default.
        let fixture: EvalFixture =
            serde_json::from_str(r#"{"id":"minimal","title":"Minimal","prompt":"Do nothing."}"#)
                .unwrap();

        assert_eq!(
            fixture,
            EvalFixture {
                id: "minimal".to_string(),
                title: "Minimal".to_string(),
                prompt: "Do nothing.".to_string(),
                tags: Vec::new(),
                workspace: EvalWorkspaceSetup::default(),
                timeout_ms: None,
                expected: EvalExpectedEvidence::default(),
                constraints: Vec::new(),
                lazy_discovery: None,
            }
        );
    }

    #[test]
    fn expected_file_defaults_exists_to_true() {
        let file: EvalExpectedFile = serde_json::from_str(r#"{"path":"out.txt"}"#).unwrap();

        assert_eq!(file.path, PathBuf::from("out.txt"));
        // `exists` uses the `default_true` hook rather than `bool::default()`.
        assert!(file.exists);
        assert!(file.contains.is_empty());
        assert_eq!(file.exact_contents, None);
        assert_eq!(file.max_bytes, None);
        assert_eq!(file.allowed_chars, None);
        assert!(file.json_array_fields.is_empty());
    }

    #[test]
    fn expected_command_defaults_exit_code_to_zero() {
        let check: EvalExpectedCommand =
            serde_json::from_str(r#"{"command":"cargo test"}"#).unwrap();

        assert_eq!(check.command, "cargo test");
        assert_eq!(check.expected_exit_code, 0);
        assert!(check.stdout_contains.is_empty());
        assert!(check.stderr_contains.is_empty());
    }

    #[test]
    fn serialization_uses_camel_case_and_omits_unset_options() {
        let mut fixture = fully_populated_fixture();
        let value = serde_json::to_value(&fixture).unwrap();

        // Keys are renamed to camelCase; the snake_case spelling must not appear.
        assert_eq!(value["timeoutMs"].as_u64(), Some(30_000));
        assert!(value.get("timeout_ms").is_none());
        assert_eq!(
            value["expected"]["verificationRequired"].as_bool(),
            Some(true)
        );
        assert_eq!(
            value["lazyDiscovery"]["hiddenDeferredCapabilities"].as_u64(),
            Some(32)
        );
        // A `None` option nested inside a present section is skipped.
        assert!(value["lazyDiscovery"]
            .get("secondaryExpectedPromotion")
            .is_none());

        // Top-level `None` options are skipped as well.
        fixture.timeout_ms = None;
        fixture.lazy_discovery = None;
        let value = serde_json::to_value(&fixture).unwrap();
        assert!(value.get("timeoutMs").is_none());
        assert!(value.get("lazyDiscovery").is_none());
    }
}