// roder_evals/fixture.rs

use std::path::PathBuf;

use serde::{Deserialize, Serialize};

5#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
6#[serde(rename_all = "camelCase")]
7pub struct EvalFixture {
8    pub id: String,
9    pub title: String,
10    pub prompt: String,
11    #[serde(default)]
12    pub tags: Vec<String>,
13    #[serde(default)]
14    pub workspace: EvalWorkspaceSetup,
15    #[serde(default, skip_serializing_if = "Option::is_none")]
16    pub timeout_ms: Option<u64>,
17    #[serde(default)]
18    pub expected: EvalExpectedEvidence,
19    #[serde(default)]
20    pub constraints: Vec<String>,
21    #[serde(default, skip_serializing_if = "Option::is_none")]
22    pub lazy_discovery: Option<EvalLazyDiscoveryFixture>,
23}
24
25#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
26#[serde(rename_all = "camelCase")]
27pub struct EvalWorkspaceSetup {
28    #[serde(default)]
29    pub files: Vec<EvalWorkspaceFile>,
30    #[serde(default)]
31    pub commands: Vec<String>,
32}
33
34#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
35#[serde(rename_all = "camelCase")]
36pub struct EvalWorkspaceFile {
37    pub path: PathBuf,
38    pub contents: String,
39}
40
41#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
42#[serde(rename_all = "camelCase")]
43pub struct EvalExpectedEvidence {
44    #[serde(default)]
45    pub final_answer_contains: Vec<String>,
46    #[serde(default)]
47    pub files: Vec<EvalExpectedFile>,
48    #[serde(default)]
49    pub command_checks: Vec<EvalExpectedCommand>,
50    #[serde(default)]
51    pub verification_required: bool,
52    #[serde(default)]
53    pub task_ledger_required: bool,
54}
55
56#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
57#[serde(rename_all = "camelCase")]
58pub struct EvalExpectedFile {
59    pub path: PathBuf,
60    #[serde(default = "default_true")]
61    pub exists: bool,
62    #[serde(default)]
63    pub contains: Vec<String>,
64    #[serde(default, skip_serializing_if = "Option::is_none")]
65    pub exact_contents: Option<String>,
66    #[serde(default, skip_serializing_if = "Option::is_none")]
67    pub max_bytes: Option<u64>,
68    #[serde(default, skip_serializing_if = "Option::is_none")]
69    pub allowed_chars: Option<String>,
70    #[serde(default)]
71    pub json_array_fields: Vec<String>,
72}
73
74#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
75#[serde(rename_all = "camelCase")]
76pub struct EvalExpectedCommand {
77    pub command: String,
78    #[serde(default)]
79    pub expected_exit_code: i32,
80    #[serde(default)]
81    pub stdout_contains: Vec<String>,
82    #[serde(default)]
83    pub stderr_contains: Vec<String>,
84}
85
86#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
87#[serde(rename_all = "camelCase")]
88pub struct EvalLazyDiscoveryFixture {
89    pub hidden_deferred_capabilities: u64,
90    #[serde(default)]
91    pub catalog_shape: EvalLazyDiscoveryCatalogShape,
92    #[serde(default)]
93    pub compact_index_contains: Vec<String>,
94    #[serde(default, skip_serializing_if = "Option::is_none")]
95    pub expected_discovery_query: Option<String>,
96    #[serde(default, skip_serializing_if = "Option::is_none")]
97    pub expected_promotion: Option<String>,
98    #[serde(default, skip_serializing_if = "Option::is_none")]
99    pub secondary_expected_promotion: Option<String>,
100    #[serde(default, skip_serializing_if = "Option::is_none")]
101    pub expected_tool_call: Option<String>,
102    #[serde(default)]
103    pub metrics: EvalLazyDiscoveryExpectedMetrics,
104}
105
106#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
107#[serde(rename_all = "camelCase")]
108pub struct EvalLazyDiscoveryCatalogShape {
109    #[serde(default)]
110    pub internal_tools: u64,
111    #[serde(default)]
112    pub mcp_tools: u64,
113    #[serde(default)]
114    pub skills: u64,
115    #[serde(default)]
116    pub plugins: u64,
117}
118
119#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
120#[serde(rename_all = "camelCase")]
121pub struct EvalLazyDiscoveryExpectedMetrics {
122    pub baseline_schema_tokens: u64,
123    pub deferred_prompt_tokens: u64,
124    #[serde(default)]
125    pub expected_promotion_count: u64,
126    #[serde(default)]
127    pub expected_warm_cache_hits: u64,
128    #[serde(default)]
129    pub max_wrong_tool_calls: u64,
130    #[serde(default)]
131    pub max_unknown_tool_calls: u64,
132    #[serde(default)]
133    pub max_calls_before_promotion: u64,
134}
135
/// Serde default provider that yields `true`.
///
/// Referenced by name from `#[serde(default = "default_true")]` so that a
/// boolean field deserializes to `true` (rather than `bool::default()`, which
/// is `false`) when its key is absent.
fn default_true() -> bool {
    true
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A fully populated fixture must survive a JSON serialize/deserialize
    /// round trip unchanged.
    #[test]
    fn eval_fixture_round_trips_workspace_and_expected_checks() {
        let fixture = EvalFixture {
            id: "edit-config".to_string(),
            title: "Edit config".to_string(),
            prompt: "Set retries to 3 and verify tests.".to_string(),
            tags: vec!["tool-calls".to_string()],
            workspace: EvalWorkspaceSetup {
                files: vec![EvalWorkspaceFile {
                    path: PathBuf::from("config.toml"),
                    contents: "retries = 1\n".to_string(),
                }],
                commands: vec!["cargo test".to_string()],
            },
            timeout_ms: Some(30_000),
            expected: EvalExpectedEvidence {
                final_answer_contains: vec!["verified".to_string()],
                files: vec![EvalExpectedFile {
                    path: PathBuf::from("config.toml"),
                    exists: true,
                    contains: vec!["retries = 3".to_string()],
                    exact_contents: None,
                    max_bytes: None,
                    allowed_chars: None,
                    json_array_fields: Vec::new(),
                }],
                command_checks: vec![EvalExpectedCommand {
                    command: "cargo test".to_string(),
                    expected_exit_code: 0,
                    stdout_contains: vec!["test result: ok".to_string()],
                    stderr_contains: Vec::new(),
                }],
                verification_required: true,
                task_ledger_required: true,
            },
            constraints: vec!["do not ask the user".to_string()],
            lazy_discovery: Some(EvalLazyDiscoveryFixture {
                hidden_deferred_capabilities: 32,
                catalog_shape: EvalLazyDiscoveryCatalogShape {
                    internal_tools: 4,
                    mcp_tools: 24,
                    skills: 4,
                    plugins: 0,
                },
                compact_index_contains: vec!["github.issue.search".to_string()],
                expected_discovery_query: Some("github issue".to_string()),
                expected_promotion: Some("github.issue.search".to_string()),
                secondary_expected_promotion: None,
                expected_tool_call: Some("github.issue.search".to_string()),
                metrics: EvalLazyDiscoveryExpectedMetrics {
                    baseline_schema_tokens: 4_600,
                    deferred_prompt_tokens: 780,
                    expected_promotion_count: 1,
                    expected_warm_cache_hits: 0,
                    max_wrong_tool_calls: 0,
                    max_unknown_tool_calls: 0,
                    max_calls_before_promotion: 0,
                },
            }),
        };

        let json = serde_json::to_string(&fixture).unwrap();
        let round_trip: EvalFixture = serde_json::from_str(&json).unwrap();

        assert_eq!(round_trip, fixture);
        assert_eq!(
            round_trip.workspace.files[0].path,
            PathBuf::from("config.toml")
        );
        assert!(round_trip.expected.verification_required);
        assert_eq!(
            round_trip
                .lazy_discovery
                .as_ref()
                .unwrap()
                .hidden_deferred_capabilities,
            32
        );
    }

    /// Keys left out of the JSON must fall back to the serde defaults,
    /// including `exists`, which defaults to `true` rather than `false`.
    #[test]
    fn eval_fixture_applies_defaults_for_omitted_keys() {
        let fixture: EvalFixture =
            serde_json::from_str(r#"{"id":"minimal","title":"Minimal","prompt":"Do nothing."}"#)
                .unwrap();

        assert!(fixture.tags.is_empty());
        assert_eq!(fixture.workspace, EvalWorkspaceSetup::default());
        assert_eq!(fixture.timeout_ms, None);
        assert_eq!(fixture.expected, EvalExpectedEvidence::default());
        assert!(fixture.constraints.is_empty());
        assert_eq!(fixture.lazy_discovery, None);

        let file: EvalExpectedFile = serde_json::from_str(r#"{"path":"notes.txt"}"#).unwrap();
        assert_eq!(file.path, PathBuf::from("notes.txt"));
        assert!(file.exists);
        assert!(file.contains.is_empty());
        assert_eq!(file.exact_contents, None);
    }
}