1use schemars::JsonSchema;
8use serde::{Deserialize, Serialize};
9use std::path::Path;
10use std::time::Duration;
11
12#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
13pub struct EvaluationSample {
14 pub id: String,
15 pub input: serde_json::Value,
16}
17
18#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
19pub struct EvaluationDataset {
20 pub version: String,
21 pub samples: Vec<EvaluationSample>,
22}
23
24impl EvaluationDataset {
25 pub fn synthetic_v1() -> Self {
26 let samples = (0..5)
27 .map(|i| EvaluationSample {
28 id: format!("synthetic-addition-{i}"),
29 input: serde_json::json!({
30 "query": format!("What is {} + {}?", i, i + 1),
31 "context": null
32 }),
33 })
34 .collect();
35
36 Self {
37 version: "synthetic_v1".to_string(),
38 samples,
39 }
40 }
41
42 pub fn content_hash(&self) -> String {
43 let bytes = serde_json::to_vec(self).unwrap_or_default();
44 stable_hash_hex(&bytes)
45 }
46
47 pub fn load_from_path(path: &Path) -> anyhow::Result<Self> {
54 let content = std::fs::read_to_string(path)?;
55
56 if let Ok(dataset) = serde_json::from_str::<EvaluationDataset>(&content) {
57 return Ok(dataset);
58 }
59
60 let value: serde_json::Value = serde_json::from_str(&content)?;
61 let Some(items) = value.as_array() else {
62 anyhow::bail!("dataset must be an EvaluationDataset object or JSON array");
63 };
64
65 let mut samples = Vec::with_capacity(items.len());
66 for (index, item) in items.iter().enumerate() {
67 if let Some(input) = item.get("input") {
68 let id = item
69 .get("id")
70 .and_then(|id| id.as_str())
71 .map(str::to_string)
72 .unwrap_or_else(|| format!("sample-{index}"));
73 samples.push(EvaluationSample {
74 id,
75 input: input.clone(),
76 });
77 } else {
78 samples.push(EvaluationSample {
79 id: format!("sample-{index}"),
80 input: item.clone(),
81 });
82 }
83 }
84
85 Ok(Self {
86 version: dataset_version_from_path(path),
87 samples,
88 })
89 }
90}
91
92#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
93pub struct ScorerMetadata {
94 pub id: String,
95 pub version: String,
96}
97
98#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
99pub struct BehaviorEvalSpec {
100 pub version: String,
101 pub commands: Vec<BehaviorCommand>,
102}
103
104#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
105pub struct BehaviorCommand {
106 pub id: String,
107 pub command: String,
108 #[serde(default)]
109 pub args: Vec<String>,
110 pub cwd: Option<String>,
111 #[serde(default = "default_expect_success")]
112 pub expect_success: bool,
113 #[serde(default)]
114 pub expect_stdout_contains: Vec<String>,
115 #[serde(default)]
116 pub expect_stderr_contains: Vec<String>,
117 pub timeout_seconds: Option<u64>,
118}
119
120#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
121pub struct BehaviorEvalReport {
122 pub schema_version: String,
123 pub spec_path: String,
124 pub spec_hash: String,
125 pub total: usize,
126 pub passed: usize,
127 pub failed: usize,
128 pub command_records: Vec<BehaviorCommandRecord>,
129}
130
131#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
132pub struct BehaviorCommandRecord {
133 pub id: String,
134 pub command: String,
135 pub success: bool,
136 pub timed_out: bool,
137 pub status_code: Option<i32>,
138 pub duration_ms: u64,
139 pub stdout: String,
140 pub stderr: String,
141 pub failure_reason: Option<String>,
142}
143
144impl BehaviorEvalReport {
145 pub fn passed(&self) -> bool {
146 self.failed == 0
147 }
148}
149
150pub fn run_behavior_evals(root: &Path, spec_path: &Path) -> anyhow::Result<BehaviorEvalReport> {
151 let path = if spec_path.is_absolute() {
152 spec_path.to_path_buf()
153 } else {
154 root.join(spec_path)
155 };
156 let content = std::fs::read(&path)?;
157 let spec: BehaviorEvalSpec = serde_json::from_slice(&content)?;
158 let mut command_records = Vec::with_capacity(spec.commands.len());
159
160 for command_spec in &spec.commands {
161 let record = run_behavior_command(root, command_spec);
162 command_records.push(record);
163 }
164
165 let passed = command_records
166 .iter()
167 .filter(|record| record.success)
168 .count();
169 let failed = command_records.len().saturating_sub(passed);
170
171 Ok(BehaviorEvalReport {
172 schema_version: "0.4".to_string(),
173 spec_path: path.display().to_string(),
174 spec_hash: stable_hash_hex(&content),
175 total: command_records.len(),
176 passed,
177 failed,
178 command_records,
179 })
180}
181
182fn run_behavior_command(root: &Path, spec: &BehaviorCommand) -> BehaviorCommandRecord {
183 let started = std::time::Instant::now();
184 if spec.command.trim().is_empty() {
185 return failed_behavior_record(spec, started, false, "command is empty");
186 }
187
188 let cwd = spec
189 .cwd
190 .as_ref()
191 .map(|cwd| root.join(cwd))
192 .unwrap_or_else(|| root.to_path_buf());
193 if !cwd.is_dir() {
194 return failed_behavior_record(
195 spec,
196 started,
197 false,
198 format!(
199 "cwd does not exist or is not a directory: {}",
200 cwd.display()
201 ),
202 );
203 }
204
205 let mut command = std::process::Command::new(&spec.command);
206 command.args(&spec.args);
207 command.current_dir(&cwd);
208
209 let timeout = Duration::from_secs(spec.timeout_seconds.unwrap_or(120));
210 let Some(output) = mdx_rust_analysis::editing::run_command_with_timeout(&mut command, timeout)
211 else {
212 return failed_behavior_record(
213 spec,
214 started,
215 false,
216 "command could not be started or observed",
217 );
218 };
219
220 let mut failure_reason = None;
221 if output.success() != spec.expect_success {
222 failure_reason = Some(format!(
223 "expected success={} but command success={}",
224 spec.expect_success,
225 output.success()
226 ));
227 }
228 if failure_reason.is_none() {
229 if let Some(missing) = spec
230 .expect_stdout_contains
231 .iter()
232 .find(|needle| !output.stdout.contains(*needle))
233 {
234 failure_reason = Some(format!("stdout did not contain {missing:?}"));
235 }
236 }
237 if failure_reason.is_none() {
238 if let Some(missing) = spec
239 .expect_stderr_contains
240 .iter()
241 .find(|needle| !output.stderr.contains(*needle))
242 {
243 failure_reason = Some(format!("stderr did not contain {missing:?}"));
244 }
245 }
246
247 BehaviorCommandRecord {
248 id: spec.id.clone(),
249 command: command_label(spec),
250 success: failure_reason.is_none(),
251 timed_out: output.timed_out,
252 status_code: output.status.and_then(|status| status.code()),
253 duration_ms: output.duration_ms,
254 stdout: output.stdout,
255 stderr: output.stderr,
256 failure_reason,
257 }
258}
259
260fn failed_behavior_record(
261 spec: &BehaviorCommand,
262 started: std::time::Instant,
263 timed_out: bool,
264 reason: impl Into<String>,
265) -> BehaviorCommandRecord {
266 BehaviorCommandRecord {
267 id: spec.id.clone(),
268 command: command_label(spec),
269 success: false,
270 timed_out,
271 status_code: None,
272 duration_ms: elapsed_millis_u64(started),
273 stdout: String::new(),
274 stderr: String::new(),
275 failure_reason: Some(reason.into()),
276 }
277}
278
279fn command_label(spec: &BehaviorCommand) -> String {
280 std::iter::once(spec.command.as_str())
281 .chain(spec.args.iter().map(String::as_str))
282 .collect::<Vec<_>>()
283 .join(" ")
284}
285
/// Returns the whole milliseconds elapsed since `started`, saturating at `u64::MAX`.
fn elapsed_millis_u64(started: std::time::Instant) -> u64 {
    let millis: u128 = started.elapsed().as_millis();
    u64::try_from(millis).unwrap_or(u64::MAX)
}
289
/// Serde default for `BehaviorCommand::expect_success`: commands are expected
/// to succeed unless the spec says otherwise.
fn default_expect_success() -> bool {
    true
}
293
294impl ScorerMetadata {
295 pub fn mechanical_v1() -> Self {
296 Self {
297 id: "mechanical".to_string(),
298 version: "v1".to_string(),
299 }
300 }
301
302 pub fn label(&self) -> String {
303 format!("{}_{}", self.id, self.version)
304 }
305}
306
/// Hashes `bytes` with 64-bit FNV-1a and renders the digest as
/// `fnv1a64:<16 lowercase hex digits>`.
///
/// The digest depends only on the input bytes, so it is stable across runs
/// and platforms. It is not a cryptographic hash.
pub fn stable_hash_hex(bytes: &[u8]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0100_0000_01b3;

    // FNV-1a: xor the byte in first, then multiply by the prime (wrapping).
    let digest = bytes.iter().fold(FNV_OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });

    format!("fnv1a64:{digest:016x}")
}
315
/// Derives a dataset version label from a file path.
///
/// Returns `file:<stem>` where `<stem>` is the file name without its last
/// extension, or `file:dataset` when the path has no file stem, the stem is
/// empty, or the stem is not valid UTF-8.
fn dataset_version_from_path(path: &Path) -> String {
    match path.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) if !stem.is_empty() => format!("file:{stem}"),
        _ => "file:dataset".to_string(),
    }
}
323
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Writes `contents` to a file called `name` inside a fresh temporary
    /// directory. The directory guard is returned so the file outlives the call.
    fn write_fixture(name: &str, contents: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempdir().unwrap();
        let path = dir.path().join(name);
        std::fs::write(&path, contents).unwrap();
        (dir, path)
    }

    /// A bare JSON array is loaded as raw inputs with generated ids and a
    /// version derived from the file name.
    #[test]
    fn load_dataset_from_raw_input_array() {
        let (_dir, path) = write_fixture(
            "dataset.json",
            r#"[{"query":"hello"},{"query":"world","context":null}]"#,
        );

        let dataset = EvaluationDataset::load_from_path(&path).unwrap();

        assert_eq!(dataset.samples.len(), 2);
        assert_eq!(dataset.samples[0].id, "sample-0");
        assert_eq!(dataset.version, "file:dataset");
    }

    /// A full dataset object keeps its own version and sample ids.
    #[test]
    fn load_dataset_from_structured_object() {
        let (_dir, path) = write_fixture(
            "evals.json",
            r#"{"version":"v9","samples":[{"id":"a","input":{"query":"hello"}}]}"#,
        );

        let dataset = EvaluationDataset::load_from_path(&path).unwrap();

        assert_eq!(dataset.version, "v9");
        assert_eq!(dataset.samples[0].id, "a");
    }

    /// A well-formed spec runs its command and reports it as passed.
    #[test]
    fn behavior_eval_runs_command_specs() {
        let (dir, spec_path) = write_fixture(
            "evals.json",
            r#"{
  "version": "v1",
  "commands": [
    {
      "id": "hello",
      "command": "cargo",
      "args": ["--version"],
      "expect_stdout_contains": ["cargo"],
      "timeout_seconds": 30
    }
  ]
}"#,
        );

        let report = run_behavior_evals(dir.path(), &spec_path).unwrap();

        assert!(report.passed());
        assert_eq!(report.total, 1);
        assert_eq!(report.command_records[0].id, "hello");
    }

    /// An empty command is reported as a failed record, not as an error or timeout.
    #[test]
    fn behavior_eval_reports_malformed_commands_without_process_errors() {
        let (dir, spec_path) = write_fixture(
            "evals.json",
            r#"{
  "version": "v1",
  "commands": [
    {
      "id": "empty",
      "command": "",
      "timeout_seconds": 30
    }
  ]
}"#,
        );

        let report = run_behavior_evals(dir.path(), &spec_path).unwrap();
        let record = &report.command_records[0];

        assert!(!report.passed());
        assert_eq!(report.failed, 1);
        assert_eq!(record.failure_reason.as_deref(), Some("command is empty"));
        assert!(!record.timed_out);
    }
}