Skip to main content

mdx_rust_core/
eval.rs

//! Evaluation datasets, scorer metadata, and command-based behavior evals.
2//!
3//! These types make experiment records explicit about what was measured and
4//! how. The current scorer is intentionally simple, but the optimizer now has
5//! a stable place to grow policy-aligned and LLM-judge scoring.
6
7use schemars::JsonSchema;
8use serde::{Deserialize, Serialize};
9use std::path::Path;
10use std::time::Duration;
11
// A single evaluation input: a sample identifier plus its JSON payload.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct EvaluationSample {
    // Identifier for the sample. `EvaluationDataset::load_from_path` generates
    // `sample-{index}` when the source file does not supply a string id.
    pub id: String,
    // Input payload, kept as an arbitrary JSON value.
    pub input: serde_json::Value,
}
17
// A versioned collection of evaluation samples.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct EvaluationDataset {
    // Dataset label: `synthetic_v1` for the built-in dataset, the value from
    // the file for structured datasets, or `file:<stem>` for bare arrays.
    pub version: String,
    // Samples in the order they were generated or read.
    pub samples: Vec<EvaluationSample>,
}
23
24impl EvaluationDataset {
25    pub fn synthetic_v1() -> Self {
26        let samples = (0..5)
27            .map(|i| EvaluationSample {
28                id: format!("synthetic-addition-{i}"),
29                input: serde_json::json!({
30                    "query": format!("What is {} + {}?", i, i + 1),
31                    "context": null
32                }),
33            })
34            .collect();
35
36        Self {
37            version: "synthetic_v1".to_string(),
38            samples,
39        }
40    }
41
42    pub fn content_hash(&self) -> String {
43        let bytes = serde_json::to_vec(self).unwrap_or_default();
44        stable_hash_hex(&bytes)
45    }
46
47    /// Load a dataset from JSON.
48    ///
49    /// Accepted shapes:
50    /// - `{ "version": "...", "samples": [{ "id": "...", "input": {...} }] }`
51    /// - `[{ "id": "...", "input": {...} }]`
52    /// - `[{...}, {...}]` where each object is treated directly as an input.
53    pub fn load_from_path(path: &Path) -> anyhow::Result<Self> {
54        let content = std::fs::read_to_string(path)?;
55
56        if let Ok(dataset) = serde_json::from_str::<EvaluationDataset>(&content) {
57            return Ok(dataset);
58        }
59
60        let value: serde_json::Value = serde_json::from_str(&content)?;
61        let Some(items) = value.as_array() else {
62            anyhow::bail!("dataset must be an EvaluationDataset object or JSON array");
63        };
64
65        let mut samples = Vec::with_capacity(items.len());
66        for (index, item) in items.iter().enumerate() {
67            if let Some(input) = item.get("input") {
68                let id = item
69                    .get("id")
70                    .and_then(|id| id.as_str())
71                    .map(str::to_string)
72                    .unwrap_or_else(|| format!("sample-{index}"));
73                samples.push(EvaluationSample {
74                    id,
75                    input: input.clone(),
76                });
77            } else {
78                samples.push(EvaluationSample {
79                    id: format!("sample-{index}"),
80                    input: item.clone(),
81                });
82            }
83        }
84
85        Ok(Self {
86            version: dataset_version_from_path(path),
87            samples,
88        })
89    }
90}
91
// Identifies which scorer, and which version of it, produced a measurement.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ScorerMetadata {
    // Scorer name, e.g. `mechanical`.
    pub id: String,
    // Scorer version, e.g. `v1`.
    pub version: String,
}
97
// On-disk description of a behavior eval: a list of commands to run together
// with the outcome each one is expected to produce.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct BehaviorEvalSpec {
    // Spec version label; not interpreted by the code in this file.
    pub version: String,
    // Commands to execute, run in this order by `run_behavior_evals`.
    pub commands: Vec<BehaviorCommand>,
}
103
// One command in a behavior eval spec plus the expectations checked against
// its result.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct BehaviorCommand {
    // Identifier copied into the resulting record.
    pub id: String,
    // Program to execute; an empty or whitespace-only value is reported as a
    // failure without spawning anything.
    pub command: String,
    // Arguments passed to the program; empty when omitted from the spec.
    #[serde(default)]
    pub args: Vec<String>,
    // Working directory, joined onto the eval root. The root itself is used
    // when this is absent.
    pub cwd: Option<String>,
    // Whether the command is expected to succeed; `true` when omitted.
    #[serde(default = "default_expect_success")]
    pub expect_success: bool,
    // Substrings that must all appear in the command's stdout.
    #[serde(default)]
    pub expect_stdout_contains: Vec<String>,
    // Substrings that must all appear in the command's stderr.
    #[serde(default)]
    pub expect_stderr_contains: Vec<String>,
    // Timeout in seconds; `run_behavior_command` uses 120 when absent.
    pub timeout_seconds: Option<u64>,
}
119
// Aggregate result of running every command in a behavior eval spec.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct BehaviorEvalReport {
    // Version of this report format; `run_behavior_evals` writes "0.4".
    pub schema_version: String,
    // Display form of the resolved spec path.
    pub spec_path: String,
    // `stable_hash_hex` of the raw spec file bytes.
    pub spec_hash: String,
    // Number of commands that were recorded.
    pub total: usize,
    // Number of records whose `success` flag is set.
    pub passed: usize,
    // Number of records whose `success` flag is not set.
    pub failed: usize,
    // Per-command results, in spec order.
    pub command_records: Vec<BehaviorCommandRecord>,
}
130
// Outcome of a single behavior command.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct BehaviorCommandRecord {
    // Identifier copied from the command spec.
    pub id: String,
    // Human-readable label: the program followed by its arguments, separated
    // by single spaces.
    pub command: String,
    // True when no failure reason was recorded for the command.
    pub success: bool,
    // Whether the command was reported as having timed out.
    pub timed_out: bool,
    // Process exit code when one is available.
    pub status_code: Option<i32>,
    // Duration in milliseconds.
    pub duration_ms: u64,
    // Captured standard output; empty when the command was never run.
    pub stdout: String,
    // Captured standard error; empty when the command was never run.
    pub stderr: String,
    // Why the command was judged a failure; `None` on success.
    pub failure_reason: Option<String>,
}
143
impl BehaviorEvalReport {
    /// Returns `true` when the report's `failed` count is zero.
    ///
    /// Note that a report with no commands at all therefore also passes.
    pub fn passed(&self) -> bool {
        self.failed == 0
    }
}
149
150pub fn run_behavior_evals(root: &Path, spec_path: &Path) -> anyhow::Result<BehaviorEvalReport> {
151    let path = if spec_path.is_absolute() {
152        spec_path.to_path_buf()
153    } else {
154        root.join(spec_path)
155    };
156    let content = std::fs::read(&path)?;
157    let spec: BehaviorEvalSpec = serde_json::from_slice(&content)?;
158    let mut command_records = Vec::with_capacity(spec.commands.len());
159
160    for command_spec in &spec.commands {
161        let record = run_behavior_command(root, command_spec);
162        command_records.push(record);
163    }
164
165    let passed = command_records
166        .iter()
167        .filter(|record| record.success)
168        .count();
169    let failed = command_records.len().saturating_sub(passed);
170
171    Ok(BehaviorEvalReport {
172        schema_version: "0.4".to_string(),
173        spec_path: path.display().to_string(),
174        spec_hash: stable_hash_hex(&content),
175        total: command_records.len(),
176        passed,
177        failed,
178        command_records,
179    })
180}
181
182fn run_behavior_command(root: &Path, spec: &BehaviorCommand) -> BehaviorCommandRecord {
183    let started = std::time::Instant::now();
184    if spec.command.trim().is_empty() {
185        return failed_behavior_record(spec, started, false, "command is empty");
186    }
187
188    let cwd = spec
189        .cwd
190        .as_ref()
191        .map(|cwd| root.join(cwd))
192        .unwrap_or_else(|| root.to_path_buf());
193    if !cwd.is_dir() {
194        return failed_behavior_record(
195            spec,
196            started,
197            false,
198            format!(
199                "cwd does not exist or is not a directory: {}",
200                cwd.display()
201            ),
202        );
203    }
204
205    let mut command = std::process::Command::new(&spec.command);
206    command.args(&spec.args);
207    command.current_dir(&cwd);
208
209    let timeout = Duration::from_secs(spec.timeout_seconds.unwrap_or(120));
210    let Some(output) = mdx_rust_analysis::editing::run_command_with_timeout(&mut command, timeout)
211    else {
212        return failed_behavior_record(
213            spec,
214            started,
215            false,
216            "command could not be started or observed",
217        );
218    };
219
220    let mut failure_reason = None;
221    if output.success() != spec.expect_success {
222        failure_reason = Some(format!(
223            "expected success={} but command success={}",
224            spec.expect_success,
225            output.success()
226        ));
227    }
228    if failure_reason.is_none() {
229        if let Some(missing) = spec
230            .expect_stdout_contains
231            .iter()
232            .find(|needle| !output.stdout.contains(*needle))
233        {
234            failure_reason = Some(format!("stdout did not contain {missing:?}"));
235        }
236    }
237    if failure_reason.is_none() {
238        if let Some(missing) = spec
239            .expect_stderr_contains
240            .iter()
241            .find(|needle| !output.stderr.contains(*needle))
242        {
243            failure_reason = Some(format!("stderr did not contain {missing:?}"));
244        }
245    }
246
247    BehaviorCommandRecord {
248        id: spec.id.clone(),
249        command: command_label(spec),
250        success: failure_reason.is_none(),
251        timed_out: output.timed_out,
252        status_code: output.status.and_then(|status| status.code()),
253        duration_ms: output.duration_ms,
254        stdout: output.stdout,
255        stderr: output.stderr,
256        failure_reason,
257    }
258}
259
260fn failed_behavior_record(
261    spec: &BehaviorCommand,
262    started: std::time::Instant,
263    timed_out: bool,
264    reason: impl Into<String>,
265) -> BehaviorCommandRecord {
266    BehaviorCommandRecord {
267        id: spec.id.clone(),
268        command: command_label(spec),
269        success: false,
270        timed_out,
271        status_code: None,
272        duration_ms: elapsed_millis_u64(started),
273        stdout: String::new(),
274        stderr: String::new(),
275        failure_reason: Some(reason.into()),
276    }
277}
278
279fn command_label(spec: &BehaviorCommand) -> String {
280    std::iter::once(spec.command.as_str())
281        .chain(spec.args.iter().map(String::as_str))
282        .collect::<Vec<_>>()
283        .join(" ")
284}
285
/// Milliseconds elapsed since `started`, clamped to `u64::MAX` if the value
/// does not fit in a `u64`.
fn elapsed_millis_u64(started: std::time::Instant) -> u64 {
    let millis: u128 = started.elapsed().as_millis();
    u64::try_from(millis).unwrap_or(u64::MAX)
}
289
/// Serde default for `BehaviorCommand::expect_success`: a command is expected
/// to succeed unless the spec says otherwise.
fn default_expect_success() -> bool {
    true
}
293
294impl ScorerMetadata {
295    pub fn mechanical_v1() -> Self {
296        Self {
297            id: "mechanical".to_string(),
298            version: "v1".to_string(),
299        }
300    }
301
302    pub fn label(&self) -> String {
303        format!("{}_{}", self.id, self.version)
304    }
305}
306
/// Hash `bytes` with 64-bit FNV-1a and format the digest as
/// `fnv1a64:` followed by 16 lowercase hex digits.
///
/// The result depends only on the input bytes, which makes it suitable as a
/// stable content fingerprint. It is not a cryptographic hash.
pub fn stable_hash_hex(bytes: &[u8]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a: XOR the byte in first, then multiply by the prime.
    let digest = bytes.iter().fold(FNV_OFFSET_BASIS, |state, &byte| {
        (state ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });

    format!("fnv1a64:{digest:016x}")
}
315
/// Derive a dataset version label from a file path.
///
/// Returns `file:<stem>` when the path has a non-empty UTF-8 file stem, and
/// `file:dataset` otherwise.
fn dataset_version_from_path(path: &Path) -> String {
    match path.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) if !stem.is_empty() => format!("file:{stem}"),
        _ => "file:dataset".to_string(),
    }
}
323
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Write `body` to the file `name` inside `dir` and return the file's path.
    fn write_fixture(dir: &std::path::Path, name: &str, body: &str) -> std::path::PathBuf {
        let target = dir.join(name);
        std::fs::write(&target, body).unwrap();
        target
    }

    /// A bare array of objects is loaded as raw inputs with generated ids and
    /// a version derived from the file name.
    #[test]
    fn load_dataset_from_raw_input_array() {
        let workspace = tempdir().unwrap();
        let dataset_path = write_fixture(
            workspace.path(),
            "dataset.json",
            r#"[{"query":"hello"},{"query":"world","context":null}]"#,
        );

        let loaded = EvaluationDataset::load_from_path(&dataset_path).unwrap();

        assert_eq!(loaded.samples.len(), 2);
        assert_eq!(loaded.samples[0].id, "sample-0");
        assert_eq!(loaded.version, "file:dataset");
    }

    /// A structured dataset object keeps its own version and sample ids.
    #[test]
    fn load_dataset_from_structured_object() {
        let workspace = tempdir().unwrap();
        let dataset_path = write_fixture(
            workspace.path(),
            "evals.json",
            r#"{"version":"v9","samples":[{"id":"a","input":{"query":"hello"}}]}"#,
        );

        let loaded = EvaluationDataset::load_from_path(&dataset_path).unwrap();

        assert_eq!(loaded.version, "v9");
        assert_eq!(loaded.samples[0].id, "a");
    }

    /// A command that succeeds and prints the expected text passes.
    #[test]
    fn behavior_eval_runs_command_specs() {
        let workspace = tempdir().unwrap();
        let spec_file = write_fixture(
            workspace.path(),
            "evals.json",
            r#"{
  "version": "v1",
  "commands": [
    {
      "id": "hello",
      "command": "cargo",
      "args": ["--version"],
      "expect_stdout_contains": ["cargo"],
      "timeout_seconds": 30
    }
  ]
}"#,
        );

        let summary = run_behavior_evals(workspace.path(), &spec_file).unwrap();

        assert!(summary.passed());
        assert_eq!(summary.total, 1);
        assert_eq!(summary.command_records[0].id, "hello");
    }

    /// An empty command is reported as a failed record rather than surfacing
    /// as an error from the runner.
    #[test]
    fn behavior_eval_reports_malformed_commands_without_process_errors() {
        let workspace = tempdir().unwrap();
        let spec_file = write_fixture(
            workspace.path(),
            "evals.json",
            r#"{
  "version": "v1",
  "commands": [
    {
      "id": "empty",
      "command": "",
      "timeout_seconds": 30
    }
  ]
}"#,
        );

        let summary = run_behavior_evals(workspace.path(), &spec_file).unwrap();
        let first = &summary.command_records[0];

        assert!(!summary.passed());
        assert_eq!(summary.failed, 1);
        assert_eq!(first.failure_reason.as_deref(), Some("command is empty"));
        assert!(!first.timed_out);
    }
}
419}