Skip to main content

mdx_rust_core/
eval.rs

1//! Evaluation dataset and scorer metadata.
2//!
3//! These types make experiment records explicit about what was measured and
4//! how. The current scorer is intentionally simple, but the optimizer now has
5//! a stable place to grow policy-aligned and LLM-judge scoring.
6
7use schemars::JsonSchema;
8use serde::{Deserialize, Serialize};
9use std::path::Path;
10
11#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
12pub struct EvaluationSample {
13    pub id: String,
14    pub input: serde_json::Value,
15}
16
17#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
18pub struct EvaluationDataset {
19    pub version: String,
20    pub samples: Vec<EvaluationSample>,
21}
22
23impl EvaluationDataset {
24    pub fn synthetic_v1() -> Self {
25        let samples = (0..5)
26            .map(|i| EvaluationSample {
27                id: format!("synthetic-addition-{i}"),
28                input: serde_json::json!({
29                    "query": format!("What is {} + {}?", i, i + 1),
30                    "context": null
31                }),
32            })
33            .collect();
34
35        Self {
36            version: "synthetic_v1".to_string(),
37            samples,
38        }
39    }
40
41    pub fn content_hash(&self) -> String {
42        let bytes = serde_json::to_vec(self).unwrap_or_default();
43        stable_hash_hex(&bytes)
44    }
45
46    /// Load a dataset from JSON.
47    ///
48    /// Accepted shapes:
49    /// - `{ "version": "...", "samples": [{ "id": "...", "input": {...} }] }`
50    /// - `[{ "id": "...", "input": {...} }]`
51    /// - `[{...}, {...}]` where each object is treated directly as an input.
52    pub fn load_from_path(path: &Path) -> anyhow::Result<Self> {
53        let content = std::fs::read_to_string(path)?;
54
55        if let Ok(dataset) = serde_json::from_str::<EvaluationDataset>(&content) {
56            return Ok(dataset);
57        }
58
59        let value: serde_json::Value = serde_json::from_str(&content)?;
60        let Some(items) = value.as_array() else {
61            anyhow::bail!("dataset must be an EvaluationDataset object or JSON array");
62        };
63
64        let mut samples = Vec::with_capacity(items.len());
65        for (index, item) in items.iter().enumerate() {
66            if let Some(input) = item.get("input") {
67                let id = item
68                    .get("id")
69                    .and_then(|id| id.as_str())
70                    .map(str::to_string)
71                    .unwrap_or_else(|| format!("sample-{index}"));
72                samples.push(EvaluationSample {
73                    id,
74                    input: input.clone(),
75                });
76            } else {
77                samples.push(EvaluationSample {
78                    id: format!("sample-{index}"),
79                    input: item.clone(),
80                });
81            }
82        }
83
84        Ok(Self {
85            version: dataset_version_from_path(path),
86            samples,
87        })
88    }
89}
90
91#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
92pub struct ScorerMetadata {
93    pub id: String,
94    pub version: String,
95}
96
97impl ScorerMetadata {
98    pub fn mechanical_v1() -> Self {
99        Self {
100            id: "mechanical".to_string(),
101            version: "v1".to_string(),
102        }
103    }
104
105    pub fn label(&self) -> String {
106        format!("{}_{}", self.id, self.version)
107    }
108}
109
/// Compute a 64-bit FNV-1a hash of `bytes` and format it as
/// `fnv1a64:` followed by 16 lowercase hexadecimal digits.
pub fn stable_hash_hex(bytes: &[u8]) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;

    // FNV-1a: XOR the byte in first, then multiply by the prime (wrapping).
    let digest = bytes.iter().fold(FNV_OFFSET_BASIS, |acc, &octet| {
        (acc ^ u64::from(octet)).wrapping_mul(FNV_PRIME)
    });

    format!("fnv1a64:{digest:016x}")
}
118
/// Derive a dataset version label of the form `file:<stem>` from `path`.
///
/// Falls back to `file:dataset` when the path has no file stem, the stem is
/// not valid UTF-8, or the stem is empty.
fn dataset_version_from_path(path: &Path) -> String {
    match path.file_stem().and_then(|raw| raw.to_str()) {
        Some(name) if !name.is_empty() => format!("file:{name}"),
        _ => "file:dataset".to_string(),
    }
}
126
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Write `json` to `file_name` inside a fresh temporary directory and
    /// load it back through `EvaluationDataset::load_from_path`.
    fn load_fixture(file_name: &str, json: &str) -> EvaluationDataset {
        let scratch = tempdir().unwrap();
        let fixture = scratch.path().join(file_name);
        std::fs::write(&fixture, json).unwrap();
        EvaluationDataset::load_from_path(&fixture).unwrap()
    }

    #[test]
    fn load_dataset_from_raw_input_array() {
        let loaded = load_fixture(
            "dataset.json",
            r#"[{"query":"hello"},{"query":"world","context":null}]"#,
        );

        assert_eq!(loaded.samples.len(), 2);
        assert_eq!(loaded.samples[0].id, "sample-0");
        assert_eq!(loaded.version, "file:dataset");
    }

    #[test]
    fn load_dataset_from_structured_object() {
        let loaded = load_fixture(
            "evals.json",
            r#"{"version":"v9","samples":[{"id":"a","input":{"query":"hello"}}]}"#,
        );

        assert_eq!(loaded.version, "v9");
        assert_eq!(loaded.samples[0].id, "a");
    }
}