zeph_experiments/
benchmark.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark dataset types and TOML loading.
5
6use std::path::Path;
7
8use serde::Deserialize;
9
10use super::error::EvalError;
11
/// Upper bound, in bytes, on the size of a benchmark file (10 MiB).
const MAX_BENCHMARK_SIZE: u64 = 10 << 20;
14
15/// A set of benchmark cases loaded from a TOML file.
16///
17/// # TOML format
18///
19/// ```toml
20/// [[cases]]
21/// prompt = "What is the capital of France?"
22/// reference = "Paris"
23/// tags = ["geography", "factual"]
24///
25/// [[cases]]
26/// prompt = "Explain async/await in Rust."
27/// context = "You are a Rust expert."
28/// ```
29#[derive(Debug, Clone, Deserialize)]
30pub struct BenchmarkSet {
31    pub cases: Vec<BenchmarkCase>,
32}
33
34/// A single benchmark case.
35#[derive(Debug, Clone, Deserialize)]
36pub struct BenchmarkCase {
37    /// The prompt sent to the subject model.
38    pub prompt: String,
39    /// Optional system context for the subject model.
40    #[serde(default)]
41    pub context: Option<String>,
42    /// Optional reference answer for the judge to calibrate scoring.
43    #[serde(default)]
44    pub reference: Option<String>,
45    /// Optional tags for filtering or reporting.
46    #[serde(default)]
47    pub tags: Option<Vec<String>>,
48}
49
50impl BenchmarkSet {
51    /// Load a benchmark set from a TOML file.
52    ///
53    /// Performs size guard (10 MiB limit) and canonicalisation before reading.
54    /// Symlinks that escape the file's parent directory are rejected.
55    ///
56    /// # Errors
57    ///
58    /// Returns [`EvalError::BenchmarkLoad`] if the file cannot be read,
59    /// [`EvalError::BenchmarkParse`] if the TOML is invalid,
60    /// [`EvalError::BenchmarkTooLarge`] if the file exceeds the size limit, or
61    /// [`EvalError::PathTraversal`] if canonicalization reveals a symlink escape.
62    pub fn from_file(path: &Path) -> Result<Self, EvalError> {
63        // Canonicalize to resolve symlinks before opening — eliminates TOCTOU race.
64        let canonical = std::fs::canonicalize(path)
65            .map_err(|e| EvalError::BenchmarkLoad(path.display().to_string(), e))?;
66
67        // Verify the canonical path stays within the parent directory.
68        // This prevents symlinks from escaping into arbitrary filesystem locations.
69        if let Some(parent) = path.parent()
70            && let Ok(canonical_parent) = std::fs::canonicalize(parent)
71            && !canonical.starts_with(&canonical_parent)
72        {
73            return Err(EvalError::PathTraversal(canonical.display().to_string()));
74        }
75
76        // Guard against unbounded memory use from oversized files.
77        let metadata = std::fs::metadata(&canonical)
78            .map_err(|e| EvalError::BenchmarkLoad(canonical.display().to_string(), e))?;
79        if metadata.len() > MAX_BENCHMARK_SIZE {
80            return Err(EvalError::BenchmarkTooLarge {
81                path: canonical.display().to_string(),
82                size: metadata.len(),
83                limit: MAX_BENCHMARK_SIZE,
84            });
85        }
86
87        let content = std::fs::read_to_string(&canonical)
88            .map_err(|e| EvalError::BenchmarkLoad(canonical.display().to_string(), e))?;
89        toml::from_str(&content)
90            .map_err(|e| EvalError::BenchmarkParse(canonical.display().to_string(), e.to_string()))
91    }
92
93    /// Validate that the benchmark set is non-empty.
94    ///
95    /// # Errors
96    ///
97    /// Returns [`EvalError::EmptyBenchmarkSet`] if `cases` is empty.
98    pub fn validate(&self) -> Result<(), EvalError> {
99        if self.cases.is_empty() {
100            return Err(EvalError::EmptyBenchmarkSet);
101        }
102        Ok(())
103    }
104}
105
#[cfg(test)]
mod tests {
    #![allow(clippy::redundant_closure_for_method_calls)]

    use super::*;

    /// Parse a TOML snippet that the calling test expects to be valid.
    fn parse(toml: &str) -> BenchmarkSet {
        toml::from_str(toml).expect("valid TOML")
    }

    #[test]
    fn benchmark_from_toml_happy_path() {
        let toml = r#"
[[cases]]
prompt = "What is 2+2?"
"#;
        let set = parse(toml);
        assert_eq!(set.cases.len(), 1);
        assert_eq!(set.cases[0].prompt, "What is 2+2?");
        assert!(set.cases[0].context.is_none());
        assert!(set.cases[0].reference.is_none());
        assert!(set.cases[0].tags.is_none());
    }

    #[test]
    fn benchmark_from_toml_with_all_fields() {
        let toml = r#"
[[cases]]
prompt = "Explain Rust ownership."
context = "You are a Rust expert."
reference = "Ownership is Rust's memory management model."
tags = ["rust", "concepts"]
"#;
        let set = parse(toml);
        assert_eq!(set.cases.len(), 1);
        let case = &set.cases[0];
        assert_eq!(case.context.as_deref(), Some("You are a Rust expert."));
        assert!(case.reference.is_some());
        assert_eq!(case.tags.as_ref().map(std::vec::Vec::len), Some(2));
    }

    #[test]
    fn benchmark_empty_cases_rejected() {
        let set = BenchmarkSet { cases: vec![] };
        assert!(matches!(set.validate(), Err(EvalError::EmptyBenchmarkSet)));
    }

    #[test]
    fn benchmark_from_file_missing_file() {
        let result = BenchmarkSet::from_file(Path::new("/nonexistent/path/benchmark.toml"));
        assert!(matches!(result, Err(EvalError::BenchmarkLoad(_, _))));
    }

    #[test]
    fn benchmark_from_toml_invalid_syntax() {
        let bad = "[[cases\nprompt = 'unclosed'";
        let result: Result<BenchmarkSet, _> = toml::from_str(bad);
        assert!(result.is_err());
    }

    #[test]
    fn benchmark_from_file_invalid_toml() {
        use std::io::Write;
        let mut f = tempfile::NamedTempFile::new().unwrap();
        writeln!(f, "not valid toml ][[]").unwrap();
        let result = BenchmarkSet::from_file(f.path());
        assert!(matches!(result, Err(EvalError::BenchmarkParse(_, _))));
    }

    #[test]
    fn benchmark_from_file_too_large() {
        // Extend the temp file to one byte past the limit with `set_len`, so the
        // real size guard in `from_file` is exercised without writing 10 MiB of
        // data from the test.
        let f = tempfile::NamedTempFile::new().unwrap();
        f.as_file().set_len(MAX_BENCHMARK_SIZE + 1).unwrap();

        let err = BenchmarkSet::from_file(f.path()).unwrap_err();
        assert!(matches!(
            &err,
            EvalError::BenchmarkTooLarge { size, limit, .. }
                if *size == MAX_BENCHMARK_SIZE + 1 && *limit == MAX_BENCHMARK_SIZE
        ));
        assert!(err.to_string().contains("exceeds size limit"));
    }

    #[test]
    fn benchmark_from_file_size_guard_allows_normal_file() {
        use std::io::Write;
        let mut f = tempfile::NamedTempFile::new().unwrap();
        writeln!(f, "[[cases]]\nprompt = \"hello\"").unwrap();
        // Normal-sized file must load without size error.
        let result = BenchmarkSet::from_file(f.path());
        assert!(result.is_ok());
    }

    #[cfg(unix)]
    #[test]
    fn benchmark_from_file_rejects_symlink_escaping_parent() {
        // A symlink whose target lives in a different directory must be refused.
        let outside = tempfile::tempdir().unwrap();
        let inside = tempfile::tempdir().unwrap();
        let target = outside.path().join("target.toml");
        std::fs::write(&target, "[[cases]]\nprompt = \"hello\"\n").unwrap();
        let link = inside.path().join("bench.toml");
        std::os::unix::fs::symlink(&target, &link).unwrap();

        let result = BenchmarkSet::from_file(&link);
        assert!(matches!(result, Err(EvalError::PathTraversal(_))));
    }

    #[test]
    fn benchmark_validate_passes_for_nonempty() {
        let set = BenchmarkSet {
            cases: vec![BenchmarkCase {
                prompt: "hello".into(),
                context: None,
                reference: None,
                tags: None,
            }],
        };
        assert!(set.validate().is_ok());
    }
}
zeph_experiments/benchmark.rs

zeph_experiments/
benchmark.rs