zeph_experiments/
benchmark.rs1use std::path::Path;
7
8use serde::Deserialize;
9
10use super::error::EvalError;
11
12const MAX_BENCHMARK_SIZE: u64 = 10 * 1024 * 1024;
14
15#[derive(Debug, Clone, Deserialize)]
45pub struct BenchmarkSet {
46 pub cases: Vec<BenchmarkCase>,
48}
49
50#[derive(Debug, Clone, Deserialize)]
70pub struct BenchmarkCase {
71 pub prompt: String,
73 #[serde(default)]
75 pub context: Option<String>,
76 #[serde(default)]
78 pub reference: Option<String>,
79 #[serde(default)]
81 pub tags: Option<Vec<String>>,
82}
83
84impl BenchmarkSet {
85 pub fn from_file(path: &Path) -> Result<Self, EvalError> {
97 let canonical = std::fs::canonicalize(path)
99 .map_err(|e| EvalError::BenchmarkLoad(path.display().to_string(), e))?;
100
101 if let Some(parent) = path.parent()
104 && let Ok(canonical_parent) = std::fs::canonicalize(parent)
105 && !canonical.starts_with(&canonical_parent)
106 {
107 return Err(EvalError::PathTraversal(canonical.display().to_string()));
108 }
109
110 let metadata = std::fs::metadata(&canonical)
112 .map_err(|e| EvalError::BenchmarkLoad(canonical.display().to_string(), e))?;
113 if metadata.len() > MAX_BENCHMARK_SIZE {
114 return Err(EvalError::BenchmarkTooLarge {
115 path: canonical.display().to_string(),
116 size: metadata.len(),
117 limit: MAX_BENCHMARK_SIZE,
118 });
119 }
120
121 let content = std::fs::read_to_string(&canonical)
122 .map_err(|e| EvalError::BenchmarkLoad(canonical.display().to_string(), e))?;
123 toml::from_str(&content)
124 .map_err(|e| EvalError::BenchmarkParse(canonical.display().to_string(), e.to_string()))
125 }
126
127 pub fn validate(&self) -> Result<(), EvalError> {
133 if self.cases.is_empty() {
134 return Err(EvalError::EmptyBenchmarkSet);
135 }
136 Ok(())
137 }
138}
139
#[cfg(test)]
mod tests {
    #![allow(clippy::redundant_closure_for_method_calls)]

    use super::*;

    /// Parses `toml` into a `BenchmarkSet`, panicking on invalid input.
    fn parse(toml: &str) -> BenchmarkSet {
        toml::from_str(toml).expect("valid TOML")
    }

    #[test]
    fn benchmark_from_toml_happy_path() {
        let toml = r#"
[[cases]]
prompt = "What is 2+2?"
"#;
        let set = parse(toml);
        assert_eq!(set.cases.len(), 1);
        assert_eq!(set.cases[0].prompt, "What is 2+2?");
        assert!(set.cases[0].context.is_none());
        assert!(set.cases[0].reference.is_none());
        assert!(set.cases[0].tags.is_none());
    }

    #[test]
    fn benchmark_from_toml_with_all_fields() {
        let toml = r#"
[[cases]]
prompt = "Explain Rust ownership."
context = "You are a Rust expert."
reference = "Ownership is Rust's memory management model."
tags = ["rust", "concepts"]
"#;
        let set = parse(toml);
        assert_eq!(set.cases.len(), 1);
        let case = &set.cases[0];
        assert_eq!(case.context.as_deref(), Some("You are a Rust expert."));
        assert!(case.reference.is_some());
        assert_eq!(case.tags.as_ref().map(std::vec::Vec::len), Some(2));
    }

    #[test]
    fn benchmark_empty_cases_rejected() {
        let set = BenchmarkSet { cases: vec![] };
        assert!(matches!(set.validate(), Err(EvalError::EmptyBenchmarkSet)));
    }

    #[test]
    fn benchmark_from_file_missing_file() {
        let result = BenchmarkSet::from_file(Path::new("/nonexistent/path/benchmark.toml"));
        assert!(matches!(result, Err(EvalError::BenchmarkLoad(_, _))));
    }

    #[test]
    fn benchmark_from_toml_invalid_syntax() {
        let bad = "[[cases\nprompt = 'unclosed'";
        let result: Result<BenchmarkSet, _> = toml::from_str(bad);
        assert!(result.is_err());
    }

    #[test]
    fn benchmark_from_file_invalid_toml() {
        use std::io::Write;
        let mut f = tempfile::NamedTempFile::new().unwrap();
        writeln!(f, "not valid toml ][[]").unwrap();
        let result = BenchmarkSet::from_file(f.path());
        assert!(matches!(result, Err(EvalError::BenchmarkParse(_, _))));
    }

    #[test]
    fn benchmark_from_file_too_large() {
        // Exercises only the error's Display output; constructing a >10 MiB
        // fixture on disk would be wasteful in a unit test.
        let err = EvalError::BenchmarkTooLarge {
            path: "/tmp/bench.toml".into(),
            size: MAX_BENCHMARK_SIZE + 1,
            limit: MAX_BENCHMARK_SIZE,
        };
        assert!(err.to_string().contains("exceeds size limit"));
    }

    #[test]
    fn benchmark_from_file_size_guard_allows_normal_file() {
        use std::io::Write;
        let mut f = tempfile::NamedTempFile::new().unwrap();
        writeln!(f, "[[cases]]\nprompt = \"hello\"").unwrap();
        let result = BenchmarkSet::from_file(f.path());
        assert!(result.is_ok());
    }

    #[test]
    fn benchmark_validate_passes_for_nonempty() {
        let set = BenchmarkSet {
            cases: vec![BenchmarkCase {
                prompt: "hello".into(),
                context: None,
                reference: None,
                tags: None,
            }],
        };
        assert!(set.validate().is_ok());
    }
}