// zeph_experiments/benchmark.rs

use std::path::Path;

use serde::Deserialize;

use super::error::EvalError;

/// Maximum accepted size in bytes (10 MiB) for a benchmark TOML file.
const MAX_BENCHMARK_SIZE: u64 = 10 * 1024 * 1024;
15#[derive(Debug, Clone, Deserialize)]
30pub struct BenchmarkSet {
31 pub cases: Vec<BenchmarkCase>,
32}
33
34#[derive(Debug, Clone, Deserialize)]
36pub struct BenchmarkCase {
37 pub prompt: String,
39 #[serde(default)]
41 pub context: Option<String>,
42 #[serde(default)]
44 pub reference: Option<String>,
45 #[serde(default)]
47 pub tags: Option<Vec<String>>,
48}
49
50impl BenchmarkSet {
51 pub fn from_file(path: &Path) -> Result<Self, EvalError> {
63 let canonical = std::fs::canonicalize(path)
65 .map_err(|e| EvalError::BenchmarkLoad(path.display().to_string(), e))?;
66
67 if let Some(parent) = path.parent()
70 && let Ok(canonical_parent) = std::fs::canonicalize(parent)
71 && !canonical.starts_with(&canonical_parent)
72 {
73 return Err(EvalError::PathTraversal(canonical.display().to_string()));
74 }
75
76 let metadata = std::fs::metadata(&canonical)
78 .map_err(|e| EvalError::BenchmarkLoad(canonical.display().to_string(), e))?;
79 if metadata.len() > MAX_BENCHMARK_SIZE {
80 return Err(EvalError::BenchmarkTooLarge {
81 path: canonical.display().to_string(),
82 size: metadata.len(),
83 limit: MAX_BENCHMARK_SIZE,
84 });
85 }
86
87 let content = std::fs::read_to_string(&canonical)
88 .map_err(|e| EvalError::BenchmarkLoad(canonical.display().to_string(), e))?;
89 toml::from_str(&content)
90 .map_err(|e| EvalError::BenchmarkParse(canonical.display().to_string(), e.to_string()))
91 }
92
93 pub fn validate(&self) -> Result<(), EvalError> {
99 if self.cases.is_empty() {
100 return Err(EvalError::EmptyBenchmarkSet);
101 }
102 Ok(())
103 }
104}
105
#[cfg(test)]
mod tests {
    #![allow(clippy::redundant_closure_for_method_calls)]

    use super::*;

    /// Deserializes a TOML string into a `BenchmarkSet`, panicking on failure.
    fn parse(toml: &str) -> BenchmarkSet {
        toml::from_str(toml).expect("valid TOML")
    }

    #[test]
    fn benchmark_from_toml_happy_path() {
        let toml = r#"
[[cases]]
prompt = "What is 2+2?"
"#;
        let set = parse(toml);
        assert_eq!(set.cases.len(), 1);
        let only = &set.cases[0];
        assert_eq!(only.prompt, "What is 2+2?");
        // Optional fields default to None when absent from the TOML.
        assert!(only.context.is_none());
        assert!(only.reference.is_none());
        assert!(only.tags.is_none());
    }

    #[test]
    fn benchmark_from_toml_with_all_fields() {
        let toml = r#"
[[cases]]
prompt = "Explain Rust ownership."
context = "You are a Rust expert."
reference = "Ownership is Rust's memory management model."
tags = ["rust", "concepts"]
"#;
        let set = parse(toml);
        assert_eq!(set.cases.len(), 1);
        let case = &set.cases[0];
        assert_eq!(case.context.as_deref(), Some("You are a Rust expert."));
        assert!(case.reference.is_some());
        assert_eq!(case.tags.as_ref().map(Vec::len), Some(2));
    }

    #[test]
    fn benchmark_empty_cases_rejected() {
        let empty = BenchmarkSet { cases: Vec::new() };
        assert!(matches!(empty.validate(), Err(EvalError::EmptyBenchmarkSet)));
    }

    #[test]
    fn benchmark_from_file_missing_file() {
        let missing = Path::new("/nonexistent/path/benchmark.toml");
        assert!(matches!(
            BenchmarkSet::from_file(missing),
            Err(EvalError::BenchmarkLoad(_, _))
        ));
    }

    #[test]
    fn benchmark_from_toml_invalid_syntax() {
        let bad = "[[cases\nprompt = 'unclosed'";
        assert!(toml::from_str::<BenchmarkSet>(bad).is_err());
    }

    #[test]
    fn benchmark_from_file_invalid_toml() {
        use std::io::Write;
        let mut file = tempfile::NamedTempFile::new().unwrap();
        writeln!(file, "not valid toml ][[]").unwrap();
        assert!(matches!(
            BenchmarkSet::from_file(file.path()),
            Err(EvalError::BenchmarkParse(_, _))
        ));
    }

    #[test]
    fn benchmark_from_file_too_large() {
        // Exercises the Display output of the oversize error directly
        // rather than writing a >10 MiB fixture to disk.
        let err = EvalError::BenchmarkTooLarge {
            path: "/tmp/bench.toml".into(),
            size: MAX_BENCHMARK_SIZE + 1,
            limit: MAX_BENCHMARK_SIZE,
        };
        assert!(err.to_string().contains("exceeds size limit"));
    }

    #[test]
    fn benchmark_from_file_size_guard_allows_normal_file() {
        use std::io::Write;
        let mut file = tempfile::NamedTempFile::new().unwrap();
        writeln!(file, "[[cases]]\nprompt = \"hello\"").unwrap();
        assert!(BenchmarkSet::from_file(file.path()).is_ok());
    }

    #[test]
    fn benchmark_validate_passes_for_nonempty() {
        let case = BenchmarkCase {
            prompt: "hello".into(),
            context: None,
            reference: None,
            tags: None,
        };
        let set = BenchmarkSet { cases: vec![case] };
        assert!(set.validate().is_ok());
    }
}
212}