zeph_bench/loaders/
frames.rs1use std::{
5 io::{BufRead as _, BufReader},
6 path::Path,
7};
8
9use serde::Deserialize;
10
11use crate::{
12 error::BenchError,
13 scenario::{DatasetLoader, EvalResult, Evaluator, Scenario, exact_match},
14};
15
16#[derive(Debug, Deserialize)]
17struct FramesRecord {
18 #[serde(rename = "Prompt")]
19 prompt: String,
20 #[serde(rename = "Answer")]
21 answer: String,
22 reasoning_types: Option<serde_json::Value>,
23}
24
/// Stateless [`DatasetLoader`] that reports its name as `"frames"` and reads
/// scenarios from a JSON Lines file (one JSON object per line).
#[derive(Debug)]
pub struct FramesLoader;
55
56impl DatasetLoader for FramesLoader {
57 fn name(&self) -> &'static str {
58 "frames"
59 }
60
61 fn load(&self, path: &Path) -> Result<Vec<Scenario>, BenchError> {
66 let file = std::fs::File::open(path)?;
67 let reader = BufReader::new(file);
68
69 let mut scenarios = Vec::new();
70 for (line_number, line) in reader.lines().enumerate() {
71 let line = line?;
72 let trimmed = line.trim();
73 if trimmed.is_empty() {
74 continue;
75 }
76 let record: FramesRecord = serde_json::from_str(trimmed)
77 .map_err(|e| BenchError::InvalidFormat(format!("line {line_number}: {e}")))?;
78
79 let metadata = record.reasoning_types.unwrap_or(serde_json::Value::Null);
80
81 scenarios.push(Scenario::single(
82 format!("frames_{line_number}"),
83 record.prompt,
84 record.answer,
85 metadata,
86 ));
87 }
88 Ok(scenarios)
89 }
90}
91
/// Stateless [`Evaluator`] that grades a response by comparing it with the
/// scenario's expected answer through `exact_match`.
#[derive(Debug)]
pub struct FramesEvaluator;
116
117impl Evaluator for FramesEvaluator {
118 fn evaluate(&self, scenario: &Scenario, agent_response: &str) -> EvalResult {
119 let passed = exact_match(agent_response, &scenario.expected);
120 EvalResult {
121 scenario_id: scenario.id.clone(),
122 score: if passed { 1.0 } else { 0.0 },
123 passed,
124 details: format!("exact_match={}", if passed { "true" } else { "false" }),
125 }
126 }
127}
128
#[cfg(test)]
mod tests {
    use super::*;

    /// Two-record JSONL sample. The first record carries an extra
    /// `wiki_links` key that the loader does not read.
    const FIXTURE: &str = r#"{"Prompt": "What is 2+2?", "Answer": "4", "reasoning_types": ["math"], "wiki_links": []}
{"Prompt": "Capital of France?", "Answer": "Paris", "reasoning_types": ["geography"]}
"#;

    /// Writes `jsonl` into a file inside a fresh temporary directory and
    /// loads it with [`FramesLoader`], panicking on any failure.
    fn load_from_str(jsonl: &str) -> Vec<Scenario> {
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("frames.jsonl");
        std::fs::write(&file_path, jsonl).unwrap();
        FramesLoader.load(&file_path).unwrap()
    }

    #[test]
    fn load_parses_scenario_count() {
        assert_eq!(load_from_str(FIXTURE).len(), 2);
    }

    #[test]
    fn load_builds_correct_ids() {
        let loaded = load_from_str(FIXTURE);
        assert_eq!(loaded[0].id, "frames_0");
        assert_eq!(loaded[1].id, "frames_1");
    }

    #[test]
    fn load_maps_prompt_and_expected() {
        let loaded = load_from_str(FIXTURE);
        let first = &loaded[0];
        assert_eq!(first.primary_prompt().unwrap(), "What is 2+2?");
        assert_eq!(first.expected, "4");
    }

    #[test]
    fn load_stores_reasoning_types_in_metadata() {
        let loaded = load_from_str(FIXTURE);
        assert!(loaded[0].metadata.is_array());
    }

    #[test]
    fn evaluator_exact_match_passes() {
        let loaded = load_from_str(FIXTURE);
        let outcome = FramesEvaluator.evaluate(&loaded[0], "4");
        assert!(outcome.passed);
        assert!((outcome.score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn evaluator_wrong_answer_fails() {
        let loaded = load_from_str(FIXTURE);
        let outcome = FramesEvaluator.evaluate(&loaded[0], "5");
        assert!(!outcome.passed);
        assert!(outcome.score < f64::EPSILON);
    }

    #[test]
    fn evaluator_case_insensitive_match() {
        let loaded = load_from_str(FIXTURE);
        let outcome = FramesEvaluator.evaluate(&loaded[1], "paris");
        assert!(outcome.passed);
    }

    #[test]
    fn load_invalid_jsonl_returns_error() {
        let tmp = tempfile::tempdir().unwrap();
        let file_path = tmp.path().join("bad.jsonl");
        std::fs::write(&file_path, "not json\n").unwrap();
        assert!(FramesLoader.load(&file_path).is_err());
    }
}