zeph_bench/loaders/
frames.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::{
5    io::{BufRead as _, BufReader},
6    path::Path,
7};
8
9use serde::Deserialize;
10
11use crate::{
12    error::BenchError,
13    scenario::{DatasetLoader, EvalResult, Evaluator, Scenario, exact_match},
14};
15
16#[derive(Debug, Deserialize)]
17struct FramesRecord {
18    #[serde(rename = "Prompt")]
19    prompt: String,
20    #[serde(rename = "Answer")]
21    answer: String,
22    reasoning_types: Option<serde_json::Value>,
23}
24
/// Dataset loader that turns a FRAMES benchmark JSONL file into scenarios.
///
/// **Source**: the [`google/frames-benchmark`](https://huggingface.co/datasets/google/frames-benchmark)
/// dataset on `HuggingFace`.
///
/// **Schema**: every line holds a single JSON object:
/// ```json
/// {"Prompt": "...", "Answer": "...", "reasoning_types": [...], "wiki_links": [...]}
/// ```
///
/// A [`Scenario`] is produced for each non-empty line, populated as follows:
/// - `id` — `"frames_{line_number}"`; the number is zero-based and counts every
///   line of the file from the first one, so skipped empty lines still advance it.
/// - `prompt` — taken from `"Prompt"`.
/// - `expected` — taken from `"Answer"`.
/// - `metadata` — taken from `"reasoning_types"` (array of strings), or `null`
///   when that key is absent.
///
/// Lines containing only whitespace are skipped, and keys the loader does not
/// know about (e.g. `"wiki_links"`) are ignored.
///
/// # Examples
///
/// ```no_run
/// use std::path::Path;
/// use zeph_bench::loaders::FramesLoader;
/// use zeph_bench::scenario::DatasetLoader;
///
/// let scenarios = FramesLoader.load(Path::new("/data/frames.jsonl")).unwrap();
/// println!("loaded {} scenarios", scenarios.len());
/// ```
#[derive(Debug)]
pub struct FramesLoader;
55
56impl DatasetLoader for FramesLoader {
57    fn name(&self) -> &'static str {
58        "frames"
59    }
60
61    /// # Errors
62    ///
63    /// Returns [`BenchError::Io`] when the file cannot be read and
64    /// [`BenchError::InvalidFormat`] when a JSONL line cannot be parsed.
65    fn load(&self, path: &Path) -> Result<Vec<Scenario>, BenchError> {
66        let file = std::fs::File::open(path)?;
67        let reader = BufReader::new(file);
68
69        let mut scenarios = Vec::new();
70        for (line_number, line) in reader.lines().enumerate() {
71            let line = line?;
72            let trimmed = line.trim();
73            if trimmed.is_empty() {
74                continue;
75            }
76            let record: FramesRecord = serde_json::from_str(trimmed)
77                .map_err(|e| BenchError::InvalidFormat(format!("line {line_number}: {e}")))?;
78
79            let metadata = record.reasoning_types.unwrap_or(serde_json::Value::Null);
80
81            scenarios.push(Scenario {
82                id: format!("frames_{line_number}"),
83                prompt: record.prompt,
84                expected: record.answer,
85                metadata,
86            });
87        }
88        Ok(scenarios)
89    }
90}
91
/// Scores FRAMES responses with a case-insensitive exact-match comparison.
///
/// Both the prediction and the reference are normalized before they are
/// compared:
/// 1. Keep only alphanumeric characters and whitespace.
/// 2. Convert to lowercase.
/// 3. Collapse runs of whitespace.
///
/// A response whose normalized form equals the normalized reference scores
/// `1.0`; anything else scores `0.0`.
///
/// # Examples
///
/// ```
/// use zeph_bench::{Scenario, loaders::FramesEvaluator};
/// use zeph_bench::scenario::Evaluator;
///
/// let scenario = Scenario {
///     id: "frames_0".into(),
///     prompt: "Capital of France?".into(),
///     expected: "Paris".into(),
///     metadata: serde_json::Value::Null,
/// };
///
/// // Case-insensitive and punctuation-stripped.
/// assert!(FramesEvaluator.evaluate(&scenario, "paris").passed);
/// assert!(FramesEvaluator.evaluate(&scenario, "Paris!").passed);
/// assert!(!FramesEvaluator.evaluate(&scenario, "London").passed);
/// ```
#[derive(Debug)]
pub struct FramesEvaluator;
121
122impl Evaluator for FramesEvaluator {
123    fn evaluate(&self, scenario: &Scenario, agent_response: &str) -> EvalResult {
124        let passed = exact_match(agent_response, &scenario.expected);
125        EvalResult {
126            scenario_id: scenario.id.clone(),
127            score: if passed { 1.0 } else { 0.0 },
128            passed,
129            details: format!("exact_match={}", if passed { "true" } else { "false" }),
130        }
131    }
132}
133
#[cfg(test)]
mod tests {
    use super::*;

    // Two well-formed records; the first also carries an extra `wiki_links`
    // key that the loader is expected to ignore.
    const FIXTURE: &str = r#"{"Prompt": "What is 2+2?", "Answer": "4", "reasoning_types": ["math"], "wiki_links": []}
{"Prompt": "Capital of France?", "Answer": "Paris", "reasoning_types": ["geography"]}
"#;

    /// Writes `jsonl` to `frames.jsonl` in a fresh temporary directory and
    /// loads it, panicking if any step fails.
    fn load_from_str(jsonl: &str) -> Vec<Scenario> {
        let tmp = tempfile::tempdir().unwrap();
        let file = tmp.path().join("frames.jsonl");
        std::fs::write(&file, jsonl).unwrap();
        FramesLoader.load(&file).unwrap()
    }

    /// Scenarios loaded from the shared [`FIXTURE`].
    fn fixture_scenarios() -> Vec<Scenario> {
        load_from_str(FIXTURE)
    }

    #[test]
    fn load_parses_scenario_count() {
        assert_eq!(fixture_scenarios().len(), 2);
    }

    #[test]
    fn load_builds_correct_ids() {
        let loaded = fixture_scenarios();
        let ids: Vec<&str> = loaded.iter().map(|s| s.id.as_str()).collect();
        assert_eq!(ids, ["frames_0", "frames_1"]);
    }

    #[test]
    fn load_maps_prompt_and_expected() {
        let loaded = fixture_scenarios();
        let first = &loaded[0];
        assert_eq!(first.prompt, "What is 2+2?");
        assert_eq!(first.expected, "4");
    }

    #[test]
    fn load_stores_reasoning_types_in_metadata() {
        let loaded = fixture_scenarios();
        let first = &loaded[0];
        assert!(first.metadata.is_array());
    }

    #[test]
    fn evaluator_exact_match_passes() {
        let loaded = fixture_scenarios();
        let verdict = FramesEvaluator.evaluate(&loaded[0], "4");
        assert!(verdict.passed);
        assert!((verdict.score - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn evaluator_wrong_answer_fails() {
        let loaded = fixture_scenarios();
        let verdict = FramesEvaluator.evaluate(&loaded[0], "5");
        assert!(!verdict.passed);
        assert!(verdict.score < f64::EPSILON);
    }

    #[test]
    fn evaluator_case_insensitive_match() {
        let loaded = fixture_scenarios();
        let verdict = FramesEvaluator.evaluate(&loaded[1], "paris");
        assert!(verdict.passed);
    }

    #[test]
    fn load_invalid_jsonl_returns_error() {
        let tmp = tempfile::tempdir().unwrap();
        let file = tmp.path().join("bad.jsonl");
        std::fs::write(&file, "not json\n").unwrap();
        let outcome = FramesLoader.load(&file);
        assert!(outcome.is_err());
    }
}
zeph_bench/loaders/frames.rs

zeph_bench/loaders/
frames.rs