zeph_bench/
scenario.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::path::Path;
5
6use crate::error::BenchError;
7
/// Identifies which participant authored a [`Turn`] in a scenario conversation.
///
/// `Role` is a plain field-less tag, so it derives `Copy` and `Hash` in addition to
/// the comparison traits: it can be passed by value and used as a set or map key
/// without cloning.
///
/// # Examples
///
/// ```
/// use zeph_bench::scenario::Role;
///
/// let role = Role::User;
/// let same = role; // `Role` is `Copy`, so `role` stays usable afterwards.
/// assert_eq!(role, same);
/// assert!(matches!(Role::Assistant, Role::Assistant));
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Role {
    /// A message from the human user.
    User,
    /// A message from the AI assistant.
    Assistant,
}
25
26/// One turn in a multi-turn scenario conversation.
27///
28/// # Examples
29///
30/// ```
31/// use zeph_bench::scenario::{Role, Turn};
32///
33/// let turn = Turn { role: Role::User, content: "What is the capital of France?".into() };
34/// assert!(matches!(turn.role, Role::User));
35/// ```
36#[derive(Debug, Clone)]
37pub struct Turn {
38    /// Who authored this turn.
39    pub role: Role,
40    /// Text content of the turn.
41    pub content: String,
42}
43
44/// A single benchmark scenario loaded from a dataset file.
45///
46/// Each scenario represents one question/task that will be presented to the agent.
47/// The `id` field is used to correlate agent responses with ground-truth answers and
48/// to skip already-completed scenarios during a `--resume` run.
49///
50/// Construct via [`Scenario::single`] for single-turn scenarios (all built-in loaders),
51/// or push [`Turn`]s directly into [`Scenario::turns`] for multi-turn scenarios.
52///
53/// # Examples
54///
55/// ```
56/// use zeph_bench::Scenario;
57///
58/// let scenario = Scenario::single(
59///     "gaia_t42",
60///     "What is the boiling point of water in Celsius?",
61///     "100",
62///     serde_json::json!({"level": 1}),
63/// );
64/// assert_eq!(scenario.id, "gaia_t42");
65/// assert_eq!(scenario.primary_prompt().unwrap(), "What is the boiling point of water in Celsius?");
66/// ```
67#[derive(Debug, Clone)]
68pub struct Scenario {
69    /// Unique identifier within the dataset (e.g. `"frames_0"`, `"s1_2"`).
70    pub id: String,
71    /// Ordered turns in this scenario. Non-empty by contract of [`Scenario::single`].
72    ///
73    /// Direct construction is allowed for multi-turn scenarios; callers must ensure
74    /// at least one [`Role::User`] turn is present before calling [`Scenario::primary_prompt`].
75    pub turns: Vec<Turn>,
76    /// The gold-standard answer used for scoring.
77    pub expected: String,
78    /// Dataset-specific extras such as difficulty level or `reasoning_types`.
79    ///
80    /// Set to [`serde_json::Value::Null`] when the dataset has no extra metadata.
81    pub metadata: serde_json::Value,
82}
83
84impl Scenario {
85    /// Convenience constructor for single-turn scenarios.
86    ///
87    /// Wraps `prompt` in a one-element [`Vec<Turn>`] with [`Role::User`]. All built-in
88    /// dataset loaders use this constructor.
89    ///
90    /// # Examples
91    ///
92    /// ```
93    /// use zeph_bench::Scenario;
94    ///
95    /// let s = Scenario::single("id1", "What year?", "2026", serde_json::Value::Null);
96    /// assert_eq!(s.primary_prompt().unwrap(), "What year?");
97    /// ```
98    #[must_use]
99    pub fn single(
100        id: impl Into<String>,
101        prompt: impl Into<String>,
102        expected: impl Into<String>,
103        metadata: serde_json::Value,
104    ) -> Self {
105        Self {
106            id: id.into(),
107            turns: vec![Turn {
108                role: Role::User,
109                content: prompt.into(),
110            }],
111            expected: expected.into(),
112            metadata,
113        }
114    }
115
116    /// Returns the content of the first [`Role::User`] turn.
117    ///
118    /// # Errors
119    ///
120    /// Returns [`BenchError::InvalidFormat`] when `turns` is empty or contains no
121    /// [`Role::User`] entry. Loaders must construct via [`Scenario::single`] or push
122    /// at least one user turn.
123    ///
124    /// # Examples
125    ///
126    /// ```
127    /// use zeph_bench::Scenario;
128    ///
129    /// let s = Scenario::single("id1", "hello", "world", serde_json::Value::Null);
130    /// assert_eq!(s.primary_prompt().unwrap(), "hello");
131    /// ```
132    pub fn primary_prompt(&self) -> Result<&str, BenchError> {
133        self.turns
134            .iter()
135            .find(|t| matches!(t.role, Role::User))
136            .map(|t| t.content.as_str())
137            .ok_or_else(|| {
138                BenchError::InvalidFormat(format!("scenario '{}' has no user turn", self.id))
139            })
140    }
141}
142
/// Outcome of scoring one agent response against the expected answer.
///
/// Values of this type are returned by [`Evaluator::evaluate`]. `score` always lies
/// in `0.0..=1.0`:
/// - `1.0` means a perfect match (exact or token-level, depending on the evaluator).
/// - `0.0` means no match.
/// - Anything in between is partial token overlap (LOCOMO token-F1 evaluator).
///
/// # Examples
///
/// ```
/// use zeph_bench::EvalResult;
///
/// let result = EvalResult {
///     scenario_id: "s1".into(),
///     score: 0.75,
///     passed: true,
///     details: "token_f1=0.7500".into(),
/// };
/// assert!(result.passed);
/// ```
#[derive(Clone, Debug)]
pub struct EvalResult {
    /// Identifier of the scenario this result belongs to.
    pub scenario_id: String,
    /// Numeric score, within `0.0..=1.0`.
    pub score: f64,
    /// Whether `score >= threshold`; the threshold is chosen by each evaluator.
    pub passed: bool,
    /// Human-readable explanation, e.g. `"token_f1=0.7500"` or `"exact_match=true"`.
    pub details: String,
}
174
175/// Loads scenarios from a dataset file on disk.
176///
177/// Implement this trait to add support for a new dataset format. The harness
178/// calls [`DatasetLoader::load`] once per run to materialise the full scenario
179/// list before iterating.
180///
181/// Built-in implementations:
182/// - [`crate::loaders::LocomoLoader`] — JSON array of sessions
183/// - [`crate::loaders::FramesLoader`] — JSONL, one record per line
184/// - [`crate::loaders::GaiaLoader`] — JSONL with optional level filter
185pub trait DatasetLoader {
186    /// Short identifier matching the dataset name in [`crate::DatasetRegistry`].
187    fn name(&self) -> &'static str;
188
189    /// Load all matching scenarios from `path`.
190    ///
191    /// # Errors
192    ///
193    /// Returns [`BenchError::Io`] when the file cannot be opened or read, and
194    /// [`BenchError::InvalidFormat`] when the file content cannot be parsed.
195    fn load(&self, path: &Path) -> Result<Vec<Scenario>, BenchError>;
196}
197
198/// Scores one agent response against a [`Scenario`].
199///
200/// Each dataset loader ships a paired evaluator:
201/// - [`crate::loaders::LocomoEvaluator`] — token F1 with threshold 0.5
202/// - [`crate::loaders::FramesEvaluator`] — exact match (case-insensitive, punctuation stripped)
203/// - [`crate::loaders::GaiaEvaluator`] — GAIA-normalized exact match (articles stripped)
204pub trait Evaluator {
205    /// Compute and return an [`EvalResult`] for the given `agent_response`.
206    fn evaluate(&self, scenario: &Scenario, agent_response: &str) -> EvalResult;
207}
208
/// Token-level F1 between a prediction and a reference answer.
///
/// Each string is split on whitespace and reduced to its set of distinct tokens
/// (repeated tokens count once). Precision and recall are computed over the
/// intersection of the two sets, and their harmonic mean is returned.
/// The result is `0.0` when either string contains no tokens.
///
/// The metric tolerates small wording differences and is the one used by the
/// LOCOMO evaluator.
///
/// # Examples
///
/// ```
/// use zeph_bench::token_f1;
///
/// // Perfect match.
/// assert!((token_f1("hello world", "hello world") - 1.0).abs() < f64::EPSILON);
///
/// // No overlap.
/// assert!(token_f1("foo bar", "baz qux") < f64::EPSILON);
///
/// // Partial overlap gives a value between 0 and 1.
/// let f1 = token_f1("the cat sat", "the cat ran");
/// assert!(f1 > 0.0 && f1 < 1.0);
///
/// // Empty strings return 0.
/// assert!(token_f1("", "hello") < f64::EPSILON);
/// ```
#[must_use]
// The `usize -> f64` casts below only lose precision for counts above 2^52,
// which token sets built from in-memory strings will not reach.
#[allow(clippy::cast_precision_loss)]
pub fn token_f1(prediction: &str, reference: &str) -> f64 {
    use std::collections::HashSet;

    let predicted: HashSet<&str> = prediction.split_whitespace().collect();
    let expected: HashSet<&str> = reference.split_whitespace().collect();
    if predicted.is_empty() || expected.is_empty() {
        return 0.0;
    }

    // No shared token means precision and recall are both zero; bail out before
    // the harmonic mean would divide by zero.
    let shared = predicted.intersection(&expected).count();
    if shared == 0 {
        return 0.0;
    }

    let shared = shared as f64;
    let precision = shared / predicted.len() as f64;
    let recall = shared / expected.len() as f64;

    2.0 * precision * recall / (precision + recall)
}
258
259/// Exact match after lowercasing and stripping punctuation/whitespace.
260///
261/// Both strings are normalized by:
262/// 1. Keeping only alphanumeric characters and whitespace.
263/// 2. Converting to lowercase.
264/// 3. Collapsing runs of whitespace to a single space.
265///
266/// Used by the FRAMES evaluator.
267///
268/// # Examples
269///
270/// ```
271/// use zeph_bench::exact_match;
272///
273/// assert!(exact_match("Hello, World!", "hello world"));
274/// assert!(exact_match("answer: YES.", "answer yes"));
275/// assert!(!exact_match("foo", "bar"));
276/// ```
277#[must_use]
278pub fn exact_match(prediction: &str, reference: &str) -> bool {
279    normalize_basic(prediction) == normalize_basic(reference)
280}
281
282/// GAIA-normalized exact match: lowercase, strip articles, strip punctuation, collapse
283/// whitespace, then compare.
284///
285/// Normalization steps (in order):
286/// 1. Keep only alphanumeric characters and whitespace.
287/// 2. Convert to lowercase.
288/// 3. Remove the articles `a`, `an`, and `the`.
289/// 4. Collapse whitespace and compare.
290///
291/// This matches the official GAIA leaderboard scoring script.
292///
293/// # Examples
294///
295/// ```
296/// use zeph_bench::gaia_normalized_exact_match;
297///
298/// // Articles are stripped from both sides.
299/// assert!(gaia_normalized_exact_match("The Tokyo", "Tokyo"));
300/// assert!(gaia_normalized_exact_match("a cat sat on an apple", "cat sat on apple"));
301///
302/// // Different answers do not match.
303/// assert!(!gaia_normalized_exact_match("1944", "1945"));
304/// ```
305#[must_use]
306pub fn gaia_normalized_exact_match(prediction: &str, reference: &str) -> bool {
307    normalize_gaia(prediction) == normalize_gaia(reference)
308}
309
/// Canonical form used by [`exact_match`].
///
/// Keeps only alphanumeric and whitespace characters, lowercases the result, and
/// joins the whitespace-separated words with single spaces (so leading, trailing,
/// and repeated whitespace disappears).
fn normalize_basic(s: &str) -> String {
    // Pass 1: drop punctuation and symbols.
    let mut kept = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_alphanumeric() || ch.is_whitespace() {
            kept.push(ch);
        }
    }

    // Lowercase the filtered string as a whole, exactly as the chained form did.
    let lowered = kept.to_lowercase();

    // Pass 2: re-join the words with exactly one space between them.
    let mut normalized = String::with_capacity(lowered.len());
    for word in lowered.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
319
/// Canonical form used by [`gaia_normalized_exact_match`].
///
/// Steps, in order:
/// 1. Fold Unicode subscript/superscript digits to ASCII (see [`ascii_fold_digit`]),
///    so that `"H₂O"` and `"H2O"` normalize identically.
/// 2. Drop every character that is neither alphanumeric nor whitespace.
/// 3. Lowercase the result.
/// 4. Remove the article tokens `a`, `an`, and `the`, then join the remaining
///    tokens with single spaces.
///
/// If *every* token is an article (for example the multiple-choice answer `"A"`),
/// the articles are kept. Stripping them would reduce the answer to the empty
/// string, which would then compare equal to an empty or article-only response.
fn normalize_gaia(s: &str) -> String {
    const ARTICLES: &[&str] = &["a", "an", "the"];

    // Digits are folded before filtering so the folded ASCII digit is what the
    // alphanumeric check and the comparison see.
    let cleaned = s
        .chars()
        .map(ascii_fold_digit)
        .filter(|c| c.is_alphanumeric() || c.is_whitespace())
        .collect::<String>()
        .to_lowercase();

    let tokens: Vec<&str> = cleaned.split_whitespace().collect();
    let without_articles: Vec<&str> = tokens
        .iter()
        .copied()
        .filter(|tok| !ARTICLES.contains(tok))
        .collect();

    if without_articles.is_empty() {
        // Either there were no tokens at all (yields ""), or the answer consists
        // solely of articles and must not be erased.
        tokens.join(" ")
    } else {
        without_articles.join(" ")
    }
}

/// Map Unicode subscript and superscript digit characters to their ASCII equivalents.
///
/// Each arm pairs the subscript form (`U+2080..=U+2089`) with the superscript form
/// of the same digit. Superscript one, two, and three live in the Latin-1 block
/// (`U+00B9`, `U+00B2`, `U+00B3`), which is why they cannot be expressed as a range.
///
/// Returns the character unchanged if it is not a subscript/superscript digit.
fn ascii_fold_digit(c: char) -> char {
    match c {
        '\u{2080}' | '\u{2070}' => '0',
        '\u{2081}' | '\u{00B9}' => '1',
        '\u{2082}' | '\u{00B2}' => '2',
        '\u{2083}' | '\u{00B3}' => '3',
        '\u{2084}' | '\u{2074}' => '4',
        '\u{2085}' | '\u{2075}' => '5',
        '\u{2086}' | '\u{2076}' => '6',
        '\u{2087}' | '\u{2077}' => '7',
        '\u{2088}' | '\u{2078}' => '8',
        '\u{2089}' | '\u{2079}' => '9',
        other => other,
    }
}
358
/// Unit tests for the scoring helpers and the [`Scenario`] accessors.
#[cfg(test)]
mod tests {
    use super::*;

    // --- token_f1 -----------------------------------------------------------

    #[test]
    fn token_f1_identical() {
        assert!((token_f1("hello world", "hello world") - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn token_f1_no_overlap() {
        assert!(token_f1("foo bar", "baz qux") < f64::EPSILON);
    }

    #[test]
    fn token_f1_partial_overlap() {
        let f1 = token_f1("hello world foo", "hello world bar");
        assert!(f1 > 0.0 && f1 < 1.0);
    }

    #[test]
    fn token_f1_partial_overlap_value() {
        // Two of three distinct tokens are shared on each side: P = R = F1 = 2/3.
        let f1 = token_f1("the cat sat", "the cat ran");
        assert!((f1 - 2.0 / 3.0).abs() < 1e-12);
    }

    #[test]
    fn token_f1_ignores_duplicate_tokens() {
        // Tokens are compared as sets, so repetition does not lower the score.
        assert!((token_f1("a a a b", "a b") - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn token_f1_empty_prediction() {
        assert!(token_f1("", "hello") < f64::EPSILON);
    }

    #[test]
    fn token_f1_empty_reference() {
        assert!(token_f1("hello", "") < f64::EPSILON);
    }

    // --- exact_match --------------------------------------------------------

    #[test]
    fn exact_match_identical() {
        assert!(exact_match("Hello, World!", "hello world"));
    }

    #[test]
    fn exact_match_differs() {
        assert!(!exact_match("foo", "bar"));
    }

    #[test]
    fn exact_match_strips_punctuation() {
        assert!(exact_match("answer: yes.", "answer yes"));
    }

    #[test]
    fn exact_match_collapses_whitespace() {
        assert!(exact_match("  hello \t world\n", "hello world"));
    }

    #[test]
    fn exact_match_keeps_articles() {
        // Only the GAIA variant strips articles.
        assert!(!exact_match("The Tokyo", "Tokyo"));
    }

    // --- gaia_normalized_exact_match ----------------------------------------

    #[test]
    fn gaia_normalized_strips_articles() {
        assert!(gaia_normalized_exact_match(
            "The quick brown fox",
            "quick brown fox"
        ));
    }

    #[test]
    fn gaia_normalized_strips_a_an() {
        assert!(gaia_normalized_exact_match(
            "a cat sat on an apple",
            "cat sat on apple"
        ));
    }

    #[test]
    fn gaia_normalized_differs() {
        assert!(!gaia_normalized_exact_match("cat", "dog"));
    }

    #[test]
    fn gaia_normalized_subscript_digits_match_ascii() {
        // Model may respond with Unicode subscript "H₂O" — must match ASCII "H2O".
        assert!(gaia_normalized_exact_match("H\u{2082}O", "H2O"));
    }

    #[test]
    fn gaia_normalized_superscript_digits_match_ascii() {
        // Superscript two lives in the Latin-1 block, away from the other superscripts.
        assert!(gaia_normalized_exact_match("x\u{00B2}", "x2"));
    }

    // --- Scenario -----------------------------------------------------------

    #[test]
    fn single_constructs_one_user_turn() {
        let s = Scenario::single("id1", "hello", "world", serde_json::Value::Null);
        assert_eq!(s.turns.len(), 1);
        assert!(matches!(s.turns[0].role, Role::User));
        assert_eq!(s.turns[0].content, "hello");
        assert_eq!(s.expected, "world");
    }

    #[test]
    fn primary_prompt_returns_first_user_turn_content() {
        let s = Scenario::single("id1", "What year?", "2026", serde_json::Value::Null);
        assert_eq!(s.primary_prompt().unwrap(), "What year?");
    }

    #[test]
    fn primary_prompt_skips_leading_assistant_turns() {
        let s = Scenario {
            id: "id2".into(),
            turns: vec![
                Turn {
                    role: Role::Assistant,
                    content: "I am ready.".into(),
                },
                Turn {
                    role: Role::User,
                    content: "What is Rust?".into(),
                },
            ],
            expected: "A systems language".into(),
            metadata: serde_json::Value::Null,
        };
        assert_eq!(s.primary_prompt().unwrap(), "What is Rust?");
    }

    #[test]
    fn primary_prompt_errors_when_no_user_turn() {
        let s = Scenario {
            id: "id3".into(),
            turns: vec![Turn {
                role: Role::Assistant,
                content: "assistant only".into(),
            }],
            expected: String::new(),
            metadata: serde_json::Value::Null,
        };
        assert!(s.primary_prompt().is_err());

        let empty = Scenario {
            id: "id4".into(),
            turns: vec![],
            expected: String::new(),
            metadata: serde_json::Value::Null,
        };
        assert!(empty.primary_prompt().is_err());
    }

    #[test]
    fn primary_prompt_error_names_the_scenario() {
        let s = Scenario {
            id: "id5".into(),
            turns: vec![],
            expected: String::new(),
            metadata: serde_json::Value::Null,
        };
        match s.primary_prompt() {
            Err(BenchError::InvalidFormat(msg)) => assert!(msg.contains("id5")),
            other => panic!("expected InvalidFormat, got {:?}", other),
        }
    }
}
zeph_bench/scenario.rs

zeph_bench/
scenario.rs