Skip to main content

zeph_bench/
results.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
//! Benchmark result types and writer.
//!
//! [`BenchRun`] is the top-level result record written to `results.json`.
//! [`ResultWriter`] handles serialization to JSON and a human-readable Markdown summary,
//! including partial flushing on SIGINT and resume support.
9
10use std::collections::HashSet;
11use std::fmt::Write as _;
12use std::path::{Path, PathBuf};
13
14use serde::{Deserialize, Serialize};
15
16use crate::error::BenchError;
17
18/// Status of a benchmark run serialized into `results.json`.
19///
20/// The `Running` variant is used in-memory during an active run and should never
21/// appear in a persisted file.
22///
23/// # Examples
24///
25/// ```
26/// use zeph_bench::RunStatus;
27///
28/// assert_ne!(RunStatus::Completed, RunStatus::Interrupted);
29/// ```
30#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
31#[serde(rename_all = "snake_case")]
32pub enum RunStatus {
33    /// All scenarios finished successfully.
34    Completed,
35    /// The run was cancelled (e.g. SIGINT) before all scenarios finished.
36    Interrupted,
37    /// The run is currently in progress; should not appear in a persisted file.
38    Running,
39}
40
41/// Per-scenario result record persisted inside [`BenchRun::results`].
42///
43/// # Examples
44///
45/// ```
46/// use zeph_bench::ScenarioResult;
47///
48/// let r = ScenarioResult {
49///     scenario_id: "gaia_t1".into(),
50///     score: 1.0,
51///     response_excerpt: "1945".into(),
52///     error: None,
53///     elapsed_ms: 820,
54/// };
55/// assert!(r.error.is_none());
56/// ```
57#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct ScenarioResult {
59    /// Unique identifier for the scenario (matches [`crate::Scenario::id`]).
60    pub scenario_id: String,
61    /// Numeric score in `[0.0, 1.0]` produced by the evaluator.
62    pub score: f64,
63    /// First 200 characters of the agent response for quick review.
64    pub response_excerpt: String,
65    /// Error message if the scenario could not be completed, otherwise `None`.
66    pub error: Option<String>,
67    /// Wall-clock time in milliseconds for this scenario.
68    pub elapsed_ms: u64,
69}
70
71/// Aggregate statistics computed from all [`ScenarioResult`]s in a [`BenchRun`].
72///
73/// Recomputed after every scenario via [`BenchRun::recompute_aggregate`] and persisted
74/// into `results.json` so partial runs still contain meaningful statistics.
75///
76/// # Examples
77///
78/// ```
79/// use zeph_bench::Aggregate;
80///
81/// let agg = Aggregate {
82///     total: 100,
83///     mean_score: 0.72,
84///     exact_match: 55,
85///     total_elapsed_ms: 240_000,
86/// };
87/// assert_eq!(agg.total, 100);
88/// ```
89#[derive(Debug, Clone, Serialize, Deserialize, Default)]
90pub struct Aggregate {
91    /// Number of scenarios included in the statistics.
92    pub total: usize,
93    /// Arithmetic mean of all per-scenario scores.
94    pub mean_score: f64,
95    /// Count of scenarios where `score >= 1.0` (exact match).
96    pub exact_match: usize,
97    /// Sum of [`ScenarioResult::elapsed_ms`] across all scenarios.
98    pub total_elapsed_ms: u64,
99}
100
101/// Top-level benchmark run record written to `results.json`.
102///
103/// The schema is a superset of the `LongMemEval` leaderboard submission format (NFR-008),
104/// making it directly usable for leaderboard submission after a `longmemeval` run.
105///
106/// Create a default instance, then populate [`BenchRun::results`] incrementally and
107/// call [`BenchRun::recompute_aggregate`] before persisting with [`ResultWriter`].
108///
109/// # Examples
110///
111/// ```
112/// use zeph_bench::{BenchRun, RunStatus, Aggregate};
113///
114/// let run = BenchRun {
115///     dataset: "gaia".into(),
116///     model: "openai/gpt-4o".into(),
117///     run_id: "a1b2c3".into(),
118///     started_at: "2026-04-09T10:00:00Z".into(),
119///     finished_at: String::new(),
120///     status: RunStatus::Running,
121///     results: vec![],
122///     aggregate: Aggregate::default(),
123/// };
124/// assert_eq!(run.dataset, "gaia");
125/// assert!(run.results.is_empty());
126/// ```
127#[derive(Debug, Clone, Serialize, Deserialize)]
128pub struct BenchRun {
129    /// Dataset name (e.g. `"longmemeval"`).
130    pub dataset: String,
131    /// Provider/model identifier (e.g. `"openai/gpt-4o"`).
132    pub model: String,
133    /// UUID v4 uniquely identifying this run.
134    pub run_id: String,
135    /// RFC 3339 timestamp when the run started.
136    pub started_at: String,
137    /// RFC 3339 timestamp when the run ended (empty string if interrupted).
138    pub finished_at: String,
139    /// Run status.
140    pub status: RunStatus,
141    /// Per-scenario results.
142    pub results: Vec<ScenarioResult>,
143    /// Aggregate statistics.
144    pub aggregate: Aggregate,
145}
146
147impl BenchRun {
148    /// Recompute [`BenchRun::aggregate`] from the current [`BenchRun::results`] list.
149    ///
150    /// Call this after appending one or more [`ScenarioResult`]s to keep the
151    /// aggregate statistics in sync before writing to disk.
152    ///
153    /// # Examples
154    ///
155    /// ```
156    /// use zeph_bench::{BenchRun, RunStatus, ScenarioResult, Aggregate};
157    ///
158    /// let mut run = BenchRun {
159    ///     dataset: "frames".into(),
160    ///     model: "openai/gpt-4o-mini".into(),
161    ///     run_id: "r1".into(),
162    ///     started_at: "2026-01-01T00:00:00Z".into(),
163    ///     finished_at: String::new(),
164    ///     status: RunStatus::Running,
165    ///     results: vec![
166    ///         ScenarioResult {
167    ///             scenario_id: "frames_0".into(),
168    ///             score: 1.0,
169    ///             response_excerpt: "Paris".into(),
170    ///             error: None,
171    ///             elapsed_ms: 500,
172    ///         },
173    ///     ],
174    ///     aggregate: Aggregate::default(),
175    /// };
176    ///
177    /// run.recompute_aggregate();
178    /// assert_eq!(run.aggregate.total, 1);
179    /// assert!((run.aggregate.mean_score - 1.0).abs() < f64::EPSILON);
180    /// assert_eq!(run.aggregate.exact_match, 1);
181    /// ```
182    pub fn recompute_aggregate(&mut self) {
183        let total = self.results.len();
184        #[allow(clippy::cast_precision_loss)]
185        let mean_score = if total == 0 {
186            0.0
187        } else {
188            self.results.iter().map(|r| r.score).sum::<f64>() / total as f64
189        };
190        let exact_match = self.results.iter().filter(|r| r.score >= 1.0).count();
191        let total_elapsed_ms = self.results.iter().map(|r| r.elapsed_ms).sum();
192        self.aggregate = Aggregate {
193            total,
194            mean_score,
195            exact_match,
196            total_elapsed_ms,
197        };
198    }
199
200    /// Return the set of scenario IDs already present in [`BenchRun::results`].
201    ///
202    /// Used by the `--resume` logic to determine which scenarios can be skipped.
203    ///
204    /// # Examples
205    ///
206    /// ```
207    /// use zeph_bench::{BenchRun, RunStatus, ScenarioResult, Aggregate};
208    ///
209    /// let run = BenchRun {
210    ///     dataset: "gaia".into(),
211    ///     model: "openai/gpt-4o".into(),
212    ///     run_id: "r2".into(),
213    ///     started_at: "2026-01-01T00:00:00Z".into(),
214    ///     finished_at: String::new(),
215    ///     status: RunStatus::Interrupted,
216    ///     results: vec![
217    ///         ScenarioResult {
218    ///             scenario_id: "t1".into(),
219    ///             score: 1.0,
220    ///             response_excerpt: "1945".into(),
221    ///             error: None,
222    ///             elapsed_ms: 300,
223    ///         },
224    ///     ],
225    ///     aggregate: Aggregate::default(),
226    /// };
227    ///
228    /// let done = run.completed_ids();
229    /// assert!(done.contains("t1"));
230    /// assert!(!done.contains("t2"));
231    /// ```
232    #[must_use]
233    pub fn completed_ids(&self) -> HashSet<String> {
234        self.results.iter().map(|r| r.scenario_id.clone()).collect()
235    }
236}
237
/// Persists a [`BenchRun`] as `results.json` plus a Markdown `summary.md` inside one
/// output directory.
///
/// Each file is first written to a temporary sibling file and then renamed over the
/// real name, so an interruption such as SIGINT in the middle of a write cannot leave
/// a half-written `results.json` behind.
///
/// # Examples
///
/// ```no_run
/// use zeph_bench::{ResultWriter, BenchRun, RunStatus, Aggregate};
///
/// let writer = ResultWriter::new("/tmp/my-bench-run").unwrap();
/// println!("results at {}", writer.results_path().display());
/// ```
pub struct ResultWriter {
    /// Directory that receives `results.json` and `summary.md`.
    output_dir: PathBuf,
}
254
255impl ResultWriter {
256    /// Create a writer targeting `output_dir`.
257    ///
258    /// The directory is created automatically (single level) if it does not exist.
259    ///
260    /// # Errors
261    ///
262    /// Returns [`BenchError::Io`] if the directory cannot be created.
263    pub fn new(output_dir: impl Into<PathBuf>) -> Result<Self, BenchError> {
264        let output_dir = output_dir.into();
265        if !output_dir.exists() {
266            std::fs::create_dir(&output_dir)?;
267        }
268        Ok(Self { output_dir })
269    }
270
271    /// Absolute path of `results.json` inside the output directory.
272    ///
273    /// # Examples
274    ///
275    /// ```
276    /// use std::path::Path;
277    /// use zeph_bench::ResultWriter;
278    ///
279    /// let dir = tempfile::tempdir().unwrap();
280    /// let writer = ResultWriter::new(dir.path()).unwrap();
281    /// assert!(writer.results_path().ends_with("results.json"));
282    /// ```
283    #[must_use]
284    pub fn results_path(&self) -> PathBuf {
285        self.output_dir.join("results.json")
286    }
287
288    /// Absolute path of `summary.md` inside the output directory.
289    ///
290    /// # Examples
291    ///
292    /// ```
293    /// use zeph_bench::ResultWriter;
294    ///
295    /// let dir = tempfile::tempdir().unwrap();
296    /// let writer = ResultWriter::new(dir.path()).unwrap();
297    /// assert!(writer.summary_path().ends_with("summary.md"));
298    /// ```
299    #[must_use]
300    pub fn summary_path(&self) -> PathBuf {
301        self.output_dir.join("summary.md")
302    }
303
304    /// Load an existing `results.json` for resume.
305    ///
306    /// Returns `None` when the file does not exist (treat as fresh run).
307    ///
308    /// # Errors
309    ///
310    /// Returns [`BenchError::Io`] on read failure, or [`BenchError::InvalidFormat`] if
311    /// the file exists but cannot be deserialized.
312    pub fn load_existing(&self) -> Result<Option<BenchRun>, BenchError> {
313        let path = self.results_path();
314        if !path.exists() {
315            return Ok(None);
316        }
317        let data = std::fs::read_to_string(&path)?;
318        let run: BenchRun =
319            serde_json::from_str(&data).map_err(|e| BenchError::InvalidFormat(e.to_string()))?;
320        Ok(Some(run))
321    }
322
323    /// Write `run` to `results.json` and `summary.md` atomically (best-effort).
324    ///
325    /// # Errors
326    ///
327    /// Returns [`BenchError`] on serialization or I/O failure.
328    pub fn write(&self, run: &BenchRun) -> Result<(), BenchError> {
329        self.write_json(run)?;
330        self.write_markdown(run)?;
331        Ok(())
332    }
333
334    fn write_json(&self, run: &BenchRun) -> Result<(), BenchError> {
335        let json = serde_json::to_string_pretty(run)
336            .map_err(|e| BenchError::InvalidFormat(e.to_string()))?;
337        write_atomic(&self.results_path(), json.as_bytes())?;
338        Ok(())
339    }
340
341    fn write_markdown(&self, run: &BenchRun) -> Result<(), BenchError> {
342        let mut md = String::new();
343        let _ = writeln!(md, "# Benchmark Results: {}\n", run.dataset);
344        let _ = writeln!(md, "- **Model**: {}", run.model);
345        let _ = writeln!(md, "- **Run ID**: {}", run.run_id);
346        let _ = writeln!(md, "- **Status**: {:?}", run.status);
347        let _ = writeln!(md, "- **Started**: {}", run.started_at);
348        if !run.finished_at.is_empty() {
349            let _ = writeln!(md, "- **Finished**: {}", run.finished_at);
350        }
351        let _ = writeln!(
352            md,
353            "- **Mean score**: {:.4} ({}/{} exact)\n",
354            run.aggregate.mean_score, run.aggregate.exact_match, run.aggregate.total
355        );
356
357        md.push_str("| scenario_id | score | response_excerpt | error |\n");
358        md.push_str("|-------------|-------|------------------|-------|\n");
359        for r in &run.results {
360            let excerpt = r.response_excerpt.replace('|', "\\|");
361            let error = r.error.as_deref().unwrap_or("").replace('|', "\\|");
362            let _ = writeln!(
363                md,
364                "| {} | {:.4} | {} | {} |",
365                r.scenario_id, r.score, excerpt, error
366            );
367        }
368
369        write_atomic(&self.summary_path(), md.as_bytes())?;
370        Ok(())
371    }
372}
373
/// Write `data` to `path` by staging it in a temporary sibling file and renaming it
/// over the target.
///
/// The staging file is named `<file name>.tmp` (the suffix is appended to the full
/// file name rather than replacing the extension), so two targets that share a stem,
/// such as `x.json` and `x.md`, never stage through the same file, and a target that
/// itself ends in `.tmp` is not staged onto itself.
///
/// The staged bytes are synced to disk before the rename so that the new name never
/// refers to a file whose contents have not been written out yet. When any step
/// fails, the staging file is removed (best-effort) and the original error returned.
///
/// # Errors
///
/// Returns the underlying [`std::io::Error`] if creating, writing, syncing or renaming
/// the staging file fails, and an `InvalidInput` error if `path` has no file name
/// component.
fn write_atomic(path: &Path, data: &[u8]) -> Result<(), std::io::Error> {
    /// Write and sync `data` to `tmp`, then rename `tmp` over `dest`.
    fn stage_and_rename(tmp: &Path, dest: &Path, data: &[u8]) -> Result<(), std::io::Error> {
        let mut file = std::fs::File::create(tmp)?;
        std::io::Write::write_all(&mut file, data)?;
        file.sync_all()?;
        // Close the handle before renaming; some platforms refuse to rename open files.
        drop(file);
        std::fs::rename(tmp, dest)
    }

    let file_name = match path.file_name() {
        Some(name) => name,
        None => {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "write_atomic: target path has no file name component",
            ));
        }
    };
    let mut tmp_name = file_name.to_os_string();
    tmp_name.push(".tmp");
    let tmp = path.with_file_name(tmp_name);

    let outcome = stage_and_rename(&tmp, path, data);
    if outcome.is_err() {
        // Best-effort cleanup; the error worth reporting is the original one.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
381
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand constructor for one scenario record.
    fn scenario(
        id: &str,
        score: f64,
        excerpt: &str,
        error: Option<&str>,
        elapsed_ms: u64,
    ) -> ScenarioResult {
        ScenarioResult {
            scenario_id: id.to_owned(),
            score,
            response_excerpt: excerpt.to_owned(),
            error: error.map(str::to_owned),
            elapsed_ms,
        }
    }

    /// Fixture: a completed run with one perfect scenario (`s1`) and one failed
    /// scenario (`s2`). The aggregate is left at its default so tests can recompute it.
    fn make_run() -> BenchRun {
        BenchRun {
            dataset: "longmemeval".to_owned(),
            model: "openai/gpt-4o".to_owned(),
            run_id: "test-run-001".to_owned(),
            started_at: "2026-01-01T00:00:00Z".to_owned(),
            finished_at: "2026-01-01T00:01:00Z".to_owned(),
            status: RunStatus::Completed,
            results: vec![
                scenario("s1", 1.0, "The answer is 42.", None, 1000),
                scenario("s2", 0.0, "", Some("timeout"), 5000),
            ],
            aggregate: Aggregate::default(),
        }
    }

    /// Fixture: a writer rooted in a fresh temporary directory. The directory guard
    /// is returned as well and must be kept alive for as long as the writer is used.
    fn temp_writer() -> (tempfile::TempDir, ResultWriter) {
        let dir = tempfile::tempdir().unwrap();
        let writer = ResultWriter::new(dir.path()).unwrap();
        (dir, writer)
    }

    #[test]
    fn recompute_aggregate_correct() {
        let mut run = make_run();
        run.recompute_aggregate();

        let agg = &run.aggregate;
        assert_eq!(agg.total, 2);
        assert!((agg.mean_score - 0.5).abs() < f64::EPSILON);
        assert_eq!(agg.exact_match, 1);
        assert_eq!(agg.total_elapsed_ms, 6000);
    }

    #[test]
    fn completed_ids_returns_all_scenario_ids() {
        let ids = make_run().completed_ids();
        assert!(ids.contains("s1"));
        assert!(ids.contains("s2"));
        assert_eq!(ids.len(), 2);
    }

    #[test]
    fn json_round_trip() {
        let mut original = make_run();
        original.recompute_aggregate();

        let encoded = serde_json::to_string_pretty(&original).unwrap();
        let decoded: BenchRun = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.dataset, original.dataset);
        assert_eq!(decoded.run_id, original.run_id);
        assert_eq!(decoded.results.len(), 2);
        assert_eq!(decoded.status, RunStatus::Completed);
        assert_eq!(decoded.aggregate.exact_match, original.aggregate.exact_match);
    }

    #[test]
    fn interrupted_status_serializes_correctly() {
        let mut run = make_run();
        run.status = RunStatus::Interrupted;

        let encoded = serde_json::to_string(&run).unwrap();
        assert!(encoded.contains("\"interrupted\""));
    }

    #[test]
    fn write_and_load_round_trip() {
        let (_dir, writer) = temp_writer();

        // Nothing has been written yet, so there is nothing to resume from.
        assert!(writer.load_existing().unwrap().is_none());

        let mut run = make_run();
        run.recompute_aggregate();
        writer.write(&run).unwrap();

        let reloaded = writer.load_existing().unwrap().unwrap();
        assert_eq!(reloaded.run_id, run.run_id);
        assert_eq!(reloaded.results.len(), 2);
        assert_eq!(reloaded.aggregate.exact_match, 1);
    }

    #[test]
    fn summary_md_contains_table_header() {
        let (_dir, writer) = temp_writer();
        let mut run = make_run();
        run.recompute_aggregate();
        writer.write(&run).unwrap();

        let summary = std::fs::read_to_string(writer.summary_path()).unwrap();
        assert!(summary.contains("| scenario_id | score |"));
        assert!(summary.contains("s1"));
        assert!(summary.contains("s2"));
    }

    #[test]
    fn write_creates_output_dir_if_absent() {
        let parent = tempfile::tempdir().unwrap();
        let target = parent.path().join("new_subdir");
        assert!(!target.exists());

        let _writer = ResultWriter::new(&target).unwrap();
        assert!(target.exists());
    }

    #[test]
    fn resume_skips_completed_scenarios() {
        let (_dir, writer) = temp_writer();

        // Persist a partial, interrupted run in which only `s1` has finished.
        let mut partial = make_run();
        partial.results.retain(|entry| entry.scenario_id == "s1");
        partial.status = RunStatus::Interrupted;
        partial.recompute_aggregate();
        writer.write(&partial).unwrap();

        let finished = writer.load_existing().unwrap().unwrap().completed_ids();
        assert!(finished.contains("s1"));
        assert!(!finished.contains("s2"));
    }
}