Skip to main content

zeph_bench/
results.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark result types and writer.
5//!
6//! [`BenchRun`] is the top-level result record written to `results.json`.
7//! [`ResultWriter`] handles serialization to JSON and a human-readable Markdown summary,
8//! including partial flushing on SIGINT and resume support.
9
10use std::collections::HashSet;
11use std::fmt::Write as _;
12use std::path::{Path, PathBuf};
13
14use serde::{Deserialize, Serialize};
15
16use crate::error::BenchError;
17
18/// Status of a benchmark run serialized into `results.json`.
19///
20/// The `Running` variant is used in-memory during an active run and should never
21/// appear in a persisted file.
22///
23/// # Examples
24///
25/// ```
26/// use zeph_bench::RunStatus;
27///
28/// assert_ne!(RunStatus::Completed, RunStatus::Interrupted);
29/// ```
30#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
31#[serde(rename_all = "snake_case")]
32pub enum RunStatus {
33    /// All scenarios finished successfully.
34    Completed,
35    /// The run was cancelled (e.g. SIGINT) before all scenarios finished.
36    Interrupted,
37    /// The run is currently in progress; should not appear in a persisted file.
38    Running,
39}
40
41/// Per-scenario result record persisted inside [`BenchRun::results`].
42///
43/// # Examples
44///
45/// ```
46/// use zeph_bench::ScenarioResult;
47///
48/// let r = ScenarioResult {
49///     scenario_id: "gaia_t1".into(),
50///     score: 1.0,
51///     response_excerpt: "1945".into(),
52///     error: None,
53///     elapsed_ms: 820,
54/// };
55/// assert!(r.error.is_none());
56/// ```
57#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct ScenarioResult {
59    /// Unique identifier for the scenario (matches [`crate::Scenario::id`]).
60    pub scenario_id: String,
61    /// Numeric score in `[0.0, 1.0]` produced by the evaluator.
62    pub score: f64,
63    /// First 200 characters of the agent response for quick review.
64    pub response_excerpt: String,
65    /// Error message if the scenario could not be completed, otherwise `None`.
66    pub error: Option<String>,
67    /// Wall-clock time in milliseconds for this scenario.
68    pub elapsed_ms: u64,
69}
70
71/// Aggregate statistics computed from all [`ScenarioResult`]s in a [`BenchRun`].
72///
73/// Recomputed after every scenario via [`BenchRun::recompute_aggregate`] and persisted
74/// into `results.json` so partial runs still contain meaningful statistics.
75///
76/// # Examples
77///
78/// ```
79/// use zeph_bench::Aggregate;
80///
81/// let agg = Aggregate {
82///     total: 100,
83///     mean_score: 0.72,
84///     median_score: 0.70,
85///     stddev: 0.15,
86///     exact_match: 55,
87///     error_count: 3,
88///     total_elapsed_ms: 240_000,
89/// };
90/// assert_eq!(agg.total, 100);
91/// assert_eq!(agg.error_count, 3);
92/// assert!((agg.median_score - 0.70).abs() < f64::EPSILON);
93/// ```
94#[derive(Debug, Clone, Serialize, Deserialize, Default)]
95pub struct Aggregate {
96    /// Number of scenarios included in the statistics.
97    pub total: usize,
98    /// Arithmetic mean of all per-scenario scores.
99    pub mean_score: f64,
100    /// Median per-scenario score.
101    ///
102    /// For an even number of results, the median is the average of the two middle values.
103    /// Returns `0.0` when `total == 0`.
104    pub median_score: f64,
105    /// Population standard deviation of per-scenario scores (divide by N).
106    ///
107    /// The scenario set is treated as the full population of interest, not a sample.
108    /// Returns `0.0` when `total <= 1`.
109    pub stddev: f64,
110    /// Count of scenarios where `score >= 1.0` (exact match).
111    pub exact_match: usize,
112    /// Count of scenarios where `score == 0.0` and `error` is `Some(_)`.
113    ///
114    /// A non-zero value indicates the agent failed to produce a response (e.g. timeout,
115    /// LLM API error) rather than simply giving the wrong answer.
116    pub error_count: usize,
117    /// Sum of [`ScenarioResult::elapsed_ms`] across all scenarios.
118    pub total_elapsed_ms: u64,
119}
120
121/// Top-level benchmark run record written to `results.json`.
122///
123/// The schema is a superset of the `LongMemEval` leaderboard submission format (NFR-008),
124/// making it directly usable for leaderboard submission after a `longmemeval` run.
125///
126/// Create a default instance, then populate [`BenchRun::results`] incrementally and
127/// call [`BenchRun::recompute_aggregate`] before persisting with [`ResultWriter`].
128///
129/// # Examples
130///
131/// ```
132/// use zeph_bench::{BenchRun, RunStatus, Aggregate};
133///
134/// let run = BenchRun {
135///     dataset: "gaia".into(),
136///     model: "openai/gpt-4o".into(),
137///     run_id: "a1b2c3".into(),
138///     started_at: "2026-04-09T10:00:00Z".into(),
139///     finished_at: String::new(),
140///     status: RunStatus::Running,
141///     results: vec![],
142///     aggregate: Aggregate::default(),
143/// };
144/// assert_eq!(run.dataset, "gaia");
145/// assert!(run.results.is_empty());
146/// ```
147#[derive(Debug, Clone, Serialize, Deserialize)]
148pub struct BenchRun {
149    /// Dataset name (e.g. `"longmemeval"`).
150    pub dataset: String,
151    /// Provider/model identifier (e.g. `"openai/gpt-4o"`).
152    pub model: String,
153    /// UUID v4 uniquely identifying this run.
154    pub run_id: String,
155    /// RFC 3339 timestamp when the run started.
156    pub started_at: String,
157    /// RFC 3339 timestamp when the run ended (empty string if interrupted).
158    pub finished_at: String,
159    /// Run status.
160    pub status: RunStatus,
161    /// Per-scenario results.
162    pub results: Vec<ScenarioResult>,
163    /// Aggregate statistics.
164    pub aggregate: Aggregate,
165}
166
167impl BenchRun {
168    /// Recompute [`BenchRun::aggregate`] from the current [`BenchRun::results`] list.
169    ///
170    /// Call this after appending one or more [`ScenarioResult`]s to keep the
171    /// aggregate statistics in sync before writing to disk.
172    ///
173    /// # Examples
174    ///
175    /// ```
176    /// use zeph_bench::{BenchRun, RunStatus, ScenarioResult, Aggregate};
177    ///
178    /// let mut run = BenchRun {
179    ///     dataset: "frames".into(),
180    ///     model: "openai/gpt-4o-mini".into(),
181    ///     run_id: "r1".into(),
182    ///     started_at: "2026-01-01T00:00:00Z".into(),
183    ///     finished_at: String::new(),
184    ///     status: RunStatus::Running,
185    ///     results: vec![
186    ///         ScenarioResult {
187    ///             scenario_id: "frames_0".into(),
188    ///             score: 1.0,
189    ///             response_excerpt: "Paris".into(),
190    ///             error: None,
191    ///             elapsed_ms: 500,
192    ///         },
193    ///     ],
194    ///     aggregate: Aggregate::default(),
195    /// };
196    ///
197    /// run.recompute_aggregate();
198    /// assert_eq!(run.aggregate.total, 1);
199    /// assert!((run.aggregate.mean_score - 1.0).abs() < f64::EPSILON);
200    /// assert_eq!(run.aggregate.exact_match, 1);
201    /// assert_eq!(run.aggregate.error_count, 0);
202    /// ```
203    pub fn recompute_aggregate(&mut self) {
204        let total = self.results.len();
205
206        if total == 0 {
207            self.aggregate = Aggregate::default();
208            return;
209        }
210
211        #[allow(clippy::cast_precision_loss)]
212        let mean_score = self.results.iter().map(|r| r.score).sum::<f64>() / total as f64;
213
214        // Median: sort scores, average the two middle values for even N.
215        let mut sorted_scores: Vec<f64> = self.results.iter().map(|r| r.score).collect();
216        sorted_scores.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
217        #[allow(clippy::cast_precision_loss)]
218        let median_score = if total % 2 == 1 {
219            sorted_scores[total / 2]
220        } else {
221            f64::midpoint(sorted_scores[total / 2 - 1], sorted_scores[total / 2])
222        };
223
224        // Population standard deviation (divide by N).
225        #[allow(clippy::cast_precision_loss)]
226        let variance = self
227            .results
228            .iter()
229            .map(|r| (r.score - mean_score).powi(2))
230            .sum::<f64>()
231            / total as f64;
232        let stddev = variance.sqrt();
233
234        let exact_match = self.results.iter().filter(|r| r.score >= 1.0).count();
235        let error_count = self
236            .results
237            .iter()
238            .filter(|r| r.score == 0.0 && r.error.is_some())
239            .count();
240        let total_elapsed_ms = self.results.iter().map(|r| r.elapsed_ms).sum();
241
242        self.aggregate = Aggregate {
243            total,
244            mean_score,
245            median_score,
246            stddev,
247            exact_match,
248            error_count,
249            total_elapsed_ms,
250        };
251    }
252
253    /// Return the set of scenario IDs already present in [`BenchRun::results`].
254    ///
255    /// Used by the `--resume` logic to determine which scenarios can be skipped.
256    ///
257    /// # Examples
258    ///
259    /// ```
260    /// use zeph_bench::{BenchRun, RunStatus, ScenarioResult, Aggregate};
261    ///
262    /// let run = BenchRun {
263    ///     dataset: "gaia".into(),
264    ///     model: "openai/gpt-4o".into(),
265    ///     run_id: "r2".into(),
266    ///     started_at: "2026-01-01T00:00:00Z".into(),
267    ///     finished_at: String::new(),
268    ///     status: RunStatus::Interrupted,
269    ///     results: vec![
270    ///         ScenarioResult {
271    ///             scenario_id: "t1".into(),
272    ///             score: 1.0,
273    ///             response_excerpt: "1945".into(),
274    ///             error: None,
275    ///             elapsed_ms: 300,
276    ///         },
277    ///     ],
278    ///     aggregate: Aggregate::default(),
279    /// };
280    ///
281    /// let done = run.completed_ids();
282    /// assert!(done.contains("t1"));
283    /// assert!(!done.contains("t2"));
284    /// ```
285    #[must_use]
286    pub fn completed_ids(&self) -> HashSet<String> {
287        self.results.iter().map(|r| r.scenario_id.clone()).collect()
288    }
289}
290
/// Writes `results.json` and `summary.md` to an output directory.
///
/// Files are written atomically by flushing to a `.tmp` sibling file and then
/// renaming, so a concurrent SIGINT cannot leave a half-written JSON file.
///
/// The writer only stores the output directory, so it is cheap to clone and can be
/// printed with `{:?}` for diagnostics.
///
/// # Examples
///
/// ```no_run
/// use zeph_bench::ResultWriter;
///
/// let writer = ResultWriter::new("/tmp/my-bench-run").unwrap();
/// println!("results at {}", writer.results_path().display());
/// ```
#[derive(Debug, Clone)]
pub struct ResultWriter {
    /// Directory that receives `results.json` and `summary.md`.
    output_dir: PathBuf,
}
307
308impl ResultWriter {
309    /// Create a writer targeting `output_dir`.
310    ///
311    /// The directory is created automatically (single level) if it does not exist.
312    ///
313    /// # Errors
314    ///
315    /// Returns [`BenchError::Io`] if the directory cannot be created.
316    pub fn new(output_dir: impl Into<PathBuf>) -> Result<Self, BenchError> {
317        let output_dir = output_dir.into();
318        if !output_dir.exists() {
319            std::fs::create_dir_all(&output_dir)?;
320        }
321        Ok(Self { output_dir })
322    }
323
324    /// Absolute path of `results.json` inside the output directory.
325    ///
326    /// # Examples
327    ///
328    /// ```
329    /// use std::path::Path;
330    /// use zeph_bench::ResultWriter;
331    ///
332    /// let dir = tempfile::tempdir().unwrap();
333    /// let writer = ResultWriter::new(dir.path()).unwrap();
334    /// assert!(writer.results_path().ends_with("results.json"));
335    /// ```
336    #[must_use]
337    pub fn results_path(&self) -> PathBuf {
338        self.output_dir.join("results.json")
339    }
340
341    /// Absolute path of `summary.md` inside the output directory.
342    ///
343    /// # Examples
344    ///
345    /// ```
346    /// use zeph_bench::ResultWriter;
347    ///
348    /// let dir = tempfile::tempdir().unwrap();
349    /// let writer = ResultWriter::new(dir.path()).unwrap();
350    /// assert!(writer.summary_path().ends_with("summary.md"));
351    /// ```
352    #[must_use]
353    pub fn summary_path(&self) -> PathBuf {
354        self.output_dir.join("summary.md")
355    }
356
357    /// Load an existing `results.json` for resume.
358    ///
359    /// Returns `None` when the file does not exist (treat as fresh run).
360    ///
361    /// # Errors
362    ///
363    /// Returns [`BenchError::Io`] on read failure, or [`BenchError::InvalidFormat`] if
364    /// the file exists but cannot be deserialized.
365    pub fn load_existing(&self) -> Result<Option<BenchRun>, BenchError> {
366        let path = self.results_path();
367        if !path.exists() {
368            return Ok(None);
369        }
370        let data = std::fs::read_to_string(&path)?;
371        let run: BenchRun =
372            serde_json::from_str(&data).map_err(|e| BenchError::InvalidFormat(e.to_string()))?;
373        Ok(Some(run))
374    }
375
376    /// Write `run` to `results.json` and `summary.md` atomically (best-effort).
377    ///
378    /// # Errors
379    ///
380    /// Returns [`BenchError`] on serialization or I/O failure.
381    pub fn write(&self, run: &BenchRun) -> Result<(), BenchError> {
382        self.write_json(run)?;
383        self.write_markdown(run)?;
384        Ok(())
385    }
386
387    fn write_json(&self, run: &BenchRun) -> Result<(), BenchError> {
388        let json = serde_json::to_string_pretty(run)
389            .map_err(|e| BenchError::InvalidFormat(e.to_string()))?;
390        write_atomic(&self.results_path(), json.as_bytes())?;
391        Ok(())
392    }
393
394    fn write_markdown(&self, run: &BenchRun) -> Result<(), BenchError> {
395        let mut md = String::new();
396        let _ = writeln!(md, "# Benchmark Results: {}\n", run.dataset);
397        let _ = writeln!(md, "- **Model**: {}", run.model);
398        let _ = writeln!(md, "- **Run ID**: {}", run.run_id);
399        let _ = writeln!(md, "- **Status**: {:?}", run.status);
400        let _ = writeln!(md, "- **Started**: {}", run.started_at);
401        if !run.finished_at.is_empty() {
402            let _ = writeln!(md, "- **Finished**: {}", run.finished_at);
403        }
404        let _ = writeln!(
405            md,
406            "- **Mean score**: {:.4} (median: {:.4}, stddev: {:.4})\n",
407            run.aggregate.mean_score, run.aggregate.median_score, run.aggregate.stddev
408        );
409        let _ = writeln!(
410            md,
411            "- **Exact match**: {}/{} | **Errors**: {}\n",
412            run.aggregate.exact_match, run.aggregate.total, run.aggregate.error_count
413        );
414
415        md.push_str("| scenario_id | score | response_excerpt | error |\n");
416        md.push_str("|-------------|-------|------------------|-------|\n");
417        for r in &run.results {
418            let excerpt = r.response_excerpt.replace('|', "\\|");
419            let error = r.error.as_deref().unwrap_or("").replace('|', "\\|");
420            let _ = writeln!(
421                md,
422                "| {} | {:.4} | {} | {} |",
423                r.scenario_id, r.score, excerpt, error
424            );
425        }
426
427        write_atomic(&self.summary_path(), md.as_bytes())?;
428        Ok(())
429    }
430}
431
/// Write `data` to `path` using a temp file + rename for atomicity.
///
/// The data is first written to a sibling file named `<file name>.tmp`, synced to
/// disk, and then renamed over `path`, so readers see either the old or the new
/// content but never a partially written file.
///
/// # Errors
///
/// Returns the underlying I/O error if the staging file cannot be created, written,
/// synced, or renamed. On failure the staging file is removed on a best-effort basis
/// and `path` is left untouched.
fn write_atomic(path: &Path, data: &[u8]) -> Result<(), std::io::Error> {
    // Append ".tmp" to the complete file name rather than swapping the extension:
    // `results.json` stages through `results.json.tmp`, so an unrelated sibling such
    // as `results.tmp` (or another target sharing the same stem) is never clobbered.
    let mut staged = path.as_os_str().to_owned();
    staged.push(".tmp");
    let staged = PathBuf::from(staged);

    let publish = || -> Result<(), std::io::Error> {
        let mut file = std::fs::File::create(&staged)?;
        std::io::Write::write_all(&mut file, data)?;
        // Flush file contents to disk before the rename makes them visible.
        file.sync_all()?;
        // Close the handle before renaming.
        drop(file);
        std::fs::rename(&staged, path)
    };

    let outcome = publish();
    if outcome.is_err() {
        // Best-effort cleanup; the original error is the one worth reporting.
        let _ = std::fs::remove_file(&staged);
    }
    outcome
}
439
#[cfg(test)]
mod tests {
    use super::*;

    /// Build one [`ScenarioResult`] fixture entry.
    fn scenario(
        id: &str,
        score: f64,
        excerpt: &str,
        error: Option<&str>,
        elapsed_ms: u64,
    ) -> ScenarioResult {
        ScenarioResult {
            scenario_id: id.to_owned(),
            score,
            response_excerpt: excerpt.to_owned(),
            error: error.map(str::to_owned),
            elapsed_ms,
        }
    }

    /// Completed run with one perfect scenario (`s1`) and one timed-out scenario (`s2`).
    fn make_run() -> BenchRun {
        let results = vec![
            scenario("s1", 1.0, "The answer is 42.", None, 1000),
            scenario("s2", 0.0, "", Some("timeout"), 5000),
        ];
        BenchRun {
            dataset: String::from("longmemeval"),
            model: String::from("openai/gpt-4o"),
            run_id: String::from("test-run-001"),
            started_at: String::from("2026-01-01T00:00:00Z"),
            finished_at: String::from("2026-01-01T00:01:00Z"),
            status: RunStatus::Completed,
            results,
            aggregate: Aggregate::default(),
        }
    }

    /// Same fixture as [`make_run`], with the aggregate already recomputed.
    fn aggregated_run() -> BenchRun {
        let mut run = make_run();
        run.recompute_aggregate();
        run
    }

    /// `true` when `actual` is within `f64::EPSILON` of `expected`.
    fn close(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    #[test]
    fn recompute_aggregate_correct() {
        let run = aggregated_run();
        let agg = &run.aggregate;

        assert_eq!(agg.total, 2);
        assert!(close(agg.mean_score, 0.5));
        // Sorted scores are [0.0, 1.0]; the median averages the two middle values.
        assert!(close(agg.median_score, 0.5));
        // Population stddev: variance = ((1.0-0.5)^2 + (0.0-0.5)^2) / 2 = 0.25.
        assert!(close(agg.stddev, 0.5));
        assert_eq!(agg.exact_match, 1);
        // Only s2 combines score 0.0 with an error.
        assert_eq!(agg.error_count, 1);
        assert_eq!(agg.total_elapsed_ms, 6000);
    }

    #[test]
    fn recompute_aggregate_single_result() {
        let mut run = make_run();
        run.results.retain(|entry| entry.scenario_id == "s1");
        run.recompute_aggregate();
        let agg = &run.aggregate;

        assert_eq!(agg.total, 1);
        assert!(close(agg.mean_score, 1.0));
        assert!(close(agg.median_score, 1.0));
        assert!(close(agg.stddev, 0.0));
        assert_eq!(agg.error_count, 0);
    }

    #[test]
    fn recompute_aggregate_empty_results() {
        let mut run = make_run();
        run.results.clear();
        run.recompute_aggregate();
        let agg = &run.aggregate;

        assert_eq!(agg.total, 0);
        assert!(close(agg.mean_score, 0.0));
        assert!(close(agg.median_score, 0.0));
        assert!(close(agg.stddev, 0.0));
        assert_eq!(agg.error_count, 0);
    }

    #[test]
    fn recompute_aggregate_error_count_only_zero_score_with_error() {
        let mut run = make_run();
        // A zero score without an error is a wrong answer, not a failure.
        run.results
            .push(scenario("s3", 0.0, "wrong answer", None, 100));
        run.recompute_aggregate();

        // s2 carries an error, s3 does not.
        assert_eq!(run.aggregate.error_count, 1);
    }

    #[test]
    fn completed_ids_returns_all_scenario_ids() {
        let ids = make_run().completed_ids();

        assert_eq!(ids.len(), 2);
        for expected in ["s1", "s2"] {
            assert!(ids.contains(expected));
        }
    }

    #[test]
    fn json_round_trip() {
        let run = aggregated_run();

        let encoded = serde_json::to_string_pretty(&run).unwrap();
        let decoded: BenchRun = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.dataset, run.dataset);
        assert_eq!(decoded.run_id, run.run_id);
        assert_eq!(decoded.results.len(), 2);
        assert_eq!(decoded.status, RunStatus::Completed);
        assert_eq!(decoded.aggregate.exact_match, run.aggregate.exact_match);
    }

    #[test]
    fn interrupted_status_serializes_correctly() {
        let run = BenchRun {
            status: RunStatus::Interrupted,
            ..make_run()
        };

        let encoded = serde_json::to_string(&run).unwrap();
        assert!(encoded.contains("\"interrupted\""));
    }

    #[test]
    fn write_and_load_round_trip() {
        let dir = tempfile::tempdir().unwrap();
        let writer = ResultWriter::new(dir.path()).unwrap();

        // Nothing has been written yet, so there is nothing to resume from.
        assert!(writer.load_existing().unwrap().is_none());

        let run = aggregated_run();
        writer.write(&run).unwrap();

        let loaded = writer.load_existing().unwrap().unwrap();
        assert_eq!(loaded.run_id, run.run_id);
        assert_eq!(loaded.results.len(), 2);
        assert_eq!(loaded.aggregate.exact_match, 1);
    }

    #[test]
    fn summary_md_contains_table_header() {
        let dir = tempfile::tempdir().unwrap();
        let writer = ResultWriter::new(dir.path()).unwrap();
        writer.write(&aggregated_run()).unwrap();

        let summary = std::fs::read_to_string(writer.summary_path()).unwrap();
        for needle in ["| scenario_id | score |", "s1", "s2"] {
            assert!(summary.contains(needle));
        }
    }

    #[test]
    fn write_creates_output_dir_if_absent() {
        let parent = tempfile::tempdir().unwrap();
        let target = parent.path().join("new_subdir");
        assert!(!target.exists());

        ResultWriter::new(&target).unwrap();

        assert!(target.exists());
    }

    #[test]
    fn resume_skips_completed_scenarios() {
        let dir = tempfile::tempdir().unwrap();
        let writer = ResultWriter::new(dir.path()).unwrap();

        // Persist an interrupted run in which only s1 has finished.
        let mut partial = make_run();
        partial.results.retain(|entry| entry.scenario_id == "s1");
        partial.status = RunStatus::Interrupted;
        partial.recompute_aggregate();
        writer.write(&partial).unwrap();

        let done = writer.load_existing().unwrap().unwrap().completed_ids();
        assert!(done.contains("s1"));
        assert!(!done.contains("s2"));
    }
}