Skip to main content

zeph_bench/
results.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark result types and writer.
5//!
6//! [`BenchRun`] is the top-level result record written to `results.json`.
7//! [`ResultWriter`] handles serialization to JSON and a human-readable Markdown summary,
8//! including partial flushing on SIGINT and resume support.
9
10use std::collections::HashSet;
11use std::fmt::Write as _;
12use std::path::{Path, PathBuf};
13
14use serde::{Deserialize, Serialize};
15
16use crate::error::BenchError;
17
18/// Status of a benchmark run serialized into `results.json`.
19///
20/// The `Running` variant is used in-memory during an active run and should never
21/// appear in a persisted file.
22///
23/// # Examples
24///
25/// ```
26/// use zeph_bench::RunStatus;
27///
28/// assert_ne!(RunStatus::Completed, RunStatus::Interrupted);
29/// ```
30#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
31#[serde(rename_all = "snake_case")]
32#[non_exhaustive]
33pub enum RunStatus {
34    /// All scenarios finished successfully.
35    Completed,
36    /// The run was cancelled (e.g. SIGINT) before all scenarios finished.
37    Interrupted,
38    /// The run is currently in progress; should not appear in a persisted file.
39    Running,
40}
41
42/// Per-scenario result record persisted inside [`BenchRun::results`].
43///
44/// # Examples
45///
46/// ```
47/// use zeph_bench::ScenarioResult;
48///
49/// let r = ScenarioResult {
50///     scenario_id: "gaia_t1".into(),
51///     score: 1.0,
52///     response_excerpt: "1945".into(),
53///     error: None,
54///     elapsed_ms: 820,
55/// };
56/// assert!(r.error.is_none());
57/// ```
58#[derive(Debug, Clone, Serialize, Deserialize)]
59pub struct ScenarioResult {
60    /// Unique identifier for the scenario (matches [`crate::Scenario::id`]).
61    pub scenario_id: String,
62    /// Numeric score in `[0.0, 1.0]` produced by the evaluator.
63    pub score: f64,
64    /// First 200 characters of the agent response for quick review.
65    pub response_excerpt: String,
66    /// Error message if the scenario could not be completed, otherwise `None`.
67    pub error: Option<String>,
68    /// Wall-clock time in milliseconds for this scenario.
69    pub elapsed_ms: u64,
70}
71
72/// Aggregate statistics computed from all [`ScenarioResult`]s in a [`BenchRun`].
73///
74/// Recomputed after every scenario via [`BenchRun::recompute_aggregate`] and persisted
75/// into `results.json` so partial runs still contain meaningful statistics.
76///
77/// # Examples
78///
79/// ```
80/// use zeph_bench::Aggregate;
81///
82/// let agg = Aggregate {
83///     total: 100,
84///     mean_score: 0.72,
85///     median_score: 0.70,
86///     stddev: 0.15,
87///     exact_match: 55,
88///     error_count: 3,
89///     total_elapsed_ms: 240_000,
90/// };
91/// assert_eq!(agg.total, 100);
92/// assert_eq!(agg.error_count, 3);
93/// assert!((agg.median_score - 0.70).abs() < f64::EPSILON);
94/// ```
95#[derive(Debug, Clone, Serialize, Deserialize, Default)]
96pub struct Aggregate {
97    /// Number of scenarios included in the statistics.
98    pub total: usize,
99    /// Arithmetic mean of all per-scenario scores.
100    pub mean_score: f64,
101    /// Median per-scenario score.
102    ///
103    /// For an even number of results, the median is the average of the two middle values.
104    /// Returns `0.0` when `total == 0`.
105    pub median_score: f64,
106    /// Population standard deviation of per-scenario scores (divide by N).
107    ///
108    /// The scenario set is treated as the full population of interest, not a sample.
109    /// Returns `0.0` when `total <= 1`.
110    pub stddev: f64,
111    /// Count of scenarios where `score >= 1.0` (exact match).
112    pub exact_match: usize,
113    /// Count of scenarios where `score == 0.0` and `error` is `Some(_)`.
114    ///
115    /// A non-zero value indicates the agent failed to produce a response (e.g. timeout,
116    /// LLM API error) rather than simply giving the wrong answer.
117    pub error_count: usize,
118    /// Sum of [`ScenarioResult::elapsed_ms`] across all scenarios.
119    pub total_elapsed_ms: u64,
120}
121
122/// Top-level benchmark run record written to `results.json`.
123///
124/// The schema is a superset of the `LongMemEval` leaderboard submission format (NFR-008),
125/// making it directly usable for leaderboard submission after a `longmemeval` run.
126///
127/// Create a default instance, then populate [`BenchRun::results`] incrementally and
128/// call [`BenchRun::recompute_aggregate`] before persisting with [`ResultWriter`].
129///
130/// # Examples
131///
132/// ```
133/// use zeph_bench::{BenchRun, RunStatus, Aggregate};
134///
135/// let run = BenchRun {
136///     dataset: "gaia".into(),
137///     model: "openai/gpt-4o".into(),
138///     run_id: "a1b2c3".into(),
139///     started_at: "2026-04-09T10:00:00Z".into(),
140///     finished_at: String::new(),
141///     status: RunStatus::Running,
142///     results: vec![],
143///     aggregate: Aggregate::default(),
144/// };
145/// assert_eq!(run.dataset, "gaia");
146/// assert!(run.results.is_empty());
147/// ```
148#[derive(Debug, Clone, Serialize, Deserialize)]
149pub struct BenchRun {
150    /// Dataset name (e.g. `"longmemeval"`).
151    pub dataset: String,
152    /// Provider/model identifier (e.g. `"openai/gpt-4o"`).
153    pub model: String,
154    /// UUID v4 uniquely identifying this run.
155    pub run_id: String,
156    /// RFC 3339 timestamp when the run started.
157    pub started_at: String,
158    /// RFC 3339 timestamp when the run ended (empty string if interrupted).
159    pub finished_at: String,
160    /// Run status.
161    pub status: RunStatus,
162    /// Per-scenario results.
163    pub results: Vec<ScenarioResult>,
164    /// Aggregate statistics.
165    pub aggregate: Aggregate,
166}
167
168impl BenchRun {
169    /// Recompute [`BenchRun::aggregate`] from the current [`BenchRun::results`] list.
170    ///
171    /// Call this after appending one or more [`ScenarioResult`]s to keep the
172    /// aggregate statistics in sync before writing to disk.
173    ///
174    /// # Examples
175    ///
176    /// ```
177    /// use zeph_bench::{BenchRun, RunStatus, ScenarioResult, Aggregate};
178    ///
179    /// let mut run = BenchRun {
180    ///     dataset: "frames".into(),
181    ///     model: "openai/gpt-4o-mini".into(),
182    ///     run_id: "r1".into(),
183    ///     started_at: "2026-01-01T00:00:00Z".into(),
184    ///     finished_at: String::new(),
185    ///     status: RunStatus::Running,
186    ///     results: vec![
187    ///         ScenarioResult {
188    ///             scenario_id: "frames_0".into(),
189    ///             score: 1.0,
190    ///             response_excerpt: "Paris".into(),
191    ///             error: None,
192    ///             elapsed_ms: 500,
193    ///         },
194    ///     ],
195    ///     aggregate: Aggregate::default(),
196    /// };
197    ///
198    /// run.recompute_aggregate();
199    /// assert_eq!(run.aggregate.total, 1);
200    /// assert!((run.aggregate.mean_score - 1.0).abs() < f64::EPSILON);
201    /// assert_eq!(run.aggregate.exact_match, 1);
202    /// assert_eq!(run.aggregate.error_count, 0);
203    /// ```
204    pub fn recompute_aggregate(&mut self) {
205        let total = self.results.len();
206
207        if total == 0 {
208            self.aggregate = Aggregate::default();
209            return;
210        }
211
212        #[allow(clippy::cast_precision_loss)]
213        let mean_score = self.results.iter().map(|r| r.score).sum::<f64>() / total as f64;
214
215        // Median: sort scores, average the two middle values for even N.
216        let mut sorted_scores: Vec<f64> = self.results.iter().map(|r| r.score).collect();
217        sorted_scores.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
218        #[allow(clippy::cast_precision_loss)]
219        let median_score = if total % 2 == 1 {
220            sorted_scores[total / 2]
221        } else {
222            f64::midpoint(sorted_scores[total / 2 - 1], sorted_scores[total / 2])
223        };
224
225        // Population standard deviation (divide by N).
226        #[allow(clippy::cast_precision_loss)]
227        let variance = self
228            .results
229            .iter()
230            .map(|r| (r.score - mean_score).powi(2))
231            .sum::<f64>()
232            / total as f64;
233        let stddev = variance.sqrt();
234
235        let exact_match = self.results.iter().filter(|r| r.score >= 1.0).count();
236        let error_count = self
237            .results
238            .iter()
239            .filter(|r| r.score == 0.0 && r.error.is_some())
240            .count();
241        let total_elapsed_ms = self.results.iter().map(|r| r.elapsed_ms).sum();
242
243        self.aggregate = Aggregate {
244            total,
245            mean_score,
246            median_score,
247            stddev,
248            exact_match,
249            error_count,
250            total_elapsed_ms,
251        };
252    }
253
254    /// Return the set of scenario IDs already present in [`BenchRun::results`].
255    ///
256    /// Used by the `--resume` logic to determine which scenarios can be skipped.
257    ///
258    /// # Examples
259    ///
260    /// ```
261    /// use zeph_bench::{BenchRun, RunStatus, ScenarioResult, Aggregate};
262    ///
263    /// let run = BenchRun {
264    ///     dataset: "gaia".into(),
265    ///     model: "openai/gpt-4o".into(),
266    ///     run_id: "r2".into(),
267    ///     started_at: "2026-01-01T00:00:00Z".into(),
268    ///     finished_at: String::new(),
269    ///     status: RunStatus::Interrupted,
270    ///     results: vec![
271    ///         ScenarioResult {
272    ///             scenario_id: "t1".into(),
273    ///             score: 1.0,
274    ///             response_excerpt: "1945".into(),
275    ///             error: None,
276    ///             elapsed_ms: 300,
277    ///         },
278    ///     ],
279    ///     aggregate: Aggregate::default(),
280    /// };
281    ///
282    /// let done = run.completed_ids();
283    /// assert!(done.contains("t1"));
284    /// assert!(!done.contains("t2"));
285    /// ```
286    #[must_use]
287    pub fn completed_ids(&self) -> HashSet<String> {
288        self.results.iter().map(|r| r.scenario_id.clone()).collect()
289    }
290}
291
/// Writes `results.json` and `summary.md` to an output directory.
///
/// Each file is first written to a temporary sibling file and then renamed over
/// the final name, so an interruption such as SIGINT cannot leave a half-written
/// `results.json` behind.
///
/// # Examples
///
/// ```no_run
/// use zeph_bench::ResultWriter;
///
/// let writer = ResultWriter::new("/tmp/my-bench-run").unwrap();
/// println!("results at {}", writer.results_path().display());
/// ```
#[derive(Debug, Clone)]
pub struct ResultWriter {
    /// Directory that receives `results.json` and `summary.md`.
    output_dir: PathBuf,
}
308
309impl ResultWriter {
310    /// Create a writer targeting `output_dir`.
311    ///
312    /// The directory is created automatically (single level) if it does not exist.
313    ///
314    /// # Errors
315    ///
316    /// Returns [`BenchError::Io`] if the directory cannot be created.
317    pub fn new(output_dir: impl Into<PathBuf>) -> Result<Self, BenchError> {
318        let output_dir = output_dir.into();
319        if !output_dir.exists() {
320            std::fs::create_dir_all(&output_dir)?;
321        }
322        Ok(Self { output_dir })
323    }
324
325    /// Absolute path of `results.json` inside the output directory.
326    ///
327    /// # Examples
328    ///
329    /// ```
330    /// use std::path::Path;
331    /// use zeph_bench::ResultWriter;
332    ///
333    /// let dir = tempfile::tempdir().unwrap();
334    /// let writer = ResultWriter::new(dir.path()).unwrap();
335    /// assert!(writer.results_path().ends_with("results.json"));
336    /// ```
337    #[must_use]
338    pub fn results_path(&self) -> PathBuf {
339        self.output_dir.join("results.json")
340    }
341
342    /// Absolute path of `summary.md` inside the output directory.
343    ///
344    /// # Examples
345    ///
346    /// ```
347    /// use zeph_bench::ResultWriter;
348    ///
349    /// let dir = tempfile::tempdir().unwrap();
350    /// let writer = ResultWriter::new(dir.path()).unwrap();
351    /// assert!(writer.summary_path().ends_with("summary.md"));
352    /// ```
353    #[must_use]
354    pub fn summary_path(&self) -> PathBuf {
355        self.output_dir.join("summary.md")
356    }
357
358    /// Load an existing `results.json` for resume.
359    ///
360    /// Returns `None` when the file does not exist (treat as fresh run).
361    ///
362    /// # Errors
363    ///
364    /// Returns [`BenchError::Io`] on read failure, or [`BenchError::InvalidFormat`] if
365    /// the file exists but cannot be deserialized.
366    pub fn load_existing(&self) -> Result<Option<BenchRun>, BenchError> {
367        let path = self.results_path();
368        if !path.exists() {
369            return Ok(None);
370        }
371        let data = std::fs::read_to_string(&path)?;
372        let run: BenchRun =
373            serde_json::from_str(&data).map_err(|e| BenchError::InvalidFormat(e.to_string()))?;
374        Ok(Some(run))
375    }
376
377    /// Write `run` to `results.json` and `summary.md` atomically (best-effort).
378    ///
379    /// # Errors
380    ///
381    /// Returns [`BenchError`] on serialization or I/O failure.
382    pub fn write(&self, run: &BenchRun) -> Result<(), BenchError> {
383        self.write_json(run)?;
384        self.write_markdown(run)?;
385        Ok(())
386    }
387
388    fn write_json(&self, run: &BenchRun) -> Result<(), BenchError> {
389        let json = serde_json::to_string_pretty(run)
390            .map_err(|e| BenchError::InvalidFormat(e.to_string()))?;
391        write_atomic(&self.results_path(), json.as_bytes())?;
392        Ok(())
393    }
394
395    fn write_markdown(&self, run: &BenchRun) -> Result<(), BenchError> {
396        let mut md = String::new();
397        let _ = writeln!(md, "# Benchmark Results: {}\n", run.dataset);
398        let _ = writeln!(md, "- **Model**: {}", run.model);
399        let _ = writeln!(md, "- **Run ID**: {}", run.run_id);
400        let _ = writeln!(md, "- **Status**: {:?}", run.status);
401        let _ = writeln!(md, "- **Started**: {}", run.started_at);
402        if !run.finished_at.is_empty() {
403            let _ = writeln!(md, "- **Finished**: {}", run.finished_at);
404        }
405        let _ = writeln!(
406            md,
407            "- **Mean score**: {:.4} (median: {:.4}, stddev: {:.4})\n",
408            run.aggregate.mean_score, run.aggregate.median_score, run.aggregate.stddev
409        );
410        let _ = writeln!(
411            md,
412            "- **Exact match**: {}/{} | **Errors**: {}\n",
413            run.aggregate.exact_match, run.aggregate.total, run.aggregate.error_count
414        );
415
416        md.push_str("| scenario_id | score | response_excerpt | error |\n");
417        md.push_str("|-------------|-------|------------------|-------|\n");
418        for r in &run.results {
419            let excerpt = r.response_excerpt.replace('|', "\\|");
420            let error = r.error.as_deref().unwrap_or("").replace('|', "\\|");
421            let _ = writeln!(
422                md,
423                "| {} | {:.4} | {} | {} |",
424                r.scenario_id, r.score, excerpt, error
425            );
426        }
427
428        write_atomic(&self.summary_path(), md.as_bytes())?;
429        Ok(())
430    }
431}
432
/// Write `data` to `path` using a temp file + rename for atomicity.
///
/// The data is first written to a sibling file named `<path>.tmp` and then renamed
/// over `path`, so readers see either the previous contents or the new contents,
/// never a partial write. The suffix is appended to the full file name (rather than
/// replacing the extension) so that targets differing only by extension do not
/// share a temp file and an unrelated `<stem>.tmp` sibling is left alone.
///
/// # Errors
///
/// Returns the underlying I/O error if the temp file cannot be written or the rename
/// fails. In that case the temp file is removed on a best-effort basis and `path` is
/// left untouched.
fn write_atomic(path: &Path, data: &[u8]) -> Result<(), std::io::Error> {
    let mut tmp_name = path.as_os_str().to_owned();
    tmp_name.push(".tmp");
    let tmp = PathBuf::from(tmp_name);

    let outcome = std::fs::write(&tmp, data).and_then(|()| std::fs::rename(&tmp, path));
    if outcome.is_err() {
        // Do not leave a stale or partial temp file behind; the original error is
        // what the caller needs, so a failure to clean up is deliberately ignored.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
440
#[cfg(test)]
mod tests {
    use super::*;

    /// True when `actual` is within `f64::EPSILON` of `expected`.
    fn close(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    /// Shorthand constructor for a scenario result.
    fn scenario(
        id: &str,
        score: f64,
        excerpt: &str,
        error: Option<&str>,
        elapsed_ms: u64,
    ) -> ScenarioResult {
        ScenarioResult {
            scenario_id: id.into(),
            score,
            response_excerpt: excerpt.into(),
            error: error.map(Into::into),
            elapsed_ms,
        }
    }

    /// A completed run with one exact match (`s1`) and one timed-out scenario (`s2`);
    /// the aggregate is left at its default.
    fn make_run() -> BenchRun {
        BenchRun {
            dataset: "longmemeval".into(),
            model: "openai/gpt-4o".into(),
            run_id: "test-run-001".into(),
            started_at: "2026-01-01T00:00:00Z".into(),
            finished_at: "2026-01-01T00:01:00Z".into(),
            status: RunStatus::Completed,
            results: vec![
                scenario("s1", 1.0, "The answer is 42.", None, 1000),
                scenario("s2", 0.0, "", Some("timeout"), 5000),
            ],
            aggregate: Aggregate::default(),
        }
    }

    /// `make_run()` with the aggregate already recomputed.
    fn aggregated_run() -> BenchRun {
        let mut run = make_run();
        run.recompute_aggregate();
        run
    }

    /// A writer rooted in a fresh temporary directory; the guard keeps the directory alive.
    fn temp_writer() -> (tempfile::TempDir, ResultWriter) {
        let dir = tempfile::tempdir().unwrap();
        let writer = ResultWriter::new(dir.path()).unwrap();
        (dir, writer)
    }

    #[test]
    fn recompute_aggregate_correct() {
        let agg = aggregated_run().aggregate;
        assert_eq!(agg.total, 2);
        assert!(close(agg.mean_score, 0.5));
        // Sorted scores are [0.0, 1.0]; the median averages the two middle values.
        assert!(close(agg.median_score, 0.5));
        // Population variance = ((1.0 - 0.5)^2 + (0.0 - 0.5)^2) / 2 = 0.25, so stddev = 0.5.
        assert!(close(agg.stddev, 0.5));
        assert_eq!(agg.exact_match, 1);
        // Only s2 has both a zero score and an error.
        assert_eq!(agg.error_count, 1);
        assert_eq!(agg.total_elapsed_ms, 6000);
    }

    #[test]
    fn recompute_aggregate_single_result() {
        let mut run = make_run();
        run.results.retain(|r| r.scenario_id == "s1");
        run.recompute_aggregate();

        let agg = &run.aggregate;
        assert_eq!(agg.total, 1);
        assert!(close(agg.mean_score, 1.0));
        assert!(close(agg.median_score, 1.0));
        assert!(close(agg.stddev, 0.0));
        assert_eq!(agg.error_count, 0);
    }

    #[test]
    fn recompute_aggregate_empty_results() {
        let mut run = make_run();
        run.results.clear();
        run.recompute_aggregate();

        let agg = &run.aggregate;
        assert_eq!(agg.total, 0);
        assert!(close(agg.mean_score, 0.0));
        assert!(close(agg.median_score, 0.0));
        assert!(close(agg.stddev, 0.0));
        assert_eq!(agg.error_count, 0);
    }

    #[test]
    fn recompute_aggregate_error_count_only_zero_score_with_error() {
        let mut run = make_run();
        // A zero score without an error is a wrong answer, not a failure.
        run.results
            .push(scenario("s3", 0.0, "wrong answer", None, 100));
        run.recompute_aggregate();
        // s2 carries an error, s3 does not.
        assert_eq!(run.aggregate.error_count, 1);
    }

    #[test]
    fn completed_ids_returns_all_scenario_ids() {
        let ids = make_run().completed_ids();
        assert_eq!(ids.len(), 2);
        for expected in ["s1", "s2"] {
            assert!(ids.contains(expected));
        }
    }

    #[test]
    fn json_round_trip() {
        let run = aggregated_run();
        let json = serde_json::to_string_pretty(&run).unwrap();
        let decoded: BenchRun = serde_json::from_str(&json).unwrap();

        assert_eq!(decoded.dataset, run.dataset);
        assert_eq!(decoded.run_id, run.run_id);
        assert_eq!(decoded.results.len(), 2);
        assert_eq!(decoded.status, RunStatus::Completed);
        assert_eq!(decoded.aggregate.exact_match, run.aggregate.exact_match);
    }

    #[test]
    fn interrupted_status_serializes_correctly() {
        let run = BenchRun {
            status: RunStatus::Interrupted,
            ..make_run()
        };
        let json = serde_json::to_string(&run).unwrap();
        assert!(json.contains("\"interrupted\""));
    }

    #[test]
    fn write_and_load_round_trip() {
        let (_dir, writer) = temp_writer();

        // Nothing has been written yet.
        assert!(writer.load_existing().unwrap().is_none());

        let run = aggregated_run();
        writer.write(&run).unwrap();

        let loaded = writer.load_existing().unwrap().unwrap();
        assert_eq!(loaded.run_id, run.run_id);
        assert_eq!(loaded.results.len(), 2);
        assert_eq!(loaded.aggregate.exact_match, 1);
    }

    #[test]
    fn summary_md_contains_table_header() {
        let (_dir, writer) = temp_writer();
        writer.write(&aggregated_run()).unwrap();

        let md = std::fs::read_to_string(writer.summary_path()).unwrap();
        for needle in ["| scenario_id | score |", "s1", "s2"] {
            assert!(md.contains(needle));
        }
    }

    #[test]
    fn write_creates_output_dir_if_absent() {
        let tmp = tempfile::tempdir().unwrap();
        let new_dir = tmp.path().join("new_subdir");
        assert!(!new_dir.exists());

        ResultWriter::new(&new_dir).unwrap();
        assert!(new_dir.exists());
    }

    #[test]
    fn resume_skips_completed_scenarios() {
        let (_dir, writer) = temp_writer();

        // Persist a partial, interrupted run in which only s1 has finished.
        let mut partial = make_run();
        partial.results.retain(|r| r.scenario_id == "s1");
        partial.status = RunStatus::Interrupted;
        partial.recompute_aggregate();
        writer.write(&partial).unwrap();

        let done = writer.load_existing().unwrap().unwrap().completed_ids();
        assert!(done.contains("s1"));
        assert!(!done.contains("s2"));
    }
}