zeph_bench/lib.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark harness for evaluating Zeph agent performance on standardized datasets.
5//!
6//! `zeph-bench` implements the CLI subcommand `zeph bench` and provides the building blocks
7//! for running reproducible evaluations against LOCOMO, FRAMES, GAIA, and other datasets.
8//!
9//! # Architecture
10//!
11//! The harness is built around three composable traits:
12//!
13//! - [`DatasetLoader`] — reads a dataset file and returns a [`Vec<Scenario>`].
14//! - [`Evaluator`] — scores one agent response against a [`Scenario`].
15//! - [`zeph_core::channel::Channel`] — implemented by [`BenchmarkChannel`] to drive the agent
16//! loop headlessly (no terminal, no network).
17//!
18//! Results are accumulated into a [`BenchRun`] and persisted by [`ResultWriter`], which writes
19//! both `results.json` (machine-readable) and `summary.md` (human-readable) to the output
20//! directory. Interrupted runs can be resumed via the `--resume` flag.
21//!
22//! # Quick Start
23//!
24//! ```no_run
25//! use std::path::Path;
26//! use zeph_bench::{DatasetRegistry, loaders::{LocomoLoader, LocomoEvaluator}};
27//! use zeph_bench::scenario::{DatasetLoader, Evaluator};
28//!
29//! // 1. Discover available datasets.
30//! let registry = DatasetRegistry::new();
31//! let meta = registry.get("locomo").expect("locomo is built-in");
32//! println!("dataset url: {}", meta.url);
33//!
34//! // 2. Load scenarios from a locally cached file.
35//! let scenarios = LocomoLoader.load(Path::new("/data/locomo.json")).unwrap();
36//!
37//! // 3. Evaluate a response.
38//! let result = LocomoEvaluator.evaluate(&scenarios[0], "some agent response");
39//! println!("score={:.4} passed={}", result.score, result.passed);
40//! ```
41//!
42//! # Deterministic Runs
43//!
44//! By default, the harness forces `temperature=0.0` on the configured provider so that runs are
45//! reproducible. Pass `--no-deterministic` on the CLI or call [`apply_deterministic_overrides`]
46//! with `no_deterministic = true` to disable this behaviour.
47//!
48//! # Modules
49//!
50//! | Module | Purpose |
51//! |--------|---------|
52//! | [`baseline`] | Baseline comparison types and delta computation |
53//! | [`channel`] | Headless [`BenchmarkChannel`] that drives the agent without I/O |
54//! | [`cli`] | Clap subcommand definition ([`BenchCommand`]) |
55//! | [`dataset`] | Dataset registry and metadata types |
56//! | [`deterministic`] | Temperature-zero override helpers |
57//! | [`error`] | [`BenchError`] error type |
58//! | [`isolation`] | Per-scenario storage isolation ([`BenchIsolation`]) |
59//! | [`loaders`] | Concrete loaders for LOCOMO, FRAMES, GAIA, LongMemEval, and tau-bench |
60//! | [`results`] | Result types and [`ResultWriter`] |
61//! | [`scenario`] | Core traits ([`DatasetLoader`], [`Evaluator`]) and scoring helpers |
62
63pub mod baseline;
64pub mod channel;
65pub mod cli;
66pub mod dataset;
67pub mod deterministic;
68pub mod error;
69pub mod isolation;
70pub mod loaders;
71pub mod results;
72pub mod scenario;
73
74pub use baseline::{BaselineComparison, ScenarioDelta};
75pub use channel::BenchmarkChannel;
76pub use cli::BenchCommand;
77pub use dataset::{DatasetFormat, DatasetMeta, DatasetRegistry};
78pub use deterministic::apply_deterministic_overrides;
79pub use error::BenchError;
80pub use isolation::BenchIsolation;
81pub use results::{Aggregate, BenchRun, ResultWriter, RunStatus, ScenarioResult};
82pub use scenario::{
83 DatasetLoader, EvalResult, Evaluator, Scenario, exact_match, gaia_normalized_exact_match,
84 token_f1,
85};