zeph_bench/lib.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4// Raised from the default 128 to accommodate deeply nested async state machines
5// generated by #[tracing::instrument] on the agent call stack.
6#![recursion_limit = "256"]
7
8//! Benchmark harness for evaluating Zeph agent performance on standardized datasets.
9//!
10//! `zeph-bench` implements the CLI subcommand `zeph bench` and provides the building blocks
11//! for running reproducible evaluations against LOCOMO, FRAMES, GAIA, and other datasets.
12//!
13//! # Architecture
14//!
15//! The harness is built around three composable traits:
16//!
17//! - [`DatasetLoader`] — reads a dataset file and returns a [`Vec<Scenario>`].
18//! - [`Evaluator`] — scores one agent response against a [`Scenario`].
19//! - [`zeph_core::channel::Channel`] — implemented by [`BenchmarkChannel`] to drive the agent
20//! loop headlessly (no terminal, no network).
21//!
22//! Results are accumulated into a [`BenchRun`] and persisted by [`ResultWriter`], which writes
23//! both `results.json` (machine-readable) and `summary.md` (human-readable) to the output
24//! directory. An interrupted run can be resumed by passing the `--resume` flag.
25//!
26//! # Quick Start
27//!
28//! ```no_run
29//! use std::path::Path;
30//! use zeph_bench::{DatasetRegistry, loaders::{LocomoLoader, LocomoEvaluator}};
31//! use zeph_bench::scenario::{DatasetLoader, Evaluator};
32//!
33//! // 1. Discover available datasets.
34//! let registry = DatasetRegistry::new();
35//! let meta = registry.get("locomo").expect("locomo is built-in");
36//! println!("dataset url: {}", meta.url);
37//!
38//! // 2. Load scenarios from a locally cached file.
39//! let scenarios = LocomoLoader.load(Path::new("/data/locomo.json")).unwrap();
40//!
41//! // 3. Evaluate a response.
42//! let result = LocomoEvaluator.evaluate(&scenarios[0], "some agent response");
43//! println!("score={:.4} passed={}", result.score, result.passed);
44//! ```
45//!
46//! # Deterministic Runs
47//!
48//! By default the harness forces `temperature=0.0` on the configured provider so that runs are
49//! reproducible. Pass `--no-deterministic` on the CLI or call [`apply_deterministic_overrides`]
50//! with `no_deterministic = true` to disable this behaviour.
51//!
52//! # Modules
53//!
54//! | Module | Purpose |
55//! |--------|---------|
56//! | [`baseline`] | Baseline comparison types and delta computation |
57//! | [`channel`] | Headless [`BenchmarkChannel`] that drives the agent without I/O |
58//! | [`cli`] | Clap subcommand definition ([`BenchCommand`]) |
59//! | [`dataset`] | Dataset registry and metadata types |
60//! | [`deterministic`] | Temperature-zero override helpers |
61//! | [`error`] | [`BenchError`] error type |
62//! | [`isolation`] | Per-scenario storage isolation ([`BenchIsolation`]) |
63//! | [`loaders`] | Concrete loaders for LOCOMO, FRAMES, GAIA, LongMemEval, and tau-bench |
64//! | [`results`] | Result types and [`ResultWriter`] |
65//! | [`runner`] | [`BenchRunner`] that drives the agent loop over a dataset |
66//! | [`scenario`] | Core traits ([`DatasetLoader`], [`Evaluator`]) and scoring helpers |
67
68pub mod baseline;
69pub mod channel;
70pub mod cli;
71pub mod dataset;
72pub mod deterministic;
73pub mod error;
74pub mod isolation;
75pub mod loaders;
76pub mod results;
77pub mod runner;
78pub mod scenario;
79
80pub use baseline::{BaselineComparison, ScenarioDelta};
81pub use channel::BenchmarkChannel;
82pub use cli::BenchCommand;
83pub use dataset::{DatasetFormat, DatasetMeta, DatasetRegistry};
84pub use deterministic::apply_deterministic_overrides;
85pub use error::BenchError;
86pub use isolation::BenchIsolation;
87pub use results::{Aggregate, BenchRun, ResultWriter, RunStatus, ScenarioResult};
88pub use runner::{BenchMemoryParams, BenchRunner, MemoryMode, ResponseMode, RunOptions};
89pub use scenario::{
90 DatasetLoader, EvalResult, Evaluator, Role, Scenario, Turn, exact_match,
91 gaia_normalized_exact_match, token_f1,
92};