pub struct BenchRun {
pub dataset: String,
pub model: String,
pub run_id: String,
pub started_at: String,
pub finished_at: String,
pub status: RunStatus,
pub results: Vec<ScenarioResult>,
pub aggregate: Aggregate,
}
Top-level benchmark run record written to results.json.
The schema is a superset of the LongMemEval leaderboard submission format (NFR-008),
making it directly usable for leaderboard submission after a longmemeval run.
Create a default instance, then populate BenchRun::results incrementally and
call BenchRun::recompute_aggregate before persisting with ResultWriter.
§Examples
use zeph_bench::{BenchRun, RunStatus, Aggregate};
let run = BenchRun {
dataset: "gaia".into(),
model: "openai/gpt-4o".into(),
run_id: "a1b2c3".into(),
started_at: "2026-04-09T10:00:00Z".into(),
finished_at: String::new(),
status: RunStatus::Running,
results: vec![],
aggregate: Aggregate::default(),
};
assert_eq!(run.dataset, "gaia");
assert!(run.results.is_empty());
Fields§
dataset: String — Dataset name (e.g. "longmemeval").
model: String — Provider/model identifier (e.g. "openai/gpt-4o").
run_id: String — UUID v4 uniquely identifying this run.
started_at: String — RFC 3339 timestamp when the run started.
finished_at: String — RFC 3339 timestamp when the run ended (empty string if interrupted).
status: RunStatus — Run status.
results: Vec<ScenarioResult> — Per-scenario results.
aggregate: Aggregate — Aggregate statistics.
Implementations§
impl BenchRun
pub fn recompute_aggregate(&mut self)
Recompute BenchRun::aggregate from the current BenchRun::results list.
Call this after appending one or more ScenarioResults to keep the
aggregate statistics in sync before writing to disk.
§Examples
use zeph_bench::{BenchRun, RunStatus, ScenarioResult, Aggregate};
let mut run = BenchRun {
dataset: "frames".into(),
model: "openai/gpt-4o-mini".into(),
run_id: "r1".into(),
started_at: "2026-01-01T00:00:00Z".into(),
finished_at: String::new(),
status: RunStatus::Running,
results: vec![
ScenarioResult {
scenario_id: "frames_0".into(),
score: 1.0,
response_excerpt: "Paris".into(),
error: None,
elapsed_ms: 500,
},
],
aggregate: Aggregate::default(),
};
run.recompute_aggregate();
assert_eq!(run.aggregate.total, 1);
assert!((run.aggregate.mean_score - 1.0).abs() < f64::EPSILON);
assert_eq!(run.aggregate.exact_match, 1);
pub fn completed_ids(&self) -> HashSet<String>
Return the set of scenario IDs already present in BenchRun::results.
Used by the --resume logic to determine which scenarios can be skipped.
§Examples
use zeph_bench::{BenchRun, RunStatus, ScenarioResult, Aggregate};
let run = BenchRun {
dataset: "gaia".into(),
model: "openai/gpt-4o".into(),
run_id: "r2".into(),
started_at: "2026-01-01T00:00:00Z".into(),
finished_at: String::new(),
status: RunStatus::Interrupted,
results: vec![
ScenarioResult {
scenario_id: "t1".into(),
score: 1.0,
response_excerpt: "1945".into(),
error: None,
elapsed_ms: 300,
},
],
aggregate: Aggregate::default(),
};
let done = run.completed_ids();
assert!(done.contains("t1"));
assert!(!done.contains("t2"));
Trait Implementations§
impl<'de> Deserialize<'de> for BenchRun
fn deserialize<__D>(__deserializer: __D) -> Result<Self, __D::Error>
where
    __D: Deserializer<'de>,
Auto Trait Implementations§
impl Freeze for BenchRun
impl RefUnwindSafe for BenchRun
impl Send for BenchRun
impl Sync for BenchRun
impl Unpin for BenchRun
impl UnsafeUnpin for BenchRun
impl UnwindSafe for BenchRun
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> Instrument for T
fn instrument(self, span: Span) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise.
impl<T> IntoRequest<T> for T
fn into_request(self) -> Request<T>
Wrap the input message T in a tonic::Request