zeph_bench/runner.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark runner: drives `Agent<BenchmarkChannel>` over a dataset and collects results.
5//!
//! [`BenchRunner`] is the execution engine for `zeph bench run`. It is intentionally
//! minimal: the default path is baseline mode (no tools, no memory, no MCP), with
//! opt-in `SemanticMemory` via [`MemoryMode::On`] and a per-scenario tool environment
//! via [`BenchRunner::run_dataset_with_env_factory`]. Each scenario is run in
//! isolation through a fresh [`BenchmarkChannel`] and the agent's raw text response is
//! scored by an [`Evaluator`].
10//!
11//! # Usage
12//!
13//! ```no_run
14//! use std::path::Path;
15//! use zeph_bench::runner::{BenchRunner, RunOptions};
16//! use zeph_bench::loaders::{GaiaLoader, GaiaEvaluator};
17//! use zeph_llm::{any::AnyProvider, mock::MockProvider};
18//!
19//! # async fn example() -> Result<(), zeph_bench::BenchError> {
20//! let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()]));
21//! let runner = BenchRunner::new(provider, false);
22//! let opts = RunOptions::default();
23//! let run = runner.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("/data/gaia.jsonl"), opts).await?;
24//! println!("mean score: {:.4}", run.aggregate.mean_score);
25//! # Ok(())
26//! # }
27//! ```
28
29use std::collections::HashSet;
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32use std::time::Instant;
33
34use zeph_core::agent::Agent;
35use zeph_core::instructions::InstructionBlock;
36use zeph_llm::any::AnyProvider;
37use zeph_llm::provider::LlmProvider as _;
38use zeph_memory::semantic::SemanticMemory;
39use zeph_skills::registry::SkillRegistry;
40use zeph_tools::executor::{ToolError, ToolExecutor, ToolOutput};
41
42use crate::channel::BenchmarkChannel;
43use crate::error::BenchError;
44use crate::loaders::tau2_bench::{ActionTrace, TauBenchEvaluator};
45use crate::results::{BenchRun, RunStatus, ScenarioResult};
46use crate::scenario::{DatasetLoader, Evaluator, Scenario};
47
/// Controls how the runner processes the agent's raw text response.
///
/// Used by [`BenchRunner::run_one_with_executor`] to select the appropriate
/// system prompt and post-processing behaviour.
///
/// The mode is picked by the runner itself rather than by the caller: `run_one`
/// always passes [`ResponseMode::TerseAnswer`], and `run_dataset_with_env_factory`
/// always passes [`ResponseMode::ToolUse`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResponseMode {
    /// Inject a "shortest possible answer" system prompt and strip markdown from the response.
    ///
    /// The last response is reduced to a single cleaned-up line by
    /// `post_process_response` before it is handed to the evaluator.
    ///
    /// Used by all knowledge-retrieval datasets (GAIA, LOCOMO, FRAMES, `LongMemEval`).
    TerseAnswer,
    /// Inject a tool-use system prompt; return the raw agent response without post-processing.
    ///
    /// Used by tau2-bench where the evaluation is based on the action trace, not text output.
    ToolUse,
}
63
/// Controls whether `SemanticMemory` is wired into the agent during a benchmark run.
///
/// Selecting [`MemoryMode::On`] is not sufficient on its own: the runner only wires
/// memory when it also holds [`BenchMemoryParams`] (see
/// [`BenchRunner::with_memory_params`]). Without them the scenario runs exactly as in
/// [`MemoryMode::Off`].
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::MemoryMode;
///
/// assert_eq!(MemoryMode::default(), MemoryMode::Off);
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum MemoryMode {
    /// No `SemanticMemory` — current default behaviour.
    #[default]
    Off,
    /// Wire a `SQLite`-backed `SemanticMemory` into the agent via `Agent::with_memory`.
    ///
    /// A separate database file is created for every scenario and removed again
    /// (best-effort) once the scenario has run.
    On,
}
81
/// Parameters required to construct a per-scenario `SQLite`-backed `SemanticMemory`.
///
/// Populated by [`BenchRunner::with_memory_params`] and consumed inside
/// `BenchRunner::run_one_with_executor` (reached via `run_one` and
/// `run_dataset_with_env_factory`) when `opts.memory_mode == MemoryMode::On`.
///
/// The per-scenario database path is `data_dir/bench-{run_id}-{scenario_id}.db`.
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
/// use zeph_bench::runner::BenchMemoryParams;
///
/// let params = BenchMemoryParams {
///     data_dir: PathBuf::from("/tmp/bench"),
///     embedding_model: "nomic-embed-text".into(),
///     run_id: "bench-abc".into(),
///     dataset: "locomo".into(),
/// };
/// assert!(params.data_dir.to_string_lossy().contains("bench"));
/// ```
#[derive(Debug, Clone)]
pub struct BenchMemoryParams {
    /// Directory where per-scenario `SQLite` files live (deleted between scenarios).
    ///
    /// The derived path always contains the `bench-` segment (NFR-001), because the
    /// file name itself is prefixed with `bench-` regardless of this directory's name.
    pub data_dir: PathBuf,
    /// Embedding model name passed to `SemanticMemory`.
    pub embedding_model: String,
    /// Run ID used to namespace bench artifacts; matches the outer `BenchRun.run_id`.
    ///
    /// NOTE(review): the runner generates `BenchRun.run_id` itself and does not read it
    /// from here, so keeping the two in sync is the caller's responsibility — confirm.
    pub run_id: String,
    /// Dataset name used to namespace bench artifacts.
    ///
    /// Not read by the runner in this file; the database file name is built from
    /// `run_id` and the scenario ID only.
    pub dataset: String,
}
114
/// Options that control which scenarios are executed and whether to resume a prior run.
///
/// Build via [`RunOptions::default`] and override the fields you need.
///
/// Both selection fields are applied together: a scenario runs only if it is not in
/// `completed_ids` and, when a filter is set, its ID equals the filter.
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::{RunOptions, MemoryMode};
///
/// // Run all scenarios.
/// let opts = RunOptions::default();
/// assert!(opts.scenario_filter.is_none());
/// assert!(opts.completed_ids.is_empty());
/// assert_eq!(opts.memory_mode, MemoryMode::Off);
/// ```
#[derive(Debug, Default)]
pub struct RunOptions {
    /// When `Some(id)`, only the scenario with this ID is executed.
    ///
    /// The runner returns an error if no scenario in the loaded dataset has this ID.
    pub scenario_filter: Option<String>,
    /// Set of scenario IDs already completed in a prior run (used for `--resume`).
    ///
    /// Scenarios listed here are skipped and produce no `ScenarioResult`.
    pub completed_ids: HashSet<String>,
    /// Whether to wire a `SemanticMemory` backend into the agent for this run.
    pub memory_mode: MemoryMode,
}
139
/// Minimal no-op tool executor for baseline benchmark runs.
///
/// `execute` returns `Ok(None)` on every call, so a baseline run never produces any
/// tool output. Only `execute` is implemented here; everything else (such as the
/// advertised tool list) comes from the `ToolExecutor` trait's default methods —
/// presumably an empty list, but confirm against the trait definition.
struct NoopExecutor;

impl ToolExecutor for NoopExecutor {
    // The response text is deliberately ignored: there is nothing to execute.
    async fn execute(&self, _response: &str) -> Result<Option<ToolOutput>, ToolError> {
        Ok(None)
    }
}
151
/// Drives [`Agent<BenchmarkChannel>`] over a dataset and collects scored results.
///
/// Each call to [`run_dataset`][BenchRunner::run_dataset] creates a fresh agent per
/// scenario (baseline mode: no tools, no MCP). Memory is optionally wired via
/// [`BenchRunner::with_memory_params`] and [`RunOptions::memory_mode`].
///
/// Tool-driven datasets go through
/// [`run_dataset_with_env_factory`][BenchRunner::run_dataset_with_env_factory] instead,
/// which supplies a caller-built executor for every scenario.
///
/// The provider is cloned for every scenario's agent (and again for the memory backend
/// when memory is enabled).
///
/// # Examples
///
/// ```no_run
/// use zeph_bench::runner::BenchRunner;
/// use zeph_llm::{any::AnyProvider, mock::MockProvider};
///
/// let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["Paris".into()]));
/// let runner = BenchRunner::new(provider, false);
/// ```
pub struct BenchRunner {
    /// LLM provider shared by all scenarios of a run.
    provider: AnyProvider,
    /// Parameters for constructing per-scenario `SQLite`-backed `SemanticMemory`.
    ///
    /// Set via [`BenchRunner::with_memory_params`]; required when
    /// `RunOptions::memory_mode == MemoryMode::On`. `None` until that builder is called.
    memory_params: Option<BenchMemoryParams>,
}
175
176impl BenchRunner {
177 /// Create a new runner with the given provider.
178 ///
179 /// The `no_deterministic` argument is unused at runtime but kept in the public API
180 /// so the bench command can pass it through for future use (e.g., logging or config).
181 /// Apply deterministic overrides to `provider` before calling this if needed.
182 ///
183 /// # Examples
184 ///
185 /// ```no_run
186 /// use zeph_bench::runner::BenchRunner;
187 /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
188 ///
189 /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
190 /// let runner = BenchRunner::new(provider, false);
191 /// ```
192 #[must_use]
193 pub fn new(provider: AnyProvider, _no_deterministic: bool) -> Self {
194 Self {
195 provider,
196 memory_params: None,
197 }
198 }
199
200 /// Attach `SemanticMemory` parameters for memory-on benchmark runs.
201 ///
202 /// When set, a per-scenario `SQLite`-backed `SemanticMemory` is constructed inside
203 /// [`run_one`][BenchRunner::run_one] whenever `opts.memory_mode == MemoryMode::On`.
204 ///
205 /// # Examples
206 ///
207 /// ```no_run
208 /// use std::path::PathBuf;
209 /// use zeph_bench::runner::{BenchRunner, BenchMemoryParams};
210 /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
211 ///
212 /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
213 /// let params = BenchMemoryParams {
214 /// data_dir: PathBuf::from("/tmp/bench-data"),
215 /// embedding_model: "nomic-embed-text".into(),
216 /// run_id: "bench-abc".into(),
217 /// dataset: "locomo".into(),
218 /// };
219 /// let runner = BenchRunner::new(provider, false).with_memory_params(params);
220 /// ```
221 #[must_use]
222 pub fn with_memory_params(mut self, params: BenchMemoryParams) -> Self {
223 self.memory_params = Some(params);
224 self
225 }
226
227 /// Run all matching scenarios from `path` through the agent and return a [`BenchRun`].
228 ///
229 /// For each scenario:
230 /// 1. Builds a fresh `Agent<BenchmarkChannel>` with no tools or memory.
231 /// 2. Feeds the scenario prompt and collects the agent's response.
232 /// 3. Scores the response with `evaluator`.
233 /// 4. Appends a [`ScenarioResult`] and recomputes aggregate statistics.
234 ///
235 /// The returned [`BenchRun`] has `status = Running` until the caller sets it to
236 /// `Completed` or `Interrupted`.
237 ///
238 /// # Errors
239 ///
240 /// Returns [`BenchError`] if the dataset cannot be loaded or a scenario run fails.
241 pub async fn run_dataset<L, E>(
242 &self,
243 loader: &L,
244 evaluator: &E,
245 path: &Path,
246 opts: RunOptions,
247 ) -> Result<BenchRun, BenchError>
248 where
249 L: DatasetLoader,
250 E: Evaluator,
251 {
252 let scenarios = loader.load(path)?;
253
254 if let Some(ref filter) = opts.scenario_filter
255 && !scenarios.iter().any(|s| &s.id == filter)
256 {
257 return Err(BenchError::InvalidFormat(format!(
258 "scenario '{}' not found in dataset '{}'",
259 filter,
260 loader.name()
261 )));
262 }
263
264 let model_id = self.provider.model_identifier().to_owned();
265
266 let mut run = BenchRun {
267 dataset: loader.name().to_owned(),
268 model: model_id,
269 run_id: uuid(),
270 started_at: now_rfc3339(),
271 finished_at: String::new(),
272 status: RunStatus::Running,
273 results: vec![],
274 aggregate: crate::results::Aggregate::default(),
275 };
276
277 for scenario in &scenarios {
278 // Skip if resume is active and scenario already completed.
279 if opts.completed_ids.contains(&scenario.id) {
280 continue;
281 }
282 // Skip if a single-scenario filter is active.
283 if let Some(ref filter) = opts.scenario_filter
284 && &scenario.id != filter
285 {
286 continue;
287 }
288
289 let t0 = Instant::now();
290 let response_text = Box::pin(self.run_one(scenario, opts.memory_mode)).await?;
291 let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
292
293 let eval = evaluator.evaluate(scenario, &response_text);
294 let excerpt = response_text.chars().take(200).collect::<String>();
295
296 run.results.push(ScenarioResult {
297 scenario_id: scenario.id.clone(),
298 score: eval.score,
299 response_excerpt: excerpt,
300 error: None,
301 elapsed_ms,
302 });
303 run.recompute_aggregate();
304 }
305
306 Ok(run)
307 }
308
309 /// Run all scenarios from `path` through a per-scenario env executor and return a [`BenchRun`].
310 ///
311 /// This is the execution path for tool-driven datasets (tau2-bench). For each scenario:
312 /// 1. Calls `env_factory(scenario)` to build a fresh `(ToolExecutor, ActionTrace)`.
313 /// 2. Builds a fresh `TauBenchEvaluator` from the scenario metadata and the trace.
314 /// 3. Runs the agent with the env executor and the tool-use system prompt.
315 /// 4. Scores the response via the evaluator (reads the populated trace).
316 ///
317 /// # Errors
318 ///
319 /// Returns [`BenchError`] if the dataset cannot be loaded, the env factory fails, or
320 /// `TauBenchEvaluator::from_scenario` fails (malformed metadata).
321 pub async fn run_dataset_with_env_factory<L, F, X>(
322 &self,
323 loader: &L,
324 env_factory: F,
325 path: &Path,
326 opts: RunOptions,
327 ) -> Result<BenchRun, BenchError>
328 where
329 L: DatasetLoader,
330 F: Fn(&Scenario) -> Result<(X, ActionTrace), BenchError>,
331 X: ToolExecutor + Send + Sync + 'static,
332 {
333 let scenarios = loader.load(path)?;
334
335 if let Some(ref filter) = opts.scenario_filter
336 && !scenarios.iter().any(|s| &s.id == filter)
337 {
338 return Err(BenchError::InvalidFormat(format!(
339 "scenario '{}' not found in dataset '{}'",
340 filter,
341 loader.name()
342 )));
343 }
344
345 let model_id = self.provider.model_identifier().to_owned();
346
347 let mut run = BenchRun {
348 dataset: loader.name().to_owned(),
349 model: model_id,
350 run_id: uuid(),
351 started_at: now_rfc3339(),
352 finished_at: String::new(),
353 status: RunStatus::Running,
354 results: vec![],
355 aggregate: crate::results::Aggregate::default(),
356 };
357
358 for scenario in &scenarios {
359 if opts.completed_ids.contains(&scenario.id) {
360 continue;
361 }
362 if let Some(ref filter) = opts.scenario_filter
363 && &scenario.id != filter
364 {
365 continue;
366 }
367
368 let (executor, trace) = env_factory(scenario)?;
369 let evaluator = TauBenchEvaluator::from_scenario(scenario, trace)?;
370
371 let t0 = Instant::now();
372 let response_text = Box::pin(self.run_one_with_executor(
373 scenario,
374 executor,
375 opts.memory_mode,
376 ResponseMode::ToolUse,
377 ))
378 .await?;
379 let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
380
381 let eval = evaluator.evaluate(scenario, &response_text);
382 let excerpt = response_text.chars().take(200).collect::<String>();
383
384 run.results.push(ScenarioResult {
385 scenario_id: scenario.id.clone(),
386 score: eval.score,
387 response_excerpt: excerpt,
388 error: None,
389 elapsed_ms,
390 });
391 run.recompute_aggregate();
392 }
393
394 Ok(run)
395 }
396
397 /// Run a single scenario through a fresh agent and return the last response text.
398 ///
399 /// A concise-answer system prompt is injected via [`InstructionBlock`] so the model
400 /// responds with only the final answer (a number, word, or short phrase) rather than
401 /// full sentences. The raw response is then post-processed to extract the first
402 /// non-empty line and strip markdown formatting, which further reduces noise for
403 /// evaluators that perform exact or near-exact matching.
404 ///
405 /// When `memory_mode == MemoryMode::On`, a per-scenario `SQLite`-backed
406 /// `SemanticMemory` is constructed and wired into the agent. The database file is
407 /// deleted after the scenario completes (best-effort, NFR-001).
408 ///
409 /// # Errors
410 ///
411 /// Returns [`BenchError::InvalidFormat`] when the scenario has no user turn or when
412 /// `SemanticMemory` initialisation fails.
413 async fn run_one(
414 &self,
415 scenario: &Scenario,
416 memory_mode: MemoryMode,
417 ) -> Result<String, BenchError> {
418 Box::pin(self.run_one_with_executor(
419 scenario,
420 NoopExecutor,
421 memory_mode,
422 ResponseMode::TerseAnswer,
423 ))
424 .await
425 }
426
427 /// Core execution: run one scenario with the given executor and response mode.
428 ///
429 /// Called by both [`BenchRunner::run_dataset`] (with `NoopExecutor` + `TerseAnswer`) and
430 /// [`BenchRunner::run_dataset_with_env_factory`] (with the domain env + `ToolUse`).
431 async fn run_one_with_executor<X: ToolExecutor + Send + Sync + 'static>(
432 &self,
433 scenario: &Scenario,
434 executor: X,
435 memory_mode: MemoryMode,
436 mode: ResponseMode,
437 ) -> Result<String, BenchError> {
438 let prompt = scenario.primary_prompt()?.to_owned();
439 let channel = BenchmarkChannel::new(vec![prompt]);
440 // TODO(multi-turn-history): when loaders emit multiple user turns, push each in
441 // order and seed assistant turns into the channel as captured-history.
442 let registry = SkillRegistry::empty();
443
444 let system_content = match mode {
445 ResponseMode::TerseAnswer => concat!(
446 "You are an evaluation assistant. ",
447 "Answer every question with the shortest possible response. ",
448 "Give only the final answer — no explanation, no full sentences, ",
449 "no punctuation unless it is part of the answer. ",
450 "If the answer is a single word or number, respond with only that word or number."
451 ),
452 ResponseMode::ToolUse => concat!(
453 "You are a customer-service agent. ",
454 "Use the available tools to help the user. ",
455 "Always call a tool when one applies; do not ask the user to perform actions you can perform yourself. ",
456 "When you have completed the user's request, respond with a brief confirmation."
457 ),
458 };
459
460 let blocks = vec![InstructionBlock {
461 source: PathBuf::from("<bench-system-prompt>"),
462 content: system_content.to_owned(),
463 }];
464
465 let base_agent = Agent::new(self.provider.clone(), channel, registry, None, 1, executor)
466 .with_instruction_blocks(blocks);
467
468 // Optionally wire SemanticMemory when the caller requests memory-on mode.
469 let (mut agent, scenario_db) = if memory_mode == MemoryMode::On
470 && let Some(ref params) = self.memory_params
471 {
472 // One SQLite file per scenario gives strict isolation (NFR-001 choice (a)).
473 // This is more files than a per-run DB, but eliminates any cross-scenario
474 // memory bleed and avoids needing BenchIsolation::reset() between scenarios.
475 let scenario_db = params
476 .data_dir
477 .join(format!("bench-{}-{}.db", params.run_id, scenario.id));
478 debug_assert!(
479 scenario_db.to_string_lossy().contains("bench-"),
480 "NFR-001: bench SQLite path must be namespaced with 'bench-'"
481 );
482
483 tracing::debug!(
484 scenario_id = %scenario.id,
485 path = %scenario_db.display(),
486 "bench: memory init start"
487 );
488 let memory = Arc::new(
489 tokio::time::timeout(
490 std::time::Duration::from_secs(10),
491 SemanticMemory::with_sqlite_backend(
492 scenario_db.to_string_lossy().as_ref(),
493 self.provider.clone(),
494 ¶ms.embedding_model,
495 0.7,
496 0.3,
497 ),
498 )
499 .await
500 .map_err(|_| {
501 BenchError::InvalidFormat(format!(
502 "SemanticMemory init timed out for scenario '{}'",
503 scenario.id
504 ))
505 })?
506 .map_err(|e| BenchError::InvalidFormat(format!("SemanticMemory init: {e}")))?,
507 );
508 tracing::debug!(scenario_id = %scenario.id, "bench: memory init done");
509
510 // Seed the sessions table so persist_message does not fail with FK violation.
511 let conv_id = memory
512 .sqlite()
513 .create_conversation()
514 .await
515 .map_err(|e| BenchError::InvalidFormat(format!("create_conversation: {e}")))?;
516
517 // summarization_threshold = 100_000 deliberately suppresses LLM-driven
518 // compaction during bench runs. Compaction calls another LLM round-trip
519 // with non-deterministic timing/output, which would violate FR-003
520 // (deterministic runs). recall_limit = 20 is generous enough to surface
521 // long-context memory effects without silently capping LongMemEval scores
522 // below their theoretical maximum. history_limit = 200 covers the longest
523 // LongMemEval session without truncation.
524 let wired_agent = base_agent.with_memory(memory, conv_id, 200, 20, 100_000);
525 (wired_agent, Some(scenario_db))
526 } else {
527 (base_agent, None)
528 };
529
530 // Ignore agent errors — a failed LLM call still yields an empty response that
531 // the evaluator scores as 0.0 rather than aborting the entire run.
532 let _ = Box::pin(agent.run()).await;
533 let responses = agent.into_channel().into_responses();
534
535 // Best-effort cleanup: delete per-scenario SQLite file after the run.
536 // Failure is intentionally ignored — NFR-001 is hygiene, not correctness.
537 if let Some(ref db_path) = scenario_db {
538 let _ = std::fs::remove_file(db_path);
539 }
540
541 let raw = responses
542 .into_iter()
543 .last()
544 .map(|r| r.text)
545 .unwrap_or_default();
546
547 Ok(match mode {
548 ResponseMode::TerseAnswer => post_process_response(&raw),
549 // Verified: dropping send_tool_output does NOT affect the agent loop's tool-result
550 // feedback to the LLM. Tool outputs flow via Agent's internal MessagePart::ToolResult,
551 // not via the channel. See crates/zeph-core/src/agent/tool_execution/native.rs.
552 ResponseMode::ToolUse => raw,
553 })
554 }
555}
556
/// Post-process the raw agent response to extract a clean, terse answer.
///
/// Each line is cleaned in turn and the first one that still has content is returned:
/// 1. Trim surrounding whitespace.
/// 2. Strip markdown markers from both ends (`*`, `_`, `` ` ``), then remove any
///    remaining `**` and `` ` `` inside the line.
/// 3. Trim again.
///
/// Lines that are empty, or that consist solely of markdown markers (for example a bare
/// code fence or a stray `**`), are skipped so that an answer wrapped in a fenced block
/// is still found. Text following the chosen line is discarded, which drops any
/// explanation appended after the answer.
///
/// Returns an empty string when no line has content.
///
/// This is a best-effort cleanup. Evaluators still normalize the result, so minor
/// leftover punctuation is handled downstream.
fn post_process_response(raw: &str) -> String {
    /// Strip common markdown formatting characters from a single line.
    fn strip_markdown(line: &str) -> String {
        line.trim()
            .trim_matches(|c: char| matches!(c, '*' | '_' | '`' | ' ' | '\t'))
            .replace("**", "")
            .replace('`', "")
            .trim()
            .to_owned()
    }

    raw.lines()
        .map(strip_markdown)
        .find(|cleaned| !cleaned.is_empty())
        .unwrap_or_default()
}
582
/// Generate a short pseudo-UUID-like run ID without the `uuid` crate.
///
/// The ID has the form `bench-{secs}-{nanos}`, where both parts are lowercase
/// hexadecimal: whole seconds since the Unix epoch and the sub-second nanoseconds.
/// The clock is read once so that both parts describe the same instant. If the system
/// clock is set before the Unix epoch, both parts are `0`.
///
/// Uses `std::time::SystemTime` for uniqueness. Not cryptographically random but
/// sufficient for benchmark run identification.
fn uuid() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    let (secs, ns) = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or((0, 0), |d| (d.as_secs(), d.subsec_nanos()));
    format!("bench-{secs:x}-{ns:x}")
}
597
598/// RFC 3339-like timestamp using `std` only (no chrono).
599fn now_rfc3339() -> String {
600 use std::time::{SystemTime, UNIX_EPOCH};
601 let secs = SystemTime::now()
602 .duration_since(UNIX_EPOCH)
603 .map_or(0, |d| d.as_secs());
604 // Minimal ISO 8601 UTC representation — good enough for result metadata.
605 let (y, mo, d, h, mi, s) = secs_to_ymdhms(secs);
606 format!("{y:04}-{mo:02}-{d:02}T{h:02}:{mi:02}:{s:02}Z")
607}
608
/// Split Unix seconds into `(year, month, day, hour, minute, second)` in UTC.
///
/// Month and day are 1-based. The date part uses proleptic Gregorian arithmetic on a
/// calendar whose year starts on 1 March, so the leap day falls at the end of the
/// shifted year and needs no special casing.
fn secs_to_ymdhms(secs: u64) -> (u64, u64, u64, u64, u64, u64) {
    const SECS_PER_DAY: u64 = 86_400;
    const SECS_PER_HOUR: u64 = 3_600;
    const SECS_PER_MIN: u64 = 60;
    const DAYS_PER_ERA: u64 = 146_097; // one 400-year Gregorian cycle
    const EPOCH_SHIFT_DAYS: u64 = 719_468; // days from 0000-03-01 to 1970-01-01

    // Time of day.
    let in_day = secs % SECS_PER_DAY;
    let hour = in_day / SECS_PER_HOUR;
    let minute = in_day % SECS_PER_HOUR / SECS_PER_MIN;
    let second = in_day % SECS_PER_MIN;

    // Calendar date, counted from the shifted epoch.
    let shifted_days = secs / SECS_PER_DAY + EPOCH_SHIFT_DAYS;
    let era = shifted_days / DAYS_PER_ERA;
    let day_of_era = shifted_days % DAYS_PER_ERA;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);

    // Month index 0 is March; indices 10 and 11 are January and February of the
    // following civil year.
    let month_index = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * month_index + 2) / 5 + 1;
    let (month, year_carry) = if month_index < 10 {
        (month_index + 3, 0)
    } else {
        (month_index - 9, 1)
    };
    let year = year_of_era + era * 400 + year_carry;

    (year, month, day, hour, minute, second)
}
635
/// Unit tests for the option defaults, the memory-parameter builder, and the
/// `std`-only helpers (run ID, timestamp, calendar arithmetic, response clean-up).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn run_options_default_is_empty() {
        let opts = RunOptions::default();
        assert!(opts.scenario_filter.is_none());
        assert!(opts.completed_ids.is_empty());
        assert_eq!(opts.memory_mode, MemoryMode::Off);
    }

    #[test]
    fn memory_mode_default_is_off() {
        assert_eq!(MemoryMode::default(), MemoryMode::Off);
    }

    #[test]
    fn with_memory_params_sets_isolation() {
        use zeph_llm::{any::AnyProvider, mock::MockProvider};
        let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
        let params = BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: "bench-abc".into(),
            dataset: "locomo".into(),
        };
        let runner = BenchRunner::new(provider, false).with_memory_params(params);
        assert!(runner.memory_params.is_some());
        let stored = runner.memory_params.unwrap();
        assert_eq!(stored.run_id, "bench-abc");
        assert_eq!(stored.dataset, "locomo");
    }

    // Mirrors the path formula used by the runner; it does not exercise the runner itself.
    #[test]
    fn nfr_001_sqlite_path_namespaced() {
        let params = BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: "run-xyz".into(),
            dataset: "locomo".into(),
        };
        let scenario_id = "s1_0";
        let scenario_db = params
            .data_dir
            .join(format!("bench-{}-{}.db", params.run_id, scenario_id));
        assert!(
            scenario_db.to_string_lossy().contains("bench-"),
            "NFR-001: SQLite path must contain bench- prefix"
        );
    }

    #[test]
    fn now_rfc3339_has_correct_format() {
        let ts = now_rfc3339();
        // e.g. "2026-04-25T10:30:00Z"
        assert_eq!(ts.len(), 20);
        assert!(ts.ends_with('Z'));
        assert!(ts.contains('T'));
        // Separators sit at fixed offsets; every other position is a digit.
        for (idx, ch) in ts.chars().enumerate() {
            match idx {
                4 | 7 => assert_eq!(ch, '-'),
                10 => assert_eq!(ch, 'T'),
                13 | 16 => assert_eq!(ch, ':'),
                19 => assert_eq!(ch, 'Z'),
                _ => assert!(ch.is_ascii_digit(), "unexpected '{ch}' at offset {idx}"),
            }
        }
    }

    #[test]
    fn secs_to_ymdhms_unix_epoch() {
        assert_eq!(secs_to_ymdhms(0), (1970, 1, 1, 0, 0, 0));
        assert_eq!(secs_to_ymdhms(86_399), (1970, 1, 1, 23, 59, 59));
    }

    #[test]
    fn secs_to_ymdhms_handles_leap_day() {
        assert_eq!(secs_to_ymdhms(951_782_400), (2000, 2, 29, 0, 0, 0));
        assert_eq!(secs_to_ymdhms(951_868_800), (2000, 3, 1, 0, 0, 0));
    }

    #[test]
    fn secs_to_ymdhms_known_timestamps() {
        assert_eq!(secs_to_ymdhms(1_000_000_000), (2001, 9, 9, 1, 46, 40));
        assert_eq!(secs_to_ymdhms(1_700_000_000), (2023, 11, 14, 22, 13, 20));
        assert_eq!(secs_to_ymdhms(1_704_067_199), (2023, 12, 31, 23, 59, 59));
        assert_eq!(secs_to_ymdhms(1_704_067_200), (2024, 1, 1, 0, 0, 0));
    }

    #[test]
    fn uuid_generates_non_empty_string() {
        let id = uuid();
        assert!(id.starts_with("bench-"));
        assert!(id.len() > 10);
    }

    #[test]
    fn post_process_takes_first_line() {
        let raw = "1945\n\nWorld War II ended in 1945.";
        assert_eq!(post_process_response(raw), "1945");
    }

    #[test]
    fn post_process_strips_markdown_bold() {
        assert_eq!(post_process_response("**1945**"), "1945");
    }

    #[test]
    fn post_process_strips_backticks() {
        assert_eq!(post_process_response("`Au`"), "Au");
    }

    #[test]
    fn post_process_trims_whitespace() {
        assert_eq!(post_process_response("  Paris  "), "Paris");
    }

    #[test]
    fn post_process_empty_input_returns_empty() {
        assert_eq!(post_process_response(""), "");
    }

    #[test]
    fn post_process_skips_empty_leading_lines() {
        let raw = "\n\n  \nParis";
        assert_eq!(post_process_response(raw), "Paris");
    }
}