zeph_bench/runner.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark runner: drives `Agent<BenchmarkChannel>` over a dataset and collects results.
5//!
//! [`BenchRunner`] is the execution engine for `zeph bench run`. It is intentionally
//! minimal: by default it runs in baseline mode (no tools, no memory, no MCP), with
//! opt-in `SemanticMemory` via [`MemoryMode::On`] and per-scenario tool environments via
//! [`BenchRunner::run_dataset_with_env_factory`]. Each scenario is run in isolation
//! through a fresh [`BenchmarkChannel`] and the agent's text response is scored by an
//! [`Evaluator`].
10//!
11//! # Usage
12//!
13//! ```no_run
14//! use std::path::Path;
15//! use zeph_bench::runner::{BenchRunner, RunOptions};
16//! use zeph_bench::loaders::{GaiaLoader, GaiaEvaluator};
17//! use zeph_llm::{any::AnyProvider, mock::MockProvider};
18//!
19//! # async fn example() -> Result<(), zeph_bench::BenchError> {
20//! let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()]));
21//! let runner = BenchRunner::new(provider);
22//! let opts = RunOptions::default();
23//! let run = runner.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("/data/gaia.jsonl"), opts).await?;
24//! println!("mean score: {:.4}", run.aggregate.mean_score);
25//! # Ok(())
26//! # }
27//! ```
28
29use std::collections::HashSet;
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32use std::time::Instant;
33
34use zeph_common::timestamp;
35use zeph_core::agent::Agent;
36use zeph_core::instructions::InstructionBlock;
37use zeph_llm::any::AnyProvider;
38use zeph_llm::provider::LlmProvider as _;
39use zeph_memory::semantic::SemanticMemory;
40use zeph_skills::registry::SkillRegistry;
41use zeph_tools::executor::{ToolError, ToolExecutor, ToolOutput};
42
43use crate::channel::BenchmarkChannel;
44use crate::error::BenchError;
45use crate::loaders::tau2_bench::{ActionTrace, TauBenchEvaluator};
46use crate::results::{BenchRun, RunStatus, ScenarioResult};
47use crate::scenario::{DatasetLoader, Evaluator, Scenario};
48
/// Selects the system prompt and the post-processing applied to the agent's final reply.
///
/// [`BenchRunner::run_one_with_executor`] matches on this value twice: once to pick the
/// injected system prompt and once to decide whether the reply is cleaned up before it
/// is returned.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum ResponseMode {
    /// Ask for the shortest possible answer and strip markdown from the reply.
    ///
    /// Used by all knowledge-retrieval datasets (GAIA, LOCOMO, FRAMES, `LongMemEval`).
    TerseAnswer,
    /// Ask the agent to use tools and hand back its reply untouched.
    ///
    /// Used by tau2-bench, where scoring is based on the action trace rather than on the
    /// text output.
    ToolUse,
}
65
/// Switch deciding whether a benchmark run gives the agent a `SemanticMemory`.
///
/// The default is [`MemoryMode::Off`].
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::MemoryMode;
///
/// assert_eq!(MemoryMode::default(), MemoryMode::Off);
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum MemoryMode {
    /// Run without `SemanticMemory`; this is the default.
    #[default]
    Off,
    /// Attach a `SQLite`-backed `SemanticMemory` to the agent through `Agent::with_memory`.
    On,
}
84
/// Everything the runner needs to build a `SQLite`-backed `SemanticMemory` for one scenario.
///
/// Hand an instance to [`BenchRunner::with_memory_params`]; the runner reads it while
/// executing a scenario when the run was started with `MemoryMode::On`.
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
/// use zeph_bench::runner::BenchMemoryParams;
///
/// let params = BenchMemoryParams {
///     data_dir: PathBuf::from("/tmp/bench"),
///     embedding_model: "nomic-embed-text".into(),
///     run_id: "bench-abc".into(),
///     dataset: "locomo".into(),
/// };
/// assert!(params.data_dir.to_string_lossy().contains("bench"));
/// ```
#[derive(Debug, Clone)]
pub struct BenchMemoryParams {
    /// Directory that holds the per-scenario `SQLite` files (removed after each scenario).
    ///
    /// File names derived from it always carry a `bench-` prefix (NFR-001).
    pub data_dir: PathBuf,
    /// Name of the embedding model handed to `SemanticMemory`.
    pub embedding_model: String,
    /// Run ID that namespaces bench artifacts; matches the outer `BenchRun.run_id`.
    pub run_id: String,
    /// Dataset name that namespaces bench artifacts.
    pub dataset: String,
}
117
118/// Options that control which scenarios are executed and whether to resume a prior run.
119///
120/// Build via [`RunOptions::default`] and override the fields you need.
121///
122/// # Examples
123///
124/// ```
125/// use zeph_bench::runner::{RunOptions, MemoryMode};
126///
127/// // Run all scenarios.
128/// let opts = RunOptions::default();
129/// assert!(opts.scenario_filter.is_none());
130/// assert!(opts.completed_ids.is_empty());
131/// assert_eq!(opts.memory_mode, MemoryMode::Off);
132/// ```
133#[derive(Debug, Default)]
134pub struct RunOptions {
135 /// When `Some(id)`, only the scenario with this ID is executed.
136 pub scenario_filter: Option<String>,
137 /// Set of scenario IDs already completed in a prior run (used for `--resume`).
138 pub completed_ids: HashSet<String>,
139 /// Whether to wire a `SemanticMemory` backend into the agent for this run.
140 pub memory_mode: MemoryMode,
141}
142
143/// Minimal no-op tool executor for baseline benchmark runs.
144///
145/// Returns an empty tool list and `Ok(None)` on every execute call, ensuring that
146/// the agent loop cannot invoke any tools during a benchmark run.
147struct NoopExecutor;
148
149impl ToolExecutor for NoopExecutor {
150 async fn execute(&self, _response: &str) -> Result<Option<ToolOutput>, ToolError> {
151 Ok(None)
152 }
153}
154
155/// Drives [`Agent<BenchmarkChannel>`] over a dataset and collects scored results.
156///
157/// Each call to [`run_dataset`][BenchRunner::run_dataset] creates a fresh agent per
158/// scenario (baseline mode: no tools, no MCP). Memory is optionally wired via
159/// [`BenchRunner::with_memory_params`] and [`RunOptions::memory_mode`].
160///
161/// # Examples
162///
163/// ```no_run
164/// use zeph_bench::runner::BenchRunner;
165/// use zeph_llm::{any::AnyProvider, mock::MockProvider};
166///
167/// let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["Paris".into()]));
168/// let runner = BenchRunner::new(provider);
169/// ```
170pub struct BenchRunner {
171 provider: AnyProvider,
172 /// Parameters for constructing per-scenario `SQLite`-backed `SemanticMemory`.
173 ///
174 /// Set via [`BenchRunner::with_memory_params`]; required when
175 /// `RunOptions::memory_mode == MemoryMode::On`.
176 memory_params: Option<BenchMemoryParams>,
177}
178
179impl BenchRunner {
180 /// Create a new runner with the given provider.
181 ///
182 /// The `no_deterministic` argument is unused at runtime but kept in the public API
183 /// so the bench command can pass it through for future use (e.g., logging or config).
184 /// Apply deterministic overrides to `provider` before calling this if needed.
185 ///
186 /// # Examples
187 ///
188 /// ```no_run
189 /// use zeph_bench::runner::BenchRunner;
190 /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
191 ///
192 /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
193 /// let runner = BenchRunner::new(provider);
194 /// ```
195 #[must_use]
196 pub fn new(provider: AnyProvider) -> Self {
197 Self {
198 provider,
199 memory_params: None,
200 }
201 }
202
203 /// Attach `SemanticMemory` parameters for memory-on benchmark runs.
204 ///
205 /// When set, a per-scenario `SQLite`-backed `SemanticMemory` is constructed inside
206 /// [`run_one`][BenchRunner::run_one] whenever `opts.memory_mode == MemoryMode::On`.
207 ///
208 /// # Examples
209 ///
210 /// ```no_run
211 /// use std::path::PathBuf;
212 /// use zeph_bench::runner::{BenchRunner, BenchMemoryParams};
213 /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
214 ///
215 /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
216 /// let params = BenchMemoryParams {
217 /// data_dir: PathBuf::from("/tmp/bench-data"),
218 /// embedding_model: "nomic-embed-text".into(),
219 /// run_id: "bench-abc".into(),
220 /// dataset: "locomo".into(),
221 /// };
222 /// let runner = BenchRunner::new(provider).with_memory_params(params);
223 /// ```
224 #[must_use]
225 pub fn with_memory_params(mut self, params: BenchMemoryParams) -> Self {
226 self.memory_params = Some(params);
227 self
228 }
229
230 /// Run all matching scenarios from `path` through the agent and return a [`BenchRun`].
231 ///
232 /// For each scenario:
233 /// 1. Builds a fresh `Agent<BenchmarkChannel>` with no tools or memory.
234 /// 2. Feeds the scenario prompt and collects the agent's response.
235 /// 3. Scores the response with `evaluator`.
236 /// 4. Appends a [`ScenarioResult`] and recomputes aggregate statistics.
237 ///
238 /// The returned [`BenchRun`] has `status = Running` until the caller sets it to
239 /// `Completed` or `Interrupted`.
240 ///
241 /// # Errors
242 ///
243 /// Returns [`BenchError`] if the dataset cannot be loaded or a scenario run fails.
244 pub async fn run_dataset<L, E>(
245 &self,
246 loader: &L,
247 evaluator: &E,
248 path: &Path,
249 opts: RunOptions,
250 ) -> Result<BenchRun, BenchError>
251 where
252 L: DatasetLoader,
253 E: Evaluator,
254 {
255 let scenarios = loader.load(path)?;
256 let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;
257
258 let _span = tracing::info_span!(
259 "bench.run_dataset",
260 dataset = loader.name(),
261 scenarios = filtered.len(),
262 )
263 .entered();
264
265 let model_id = self.provider.model_identifier().to_owned();
266
267 let mut run = BenchRun {
268 dataset: loader.name().to_owned(),
269 model: model_id,
270 run_id: uuid(),
271 started_at: timestamp::utc_now_rfc3339(),
272 finished_at: String::new(),
273 status: RunStatus::Running,
274 results: vec![],
275 aggregate: crate::results::Aggregate::default(),
276 };
277
278 for scenario in filtered {
279 let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();
280
281 let t0 = Instant::now();
282 let response_text = Box::pin(self.run_one(scenario, opts.memory_mode)).await?;
283 let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
284
285 let eval = evaluator.evaluate(scenario, &response_text);
286 let excerpt = response_text.chars().take(200).collect::<String>();
287
288 run.results.push(ScenarioResult {
289 scenario_id: scenario.id.clone(),
290 score: eval.score,
291 response_excerpt: excerpt,
292 error: None,
293 elapsed_ms,
294 });
295 run.recompute_aggregate();
296 }
297
298 Ok(run)
299 }
300
301 /// Run all scenarios from `path` through a per-scenario env executor and return a [`BenchRun`].
302 ///
303 /// This is the execution path for tool-driven datasets (tau2-bench). For each scenario:
304 /// 1. Calls `env_factory(scenario)` to build a fresh `(ToolExecutor, ActionTrace)`.
305 /// 2. Builds a fresh `TauBenchEvaluator` from the scenario metadata and the trace.
306 /// 3. Runs the agent with the env executor and the tool-use system prompt.
307 /// 4. Scores the response via the evaluator (reads the populated trace).
308 ///
309 /// # Errors
310 ///
311 /// Returns [`BenchError`] if the dataset cannot be loaded, the env factory fails, or
312 /// `TauBenchEvaluator::from_scenario` fails (malformed metadata).
313 pub async fn run_dataset_with_env_factory<L, F, X>(
314 &self,
315 loader: &L,
316 env_factory: F,
317 path: &Path,
318 opts: RunOptions,
319 ) -> Result<BenchRun, BenchError>
320 where
321 L: DatasetLoader,
322 F: Fn(&Scenario) -> Result<(X, ActionTrace), BenchError>,
323 X: ToolExecutor + Send + Sync + 'static,
324 {
325 let scenarios = loader.load(path)?;
326 let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;
327
328 let _span = tracing::info_span!(
329 "bench.run_dataset_with_env_factory",
330 dataset = loader.name(),
331 scenarios = filtered.len(),
332 )
333 .entered();
334
335 let model_id = self.provider.model_identifier().to_owned();
336
337 let mut run = BenchRun {
338 dataset: loader.name().to_owned(),
339 model: model_id,
340 run_id: uuid(),
341 started_at: timestamp::utc_now_rfc3339(),
342 finished_at: String::new(),
343 status: RunStatus::Running,
344 results: vec![],
345 aggregate: crate::results::Aggregate::default(),
346 };
347
348 for scenario in filtered {
349 let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();
350
351 let (executor, trace) = env_factory(scenario)?;
352 let evaluator = TauBenchEvaluator::from_scenario(scenario, trace)?;
353
354 let t0 = Instant::now();
355 let response_text = Box::pin(self.run_one_with_executor(
356 scenario,
357 executor,
358 opts.memory_mode,
359 ResponseMode::ToolUse,
360 ))
361 .await?;
362 let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
363
364 let eval = evaluator.evaluate(scenario, &response_text);
365 let excerpt = response_text.chars().take(200).collect::<String>();
366
367 run.results.push(ScenarioResult {
368 scenario_id: scenario.id.clone(),
369 score: eval.score,
370 response_excerpt: excerpt,
371 error: None,
372 elapsed_ms,
373 });
374 run.recompute_aggregate();
375 }
376
377 Ok(run)
378 }
379
380 /// Run a single scenario through a fresh agent and return the last response text.
381 ///
382 /// A concise-answer system prompt is injected via [`InstructionBlock`] so the model
383 /// responds with only the final answer (a number, word, or short phrase) rather than
384 /// full sentences. The raw response is then post-processed to extract the first
385 /// non-empty line and strip markdown formatting, which further reduces noise for
386 /// evaluators that perform exact or near-exact matching.
387 ///
388 /// When `memory_mode == MemoryMode::On`, a per-scenario `SQLite`-backed
389 /// `SemanticMemory` is constructed and wired into the agent. The database file is
390 /// deleted after the scenario completes (best-effort, NFR-001).
391 ///
392 /// # Errors
393 ///
394 /// Returns [`BenchError::InvalidFormat`] when the scenario has no user turn or when
395 /// `SemanticMemory` initialisation fails.
396 async fn run_one(
397 &self,
398 scenario: &Scenario,
399 memory_mode: MemoryMode,
400 ) -> Result<String, BenchError> {
401 Box::pin(self.run_one_with_executor(
402 scenario,
403 NoopExecutor,
404 memory_mode,
405 ResponseMode::TerseAnswer,
406 ))
407 .await
408 }
409
410 /// Core execution: run one scenario with the given executor and response mode.
411 ///
412 /// Called by both [`BenchRunner::run_dataset`] (with `NoopExecutor` + `TerseAnswer`) and
413 /// [`BenchRunner::run_dataset_with_env_factory`] (with the domain env + `ToolUse`).
414 #[allow(clippy::too_many_lines)] // sequential setup steps; splitting adds indirection without clarity
415 async fn run_one_with_executor<X: ToolExecutor + Send + Sync + 'static>(
416 &self,
417 scenario: &Scenario,
418 executor: X,
419 memory_mode: MemoryMode,
420 mode: ResponseMode,
421 ) -> Result<String, BenchError> {
422 let _span = tracing::info_span!(
423 "bench.run_one",
424 scenario_id = %scenario.id,
425 mode = ?mode,
426 )
427 .entered();
428 let channel = BenchmarkChannel::from_turns(scenario.turns.clone());
429 if channel.total() == 0 {
430 return Err(BenchError::InvalidFormat(format!(
431 "scenario '{}' has no user turn",
432 scenario.id
433 )));
434 }
435 let registry = SkillRegistry::empty();
436
437 let system_content = match mode {
438 ResponseMode::TerseAnswer => concat!(
439 "You are an evaluation assistant. ",
440 "Answer every question with the shortest possible response. ",
441 "Give only the final answer — no explanation, no full sentences, ",
442 "no punctuation unless it is part of the answer. ",
443 "If the answer is a single word or number, respond with only that word or number."
444 ),
445 ResponseMode::ToolUse => concat!(
446 "You are a customer-service agent. ",
447 "Use the available tools to help the user. ",
448 "Always call a tool when one applies; do not ask the user to perform actions you can perform yourself. ",
449 "When you have completed the user's request, respond with a brief confirmation."
450 ),
451 };
452
453 let blocks = vec![InstructionBlock {
454 source: PathBuf::from("<bench-system-prompt>"),
455 content: system_content.to_owned(),
456 }];
457
458 let base_agent = Agent::new(self.provider.clone(), channel, registry, None, 1, executor)
459 .with_instruction_blocks(blocks);
460
461 // Optionally wire SemanticMemory when the caller requests memory-on mode.
462 let (mut agent, scenario_db) = if memory_mode == MemoryMode::On
463 && let Some(ref params) = self.memory_params
464 {
465 // One SQLite file per scenario gives strict isolation (NFR-001 choice (a)).
466 // This is more files than a per-run DB, but eliminates any cross-scenario
467 // memory bleed and avoids needing BenchIsolation::reset() between scenarios.
468 let scenario_db = params
469 .data_dir
470 .join(format!("bench-{}-{}.db", params.run_id, scenario.id));
471 debug_assert!(
472 scenario_db.to_string_lossy().contains("bench-"),
473 "NFR-001: bench SQLite path must be namespaced with 'bench-'"
474 );
475
476 tracing::debug!(
477 scenario_id = %scenario.id,
478 path = %scenario_db.display(),
479 "bench: memory init start"
480 );
481 let memory = Arc::new(
482 tokio::time::timeout(
483 std::time::Duration::from_secs(10),
484 SemanticMemory::with_sqlite_backend(
485 scenario_db.to_string_lossy().as_ref(),
486 self.provider.clone(),
487 ¶ms.embedding_model,
488 0.7,
489 0.3,
490 ),
491 )
492 .await
493 .map_err(|_| {
494 BenchError::InvalidFormat(format!(
495 "SemanticMemory init timed out for scenario '{}'",
496 scenario.id
497 ))
498 })?
499 .map_err(|e| BenchError::InvalidFormat(format!("SemanticMemory init: {e}")))?,
500 );
501 tracing::debug!(scenario_id = %scenario.id, "bench: memory init done");
502
503 // Seed the sessions table so persist_message does not fail with FK violation.
504 let conv_id = memory
505 .sqlite()
506 .create_conversation()
507 .await
508 .map_err(|e| BenchError::InvalidFormat(format!("create_conversation: {e}")))?;
509
510 // summarization_threshold = 100_000 deliberately suppresses LLM-driven
511 // compaction during bench runs. Compaction calls another LLM round-trip
512 // with non-deterministic timing/output, which would violate FR-003
513 // (deterministic runs). recall_limit = 20 is generous enough to surface
514 // long-context memory effects without silently capping LongMemEval scores
515 // below their theoretical maximum. history_limit = 200 covers the longest
516 // LongMemEval session without truncation.
517 let wired_agent = base_agent.with_memory(memory, conv_id, 200, 20, 100_000);
518 (wired_agent, Some(scenario_db))
519 } else {
520 (base_agent, None)
521 };
522
523 // Ignore agent errors — a failed LLM call still yields an empty response that
524 // the evaluator scores as 0.0 rather than aborting the entire run.
525 let _ = Box::pin(agent.run()).await;
526 let channel = agent.into_channel();
527 // tool_outputs available for Phase 2 scoring (#4234); log count so future
528 // implementors have a trace even before the evaluator wires them up.
529 tracing::debug!(
530 count = channel.tool_outputs().len(),
531 "bench: tool outputs captured"
532 );
533 let responses = channel.into_responses();
534
535 // Best-effort cleanup: delete per-scenario SQLite file after the run.
536 // Failure is intentionally ignored — NFR-001 is hygiene, not correctness.
537 if let Some(ref db_path) = scenario_db {
538 let _ = std::fs::remove_file(db_path);
539 }
540
541 let raw = responses
542 .into_iter()
543 .last()
544 .map(|r| r.text)
545 .unwrap_or_default();
546
547 Ok(match mode {
548 ResponseMode::TerseAnswer => post_process_response(&raw),
549 // Verified: dropping send_tool_output does NOT affect the agent loop's tool-result
550 // feedback to the LLM. Tool outputs flow via Agent's internal MessagePart::ToolResult,
551 // not via the channel. See crates/zeph-core/src/agent/tool_execution/native.rs.
552 ResponseMode::ToolUse => raw,
553 })
554 }
555}
556
557/// Return the subset of `scenarios` that should run given `opts`.
558///
559/// Validates that when a `scenario_filter` is set, at least one matching scenario exists in
560/// `scenarios`. Then filters out already-completed IDs and non-matching scenarios.
561///
562/// # Errors
563///
564/// Returns [`BenchError::InvalidFormat`] when `opts.scenario_filter` names a scenario that
565/// does not appear in `scenarios`.
566fn filter_scenarios<'a>(
567 scenarios: &'a [Scenario],
568 opts: &RunOptions,
569 loader_name: &str,
570) -> Result<Vec<&'a Scenario>, BenchError> {
571 if let Some(ref filter) = opts.scenario_filter
572 && !scenarios.iter().any(|s| &s.id == filter)
573 {
574 return Err(BenchError::InvalidFormat(format!(
575 "scenario '{filter}' not found in dataset '{loader_name}'"
576 )));
577 }
578
579 Ok(scenarios
580 .iter()
581 .filter(|s| {
582 if opts.completed_ids.contains(&s.id) {
583 return false;
584 }
585 if let Some(ref filter) = opts.scenario_filter {
586 return &s.id == filter;
587 }
588 true
589 })
590 .collect())
591}
592
/// Reduce a raw agent reply to a clean, terse answer.
///
/// The pipeline is:
/// 1. Keep only the first line that is non-empty after trimming; anything after it
///    (typically an explanation) is discarded.
/// 2. Peel `*`, `_`, backticks, spaces and tabs off both ends of that line.
/// 3. Remove every remaining `**` pair and every backtick inside the line.
/// 4. Trim surrounding whitespace once more.
///
/// Single `*` or `_` characters in the middle of the line are left alone. This is a
/// best-effort cleanup; evaluators still normalize the result, so minor leftover
/// punctuation is handled downstream. An input without any non-empty line yields `""`.
fn post_process_response(raw: &str) -> String {
    // Characters removed from the edges of the answer line.
    const EDGE_NOISE: [char; 5] = ['*', '_', '`', ' ', '\t'];

    let Some(answer_line) = raw.lines().map(str::trim).find(|line| !line.is_empty()) else {
        return String::new();
    };

    let core = answer_line.trim_matches(|c: char| EDGE_NOISE.contains(&c));
    let without_bold = core.replace("**", "");
    let without_code = without_bold.replace('`', "");

    without_code.trim().to_owned()
}
618
/// Generate a pseudo-UUID-like run ID without the `uuid` crate.
///
/// The ID has the shape `bench-<secs>-<nanos>-<pid>-<seq>` (all hexadecimal):
/// - `<secs>` / `<nanos>`: wall-clock time since the Unix epoch;
/// - `<pid>`: the current process ID, so parallel bench processes started within the
///   same clock tick do not collide;
/// - `<seq>`: a process-local counter, so repeated calls inside one process are unique
///   even on platforms with a coarse clock or when the clock reads before the epoch
///   (in which case the time components fall back to zero).
///
/// Not cryptographically random, but sufficient for benchmark run identification. The
/// result contains only ASCII alphanumerics and `-`, so it is safe inside file names.
fn uuid() -> String {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    // Monotonic per-process sequence; only uniqueness matters, so Relaxed is enough.
    static SEQUENCE: AtomicU64 = AtomicU64::new(0);

    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default();
    let seq = SEQUENCE.fetch_add(1, Ordering::Relaxed);

    format!(
        "bench-{:x}-{:x}-{:x}-{:x}",
        since_epoch.as_secs(),
        since_epoch.subsec_nanos(),
        std::process::id(),
        seq
    )
}
630
#[cfg(test)]
mod tests {
    use super::*;

    /// Memory parameters rooted at a `bench-` namespaced directory, for the given run ID.
    fn sample_params(run_id: &str) -> BenchMemoryParams {
        BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: run_id.into(),
            dataset: "locomo".into(),
        }
    }

    // --- option and mode defaults -------------------------------------------------

    #[test]
    fn run_options_default_is_empty() {
        let RunOptions {
            scenario_filter,
            completed_ids,
            memory_mode,
        } = RunOptions::default();
        assert!(scenario_filter.is_none());
        assert!(completed_ids.is_empty());
        assert_eq!(memory_mode, MemoryMode::Off);
    }

    #[test]
    fn memory_mode_default_is_off() {
        assert_eq!(MemoryMode::default(), MemoryMode::Off);
    }

    // --- memory parameter wiring --------------------------------------------------

    #[test]
    fn with_memory_params_sets_isolation() {
        use zeph_llm::{any::AnyProvider, mock::MockProvider};

        let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
        let runner = BenchRunner::new(provider).with_memory_params(sample_params("bench-abc"));

        let stored = runner
            .memory_params
            .expect("with_memory_params must store the parameters");
        assert_eq!(stored.run_id, "bench-abc");
        assert_eq!(stored.dataset, "locomo");
    }

    #[test]
    fn nfr_001_sqlite_path_namespaced() {
        let params = sample_params("run-xyz");
        let scenario_id = "s1_0";
        let file_name = format!("bench-{}-{}.db", params.run_id, scenario_id);
        let scenario_db = params.data_dir.join(file_name);
        assert!(
            scenario_db.to_string_lossy().contains("bench-"),
            "NFR-001: SQLite path must contain bench- prefix"
        );
    }

    // --- timestamps and run IDs ---------------------------------------------------

    #[test]
    fn now_rfc3339_has_correct_format() {
        // Expected shape: "2026-04-25T10:30:00Z" (20 characters).
        let ts = timestamp::utc_now_rfc3339();
        assert_eq!(ts.len(), 20);
        assert!(ts.ends_with('Z'));
        assert!(ts.contains('T'));
    }

    #[test]
    fn uuid_generates_non_empty_string() {
        let id = uuid();
        assert!(id.starts_with("bench-"));
        assert!(id.len() > 10);
    }

    // --- response post-processing -------------------------------------------------

    #[test]
    fn post_process_takes_first_line() {
        let cleaned = post_process_response("1945\n\nWorld War II ended in 1945.");
        assert_eq!(cleaned, "1945");
    }

    #[test]
    fn post_process_strips_markdown_bold() {
        let cleaned = post_process_response("**1945**");
        assert_eq!(cleaned, "1945");
    }

    #[test]
    fn post_process_strips_backticks() {
        let cleaned = post_process_response("`Au`");
        assert_eq!(cleaned, "Au");
    }

    #[test]
    fn post_process_trims_whitespace() {
        let cleaned = post_process_response(" Paris ");
        assert_eq!(cleaned, "Paris");
    }

    #[test]
    fn post_process_empty_input_returns_empty() {
        let cleaned = post_process_response("");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn post_process_skips_empty_leading_lines() {
        let cleaned = post_process_response("\n\n \nParis");
        assert_eq!(cleaned, "Paris");
    }
}