zeph_bench/runner.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark runner: drives `Agent<BenchmarkChannel>` over a dataset and collects results.
5//!
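//! [`BenchRunner`] is the execution engine for `zeph bench run`. It is intentionally
//! minimal: by default a scenario runs in baseline mode (no tools, no memory, no MCP).
//! `SemanticMemory` is opt-in via [`RunOptions::memory_mode`], and tool-driven datasets
//! go through [`BenchRunner::run_dataset_with_env_factory`]. Each scenario is run in
//! isolation through a fresh [`BenchmarkChannel`] and the agent's text response is
//! scored by an [`Evaluator`].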
10//!
11//! # Usage
12//!
13//! ```no_run
14//! use std::path::Path;
15//! use zeph_bench::runner::{BenchRunner, RunOptions};
16//! use zeph_bench::loaders::{GaiaLoader, GaiaEvaluator};
17//! use zeph_llm::{any::AnyProvider, mock::MockProvider};
18//!
19//! # async fn example() -> Result<(), zeph_bench::BenchError> {
20//! let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()]));
21//! let runner = BenchRunner::new(provider);
22//! let opts = RunOptions::default();
23//! let run = runner.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("/data/gaia.jsonl"), opts).await?;
24//! println!("mean score: {:.4}", run.aggregate.mean_score);
25//! # Ok(())
26//! # }
27//! ```
28
29use std::collections::HashSet;
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32use std::time::Instant;
33
34use zeph_common::timestamp;
35use zeph_core::agent::Agent;
36use zeph_core::instructions::InstructionBlock;
37use zeph_llm::any::AnyProvider;
38use zeph_llm::provider::LlmProvider as _;
39use zeph_memory::semantic::SemanticMemory;
40use zeph_skills::registry::SkillRegistry;
41use zeph_tools::executor::{ToolError, ToolExecutor, ToolOutput};
42
43use crate::channel::BenchmarkChannel;
44use crate::error::BenchError;
45use crate::loaders::tau2_bench::{ActionTrace, TauBenchEvaluator};
46use crate::results::{BenchRun, RunStatus, ScenarioResult};
47use crate::scenario::{DatasetLoader, Evaluator, Scenario};
48
/// Selects the system prompt and the post-processing applied to the agent's reply.
///
/// [`BenchRunner::run_one_with_executor`] matches on this value twice: once to pick
/// the injected system prompt and once to decide whether the final response is
/// cleaned up or returned verbatim.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ResponseMode {
    /// Ask for the shortest possible answer and strip markdown from the reply.
    ///
    /// Used by all knowledge-retrieval datasets (GAIA, LOCOMO, FRAMES, `LongMemEval`).
    TerseAnswer,
    /// Inject a tool-use system prompt and hand the reply back without post-processing.
    ///
    /// Used by tau2-bench, where scoring is based on the action trace, not the text output.
    ToolUse,
}
64
/// Switch deciding whether a benchmark run wires `SemanticMemory` into the agent.
///
/// The default is [`MemoryMode::Off`].
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::MemoryMode;
///
/// assert_eq!(MemoryMode::default(), MemoryMode::Off);
/// ```
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum MemoryMode {
    /// Run without `SemanticMemory`; this is the default behaviour.
    #[default]
    Off,
    /// Attach a `SQLite`-backed `SemanticMemory` to the agent via `Agent::with_memory`.
    On,
}
82
/// Everything needed to build the per-scenario `SQLite`-backed `SemanticMemory`.
///
/// Supplied through [`BenchRunner::with_memory_params`] and read by the runner when a
/// scenario is executed with `opts.memory_mode == MemoryMode::On`.
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
/// use zeph_bench::runner::BenchMemoryParams;
///
/// let params = BenchMemoryParams {
///     data_dir: PathBuf::from("/tmp/bench"),
///     embedding_model: "nomic-embed-text".into(),
///     run_id: "bench-abc".into(),
///     dataset: "locomo".into(),
/// };
/// assert!(params.data_dir.to_string_lossy().contains("bench"));
/// ```
#[derive(Clone, Debug)]
pub struct BenchMemoryParams {
    /// Directory holding the per-scenario `SQLite` files (each is deleted after its scenario).
    ///
    /// The file name derived from it always carries the `bench-` prefix (NFR-001).
    pub data_dir: PathBuf,
    /// Name of the embedding model handed to `SemanticMemory`.
    pub embedding_model: String,
    /// Run ID that namespaces bench artifacts; matches the outer `BenchRun.run_id`.
    pub run_id: String,
    /// Dataset name that namespaces bench artifacts.
    pub dataset: String,
}
115
116/// Options that control which scenarios are executed and whether to resume a prior run.
117///
118/// Build via [`RunOptions::default`] and override the fields you need.
119///
120/// # Examples
121///
122/// ```
123/// use zeph_bench::runner::{RunOptions, MemoryMode};
124///
125/// // Run all scenarios.
126/// let opts = RunOptions::default();
127/// assert!(opts.scenario_filter.is_none());
128/// assert!(opts.completed_ids.is_empty());
129/// assert_eq!(opts.memory_mode, MemoryMode::Off);
130/// ```
131#[derive(Debug, Default)]
132pub struct RunOptions {
133 /// When `Some(id)`, only the scenario with this ID is executed.
134 pub scenario_filter: Option<String>,
135 /// Set of scenario IDs already completed in a prior run (used for `--resume`).
136 pub completed_ids: HashSet<String>,
137 /// Whether to wire a `SemanticMemory` backend into the agent for this run.
138 pub memory_mode: MemoryMode,
139}
140
141/// Minimal no-op tool executor for baseline benchmark runs.
142///
143/// Returns an empty tool list and `Ok(None)` on every execute call, ensuring that
144/// the agent loop cannot invoke any tools during a benchmark run.
145struct NoopExecutor;
146
147impl ToolExecutor for NoopExecutor {
148 async fn execute(&self, _response: &str) -> Result<Option<ToolOutput>, ToolError> {
149 Ok(None)
150 }
151}
152
153/// Drives [`Agent<BenchmarkChannel>`] over a dataset and collects scored results.
154///
155/// Each call to [`run_dataset`][BenchRunner::run_dataset] creates a fresh agent per
156/// scenario (baseline mode: no tools, no MCP). Memory is optionally wired via
157/// [`BenchRunner::with_memory_params`] and [`RunOptions::memory_mode`].
158///
159/// # Examples
160///
161/// ```no_run
162/// use zeph_bench::runner::BenchRunner;
163/// use zeph_llm::{any::AnyProvider, mock::MockProvider};
164///
165/// let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["Paris".into()]));
166/// let runner = BenchRunner::new(provider);
167/// ```
168pub struct BenchRunner {
169 provider: AnyProvider,
170 /// Parameters for constructing per-scenario `SQLite`-backed `SemanticMemory`.
171 ///
172 /// Set via [`BenchRunner::with_memory_params`]; required when
173 /// `RunOptions::memory_mode == MemoryMode::On`.
174 memory_params: Option<BenchMemoryParams>,
175}
176
177impl BenchRunner {
178 /// Create a new runner with the given provider.
179 ///
180 /// The `no_deterministic` argument is unused at runtime but kept in the public API
181 /// so the bench command can pass it through for future use (e.g., logging or config).
182 /// Apply deterministic overrides to `provider` before calling this if needed.
183 ///
184 /// # Examples
185 ///
186 /// ```no_run
187 /// use zeph_bench::runner::BenchRunner;
188 /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
189 ///
190 /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
191 /// let runner = BenchRunner::new(provider);
192 /// ```
193 #[must_use]
194 pub fn new(provider: AnyProvider) -> Self {
195 Self {
196 provider,
197 memory_params: None,
198 }
199 }
200
201 /// Attach `SemanticMemory` parameters for memory-on benchmark runs.
202 ///
203 /// When set, a per-scenario `SQLite`-backed `SemanticMemory` is constructed inside
204 /// [`run_one`][BenchRunner::run_one] whenever `opts.memory_mode == MemoryMode::On`.
205 ///
206 /// # Examples
207 ///
208 /// ```no_run
209 /// use std::path::PathBuf;
210 /// use zeph_bench::runner::{BenchRunner, BenchMemoryParams};
211 /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
212 ///
213 /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
214 /// let params = BenchMemoryParams {
215 /// data_dir: PathBuf::from("/tmp/bench-data"),
216 /// embedding_model: "nomic-embed-text".into(),
217 /// run_id: "bench-abc".into(),
218 /// dataset: "locomo".into(),
219 /// };
220 /// let runner = BenchRunner::new(provider).with_memory_params(params);
221 /// ```
222 #[must_use]
223 pub fn with_memory_params(mut self, params: BenchMemoryParams) -> Self {
224 self.memory_params = Some(params);
225 self
226 }
227
228 /// Run all matching scenarios from `path` through the agent and return a [`BenchRun`].
229 ///
230 /// For each scenario:
231 /// 1. Builds a fresh `Agent<BenchmarkChannel>` with no tools or memory.
232 /// 2. Feeds the scenario prompt and collects the agent's response.
233 /// 3. Scores the response with `evaluator`.
234 /// 4. Appends a [`ScenarioResult`] and recomputes aggregate statistics.
235 ///
236 /// The returned [`BenchRun`] has `status = Running` until the caller sets it to
237 /// `Completed` or `Interrupted`.
238 ///
239 /// # Errors
240 ///
241 /// Returns [`BenchError`] if the dataset cannot be loaded or a scenario run fails.
242 pub async fn run_dataset<L, E>(
243 &self,
244 loader: &L,
245 evaluator: &E,
246 path: &Path,
247 opts: RunOptions,
248 ) -> Result<BenchRun, BenchError>
249 where
250 L: DatasetLoader,
251 E: Evaluator,
252 {
253 let scenarios = loader.load(path)?;
254 let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;
255
256 let _span = tracing::info_span!(
257 "bench.run_dataset",
258 dataset = loader.name(),
259 scenarios = filtered.len(),
260 )
261 .entered();
262
263 let model_id = self.provider.model_identifier().to_owned();
264
265 let mut run = BenchRun {
266 dataset: loader.name().to_owned(),
267 model: model_id,
268 run_id: uuid(),
269 started_at: timestamp::utc_now_rfc3339(),
270 finished_at: String::new(),
271 status: RunStatus::Running,
272 results: vec![],
273 aggregate: crate::results::Aggregate::default(),
274 };
275
276 for scenario in filtered {
277 let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();
278
279 let t0 = Instant::now();
280 let response_text = Box::pin(self.run_one(scenario, opts.memory_mode)).await?;
281 let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
282
283 let eval = evaluator.evaluate(scenario, &response_text);
284 let excerpt = response_text.chars().take(200).collect::<String>();
285
286 run.results.push(ScenarioResult {
287 scenario_id: scenario.id.clone(),
288 score: eval.score,
289 response_excerpt: excerpt,
290 error: None,
291 elapsed_ms,
292 });
293 run.recompute_aggregate();
294 }
295
296 Ok(run)
297 }
298
299 /// Run all scenarios from `path` through a per-scenario env executor and return a [`BenchRun`].
300 ///
301 /// This is the execution path for tool-driven datasets (tau2-bench). For each scenario:
302 /// 1. Calls `env_factory(scenario)` to build a fresh `(ToolExecutor, ActionTrace)`.
303 /// 2. Builds a fresh `TauBenchEvaluator` from the scenario metadata and the trace.
304 /// 3. Runs the agent with the env executor and the tool-use system prompt.
305 /// 4. Scores the response via the evaluator (reads the populated trace).
306 ///
307 /// # Errors
308 ///
309 /// Returns [`BenchError`] if the dataset cannot be loaded, the env factory fails, or
310 /// `TauBenchEvaluator::from_scenario` fails (malformed metadata).
311 pub async fn run_dataset_with_env_factory<L, F, X>(
312 &self,
313 loader: &L,
314 env_factory: F,
315 path: &Path,
316 opts: RunOptions,
317 ) -> Result<BenchRun, BenchError>
318 where
319 L: DatasetLoader,
320 F: Fn(&Scenario) -> Result<(X, ActionTrace), BenchError>,
321 X: ToolExecutor + Send + Sync + 'static,
322 {
323 let scenarios = loader.load(path)?;
324 let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;
325
326 let _span = tracing::info_span!(
327 "bench.run_dataset_with_env_factory",
328 dataset = loader.name(),
329 scenarios = filtered.len(),
330 )
331 .entered();
332
333 let model_id = self.provider.model_identifier().to_owned();
334
335 let mut run = BenchRun {
336 dataset: loader.name().to_owned(),
337 model: model_id,
338 run_id: uuid(),
339 started_at: timestamp::utc_now_rfc3339(),
340 finished_at: String::new(),
341 status: RunStatus::Running,
342 results: vec![],
343 aggregate: crate::results::Aggregate::default(),
344 };
345
346 for scenario in filtered {
347 let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();
348
349 let (executor, trace) = env_factory(scenario)?;
350 let evaluator = TauBenchEvaluator::from_scenario(scenario, trace)?;
351
352 let t0 = Instant::now();
353 let response_text = Box::pin(self.run_one_with_executor(
354 scenario,
355 executor,
356 opts.memory_mode,
357 ResponseMode::ToolUse,
358 ))
359 .await?;
360 let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
361
362 let eval = evaluator.evaluate(scenario, &response_text);
363 let excerpt = response_text.chars().take(200).collect::<String>();
364
365 run.results.push(ScenarioResult {
366 scenario_id: scenario.id.clone(),
367 score: eval.score,
368 response_excerpt: excerpt,
369 error: None,
370 elapsed_ms,
371 });
372 run.recompute_aggregate();
373 }
374
375 Ok(run)
376 }
377
378 /// Run a single scenario through a fresh agent and return the last response text.
379 ///
380 /// A concise-answer system prompt is injected via [`InstructionBlock`] so the model
381 /// responds with only the final answer (a number, word, or short phrase) rather than
382 /// full sentences. The raw response is then post-processed to extract the first
383 /// non-empty line and strip markdown formatting, which further reduces noise for
384 /// evaluators that perform exact or near-exact matching.
385 ///
386 /// When `memory_mode == MemoryMode::On`, a per-scenario `SQLite`-backed
387 /// `SemanticMemory` is constructed and wired into the agent. The database file is
388 /// deleted after the scenario completes (best-effort, NFR-001).
389 ///
390 /// # Errors
391 ///
392 /// Returns [`BenchError::InvalidFormat`] when the scenario has no user turn or when
393 /// `SemanticMemory` initialisation fails.
394 async fn run_one(
395 &self,
396 scenario: &Scenario,
397 memory_mode: MemoryMode,
398 ) -> Result<String, BenchError> {
399 Box::pin(self.run_one_with_executor(
400 scenario,
401 NoopExecutor,
402 memory_mode,
403 ResponseMode::TerseAnswer,
404 ))
405 .await
406 }
407
408 /// Core execution: run one scenario with the given executor and response mode.
409 ///
410 /// Called by both [`BenchRunner::run_dataset`] (with `NoopExecutor` + `TerseAnswer`) and
411 /// [`BenchRunner::run_dataset_with_env_factory`] (with the domain env + `ToolUse`).
412 #[allow(clippy::too_many_lines)] // sequential setup steps; splitting adds indirection without clarity
413 async fn run_one_with_executor<X: ToolExecutor + Send + Sync + 'static>(
414 &self,
415 scenario: &Scenario,
416 executor: X,
417 memory_mode: MemoryMode,
418 mode: ResponseMode,
419 ) -> Result<String, BenchError> {
420 let _span = tracing::info_span!(
421 "bench.run_one",
422 scenario_id = %scenario.id,
423 mode = ?mode,
424 )
425 .entered();
426 let channel = BenchmarkChannel::from_turns(scenario.turns.clone());
427 if channel.total() == 0 {
428 return Err(BenchError::InvalidFormat(format!(
429 "scenario '{}' has no user turn",
430 scenario.id
431 )));
432 }
433 let registry = SkillRegistry::empty();
434
435 let system_content = match mode {
436 ResponseMode::TerseAnswer => concat!(
437 "You are an evaluation assistant. ",
438 "Answer every question with the shortest possible response. ",
439 "Give only the final answer — no explanation, no full sentences, ",
440 "no punctuation unless it is part of the answer. ",
441 "If the answer is a single word or number, respond with only that word or number."
442 ),
443 ResponseMode::ToolUse => concat!(
444 "You are a customer-service agent. ",
445 "Use the available tools to help the user. ",
446 "Always call a tool when one applies; do not ask the user to perform actions you can perform yourself. ",
447 "When you have completed the user's request, respond with a brief confirmation."
448 ),
449 };
450
451 let blocks = vec![InstructionBlock {
452 source: PathBuf::from("<bench-system-prompt>"),
453 content: system_content.to_owned(),
454 }];
455
456 let base_agent = Agent::new(self.provider.clone(), channel, registry, None, 1, executor)
457 .with_instruction_blocks(blocks);
458
459 // Optionally wire SemanticMemory when the caller requests memory-on mode.
460 let (mut agent, scenario_db) = if memory_mode == MemoryMode::On
461 && let Some(ref params) = self.memory_params
462 {
463 // One SQLite file per scenario gives strict isolation (NFR-001 choice (a)).
464 // This is more files than a per-run DB, but eliminates any cross-scenario
465 // memory bleed and avoids needing BenchIsolation::reset() between scenarios.
466 let scenario_db = params
467 .data_dir
468 .join(format!("bench-{}-{}.db", params.run_id, scenario.id));
469 debug_assert!(
470 scenario_db.to_string_lossy().contains("bench-"),
471 "NFR-001: bench SQLite path must be namespaced with 'bench-'"
472 );
473
474 tracing::debug!(
475 scenario_id = %scenario.id,
476 path = %scenario_db.display(),
477 "bench: memory init start"
478 );
479 let memory = Arc::new(
480 tokio::time::timeout(
481 std::time::Duration::from_secs(10),
482 SemanticMemory::with_sqlite_backend(
483 scenario_db.to_string_lossy().as_ref(),
484 self.provider.clone(),
485 ¶ms.embedding_model,
486 0.7,
487 0.3,
488 ),
489 )
490 .await
491 .map_err(|_| {
492 BenchError::InvalidFormat(format!(
493 "SemanticMemory init timed out for scenario '{}'",
494 scenario.id
495 ))
496 })?
497 .map_err(|e| BenchError::InvalidFormat(format!("SemanticMemory init: {e}")))?,
498 );
499 tracing::debug!(scenario_id = %scenario.id, "bench: memory init done");
500
501 // Seed the sessions table so persist_message does not fail with FK violation.
502 let conv_id = memory
503 .sqlite()
504 .create_conversation()
505 .await
506 .map_err(|e| BenchError::InvalidFormat(format!("create_conversation: {e}")))?;
507
508 // summarization_threshold = 100_000 deliberately suppresses LLM-driven
509 // compaction during bench runs. Compaction calls another LLM round-trip
510 // with non-deterministic timing/output, which would violate FR-003
511 // (deterministic runs). recall_limit = 20 is generous enough to surface
512 // long-context memory effects without silently capping LongMemEval scores
513 // below their theoretical maximum. history_limit = 200 covers the longest
514 // LongMemEval session without truncation.
515 let wired_agent = base_agent.with_memory(memory, conv_id, 200, 20, 100_000);
516 (wired_agent, Some(scenario_db))
517 } else {
518 (base_agent, None)
519 };
520
521 // Ignore agent errors — a failed LLM call still yields an empty response that
522 // the evaluator scores as 0.0 rather than aborting the entire run.
523 let _ = Box::pin(agent.run()).await;
524 let channel = agent.into_channel();
525 // tool_outputs available for Phase 2 scoring (#4234); log count so future
526 // implementors have a trace even before the evaluator wires them up.
527 tracing::debug!(
528 count = channel.tool_outputs().len(),
529 "bench: tool outputs captured"
530 );
531 let responses = channel.into_responses();
532
533 // Best-effort cleanup: delete per-scenario SQLite file after the run.
534 // Failure is intentionally ignored — NFR-001 is hygiene, not correctness.
535 if let Some(ref db_path) = scenario_db {
536 let _ = std::fs::remove_file(db_path);
537 }
538
539 let raw = responses
540 .into_iter()
541 .last()
542 .map(|r| r.text)
543 .unwrap_or_default();
544
545 Ok(match mode {
546 ResponseMode::TerseAnswer => post_process_response(&raw),
547 // Verified: dropping send_tool_output does NOT affect the agent loop's tool-result
548 // feedback to the LLM. Tool outputs flow via Agent's internal MessagePart::ToolResult,
549 // not via the channel. See crates/zeph-core/src/agent/tool_execution/native.rs.
550 ResponseMode::ToolUse => raw,
551 })
552 }
553}
554
555/// Return the subset of `scenarios` that should run given `opts`.
556///
557/// Validates that when a `scenario_filter` is set, at least one matching scenario exists in
558/// `scenarios`. Then filters out already-completed IDs and non-matching scenarios.
559///
560/// # Errors
561///
562/// Returns [`BenchError::InvalidFormat`] when `opts.scenario_filter` names a scenario that
563/// does not appear in `scenarios`.
564fn filter_scenarios<'a>(
565 scenarios: &'a [Scenario],
566 opts: &RunOptions,
567 loader_name: &str,
568) -> Result<Vec<&'a Scenario>, BenchError> {
569 if let Some(ref filter) = opts.scenario_filter
570 && !scenarios.iter().any(|s| &s.id == filter)
571 {
572 return Err(BenchError::InvalidFormat(format!(
573 "scenario '{filter}' not found in dataset '{loader_name}'"
574 )));
575 }
576
577 Ok(scenarios
578 .iter()
579 .filter(|s| {
580 if opts.completed_ids.contains(&s.id) {
581 return false;
582 }
583 if let Some(ref filter) = opts.scenario_filter {
584 return &s.id == filter;
585 }
586 true
587 })
588 .collect())
589}
590
/// Post-process the raw agent response to extract a clean, terse answer.
///
/// Every line is cleaned with the steps below, and the first line that is still
/// non-empty *after* cleaning is returned:
/// 1. Trim surrounding whitespace.
/// 2. Strip markdown markers (`*`, `_`, `` ` ``) plus spaces and tabs from both ends.
/// 3. Remove any remaining `**` and `` ` `` inside the line.
/// 4. Trim surrounding whitespace again.
///
/// Choosing the line after cleaning (rather than before) means a reply that opens with
/// a markup-only line, such as a bare code fence, yields the answer on the following
/// line instead of an empty string. Trailing explanation lines are still discarded.
///
/// Returns an empty string when no line has any content left after cleaning.
///
/// This is a best-effort cleanup. Evaluators still normalize the result, so minor
/// leftover punctuation is handled downstream.
fn post_process_response(raw: &str) -> String {
    raw.lines()
        .map(|line| {
            line.trim()
                .trim_matches(|c: char| matches!(c, '*' | '_' | '`' | ' ' | '\t'))
                .replace("**", "")
                .replace('`', "")
                .trim()
                .to_owned()
        })
        // The iterator is lazy, so cleaning stops at the first line with content.
        .find(|cleaned| !cleaned.is_empty())
        .unwrap_or_default()
}
616
/// Build a short run ID of the form `bench-<secs>-<nanos>` without the `uuid` crate.
///
/// Both numbers are lowercase hex: whole seconds since the Unix epoch, and the
/// sub-second nanoseconds of the same reading. A clock set before the epoch falls back
/// to a zero duration. The ID is not cryptographically random, but it is sufficient for
/// telling benchmark runs apart.
fn uuid() -> String {
    let since_epoch = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default();
    let secs = since_epoch.as_secs();
    let nanos = since_epoch.subsec_nanos();
    format!("bench-{secs:x}-{nanos:x}")
}
628
#[cfg(test)]
mod tests {
    use super::*;

    /// Memory parameters shared by the tests below; only `run_id` varies.
    fn sample_params(run_id: &str) -> BenchMemoryParams {
        BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: run_id.into(),
            dataset: "locomo".into(),
        }
    }

    #[test]
    fn run_options_default_is_empty() {
        let opts = RunOptions::default();
        assert!(opts.scenario_filter.is_none());
        assert!(opts.completed_ids.is_empty());
        assert_eq!(opts.memory_mode, MemoryMode::Off);
    }

    #[test]
    fn memory_mode_default_is_off() {
        assert_eq!(MemoryMode::default(), MemoryMode::Off);
    }

    #[test]
    fn with_memory_params_sets_isolation() {
        use zeph_llm::{any::AnyProvider, mock::MockProvider};
        let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
        let runner = BenchRunner::new(provider).with_memory_params(sample_params("bench-abc"));
        let stored = runner
            .memory_params
            .expect("with_memory_params must store the params");
        assert_eq!(stored.run_id, "bench-abc");
        assert_eq!(stored.dataset, "locomo");
    }

    #[test]
    fn nfr_001_sqlite_path_namespaced() {
        let params = sample_params("run-xyz");
        let scenario_id = "s1_0";
        let scenario_db = params
            .data_dir
            .join(format!("bench-{}-{}.db", params.run_id, scenario_id));
        // Check the file name, not the whole path: the data directory itself contains
        // "bench-", so a whole-path check would pass even without the file-name prefix.
        let file_name = scenario_db
            .file_name()
            .and_then(|name| name.to_str())
            .expect("joined path must end in a UTF-8 file name");
        assert!(
            file_name.starts_with("bench-"),
            "NFR-001: SQLite file name must start with bench-"
        );
        assert_eq!(file_name, "bench-run-xyz-s1_0.db");
    }

    #[test]
    fn now_rfc3339_has_correct_format() {
        let ts = timestamp::utc_now_rfc3339();
        // e.g. "2026-04-25T10:30:00Z"
        assert_eq!(ts.len(), 20);
        assert!(ts.ends_with('Z'));
        assert!(ts.contains('T'));
    }

    #[test]
    fn uuid_generates_non_empty_string() {
        let id = uuid();
        assert!(id.starts_with("bench-"));
        assert!(id.len() > 10);
    }

    #[test]
    fn post_process_takes_first_line() {
        let raw = "1945\n\nWorld War II ended in 1945.";
        assert_eq!(post_process_response(raw), "1945");
    }

    #[test]
    fn post_process_strips_markdown_bold() {
        assert_eq!(post_process_response("**1945**"), "1945");
    }

    #[test]
    fn post_process_strips_backticks() {
        assert_eq!(post_process_response("`Au`"), "Au");
    }

    #[test]
    fn post_process_trims_whitespace() {
        assert_eq!(post_process_response("  Paris  "), "Paris");
    }

    #[test]
    fn post_process_empty_input_returns_empty() {
        assert_eq!(post_process_response(""), "");
    }

    #[test]
    fn post_process_skips_empty_leading_lines() {
        let raw = "\n\n  \nParis";
        assert_eq!(post_process_response(raw), "Paris");
    }
}