Skip to main content

zeph_bench/
runner.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark runner: drives `Agent<BenchmarkChannel>` over a dataset and collects results.
5//!
//! [`BenchRunner`] is the execution engine for `zeph bench run`. It is intentionally
//! minimal: by default it runs in baseline mode (no tools, no memory, no MCP). Memory can
//! be enabled per run via [`RunOptions::memory_mode`], and tool-driven datasets go through
//! [`BenchRunner::run_dataset_with_env_factory`]. Each scenario is run in isolation through
//! a fresh [`BenchmarkChannel`], and the agent's final response is scored by an [`Evaluator`].
10//!
11//! # Usage
12//!
13//! ```no_run
14//! use std::path::Path;
15//! use zeph_bench::runner::{BenchRunner, RunOptions};
16//! use zeph_bench::loaders::{GaiaLoader, GaiaEvaluator};
17//! use zeph_llm::{any::AnyProvider, mock::MockProvider};
18//!
19//! # async fn example() -> Result<(), zeph_bench::BenchError> {
20//! let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()]));
21//! let runner = BenchRunner::new(provider);
22//! let opts = RunOptions::default();
23//! let run = runner.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("/data/gaia.jsonl"), opts).await?;
24//! println!("mean score: {:.4}", run.aggregate.mean_score);
25//! # Ok(())
26//! # }
27//! ```
28
29use std::collections::HashSet;
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32use std::time::Instant;
33
34use zeph_common::timestamp;
35use zeph_core::agent::Agent;
36use zeph_core::instructions::InstructionBlock;
37use zeph_llm::any::AnyProvider;
38use zeph_llm::provider::LlmProvider as _;
39use zeph_memory::semantic::SemanticMemory;
40use zeph_skills::registry::SkillRegistry;
41use zeph_tools::executor::{ToolError, ToolExecutor, ToolOutput};
42
43use crate::channel::BenchmarkChannel;
44use crate::error::BenchError;
45use crate::loaders::tau2_bench::{ActionTrace, TauBenchEvaluator};
46use crate::results::{BenchRun, RunStatus, ScenarioResult};
47use crate::scenario::{DatasetLoader, Evaluator, Scenario};
48
/// Controls how the runner prompts the agent and post-processes its final response.
///
/// Used by [`BenchRunner::run_one_with_executor`] to select the system prompt and to
/// decide whether the last response is cleaned up before it is returned for scoring.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResponseMode {
    /// Inject a "shortest possible answer" system prompt, then reduce the response to
    /// its first non-empty line with markdown markers stripped.
    ///
    /// Used by all knowledge-retrieval datasets (GAIA, LOCOMO, FRAMES, `LongMemEval`).
    TerseAnswer,
    /// Inject a tool-use system prompt; return the raw agent response without post-processing.
    ///
    /// Used by tau2-bench where the evaluation is based on the action trace, not text output.
    ToolUse,
}
64
/// Controls whether `SemanticMemory` is wired into the agent during a benchmark run.
///
/// Selecting [`MemoryMode::On`] only has an effect when the runner was also given
/// memory parameters via [`BenchRunner::with_memory_params`]; without them the agent
/// is built without memory.
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::MemoryMode;
///
/// assert_eq!(MemoryMode::default(), MemoryMode::Off);
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum MemoryMode {
    /// No `SemanticMemory` — current default behaviour.
    #[default]
    Off,
    /// Wire a `SQLite`-backed `SemanticMemory` into the agent via `Agent::with_memory`.
    On,
}
82
/// Parameters required to construct a per-scenario `SQLite`-backed `SemanticMemory`.
///
/// Populated by [`BenchRunner::with_memory_params`] and consumed while running a
/// single scenario when `opts.memory_mode == MemoryMode::On`.
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
/// use zeph_bench::runner::BenchMemoryParams;
///
/// let params = BenchMemoryParams {
///     data_dir: PathBuf::from("/tmp/bench"),
///     embedding_model: "nomic-embed-text".into(),
///     run_id: "bench-abc".into(),
///     dataset: "locomo".into(),
/// };
/// assert!(params.data_dir.to_string_lossy().contains("bench"));
/// ```
#[derive(Debug, Clone)]
pub struct BenchMemoryParams {
    /// Directory where per-scenario `SQLite` files live (deleted between scenarios).
    ///
    /// Each database file is created directly under this directory with a name that
    /// starts with `bench-`, so the derived path always contains the `bench-` segment
    /// (NFR-001). Deletion after a scenario is best-effort.
    pub data_dir: PathBuf,
    /// Embedding model name passed to `SemanticMemory`.
    pub embedding_model: String,
    // NOTE(review): the runner generates `BenchRun.run_id` on its own; whether the two
    // IDs actually match depends on the caller — confirm.
    /// Run ID used to namespace bench artifacts; it is embedded in every per-scenario
    /// database file name. Intended to match the outer `BenchRun.run_id`.
    pub run_id: String,
    // NOTE(review): this field is not read anywhere in this file; presumably it is
    // consumed elsewhere — confirm.
    /// Dataset name used to namespace bench artifacts.
    pub dataset: String,
}
115
/// Options that control which scenarios are executed and whether to resume a prior run.
///
/// Build via [`RunOptions::default`] and override the fields you need. The default
/// runs every scenario, skips nothing, and leaves memory off.
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::{RunOptions, MemoryMode};
///
/// // Run all scenarios.
/// let opts = RunOptions::default();
/// assert!(opts.scenario_filter.is_none());
/// assert!(opts.completed_ids.is_empty());
/// assert_eq!(opts.memory_mode, MemoryMode::Off);
/// ```
#[derive(Debug, Default)]
pub struct RunOptions {
    /// When `Some(id)`, only the scenario with this ID is executed.
    ///
    /// Naming an ID that is not present in the loaded dataset is reported as an error
    /// before any scenario runs.
    pub scenario_filter: Option<String>,
    /// Set of scenario IDs already completed in a prior run (used for `--resume`).
    ///
    /// Scenarios whose ID is in this set are skipped, even when they match
    /// `scenario_filter`.
    pub completed_ids: HashSet<String>,
    /// Whether to wire a `SemanticMemory` backend into the agent for this run.
    pub memory_mode: MemoryMode,
}
140
/// Minimal no-op tool executor for baseline benchmark runs.
///
/// Every `execute` call returns `Ok(None)`, so the agent loop never receives a tool
/// output from this executor during a benchmark run.
// NOTE(review): the "empty tool list" part of the contract presumably comes from
// `ToolExecutor`'s default methods, which are not overridden here — confirm.
struct NoopExecutor;

impl ToolExecutor for NoopExecutor {
    /// Ignores the agent response and always returns `Ok(None)`.
    async fn execute(&self, _response: &str) -> Result<Option<ToolOutput>, ToolError> {
        Ok(None)
    }
}
152
/// Drives [`Agent<BenchmarkChannel>`] over a dataset and collects scored results.
///
/// Each call to [`run_dataset`][BenchRunner::run_dataset] creates a fresh agent per
/// scenario (baseline mode: no tools, no MCP). Memory is optionally wired via
/// [`BenchRunner::with_memory_params`] and [`RunOptions::memory_mode`]; both must be
/// set for memory to be active.
///
/// # Examples
///
/// ```no_run
/// use zeph_bench::runner::BenchRunner;
/// use zeph_llm::{any::AnyProvider, mock::MockProvider};
///
/// let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["Paris".into()]));
/// let runner = BenchRunner::new(provider);
/// ```
pub struct BenchRunner {
    /// LLM provider; cloned into every per-scenario agent and, in memory mode, into
    /// the per-scenario `SemanticMemory`. Also supplies the model identifier recorded
    /// in the resulting `BenchRun`.
    provider: AnyProvider,
    /// Parameters for constructing per-scenario `SQLite`-backed `SemanticMemory`.
    ///
    /// Set via [`BenchRunner::with_memory_params`]; required when
    /// `RunOptions::memory_mode == MemoryMode::On`.
    memory_params: Option<BenchMemoryParams>,
}
176
177impl BenchRunner {
    /// Create a new runner with the given provider.
    ///
    /// The runner starts without memory parameters; attach them with
    /// [`BenchRunner::with_memory_params`] when memory-on runs are needed.
    /// Apply deterministic overrides to `provider` before calling this if needed.
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use zeph_bench::runner::BenchRunner;
    /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
    ///
    /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
    /// let runner = BenchRunner::new(provider);
    /// ```
    #[must_use]
    pub fn new(provider: AnyProvider) -> Self {
        Self {
            provider,
            memory_params: None,
        }
    }
200
201    /// Attach `SemanticMemory` parameters for memory-on benchmark runs.
202    ///
203    /// When set, a per-scenario `SQLite`-backed `SemanticMemory` is constructed inside
204    /// [`run_one`][BenchRunner::run_one] whenever `opts.memory_mode == MemoryMode::On`.
205    ///
206    /// # Examples
207    ///
208    /// ```no_run
209    /// use std::path::PathBuf;
210    /// use zeph_bench::runner::{BenchRunner, BenchMemoryParams};
211    /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
212    ///
213    /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
214    /// let params = BenchMemoryParams {
215    ///     data_dir: PathBuf::from("/tmp/bench-data"),
216    ///     embedding_model: "nomic-embed-text".into(),
217    ///     run_id: "bench-abc".into(),
218    ///     dataset: "locomo".into(),
219    /// };
220    /// let runner = BenchRunner::new(provider).with_memory_params(params);
221    /// ```
222    #[must_use]
223    pub fn with_memory_params(mut self, params: BenchMemoryParams) -> Self {
224        self.memory_params = Some(params);
225        self
226    }
227
228    /// Run all matching scenarios from `path` through the agent and return a [`BenchRun`].
229    ///
230    /// For each scenario:
231    /// 1. Builds a fresh `Agent<BenchmarkChannel>` with no tools or memory.
232    /// 2. Feeds the scenario prompt and collects the agent's response.
233    /// 3. Scores the response with `evaluator`.
234    /// 4. Appends a [`ScenarioResult`] and recomputes aggregate statistics.
235    ///
236    /// The returned [`BenchRun`] has `status = Running` until the caller sets it to
237    /// `Completed` or `Interrupted`.
238    ///
239    /// # Errors
240    ///
241    /// Returns [`BenchError`] if the dataset cannot be loaded or a scenario run fails.
242    pub async fn run_dataset<L, E>(
243        &self,
244        loader: &L,
245        evaluator: &E,
246        path: &Path,
247        opts: RunOptions,
248    ) -> Result<BenchRun, BenchError>
249    where
250        L: DatasetLoader,
251        E: Evaluator,
252    {
253        let scenarios = loader.load(path)?;
254        let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;
255
256        let _span = tracing::info_span!(
257            "bench.run_dataset",
258            dataset = loader.name(),
259            scenarios = filtered.len(),
260        )
261        .entered();
262
263        let model_id = self.provider.model_identifier().to_owned();
264
265        let mut run = BenchRun {
266            dataset: loader.name().to_owned(),
267            model: model_id,
268            run_id: uuid(),
269            started_at: timestamp::utc_now_rfc3339(),
270            finished_at: String::new(),
271            status: RunStatus::Running,
272            results: vec![],
273            aggregate: crate::results::Aggregate::default(),
274        };
275
276        for scenario in filtered {
277            let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();
278
279            let t0 = Instant::now();
280            let response_text = Box::pin(self.run_one(scenario, opts.memory_mode)).await?;
281            let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
282
283            let eval = evaluator.evaluate(scenario, &response_text);
284            let excerpt = response_text.chars().take(200).collect::<String>();
285
286            run.results.push(ScenarioResult {
287                scenario_id: scenario.id.clone(),
288                score: eval.score,
289                response_excerpt: excerpt,
290                error: None,
291                elapsed_ms,
292            });
293            run.recompute_aggregate();
294        }
295
296        Ok(run)
297    }
298
299    /// Run all scenarios from `path` through a per-scenario env executor and return a [`BenchRun`].
300    ///
301    /// This is the execution path for tool-driven datasets (tau2-bench). For each scenario:
302    /// 1. Calls `env_factory(scenario)` to build a fresh `(ToolExecutor, ActionTrace)`.
303    /// 2. Builds a fresh `TauBenchEvaluator` from the scenario metadata and the trace.
304    /// 3. Runs the agent with the env executor and the tool-use system prompt.
305    /// 4. Scores the response via the evaluator (reads the populated trace).
306    ///
307    /// # Errors
308    ///
309    /// Returns [`BenchError`] if the dataset cannot be loaded, the env factory fails, or
310    /// `TauBenchEvaluator::from_scenario` fails (malformed metadata).
311    pub async fn run_dataset_with_env_factory<L, F, X>(
312        &self,
313        loader: &L,
314        env_factory: F,
315        path: &Path,
316        opts: RunOptions,
317    ) -> Result<BenchRun, BenchError>
318    where
319        L: DatasetLoader,
320        F: Fn(&Scenario) -> Result<(X, ActionTrace), BenchError>,
321        X: ToolExecutor + Send + Sync + 'static,
322    {
323        let scenarios = loader.load(path)?;
324        let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;
325
326        let _span = tracing::info_span!(
327            "bench.run_dataset_with_env_factory",
328            dataset = loader.name(),
329            scenarios = filtered.len(),
330        )
331        .entered();
332
333        let model_id = self.provider.model_identifier().to_owned();
334
335        let mut run = BenchRun {
336            dataset: loader.name().to_owned(),
337            model: model_id,
338            run_id: uuid(),
339            started_at: timestamp::utc_now_rfc3339(),
340            finished_at: String::new(),
341            status: RunStatus::Running,
342            results: vec![],
343            aggregate: crate::results::Aggregate::default(),
344        };
345
346        for scenario in filtered {
347            let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();
348
349            let (executor, trace) = env_factory(scenario)?;
350            let evaluator = TauBenchEvaluator::from_scenario(scenario, trace)?;
351
352            let t0 = Instant::now();
353            let response_text = Box::pin(self.run_one_with_executor(
354                scenario,
355                executor,
356                opts.memory_mode,
357                ResponseMode::ToolUse,
358            ))
359            .await?;
360            let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
361
362            let eval = evaluator.evaluate(scenario, &response_text);
363            let excerpt = response_text.chars().take(200).collect::<String>();
364
365            run.results.push(ScenarioResult {
366                scenario_id: scenario.id.clone(),
367                score: eval.score,
368                response_excerpt: excerpt,
369                error: None,
370                elapsed_ms,
371            });
372            run.recompute_aggregate();
373        }
374
375        Ok(run)
376    }
377
378    /// Run a single scenario through a fresh agent and return the last response text.
379    ///
380    /// A concise-answer system prompt is injected via [`InstructionBlock`] so the model
381    /// responds with only the final answer (a number, word, or short phrase) rather than
382    /// full sentences. The raw response is then post-processed to extract the first
383    /// non-empty line and strip markdown formatting, which further reduces noise for
384    /// evaluators that perform exact or near-exact matching.
385    ///
386    /// When `memory_mode == MemoryMode::On`, a per-scenario `SQLite`-backed
387    /// `SemanticMemory` is constructed and wired into the agent. The database file is
388    /// deleted after the scenario completes (best-effort, NFR-001).
389    ///
390    /// # Errors
391    ///
392    /// Returns [`BenchError::InvalidFormat`] when the scenario has no user turn or when
393    /// `SemanticMemory` initialisation fails.
394    async fn run_one(
395        &self,
396        scenario: &Scenario,
397        memory_mode: MemoryMode,
398    ) -> Result<String, BenchError> {
399        Box::pin(self.run_one_with_executor(
400            scenario,
401            NoopExecutor,
402            memory_mode,
403            ResponseMode::TerseAnswer,
404        ))
405        .await
406    }
407
408    /// Core execution: run one scenario with the given executor and response mode.
409    ///
410    /// Called by both [`BenchRunner::run_dataset`] (with `NoopExecutor` + `TerseAnswer`) and
411    /// [`BenchRunner::run_dataset_with_env_factory`] (with the domain env + `ToolUse`).
412    #[allow(clippy::too_many_lines)] // sequential setup steps; splitting adds indirection without clarity
413    async fn run_one_with_executor<X: ToolExecutor + Send + Sync + 'static>(
414        &self,
415        scenario: &Scenario,
416        executor: X,
417        memory_mode: MemoryMode,
418        mode: ResponseMode,
419    ) -> Result<String, BenchError> {
420        let _span = tracing::info_span!(
421            "bench.run_one",
422            scenario_id = %scenario.id,
423            mode = ?mode,
424        )
425        .entered();
426        let channel = BenchmarkChannel::from_turns(scenario.turns.clone());
427        if channel.total() == 0 {
428            return Err(BenchError::InvalidFormat(format!(
429                "scenario '{}' has no user turn",
430                scenario.id
431            )));
432        }
433        let registry = SkillRegistry::empty();
434
435        let system_content = match mode {
436            ResponseMode::TerseAnswer => concat!(
437                "You are an evaluation assistant. ",
438                "Answer every question with the shortest possible response. ",
439                "Give only the final answer — no explanation, no full sentences, ",
440                "no punctuation unless it is part of the answer. ",
441                "If the answer is a single word or number, respond with only that word or number."
442            ),
443            ResponseMode::ToolUse => concat!(
444                "You are a customer-service agent. ",
445                "Use the available tools to help the user. ",
446                "Always call a tool when one applies; do not ask the user to perform actions you can perform yourself. ",
447                "When you have completed the user's request, respond with a brief confirmation."
448            ),
449        };
450
451        let blocks = vec![InstructionBlock {
452            source: PathBuf::from("<bench-system-prompt>"),
453            content: system_content.to_owned(),
454        }];
455
456        let base_agent = Agent::new(self.provider.clone(), channel, registry, None, 1, executor)
457            .with_instruction_blocks(blocks);
458
459        // Optionally wire SemanticMemory when the caller requests memory-on mode.
460        let (mut agent, scenario_db) = if memory_mode == MemoryMode::On
461            && let Some(ref params) = self.memory_params
462        {
463            // One SQLite file per scenario gives strict isolation (NFR-001 choice (a)).
464            // This is more files than a per-run DB, but eliminates any cross-scenario
465            // memory bleed and avoids needing BenchIsolation::reset() between scenarios.
466            let scenario_db = params
467                .data_dir
468                .join(format!("bench-{}-{}.db", params.run_id, scenario.id));
469            debug_assert!(
470                scenario_db.to_string_lossy().contains("bench-"),
471                "NFR-001: bench SQLite path must be namespaced with 'bench-'"
472            );
473
474            tracing::debug!(
475                scenario_id = %scenario.id,
476                path = %scenario_db.display(),
477                "bench: memory init start"
478            );
479            let memory = Arc::new(
480                tokio::time::timeout(
481                    std::time::Duration::from_secs(10),
482                    SemanticMemory::with_sqlite_backend(
483                        scenario_db.to_string_lossy().as_ref(),
484                        self.provider.clone(),
485                        &params.embedding_model,
486                        0.7,
487                        0.3,
488                    ),
489                )
490                .await
491                .map_err(|_| {
492                    BenchError::InvalidFormat(format!(
493                        "SemanticMemory init timed out for scenario '{}'",
494                        scenario.id
495                    ))
496                })?
497                .map_err(|e| BenchError::InvalidFormat(format!("SemanticMemory init: {e}")))?,
498            );
499            tracing::debug!(scenario_id = %scenario.id, "bench: memory init done");
500
501            // Seed the sessions table so persist_message does not fail with FK violation.
502            let conv_id = memory
503                .sqlite()
504                .create_conversation()
505                .await
506                .map_err(|e| BenchError::InvalidFormat(format!("create_conversation: {e}")))?;
507
508            // summarization_threshold = 100_000 deliberately suppresses LLM-driven
509            // compaction during bench runs. Compaction calls another LLM round-trip
510            // with non-deterministic timing/output, which would violate FR-003
511            // (deterministic runs). recall_limit = 20 is generous enough to surface
512            // long-context memory effects without silently capping LongMemEval scores
513            // below their theoretical maximum. history_limit = 200 covers the longest
514            // LongMemEval session without truncation.
515            let wired_agent = base_agent.with_memory(memory, conv_id, 200, 20, 100_000);
516            (wired_agent, Some(scenario_db))
517        } else {
518            (base_agent, None)
519        };
520
521        // Ignore agent errors — a failed LLM call still yields an empty response that
522        // the evaluator scores as 0.0 rather than aborting the entire run.
523        let _ = Box::pin(agent.run()).await;
524        let channel = agent.into_channel();
525        // tool_outputs available for Phase 2 scoring (#4234); log count so future
526        // implementors have a trace even before the evaluator wires them up.
527        tracing::debug!(
528            count = channel.tool_outputs().len(),
529            "bench: tool outputs captured"
530        );
531        let responses = channel.into_responses();
532
533        // Best-effort cleanup: delete per-scenario SQLite file after the run.
534        // Failure is intentionally ignored — NFR-001 is hygiene, not correctness.
535        if let Some(ref db_path) = scenario_db {
536            let _ = std::fs::remove_file(db_path);
537        }
538
539        let raw = responses
540            .into_iter()
541            .last()
542            .map(|r| r.text)
543            .unwrap_or_default();
544
545        Ok(match mode {
546            ResponseMode::TerseAnswer => post_process_response(&raw),
547            // Verified: dropping send_tool_output does NOT affect the agent loop's tool-result
548            // feedback to the LLM. Tool outputs flow via Agent's internal MessagePart::ToolResult,
549            // not via the channel. See crates/zeph-core/src/agent/tool_execution/native.rs.
550            ResponseMode::ToolUse => raw,
551        })
552    }
553}
554
555/// Return the subset of `scenarios` that should run given `opts`.
556///
557/// Validates that when a `scenario_filter` is set, at least one matching scenario exists in
558/// `scenarios`. Then filters out already-completed IDs and non-matching scenarios.
559///
560/// # Errors
561///
562/// Returns [`BenchError::InvalidFormat`] when `opts.scenario_filter` names a scenario that
563/// does not appear in `scenarios`.
564fn filter_scenarios<'a>(
565    scenarios: &'a [Scenario],
566    opts: &RunOptions,
567    loader_name: &str,
568) -> Result<Vec<&'a Scenario>, BenchError> {
569    if let Some(ref filter) = opts.scenario_filter
570        && !scenarios.iter().any(|s| &s.id == filter)
571    {
572        return Err(BenchError::InvalidFormat(format!(
573            "scenario '{filter}' not found in dataset '{loader_name}'"
574        )));
575    }
576
577    Ok(scenarios
578        .iter()
579        .filter(|s| {
580            if opts.completed_ids.contains(&s.id) {
581                return false;
582            }
583            if let Some(ref filter) = opts.scenario_filter {
584                return &s.id == filter;
585            }
586            true
587        })
588        .collect())
589}
590
/// Reduce a raw agent response to a clean, terse answer.
///
/// The steps, in order:
/// 1. Keep only the first line that is non-empty after trimming; anything after it
///    (typically an explanation) is discarded. Input with no such line yields `""`.
/// 2. Peel markdown wrapper characters (`*`, `_`, `` ` ``) plus spaces and tabs off
///    both ends of that line.
/// 3. Remove every remaining `**` pair and every backtick inside the line.
/// 4. Trim surrounding whitespace once more.
///
/// This is a best-effort cleanup. Evaluators still normalize the result, so minor
/// leftover punctuation is handled downstream.
fn post_process_response(raw: &str) -> String {
    let is_wrapper = |c: char| matches!(c, '*' | '_' | '`' | ' ' | '\t');

    // Locate the first line with visible content; later lines are ignored.
    let mut answer_line = "";
    for line in raw.lines() {
        let trimmed = line.trim();
        if !trimmed.is_empty() {
            answer_line = trimmed;
            break;
        }
    }

    // Outer markers first, then any markers left inside the line.
    let unwrapped = answer_line.trim_matches(is_wrapper);
    let without_bold = unwrapped.replace("**", "");
    let without_code = without_bold.replace('`', "");
    without_code.trim().to_owned()
}
616
/// Build a short, UUID-like run identifier without depending on the `uuid` crate.
///
/// The ID has the shape `bench-<seconds>-<nanoseconds>`, where both parts are the
/// lowercase-hex components of the current time since the Unix epoch. If the system
/// clock reports a time before the epoch, a zero duration is used instead. The value
/// is not cryptographically random but is sufficient for identifying benchmark runs.
fn uuid() -> String {
    let since_epoch = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default();
    let secs = since_epoch.as_secs();
    let nanos = since_epoch.subsec_nanos();
    format!("bench-{secs:x}-{nanos:x}")
}
628
#[cfg(test)]
mod tests {
    //! Unit tests for the runner's option defaults, path namespacing, run-ID shape and
    //! response post-processing.

    use super::*;

    /// Default options run everything: no filter, nothing completed, memory off.
    #[test]
    fn run_options_default_is_empty() {
        let opts = RunOptions::default();
        assert!(opts.scenario_filter.is_none());
        assert!(opts.completed_ids.is_empty());
        assert_eq!(opts.memory_mode, MemoryMode::Off);
    }

    #[test]
    fn memory_mode_default_is_off() {
        assert_eq!(MemoryMode::default(), MemoryMode::Off);
    }

    /// The builder stores the params it is given, unchanged.
    #[test]
    fn with_memory_params_sets_isolation() {
        use zeph_llm::{any::AnyProvider, mock::MockProvider};
        let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
        let params = BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: "bench-abc".into(),
            dataset: "locomo".into(),
        };
        // `params` is not used afterwards, so it is moved rather than cloned.
        let runner = BenchRunner::new(provider).with_memory_params(params);
        let stored = runner
            .memory_params
            .expect("with_memory_params must store the params");
        assert_eq!(stored.run_id, "bench-abc");
        assert_eq!(stored.dataset, "locomo");
    }

    /// Mirrors the per-scenario database naming scheme used by the runner.
    #[test]
    fn nfr_001_sqlite_path_namespaced() {
        let params = BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: "run-xyz".into(),
            dataset: "locomo".into(),
        };
        let scenario_id = "s1_0";
        let scenario_db = params
            .data_dir
            .join(format!("bench-{}-{}.db", params.run_id, scenario_id));
        assert!(
            scenario_db.to_string_lossy().contains("bench-"),
            "NFR-001: SQLite path must contain bench- prefix"
        );
    }

    #[test]
    fn now_rfc3339_has_correct_format() {
        let ts = timestamp::utc_now_rfc3339();
        // e.g. "2026-04-25T10:30:00Z"
        assert_eq!(ts.len(), 20);
        assert!(ts.ends_with('Z'));
        assert!(ts.contains('T'));
    }

    #[test]
    fn uuid_generates_non_empty_string() {
        let id = uuid();
        assert!(id.starts_with("bench-"));
        assert!(id.len() > 10);
    }

    #[test]
    fn post_process_takes_first_line() {
        let raw = "1945\n\nWorld War II ended in 1945.";
        assert_eq!(post_process_response(raw), "1945");
    }

    #[test]
    fn post_process_strips_markdown_bold() {
        assert_eq!(post_process_response("**1945**"), "1945");
    }

    /// Bold markers inside the line are removed too, not only the outer ones.
    #[test]
    fn post_process_strips_inner_bold_markers() {
        assert_eq!(post_process_response("**Answer:** 1945"), "Answer: 1945");
    }

    #[test]
    fn post_process_strips_backticks() {
        assert_eq!(post_process_response("`Au`"), "Au");
    }

    #[test]
    fn post_process_trims_whitespace() {
        assert_eq!(post_process_response("  Paris  "), "Paris");
    }

    #[test]
    fn post_process_empty_input_returns_empty() {
        assert_eq!(post_process_response(""), "");
    }

    #[test]
    fn post_process_skips_empty_leading_lines() {
        let raw = "\n\n  \nParis";
        assert_eq!(post_process_response(raw), "Paris");
    }
}