Skip to main content

zeph_bench/
runner.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark runner: drives `Agent<BenchmarkChannel>` over a dataset and collects results.
5//!
//! [`BenchRunner`] is the execution engine for `zeph bench run`. By default each scenario
//! runs in baseline mode (no tools, no memory, no MCP) through a fresh [`BenchmarkChannel`],
//! and the agent's text response is scored by the supplied [`Evaluator`]. Memory can be
//! enabled per run via [`RunOptions::memory_mode`], and tool-driven datasets are executed
//! through [`BenchRunner::run_dataset_with_env_factory`].
10//!
11//! # Usage
12//!
13//! ```no_run
14//! use std::path::Path;
15//! use zeph_bench::runner::{BenchRunner, RunOptions};
16//! use zeph_bench::loaders::{GaiaLoader, GaiaEvaluator};
17//! use zeph_llm::{any::AnyProvider, mock::MockProvider};
18//!
19//! # async fn example() -> Result<(), zeph_bench::BenchError> {
20//! let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()]));
21//! let runner = BenchRunner::new(provider);
22//! let opts = RunOptions::default();
23//! let run = runner.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("/data/gaia.jsonl"), opts).await?;
24//! println!("mean score: {:.4}", run.aggregate.mean_score);
25//! # Ok(())
26//! # }
27//! ```
28
29use std::collections::HashSet;
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32use std::time::Instant;
33
34use zeph_common::timestamp;
35use zeph_core::agent::Agent;
36use zeph_core::instructions::InstructionBlock;
37use zeph_llm::any::AnyProvider;
38use zeph_llm::provider::LlmProvider as _;
39use zeph_memory::semantic::SemanticMemory;
40use zeph_skills::registry::SkillRegistry;
41use zeph_tools::executor::{ToolError, ToolExecutor, ToolOutput};
42
43use crate::channel::BenchmarkChannel;
44use crate::error::BenchError;
45use crate::loaders::tau2_bench::{ActionTrace, TauBenchEvaluator};
46use crate::results::{BenchRun, RunStatus, ScenarioResult};
47use crate::scenario::{DatasetLoader, Evaluator, Scenario};
48
/// Selects the system prompt and the post-processing applied to the agent's reply.
///
/// The runner's internal per-scenario execution step matches on this value twice:
/// once to pick the injected system prompt and once to decide whether the last
/// response is cleaned up or returned verbatim.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[non_exhaustive]
pub enum ResponseMode {
    /// Inject a "shortest possible answer" system prompt and reduce the reply to its
    /// first non-empty line with markdown formatting stripped.
    ///
    /// This is the mode used by `BenchRunner::run_dataset`.
    TerseAnswer,
    /// Inject a tool-use system prompt and return the raw agent reply untouched.
    ///
    /// This is the mode used by `BenchRunner::run_dataset_with_env_factory`
    /// (tau2-bench), where scoring reads the recorded action trace rather than the text.
    ToolUse,
}
65
/// Switch that decides whether a `SemanticMemory` backend is attached to the agent.
///
/// The default is [`MemoryMode::Off`].
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::MemoryMode;
///
/// assert_eq!(MemoryMode::default(), MemoryMode::Off);
/// ```
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
#[non_exhaustive]
pub enum MemoryMode {
    /// Run without any `SemanticMemory` (the default).
    #[default]
    Off,
    /// Attach a `SQLite`-backed `SemanticMemory` to the agent via `Agent::with_memory`.
    On,
}
84
/// Settings needed to build the per-scenario `SQLite`-backed `SemanticMemory`.
///
/// Hand these to [`BenchRunner::with_memory_params`]. The runner reads them while
/// executing a scenario whenever the run was started with `MemoryMode::On`.
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
/// use zeph_bench::runner::BenchMemoryParams;
///
/// let params = BenchMemoryParams {
///     data_dir: PathBuf::from("/tmp/bench"),
///     embedding_model: "nomic-embed-text".into(),
///     run_id: "bench-abc".into(),
///     dataset: "locomo".into(),
/// };
/// assert!(params.data_dir.to_string_lossy().contains("bench"));
/// ```
#[derive(Clone, Debug)]
pub struct BenchMemoryParams {
    /// Directory that holds the per-scenario `SQLite` files (removed after each scenario).
    ///
    /// The runner joins a `bench-<run_id>-<scenario_id>.db` file name onto this
    /// directory, so the derived path always contains the `bench-` segment (NFR-001).
    pub data_dir: PathBuf,
    /// Name of the embedding model handed to `SemanticMemory`.
    pub embedding_model: String,
    /// Run identifier embedded in the per-scenario database file name.
    ///
    /// Intended to match the outer `BenchRun.run_id`.
    pub run_id: String,
    /// Dataset name for namespacing bench artifacts.
    ///
    /// NOTE(review): not read by the runner code in this file — presumably consumed
    /// by callers; confirm before relying on it.
    pub dataset: String,
}
117
118/// Options that control which scenarios are executed and whether to resume a prior run.
119///
120/// Build via [`RunOptions::default`] and override the fields you need.
121///
122/// # Examples
123///
124/// ```
125/// use zeph_bench::runner::{RunOptions, MemoryMode};
126///
127/// // Run all scenarios.
128/// let opts = RunOptions::default();
129/// assert!(opts.scenario_filter.is_none());
130/// assert!(opts.completed_ids.is_empty());
131/// assert_eq!(opts.memory_mode, MemoryMode::Off);
132/// ```
133#[derive(Debug, Default)]
134pub struct RunOptions {
135    /// When `Some(id)`, only the scenario with this ID is executed.
136    pub scenario_filter: Option<String>,
137    /// Set of scenario IDs already completed in a prior run (used for `--resume`).
138    pub completed_ids: HashSet<String>,
139    /// Whether to wire a `SemanticMemory` backend into the agent for this run.
140    pub memory_mode: MemoryMode,
141}
142
/// Minimal no-op tool executor for baseline benchmark runs.
///
/// Only [`ToolExecutor::execute`] is implemented here and it always returns
/// `Ok(None)`, so no tool output is ever produced for an agent response.
///
/// NOTE(review): any other `ToolExecutor` methods (e.g. the advertised tool list)
/// presumably fall back to the trait's defaults — confirm against the trait.
struct NoopExecutor;

impl ToolExecutor for NoopExecutor {
    /// Ignores the response text and reports that no tool was executed.
    async fn execute(&self, _response: &str) -> Result<Option<ToolOutput>, ToolError> {
        Ok(None)
    }
}
154
/// Drives [`Agent<BenchmarkChannel>`] over a dataset and collects scored results.
///
/// A fresh agent is built for every scenario. [`run_dataset`][BenchRunner::run_dataset]
/// runs the baseline path with a no-op tool executor, while
/// [`run_dataset_with_env_factory`][BenchRunner::run_dataset_with_env_factory] runs
/// each scenario against a caller-supplied tool executor. Memory is optionally wired
/// via [`BenchRunner::with_memory_params`] and [`RunOptions::memory_mode`].
///
/// # Examples
///
/// ```no_run
/// use zeph_bench::runner::BenchRunner;
/// use zeph_llm::{any::AnyProvider, mock::MockProvider};
///
/// let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["Paris".into()]));
/// let runner = BenchRunner::new(provider);
/// ```
pub struct BenchRunner {
    /// LLM provider cloned into every per-scenario agent (and into `SemanticMemory`
    /// when memory is enabled).
    provider: AnyProvider,
    /// Parameters for constructing per-scenario `SQLite`-backed `SemanticMemory`.
    ///
    /// Set via [`BenchRunner::with_memory_params`]; required when
    /// `RunOptions::memory_mode == MemoryMode::On`.
    memory_params: Option<BenchMemoryParams>,
}
178
179impl BenchRunner {
    /// Create a new runner that drives scenarios with the given provider.
    ///
    /// The provider is stored as given, so apply any deterministic overrides to
    /// `provider` before calling this if needed. No memory parameters are attached;
    /// use [`BenchRunner::with_memory_params`] to add them.
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use zeph_bench::runner::BenchRunner;
    /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
    ///
    /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
    /// let runner = BenchRunner::new(provider);
    /// ```
    #[must_use]
    pub fn new(provider: AnyProvider) -> Self {
        Self {
            provider,
            memory_params: None,
        }
    }
202
203    /// Attach `SemanticMemory` parameters for memory-on benchmark runs.
204    ///
205    /// When set, a per-scenario `SQLite`-backed `SemanticMemory` is constructed inside
206    /// [`run_one`][BenchRunner::run_one] whenever `opts.memory_mode == MemoryMode::On`.
207    ///
208    /// # Examples
209    ///
210    /// ```no_run
211    /// use std::path::PathBuf;
212    /// use zeph_bench::runner::{BenchRunner, BenchMemoryParams};
213    /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
214    ///
215    /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
216    /// let params = BenchMemoryParams {
217    ///     data_dir: PathBuf::from("/tmp/bench-data"),
218    ///     embedding_model: "nomic-embed-text".into(),
219    ///     run_id: "bench-abc".into(),
220    ///     dataset: "locomo".into(),
221    /// };
222    /// let runner = BenchRunner::new(provider).with_memory_params(params);
223    /// ```
224    #[must_use]
225    pub fn with_memory_params(mut self, params: BenchMemoryParams) -> Self {
226        self.memory_params = Some(params);
227        self
228    }
229
230    /// Run all matching scenarios from `path` through the agent and return a [`BenchRun`].
231    ///
232    /// For each scenario:
233    /// 1. Builds a fresh `Agent<BenchmarkChannel>` with no tools or memory.
234    /// 2. Feeds the scenario prompt and collects the agent's response.
235    /// 3. Scores the response with `evaluator`.
236    /// 4. Appends a [`ScenarioResult`] and recomputes aggregate statistics.
237    ///
238    /// The returned [`BenchRun`] has `status = Running` until the caller sets it to
239    /// `Completed` or `Interrupted`.
240    ///
241    /// # Errors
242    ///
243    /// Returns [`BenchError`] if the dataset cannot be loaded or a scenario run fails.
244    pub async fn run_dataset<L, E>(
245        &self,
246        loader: &L,
247        evaluator: &E,
248        path: &Path,
249        opts: RunOptions,
250    ) -> Result<BenchRun, BenchError>
251    where
252        L: DatasetLoader,
253        E: Evaluator,
254    {
255        let scenarios = loader.load(path)?;
256        let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;
257
258        let _span = tracing::info_span!(
259            "bench.run_dataset",
260            dataset = loader.name(),
261            scenarios = filtered.len(),
262        )
263        .entered();
264
265        let model_id = self.provider.model_identifier().to_owned();
266
267        let mut run = BenchRun {
268            dataset: loader.name().to_owned(),
269            model: model_id,
270            run_id: uuid(),
271            started_at: timestamp::utc_now_rfc3339(),
272            finished_at: String::new(),
273            status: RunStatus::Running,
274            results: vec![],
275            aggregate: crate::results::Aggregate::default(),
276        };
277
278        for scenario in filtered {
279            let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();
280
281            let t0 = Instant::now();
282            let response_text = Box::pin(self.run_one(scenario, opts.memory_mode)).await?;
283            let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
284
285            let eval = evaluator.evaluate(scenario, &response_text);
286            let excerpt = response_text.chars().take(200).collect::<String>();
287
288            run.results.push(ScenarioResult {
289                scenario_id: scenario.id.clone(),
290                score: eval.score,
291                response_excerpt: excerpt,
292                error: None,
293                elapsed_ms,
294            });
295            run.recompute_aggregate();
296        }
297
298        Ok(run)
299    }
300
301    /// Run all scenarios from `path` through a per-scenario env executor and return a [`BenchRun`].
302    ///
303    /// This is the execution path for tool-driven datasets (tau2-bench). For each scenario:
304    /// 1. Calls `env_factory(scenario)` to build a fresh `(ToolExecutor, ActionTrace)`.
305    /// 2. Builds a fresh `TauBenchEvaluator` from the scenario metadata and the trace.
306    /// 3. Runs the agent with the env executor and the tool-use system prompt.
307    /// 4. Scores the response via the evaluator (reads the populated trace).
308    ///
309    /// # Errors
310    ///
311    /// Returns [`BenchError`] if the dataset cannot be loaded, the env factory fails, or
312    /// `TauBenchEvaluator::from_scenario` fails (malformed metadata).
313    pub async fn run_dataset_with_env_factory<L, F, X>(
314        &self,
315        loader: &L,
316        env_factory: F,
317        path: &Path,
318        opts: RunOptions,
319    ) -> Result<BenchRun, BenchError>
320    where
321        L: DatasetLoader,
322        F: Fn(&Scenario) -> Result<(X, ActionTrace), BenchError>,
323        X: ToolExecutor + Send + Sync + 'static,
324    {
325        let scenarios = loader.load(path)?;
326        let filtered = filter_scenarios(&scenarios, &opts, loader.name())?;
327
328        let _span = tracing::info_span!(
329            "bench.run_dataset_with_env_factory",
330            dataset = loader.name(),
331            scenarios = filtered.len(),
332        )
333        .entered();
334
335        let model_id = self.provider.model_identifier().to_owned();
336
337        let mut run = BenchRun {
338            dataset: loader.name().to_owned(),
339            model: model_id,
340            run_id: uuid(),
341            started_at: timestamp::utc_now_rfc3339(),
342            finished_at: String::new(),
343            status: RunStatus::Running,
344            results: vec![],
345            aggregate: crate::results::Aggregate::default(),
346        };
347
348        for scenario in filtered {
349            let _s = tracing::info_span!("bench.scenario", id = %scenario.id).entered();
350
351            let (executor, trace) = env_factory(scenario)?;
352            let evaluator = TauBenchEvaluator::from_scenario(scenario, trace)?;
353
354            let t0 = Instant::now();
355            let response_text = Box::pin(self.run_one_with_executor(
356                scenario,
357                executor,
358                opts.memory_mode,
359                ResponseMode::ToolUse,
360            ))
361            .await?;
362            let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
363
364            let eval = evaluator.evaluate(scenario, &response_text);
365            let excerpt = response_text.chars().take(200).collect::<String>();
366
367            run.results.push(ScenarioResult {
368                scenario_id: scenario.id.clone(),
369                score: eval.score,
370                response_excerpt: excerpt,
371                error: None,
372                elapsed_ms,
373            });
374            run.recompute_aggregate();
375        }
376
377        Ok(run)
378    }
379
380    /// Run a single scenario through a fresh agent and return the last response text.
381    ///
382    /// A concise-answer system prompt is injected via [`InstructionBlock`] so the model
383    /// responds with only the final answer (a number, word, or short phrase) rather than
384    /// full sentences. The raw response is then post-processed to extract the first
385    /// non-empty line and strip markdown formatting, which further reduces noise for
386    /// evaluators that perform exact or near-exact matching.
387    ///
388    /// When `memory_mode == MemoryMode::On`, a per-scenario `SQLite`-backed
389    /// `SemanticMemory` is constructed and wired into the agent. The database file is
390    /// deleted after the scenario completes (best-effort, NFR-001).
391    ///
392    /// # Errors
393    ///
394    /// Returns [`BenchError::InvalidFormat`] when the scenario has no user turn or when
395    /// `SemanticMemory` initialisation fails.
396    async fn run_one(
397        &self,
398        scenario: &Scenario,
399        memory_mode: MemoryMode,
400    ) -> Result<String, BenchError> {
401        Box::pin(self.run_one_with_executor(
402            scenario,
403            NoopExecutor,
404            memory_mode,
405            ResponseMode::TerseAnswer,
406        ))
407        .await
408    }
409
410    /// Core execution: run one scenario with the given executor and response mode.
411    ///
412    /// Called by both [`BenchRunner::run_dataset`] (with `NoopExecutor` + `TerseAnswer`) and
413    /// [`BenchRunner::run_dataset_with_env_factory`] (with the domain env + `ToolUse`).
414    #[allow(clippy::too_many_lines)] // sequential setup steps; splitting adds indirection without clarity
415    async fn run_one_with_executor<X: ToolExecutor + Send + Sync + 'static>(
416        &self,
417        scenario: &Scenario,
418        executor: X,
419        memory_mode: MemoryMode,
420        mode: ResponseMode,
421    ) -> Result<String, BenchError> {
422        let _span = tracing::info_span!(
423            "bench.run_one",
424            scenario_id = %scenario.id,
425            mode = ?mode,
426        )
427        .entered();
428        let channel = BenchmarkChannel::from_turns(scenario.turns.clone());
429        if channel.total() == 0 {
430            return Err(BenchError::InvalidFormat(format!(
431                "scenario '{}' has no user turn",
432                scenario.id
433            )));
434        }
435        let registry = SkillRegistry::empty();
436
437        let system_content = match mode {
438            ResponseMode::TerseAnswer => concat!(
439                "You are an evaluation assistant. ",
440                "Answer every question with the shortest possible response. ",
441                "Give only the final answer — no explanation, no full sentences, ",
442                "no punctuation unless it is part of the answer. ",
443                "If the answer is a single word or number, respond with only that word or number."
444            ),
445            ResponseMode::ToolUse => concat!(
446                "You are a customer-service agent. ",
447                "Use the available tools to help the user. ",
448                "Always call a tool when one applies; do not ask the user to perform actions you can perform yourself. ",
449                "When you have completed the user's request, respond with a brief confirmation."
450            ),
451        };
452
453        let blocks = vec![InstructionBlock {
454            source: PathBuf::from("<bench-system-prompt>"),
455            content: system_content.to_owned(),
456        }];
457
458        let base_agent = Agent::new(self.provider.clone(), channel, registry, None, 1, executor)
459            .with_instruction_blocks(blocks);
460
461        // Optionally wire SemanticMemory when the caller requests memory-on mode.
462        let (mut agent, scenario_db) = if memory_mode == MemoryMode::On
463            && let Some(ref params) = self.memory_params
464        {
465            // One SQLite file per scenario gives strict isolation (NFR-001 choice (a)).
466            // This is more files than a per-run DB, but eliminates any cross-scenario
467            // memory bleed and avoids needing BenchIsolation::reset() between scenarios.
468            let scenario_db = params
469                .data_dir
470                .join(format!("bench-{}-{}.db", params.run_id, scenario.id));
471            debug_assert!(
472                scenario_db.to_string_lossy().contains("bench-"),
473                "NFR-001: bench SQLite path must be namespaced with 'bench-'"
474            );
475
476            tracing::debug!(
477                scenario_id = %scenario.id,
478                path = %scenario_db.display(),
479                "bench: memory init start"
480            );
481            let memory = Arc::new(
482                tokio::time::timeout(
483                    std::time::Duration::from_secs(10),
484                    SemanticMemory::with_sqlite_backend(
485                        scenario_db.to_string_lossy().as_ref(),
486                        self.provider.clone(),
487                        &params.embedding_model,
488                        0.7,
489                        0.3,
490                    ),
491                )
492                .await
493                .map_err(|_| {
494                    BenchError::InvalidFormat(format!(
495                        "SemanticMemory init timed out for scenario '{}'",
496                        scenario.id
497                    ))
498                })?
499                .map_err(|e| BenchError::InvalidFormat(format!("SemanticMemory init: {e}")))?,
500            );
501            tracing::debug!(scenario_id = %scenario.id, "bench: memory init done");
502
503            // Seed the sessions table so persist_message does not fail with FK violation.
504            let conv_id = memory
505                .sqlite()
506                .create_conversation()
507                .await
508                .map_err(|e| BenchError::InvalidFormat(format!("create_conversation: {e}")))?;
509
510            // summarization_threshold = 100_000 deliberately suppresses LLM-driven
511            // compaction during bench runs. Compaction calls another LLM round-trip
512            // with non-deterministic timing/output, which would violate FR-003
513            // (deterministic runs). recall_limit = 20 is generous enough to surface
514            // long-context memory effects without silently capping LongMemEval scores
515            // below their theoretical maximum. history_limit = 200 covers the longest
516            // LongMemEval session without truncation.
517            let wired_agent = base_agent.with_memory(memory, conv_id, 200, 20, 100_000);
518            (wired_agent, Some(scenario_db))
519        } else {
520            (base_agent, None)
521        };
522
523        // Ignore agent errors — a failed LLM call still yields an empty response that
524        // the evaluator scores as 0.0 rather than aborting the entire run.
525        let _ = Box::pin(agent.run()).await;
526        let channel = agent.into_channel();
527        // tool_outputs available for Phase 2 scoring (#4234); log count so future
528        // implementors have a trace even before the evaluator wires them up.
529        tracing::debug!(
530            count = channel.tool_outputs().len(),
531            "bench: tool outputs captured"
532        );
533        let responses = channel.into_responses();
534
535        // Best-effort cleanup: delete per-scenario SQLite file after the run.
536        // Failure is intentionally ignored — NFR-001 is hygiene, not correctness.
537        if let Some(ref db_path) = scenario_db {
538            let _ = std::fs::remove_file(db_path);
539        }
540
541        let raw = responses
542            .into_iter()
543            .last()
544            .map(|r| r.text)
545            .unwrap_or_default();
546
547        Ok(match mode {
548            ResponseMode::TerseAnswer => post_process_response(&raw),
549            // Verified: dropping send_tool_output does NOT affect the agent loop's tool-result
550            // feedback to the LLM. Tool outputs flow via Agent's internal MessagePart::ToolResult,
551            // not via the channel. See crates/zeph-core/src/agent/tool_execution/native.rs.
552            ResponseMode::ToolUse => raw,
553        })
554    }
555}
556
557/// Return the subset of `scenarios` that should run given `opts`.
558///
559/// Validates that when a `scenario_filter` is set, at least one matching scenario exists in
560/// `scenarios`. Then filters out already-completed IDs and non-matching scenarios.
561///
562/// # Errors
563///
564/// Returns [`BenchError::InvalidFormat`] when `opts.scenario_filter` names a scenario that
565/// does not appear in `scenarios`.
566fn filter_scenarios<'a>(
567    scenarios: &'a [Scenario],
568    opts: &RunOptions,
569    loader_name: &str,
570) -> Result<Vec<&'a Scenario>, BenchError> {
571    if let Some(ref filter) = opts.scenario_filter
572        && !scenarios.iter().any(|s| &s.id == filter)
573    {
574        return Err(BenchError::InvalidFormat(format!(
575            "scenario '{filter}' not found in dataset '{loader_name}'"
576        )));
577    }
578
579    Ok(scenarios
580        .iter()
581        .filter(|s| {
582            if opts.completed_ids.contains(&s.id) {
583                return false;
584            }
585            if let Some(ref filter) = opts.scenario_filter {
586                return &s.id == filter;
587            }
588            true
589        })
590        .collect())
591}
592
/// Post-process the raw agent response to extract a clean, terse answer.
///
/// Lines are examined in order and the first one that still has content after
/// cleaning is returned:
/// 1. Code-fence marker lines (```` ``` ```` with an optional info string) are skipped,
///    so an answer wrapped in a fenced block is not lost.
/// 2. Markdown formatting is stripped: leading/trailing `*`, `_` and `` ` ``, plus any
///    remaining `**` and `` ` `` inside the line.
/// 3. Surrounding whitespace is trimmed.
///
/// Lines that consist only of formatting characters (e.g. `**` or `___`) clean to the
/// empty string and are skipped as well. Returns an empty string when no line has content.
///
/// This is a best-effort cleanup. Evaluators still normalize the result, so minor
/// leftover punctuation is handled downstream.
fn post_process_response(raw: &str) -> String {
    raw.lines()
        .filter(|line| !is_code_fence(line.trim()))
        .map(strip_markdown)
        .find(|cleaned| !cleaned.is_empty())
        .unwrap_or_default()
}

/// Returns `true` for a fence marker line such as ```` ``` ```` or ```` ```text ````.
///
/// A line like ```` ```Au``` ```` carries inline content and is not treated as a fence.
fn is_code_fence(line: &str) -> bool {
    line.strip_prefix("```")
        .is_some_and(|rest| !rest.contains('`'))
}

/// Strips common markdown formatting characters and surrounding whitespace from one line.
fn strip_markdown(line: &str) -> String {
    line.trim()
        .trim_matches(|c: char| matches!(c, '*' | '_' | '`' | ' ' | '\t'))
        .replace("**", "")
        .replace('`', "")
        .trim()
        .to_owned()
}
618
/// Build a short, UUID-like run identifier without pulling in the `uuid` crate.
///
/// The ID has the form `bench-<secs>-<nanos>`, both parts in lowercase hex, taken
/// from the current `SystemTime` relative to the Unix epoch. If the clock reads
/// earlier than the epoch, a zero duration is used. The value is not
/// cryptographically random, but is sufficient to tell benchmark runs apart.
fn uuid() -> String {
    let since_epoch = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default();
    format!(
        "bench-{:x}-{:x}",
        since_epoch.as_secs(),
        since_epoch.subsec_nanos()
    )
}
630
#[cfg(test)]
mod tests {
    //! Unit tests for the runner's option defaults, builder, ID helper and
    //! response post-processing.

    use super::*;

    #[test]
    fn run_options_default_is_empty() {
        let opts = RunOptions::default();
        assert!(opts.scenario_filter.is_none());
        assert!(opts.completed_ids.is_empty());
        assert_eq!(opts.memory_mode, MemoryMode::Off);
    }

    #[test]
    fn memory_mode_default_is_off() {
        assert_eq!(MemoryMode::default(), MemoryMode::Off);
    }

    #[test]
    fn with_memory_params_sets_isolation() {
        use zeph_llm::{any::AnyProvider, mock::MockProvider};
        let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
        let params = BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: "bench-abc".into(),
            dataset: "locomo".into(),
        };
        // `params` is not used afterwards, so it is moved rather than cloned.
        let runner = BenchRunner::new(provider).with_memory_params(params);
        assert!(runner.memory_params.is_some());
        let stored = runner.memory_params.unwrap();
        assert_eq!(stored.run_id, "bench-abc");
        assert_eq!(stored.dataset, "locomo");
    }

    #[test]
    fn nfr_001_sqlite_path_namespaced() {
        let params = BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: "run-xyz".into(),
            dataset: "locomo".into(),
        };
        let scenario_id = "s1_0";
        let scenario_db = params
            .data_dir
            .join(format!("bench-{}-{}.db", params.run_id, scenario_id));
        // Check the file name itself: the directory already contains "bench-", so a
        // whole-path `contains` check would pass regardless of the file name format.
        let file_name = scenario_db
            .file_name()
            .and_then(|name| name.to_str())
            .expect("joined path ends in a UTF-8 file name");
        assert!(
            file_name.starts_with("bench-"),
            "NFR-001: SQLite path must contain bench- prefix"
        );
        assert_eq!(file_name, "bench-run-xyz-s1_0.db");
    }

    #[test]
    fn now_rfc3339_has_correct_format() {
        let ts = timestamp::utc_now_rfc3339();
        // e.g. "2026-04-25T10:30:00Z"
        assert_eq!(ts.len(), 20);
        assert!(ts.ends_with('Z'));
        assert!(ts.contains('T'));
    }

    #[test]
    fn uuid_generates_non_empty_string() {
        let id = uuid();
        assert!(id.starts_with("bench-"));
        assert!(id.len() > 10);
    }

    #[test]
    fn post_process_takes_first_line() {
        let raw = "1945\n\nWorld War II ended in 1945.";
        assert_eq!(post_process_response(raw), "1945");
    }

    #[test]
    fn post_process_strips_markdown_bold() {
        assert_eq!(post_process_response("**1945**"), "1945");
    }

    #[test]
    fn post_process_strips_backticks() {
        assert_eq!(post_process_response("`Au`"), "Au");
    }

    #[test]
    fn post_process_trims_whitespace() {
        assert_eq!(post_process_response("  Paris  "), "Paris");
    }

    #[test]
    fn post_process_empty_input_returns_empty() {
        assert_eq!(post_process_response(""), "");
    }

    #[test]
    fn post_process_skips_empty_leading_lines() {
        let raw = "\n\n  \nParis";
        assert_eq!(post_process_response(raw), "Paris");
    }
}