zeph_bench/
runner.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Benchmark runner: drives `Agent<BenchmarkChannel>` over a dataset and collects results.
5//!
//! [`BenchRunner`] is the execution engine for `zeph bench run`. By default it runs in
//! baseline mode (no tools, no memory, no MCP): each scenario is run in isolation through
//! a fresh [`BenchmarkChannel`] and the agent's response is scored by the supplied
//! [`Evaluator`]. Memory can be switched on per run via [`RunOptions::memory_mode`], and
//! tool-driven datasets go through [`BenchRunner::run_dataset_with_env_factory`].
10//!
11//! # Usage
12//!
13//! ```no_run
14//! use std::path::Path;
15//! use zeph_bench::runner::{BenchRunner, RunOptions};
16//! use zeph_bench::loaders::{GaiaLoader, GaiaEvaluator};
17//! use zeph_llm::{any::AnyProvider, mock::MockProvider};
18//!
19//! # async fn example() -> Result<(), zeph_bench::BenchError> {
20//! let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["1945".into()]));
21//! let runner = BenchRunner::new(provider, false);
22//! let opts = RunOptions::default();
23//! let run = runner.run_dataset(&GaiaLoader::all_levels(), &GaiaEvaluator, Path::new("/data/gaia.jsonl"), opts).await?;
24//! println!("mean score: {:.4}", run.aggregate.mean_score);
25//! # Ok(())
26//! # }
27//! ```
28
29use std::collections::HashSet;
30use std::path::{Path, PathBuf};
31use std::sync::Arc;
32use std::time::Instant;
33
34use zeph_core::agent::Agent;
35use zeph_core::instructions::InstructionBlock;
36use zeph_llm::any::AnyProvider;
37use zeph_llm::provider::LlmProvider as _;
38use zeph_memory::semantic::SemanticMemory;
39use zeph_skills::registry::SkillRegistry;
40use zeph_tools::executor::{ToolError, ToolExecutor, ToolOutput};
41
42use crate::channel::BenchmarkChannel;
43use crate::error::BenchError;
44use crate::loaders::tau2_bench::{ActionTrace, TauBenchEvaluator};
45use crate::results::{BenchRun, RunStatus, ScenarioResult};
46use crate::scenario::{DatasetLoader, Evaluator, Scenario};
47
/// Controls how the runner processes the agent's raw text response.
///
/// Used by `BenchRunner::run_one_with_executor` to select the appropriate
/// system prompt and post-processing behaviour.
///
/// The mode is chosen by the runner's entry points, not by [`RunOptions`]:
/// the `run_dataset` path always uses [`ResponseMode::TerseAnswer`], while
/// [`BenchRunner::run_dataset_with_env_factory`] always uses [`ResponseMode::ToolUse`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResponseMode {
    /// Inject a "shortest possible answer" system prompt and strip markdown from the response.
    ///
    /// The last agent response is passed through `post_process_response` before scoring.
    ///
    /// Used by all knowledge-retrieval datasets (GAIA, LOCOMO, FRAMES, `LongMemEval`).
    TerseAnswer,
    /// Inject a tool-use system prompt; return the raw agent response without post-processing.
    ///
    /// Used by tau2-bench where the evaluation is based on the action trace, not text output.
    ToolUse,
}
63
/// Controls whether `SemanticMemory` is wired into the agent during a benchmark run.
///
/// Selecting [`MemoryMode::On`] only takes effect when the runner was also given
/// parameters via [`BenchRunner::with_memory_params`]; without them the scenario
/// runs with a memory-less agent.
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::MemoryMode;
///
/// assert_eq!(MemoryMode::default(), MemoryMode::Off);
/// ```
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum MemoryMode {
    /// No `SemanticMemory` — current default behaviour.
    #[default]
    Off,
    /// Wire a `SQLite`-backed `SemanticMemory` into the agent via `Agent::with_memory`.
    ///
    /// A separate database file is created for every scenario.
    On,
}
81
/// Parameters required to construct a per-scenario `SQLite`-backed `SemanticMemory`.
///
/// Populated by [`BenchRunner::with_memory_params`] and consumed by the runner's
/// per-scenario execution when `opts.memory_mode == MemoryMode::On`.
///
/// The runner derives one database file per scenario inside `data_dir`, with a file
/// name of the form `bench-<run_id>-<scenario id>.db`.
///
/// # Examples
///
/// ```
/// use std::path::PathBuf;
/// use zeph_bench::runner::BenchMemoryParams;
///
/// let params = BenchMemoryParams {
///     data_dir: PathBuf::from("/tmp/bench"),
///     embedding_model: "nomic-embed-text".into(),
///     run_id: "bench-abc".into(),
///     dataset: "locomo".into(),
/// };
/// assert!(params.data_dir.to_string_lossy().contains("bench"));
/// ```
#[derive(Debug, Clone)]
pub struct BenchMemoryParams {
    /// Directory where per-scenario `SQLite` files live (deleted between scenarios).
    ///
    /// The derived path always contains the `bench-` segment (NFR-001).
    pub data_dir: PathBuf,
    /// Embedding model name passed to `SemanticMemory`.
    pub embedding_model: String,
    /// Run ID used to namespace bench artifacts; matches the outer `BenchRun.run_id`.
    ///
    /// NOTE(review): the runner only embeds this value in the database file name; it
    /// does not itself check that it equals the `run_id` it generates — confirm the
    /// caller keeps the two in sync.
    pub run_id: String,
    /// Dataset name used to namespace bench artifacts.
    ///
    /// NOTE(review): not read by the runner code in this module; presumably consumed
    /// by callers elsewhere — confirm.
    pub dataset: String,
}
114
/// Options that control which scenarios are executed and whether to resume a prior run.
///
/// Build via [`RunOptions::default`] and override the fields you need.
///
/// Both skip rules are applied to every scenario: a scenario listed in
/// `completed_ids` is skipped even when it is the one named by `scenario_filter`.
///
/// # Examples
///
/// ```
/// use zeph_bench::runner::{RunOptions, MemoryMode};
///
/// // Run all scenarios.
/// let opts = RunOptions::default();
/// assert!(opts.scenario_filter.is_none());
/// assert!(opts.completed_ids.is_empty());
/// assert_eq!(opts.memory_mode, MemoryMode::Off);
/// ```
#[derive(Debug, Default)]
pub struct RunOptions {
    /// When `Some(id)`, only the scenario with this ID is executed.
    ///
    /// The runner returns `BenchError::InvalidFormat` when no scenario in the loaded
    /// dataset has this ID.
    pub scenario_filter: Option<String>,
    /// Set of scenario IDs already completed in a prior run (used for `--resume`).
    ///
    /// Scenarios in this set are skipped and do not appear in the returned results.
    pub completed_ids: HashSet<String>,
    /// Whether to wire a `SemanticMemory` backend into the agent for this run.
    pub memory_mode: MemoryMode,
}
139
/// Minimal no-op tool executor for baseline benchmark runs.
///
/// Returns an empty tool list and `Ok(None)` on every execute call, ensuring that
/// the agent loop cannot invoke any tools during a benchmark run.
///
/// NOTE(review): only `execute` is implemented here, so the "empty tool list" part
/// presumably comes from default methods on `ToolExecutor` — confirm against `zeph_tools`.
struct NoopExecutor;

impl ToolExecutor for NoopExecutor {
    /// Always returns `Ok(None)`, regardless of the response text.
    async fn execute(&self, _response: &str) -> Result<Option<ToolOutput>, ToolError> {
        Ok(None)
    }
}
151
/// Drives [`Agent<BenchmarkChannel>`] over a dataset and collects scored results.
///
/// Each call to [`run_dataset`][BenchRunner::run_dataset] creates a fresh agent per
/// scenario (baseline mode: no tools, no MCP). Memory is optionally wired via
/// [`BenchRunner::with_memory_params`] and [`RunOptions::memory_mode`].
///
/// Tool-driven datasets use
/// [`run_dataset_with_env_factory`][BenchRunner::run_dataset_with_env_factory] instead,
/// which gives every scenario its own tool executor.
///
/// # Examples
///
/// ```no_run
/// use zeph_bench::runner::BenchRunner;
/// use zeph_llm::{any::AnyProvider, mock::MockProvider};
///
/// let provider = AnyProvider::Mock(MockProvider::with_responses(vec!["Paris".into()]));
/// let runner = BenchRunner::new(provider, false);
/// ```
pub struct BenchRunner {
    /// LLM provider; cloned for every scenario's agent (and for `SemanticMemory`
    /// when memory is enabled).
    provider: AnyProvider,
    /// Parameters for constructing per-scenario `SQLite`-backed `SemanticMemory`.
    ///
    /// Set via [`BenchRunner::with_memory_params`]; required when
    /// `RunOptions::memory_mode == MemoryMode::On`.
    memory_params: Option<BenchMemoryParams>,
}
175
176impl BenchRunner {
177    /// Create a new runner with the given provider.
178    ///
179    /// The `no_deterministic` argument is unused at runtime but kept in the public API
180    /// so the bench command can pass it through for future use (e.g., logging or config).
181    /// Apply deterministic overrides to `provider` before calling this if needed.
182    ///
183    /// # Examples
184    ///
185    /// ```no_run
186    /// use zeph_bench::runner::BenchRunner;
187    /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
188    ///
189    /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
190    /// let runner = BenchRunner::new(provider, false);
191    /// ```
192    #[must_use]
193    pub fn new(provider: AnyProvider, _no_deterministic: bool) -> Self {
194        Self {
195            provider,
196            memory_params: None,
197        }
198    }
199
200    /// Attach `SemanticMemory` parameters for memory-on benchmark runs.
201    ///
202    /// When set, a per-scenario `SQLite`-backed `SemanticMemory` is constructed inside
203    /// [`run_one`][BenchRunner::run_one] whenever `opts.memory_mode == MemoryMode::On`.
204    ///
205    /// # Examples
206    ///
207    /// ```no_run
208    /// use std::path::PathBuf;
209    /// use zeph_bench::runner::{BenchRunner, BenchMemoryParams};
210    /// use zeph_llm::{any::AnyProvider, mock::MockProvider};
211    ///
212    /// let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
213    /// let params = BenchMemoryParams {
214    ///     data_dir: PathBuf::from("/tmp/bench-data"),
215    ///     embedding_model: "nomic-embed-text".into(),
216    ///     run_id: "bench-abc".into(),
217    ///     dataset: "locomo".into(),
218    /// };
219    /// let runner = BenchRunner::new(provider, false).with_memory_params(params);
220    /// ```
221    #[must_use]
222    pub fn with_memory_params(mut self, params: BenchMemoryParams) -> Self {
223        self.memory_params = Some(params);
224        self
225    }
226
227    /// Run all matching scenarios from `path` through the agent and return a [`BenchRun`].
228    ///
229    /// For each scenario:
230    /// 1. Builds a fresh `Agent<BenchmarkChannel>` with no tools or memory.
231    /// 2. Feeds the scenario prompt and collects the agent's response.
232    /// 3. Scores the response with `evaluator`.
233    /// 4. Appends a [`ScenarioResult`] and recomputes aggregate statistics.
234    ///
235    /// The returned [`BenchRun`] has `status = Running` until the caller sets it to
236    /// `Completed` or `Interrupted`.
237    ///
238    /// # Errors
239    ///
240    /// Returns [`BenchError`] if the dataset cannot be loaded or a scenario run fails.
241    pub async fn run_dataset<L, E>(
242        &self,
243        loader: &L,
244        evaluator: &E,
245        path: &Path,
246        opts: RunOptions,
247    ) -> Result<BenchRun, BenchError>
248    where
249        L: DatasetLoader,
250        E: Evaluator,
251    {
252        let scenarios = loader.load(path)?;
253
254        if let Some(ref filter) = opts.scenario_filter
255            && !scenarios.iter().any(|s| &s.id == filter)
256        {
257            return Err(BenchError::InvalidFormat(format!(
258                "scenario '{}' not found in dataset '{}'",
259                filter,
260                loader.name()
261            )));
262        }
263
264        let model_id = self.provider.model_identifier().to_owned();
265
266        let mut run = BenchRun {
267            dataset: loader.name().to_owned(),
268            model: model_id,
269            run_id: uuid(),
270            started_at: now_rfc3339(),
271            finished_at: String::new(),
272            status: RunStatus::Running,
273            results: vec![],
274            aggregate: crate::results::Aggregate::default(),
275        };
276
277        for scenario in &scenarios {
278            // Skip if resume is active and scenario already completed.
279            if opts.completed_ids.contains(&scenario.id) {
280                continue;
281            }
282            // Skip if a single-scenario filter is active.
283            if let Some(ref filter) = opts.scenario_filter
284                && &scenario.id != filter
285            {
286                continue;
287            }
288
289            let t0 = Instant::now();
290            let response_text = Box::pin(self.run_one(scenario, opts.memory_mode)).await?;
291            let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
292
293            let eval = evaluator.evaluate(scenario, &response_text);
294            let excerpt = response_text.chars().take(200).collect::<String>();
295
296            run.results.push(ScenarioResult {
297                scenario_id: scenario.id.clone(),
298                score: eval.score,
299                response_excerpt: excerpt,
300                error: None,
301                elapsed_ms,
302            });
303            run.recompute_aggregate();
304        }
305
306        Ok(run)
307    }
308
309    /// Run all scenarios from `path` through a per-scenario env executor and return a [`BenchRun`].
310    ///
311    /// This is the execution path for tool-driven datasets (tau2-bench). For each scenario:
312    /// 1. Calls `env_factory(scenario)` to build a fresh `(ToolExecutor, ActionTrace)`.
313    /// 2. Builds a fresh `TauBenchEvaluator` from the scenario metadata and the trace.
314    /// 3. Runs the agent with the env executor and the tool-use system prompt.
315    /// 4. Scores the response via the evaluator (reads the populated trace).
316    ///
317    /// # Errors
318    ///
319    /// Returns [`BenchError`] if the dataset cannot be loaded, the env factory fails, or
320    /// `TauBenchEvaluator::from_scenario` fails (malformed metadata).
321    pub async fn run_dataset_with_env_factory<L, F, X>(
322        &self,
323        loader: &L,
324        env_factory: F,
325        path: &Path,
326        opts: RunOptions,
327    ) -> Result<BenchRun, BenchError>
328    where
329        L: DatasetLoader,
330        F: Fn(&Scenario) -> Result<(X, ActionTrace), BenchError>,
331        X: ToolExecutor + Send + Sync + 'static,
332    {
333        let scenarios = loader.load(path)?;
334
335        if let Some(ref filter) = opts.scenario_filter
336            && !scenarios.iter().any(|s| &s.id == filter)
337        {
338            return Err(BenchError::InvalidFormat(format!(
339                "scenario '{}' not found in dataset '{}'",
340                filter,
341                loader.name()
342            )));
343        }
344
345        let model_id = self.provider.model_identifier().to_owned();
346
347        let mut run = BenchRun {
348            dataset: loader.name().to_owned(),
349            model: model_id,
350            run_id: uuid(),
351            started_at: now_rfc3339(),
352            finished_at: String::new(),
353            status: RunStatus::Running,
354            results: vec![],
355            aggregate: crate::results::Aggregate::default(),
356        };
357
358        for scenario in &scenarios {
359            if opts.completed_ids.contains(&scenario.id) {
360                continue;
361            }
362            if let Some(ref filter) = opts.scenario_filter
363                && &scenario.id != filter
364            {
365                continue;
366            }
367
368            let (executor, trace) = env_factory(scenario)?;
369            let evaluator = TauBenchEvaluator::from_scenario(scenario, trace)?;
370
371            let t0 = Instant::now();
372            let response_text = Box::pin(self.run_one_with_executor(
373                scenario,
374                executor,
375                opts.memory_mode,
376                ResponseMode::ToolUse,
377            ))
378            .await?;
379            let elapsed_ms = u64::try_from(t0.elapsed().as_millis()).unwrap_or(u64::MAX);
380
381            let eval = evaluator.evaluate(scenario, &response_text);
382            let excerpt = response_text.chars().take(200).collect::<String>();
383
384            run.results.push(ScenarioResult {
385                scenario_id: scenario.id.clone(),
386                score: eval.score,
387                response_excerpt: excerpt,
388                error: None,
389                elapsed_ms,
390            });
391            run.recompute_aggregate();
392        }
393
394        Ok(run)
395    }
396
397    /// Run a single scenario through a fresh agent and return the last response text.
398    ///
399    /// A concise-answer system prompt is injected via [`InstructionBlock`] so the model
400    /// responds with only the final answer (a number, word, or short phrase) rather than
401    /// full sentences. The raw response is then post-processed to extract the first
402    /// non-empty line and strip markdown formatting, which further reduces noise for
403    /// evaluators that perform exact or near-exact matching.
404    ///
405    /// When `memory_mode == MemoryMode::On`, a per-scenario `SQLite`-backed
406    /// `SemanticMemory` is constructed and wired into the agent. The database file is
407    /// deleted after the scenario completes (best-effort, NFR-001).
408    ///
409    /// # Errors
410    ///
411    /// Returns [`BenchError::InvalidFormat`] when the scenario has no user turn or when
412    /// `SemanticMemory` initialisation fails.
413    async fn run_one(
414        &self,
415        scenario: &Scenario,
416        memory_mode: MemoryMode,
417    ) -> Result<String, BenchError> {
418        Box::pin(self.run_one_with_executor(
419            scenario,
420            NoopExecutor,
421            memory_mode,
422            ResponseMode::TerseAnswer,
423        ))
424        .await
425    }
426
427    /// Core execution: run one scenario with the given executor and response mode.
428    ///
429    /// Called by both [`BenchRunner::run_dataset`] (with `NoopExecutor` + `TerseAnswer`) and
430    /// [`BenchRunner::run_dataset_with_env_factory`] (with the domain env + `ToolUse`).
431    async fn run_one_with_executor<X: ToolExecutor + Send + Sync + 'static>(
432        &self,
433        scenario: &Scenario,
434        executor: X,
435        memory_mode: MemoryMode,
436        mode: ResponseMode,
437    ) -> Result<String, BenchError> {
438        let prompt = scenario.primary_prompt()?.to_owned();
439        let channel = BenchmarkChannel::new(vec![prompt]);
440        // TODO(multi-turn-history): when loaders emit multiple user turns, push each in
441        // order and seed assistant turns into the channel as captured-history.
442        let registry = SkillRegistry::empty();
443
444        let system_content = match mode {
445            ResponseMode::TerseAnswer => concat!(
446                "You are an evaluation assistant. ",
447                "Answer every question with the shortest possible response. ",
448                "Give only the final answer — no explanation, no full sentences, ",
449                "no punctuation unless it is part of the answer. ",
450                "If the answer is a single word or number, respond with only that word or number."
451            ),
452            ResponseMode::ToolUse => concat!(
453                "You are a customer-service agent. ",
454                "Use the available tools to help the user. ",
455                "Always call a tool when one applies; do not ask the user to perform actions you can perform yourself. ",
456                "When you have completed the user's request, respond with a brief confirmation."
457            ),
458        };
459
460        let blocks = vec![InstructionBlock {
461            source: PathBuf::from("<bench-system-prompt>"),
462            content: system_content.to_owned(),
463        }];
464
465        let base_agent = Agent::new(self.provider.clone(), channel, registry, None, 1, executor)
466            .with_instruction_blocks(blocks);
467
468        // Optionally wire SemanticMemory when the caller requests memory-on mode.
469        let (mut agent, scenario_db) = if memory_mode == MemoryMode::On
470            && let Some(ref params) = self.memory_params
471        {
472            // One SQLite file per scenario gives strict isolation (NFR-001 choice (a)).
473            // This is more files than a per-run DB, but eliminates any cross-scenario
474            // memory bleed and avoids needing BenchIsolation::reset() between scenarios.
475            let scenario_db = params
476                .data_dir
477                .join(format!("bench-{}-{}.db", params.run_id, scenario.id));
478            debug_assert!(
479                scenario_db.to_string_lossy().contains("bench-"),
480                "NFR-001: bench SQLite path must be namespaced with 'bench-'"
481            );
482
483            tracing::debug!(
484                scenario_id = %scenario.id,
485                path = %scenario_db.display(),
486                "bench: memory init start"
487            );
488            let memory = Arc::new(
489                tokio::time::timeout(
490                    std::time::Duration::from_secs(10),
491                    SemanticMemory::with_sqlite_backend(
492                        scenario_db.to_string_lossy().as_ref(),
493                        self.provider.clone(),
494                        &params.embedding_model,
495                        0.7,
496                        0.3,
497                    ),
498                )
499                .await
500                .map_err(|_| {
501                    BenchError::InvalidFormat(format!(
502                        "SemanticMemory init timed out for scenario '{}'",
503                        scenario.id
504                    ))
505                })?
506                .map_err(|e| BenchError::InvalidFormat(format!("SemanticMemory init: {e}")))?,
507            );
508            tracing::debug!(scenario_id = %scenario.id, "bench: memory init done");
509
510            // Seed the sessions table so persist_message does not fail with FK violation.
511            let conv_id = memory
512                .sqlite()
513                .create_conversation()
514                .await
515                .map_err(|e| BenchError::InvalidFormat(format!("create_conversation: {e}")))?;
516
517            // summarization_threshold = 100_000 deliberately suppresses LLM-driven
518            // compaction during bench runs. Compaction calls another LLM round-trip
519            // with non-deterministic timing/output, which would violate FR-003
520            // (deterministic runs). recall_limit = 20 is generous enough to surface
521            // long-context memory effects without silently capping LongMemEval scores
522            // below their theoretical maximum. history_limit = 200 covers the longest
523            // LongMemEval session without truncation.
524            let wired_agent = base_agent.with_memory(memory, conv_id, 200, 20, 100_000);
525            (wired_agent, Some(scenario_db))
526        } else {
527            (base_agent, None)
528        };
529
530        // Ignore agent errors — a failed LLM call still yields an empty response that
531        // the evaluator scores as 0.0 rather than aborting the entire run.
532        let _ = Box::pin(agent.run()).await;
533        let responses = agent.into_channel().into_responses();
534
535        // Best-effort cleanup: delete per-scenario SQLite file after the run.
536        // Failure is intentionally ignored — NFR-001 is hygiene, not correctness.
537        if let Some(ref db_path) = scenario_db {
538            let _ = std::fs::remove_file(db_path);
539        }
540
541        let raw = responses
542            .into_iter()
543            .last()
544            .map(|r| r.text)
545            .unwrap_or_default();
546
547        Ok(match mode {
548            ResponseMode::TerseAnswer => post_process_response(&raw),
549            // Verified: dropping send_tool_output does NOT affect the agent loop's tool-result
550            // feedback to the LLM. Tool outputs flow via Agent's internal MessagePart::ToolResult,
551            // not via the channel. See crates/zeph-core/src/agent/tool_execution/native.rs.
552            ResponseMode::ToolUse => raw,
553        })
554    }
555}
556
/// Reduce a raw agent response to a short answer string.
///
/// The steps, in order:
/// 1. Pick the first line that is non-empty after trimming; anything after it is
///    treated as trailing explanation and dropped.
/// 2. Peel markdown delimiters (`*`, `_`, `` ` ``) plus spaces and tabs off both ends.
/// 3. Delete every remaining `**` pair, then every remaining backtick.
/// 4. Trim surrounding whitespace once more.
///
/// Returns an empty string when the input has no non-empty line. This is a
/// best-effort cleanup; evaluators still normalise the result downstream.
fn post_process_response(raw: &str) -> String {
    /// Characters stripped from both ends of the answer line.
    const EDGE_NOISE: &[char] = &['*', '_', '`', ' ', '\t'];

    let answer_line = raw.lines().find_map(|line| {
        let trimmed = line.trim();
        (!trimmed.is_empty()).then_some(trimmed)
    });
    let Some(answer_line) = answer_line else {
        return String::new();
    };

    let core = answer_line.trim_matches(EDGE_NOISE);
    // Order matters: bold markers are removed before backticks, so a backtick that
    // separates two asterisks does not create a new `**` pair that would then vanish.
    let without_bold = core.replace("**", "");
    let without_code = without_bold.replace('`', "");
    without_code.trim().to_owned()
}
582
/// Generate a short pseudo-UUID-like run ID without the `uuid` crate.
///
/// The ID has the form `bench-<secs>-<nanos>`, where both parts are lowercase hex and
/// come from a single `SystemTime` reading, so the seconds and sub-second nanoseconds
/// always describe the same instant. If the clock reports a time before the Unix
/// epoch, both parts are `0`.
///
/// Not cryptographically random but sufficient for benchmark run identification.
fn uuid() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};
    // Read the clock once: two separate reads could straddle a second boundary and
    // pair the seconds of one instant with the nanoseconds of another.
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default();
    format!(
        "bench-{:x}-{:x}",
        since_epoch.as_secs(),
        since_epoch.subsec_nanos()
    )
}
597
598/// RFC 3339-like timestamp using `std` only (no chrono).
599fn now_rfc3339() -> String {
600    use std::time::{SystemTime, UNIX_EPOCH};
601    let secs = SystemTime::now()
602        .duration_since(UNIX_EPOCH)
603        .map_or(0, |d| d.as_secs());
604    // Minimal ISO 8601 UTC representation — good enough for result metadata.
605    let (y, mo, d, h, mi, s) = secs_to_ymdhms(secs);
606    format!("{y:04}-{mo:02}-{d:02}T{h:02}:{mi:02}:{s:02}Z")
607}
608
/// Split a Unix timestamp (seconds since 1970-01-01T00:00:00Z) into UTC calendar
/// fields, returned as `(year, month, day, hour, minute, second)`.
///
/// `month` is 1-12 and `day` is 1-31. The date part uses the proleptic Gregorian
/// calendar, computed over 400-year eras that start on 1 March so the leap day falls
/// at the end of each shifted year.
fn secs_to_ymdhms(secs: u64) -> (u64, u64, u64, u64, u64, u64) {
    const SECS_PER_DAY: u64 = 86_400;
    const DAYS_PER_ERA: u64 = 146_097;
    // Days between 0000-03-01 and 1970-01-01.
    const EPOCH_SHIFT_DAYS: u64 = 719_468;

    // Time of day.
    let second_of_day = secs % SECS_PER_DAY;
    let hour = second_of_day / 3_600;
    let minute = second_of_day % 3_600 / 60;
    let second = second_of_day % 60;

    // Date, counted in days since 0000-03-01.
    let shifted_days = secs / SECS_PER_DAY + EPOCH_SHIFT_DAYS;
    let era = shifted_days / DAYS_PER_ERA;
    let day_of_era = shifted_days % DAYS_PER_ERA;
    let year_of_era =
        (day_of_era - day_of_era / 1460 + day_of_era / 36_524 - day_of_era / 146_096) / 365;
    let day_of_year = day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);

    // Month index counted from March (0 = March, ..., 11 = February).
    let march_based_month = (5 * day_of_year + 2) / 153;
    let day = day_of_year - (153 * march_based_month + 2) / 5 + 1;

    // January and February belong to the next civil year of the March-based year.
    let (month, year_carry) = if march_based_month < 10 {
        (march_based_month + 3, 0)
    } else {
        (march_based_month - 9, 1)
    };
    let year = year_of_era + era * 400 + year_carry;

    (year, month, day, hour, minute, second)
}
635
/// Unit tests for the runner's option types and its pure helper functions.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn run_options_default_is_empty() {
        let opts = RunOptions::default();
        assert!(opts.scenario_filter.is_none());
        assert!(opts.completed_ids.is_empty());
        assert_eq!(opts.memory_mode, MemoryMode::Off);
    }

    #[test]
    fn memory_mode_default_is_off() {
        assert_eq!(MemoryMode::default(), MemoryMode::Off);
    }

    #[test]
    fn with_memory_params_sets_isolation() {
        use zeph_llm::{any::AnyProvider, mock::MockProvider};
        let provider = AnyProvider::Mock(MockProvider::with_responses(vec![]));
        let params = BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: "bench-abc".into(),
            dataset: "locomo".into(),
        };
        // `params` is not needed afterwards, so it is moved rather than cloned.
        let runner = BenchRunner::new(provider, false).with_memory_params(params);
        let stored = runner
            .memory_params
            .expect("with_memory_params must store the params");
        assert_eq!(stored.run_id, "bench-abc");
        assert_eq!(stored.dataset, "locomo");
    }

    #[test]
    fn nfr_001_sqlite_path_namespaced() {
        let params = BenchMemoryParams {
            data_dir: std::path::PathBuf::from("/tmp/bench-data"),
            embedding_model: "nomic-embed-text".into(),
            run_id: "run-xyz".into(),
            dataset: "locomo".into(),
        };
        let scenario_id = "s1_0";
        let scenario_db = params
            .data_dir
            .join(format!("bench-{}-{}.db", params.run_id, scenario_id));
        assert!(
            scenario_db.to_string_lossy().contains("bench-"),
            "NFR-001: SQLite path must contain bench- prefix"
        );
    }

    #[test]
    fn now_rfc3339_has_correct_format() {
        let ts = now_rfc3339();
        // e.g. "2026-04-25T10:30:00Z"
        assert_eq!(ts.len(), 20);
        assert!(ts.ends_with('Z'));
        assert!(ts.contains('T'));
        // Separators sit at fixed positions in `YYYY-MM-DDTHH:MM:SSZ`.
        let bytes = ts.as_bytes();
        assert_eq!(bytes[4], b'-');
        assert_eq!(bytes[7], b'-');
        assert_eq!(bytes[10], b'T');
        assert_eq!(bytes[13], b':');
        assert_eq!(bytes[16], b':');
    }

    #[test]
    fn secs_to_ymdhms_unix_epoch() {
        assert_eq!(secs_to_ymdhms(0), (1970, 1, 1, 0, 0, 0));
        assert_eq!(secs_to_ymdhms(86_399), (1970, 1, 1, 23, 59, 59));
    }

    #[test]
    fn secs_to_ymdhms_handles_leap_days() {
        // 2000 is a leap year (divisible by 400).
        assert_eq!(secs_to_ymdhms(951_782_400), (2000, 2, 29, 0, 0, 0));
        assert_eq!(secs_to_ymdhms(1_709_251_199), (2024, 2, 29, 23, 59, 59));
        assert_eq!(secs_to_ymdhms(1_709_251_200), (2024, 3, 1, 0, 0, 0));
        // 2100 is not a leap year (divisible by 100 but not by 400).
        assert_eq!(secs_to_ymdhms(4_107_542_399), (2100, 2, 28, 23, 59, 59));
        assert_eq!(secs_to_ymdhms(4_107_542_400), (2100, 3, 1, 0, 0, 0));
    }

    #[test]
    fn secs_to_ymdhms_known_timestamp() {
        assert_eq!(secs_to_ymdhms(1_000_000_000), (2001, 9, 9, 1, 46, 40));
    }

    #[test]
    fn uuid_generates_non_empty_string() {
        let id = uuid();
        assert!(id.starts_with("bench-"));
        assert!(id.len() > 10);
    }

    #[test]
    fn post_process_takes_first_line() {
        let raw = "1945\n\nWorld War II ended in 1945.";
        assert_eq!(post_process_response(raw), "1945");
    }

    #[test]
    fn post_process_strips_markdown_bold() {
        assert_eq!(post_process_response("**1945**"), "1945");
    }

    #[test]
    fn post_process_strips_backticks() {
        assert_eq!(post_process_response("`Au`"), "Au");
    }

    #[test]
    fn post_process_strips_surrounding_italics() {
        assert_eq!(post_process_response("_Paris_"), "Paris");
        assert_eq!(post_process_response("*Paris*"), "Paris");
    }

    #[test]
    fn post_process_trims_whitespace() {
        assert_eq!(post_process_response("  Paris  "), "Paris");
    }

    #[test]
    fn post_process_empty_input_returns_empty() {
        assert_eq!(post_process_response(""), "");
    }

    #[test]
    fn post_process_skips_empty_leading_lines() {
        let raw = "\n\n  \nParis";
        assert_eq!(post_process_response(raw), "Paris");
    }
}
zeph_bench/runner.rs

zeph_bench/
runner.rs