Skip to main content

mnem_bench/
bench.rs

//! Benchmark catalog + dataset metadata.
//!
//! Adding a new benchmark = adding a `Bench` variant, listing it in
//! [`Bench::all`] (the catalog that [`Bench::from_id`] searches), and
//! filling in [`Bench::metadata`].
5
6use serde::{Deserialize, Serialize};
7use std::fmt;
8
9/// Benchmarks the 0.1.0 harness ships. Order matches the TUI display
10/// order.
11#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
12#[serde(rename_all = "kebab-case")]
13pub enum Bench {
14    /// LongMemEval-S, per-session chunking variant.
15    LongMemEval,
16    /// LoCoMo, session-granularity.
17    Locomo,
18    /// ConvoMem (Snap 2024).
19    Convomem,
20    /// MemBench simple/roles slice.
21    MembenchSimpleRoles,
22    /// MemBench high-level/movie slice.
23    MembenchHighlevelMovie,
24    /// LongMemEval with the v4 hybrid post-filter.
25    LongMemEvalHybridV4,
26}
27
28/// Static metadata for one benchmark.
29#[derive(Clone, Debug, Serialize, Deserialize)]
30pub struct BenchMeta {
31    /// Stable identifier used on the CLI (`--benches longmemeval`).
32    pub id: &'static str,
33    /// Human-readable display name (TUI + RESULTS.md headers).
34    pub display: &'static str,
35    /// Approximate wall time in seconds for the full run on a
36    /// typical laptop (cpu-local mode, ONNX MiniLM). Surfaced in the
37    /// TUI so users know what they signed up for.
38    pub eta_seconds: u64,
39    /// Approximate dataset size on disk (bytes).
40    pub dataset_bytes: u64,
41    /// One-line description shown in `mnem bench list`.
42    pub description: &'static str,
43}
44
45impl Bench {
46    /// Static catalog. Single source of truth.
47    #[must_use]
48    pub const fn all() -> &'static [Bench] {
49        &[
50            Bench::LongMemEval,
51            Bench::Locomo,
52            Bench::Convomem,
53            Bench::MembenchSimpleRoles,
54            Bench::MembenchHighlevelMovie,
55            Bench::LongMemEvalHybridV4,
56        ]
57    }
58
59    /// Look up by stable id (case-insensitive). Returns `None` for
60    /// an unknown id.
61    #[must_use]
62    pub fn from_id(s: &str) -> Option<Self> {
63        let lower = s.to_ascii_lowercase();
64        for b in Self::all() {
65            if b.metadata().id.eq_ignore_ascii_case(&lower) {
66                return Some(*b);
67            }
68        }
69        None
70    }
71
72    /// Static metadata for this benchmark.
73    #[must_use]
74    pub const fn metadata(self) -> BenchMeta {
75        match self {
76            Self::LongMemEval => BenchMeta {
77                id: "longmemeval",
78                display: "LongMemEval (per-session, 500q)",
79                eta_seconds: 600,
80                dataset_bytes: 264 * 1024 * 1024,
81                description: "500 questions, MAX-aggregate turn->session, R@5 / R@10.",
82            },
83            Self::Locomo => BenchMeta {
84                id: "locomo",
85                display: "LoCoMo (session granularity)",
86                eta_seconds: 300,
87                dataset_bytes: 3 * 1024 * 1024,
88                description: "10 conversations x ~200 QA, per-conv label, session R@5 / R@10.",
89            },
90            Self::Convomem => BenchMeta {
91                id: "convomem",
92                display: "ConvoMem (5 categories, avg recall)",
93                eta_seconds: 240,
94                dataset_bytes: 5 * 1024 * 1024,
95                description: "5 headline evidence categories, substring-match avg_recall.",
96            },
97            Self::MembenchSimpleRoles => BenchMeta {
98                id: "membench-simple-roles",
99                display: "MemBench simple-roles (R@5)",
100                eta_seconds: 180,
101                dataset_bytes: 4 * 1024 * 1024,
102                description: "MemBench simple/roles slice, target_step_id R@5 over 100 items.",
103            },
104            Self::MembenchHighlevelMovie => BenchMeta {
105                id: "membench-highlevel-movie",
106                display: "MemBench high-level/movie (R@5)",
107                eta_seconds: 180,
108                dataset_bytes: 6 * 1024 * 1024,
109                description: "MemBench highlevel/movie slice, target_step_id R@5 over 100 items.",
110            },
111            Self::LongMemEvalHybridV4 => BenchMeta {
112                id: "longmemeval-hybrid-v4",
113                display: "LongMemEval hybrid-v4 (BM25 boost, R@5)",
114                eta_seconds: 600,
115                dataset_bytes: 264 * 1024 * 1024,
116                description: "LongMemEval with BM25-derived post-fusion boost; reuses LME cache.",
117            },
118        }
119    }
120}
121
122impl fmt::Display for Bench {
123    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124        f.write_str(self.metadata().display)
125    }
126}
127
128/// Adapter (system-under-test) catalog.
129#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
130#[serde(rename_all = "kebab-case")]
131pub enum AdapterKind {
132    /// In-process mnem via `mnem-core`.
133    Mnem,
134}
135
136impl AdapterKind {
137    /// Stable id used on the CLI.
138    #[must_use]
139    pub const fn id(self) -> &'static str {
140        match self {
141            Self::Mnem => "mnem",
142        }
143    }
144
145    /// Display name for the TUI.
146    #[must_use]
147    pub const fn display(self) -> &'static str {
148        match self {
149            Self::Mnem => "mnem",
150        }
151    }
152
153    /// Catalog order (display order in the TUI).
154    #[must_use]
155    pub const fn all() -> &'static [Self] {
156        &[Self::Mnem]
157    }
158
159    /// Look up by id, case-insensitive.
160    #[must_use]
161    pub fn from_id(s: &str) -> Option<Self> {
162        for a in Self::all() {
163            if a.id().eq_ignore_ascii_case(s) {
164                return Some(*a);
165            }
166        }
167        None
168    }
169}
170
171/// Run mode for the harness.
172#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
173#[serde(rename_all = "kebab-case")]
174pub enum RunMode {
175    /// In-process, single-threaded.
176    CpuLocal,
177}
178
179impl RunMode {
180    /// Stable id used on the CLI.
181    #[must_use]
182    pub const fn id(self) -> &'static str {
183        match self {
184            Self::CpuLocal => "cpu-local",
185        }
186    }
187
188    /// Display name for the TUI.
189    #[must_use]
190    pub const fn display(self) -> &'static str {
191        match self {
192            Self::CpuLocal => "CPU local (in-process)",
193        }
194    }
195
196    /// Catalog order.
197    #[must_use]
198    pub const fn all() -> &'static [Self] {
199        &[Self::CpuLocal]
200    }
201
202    /// Look up by id, case-insensitive.
203    #[must_use]
204    pub fn from_id(s: &str) -> Option<Self> {
205        for m in Self::all() {
206            if m.id().eq_ignore_ascii_case(s) {
207                return Some(*m);
208            }
209        }
210        None
211    }
212}
213
214/// Embedder selector. Both variants ship in 0.1.0.
215#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
216#[serde(rename_all = "kebab-case")]
217pub enum EmbedderChoice {
218    /// ONNX MiniLM via the bundled embedder. Production-grade; matches
219    /// headline benchmark numbers.
220    OnnxMiniLm,
221    /// Deterministic hashed bag-of-tokens embedder built into mnem-bench.
222    /// Network-free, offline. Toy embedder; recall is not comparable to
223    /// ONNX figures. Useful for CI smoke tests that can't load the model.
224    BagOfTokens,
225}
226
227impl EmbedderChoice {
228    /// Stable id used on the CLI.
229    #[must_use]
230    pub const fn id(self) -> &'static str {
231        match self {
232            Self::BagOfTokens => "bag-of-tokens",
233            Self::OnnxMiniLm => "onnx-minilm",
234        }
235    }
236
237    /// Display name for the TUI.
238    #[must_use]
239    pub const fn display(self) -> &'static str {
240        match self {
241            Self::BagOfTokens => "bag-of-tokens (built-in, deterministic)",
242            Self::OnnxMiniLm => "ONNX MiniLM (default, bundled)",
243        }
244    }
245
246    /// Catalog order. ONNX MiniLM is listed first so the TUI's default
247    /// picks the production-grade embedder instead of the offline toy.
248    #[must_use]
249    pub const fn all() -> &'static [Self] {
250        &[Self::OnnxMiniLm, Self::BagOfTokens]
251    }
252
253    /// Look up by id, case-insensitive.
254    #[must_use]
255    pub fn from_id(s: &str) -> Option<Self> {
256        for e in Self::all() {
257            if e.id().eq_ignore_ascii_case(s) {
258                return Some(*e);
259            }
260        }
261        None
262    }
263}