Skip to main content

ai_memory/
bench.rs

1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! Pillar 3 / Stream E — `ai-memory bench` workload runner.
5//!
6//! Measures hot-path operations against the budgets published in
7//! `PERFORMANCE.md` and returns p50/p95/p99 latencies plus a pass/fail
8//! verdict per operation. The CI guard (Stream F) enforces the same
9//! 10% p95 tolerance documented in `PERFORMANCE.md`.
10//!
11//! Coverage in this build:
12//! - Embedding-free CRUD: `memory_store` (no embedding), `memory_search`
13//!   (FTS5), `memory_recall` (hot, depth=1).
14//! - Knowledge-graph traversal:
15//!     - `memory_kg_query` (depth=1) and `memory_kg_timeline` against a
16//!       fan-out fixture (50 sources × 4 outbound links each, every
17//!       link `valid_from`-stamped).
18//!     - `memory_kg_query` (depth=3, depth=5) against a chain fixture
19//!       (50 chains × 5 hops each = 300 memories + 250 links). depth=3
20//!       hits the "depth ≤ 3" 100 ms budget bucket; depth=5 hits the
21//!       "depth ≤ 5" 250 ms tail-case bucket.
22//!
23//! Both fixtures live in the same in-process disposable `SQLite` — no
24//! external service required.
25//!
26//! Embedding-bound paths (`memory_store` with embedding,
27//! `memory_recall` cold/full hybrid) still require an embedder process
28//! and are tracked as follow-up Stream E work — they don't belong on
29//! the hot path of a `cargo test` invocation.
30
31use anyhow::{Context, Result};
32use rusqlite::Connection;
33use serde::{Deserialize, Serialize};
34use std::path::Path;
35use std::time::{Duration, Instant};
36
37use crate::db;
38use crate::models::{Memory, Tier};
39
/// Multiplier applied to an operation's p95 budget before the run is
/// marked `Fail`: a measured p95 of up to `budget * P95_TOLERANCE`
/// still passes. Mirrors the CI guard tolerance in `PERFORMANCE.md`.
pub const P95_TOLERANCE: f64 = 1.10;

/// Namespace the bench workload seeds into unless the caller overrides it.
pub const BENCH_NAMESPACE: &str = "ai-memory-bench";

/// Default number of measured iterations per operation — small enough
/// for `cargo test`, large enough that p99 carries signal.
pub const DEFAULT_ITERATIONS: usize = 200;

/// Default number of warmup iterations; these are run but discarded
/// from the percentile sample.
pub const DEFAULT_WARMUP: usize = 20;

/// Default regression threshold, in percent, used when a fresh run is
/// compared against a `--baseline` JSON file: a measured p95 may grow
/// by this much over the baseline before it is flagged. This is
/// independent of [`P95_TOLERANCE`], which guards the absolute budget;
/// the baseline guard catches drift that stays inside the absolute
/// budget but trends in the wrong direction across releases.
pub const DEFAULT_REGRESSION_THRESHOLD_PCT: f64 = 10.0;
61
62/// Hot-path operations covered by this iteration of the bench tool.
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64#[serde(rename_all = "snake_case")]
65pub enum Operation {
66    /// `memory_store` without embedding — pure `SQLite` write path.
67    StoreNoEmbedding,
68    /// `memory_search` — FTS5 keyword baseline.
69    SearchFts,
70    /// `memory_recall` hot path, depth=1 (no hierarchy expansion).
71    RecallHot,
72    /// `memory_kg_query` recursive-CTE traversal at depth=1 (the
73    /// shallowest path through the depth ≤ 3 budget bucket).
74    KgQueryDepth1,
75    /// `memory_kg_query` recursive-CTE traversal at depth=3 (the
76    /// deepest path inside the "depth ≤ 3" 100 ms budget bucket). Driven
77    /// against a chain fixture so the recursive CTE actually visits
78    /// three hops per query.
79    KgQueryDepth3,
80    /// `memory_kg_query` recursive-CTE traversal at depth=5 (the tail
81    /// case for the "depth ≤ 5" 250 ms budget bucket). Driven against
82    /// the same chain fixture as depth=3.
83    KgQueryDepth5,
84    /// `memory_kg_timeline` — ordered timeline for a single source.
85    KgTimeline,
86}
87
88impl Operation {
89    #[must_use]
90    pub fn label(self) -> &'static str {
91        match self {
92            Self::StoreNoEmbedding => "memory_store (no embedding)",
93            Self::SearchFts => "memory_search (FTS5)",
94            Self::RecallHot => "memory_recall (hot, depth=1)",
95            Self::KgQueryDepth1 => "memory_kg_query (depth=1)",
96            Self::KgQueryDepth3 => "memory_kg_query (depth=3)",
97            Self::KgQueryDepth5 => "memory_kg_query (depth=5)",
98            Self::KgTimeline => "memory_kg_timeline",
99        }
100    }
101
102    /// p95 budget in milliseconds, sourced from `PERFORMANCE.md`.
103    ///
104    /// `KgQueryDepth1` and `KgQueryDepth3` both fall in the
105    /// "depth ≤ 3" (100 ms) bucket; `KgQueryDepth5` is the tail case
106    /// at "depth ≤ 5" (250 ms). `SearchFts` and `KgTimeline` happen to
107    /// share the same numeric budget as the depth ≤ 3 bucket despite
108    /// belonging to different table rows in `PERFORMANCE.md`.
109    #[must_use]
110    #[allow(clippy::match_same_arms)]
111    pub fn target_p95_ms(self) -> f64 {
112        match self {
113            Self::StoreNoEmbedding => 20.0,
114            Self::SearchFts => 100.0,
115            Self::RecallHot => 50.0,
116            Self::KgQueryDepth1 => 100.0,
117            Self::KgQueryDepth3 => 100.0,
118            Self::KgQueryDepth5 => 250.0,
119            Self::KgTimeline => 100.0,
120        }
121    }
122}
123
124#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
125#[serde(rename_all = "snake_case")]
126pub enum Status {
127    Pass,
128    Fail,
129}
130
131#[derive(Debug, Clone, Serialize)]
132pub struct OperationResult {
133    pub operation: Operation,
134    /// Pretty label, duplicated for JSON consumers.
135    pub label: &'static str,
136    pub target_p95_ms: f64,
137    pub measured_p50_ms: f64,
138    pub measured_p95_ms: f64,
139    pub measured_p99_ms: f64,
140    pub samples: usize,
141    pub status: Status,
142}
143
/// Knobs controlling a bench run.
#[derive(Debug, Clone)]
pub struct BenchConfig {
    /// Number of measured iterations per operation.
    pub iterations: usize,
    /// Number of warmup iterations run before measuring; their timings
    /// are discarded.
    pub warmup: usize,
    /// Namespace the workload seeds into and queries against.
    pub namespace: String,
}
150
151impl Default for BenchConfig {
152    fn default() -> Self {
153        Self {
154            iterations: DEFAULT_ITERATIONS,
155            warmup: DEFAULT_WARMUP,
156            namespace: BENCH_NAMESPACE.to_string(),
157        }
158    }
159}
160
161/// Run the bench workload and return per-operation results.
162///
163/// Each operation seeds its own data inside the supplied connection so
164/// callers can hand in either a fresh in-memory DB (for tests) or a
165/// disposable on-disk DB (for the CLI).
166///
167/// # Errors
168///
169/// Returns the underlying [`db`] error if any of the seeded inserts
170/// or queries fail.
171pub fn run(conn: &Connection, config: &BenchConfig) -> Result<Vec<OperationResult>> {
172    let store = run_store_no_embedding(conn, config)?;
173    let search = run_search_fts(conn, config)?;
174    let recall = run_recall_hot(conn, config)?;
175    let kg_sources = seed_kg_fixture(conn, &config.namespace)?;
176    let kg_query = run_kg_query_depth1(conn, config, &kg_sources)?;
177    let kg_chain_sources = seed_kg_chain_fixture(conn, &config.namespace)?;
178    let kg_query_d3 =
179        run_kg_query_chain(conn, config, &kg_chain_sources, Operation::KgQueryDepth3, 3)?;
180    let kg_query_d5 =
181        run_kg_query_chain(conn, config, &kg_chain_sources, Operation::KgQueryDepth5, 5)?;
182    let kg_timeline = run_kg_timeline(conn, config, &kg_sources)?;
183    Ok(vec![
184        store,
185        search,
186        recall,
187        kg_query,
188        kg_query_d3,
189        kg_query_d5,
190        kg_timeline,
191    ])
192}
193
194fn run_store_no_embedding(conn: &Connection, config: &BenchConfig) -> Result<OperationResult> {
195    let total = config.warmup + config.iterations;
196    let mut samples = Vec::with_capacity(config.iterations);
197    for i in 0..total {
198        let mem = synth_memory(&config.namespace, i, "store");
199        let start = Instant::now();
200        db::insert(conn, &mem)?;
201        let elapsed = start.elapsed();
202        if i >= config.warmup {
203            samples.push(elapsed);
204        }
205    }
206    Ok(percentile_summary(Operation::StoreNoEmbedding, &samples))
207}
208
209fn run_search_fts(conn: &Connection, config: &BenchConfig) -> Result<OperationResult> {
210    seed_corpus(conn, &config.namespace, "search", 200)?;
211    let total = config.warmup + config.iterations;
212    let mut samples = Vec::with_capacity(config.iterations);
213    for i in 0..total {
214        let query = format!("topic-{}", i % 50);
215        let start = Instant::now();
216        let _ = db::search(
217            conn,
218            &query,
219            Some(&config.namespace),
220            None,
221            10,
222            None,
223            None,
224            None,
225            None,
226            None,
227            None,
228        )?;
229        let elapsed = start.elapsed();
230        if i >= config.warmup {
231            samples.push(elapsed);
232        }
233    }
234    Ok(percentile_summary(Operation::SearchFts, &samples))
235}
236
237fn run_recall_hot(conn: &Connection, config: &BenchConfig) -> Result<OperationResult> {
238    seed_corpus(conn, &config.namespace, "recall", 200)?;
239    let warmup_query = "topic 0 category 0";
240    for _ in 0..config.warmup {
241        let _ = db::recall(
242            conn,
243            warmup_query,
244            Some(&config.namespace),
245            10,
246            None,
247            None,
248            None,
249            0,
250            0,
251            None,
252            None,
253        )?;
254    }
255    let mut samples = Vec::with_capacity(config.iterations);
256    for i in 0..config.iterations {
257        let query = format!("topic {} category {}", i % 50, i % 10);
258        let start = Instant::now();
259        let _ = db::recall(
260            conn,
261            &query,
262            Some(&config.namespace),
263            10,
264            None,
265            None,
266            None,
267            0,
268            0,
269            None,
270            None,
271        )?;
272        samples.push(start.elapsed());
273    }
274    Ok(percentile_summary(Operation::RecallHot, &samples))
275}
276
/// Number of source memories in the fan-out fixture built by
/// [`seed_kg_fixture`]. The bench drives both `kg_query` and
/// `kg_timeline` against this one fixture.
const KG_FIXTURE_SOURCES: usize = 50;
/// Number of outbound links seeded from each fan-out source.
const KG_FIXTURE_LINKS_PER_SOURCE: usize = 4;

/// Number of linear chains in the fixture used by the depth=3 /
/// depth=5 runners. `KG_CHAIN_FIXTURE_CHAINS` chains ×
/// `KG_CHAIN_FIXTURE_HOPS` hops yields `chains * (hops + 1)` memories
/// and `chains * hops` links — so 50 × 5 gives 300 memories + 250
/// links, the same order of magnitude as the fan-out fixture.
const KG_CHAIN_FIXTURE_CHAINS: usize = 50;
/// Number of links in each chain. depth=5 reaches every node in a
/// chain; depth=3 reaches the first three follow-on hops.
const KG_CHAIN_FIXTURE_HOPS: usize = 5;
291
292fn run_kg_query_depth1(
293    conn: &Connection,
294    config: &BenchConfig,
295    sources: &[String],
296) -> Result<OperationResult> {
297    debug_assert!(
298        !sources.is_empty(),
299        "kg_query bench requires a seeded fixture"
300    );
301    let total = config.warmup + config.iterations;
302    let mut samples = Vec::with_capacity(config.iterations);
303    for i in 0..total {
304        let src = &sources[i % sources.len()];
305        let start = Instant::now();
306        let _ = db::kg_query(conn, src, 1, None, None, None)?;
307        let elapsed = start.elapsed();
308        if i >= config.warmup {
309            samples.push(elapsed);
310        }
311    }
312    Ok(percentile_summary(Operation::KgQueryDepth1, &samples))
313}
314
315fn run_kg_query_chain(
316    conn: &Connection,
317    config: &BenchConfig,
318    sources: &[String],
319    operation: Operation,
320    max_depth: usize,
321) -> Result<OperationResult> {
322    debug_assert!(
323        !sources.is_empty(),
324        "kg_query chain bench requires a seeded fixture"
325    );
326    let total = config.warmup + config.iterations;
327    let mut samples = Vec::with_capacity(config.iterations);
328    for i in 0..total {
329        let src = &sources[i % sources.len()];
330        let start = Instant::now();
331        let _ = db::kg_query(conn, src, max_depth, None, None, None)?;
332        let elapsed = start.elapsed();
333        if i >= config.warmup {
334            samples.push(elapsed);
335        }
336    }
337    Ok(percentile_summary(operation, &samples))
338}
339
340fn run_kg_timeline(
341    conn: &Connection,
342    config: &BenchConfig,
343    sources: &[String],
344) -> Result<OperationResult> {
345    debug_assert!(
346        !sources.is_empty(),
347        "kg_timeline bench requires a seeded fixture"
348    );
349    let total = config.warmup + config.iterations;
350    let mut samples = Vec::with_capacity(config.iterations);
351    for i in 0..total {
352        let src = &sources[i % sources.len()];
353        let start = Instant::now();
354        let _ = db::kg_timeline(conn, src, None, None, None)?;
355        let elapsed = start.elapsed();
356        if i >= config.warmup {
357            samples.push(elapsed);
358        }
359    }
360    Ok(percentile_summary(Operation::KgTimeline, &samples))
361}
362
363/// Seed the in-process KG fixture: `KG_FIXTURE_SOURCES` source memories,
364/// each with `KG_FIXTURE_LINKS_PER_SOURCE` outbound links to distinct
365/// targets. Every link sets `valid_from` so `kg_timeline` (which skips
366/// rows with NULL `valid_from`) sees the full corpus. Returns the source
367/// IDs so the runners can hand them to `kg_query` / `kg_timeline`.
368fn seed_kg_fixture(conn: &Connection, namespace: &str) -> Result<Vec<String>> {
369    let mut sources = Vec::with_capacity(KG_FIXTURE_SOURCES);
370    for s in 0..KG_FIXTURE_SOURCES {
371        let src = synth_memory(namespace, s, "kg-src");
372        // `db::insert` upserts on `(title, namespace)` and returns the
373        // canonical id, which differs from `src.id` if the row already
374        // exists. Use the returned id so the fixture remains correct
375        // even when `run()` is invoked twice against the same conn.
376        let src_id = db::insert(conn, &src)?;
377        for t in 0..KG_FIXTURE_LINKS_PER_SOURCE {
378            let target_idx = s * KG_FIXTURE_LINKS_PER_SOURCE + t;
379            let tgt = synth_memory(namespace, target_idx, "kg-tgt");
380            let tgt_id = db::insert(conn, &tgt)?;
381            // `db::create_link` stamps `created_at` and `valid_from` to
382            // the current wall clock — sufficient for `kg_timeline`
383            // (which skips rows with NULL `valid_from`).
384            db::create_link(conn, &src_id, &tgt_id, "related_to")?;
385        }
386        sources.push(src_id);
387    }
388    Ok(sources)
389}
390
391/// Seed the linear-chain KG fixture used by the depth=3 / depth=5
392/// runners: `KG_CHAIN_FIXTURE_CHAINS` chains, each
393/// `KG_CHAIN_FIXTURE_HOPS` links long. Every node and link uses titles
394/// disjoint from the fan-out fixture's `kg-src` / `kg-tgt` prefixes, so
395/// both fixtures coexist in the same connection without colliding on
396/// the `(title, namespace)` upsert. Returns the source IDs (one per
397/// chain) so the runners can drive `kg_query` against them.
398fn seed_kg_chain_fixture(conn: &Connection, namespace: &str) -> Result<Vec<String>> {
399    let mut sources = Vec::with_capacity(KG_CHAIN_FIXTURE_CHAINS);
400    for c in 0..KG_CHAIN_FIXTURE_CHAINS {
401        let mut prev_id = {
402            let head = synth_memory(namespace, c, "kg-chain-src");
403            db::insert(conn, &head)?
404        };
405        let chain_head_id = prev_id.clone();
406        for h in 0..KG_CHAIN_FIXTURE_HOPS {
407            let node_idx = c * KG_CHAIN_FIXTURE_HOPS + h;
408            let next = synth_memory(namespace, node_idx, "kg-chain-node");
409            let next_id = db::insert(conn, &next)?;
410            db::create_link(conn, &prev_id, &next_id, "related_to")?;
411            prev_id = next_id;
412        }
413        sources.push(chain_head_id);
414    }
415    Ok(sources)
416}
417
418fn seed_corpus(conn: &Connection, namespace: &str, prefix: &str, count: usize) -> Result<()> {
419    for i in 0..count {
420        let mem = synth_memory(namespace, i, prefix);
421        db::insert(conn, &mem)?;
422    }
423    Ok(())
424}
425
426fn synth_memory(namespace: &str, i: usize, prefix: &str) -> Memory {
427    let now = chrono::Utc::now().to_rfc3339();
428    Memory {
429        id: uuid::Uuid::new_v4().to_string(),
430        tier: Tier::Long,
431        namespace: namespace.to_string(),
432        title: format!("bench-{prefix}-{i}"),
433        content: format!(
434            "bench memory {i} content about topic {} category {} for {prefix} workload",
435            i % 50,
436            i % 10
437        ),
438        tags: vec![],
439        priority: i32::try_from((i % 9) + 1).unwrap_or(5),
440        confidence: 1.0,
441        source: "bench".to_string(),
442        access_count: 0,
443        created_at: now.clone(),
444        updated_at: now,
445        last_accessed_at: None,
446        expires_at: None,
447        metadata: serde_json::json!({"agent_id": "bench"}),
448    }
449}
450
451fn percentile_summary(operation: Operation, samples: &[Duration]) -> OperationResult {
452    debug_assert!(
453        !samples.is_empty(),
454        "bench operation produced no samples; iterations must be > 0"
455    );
456    let mut sorted: Vec<f64> = samples.iter().map(duration_ms).collect();
457    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
458    let p50 = percentile(&sorted, 0.50);
459    let p95 = percentile(&sorted, 0.95);
460    let p99 = percentile(&sorted, 0.99);
461    let target = operation.target_p95_ms();
462    let status = if p95 <= target * P95_TOLERANCE {
463        Status::Pass
464    } else {
465        Status::Fail
466    };
467    OperationResult {
468        operation,
469        label: operation.label(),
470        target_p95_ms: target,
471        measured_p50_ms: p50,
472        measured_p95_ms: p95,
473        measured_p99_ms: p99,
474        samples: sorted.len(),
475        status,
476    }
477}
478
/// Convert a [`Duration`] to fractional milliseconds.
///
/// Takes a reference so it can be passed directly to `Iterator::map`
/// over a borrowed slice of durations.
fn duration_ms(d: &Duration) -> f64 {
    d.as_secs_f64() * 1000.0
}
483
/// Return the `q`-quantile of an ascending-sorted slice using linear
/// interpolation between the two closest ranks.
///
/// * `sorted` — samples in ascending order; the function does not sort.
/// * `q` — quantile as a fraction. It is clamped to `[0.0, 1.0]`, so an
///   out-of-range request yields the minimum or maximum instead of
///   indexing past the end of the slice.
///
/// Returns `0.0` for an empty slice and the sole element for a
/// single-element slice.
#[allow(
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    clippy::cast_possible_truncation
)]
fn percentile(sorted: &[f64], q: f64) -> f64 {
    let Some((&first, rest)) = sorted.split_first() else {
        return 0.0;
    };
    if rest.is_empty() {
        return first;
    }
    // Without the clamp, q > 1.0 produces a rank beyond the last index
    // and panics on the slice access below. A NaN `q` survives the
    // clamp, but the float-to-int casts map NaN to 0, so it resolves to
    // the first element.
    let q = q.clamp(0.0, 1.0);
    let last_index = sorted.len() - 1;
    let rank = q * last_index as f64;
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;
    if lo == hi {
        // The rank landed exactly on a sample; nothing to interpolate.
        return sorted[lo];
    }
    let frac = rank - lo as f64;
    sorted[lo] + (sorted[hi] - sorted[lo]) * frac
}
505
506/// Render a results table to a string in the same shape used in the
507/// `PERFORMANCE.md` "Operator Self-Verification" example.
508#[must_use]
509pub fn render_table(results: &[OperationResult]) -> String {
510    let mut out = String::new();
511    out.push_str(
512        "Operation                       Target (p95)   Measured (p95)   p50      p99      Status\n",
513    );
514    out.push_str(
515        "─────────────────────────────────────────────────────────────────────────────────────────\n",
516    );
517    for r in results {
518        let status_str = match r.status {
519            Status::Pass => "PASS",
520            Status::Fail => "FAIL",
521        };
522        // target budgets are documented as small integer ms; rounding
523        // to the nearest int ms is what the table in PERFORMANCE.md
524        // shows. Saturating cast guards against pathological future
525        // changes to a non-integer or huge value.
526        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
527        let target_ms = r.target_p95_ms.round() as i64;
528        let line = format!(
529            "{:<30}  < {:>4} ms       {:>7.1} ms       {:>5.1}    {:>5.1}    {}\n",
530            r.label, target_ms, r.measured_p95_ms, r.measured_p50_ms, r.measured_p99_ms, status_str
531        );
532        out.push_str(&line);
533    }
534    out
535}
536
537/// Subset of [`OperationResult`] retained when loading a previous run
538/// for `--baseline` comparison. Only the fields the regression check
539/// actually consumes are required, so any superset of those fields
540/// (the full `bench --json` output included) deserializes cleanly.
541#[derive(Debug, Clone, Deserialize)]
542pub struct BaselineRecord {
543    pub operation: Operation,
544    pub measured_p95_ms: f64,
545}
546
547/// Top-level shape of a `bench --json` payload, used to thread the
548/// `results` array out for [`load_baseline`]. The other top-level
549/// fields (`iterations`, `warmup`, anything future runs add) are
550/// ignored on purpose so older / newer JSON shapes load without
551/// migration churn.
552#[derive(Debug, Clone, Deserialize)]
553struct BaselineFile {
554    results: Vec<BaselineRecord>,
555}
556
557/// Per-operation regression row produced by
558/// [`compare_against_baseline`].
559#[derive(Debug, Clone, Serialize)]
560pub struct Regression {
561    pub operation: Operation,
562    /// Pretty label, duplicated for JSON consumers.
563    pub label: &'static str,
564    pub baseline_p95_ms: f64,
565    pub measured_p95_ms: f64,
566    pub delta_pct: f64,
567    pub threshold_pct: f64,
568    pub regressed: bool,
569}
570
571/// Load a previously emitted `bench --json` payload from disk.
572///
573/// # Errors
574///
575/// Returns an error if the file cannot be read or the JSON cannot be
576/// parsed into the [`BaselineFile`] shape.
577pub fn load_baseline(path: &Path) -> Result<Vec<BaselineRecord>> {
578    let raw = std::fs::read_to_string(path)
579        .with_context(|| format!("failed to read baseline file: {}", path.display()))?;
580    let file: BaselineFile = serde_json::from_str(&raw)
581        .with_context(|| format!("failed to parse baseline JSON: {}", path.display()))?;
582    Ok(file.results)
583}
584
585/// Compare a fresh run against a baseline. Operations missing from the
586/// baseline are skipped silently (e.g. a new bench row added since the
587/// baseline was captured). The returned `Vec` preserves the order of
588/// `current` and only includes ops present in both.
589#[must_use]
590pub fn compare_against_baseline(
591    current: &[OperationResult],
592    baseline: &[BaselineRecord],
593    threshold_pct: f64,
594) -> Vec<Regression> {
595    let mut out = Vec::with_capacity(current.len());
596    for r in current {
597        let Some(b) = baseline.iter().find(|b| b.operation == r.operation) else {
598            continue;
599        };
600        // Treat a non-positive baseline as "no signal" so we never
601        // divide by zero or produce a nonsense -100% delta. Any current
602        // measurement against a zero baseline is reported as 0% delta
603        // rather than infinity — the absolute-budget guard already
604        // catches actual breakage.
605        let delta_pct = if b.measured_p95_ms > 0.0 {
606            (r.measured_p95_ms - b.measured_p95_ms) / b.measured_p95_ms * 100.0
607        } else {
608            0.0
609        };
610        let regressed = delta_pct > threshold_pct;
611        out.push(Regression {
612            operation: r.operation,
613            label: r.operation.label(),
614            baseline_p95_ms: b.measured_p95_ms,
615            measured_p95_ms: r.measured_p95_ms,
616            delta_pct,
617            threshold_pct,
618            regressed,
619        });
620    }
621    out
622}
623
624/// Render a regression table to a string, mirroring the layout of
625/// [`render_table`].
626#[must_use]
627pub fn render_regression_table(rows: &[Regression]) -> String {
628    let mut out = String::new();
629    out.push_str(
630        "Operation                       Baseline (p95)   Measured (p95)   Delta     Status\n",
631    );
632    out.push_str(
633        "─────────────────────────────────────────────────────────────────────────────────\n",
634    );
635    for r in rows {
636        let status_str = if r.regressed { "REGRESSION" } else { "OK" };
637        let line = format!(
638            "{:<30}  {:>10.1} ms     {:>10.1} ms    {:>+6.1}%   {}\n",
639            r.label, r.baseline_p95_ms, r.measured_p95_ms, r.delta_pct, status_str
640        );
641        out.push_str(&line);
642    }
643    out
644}
645
646/// Append a benchmark result to a JSONL history file.
647/// Creates the file and parent directories if missing.
648/// Each line is a self-describing JSON object with `captured_at`, `iterations`,
649/// `warmup`, and `results` array.
650pub fn append_history(
651    path: &std::path::Path,
652    captured_at: &str,
653    iterations: usize,
654    warmup: usize,
655    results: &[OperationResult],
656) -> Result<()> {
657    use std::fs::OpenOptions;
658    use std::io::Write;
659
660    // Create parent directories if needed
661    if let Some(parent) = path.parent()
662        && !parent.as_os_str().is_empty()
663    {
664        std::fs::create_dir_all(parent)?;
665    }
666
667    let entry = serde_json::json!({
668        "captured_at": captured_at,
669        "iterations": iterations,
670        "warmup": warmup,
671        "results": results,
672    });
673
674    let mut file = OpenOptions::new().create(true).append(true).open(path)?;
675
676    writeln!(file, "{}", serde_json::to_string(&entry)?)?;
677    Ok(())
678}
679
680#[allow(clippy::wildcard_imports)]
681mod tests {
682    use super::*;
683    use crate::db;
684
685    #[allow(dead_code)]
686    fn fresh_conn() -> Connection {
687        db::open(Path::new(":memory:")).unwrap()
688    }
689
690    #[allow(dead_code)]
691    fn small_config() -> BenchConfig {
692        BenchConfig {
693            iterations: 30,
694            warmup: 5,
695            namespace: "bench-test".to_string(),
696        }
697    }
698
699    #[test]
700    fn percentile_interpolates() {
701        let s = vec![1.0, 2.0, 3.0, 4.0];
702        assert!((percentile(&s, 0.50) - 2.5).abs() < 1e-9);
703        assert!((percentile(&s, 0.0) - 1.0).abs() < 1e-9);
704        assert!((percentile(&s, 1.0) - 4.0).abs() < 1e-9);
705    }
706
707    #[test]
708    fn percentile_handles_singleton_and_empty() {
709        assert!((percentile(&[], 0.5) - 0.0).abs() < 1e-9);
710        assert!((percentile(&[42.0], 0.99) - 42.0).abs() < 1e-9);
711    }
712
713    #[test]
714    fn run_returns_all_seven_results() {
715        let conn = fresh_conn();
716        let results = run(&conn, &small_config()).unwrap();
717        assert_eq!(results.len(), 7);
718        assert_eq!(results[0].operation, Operation::StoreNoEmbedding);
719        assert_eq!(results[1].operation, Operation::SearchFts);
720        assert_eq!(results[2].operation, Operation::RecallHot);
721        assert_eq!(results[3].operation, Operation::KgQueryDepth1);
722        assert_eq!(results[4].operation, Operation::KgQueryDepth3);
723        assert_eq!(results[5].operation, Operation::KgQueryDepth5);
724        assert_eq!(results[6].operation, Operation::KgTimeline);
725        for r in &results {
726            assert_eq!(r.samples, 30);
727            assert!(r.measured_p50_ms <= r.measured_p95_ms);
728            assert!(r.measured_p95_ms <= r.measured_p99_ms);
729            assert!(r.target_p95_ms > 0.0);
730        }
731    }
732
733    #[test]
734    fn status_is_fail_when_p95_over_tolerance() {
735        let r = OperationResult {
736            operation: Operation::StoreNoEmbedding,
737            label: Operation::StoreNoEmbedding.label(),
738            target_p95_ms: 20.0,
739            measured_p50_ms: 5.0,
740            measured_p95_ms: 25.0,
741            measured_p99_ms: 30.0,
742            samples: 100,
743            status: Status::Fail,
744        };
745        assert_eq!(r.status, Status::Fail);
746        // 25 > 20 * 1.10 = 22 → Fail
747        let recomputed = if 25.0_f64 <= 20.0 * P95_TOLERANCE {
748            Status::Pass
749        } else {
750            Status::Fail
751        };
752        assert_eq!(recomputed, Status::Fail);
753    }
754
755    #[test]
756    fn status_is_pass_within_tolerance() {
757        // 21 ms over 20 ms budget = 5% over → still PASS (under 10%).
758        let recomputed = if 21.0_f64 <= 20.0 * P95_TOLERANCE {
759            Status::Pass
760        } else {
761            Status::Fail
762        };
763        assert_eq!(recomputed, Status::Pass);
764    }
765
766    #[test]
767    fn render_table_includes_all_operations() {
768        let conn = fresh_conn();
769        let results = run(&conn, &small_config()).unwrap();
770        let table = render_table(&results);
771        assert!(table.contains("memory_store (no embedding)"));
772        assert!(table.contains("memory_search (FTS5)"));
773        assert!(table.contains("memory_recall (hot, depth=1)"));
774        assert!(table.contains("memory_kg_query (depth=1)"));
775        assert!(table.contains("memory_kg_query (depth=3)"));
776        assert!(table.contains("memory_kg_query (depth=5)"));
777        assert!(table.contains("memory_kg_timeline"));
778        assert!(table.contains("Status"));
779    }
780
781    #[test]
782    fn operation_targets_match_performance_md() {
783        // Pinned to PERFORMANCE.md — if you change a budget, change both.
784        assert!((Operation::StoreNoEmbedding.target_p95_ms() - 20.0).abs() < 1e-9);
785        assert!((Operation::SearchFts.target_p95_ms() - 100.0).abs() < 1e-9);
786        assert!((Operation::RecallHot.target_p95_ms() - 50.0).abs() < 1e-9);
787        assert!((Operation::KgQueryDepth1.target_p95_ms() - 100.0).abs() < 1e-9);
788        assert!((Operation::KgQueryDepth3.target_p95_ms() - 100.0).abs() < 1e-9);
789        assert!((Operation::KgQueryDepth5.target_p95_ms() - 250.0).abs() < 1e-9);
790        assert!((Operation::KgTimeline.target_p95_ms() - 100.0).abs() < 1e-9);
791    }
792
793    #[test]
794    fn seed_kg_chain_fixture_traverses_to_max_depth() {
795        let conn = fresh_conn();
796        let sources = seed_kg_chain_fixture(&conn, "kg-chain-fixture-test").unwrap();
797        assert_eq!(sources.len(), KG_CHAIN_FIXTURE_CHAINS);
798        // Every chain must yield exactly `KG_CHAIN_FIXTURE_HOPS` reachable
799        // nodes at depth=KG_CHAIN_FIXTURE_HOPS — that's what justifies the
800        // depth=5 budget bucket. depth=3 must reach exactly 3 nodes.
801        for src in &sources {
802            let depth5 = db::kg_query(&conn, src, KG_CHAIN_FIXTURE_HOPS, None, None, None).unwrap();
803            assert_eq!(
804                depth5.len(),
805                KG_CHAIN_FIXTURE_HOPS,
806                "depth={KG_CHAIN_FIXTURE_HOPS} on a {KG_CHAIN_FIXTURE_HOPS}-hop chain must reach every node"
807            );
808            let depth3 = db::kg_query(&conn, src, 3, None, None, None).unwrap();
809            assert_eq!(
810                depth3.len(),
811                3,
812                "depth=3 on a {KG_CHAIN_FIXTURE_HOPS}-hop chain must reach exactly 3 follow-on nodes"
813            );
814        }
815    }
816
/// Verifies the fan-out fixture: `seed_kg_fixture` must return
/// `KG_FIXTURE_SOURCES` sources, each with `KG_FIXTURE_LINKS_PER_SOURCE`
/// depth-1 neighbours and the same number of timeline events.
#[test]
fn seed_kg_fixture_populates_sources_and_links() {
    let conn = fresh_conn();
    let fanout_sources = seed_kg_fixture(&conn, "kg-fixture-test").unwrap();
    assert_eq!(fanout_sources.len(), KG_FIXTURE_SOURCES);

    for source in fanout_sources.iter() {
        // Depth-1 query: one node per outbound link.
        let neighbour_count = db::kg_query(&conn, source, 1, None, None, None)
            .unwrap()
            .len();
        assert_eq!(neighbour_count, KG_FIXTURE_LINKS_PER_SOURCE);

        // Timeline: one event per link. A link without `valid_from`
        // would be skipped by `kg_timeline`, so the count doubles as a
        // check that every link was stamped.
        let events = db::kg_timeline(&conn, source, None, None, None).unwrap();
        assert_eq!(events.len(), KG_FIXTURE_LINKS_PER_SOURCE);

        // `kg_timeline` drops NULL `valid_from` rows in SQL, so every
        // event that comes back must carry a non-empty stamp.
        assert!(
            events.iter().all(|event| !event.valid_from.is_empty()),
            "kg fixture must stamp valid_from on every link"
        );
    }
}
839
840    #[allow(dead_code)]
841    fn synthetic_result(op: Operation, p95: f64) -> OperationResult {
842        OperationResult {
843            operation: op,
844            label: op.label(),
845            target_p95_ms: op.target_p95_ms(),
846            measured_p50_ms: p95 / 2.0,
847            measured_p95_ms: p95,
848            measured_p99_ms: p95 * 1.1,
849            samples: 100,
850            status: Status::Pass,
851        }
852    }
853
854    #[allow(dead_code)]
855    fn synthetic_baseline(op: Operation, p95: f64) -> BaselineRecord {
856        BaselineRecord {
857            operation: op,
858            measured_p95_ms: p95,
859        }
860    }
861
/// A measured p95 that is 12% above its baseline must be flagged as a
/// regression when the threshold is 10%.
#[test]
fn baseline_compare_flags_above_threshold() {
    // 11.2 ms against a 10.0 ms baseline is +12%, which is over the
    // default 10% threshold → REGRESSION.
    let measured = vec![synthetic_result(Operation::StoreNoEmbedding, 11.2)];
    let reference = vec![synthetic_baseline(Operation::StoreNoEmbedding, 10.0)];
    let threshold_pct = 10.0;

    let rows = compare_against_baseline(&measured, &reference, threshold_pct);

    assert_eq!(rows.len(), 1);
    assert!(rows[0].regressed);
    assert!((rows[0].delta_pct - 12.0).abs() < 1e-9);
}
872
/// A measured p95 that is 8% above its baseline must not be flagged
/// when the threshold is 10%.
#[test]
fn baseline_compare_passes_within_threshold() {
    // 10.8 ms against a 10.0 ms baseline is +8%, which stays under the
    // default 10% threshold → OK.
    let measured = vec![synthetic_result(Operation::StoreNoEmbedding, 10.8)];
    let reference = vec![synthetic_baseline(Operation::StoreNoEmbedding, 10.0)];
    let threshold_pct = 10.0;

    let rows = compare_against_baseline(&measured, &reference, threshold_pct);

    assert_eq!(rows.len(), 1);
    assert!(!rows[0].regressed);
}
882
/// A run faster than its baseline must report a negative delta and
/// must not be flagged as a regression.
#[test]
fn baseline_compare_speedup_is_negative_delta() {
    // 8.0 ms against a 10.0 ms baseline is a 20% speed-up → delta of
    // -20, never a regression.
    let measured = vec![synthetic_result(Operation::SearchFts, 8.0)];
    let reference = vec![synthetic_baseline(Operation::SearchFts, 10.0)];
    let threshold_pct = 10.0;

    let rows = compare_against_baseline(&measured, &reference, threshold_pct);

    assert_eq!(rows.len(), 1);
    assert!(!rows[0].regressed);
    assert!((rows[0].delta_pct + 20.0).abs() < 1e-9);
}
893
/// An operation present in the current run but absent from the
/// baseline must be left out of the comparison rows.
#[test]
fn baseline_compare_skips_ops_missing_in_baseline() {
    // `KgQueryDepth5` stands in for an op added after the baseline was
    // captured: it must neither crash the comparison nor show up as a
    // regression.
    let shared_op = synthetic_result(Operation::StoreNoEmbedding, 10.0);
    let new_op = synthetic_result(Operation::KgQueryDepth5, 200.0);
    let measured = vec![shared_op, new_op];
    let reference = vec![synthetic_baseline(Operation::StoreNoEmbedding, 10.0)];
    let threshold_pct = 10.0;

    let rows = compare_against_baseline(&measured, &reference, threshold_pct);

    assert_eq!(rows.len(), 1);
    assert_eq!(rows[0].operation, Operation::StoreNoEmbedding);
}
907
/// A baseline p95 of zero must yield a 0% delta and no regression flag.
#[test]
fn baseline_compare_handles_zero_baseline() {
    // Pathological input: with a zero baseline the comparison reports a
    // 0% delta instead of dividing by zero. Genuinely broken
    // measurements are still caught by the absolute-budget guard.
    let measured = vec![synthetic_result(Operation::SearchFts, 5.0)];
    let reference = vec![synthetic_baseline(Operation::SearchFts, 0.0)];
    let threshold_pct = 10.0;

    let rows = compare_against_baseline(&measured, &reference, threshold_pct);

    assert_eq!(rows.len(), 1);
    assert!(!rows[0].regressed);
    assert!((rows[0].delta_pct - 0.0).abs() < 1e-9);
}
920
/// Writes a two-operation JSON report to a temporary file and checks
/// that `load_baseline` recovers each operation and its measured p95.
#[test]
fn load_baseline_round_trips_json_payload() {
    // The payload mirrors the shape `bench --json` actually emits, so
    // that CI artifacts can serve as baselines without preprocessing.
    let store_entry = serde_json::json!({
        "operation": "store_no_embedding",
        "label": "memory_store (no embedding)",
        "target_p95_ms": 20.0,
        "measured_p50_ms": 4.0,
        "measured_p95_ms": 9.0,
        "measured_p99_ms": 11.0,
        "samples": 200,
        "status": "pass"
    });
    let search_entry = serde_json::json!({
        "operation": "search_fts",
        "label": "memory_search (FTS5)",
        "target_p95_ms": 100.0,
        "measured_p50_ms": 12.0,
        "measured_p95_ms": 31.0,
        "measured_p99_ms": 45.0,
        "samples": 200,
        "status": "pass"
    });
    let payload = serde_json::json!({
        "iterations": 200,
        "warmup": 20,
        "results": [store_entry, search_entry]
    });

    // Persist the report; the temp directory is kept alive by `scratch`
    // until the end of the test.
    let scratch = tempfile::tempdir().unwrap();
    let baseline_file = scratch.path().join("baseline.json");
    let encoded = serde_json::to_string_pretty(&payload).unwrap();
    std::fs::write(&baseline_file, encoded).unwrap();

    let loaded = load_baseline(&baseline_file).unwrap();

    // Records come back in payload order with their p95 intact.
    assert_eq!(loaded.len(), 2);
    assert_eq!(loaded[0].operation, Operation::StoreNoEmbedding);
    assert!((loaded[0].measured_p95_ms - 9.0).abs() < 1e-9);
    assert_eq!(loaded[1].operation, Operation::SearchFts);
    assert!((loaded[1].measured_p95_ms - 31.0).abs() < 1e-9);
}
962
/// Renders one regressed row and one healthy row and checks that the
/// table text contains both operation labels plus the `REGRESSION` and
/// `OK` markers.
#[test]
fn render_regression_table_marks_regressions() {
    // +20% against a 10% threshold: flagged.
    let regressed_row = Regression {
        operation: Operation::StoreNoEmbedding,
        label: Operation::StoreNoEmbedding.label(),
        baseline_p95_ms: 10.0,
        measured_p95_ms: 12.0,
        delta_pct: 20.0,
        threshold_pct: 10.0,
        regressed: true,
    };
    // +3.3% against a 10% threshold: within tolerance.
    let healthy_row = Regression {
        operation: Operation::SearchFts,
        label: Operation::SearchFts.label(),
        baseline_p95_ms: 30.0,
        measured_p95_ms: 31.0,
        delta_pct: 3.3,
        threshold_pct: 10.0,
        regressed: false,
    };
    let rows = vec![regressed_row, healthy_row];

    let table = render_regression_table(&rows);

    assert!(table.contains("memory_store (no embedding)"));
    assert!(table.contains("memory_search (FTS5)"));
    assert!(table.contains("REGRESSION"));
    assert!(table.contains("OK"));
}
991}