Skip to main content

ai_memory/
bench.rs

1// Copyright 2026 AlphaOne LLC
2// SPDX-License-Identifier: Apache-2.0
3
4//! Pillar 3 / Stream E — `ai-memory bench` workload runner.
5//!
6//! Measures hot-path operations against the budgets published in
7//! `PERFORMANCE.md` and returns p50/p95/p99 latencies plus a pass/fail
8//! verdict per operation. The CI guard (Stream F) enforces the same
9//! 10% p95 tolerance documented in `PERFORMANCE.md`.
10//!
11//! Coverage in this build:
12//! - Embedding-free CRUD: `memory_store` (no embedding), `memory_search`
13//!   (FTS5), `memory_recall` (hot, depth=1).
14//! - Knowledge-graph traversal:
15//!     - `memory_kg_query` (depth=1) and `memory_kg_timeline` against a
16//!       fan-out fixture (50 sources × 4 outbound links each, every
17//!       link `valid_from`-stamped).
18//!     - `memory_kg_query` (depth=3, depth=5) against a chain fixture
19//!       (50 chains × 5 hops each = 300 memories + 250 links). depth=3
20//!       hits the "depth ≤ 3" 100 ms budget bucket; depth=5 hits the
21//!       "depth ≤ 5" 250 ms tail-case bucket.
22//!
23//! Both fixtures live in the same in-process disposable `SQLite` — no
24//! external service required.
25//!
26//! Embedding-bound paths (`memory_store` with embedding,
27//! `memory_recall` cold/full hybrid) still require an embedder process
28//! and are tracked as follow-up Stream E work — they don't belong on
29//! the hot path of a `cargo test` invocation.
30
31use crate::models::ConfidenceSource;
32use anyhow::{Context, Result};
33use rusqlite::Connection;
34use serde::{Deserialize, Serialize};
35use std::path::Path;
36use std::time::{Duration, Instant};
37
38use crate::db;
39use crate::models::{Memory, Tier};
40
/// Slack factor on the absolute p95 budget: a measured p95 may exceed
/// the budget by up to this factor before the operation is marked
/// `Fail`. Mirrors the 10% tolerance in `PERFORMANCE.md`.
pub const P95_TOLERANCE: f64 = 1.10;

/// Budget multiplier for macOS runners (issue #1193).
///
/// The `macos-latest` GHA runner pool shows substantially higher I/O
/// scheduling variance and cold-start latency than `ubuntu-latest`.
/// The integration test
/// `tests/integration.rs::test_cli_bench_emits_json_with_seven_results_and_passes_budget`
/// runs `ai-memory bench --iterations 5` end-to-end and requires exit
/// code 0; with so few iterations the macOS tail can exceed the
/// absolute `target_p95_ms` budgets even when the code is healthy.
///
/// Following #1193 "Proposed fix" option 1 (preferred), the multiplier
/// is applied centrally on the runner-effective budget path: the
/// pass/fail verdict becomes platform-aware while the `target_p95_ms`
/// reported in the JSON envelope keeps the canonical `PERFORMANCE.md`
/// number (unchanged for dashboards / regression trackers). The
/// factor 3 matches the headroom given to the timing-sensitive hooks
/// tests under the same issue; every non-macOS target uses 1.
#[cfg(target_os = "macos")]
pub const MACOS_BUDGET_MULT: f64 = 3.0;
#[cfg(not(target_os = "macos"))]
pub const MACOS_BUDGET_MULT: f64 = 1.0;

/// Namespace the bench workload seeds into by default.
pub const BENCH_NAMESPACE: &str = "ai-memory-bench";

/// Default number of measured iterations per operation — small enough
/// for `cargo test`, large enough that p99 carries signal.
pub const DEFAULT_ITERATIONS: usize = 200;

/// Default number of warmup iterations; these run but are excluded
/// from the percentile sample.
pub const DEFAULT_WARMUP: usize = 20;

/// Upper bound on `--iterations`, so a mistyped flag cannot make the
/// bench run for an unbounded time.
pub const MAX_ITERATIONS: usize = 100_000;

/// Upper bound on `--warmup` iterations.
pub const MAX_WARMUP: usize = 10_000;

/// Upper bound on `--regression-threshold` (percent). Larger values
/// are clamped; a 1000% allowance already means "no gate".
pub const MAX_REGRESSION_THRESHOLD_PCT: f64 = 1000.0;

/// #1579 B8 — canonical corpus size (rows) for the scale-gate run
/// (`ai-memory bench --scale 10000`). The P1 perf-audit showed that
/// the default workload (~500 rows after per-op seeding) cannot expose
/// corpus-scale budget blowouts: a recall p95 of 361 ms against the
/// 50 ms budget at 100k rows was invisible to the built-in bench.
pub const CI_SCALE_GATE_ROWS: usize = 10_000;

/// #1579 B8 — upper bound on `--scale` rows. Limits seeding wall-clock
/// and RAM on a mistyped flag (1M rows ≈ the largest corpus the perf
/// audit exercised).
pub const MAX_SCALE: usize = 1_000_000;
97
/// #1579 B8 — a single row of the per-scale p95 budget table published
/// in `PERFORMANCE.md` §"Corpus-scale budgets".
///
/// Scale-specific budgets exist only for the three corpus-sensitive
/// operations. The KG operations run against fixed-size fixtures
/// (50×4 fan-out, 50×5 chains) whose cost does not depend on the
/// seeded corpus size, so they keep their canonical budgets at every
/// scale.
#[derive(Debug, Clone, Copy)]
pub struct ScaleBudgets {
    /// Largest seeded corpus (in rows) this row covers. A requested
    /// scale resolves to the first table row whose `scale` is `>=`
    /// the request.
    pub scale: usize,
    /// p95 budget in ms for `memory_store` (no embedding).
    pub store_no_embedding_ms: f64,
    /// p95 budget in ms for `memory_search` (FTS5).
    pub search_fts_ms: f64,
    /// p95 budget in ms for `memory_recall` (hot, keyword).
    pub recall_hot_ms: f64,
}
117
118/// #1579 B8 — the per-scale budget table (SSOT; `PERFORMANCE.md`
119/// §"Corpus-scale budgets" narrates these numbers and the
120/// `operation_scale_targets_match_performance_md` test pins them).
121///
122/// 10k-row budgets were pinned from a measured release-build run on
123/// this branch (`ai-memory bench --scale 10000`, Linux x86_64) with
124/// ≥50% headroom over the measurement, capped at the operator-approved
125/// conservative ceilings from the #1579 remediation plan (store ≤120,
126/// recall ≤80, search ≤60).
127pub const SCALE_BUDGETS: &[ScaleBudgets] = &[ScaleBudgets {
128    scale: CI_SCALE_GATE_ROWS,
129    store_no_embedding_ms: 120.0,
130    search_fts_ms: 60.0,
131    recall_hot_ms: 80.0,
132}];
133
134/// #1579 B8 — resolve the budget row for a requested scale: the first
135/// table row whose `scale >= requested`, else the largest pinned row
136/// (best-effort; pin a new table row before gating larger scales).
137#[must_use]
138pub fn scale_budgets_for(requested: usize) -> ScaleBudgets {
139    for row in SCALE_BUDGETS {
140        if row.scale >= requested {
141            return *row;
142        }
143    }
144    *SCALE_BUDGETS
145        .last()
146        .expect("SCALE_BUDGETS table must be non-empty")
147}
148
/// Default regression allowance, in percent, used when a fresh run is
/// compared against a `--baseline` JSON file: a measured p95 may grow
/// by this much before the run is flagged as a regression.
///
/// This is independent of [`P95_TOLERANCE`], which guards the absolute
/// budget. The baseline guard exists to catch drift that still fits
/// inside the absolute budget but trends the wrong way across
/// releases.
pub const DEFAULT_REGRESSION_THRESHOLD_PCT: f64 = 10.0;
156
157/// Hot-path operations covered by this iteration of the bench tool.
158#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
159#[serde(rename_all = "snake_case")]
160pub enum Operation {
161    /// `memory_store` without embedding — pure `SQLite` write path.
162    StoreNoEmbedding,
163    /// `memory_search` — FTS5 keyword baseline.
164    SearchFts,
165    /// `memory_recall` hot path, depth=1 (no hierarchy expansion).
166    RecallHot,
167    /// `memory_kg_query` recursive-CTE traversal at depth=1 (the
168    /// shallowest path through the depth ≤ 3 budget bucket).
169    KgQueryDepth1,
170    /// `memory_kg_query` recursive-CTE traversal at depth=3 (the
171    /// deepest path inside the "depth ≤ 3" 100 ms budget bucket). Driven
172    /// against a chain fixture so the recursive CTE actually visits
173    /// three hops per query.
174    KgQueryDepth3,
175    /// `memory_kg_query` recursive-CTE traversal at depth=5 (the tail
176    /// case for the "depth ≤ 5" 250 ms budget bucket). Driven against
177    /// the same chain fixture as depth=3.
178    KgQueryDepth5,
179    /// `memory_kg_timeline` — ordered timeline for a single source.
180    KgTimeline,
181}
182
183impl Operation {
184    #[must_use]
185    pub fn label(self) -> &'static str {
186        match self {
187            Self::StoreNoEmbedding => "memory_store (no embedding)",
188            Self::SearchFts => "memory_search (FTS5)",
189            Self::RecallHot => "memory_recall (hot, depth=1)",
190            Self::KgQueryDepth1 => "memory_kg_query (depth=1)",
191            Self::KgQueryDepth3 => "memory_kg_query (depth=3)",
192            Self::KgQueryDepth5 => "memory_kg_query (depth=5)",
193            Self::KgTimeline => crate::mcp::registry::tool_names::MEMORY_KG_TIMELINE,
194        }
195    }
196
197    /// p95 budget in milliseconds, sourced from `PERFORMANCE.md`.
198    ///
199    /// `KgQueryDepth1` and `KgQueryDepth3` both fall in the
200    /// "depth ≤ 3" (100 ms) bucket; `KgQueryDepth5` is the tail case
201    /// at "depth ≤ 5" (250 ms). `SearchFts` and `KgTimeline` happen to
202    /// share the same numeric budget as the depth ≤ 3 bucket despite
203    /// belonging to different table rows in `PERFORMANCE.md`.
204    ///
205    /// This is the canonical published budget; the runner-effective
206    /// pass/fail verdict uses [`effective_target_p95_ms`] which
207    /// applies the [`MACOS_BUDGET_MULT`] platform multiplier on top.
208    #[must_use]
209    #[allow(clippy::match_same_arms)]
210    pub fn target_p95_ms(self) -> f64 {
211        match self {
212            Self::StoreNoEmbedding => 20.0,
213            Self::SearchFts => 100.0,
214            Self::RecallHot => 50.0,
215            Self::KgQueryDepth1 => 100.0,
216            Self::KgQueryDepth3 => 100.0,
217            Self::KgQueryDepth5 => 250.0,
218            Self::KgTimeline => 100.0,
219        }
220    }
221
222    /// Runner-effective p95 budget — equal to [`target_p95_ms`] on
223    /// Linux/Windows, but multiplied by [`MACOS_BUDGET_MULT`] on
224    /// macOS targets per issue #1193. The pass/fail verdict in the
225    /// CLI bench tool uses this value; the JSON envelope's
226    /// `target_p95_ms` field continues to report the canonical
227    /// PERFORMANCE.md number so regression dashboards stay stable.
228    #[must_use]
229    pub fn effective_target_p95_ms(self) -> f64 {
230        self.target_p95_ms() * MACOS_BUDGET_MULT
231    }
232
233    /// #1579 B8 — canonical p95 budget at a given corpus scale.
234    /// `None` (the default workload) keeps the legacy
235    /// [`Self::target_p95_ms`] budgets byte-for-byte. `Some(rows)`
236    /// swaps in the [`SCALE_BUDGETS`] row for the three
237    /// corpus-sensitive operations; the KG operations keep their
238    /// canonical budgets because their fixtures are scale-independent
239    /// (see [`ScaleBudgets`]).
240    #[must_use]
241    pub fn target_p95_ms_at_scale(self, scale: Option<usize>) -> f64 {
242        let Some(rows) = scale else {
243            return self.target_p95_ms();
244        };
245        let budgets = scale_budgets_for(rows);
246        match self {
247            Self::StoreNoEmbedding => budgets.store_no_embedding_ms,
248            Self::SearchFts => budgets.search_fts_ms,
249            Self::RecallHot => budgets.recall_hot_ms,
250            Self::KgQueryDepth1 | Self::KgQueryDepth3 | Self::KgQueryDepth5 | Self::KgTimeline => {
251                self.target_p95_ms()
252            }
253        }
254    }
255
256    /// #1579 B8 — runner-effective sibling of
257    /// [`Self::target_p95_ms_at_scale`] (applies the #1193 macOS
258    /// multiplier, same as [`Self::effective_target_p95_ms`]).
259    #[must_use]
260    pub fn effective_target_p95_ms_at_scale(self, scale: Option<usize>) -> f64 {
261        self.target_p95_ms_at_scale(scale) * MACOS_BUDGET_MULT
262    }
263}
264
265#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
266#[serde(rename_all = "snake_case")]
267pub enum Status {
268    Pass,
269    Fail,
270}
271
272#[derive(Debug, Clone, Serialize)]
273pub struct OperationResult {
274    pub operation: Operation,
275    /// Pretty label, duplicated for JSON consumers.
276    pub label: &'static str,
277    pub target_p95_ms: f64,
278    pub measured_p50_ms: f64,
279    pub measured_p95_ms: f64,
280    pub measured_p99_ms: f64,
281    pub samples: usize,
282    pub status: Status,
283}
284
/// Knobs for one bench run.
#[derive(Debug, Clone)]
pub struct BenchConfig {
    /// Number of measured iterations per operation.
    pub iterations: usize,
    /// Number of warmup iterations per operation; executed but not
    /// sampled.
    pub warmup: usize,
    /// Namespace the workload seeds into and queries against.
    pub namespace: String,
    /// #1579 B8 — corpus scale. `None` runs the legacy default
    /// workload (~500 rows after per-op seeding) under the legacy
    /// budgets. `Some(rows)` first seeds a scratch corpus of `rows`
    /// rows into the bench namespace and then gates the verdict on the
    /// [`SCALE_BUDGETS`] table instead.
    pub scale: Option<usize>,
}
297
298impl Default for BenchConfig {
299    fn default() -> Self {
300        Self {
301            iterations: DEFAULT_ITERATIONS,
302            warmup: DEFAULT_WARMUP,
303            namespace: BENCH_NAMESPACE.to_string(),
304            scale: None,
305        }
306    }
307}
308
309/// Run the bench workload and return per-operation results.
310///
311/// Each operation seeds its own data inside the supplied connection so
312/// callers can hand in either a fresh in-memory DB (for tests) or a
313/// disposable on-disk DB (for the CLI).
314///
315/// # Errors
316///
317/// Returns the underlying [`db`] error if any of the seeded inserts
318/// or queries fail.
319pub fn run(conn: &Connection, config: &BenchConfig) -> Result<Vec<OperationResult>> {
320    // #1579 B8 — seed the scratch corpus FIRST so every operation
321    // below (FTS5 search, hybrid-keyword recall, the store upsert
322    // probe) runs against a table of ~`scale` rows, not the ~500-row
323    // default that hid the 100k-corpus budget blowouts from the P1
324    // audit. The corpus shares the bench namespace and the
325    // `topic-N / category-M` vocabulary the search/recall queries use,
326    // so the queries genuinely scan it.
327    if let Some(rows) = config.scale {
328        seed_corpus(conn, &config.namespace, "scale", rows)?;
329    }
330    let store = run_store_no_embedding(conn, config)?;
331    let search = run_search_fts(conn, config)?;
332    let recall = run_recall_hot(conn, config)?;
333    let kg_sources = seed_kg_fixture(conn, &config.namespace)?;
334    let kg_query = run_kg_query_depth1(conn, config, &kg_sources)?;
335    let kg_chain_sources = seed_kg_chain_fixture(conn, &config.namespace)?;
336    let kg_query_d3 =
337        run_kg_query_chain(conn, config, &kg_chain_sources, Operation::KgQueryDepth3, 3)?;
338    let kg_query_d5 =
339        run_kg_query_chain(conn, config, &kg_chain_sources, Operation::KgQueryDepth5, 5)?;
340    let kg_timeline = run_kg_timeline(conn, config, &kg_sources)?;
341    Ok(vec![
342        store,
343        search,
344        recall,
345        kg_query,
346        kg_query_d3,
347        kg_query_d5,
348        kg_timeline,
349    ])
350}
351
352fn run_store_no_embedding(conn: &Connection, config: &BenchConfig) -> Result<OperationResult> {
353    let total = config.warmup + config.iterations;
354    let mut samples = Vec::with_capacity(config.iterations);
355    for i in 0..total {
356        let mem = synth_memory(&config.namespace, i, "store");
357        let start = Instant::now();
358        db::insert(conn, &mem)?;
359        let elapsed = start.elapsed();
360        if i >= config.warmup {
361            samples.push(elapsed);
362        }
363    }
364    Ok(percentile_summary(
365        Operation::StoreNoEmbedding,
366        &samples,
367        config.scale,
368    ))
369}
370
371fn run_search_fts(conn: &Connection, config: &BenchConfig) -> Result<OperationResult> {
372    seed_corpus(conn, &config.namespace, "search", 200)?;
373    let total = config.warmup + config.iterations;
374    let mut samples = Vec::with_capacity(config.iterations);
375    for i in 0..total {
376        let query = format!("topic-{}", i % 50);
377        let start = Instant::now();
378        let _ = db::search(
379            conn,
380            &query,
381            Some(&config.namespace),
382            None,
383            10,
384            None,
385            None,
386            None,
387            None,
388            None,
389            None,
390            false,
391        )?;
392        let elapsed = start.elapsed();
393        if i >= config.warmup {
394            samples.push(elapsed);
395        }
396    }
397    Ok(percentile_summary(
398        Operation::SearchFts,
399        &samples,
400        config.scale,
401    ))
402}
403
404fn run_recall_hot(conn: &Connection, config: &BenchConfig) -> Result<OperationResult> {
405    seed_corpus(conn, &config.namespace, "recall", 200)?;
406    let warmup_query = "topic 0 category 0";
407    for _ in 0..config.warmup {
408        let _ = db::recall(
409            conn,
410            warmup_query,
411            Some(&config.namespace),
412            10,
413            None,
414            None,
415            None,
416            0,
417            0,
418            None,
419            None,
420            false,
421            None,
422        )?;
423    }
424    let mut samples = Vec::with_capacity(config.iterations);
425    for i in 0..config.iterations {
426        let query = format!("topic {} category {}", i % 50, i % 10);
427        let start = Instant::now();
428        let _ = db::recall(
429            conn,
430            &query,
431            Some(&config.namespace),
432            10,
433            None,
434            None,
435            None,
436            0,
437            0,
438            None,
439            None,
440            false,
441            None,
442        )?;
443        samples.push(start.elapsed());
444    }
445    Ok(percentile_summary(
446        Operation::RecallHot,
447        &samples,
448        config.scale,
449    ))
450}
451
/// Number of source memories in the fan-out fixture built by
/// [`seed_kg_fixture`]. The bench drives both `kg_query` and
/// `kg_timeline` against that one fixture.
const KG_FIXTURE_SOURCES: usize = 50;
/// Number of outbound links created for each fan-out source.
const KG_FIXTURE_LINKS_PER_SOURCE: usize = 4;

/// Number of chains in the linear-chain fixture used by the depth=3 /
/// depth=5 runners.
///
/// `KG_CHAIN_FIXTURE_CHAINS` chains × `KG_CHAIN_FIXTURE_HOPS` hops
/// produce `chains * (hops + 1)` memories and `chains * hops` links,
/// so 50 × 5 gives 300 memories + 250 links — the same order of
/// magnitude as the fan-out fixture. A depth=5 query reaches every
/// node of a chain; a depth=3 query reaches the first three follow-on
/// hops.
const KG_CHAIN_FIXTURE_CHAINS: usize = 50;
/// Number of links (hops) in each chain of the chain fixture.
const KG_CHAIN_FIXTURE_HOPS: usize = 5;
466
467fn run_kg_query_depth1(
468    conn: &Connection,
469    config: &BenchConfig,
470    sources: &[String],
471) -> Result<OperationResult> {
472    debug_assert!(
473        !sources.is_empty(),
474        "kg_query bench requires a seeded fixture"
475    );
476    let total = config.warmup + config.iterations;
477    let mut samples = Vec::with_capacity(config.iterations);
478    for i in 0..total {
479        let src = &sources[i % sources.len()];
480        let start = Instant::now();
481        let _ = db::kg_query(conn, src, 1, None, None, None, false)?;
482        let elapsed = start.elapsed();
483        if i >= config.warmup {
484            samples.push(elapsed);
485        }
486    }
487    Ok(percentile_summary(
488        Operation::KgQueryDepth1,
489        &samples,
490        config.scale,
491    ))
492}
493
494fn run_kg_query_chain(
495    conn: &Connection,
496    config: &BenchConfig,
497    sources: &[String],
498    operation: Operation,
499    max_depth: usize,
500) -> Result<OperationResult> {
501    debug_assert!(
502        !sources.is_empty(),
503        "kg_query chain bench requires a seeded fixture"
504    );
505    let total = config.warmup + config.iterations;
506    let mut samples = Vec::with_capacity(config.iterations);
507    for i in 0..total {
508        let src = &sources[i % sources.len()];
509        let start = Instant::now();
510        let _ = db::kg_query(conn, src, max_depth, None, None, None, false)?;
511        let elapsed = start.elapsed();
512        if i >= config.warmup {
513            samples.push(elapsed);
514        }
515    }
516    Ok(percentile_summary(operation, &samples, config.scale))
517}
518
519fn run_kg_timeline(
520    conn: &Connection,
521    config: &BenchConfig,
522    sources: &[String],
523) -> Result<OperationResult> {
524    debug_assert!(
525        !sources.is_empty(),
526        "kg_timeline bench requires a seeded fixture"
527    );
528    let total = config.warmup + config.iterations;
529    let mut samples = Vec::with_capacity(config.iterations);
530    for i in 0..total {
531        let src = &sources[i % sources.len()];
532        let start = Instant::now();
533        let _ = db::kg_timeline(conn, src, None, None, None)?;
534        let elapsed = start.elapsed();
535        if i >= config.warmup {
536            samples.push(elapsed);
537        }
538    }
539    Ok(percentile_summary(
540        Operation::KgTimeline,
541        &samples,
542        config.scale,
543    ))
544}
545
546/// Seed the in-process KG fixture: `KG_FIXTURE_SOURCES` source memories,
547/// each with `KG_FIXTURE_LINKS_PER_SOURCE` outbound links to distinct
548/// targets. Every link sets `valid_from` so `kg_timeline` (which skips
549/// rows with NULL `valid_from`) sees the full corpus. Returns the source
550/// IDs so the runners can hand them to `kg_query` / `kg_timeline`.
551fn seed_kg_fixture(conn: &Connection, namespace: &str) -> Result<Vec<String>> {
552    let mut sources = Vec::with_capacity(KG_FIXTURE_SOURCES);
553    for s in 0..KG_FIXTURE_SOURCES {
554        let src = synth_memory(namespace, s, "kg-src");
555        // `db::insert` upserts on `(title, namespace)` and returns the
556        // canonical id, which differs from `src.id` if the row already
557        // exists. Use the returned id so the fixture remains correct
558        // even when `run()` is invoked twice against the same conn.
559        let src_id = db::insert(conn, &src)?;
560        for t in 0..KG_FIXTURE_LINKS_PER_SOURCE {
561            let target_idx = s * KG_FIXTURE_LINKS_PER_SOURCE + t;
562            let tgt = synth_memory(namespace, target_idx, "kg-tgt");
563            let tgt_id = db::insert(conn, &tgt)?;
564            // `db::create_link` stamps `created_at` and `valid_from` to
565            // the current wall clock — sufficient for `kg_timeline`
566            // (which skips rows with NULL `valid_from`).
567            db::create_link(
568                conn,
569                &src_id,
570                &tgt_id,
571                crate::models::MemoryLinkRelation::RelatedTo.as_str(),
572            )?;
573        }
574        sources.push(src_id);
575    }
576    Ok(sources)
577}
578
579/// Seed the linear-chain KG fixture used by the depth=3 / depth=5
580/// runners: `KG_CHAIN_FIXTURE_CHAINS` chains, each
581/// `KG_CHAIN_FIXTURE_HOPS` links long. Every node and link uses titles
582/// disjoint from the fan-out fixture's `kg-src` / `kg-tgt` prefixes, so
583/// both fixtures coexist in the same connection without colliding on
584/// the `(title, namespace)` upsert. Returns the source IDs (one per
585/// chain) so the runners can drive `kg_query` against them.
586fn seed_kg_chain_fixture(conn: &Connection, namespace: &str) -> Result<Vec<String>> {
587    let mut sources = Vec::with_capacity(KG_CHAIN_FIXTURE_CHAINS);
588    for c in 0..KG_CHAIN_FIXTURE_CHAINS {
589        let mut prev_id = {
590            let head = synth_memory(namespace, c, "kg-chain-src");
591            db::insert(conn, &head)?
592        };
593        let chain_head_id = prev_id.clone();
594        for h in 0..KG_CHAIN_FIXTURE_HOPS {
595            let node_idx = c * KG_CHAIN_FIXTURE_HOPS + h;
596            let next = synth_memory(namespace, node_idx, "kg-chain-node");
597            let next_id = db::insert(conn, &next)?;
598            db::create_link(
599                conn,
600                &prev_id,
601                &next_id,
602                crate::models::MemoryLinkRelation::RelatedTo.as_str(),
603            )?;
604            prev_id = next_id;
605        }
606        sources.push(chain_head_id);
607    }
608    Ok(sources)
609}
610
611fn seed_corpus(conn: &Connection, namespace: &str, prefix: &str, count: usize) -> Result<()> {
612    for i in 0..count {
613        let mem = synth_memory(namespace, i, prefix);
614        db::insert(conn, &mem)?;
615    }
616    Ok(())
617}
618
619fn synth_memory(namespace: &str, i: usize, prefix: &str) -> Memory {
620    let now = chrono::Utc::now().to_rfc3339();
621    Memory {
622        id: uuid::Uuid::new_v4().to_string(),
623        tier: Tier::Long,
624        namespace: namespace.to_string(),
625        title: format!("bench-{prefix}-{i}"),
626        content: format!(
627            "bench memory {i} content about topic {} category {} for {prefix} workload",
628            i % 50,
629            i % 10
630        ),
631        tags: vec![],
632        priority: i32::try_from((i % 9) + 1).unwrap_or(5),
633        confidence: 1.0,
634        source: "bench".to_string(),
635        access_count: 0,
636        created_at: now.clone(),
637        updated_at: now,
638        last_accessed_at: None,
639        expires_at: None,
640        metadata: serde_json::json!({"agent_id": "bench"}),
641        reflection_depth: 0,
642        memory_kind: crate::models::MemoryKind::Observation,
643        entity_id: None,
644        persona_version: None,
645        citations: Vec::new(),
646        source_uri: None,
647        source_span: None,
648        confidence_source: ConfidenceSource::CallerProvided,
649        confidence_signals: None,
650        confidence_decayed_at: None,
651        version: 1,
652    }
653}
654
655fn percentile_summary(
656    operation: Operation,
657    samples: &[Duration],
658    // #1579 B8 — corpus scale of this run; selects the budget bucket.
659    scale: Option<usize>,
660) -> OperationResult {
661    debug_assert!(
662        !samples.is_empty(),
663        "bench operation produced no samples; iterations must be > 0"
664    );
665    let mut sorted: Vec<f64> = samples.iter().map(duration_ms).collect();
666    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
667    let p50 = percentile(&sorted, 0.50);
668    let p95 = percentile(&sorted, 0.95);
669    let p99 = percentile(&sorted, 0.99);
670    // #1579 B8 — both the reported target and the verdict budget come
671    // from the scale-aware resolver; `scale == None` keeps the legacy
672    // budgets byte-for-byte.
673    let target = operation.target_p95_ms_at_scale(scale);
674    // Per issue #1193: the pass/fail verdict uses the runner-effective
675    // budget so the macOS GHA pool's higher I/O variance doesn't blow
676    // a clean run. The reported `target_p95_ms` keeps the canonical
677    // PERFORMANCE.md value so dashboards / baselines stay stable.
678    let effective_target = operation.effective_target_p95_ms_at_scale(scale);
679    let status = if p95 <= effective_target * P95_TOLERANCE {
680        Status::Pass
681    } else {
682        Status::Fail
683    };
684    OperationResult {
685        operation,
686        label: operation.label(),
687        target_p95_ms: target,
688        measured_p50_ms: p50,
689        measured_p95_ms: p95,
690        measured_p99_ms: p99,
691        samples: sorted.len(),
692        status,
693    }
694}
695
/// Convert a [`Duration`] to fractional milliseconds. Takes a
/// reference so it can be passed straight to `Iterator::map` over a
/// slice of durations.
fn duration_ms(d: &Duration) -> f64 {
    1000.0 * d.as_secs_f64()
}
700
/// Linearly interpolated quantile of an ascending-sorted sample.
///
/// `q` is the quantile as a fraction (`0.50`, `0.95`, …). The rank is
/// `q * (len - 1)`; when it lands between two samples the result is
/// interpolated linearly between them. An empty slice yields `0.0`
/// and a one-element slice yields that element.
///
/// `q` is clamped to `[0.0, 1.0]`, so an out-of-range quantile returns
/// the smallest / largest sample instead of indexing past the end of
/// the slice.
#[allow(
    clippy::cast_precision_loss,
    clippy::cast_sign_loss,
    clippy::cast_possible_truncation
)]
fn percentile(sorted: &[f64], q: f64) -> f64 {
    if sorted.is_empty() {
        return 0.0;
    }
    if sorted.len() == 1 {
        return sorted[0];
    }
    // Without the clamp, `q > 1.0` pushes `rank.ceil()` beyond
    // `len - 1` and the index below panics.
    let q = q.clamp(0.0, 1.0);
    let rank = q * (sorted.len() as f64 - 1.0);
    let lo = rank.floor() as usize;
    let hi = rank.ceil() as usize;
    if lo == hi {
        // The rank hit a sample exactly; nothing to interpolate.
        return sorted[lo];
    }
    let frac = rank - lo as f64;
    sorted[lo] + (sorted[hi] - sorted[lo]) * frac
}
722
723/// Render a results table to a string in the same shape used in the
724/// `PERFORMANCE.md` "Operator Self-Verification" example.
725#[must_use]
726pub fn render_table(results: &[OperationResult]) -> String {
727    let mut out = String::new();
728    out.push_str(
729        "Operation                       Target (p95)   Measured (p95)   p50      p99      Status\n",
730    );
731    out.push_str(
732        "─────────────────────────────────────────────────────────────────────────────────────────\n",
733    );
734    for r in results {
735        let status_str = match r.status {
736            Status::Pass => "PASS",
737            Status::Fail => "FAIL",
738        };
739        // target budgets are documented as small integer ms; rounding
740        // to the nearest int ms is what the table in PERFORMANCE.md
741        // shows. Saturating cast guards against pathological future
742        // changes to a non-integer or huge value.
743        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
744        let target_ms = r.target_p95_ms.round() as i64;
745        let line = format!(
746            "{:<30}  < {:>4} ms       {:>7.1} ms       {:>5.1}    {:>5.1}    {}\n",
747            r.label, target_ms, r.measured_p95_ms, r.measured_p50_ms, r.measured_p99_ms, status_str
748        );
749        out.push_str(&line);
750    }
751    out
752}
753
754/// Subset of [`OperationResult`] retained when loading a previous run
755/// for `--baseline` comparison. Only the fields the regression check
756/// actually consumes are required, so any superset of those fields
757/// (the full `bench --json` output included) deserializes cleanly.
758#[derive(Debug, Clone, Deserialize)]
759pub struct BaselineRecord {
760    pub operation: Operation,
761    pub measured_p95_ms: f64,
762}
763
764/// Top-level shape of a `bench --json` payload, used to thread the
765/// `results` array out for [`load_baseline`]. The other top-level
766/// fields (`iterations`, `warmup`, anything future runs add) are
767/// ignored on purpose so older / newer JSON shapes load without
768/// migration churn.
769#[derive(Debug, Clone, Deserialize)]
770struct BaselineFile {
771    results: Vec<BaselineRecord>,
772}
773
774/// Per-operation regression row produced by
775/// [`compare_against_baseline`].
776#[derive(Debug, Clone, Serialize)]
777pub struct Regression {
778    pub operation: Operation,
779    /// Pretty label, duplicated for JSON consumers.
780    pub label: &'static str,
781    pub baseline_p95_ms: f64,
782    pub measured_p95_ms: f64,
783    pub delta_pct: f64,
784    pub threshold_pct: f64,
785    pub regressed: bool,
786}
787
788/// Load a previously emitted `bench --json` payload from disk.
789///
790/// # Errors
791///
792/// Returns an error if the file cannot be read or the JSON cannot be
793/// parsed into the [`BaselineFile`] shape.
794pub fn load_baseline(path: &Path) -> Result<Vec<BaselineRecord>> {
795    let raw = std::fs::read_to_string(path)
796        .with_context(|| format!("failed to read baseline file: {}", path.display()))?;
797    let file: BaselineFile = serde_json::from_str(&raw)
798        .with_context(|| format!("failed to parse baseline JSON: {}", path.display()))?;
799    Ok(file.results)
800}
801
802/// Compare a fresh run against a baseline. Operations missing from the
803/// baseline are skipped silently (e.g. a new bench row added since the
804/// baseline was captured). The returned `Vec` preserves the order of
805/// `current` and only includes ops present in both.
806#[must_use]
807pub fn compare_against_baseline(
808    current: &[OperationResult],
809    baseline: &[BaselineRecord],
810    threshold_pct: f64,
811) -> Vec<Regression> {
812    let mut out = Vec::with_capacity(current.len());
813    for r in current {
814        let Some(b) = baseline.iter().find(|b| b.operation == r.operation) else {
815            continue;
816        };
817        // Treat a non-positive baseline as "no signal" so we never
818        // divide by zero or produce a nonsense -100% delta. Any current
819        // measurement against a zero baseline is reported as 0% delta
820        // rather than infinity — the absolute-budget guard already
821        // catches actual breakage.
822        let delta_pct = if b.measured_p95_ms > 0.0 {
823            (r.measured_p95_ms - b.measured_p95_ms) / b.measured_p95_ms * 100.0
824        } else {
825            0.0
826        };
827        let regressed = delta_pct > threshold_pct;
828        out.push(Regression {
829            operation: r.operation,
830            label: r.operation.label(),
831            baseline_p95_ms: b.measured_p95_ms,
832            measured_p95_ms: r.measured_p95_ms,
833            delta_pct,
834            threshold_pct,
835            regressed,
836        });
837    }
838    out
839}
840
841/// Render a regression table to a string, mirroring the layout of
842/// [`render_table`].
843#[must_use]
844pub fn render_regression_table(rows: &[Regression]) -> String {
845    let mut out = String::new();
846    out.push_str(
847        "Operation                       Baseline (p95)   Measured (p95)   Delta     Status\n",
848    );
849    out.push_str(
850        "─────────────────────────────────────────────────────────────────────────────────\n",
851    );
852    for r in rows {
853        let status_str = if r.regressed { "REGRESSION" } else { "OK" };
854        let line = format!(
855            "{:<30}  {:>10.1} ms     {:>10.1} ms    {:>+6.1}%   {}\n",
856            r.label, r.baseline_p95_ms, r.measured_p95_ms, r.delta_pct, status_str
857        );
858        out.push_str(&line);
859    }
860    out
861}
862
863/// Append a benchmark result to a JSONL history file.
864/// Creates the file and parent directories if missing.
865/// Each line is a self-describing JSON object with `captured_at`, `iterations`,
866/// `warmup`, and `results` array.
867pub fn append_history(
868    path: &std::path::Path,
869    captured_at: &str,
870    iterations: usize,
871    warmup: usize,
872    // #1579 B8 — corpus scale of the recorded run (`null` = default
873    // workload) so downstream regression tooling can stratify history
874    // entries per scale bucket.
875    scale: Option<usize>,
876    results: &[OperationResult],
877) -> Result<()> {
878    use std::fs::OpenOptions;
879    use std::io::Write;
880
881    // Create parent directories if needed
882    if let Some(parent) = path.parent()
883        && !parent.as_os_str().is_empty()
884    {
885        std::fs::create_dir_all(parent)?;
886    }
887
888    let entry = serde_json::json!({
889        "captured_at": captured_at,
890        "iterations": iterations,
891        "warmup": warmup,
892        "scale": scale,
893        "results": results,
894    });
895
896    let mut file = OpenOptions::new().create(true).append(true).open(path)?;
897
898    writeln!(file, "{}", serde_json::to_string(&entry)?)?;
899    Ok(())
900}
901
/// Unit tests for the bench runner: percentile math, budget tables,
/// fixture seeding, baseline comparison and table rendering.
///
/// Gated on `cfg(test)` so none of this is compiled into non-test
/// builds; the helpers therefore need no `dead_code` allowances.
#[cfg(test)]
#[allow(clippy::wildcard_imports)]
mod tests {
    use super::*;
    use crate::db;

    /// Opens a throwaway in-memory database through `db::open`.
    fn fresh_conn() -> Connection {
        db::open(Path::new(":memory:")).unwrap()
    }

    /// Small default-workload config that keeps test runs short.
    fn small_config() -> BenchConfig {
        BenchConfig {
            iterations: 30,
            warmup: 5,
            namespace: "bench-test".to_string(),
            scale: None,
        }
    }

    #[test]
    fn percentile_interpolates() {
        let s = vec![1.0, 2.0, 3.0, 4.0];
        assert!((percentile(&s, 0.50) - 2.5).abs() < 1e-9);
        assert!((percentile(&s, 0.0) - 1.0).abs() < 1e-9);
        assert!((percentile(&s, 1.0) - 4.0).abs() < 1e-9);
    }

    #[test]
    fn percentile_handles_singleton_and_empty() {
        assert!((percentile(&[], 0.5) - 0.0).abs() < 1e-9);
        assert!((percentile(&[42.0], 0.99) - 42.0).abs() < 1e-9);
    }

    #[test]
    fn run_returns_all_seven_results() {
        let conn = fresh_conn();
        let results = run(&conn, &small_config()).unwrap();
        assert_eq!(results.len(), 7);
        assert_eq!(results[0].operation, Operation::StoreNoEmbedding);
        assert_eq!(results[1].operation, Operation::SearchFts);
        assert_eq!(results[2].operation, Operation::RecallHot);
        assert_eq!(results[3].operation, Operation::KgQueryDepth1);
        assert_eq!(results[4].operation, Operation::KgQueryDepth3);
        assert_eq!(results[5].operation, Operation::KgQueryDepth5);
        assert_eq!(results[6].operation, Operation::KgTimeline);
        for r in &results {
            assert_eq!(r.samples, 30);
            assert!(r.measured_p50_ms <= r.measured_p95_ms);
            assert!(r.measured_p95_ms <= r.measured_p99_ms);
            assert!(r.target_p95_ms > 0.0);
        }
    }

    #[test]
    fn status_is_fail_when_p95_over_tolerance() {
        let r = OperationResult {
            operation: Operation::StoreNoEmbedding,
            label: Operation::StoreNoEmbedding.label(),
            target_p95_ms: 20.0,
            measured_p50_ms: 5.0,
            measured_p95_ms: 25.0,
            measured_p99_ms: 30.0,
            samples: 100,
            status: Status::Fail,
        };
        assert_eq!(r.status, Status::Fail);
        // 25 > 20 * 1.10 = 22 → Fail
        let recomputed = if 25.0_f64 <= 20.0 * P95_TOLERANCE {
            Status::Pass
        } else {
            Status::Fail
        };
        assert_eq!(recomputed, Status::Fail);
    }

    #[test]
    fn status_is_pass_within_tolerance() {
        // 21 ms over 20 ms budget = 5% over → still PASS (under 10%).
        let recomputed = if 21.0_f64 <= 20.0 * P95_TOLERANCE {
            Status::Pass
        } else {
            Status::Fail
        };
        assert_eq!(recomputed, Status::Pass);
    }

    #[test]
    fn render_table_includes_all_operations() {
        let conn = fresh_conn();
        let results = run(&conn, &small_config()).unwrap();
        let table = render_table(&results);
        assert!(table.contains("memory_store (no embedding)"));
        assert!(table.contains("memory_search (FTS5)"));
        assert!(table.contains("memory_recall (hot, depth=1)"));
        assert!(table.contains("memory_kg_query (depth=1)"));
        assert!(table.contains("memory_kg_query (depth=3)"));
        assert!(table.contains("memory_kg_query (depth=5)"));
        assert!(table.contains("memory_kg_timeline"));
        assert!(table.contains("Status"));
    }

    #[test]
    fn operation_targets_match_performance_md() {
        // Pinned to PERFORMANCE.md — if you change a budget, change both.
        assert!((Operation::StoreNoEmbedding.target_p95_ms() - 20.0).abs() < 1e-9);
        assert!((Operation::SearchFts.target_p95_ms() - 100.0).abs() < 1e-9);
        assert!((Operation::RecallHot.target_p95_ms() - 50.0).abs() < 1e-9);
        assert!((Operation::KgQueryDepth1.target_p95_ms() - 100.0).abs() < 1e-9);
        assert!((Operation::KgQueryDepth3.target_p95_ms() - 100.0).abs() < 1e-9);
        assert!((Operation::KgQueryDepth5.target_p95_ms() - 250.0).abs() < 1e-9);
        assert!((Operation::KgTimeline.target_p95_ms() - 100.0).abs() < 1e-9);
    }

    /// Issue #1193 — the effective budget the pass/fail verdict uses
    /// is the canonical budget × `MACOS_BUDGET_MULT`. On Linux/Windows
    /// the multiplier is 1.0 (effective == canonical); on macOS the
    /// multiplier is 3.0 (effective == 3 × canonical). Regression-pins
    /// the wiring so a future refactor can't silently revert the
    /// platform-aware verdict path.
    #[test]
    fn effective_target_applies_macos_multiplier() {
        for op in [
            Operation::StoreNoEmbedding,
            Operation::SearchFts,
            Operation::RecallHot,
            Operation::KgQueryDepth1,
            Operation::KgQueryDepth3,
            Operation::KgQueryDepth5,
            Operation::KgTimeline,
        ] {
            let expected = op.target_p95_ms() * MACOS_BUDGET_MULT;
            assert!(
                (op.effective_target_p95_ms() - expected).abs() < 1e-9,
                "effective budget for {:?} = {} (expected {})",
                op,
                op.effective_target_p95_ms(),
                expected,
            );
        }
        #[cfg(target_os = "macos")]
        assert!((MACOS_BUDGET_MULT - 3.0).abs() < 1e-9);
        #[cfg(not(target_os = "macos"))]
        assert!((MACOS_BUDGET_MULT - 1.0).abs() < 1e-9);
    }

    /// #1579 B8 — pins the per-scale budget table to the values
    /// published in `PERFORMANCE.md` §"Corpus-scale budgets". If you
    /// change a scale budget, change both.
    #[test]
    fn operation_scale_targets_match_performance_md() {
        let at_gate_scale = Some(CI_SCALE_GATE_ROWS);
        assert!(
            (Operation::StoreNoEmbedding.target_p95_ms_at_scale(at_gate_scale) - 120.0).abs()
                < 1e-9
        );
        assert!((Operation::SearchFts.target_p95_ms_at_scale(at_gate_scale) - 60.0).abs() < 1e-9);
        assert!((Operation::RecallHot.target_p95_ms_at_scale(at_gate_scale) - 80.0).abs() < 1e-9);
        // KG fixtures are scale-independent → canonical budgets hold.
        for op in [
            Operation::KgQueryDepth1,
            Operation::KgQueryDepth3,
            Operation::KgQueryDepth5,
            Operation::KgTimeline,
        ] {
            assert!(
                (op.target_p95_ms_at_scale(at_gate_scale) - op.target_p95_ms()).abs() < 1e-9,
                "{op:?} must keep its canonical budget at scale"
            );
        }
        // `None` (default workload) keeps the legacy budgets.
        assert!((Operation::RecallHot.target_p95_ms_at_scale(None) - 50.0).abs() < 1e-9);
    }

    /// #1579 B8 — bucket resolution: a request at or below a pinned
    /// scale selects that row; a request beyond the largest pinned
    /// scale falls back to the largest row (best-effort).
    #[test]
    fn issue_1579_b8_scale_budget_bucket_resolution() {
        assert_eq!(scale_budgets_for(500).scale, CI_SCALE_GATE_ROWS);
        assert_eq!(
            scale_budgets_for(CI_SCALE_GATE_ROWS).scale,
            CI_SCALE_GATE_ROWS
        );
        assert_eq!(scale_budgets_for(MAX_SCALE).scale, CI_SCALE_GATE_ROWS);
    }

    /// #1579 B8 — a `--scale` run actually seeds the scratch corpus
    /// (the P1 failure mode was a bench that never grew the table) and
    /// gates the three corpus-sensitive ops against the scale budgets.
    #[test]
    fn issue_1579_b8_scale_run_seeds_corpus_and_uses_scale_budgets() {
        let conn = fresh_conn();
        let ns = "bench-scale-test";
        let config = BenchConfig {
            iterations: 10,
            warmup: 2,
            namespace: ns.to_string(),
            scale: Some(300),
        };
        let results = run(&conn, &config).unwrap();
        assert_eq!(results.len(), 7);
        let seeded: i64 = conn
            .query_row(
                "SELECT COUNT(*) FROM memories WHERE namespace = ?1",
                [ns],
                |r| r.get(0),
            )
            .unwrap();
        assert!(
            seeded >= 300,
            "scale run must seed the scratch corpus; found {seeded} rows"
        );
        // Scale budgets reported (300 resolves into the 10k bucket).
        let store = &results[0];
        assert_eq!(store.operation, Operation::StoreNoEmbedding);
        assert!((store.target_p95_ms - 120.0).abs() < 1e-9);
        let search = &results[1];
        assert!((search.target_p95_ms - 60.0).abs() < 1e-9);
        let recall = &results[2];
        assert!((recall.target_p95_ms - 80.0).abs() < 1e-9);
        // KG rows keep canonical budgets.
        assert!((results[3].target_p95_ms - 100.0).abs() < 1e-9);
        assert!((results[5].target_p95_ms - 250.0).abs() < 1e-9);
    }

    #[test]
    fn seed_kg_chain_fixture_traverses_to_max_depth() {
        let conn = fresh_conn();
        let sources = seed_kg_chain_fixture(&conn, "kg-chain-fixture-test").unwrap();
        assert_eq!(sources.len(), KG_CHAIN_FIXTURE_CHAINS);
        // Every chain must yield exactly `KG_CHAIN_FIXTURE_HOPS` reachable
        // nodes at depth=KG_CHAIN_FIXTURE_HOPS — that's what justifies the
        // depth=5 budget bucket. depth=3 must reach exactly 3 nodes.
        for src in &sources {
            let depth5 =
                db::kg_query(&conn, src, KG_CHAIN_FIXTURE_HOPS, None, None, None, false).unwrap();
            assert_eq!(
                depth5.len(),
                KG_CHAIN_FIXTURE_HOPS,
                "depth={KG_CHAIN_FIXTURE_HOPS} on a {KG_CHAIN_FIXTURE_HOPS}-hop chain must reach every node"
            );
            let depth3 = db::kg_query(&conn, src, 3, None, None, None, false).unwrap();
            assert_eq!(
                depth3.len(),
                3,
                "depth=3 on a {KG_CHAIN_FIXTURE_HOPS}-hop chain must reach exactly 3 follow-on nodes"
            );
        }
    }

    #[test]
    fn seed_kg_fixture_populates_sources_and_links() {
        let conn = fresh_conn();
        let sources = seed_kg_fixture(&conn, "kg-fixture-test").unwrap();
        assert_eq!(sources.len(), KG_FIXTURE_SOURCES);
        // Every source carries the expected fan-out, every link has a
        // non-null `valid_from` (otherwise `kg_timeline` would skip it).
        for src in &sources {
            let nodes = db::kg_query(&conn, src, 1, None, None, None, false).unwrap();
            assert_eq!(nodes.len(), KG_FIXTURE_LINKS_PER_SOURCE);
            let timeline = db::kg_timeline(&conn, src, None, None, None).unwrap();
            assert_eq!(timeline.len(), KG_FIXTURE_LINKS_PER_SOURCE);
            for ev in &timeline {
                // `kg_timeline` filters out NULL `valid_from` rows in SQL,
                // so any returned event must carry a non-empty stamp.
                assert!(
                    !ev.valid_from.is_empty(),
                    "kg fixture must stamp valid_from on every link"
                );
            }
        }
    }

    /// Builds a passing `OperationResult` for `op` whose p50/p99 are
    /// derived from the given p95 (half and 1.1× respectively).
    fn synthetic_result(op: Operation, p95: f64) -> OperationResult {
        OperationResult {
            operation: op,
            label: op.label(),
            target_p95_ms: op.target_p95_ms(),
            measured_p50_ms: p95 / 2.0,
            measured_p95_ms: p95,
            measured_p99_ms: p95 * 1.1,
            samples: 100,
            status: Status::Pass,
        }
    }

    /// Builds a baseline record for `op` with the given p95.
    fn synthetic_baseline(op: Operation, p95: f64) -> BaselineRecord {
        BaselineRecord {
            operation: op,
            measured_p95_ms: p95,
        }
    }

    #[test]
    fn baseline_compare_flags_above_threshold() {
        // 12% slowdown over baseline at default 10% threshold → REGRESSION.
        let current = vec![synthetic_result(Operation::StoreNoEmbedding, 11.2)];
        let baseline = vec![synthetic_baseline(Operation::StoreNoEmbedding, 10.0)];
        let rows = compare_against_baseline(&current, &baseline, 10.0);
        assert_eq!(rows.len(), 1);
        assert!(rows[0].regressed);
        assert!((rows[0].delta_pct - 12.0).abs() < 1e-9);
    }

    #[test]
    fn baseline_compare_passes_within_threshold() {
        // 8% slowdown over baseline at default 10% threshold → OK.
        let current = vec![synthetic_result(Operation::StoreNoEmbedding, 10.8)];
        let baseline = vec![synthetic_baseline(Operation::StoreNoEmbedding, 10.0)];
        let rows = compare_against_baseline(&current, &baseline, 10.0);
        assert_eq!(rows.len(), 1);
        assert!(!rows[0].regressed);
    }

    #[test]
    fn baseline_compare_speedup_is_negative_delta() {
        // Faster than baseline → negative delta, never a regression.
        let current = vec![synthetic_result(Operation::SearchFts, 8.0)];
        let baseline = vec![synthetic_baseline(Operation::SearchFts, 10.0)];
        let rows = compare_against_baseline(&current, &baseline, 10.0);
        assert_eq!(rows.len(), 1);
        assert!(!rows[0].regressed);
        assert!((rows[0].delta_pct + 20.0).abs() < 1e-9);
    }

    #[test]
    fn baseline_compare_skips_ops_missing_in_baseline() {
        // A new op added since the baseline was captured shouldn't crash
        // or appear as a regression.
        let current = vec![
            synthetic_result(Operation::StoreNoEmbedding, 10.0),
            synthetic_result(Operation::KgQueryDepth5, 200.0),
        ];
        let baseline = vec![synthetic_baseline(Operation::StoreNoEmbedding, 10.0)];
        let rows = compare_against_baseline(&current, &baseline, 10.0);
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].operation, Operation::StoreNoEmbedding);
    }

    #[test]
    fn baseline_compare_handles_zero_baseline() {
        // Pathological zero baseline: report 0% delta rather than
        // dividing by zero. Absolute-budget guard still catches
        // genuinely-broken measurements.
        let current = vec![synthetic_result(Operation::SearchFts, 5.0)];
        let baseline = vec![synthetic_baseline(Operation::SearchFts, 0.0)];
        let rows = compare_against_baseline(&current, &baseline, 10.0);
        assert_eq!(rows.len(), 1);
        assert!(!rows[0].regressed);
        assert!((rows[0].delta_pct - 0.0).abs() < 1e-9);
    }

    #[test]
    fn load_baseline_round_trips_json_payload() {
        // Mirror the shape `bench --json` actually emits — it must
        // round-trip through `load_baseline` so CI artifacts work as
        // baselines without preprocessing.
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("baseline.json");
        let payload = serde_json::json!({
            "iterations": 200,
            "warmup": 20,
            "results": [
                {
                    "operation": "store_no_embedding",
                    "label": "memory_store (no embedding)",
                    "target_p95_ms": 20.0,
                    "measured_p50_ms": 4.0,
                    "measured_p95_ms": 9.0,
                    "measured_p99_ms": 11.0,
                    "samples": 200,
                    "status": "pass"
                },
                {
                    "operation": "search_fts",
                    "label": "memory_search (FTS5)",
                    "target_p95_ms": 100.0,
                    "measured_p50_ms": 12.0,
                    "measured_p95_ms": 31.0,
                    "measured_p99_ms": 45.0,
                    "samples": 200,
                    "status": "pass"
                }
            ]
        });
        std::fs::write(&path, serde_json::to_string_pretty(&payload).unwrap()).unwrap();
        let loaded = load_baseline(&path).unwrap();
        assert_eq!(loaded.len(), 2);
        assert_eq!(loaded[0].operation, Operation::StoreNoEmbedding);
        assert!((loaded[0].measured_p95_ms - 9.0).abs() < 1e-9);
        assert_eq!(loaded[1].operation, Operation::SearchFts);
        assert!((loaded[1].measured_p95_ms - 31.0).abs() < 1e-9);
    }

    #[test]
    fn render_regression_table_marks_regressions() {
        let rows = vec![
            Regression {
                operation: Operation::StoreNoEmbedding,
                label: Operation::StoreNoEmbedding.label(),
                baseline_p95_ms: 10.0,
                measured_p95_ms: 12.0,
                delta_pct: 20.0,
                threshold_pct: 10.0,
                regressed: true,
            },
            Regression {
                operation: Operation::SearchFts,
                label: Operation::SearchFts.label(),
                baseline_p95_ms: 30.0,
                measured_p95_ms: 31.0,
                delta_pct: 3.3,
                threshold_pct: 10.0,
                regressed: false,
            },
        ];
        let table = render_regression_table(&rows);
        assert!(table.contains("memory_store (no embedding)"));
        assert!(table.contains("memory_search (FTS5)"));
        assert!(table.contains("REGRESSION"));
        assert!(table.contains("OK"));
    }
}