Skip to main content

spg_storage/
lib.rs

1//! In-memory storage primitives.
2//!
3//! v0.3 is intentionally simple: a flat catalog of tables, each holding rows
4//! as `Vec<Value>` (positional, matching the table's `TableSchema`). No MVCC,
5//! no on-disk format — those land in later milestones.
6#![no_std]
7// v3.3.2 NEON path for l2_distance_sq (aarch64 only). Scoped allow:
8// `unsafe_code = "deny"` at workspace level stays in force for every
9// other crate.
10#![cfg_attr(target_arch = "aarch64", allow(unsafe_code))]
11
12extern crate alloc;
13
14pub mod bloom;
15pub mod halfvec;
16pub mod persistent;
17pub mod persistent_btree;
18pub mod quantize;
19pub mod row_locator;
20pub mod segment;
21
22pub use self::bloom::{BloomError, BloomFilter};
23pub use self::row_locator::{RowLocator, RowLocatorError};
24pub use self::segment::{
25    BRIN_SIDECAR_MAGIC, BrinSummary, OwnedSegment, SEGMENT_COMPRESS_ALGO_LZSS,
26    SEGMENT_COMPRESS_ALGO_NONE, SEGMENT_MAGIC, SEGMENT_MAGIC_V2, SEGMENT_PAGE_BYTES, SegmentError,
27    SegmentMeta, SegmentReader, derive_brin_summaries, encode_segment, wrap_v2_envelope,
28    wrap_v2_envelope_with_brin,
29};
30
31use alloc::collections::{BTreeMap, BTreeSet};
32use alloc::format;
33use alloc::string::String;
34use alloc::sync::Arc;
35use alloc::vec::Vec;
36use core::fmt;
37
38use self::persistent::PersistentVec;
39use self::persistent_btree::PersistentBTreeMap;
40
/// How the cells of a `DataType::Vector` column are laid out in memory.
///
/// This is the storage-side twin of `spg_sql::ast::VecEncoding`. It is
/// duplicated here so that storage carries no dependency on `spg-sql`;
/// the engine converts between the two when it executes DDL.
///
/// * `F32` — the pre-v6 default: every cell is a raw `Vec<f32>`.
/// * `Sq8` (v6.0.1) — every cell is an
///   `Sq8Vector { min, max, bytes: Vec<u8> }`. 4× smaller than `F32`,
///   with recall@10 ≥ 0.95 on natural embeddings (Gaussian /
///   unit-sphere corpora).
/// * `F16` (v6.0.3, spelled `HALF` in DDL) — every element is an
///   IEEE-754 binary16. 2× smaller than `F32`, bit-exact dequantise.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum VecEncoding {
    #[default]
    F32,
    Sq8,
    F16,
}

impl fmt::Display for VecEncoding {
    /// Writes the encoding keyword: `F32`, `SQ8`, or `HALF`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let keyword = match self {
            Self::F32 => "F32",
            Self::Sq8 => "SQ8",
            Self::F16 => "HALF",
        };
        f.write_str(keyword)
    }
}
69
70/// Runtime type tags. `Vector { dim, encoding }` / `Varchar(max)` /
71/// `Char(size)` are parameterised; the parameter travels with both
72/// the column schema and the on-wire serialised representation.
73#[derive(Debug, Clone, Copy, PartialEq, Eq)]
74pub enum DataType {
75    /// 16-bit signed. Backed by `Value::SmallInt(i16)`; arithmetic that
76    /// would overflow surfaces as a type error at INSERT time.
77    SmallInt,
78    Int,    // 32-bit signed
79    BigInt, // 64-bit signed
80    Float,  // f64 (PG double precision)
81    Text,
82    /// `VARCHAR(n)` — same byte representation as `Text`, but INSERT
83    /// rejects values longer than `n` Unicode characters.
84    Varchar(u32),
85    /// `CHAR(n)` — same representation as `Text`, but INSERT right-pads
86    /// with U+0020 to exactly `n` Unicode characters (or rejects when
87    /// the input is already longer).
88    Char(u32),
89    Bool,
90    /// pgvector-style fixed-dimension vector. `encoding` selects
91    /// the in-cell representation (`F32` = pre-v6 raw f32 buffer;
92    /// `Sq8` = v6.0.1 8-bit scalar-quantised). The DDL grammar
93    /// surfaces encoding via the optional `USING <encoding>`
94    /// clause: `VECTOR(128) USING SQ8`.
95    Vector {
96        dim: u32,
97        encoding: VecEncoding,
98    },
99    /// `NUMERIC(precision, scale)` — exact fixed-point decimal stored as
100    /// a scaled `i128`. `precision` caps total decimal digits, `scale`
101    /// fixes digits after the decimal point. v1.12 supports up to
102    /// precision 38 (the i128-safe ceiling). `NUMERIC` and `NUMERIC(p)`
103    /// surface as `Numeric { precision: p, scale: 0 }`.
104    Numeric {
105        precision: u8,
106        scale: u8,
107    },
108    /// `DATE` — calendar date with day precision, stored as `i32` days
109    /// since the Unix epoch (1970-01-01).
110    Date,
111    /// `TIMESTAMP` (a.k.a. `MySQL` `DATETIME`) — instant with microsecond
112    /// precision, stored as `i64` microseconds since the Unix epoch.
113    Timestamp,
114    /// `INTERVAL` — calendar-aware span (months + microseconds). v2.11
115    /// supports INTERVAL only as a runtime intermediate (literals,
116    /// arithmetic results); on-disk encoding is rejected so this branch
117    /// can't appear in a `ColumnSchema`.
118    Interval,
119    /// v4.9: `JSON` / `JSONB` — text-backed JSON document. We don't
120    /// parse the content (no path operators or jsonb functions yet) —
121    /// the column accepts any TEXT-compatible value and round-trips
122    /// it verbatim. Equivalent to `Text` storage with a distinct
123    /// type tag for the wire layer (PG OID 114).
124    Json,
125}
126
127impl fmt::Display for DataType {
128    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
129        match self {
130            Self::SmallInt => f.write_str("SMALLINT"),
131            Self::Int => f.write_str("INT"),
132            Self::BigInt => f.write_str("BIGINT"),
133            Self::Float => f.write_str("FLOAT"),
134            Self::Text => f.write_str("TEXT"),
135            Self::Varchar(n) => write!(f, "VARCHAR({n})"),
136            Self::Char(n) => write!(f, "CHAR({n})"),
137            Self::Bool => f.write_str("BOOL"),
138            Self::Vector { dim, encoding } => match encoding {
139                VecEncoding::F32 => write!(f, "VECTOR({dim})"),
140                VecEncoding::Sq8 => write!(f, "VECTOR({dim}) USING SQ8"),
141                VecEncoding::F16 => write!(f, "VECTOR({dim}) USING HALF"),
142            },
143            Self::Numeric { precision, scale } => {
144                if *scale == 0 {
145                    write!(f, "NUMERIC({precision})")
146                } else {
147                    write!(f, "NUMERIC({precision}, {scale})")
148                }
149            }
150            Self::Date => f.write_str("DATE"),
151            Self::Timestamp => f.write_str("TIMESTAMP"),
152            Self::Interval => f.write_str("INTERVAL"),
153            Self::Json => f.write_str("JSON"),
154        }
155    }
156}
157
158/// A row-cell value, including SQL `NULL`. `Float` uses `f64`; NaN compares
159/// non-equal to itself (PG behaviour) — `PartialEq` is derived so callers
160/// must opt into NaN-aware comparison if they need stronger guarantees.
161#[derive(Debug, Clone, PartialEq)]
162#[non_exhaustive]
163pub enum Value {
164    SmallInt(i16),
165    Int(i32),
166    BigInt(i64),
167    Float(f64),
168    Text(String),
169    Bool(bool),
170    Vector(Vec<f32>),
171    /// v6.0.1: 8-bit scalar-quantised vector cell. Lives in
172    /// columns declared `VECTOR(N) USING SQ8`. Layout per cell:
173    /// `Sq8Vector { min: f32, max: f32, bytes: Vec<u8> }` —
174    /// 4× compression vs `Vector(Vec<f32>)`. The wire layer
175    /// dequantises to `f32` on SELECT; INSERT path quantises
176    /// incoming `Vector(Vec<f32>)` cells into this variant.
177    Sq8Vector(crate::quantize::Sq8Vector),
178    /// v6.0.3: IEEE-754 binary16 vector cell. Lives in columns
179    /// declared `VECTOR(N) USING HALF`. Stores raw u16 LE bits
180    /// (2× compression vs `Vector(Vec<f32>)`). Wire / display
181    /// paths dequantise to f32 bit-exactly; INSERT path converts
182    /// incoming f32 vectors at the engine boundary.
183    HalfVector(crate::halfvec::HalfVector),
184    /// Exact fixed-point decimal. `scaled` holds the value as
185    /// `actual * 10^scale` so the storage type is always integral —
186    /// arithmetic never falls back to floating-point.
187    Numeric {
188        scaled: i128,
189        scale: u8,
190    },
191    /// Days since the Unix epoch (1970-01-01). Negative for earlier dates.
192    Date(i32),
193    /// Microseconds since the Unix epoch (1970-01-01T00:00:00Z).
194    Timestamp(i64),
195    /// Calendar span: `months` (variable-length) + `micros` (fixed-length).
196    /// Runtime-only — cannot appear in a stored row in v2.11.
197    Interval {
198        months: i32,
199        micros: i64,
200    },
201    /// v4.9 `JSON` — raw JSON text. No structural validation
202    /// happens at the storage layer; whatever the parser hands us
203    /// round-trips verbatim. Equality is byte-wise.
204    Json(String),
205    Null,
206}
207
208impl Value {
209    /// Type tag, or `None` for `NULL` (unknown at value level).
210    pub fn data_type(&self) -> Option<DataType> {
211        match self {
212            Self::SmallInt(_) => Some(DataType::SmallInt),
213            Self::Int(_) => Some(DataType::Int),
214            Self::BigInt(_) => Some(DataType::BigInt),
215            Self::Float(_) => Some(DataType::Float),
216            // `Text` covers both unbounded TEXT and bounded VARCHAR/CHAR
217            // — the constraint lives on the column schema, not the value.
218            Self::Text(_) => Some(DataType::Text),
219            Self::Bool(_) => Some(DataType::Bool),
220            Self::Vector(v) => Some(DataType::Vector {
221                dim: u32::try_from(v.len()).expect("vector dim ≤ u32"),
222                encoding: VecEncoding::F32,
223            }),
224            Self::Sq8Vector(q) => Some(DataType::Vector {
225                dim: u32::try_from(q.bytes.len()).expect("vector dim ≤ u32"),
226                encoding: VecEncoding::Sq8,
227            }),
228            Self::HalfVector(h) => Some(DataType::Vector {
229                dim: u32::try_from(h.dim()).expect("vector dim ≤ u32"),
230                encoding: VecEncoding::F16,
231            }),
232            // `Value::Numeric` doesn't carry its precision (the column
233            // schema does); we surface precision=0 as "unknown" and let
234            // the engine reconcile against the column type at coercion
235            // time.
236            Self::Numeric { scale, .. } => Some(DataType::Numeric {
237                precision: 0,
238                scale: *scale,
239            }),
240            Self::Date(_) => Some(DataType::Date),
241            Self::Timestamp(_) => Some(DataType::Timestamp),
242            Self::Interval { .. } => Some(DataType::Interval),
243            Self::Json(_) => Some(DataType::Json),
244            Self::Null => None,
245        }
246    }
247
248    pub const fn is_null(&self) -> bool {
249        matches!(self, Self::Null)
250    }
251}
252
253/// One table row — values are positional and must match
254/// `TableSchema.columns` in length and (modulo NULL) in `DataType`.
255#[derive(Debug, Clone, PartialEq)]
256pub struct Row {
257    pub values: Vec<Value>,
258}
259
260impl Row {
261    pub const fn new(values: Vec<Value>) -> Self {
262        Self { values }
263    }
264
265    pub fn len(&self) -> usize {
266        self.values.len()
267    }
268
269    pub fn is_empty(&self) -> bool {
270        self.values.is_empty()
271    }
272}
273
274#[derive(Debug, Clone, PartialEq)]
275pub struct ColumnSchema {
276    pub name: String,
277    pub ty: DataType,
278    pub nullable: bool,
279    /// Optional `DEFAULT` value, frozen at CREATE TABLE time. `None`
280    /// means "no default" (so omitted columns become NULL, or error
281    /// out when the column is NOT NULL).
282    pub default: Option<Value>,
283    /// MySQL-style `AUTO_INCREMENT`. When set, an INSERT that leaves
284    /// this column unbound (or sets it to NULL) gets the next integer
285    /// computed from the column's current max + 1.
286    pub auto_increment: bool,
287}
288
289#[derive(Debug, Clone, PartialEq)]
290pub struct TableSchema {
291    pub name: String,
292    pub columns: Vec<ColumnSchema>,
293    /// v6.7.2 — per-table hot-tier byte budget override. `None`
294    /// falls through to the global `SPG_HOT_TIER_BYTES` setting;
295    /// `Some(n)` overrides it for this specific table. Set via
296    /// `ALTER TABLE t SET hot_tier_bytes = X`. Persisted in
297    /// catalog FILE_VERSION 11+.
298    pub hot_tier_bytes: Option<u64>,
299    /// v7.6.1 — FOREIGN KEY constraints declared on this table.
300    /// Engine maintains this in lock-step with `spg-sql`'s parser
301    /// AST; the storage layer carries the on-disk shape so a
302    /// catalog snapshot round-trips without external mapping.
303    /// Persisted in catalog FILE_VERSION 13+. Older catalogs
304    /// deserialise with an empty vec.
305    pub foreign_keys: Vec<ForeignKeyConstraint>,
306}
307
308/// v7.6.1 — Storage-layer mirror of `spg_sql::ast::ForeignKeyConstraint`.
309/// The engine's CREATE TABLE path translates between the two; keeping
310/// them separate preserves the no-deps boundary between
311/// `spg-storage` and `spg-sql`.
312#[derive(Debug, Clone, PartialEq, Eq)]
313pub struct ForeignKeyConstraint {
314    /// Optional user-supplied constraint name (`CONSTRAINT <name>`
315    /// prefix). Used by `ALTER TABLE DROP CONSTRAINT <name>` in
316    /// v7.6.8; ignored by enforcement.
317    pub name: Option<String>,
318    /// Positions of local columns in this table's column list.
319    /// Same arity as `parent_columns`.
320    pub local_columns: Vec<usize>,
321    /// Referenced parent table name.
322    pub parent_table: String,
323    /// Positions of parent columns in the parent's column list.
324    /// Engine resolves these at CREATE TABLE time (after the parent
325    /// schema is known) so enforcement paths can skip the name
326    /// lookup on every row.
327    pub parent_columns: Vec<usize>,
328    /// Referential action when a parent row is deleted.
329    pub on_delete: FkAction,
330    /// Referential action when a parent row's referenced columns
331    /// are updated.
332    pub on_update: FkAction,
333}
334
/// v7.6.1 — tag naming a referential action. Storage-side counterpart
/// of `spg_sql::ast::FkAction`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FkAction {
    Restrict,
    Cascade,
    SetNull,
    SetDefault,
    NoAction,
}

impl FkAction {
    /// Tag byte written to disk for this action (v13 catalog appendix).
    pub const fn tag(self) -> u8 {
        match self {
            Self::Restrict => 0,
            Self::Cascade => 1,
            Self::SetNull => 2,
            Self::SetDefault => 3,
            Self::NoAction => 4,
        }
    }

    /// Inverse of [`Self::tag`]: decodes a tag byte, yielding `None`
    /// for any byte that does not name an action.
    pub const fn from_tag(b: u8) -> Option<Self> {
        match b {
            0 => Some(Self::Restrict),
            1 => Some(Self::Cascade),
            2 => Some(Self::SetNull),
            3 => Some(Self::SetDefault),
            4 => Some(Self::NoAction),
            _ => None,
        }
    }
}
367
368impl TableSchema {
369    pub fn column_position(&self, name: &str) -> Option<usize> {
370        self.columns.iter().position(|c| c.name == name)
371    }
372}
373
374/// Key type accepted by secondary indices. Float / NULL / Vector values
375/// can't participate in a B-tree index — `f64` is only `PartialOrd`, NULL
376/// has SQL-three-valued semantics, and Vector belongs to the (future) HNSW
377/// path. Index lookups on those columns fall back to full scan.
378#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
379pub enum IndexKey {
380    Int(i64),
381    Text(String),
382    Bool(bool),
383}
384
385impl IndexKey {
386    pub fn from_value(v: &Value) -> Option<Self> {
387        match v {
388            Value::SmallInt(n) => Some(Self::Int(i64::from(*n))),
389            Value::Int(n) => Some(Self::Int(i64::from(*n))),
390            Value::BigInt(n) => Some(Self::Int(*n)),
391            Value::Text(s) => Some(Self::Text(s.clone())),
392            Value::Bool(b) => Some(Self::Bool(*b)),
393            // Date/Timestamp use their integer storage repr as the
394            // index key — same order semantics, same comparison.
395            Value::Date(d) => Some(Self::Int(i64::from(*d))),
396            Value::Timestamp(t) => Some(Self::Int(*t)),
397            // Numeric isn't (yet) indexable — exact-decimal index keys
398            // would need a stable scale-normalised representation.
399            // Interval isn't index-eligible either (and can't reach this
400            // path through column storage anyway).
401            Value::Null
402            | Value::Float(_)
403            | Value::Vector(_)
404            | Value::Sq8Vector(_)
405            | Value::HalfVector(_)
406            | Value::Numeric { .. }
407            | Value::Interval { .. }
408            | Value::Json(_) => None,
409        }
410    }
411}
412
413/// A single-column secondary index. v2.0 carries either a B-tree map
414/// (the default — used for equality / range lookups on scalar columns)
415/// or a navigable-small-world graph (used for kNN over vector
416/// columns).
417#[derive(Debug, Clone)]
418pub struct Index {
419    pub name: String,
420    pub column_position: usize,
421    pub kind: IndexKind,
422    /// v6.8.0 — column positions of `INCLUDE (col1, col2, …)`
423    /// non-key columns. Carries the planner's "this query is
424    /// covered by the index" signal; lookup paths still resolve
425    /// via the `RowLocator` to fetch the row body, but EXPLAIN
426    /// surfaces the covered-scan annotation so operators can
427    /// confirm the planner sees the coverage.
428    ///
429    /// Empty `Vec` = no `INCLUDE` clause (the legacy shape). v12
430    /// catalog snapshots deserialise with an empty vec.
431    pub included_columns: Vec<usize>,
432    /// v6.8.1 — partial-index predicate stored as its canonical
433    /// Display form (the engine re-parses it on the maintenance
434    /// path). `None` = unconditional index (the legacy shape).
435    /// Persisted as `[u8 has_pred][u16 LE len][bytes]` on the
436    /// catalog snapshot (FILE_VERSION 12, appended after
437    /// `included_columns`).
438    pub partial_predicate: Option<String>,
439    /// v6.8.2 — expression-index key, stored as the expression's
440    /// canonical Display form. `None` = bare column-reference
441    /// index (the legacy shape). Persisted alongside
442    /// `partial_predicate` on the v12 catalog snapshot.
443    pub expression: Option<String>,
444}
445
/// Default value of M, the neighbour degree of the NSW graph. M is
/// chosen when the index is constructed and is persisted with it.
pub const NSW_DEFAULT_M: usize = 16;
449
/// v5.2.2: what a successful [`Catalog::freeze_oldest_to_cold`] call
/// hands back. By the time the report exists the catalog has already
/// been mutated: the hot rows are dropped, the segment is registered,
/// and the locators are flipped to Cold. The one thing left for the
/// caller is `segment_bytes` — write them to
/// `<db>.spg/segments/seg_<id>.spg` so that a later restart can
/// reload the segment through the v5.1 `SPG_PRELOAD_COLD_SEGMENT`
/// path. (The v5.3 manifest will subsume this manual step.)
#[derive(Debug, Clone)]
pub struct FreezeReport {
    /// Id that [`Catalog::load_segment_bytes`] allocated for the new
    /// cold-tier segment. Stable across the call's success path.
    pub segment_id: u32,
    /// How many rows moved hot → cold. Always equal to the `max_rows`
    /// the caller requested (the API is strict on the count).
    pub frozen_rows: usize,
    /// Hot-tier bytes the freeze reclaimed, i.e. the
    /// [`Table::hot_bytes`] difference between before and after. Handy
    /// as input to the freezer's budget check on its next tick.
    pub bytes_freed: u64,
    /// The encoded segment, byte-identical to the output of
    /// [`encode_segment`]. The catalog already keeps its own copy in
    /// `cold_segments`; handing the bytes over here lets the caller
    /// persist them without encoding a second time.
    pub segment_bytes: Vec<u8>,
}
475
476/// v6.7.4 — read-only output of [`Catalog::prepare_freeze_slice`].
477/// Carries every row body + key in a contiguous hot-row range,
478/// already encoded and sorted by PK so the coordinator's merge
479/// step is a k-way merge over already-sorted streams.
480///
481/// `Vec<FreezeSlice>` from N independent workers feeds
482/// [`Catalog::commit_freeze_slices`], which concats + encodes the
483/// merged segment + atomically swaps the catalog state.
484#[derive(Debug, Clone)]
485pub struct FreezeSlice {
486    /// Hot-row index range this slice covered (half-open, in the
487    /// table's `rows: PersistentVec` ordering at call time). The
488    /// commit step uses this to compute the union range that
489    /// gets passed to [`Table::delete_rows`].
490    pub row_range: core::ops::Range<usize>,
491    /// `(pk_u64, encoded_row_body, IndexKey)` triples, sorted
492    /// ascending by `pk_u64`. Per-slice sort happens inside
493    /// `prepare_freeze_slice`; the coordinator does only a
494    /// k-way merge to reach the global PK ordering
495    /// [`encode_segment`] requires.
496    pub rows: Vec<(u64, Vec<u8>, IndexKey)>,
497}
498
/// v6.7.3 — what a [`Catalog::compact_cold_segments`] call hands back.
/// When the report is returned the catalog has already been mutated:
/// the merged segment sits in `cold_segments`, the slots of the source
/// segments are tombstoned (`None`), and every BTree-index
/// `RowLocator::Cold` that used to point at a source now points at the
/// merged segment. What remains for the caller is to persist
/// `merged_segment_bytes` under
/// `<db>.spg/segments/seg_<merged_segment_id>.spg` and to update the
/// in-memory `segment_id → path` map (drop the source ids, add the
/// merged id), so that the next CHECKPOINT writes a manifest which no
/// longer lists the retired sources.
///
/// On a no-op (fewer than 2 candidate segments under the threshold)
/// `merged_segment_id` is `None`, `sources` is empty, and the catalog
/// was left untouched.
#[derive(Debug, Clone)]
pub struct CompactReport {
    /// Ids of the source segments that were merged and tombstoned.
    pub sources: Vec<u32>,
    /// Id allocated for the merged segment; `None` on a no-op.
    pub merged_segment_id: Option<u32>,
    /// Encoded bytes of the merged segment (empty on a no-op).
    pub merged_segment_bytes: Vec<u8>,
    /// How many rows ended up in the merged segment.
    pub merged_rows: usize,
    /// `Σ source.num_rows − merged_rows`: rows that were present in the
    /// source segment payloads but that no live BTree `Cold` locator
    /// referenced — DELETE'd-but-still-frozen rows that compaction
    /// garbage-collected during the merge.
    pub deleted_rows_pruned: usize,
    /// `Σ source.bytes() − merged.bytes()`: an estimate of the on-disk
    /// space the merge reclaims once the source segment files are
    /// GC'd. Computed with a saturating subtract, so never negative.
    pub bytes_reclaimed_estimate: u64,
}
534
535#[derive(Debug, Clone)]
536pub enum IndexKind {
537    /// v4.40: structural-sharing B-tree over `IndexKey`. Replaces the v0.8
538    /// `BTreeMap<IndexKey, Vec<usize>>` — `Index::clone` is now an `Arc`
539    /// bump regardless of index size, so `Catalog::clone` inside the
540    /// v4.34 auto-commit wrap stays O(1) even for tables with secondary
541    /// indices (the case that bottlenecked v4.39 at 1M rows in the
542    /// sweep).
543    ///
544    /// v5.1: value type widened from `Vec<usize>` to `Vec<RowLocator>` so
545    /// a single key can point to a mix of hot-tier rows (`RowLocator::Hot`,
546    /// equivalent to the pre-v5 `usize` row index) and cold-tier rows
547    /// (`RowLocator::Cold { segment_id, page_offset }`) once the v5.2
548    /// freezer starts producing them. Pre-v5.2 only `Hot` entries appear
549    /// — the on-disk encoding stays at `FILE_VERSION` 8 (raw u64 row index)
550    /// because every locator round-trips through `RowLocator::from_legacy_v8_u64`
551    /// without information loss. `FILE_VERSION` 9 with tagged encoding lands
552    /// alongside the first freezer commit (v5.1 step 2b / v5.2).
553    BTree(PersistentBTreeMap<IndexKey, Vec<RowLocator>>),
554    /// Navigable-small-world graph for vector kNN search.
555    Nsw(NswGraph),
556    /// v6.7.1 — BRIN (Block Range INdex). Pure metadata: BRIN
557    /// indexes carry NO in-memory key→locator map. The (min,
558    /// max) summaries live in each cold-tier segment's v2
559    /// envelope sidecar; the BRIN entry in `Table.indices` only
560    /// records THAT a BRIN index exists on this column so the
561    /// segment encoder + planner can opt into the summary path.
562    Brin {
563        /// The cell type at `column_position` at CREATE INDEX time.
564        /// Used by the planner to type-check WHERE-clause range
565        /// predicates against the BRIN-indexed column.
566        column_type: DataType,
567    },
568}
569
570/// Multi-layer HNSW graph (v2.13). Each node is assigned a `top_level`;
571/// it appears in layers `0..=top_level`. Higher layers are sparser, so
572/// search starts from the entry at the top layer, greedy-descends to
573/// layer 0, and beam-searches there. Layer 0 keeps a larger neighbour
574/// budget (`m_max_0 = 2 * m` per the HNSW paper); upper layers cap at
575/// `m`. The struct name stays `NswGraph` so external users / on-disk
576/// callers don't have to track a rename — the algorithm changed, the
577/// data slot didn't.
578#[derive(Debug, Clone)]
579pub struct NswGraph {
580    /// Max neighbours per node on layers ≥ 1.
581    pub m: usize,
582    /// Max neighbours on layer 0 (the dense bottom layer). HNSW
583    /// convention: `m_max_0 = 2 * m`.
584    pub m_max_0: usize,
585    /// Entry point — the node that sits on the topmost layer. Search
586    /// always starts here.
587    pub entry: Option<usize>,
588    /// Top layer of the entry node (== `layers.len() - 1` when populated).
589    pub entry_level: u8,
590    /// `levels[i]` = top layer of node `i`. Nodes whose vector cell is
591    /// NULL / non-Vector have `levels[i] = 0` and no neighbour entries.
592    ///
593    /// v5.5.0: backed by `PersistentVec` so `NswGraph::clone` (and the
594    /// `Catalog::clone` on every group-commit write that contains it) is O(1)
595    /// structural-sharing instead of an O(N) element copy.
596    pub levels: PersistentVec<u8>,
597    /// `layers[l][i]` = neighbours of node `i` at layer `l`. Inner vec
598    /// is empty when node `i` doesn't reach layer `l`.
599    ///
600    /// v5.5.0: the per-node middle dimension (the O(N) one) is a
601    /// `PersistentVec`; the outer layer dimension stays a plain `Vec`
602    /// (layer count ≤ 8, so its clone is O(1) in practice) and the inner
603    /// neighbour list stays a `Vec` (bounded by `m_max_0`).
604    ///
605    /// v6.1.x: neighbour slot widened from `usize` (8 B on 64-bit) to
606    /// `u32` (4 B). Row indices are catalog-bounded by `u32::MAX` (4G
607    /// rows per table); the cast at the NSW boundary asserts this. At
608    /// 1M dim-128 SQ8, layer 0 adjacency alone shrinks by ~128 MiB
609    /// — the largest single contribution to the v6.0.5-measured
610    /// 624 MiB ambition gap. On-disk format already used u32 LE, so
611    /// this is a pure in-memory layout change; no `FILE_VERSION` bump.
612    pub layers: Vec<PersistentVec<Vec<u32>>>,
613}
614
615impl NswGraph {
616    fn new(m: usize) -> Self {
617        Self {
618            m,
619            m_max_0: m.saturating_mul(2),
620            entry: None,
621            entry_level: 0,
622            levels: PersistentVec::new(),
623            layers: alloc::vec![PersistentVec::new()],
624        }
625    }
626
627    /// Max-neighbour budget for layer `l`.
628    pub const fn cap_for_layer(&self, layer: u8) -> usize {
629        if layer == 0 { self.m_max_0 } else { self.m }
630    }
631}
632
/// Assigns a layer to a node deterministically from its row index, so
/// that replaying the same insert order rebuilds the same topology.
/// The distribution is roughly HNSW-flavoured with
/// `mL ≈ 1/ln(M) ≈ 0.36` for M=16: every 4-bit chunk that turns out
/// to be zero promotes the node by one layer, giving
/// P(level ≥ L) ≈ (1/16)^L.
///
/// The result never exceeds 7. Row index 0 mixes to 0 (each mixing
/// step maps 0 to 0), so every nibble is zero and it always receives
/// that maximum.
#[allow(clippy::verbose_bit_mask)] // clippy suggests trailing_zeros(); we need an explicit MAX cap and a stable distribution shape.
pub fn nsw_assign_level(row_idx: usize) -> u8 {
    const MAX_LEVEL: u8 = 7; // 7 ⇒ ~16^7 ≈ 2.7e8 expected nodes between promotions; ample.

    // SplitMix-style mixer — cheap and seedable.
    let mut x = (row_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
    x ^= x >> 30;
    x = x.wrapping_mul(0xBF58_476D_1CE4_E5B9);
    x ^= x >> 27;
    x = x.wrapping_mul(0x94D0_49BB_1331_11EB);
    x ^= x >> 31;

    // Walk the nibbles from the low end and count how many are zero
    // before the first non-zero one. A zero nibble has probability
    // 1/16, mirroring HNSW's `mL ≈ 1/ln(M)` for M=16. The loop bound
    // doubles as the cap, which also covers x = 0.
    let mut level: u8 = 0;
    for _ in 0..MAX_LEVEL {
        let low_nibble_is_zero = x & 0xF == 0;
        if !low_nibble_is_zero {
            break;
        }
        level += 1;
        x >>= 4;
    }
    level
}
659
660impl Index {
661    fn new_btree(name: String, column_position: usize) -> Self {
662        Self {
663            name,
664            column_position,
665            kind: IndexKind::BTree(PersistentBTreeMap::new()),
666            included_columns: Vec::new(),
667            partial_predicate: None,
668            expression: None,
669        }
670    }
671
672    fn new_nsw(name: String, column_position: usize, m: usize) -> Self {
673        Self {
674            name,
675            column_position,
676            kind: IndexKind::Nsw(NswGraph::new(m)),
677            included_columns: Vec::new(),
678            partial_predicate: None,
679            expression: None,
680        }
681    }
682
683    /// v6.7.1 — BRIN index constructor. BRIN carries no in-memory
684    /// data; the `column_type` snapshot is used by the segment
685    /// encoder + planner for type-checking range predicates.
686    fn new_brin(name: String, column_position: usize, column_type: DataType) -> Self {
687        Self {
688            name,
689            column_position,
690            kind: IndexKind::Brin { column_type },
691            included_columns: Vec::new(),
692            partial_predicate: None,
693            expression: None,
694        }
695    }
696
697    /// Look up the locators stored under `key` (B-tree only). Returns
698    /// an empty slice when the key is absent or the index is an NSW
699    /// graph — callers can treat both cases uniformly.
700    ///
701    /// v5.1: return type widened from `&[usize]` to `&[RowLocator]`.
702    /// Pre-v5.2 callers can read the slice and `.as_hot().unwrap()`
703    /// each entry (no `Cold` variants exist until the freezer lands);
704    /// post-v5.2 callers dispatch hot vs. cold per locator.
705    pub fn lookup_eq(&self, key: &IndexKey) -> &[RowLocator] {
706        match &self.kind {
707            IndexKind::BTree(m) => m.get(key).map_or(&[][..], Vec::as_slice),
708            // BRIN/Nsw have no key→locator map; lookup is a no-op.
709            IndexKind::Nsw(_) | IndexKind::Brin { .. } => &[][..],
710        }
711    }
712
713    /// Borrow the NSW graph (if this is an NSW index). Callers that need
714    /// the graph for a kNN search go through here.
715    pub const fn nsw(&self) -> Option<&NswGraph> {
716        match &self.kind {
717            IndexKind::Nsw(g) => Some(g),
718            IndexKind::BTree(_) | IndexKind::Brin { .. } => None,
719        }
720    }
721
722    /// v6.7.1 — true when this index is a BRIN (block range) index.
723    /// Used by the segment encoder to opt into BRIN sidecar emission
724    /// at freeze time, and by the planner to opt into page-skipping
725    /// on range predicates.
726    pub const fn is_brin(&self) -> bool {
727        matches!(self.kind, IndexKind::Brin { .. })
728    }
729}
730
/// In-memory table: schema + a persistent row vector + secondary indices.
///
/// v4.39: `rows` is a [`PersistentVec`] (Bitmapped Vector Trie, 32-way) so
/// `Table::clone()` is `O(1)` — the whole reason for v4.39's existence is
/// to make `Catalog::clone()` cheap inside the v4.34 auto-commit wrap.
///
/// v5.2.1: `hot_bytes` tracks the encoded byte size of every row currently
/// in [`Self::rows`], summed over rows. Updated incrementally by `insert`
/// (+= encoded row size), `delete_rows` (-= removed rows' encoded sizes),
/// and `update_row` (-= old size, += new size). The value is what the
/// v5.2 freezer reads to decide when to demote cold rows — when the
/// catalog-wide sum crosses `SPG_HOT_TIER_BYTES` (default 4 GiB) the
/// freezer thread wakes. v5.2.1 ships measurement only; the freezer
/// itself lands in v5.2.2. Stored as `u64` so a single field clone in
/// `Catalog::clone` stays at the O(1) invariant v4.39 built.
#[derive(Debug, Clone)]
pub struct Table {
    /// Column definitions. `insert` and `update_row` validate arity,
    /// nullability and per-cell types against this before mutating.
    schema: TableSchema,
    /// Hot-tier rows, addressed by position; `RowLocator::Hot(i)`
    /// entries in the B-tree indices refer to positions in this vector.
    rows: PersistentVec<Row>,
    /// Secondary indices (B-tree, NSW or BRIN) defined on this table.
    indices: Vec<Index>,
    /// Running total of the encoded sizes of the rows in `rows`; see
    /// the struct-level docs for the maintenance contract.
    hot_bytes: u64,
    /// v6.7.0 — cached count of rows currently materialised in the
    /// cold tier via `RowLocator::Cold` entries across THIS table's
    /// indices. Populated by `ANALYZE` (walks every BTree index and
    /// counts Cold locators); the count survives until the next
    /// ANALYZE recomputes it. Surfaced via `spg_statistic.cold_row_count`
    /// and `spg_stat_segment.table_name`.
    ///
    /// Honest scope: this is a CACHED count, not a live one.
    /// Freezer / promote / DELETE don't currently update the cache
    /// incrementally — they invalidate it by setting the
    /// `cold_row_count_stale` flag, and the next ANALYZE re-walks.
    /// Incremental maintenance is a v6.7.x candidate if observation
    /// shows the ANALYZE walk cost dominates.
    cold_row_count: u64,
    /// v6.7.0 — set when the cached `cold_row_count` may be wrong
    /// because rows moved into / out of the cold tier since the last
    /// ANALYZE. The virtual-table surface reports the cached value
    /// regardless (operators run ANALYZE to refresh).
    cold_row_count_stale: bool,
}
772
773impl Table {
774    pub fn new(schema: TableSchema) -> Self {
775        Self {
776            schema,
777            rows: PersistentVec::new(),
778            indices: Vec::new(),
779            hot_bytes: 0,
780            cold_row_count: 0,
781            cold_row_count_stale: false,
782        }
783    }
784
785    /// Total encoded byte size of every row currently in the hot tier
786    /// (`self.rows`). See struct docs for the maintenance contract.
787    /// Returns 0 for an empty table.
788    #[must_use]
789    pub const fn hot_bytes(&self) -> u64 {
790        self.hot_bytes
791    }
792
793    /// v6.7.0 — cached count of cold-tier rows. See struct field
794    /// docs for the staleness contract.
795    #[must_use]
796    pub const fn cold_row_count(&self) -> u64 {
797        self.cold_row_count
798    }
799
800    /// v6.7.0 — overwrite the cached count. Called by the engine's
801    /// `analyze_one_table` after walking the indices.
802    pub fn set_cold_row_count(&mut self, n: u64) {
803        self.cold_row_count = n;
804        self.cold_row_count_stale = false;
805    }
806
807    /// v6.7.0 — mark the cached count as potentially out of date.
808    /// Called by freezer / promote / DELETE paths so a subsequent
809    /// `spg_statistic` read knows the number may not reflect the
810    /// current state.
811    pub fn mark_cold_row_count_stale(&mut self) {
812        self.cold_row_count_stale = true;
813    }
814
815    /// v6.7.0 — report whether the cached count is known to be out
816    /// of date. Exposed for completeness; the virtual table surface
817    /// returns the cached value regardless.
818    #[must_use]
819    pub const fn cold_row_count_stale(&self) -> bool {
820        self.cold_row_count_stale
821    }
822
823    /// v6.7.0 — walk every BTree index and count `RowLocator::Cold`
824    /// entries; return the MAX across indices. The freeze path
825    /// (`freeze_oldest_to_cold`) writes cold locators to ONE
826    /// designated index — that index ends up with the full per-row
827    /// count. MAX-across-indices yields the precise count when a
828    /// PK-style index exists; for multi-index tables without a
829    /// covering index it's a lower bound (rare in practice).
830    /// Caller responsibility: only invoke under `engine.write()`
831    /// or after taking ownership; the walk is O(N) over every
832    /// (key, locator) pair.
833    #[must_use]
834    pub fn count_cold_locators(&self) -> u64 {
835        let mut best: u64 = 0;
836        for idx in &self.indices {
837            if let IndexKind::BTree(map) = &idx.kind {
838                let n: u64 = map
839                    .iter()
840                    .map(|(_, locs)| locs.iter().filter(|l| l.is_cold()).count() as u64)
841                    .sum();
842                if n > best {
843                    best = n;
844                }
845            }
846        }
847        best
848    }
849
850    pub const fn schema(&self) -> &TableSchema {
851        &self.schema
852    }
853
854    /// v6.7.2 — mutable schema accessor for ALTER TABLE paths.
855    /// Used by `Engine::exec_alter_table` to flip per-table
856    /// settings like `hot_tier_bytes`.
857    pub const fn schema_mut(&mut self) -> &mut TableSchema {
858        &mut self.schema
859    }
860
861    /// v4.39: returns the persistent row vector by reference. Callers that
862    /// used to take `&[Row]` should switch to `.iter()` (via
863    /// `IntoIterator for &PersistentVec`) or `.get(i)` for indexing.
864    pub const fn rows(&self) -> &PersistentVec<Row> {
865        &self.rows
866    }
867
868    pub const fn row_count(&self) -> usize {
869        self.rows.len()
870    }
871
872    /// v6.8.0 — exposed for the engine layer to patch
873    /// `Index::included_columns` post-creation. Could fold into
874    /// `add_index` once the engine's IF-NOT-EXISTS guard moves up,
875    /// but the patch shape is the minimal change for v6.8.0.
876    pub fn indices_mut(&mut self) -> &mut [Index] {
877        &mut self.indices
878    }
879
880    pub fn indices(&self) -> &[Index] {
881        &self.indices
882    }
883
884    /// Compute the next `AUTO_INCREMENT` value for the column at
885    /// `col_pos`. Defined as `max(existing) + 1`, falling back to `1`
886    /// when the column currently holds no integer values. NULL / non-
887    /// integer cells are skipped. Returns `None` when the column isn't
888    /// an integer type.
889    pub fn next_auto_value(&self, col_pos: usize) -> Option<i64> {
890        let ty = self.schema.columns.get(col_pos)?.ty;
891        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
892            return None;
893        }
894        let mut max: Option<i64> = None;
895        for row in &self.rows {
896            match row.values.get(col_pos) {
897                Some(Value::SmallInt(n)) => {
898                    let v = i64::from(*n);
899                    max = Some(max.map_or(v, |m| m.max(v)));
900                }
901                Some(Value::Int(n)) => {
902                    let v = i64::from(*n);
903                    max = Some(max.map_or(v, |m| m.max(v)));
904                }
905                Some(Value::BigInt(n)) => {
906                    max = Some(max.map_or(*n, |m| m.max(*n)));
907                }
908                _ => {}
909            }
910        }
911        Some(max.map_or(1, |m| m + 1))
912    }
913
914    /// Return the first index defined over `column_position`, if any.
915    /// (`v0.8` supports at most one index per column logically; the search
916    /// just picks the first match.)
917    pub fn index_on(&self, column_position: usize) -> Option<&Index> {
918        // v6.7.1 — prefer BTree (has the key→locator map needed
919        // for `lookup_eq`) over BRIN (metadata-only). When only a
920        // BRIN exists on the column, return None so the executor
921        // falls back to the hot-tier row scan instead of trying
922        // to use BRIN for an equality lookup (which would always
923        // return an empty slice and look like "no rows matched").
924        self.indices
925            .iter()
926            .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::BTree(_)))
927            .or_else(|| {
928                self.indices
929                    .iter()
930                    .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_)))
931            })
932    }
933
934    /// Insert one row after validating it matches the schema (length + type).
935    /// Returns `StorageError` on mismatch — the table is left unchanged.
936    /// Updates every defined index with the new row's key.
937    pub fn insert(&mut self, row: Row) -> Result<(), StorageError> {
938        if row.len() != self.schema.columns.len() {
939            return Err(StorageError::ArityMismatch {
940                expected: self.schema.columns.len(),
941                actual: row.len(),
942            });
943        }
944        for (i, (val, col)) in row.values.iter().zip(&self.schema.columns).enumerate() {
945            if val.is_null() {
946                if !col.nullable {
947                    return Err(StorageError::NullInNotNull {
948                        column: col.name.clone(),
949                    });
950                }
951                continue;
952            }
953            let actual = val.data_type().expect("non-null");
954            // Vector columns require both that the value's variant be Vector
955            // *and* its dimension match. `actual == col.ty` already encodes
956            // both because DataType::Vector carries the dim.
957            //
958            // VARCHAR(n) / CHAR(n) are storage-equivalent to TEXT — the
959            // length / padding contract is enforced upstream by
960            // `coerce_value`. Accept a `Text` value into either.
961            //
962            // NUMERIC's `Value::Numeric` carries its actual scale but the
963            // column declares the *expected* scale (a scale-rescaled
964            // Value::Numeric is produced upstream by `coerce_value`); the
965            // structural check here only verifies "value is Numeric and
966            // its scale equals the column scale".
967            let compatible = actual == col.ty
968                || matches!(
969                    (actual, col.ty),
970                    (
971                        DataType::Text,
972                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json
973                    ) | (DataType::Json, DataType::Text)
974                )
975                || matches!(
976                    (actual, col.ty),
977                    (
978                        DataType::Numeric { scale: a, .. },
979                        DataType::Numeric { scale: b, .. },
980                    ) if a == b
981                );
982            if !compatible {
983                return Err(StorageError::TypeMismatch {
984                    column: col.name.clone(),
985                    expected: col.ty,
986                    actual,
987                    position: i,
988                });
989            }
990        }
991        let new_row_idx = self.rows.len();
992        // Pre-validate before mutating: ensure indices receive an IndexKey.
993        // For NSW we defer the graph update to *after* the row is pushed
994        // so the kNN search can see it in `self.rows`.
995        for idx in &mut self.indices {
996            if let IndexKind::BTree(map) = &mut idx.kind
997                && let Some(key) = IndexKey::from_value(&row.values[idx.column_position])
998            {
999                // v4.40: PersistentBTreeMap has no in-place entry-or-default.
1000                // Clone-then-insert keeps the same semantics — for typical
1001                // unique-key schemas the Vec is 1-element so the clone is
1002                // O(1). For dup-heavy columns it's O(M) per insert, traded
1003                // for the structural-sharing win at clone time.
1004                let mut entries = map.get(&key).cloned().unwrap_or_default();
1005                entries.push(RowLocator::Hot(new_row_idx));
1006                map.insert_mut(key, entries);
1007            }
1008        }
1009        // v5.2.1: maintain incremental hot-tier byte counter. Computed
1010        // before the move so we don't need to borrow `row` after push.
1011        self.hot_bytes = self
1012            .hot_bytes
1013            .saturating_add(row_body_encoded_len(&row, &self.schema) as u64);
1014        // v4.39.1: push_mut keeps streaming inserts at Vec::push speed when
1015        // the table is uniquely owned (the spg-embedded path); inside a TX
1016        // wrap where a Catalog snapshot exists, push_mut path-copies the
1017        // tail just like push() and the snapshot stays valid.
1018        self.rows.push_mut(row);
1019        // NSW updates after the push so the new row is visible to the
1020        // greedy search used during connect.
1021        let new_row_idx = self.rows.len() - 1;
1022        let nsw_targets: Vec<usize> = self
1023            .indices
1024            .iter()
1025            .enumerate()
1026            .filter_map(|(i, idx)| {
1027                if matches!(idx.kind, IndexKind::Nsw(_)) {
1028                    Some(i)
1029                } else {
1030                    None
1031                }
1032            })
1033            .collect();
1034        for idx_pos in nsw_targets {
1035            nsw_insert_at(self, idx_pos, new_row_idx);
1036        }
1037        Ok(())
1038    }
1039
1040    /// Build a new B-tree index over the named column. Rebuilds from
1041    /// existing rows. Errors if `column_name` doesn't exist or the index
1042    /// name is taken.
1043    pub fn add_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1044        if self.indices.iter().any(|i| i.name == name) {
1045            return Err(StorageError::DuplicateIndex { name });
1046        }
1047        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1048            StorageError::ColumnNotFound {
1049                column: column_name.into(),
1050            }
1051        })?;
1052        let mut idx = Index::new_btree(name, column_position);
1053        if let IndexKind::BTree(map) = &mut idx.kind {
1054            for (i, row) in self.rows.iter().enumerate() {
1055                if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
1056                    let mut entries = map.get(&key).cloned().unwrap_or_default();
1057                    entries.push(RowLocator::Hot(i));
1058                    map.insert_mut(key, entries);
1059                }
1060            }
1061        }
1062        self.indices.push(idx);
1063        Ok(())
1064    }
1065
1066    /// Build a new NSW (HNSW-flavoured) index over the named column.
1067    /// Required for `ORDER BY col <-> literal LIMIT k` to plan as a
1068    /// graph traversal instead of a full scan. Column must be a Vector
1069    /// type. `m` is the maximum number of neighbours per node.
1070    pub fn add_nsw_index(
1071        &mut self,
1072        name: String,
1073        column_name: &str,
1074        m: usize,
1075    ) -> Result<(), StorageError> {
1076        self.add_nsw_index_inner(name, column_name, m, None)
1077    }
1078
1079    /// v6.0.4 — synchronous rebuild of the named NSW index. If
1080    /// `new_encoding` is `Some(target)` and differs from the column's
1081    /// current encoding, every stored cell at the indexed column is
1082    /// re-coded into the target encoding before the new graph
1083    /// builds. Returns `IndexNotFound` if no index by that name exists
1084    /// and `Unsupported` for non-NSW indexes (`BTree` REBUILD is a no-op
1085    /// the engine layer rejects, not a storage-level concept).
1086    ///
1087    /// Holds the caller's `&mut self` for the duration — no
1088    /// concurrency / staging / WAL-replay machinery in v6.0.4. The
1089    /// "live" optimisation lands as v6.0.4.1.
1090    pub fn rebuild_nsw_index(
1091        &mut self,
1092        name: &str,
1093        new_encoding: Option<VecEncoding>,
1094    ) -> Result<(), StorageError> {
1095        let idx_pos = self
1096            .indices
1097            .iter()
1098            .position(|i| i.name == name)
1099            .ok_or_else(|| StorageError::IndexNotFound {
1100                name: String::from(name),
1101            })?;
1102        let col_pos = self.indices[idx_pos].column_position;
1103        let m = match &self.indices[idx_pos].kind {
1104            IndexKind::Nsw(g) => g.m,
1105            IndexKind::BTree(_) | IndexKind::Brin { .. } => {
1106                return Err(StorageError::Unsupported(format!(
1107                    "ALTER INDEX REBUILD on non-NSW index {name:?} — only NSW indexes can rebuild"
1108                )));
1109            }
1110        };
1111        let col_name = self.schema.columns[col_pos].name.clone();
1112        // 1. Optional re-encoding pass. Done first so the cells
1113        //    match the schema before the graph rebuild walks them.
1114        if let Some(target) = new_encoding {
1115            let current = match self.schema.columns[col_pos].ty {
1116                DataType::Vector { encoding, .. } => encoding,
1117                ref other => {
1118                    return Err(StorageError::Unsupported(format!(
1119                        "ALTER INDEX REBUILD WITH (encoding=…) on non-vector column type {other:?}"
1120                    )));
1121                }
1122            };
1123            if target != current {
1124                let DataType::Vector { dim, .. } = self.schema.columns[col_pos].ty else {
1125                    unreachable!("checked above")
1126                };
1127                let n = self.rows.len();
1128                for i in 0..n {
1129                    let row = self
1130                        .rows
1131                        .get_mut(i)
1132                        .expect("row index in bounds (we iterated up to len())");
1133                    let cell = core::mem::replace(&mut row.values[col_pos], Value::Null);
1134                    let recoded = recode_vector_cell(cell, target)?;
1135                    row.values[col_pos] = recoded;
1136                }
1137                self.schema.columns[col_pos].ty = DataType::Vector {
1138                    dim,
1139                    encoding: target,
1140                };
1141            }
1142        }
1143        // 2. Drop the existing index slot + rebuild from row payload.
1144        self.indices.remove(idx_pos);
1145        self.add_nsw_index_inner(String::from(name), &col_name, m, None)?;
1146        Ok(())
1147    }
1148
1149    /// Restore an NSW index from a pre-built graph (used on
1150    /// deserialize). Skips the bulk-build pass since the topology is
1151    /// already known. Returns `DuplicateIndex` or `ColumnNotFound` on
1152    /// schema mismatch as usual.
1153    pub fn restore_nsw_index(
1154        &mut self,
1155        name: String,
1156        column_name: &str,
1157        graph: NswGraph,
1158    ) -> Result<(), StorageError> {
1159        self.add_nsw_index_inner(name, column_name, graph.m, Some(graph))
1160    }
1161
1162    /// Restore a `BTree` index from a pre-built `(IndexKey, Vec<RowLocator>)`
1163    /// map. Used by [`Catalog::deserialize`] when reading a v9 (or later)
1164    /// catalog snapshot — the map travels on disk so cold-tier locators
1165    /// survive a round-trip, instead of being rebuilt from `self.rows`
1166    /// (which would lose every Cold entry). Same error contract as
1167    /// [`Table::add_index`].
1168    pub fn restore_btree_index(
1169        &mut self,
1170        name: String,
1171        column_name: &str,
1172        map: PersistentBTreeMap<IndexKey, Vec<RowLocator>>,
1173    ) -> Result<(), StorageError> {
1174        if self.indices.iter().any(|i| i.name == name) {
1175            return Err(StorageError::DuplicateIndex { name });
1176        }
1177        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1178            StorageError::ColumnNotFound {
1179                column: column_name.into(),
1180            }
1181        })?;
1182        self.indices.push(Index {
1183            name,
1184            column_position,
1185            kind: IndexKind::BTree(map),
1186            included_columns: Vec::new(),
1187            partial_predicate: None,
1188            expression: None,
1189        });
1190        Ok(())
1191    }
1192
1193    /// v6.7.1 — public restore counterpart for BRIN indices. Used
1194    /// by `Catalog::deserialize` when a v10 snapshot carries a
1195    /// BRIN index entry. BRIN carries no in-memory data — only the
1196    /// `column_type` snapshot is restored.
1197    pub fn restore_brin_index(
1198        &mut self,
1199        name: String,
1200        column_name: &str,
1201        column_type: DataType,
1202    ) -> Result<(), StorageError> {
1203        if self.indices.iter().any(|i| i.name == name) {
1204            return Err(StorageError::DuplicateIndex { name });
1205        }
1206        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1207            StorageError::ColumnNotFound {
1208                column: column_name.into(),
1209            }
1210        })?;
1211        self.indices.push(Index::new_brin(name, column_position, column_type));
1212        Ok(())
1213    }
1214
1215    /// v6.7.1 — public CREATE INDEX counterpart for BRIN. Creates
1216    /// the index entry with a snapshot of the indexed column's
1217    /// current `DataType`.
1218    pub fn add_brin_index(
1219        &mut self,
1220        name: String,
1221        column_name: &str,
1222    ) -> Result<(), StorageError> {
1223        if self.indices.iter().any(|i| i.name == name) {
1224            return Err(StorageError::DuplicateIndex { name });
1225        }
1226        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1227            StorageError::ColumnNotFound {
1228                column: column_name.into(),
1229            }
1230        })?;
1231        let column_type = self.schema.columns[column_position].ty;
1232        self.indices.push(Index::new_brin(name, column_position, column_type));
1233        Ok(())
1234    }
1235
1236    /// v5.1: register cold-tier locators on a `BTree` index. Used
1237    /// after [`Catalog::load_segment_bytes`] to wire every cold-
1238    /// tier row's PK back to its segment so
1239    /// [`Catalog::lookup_by_pk`] can resolve it. Each call
1240    /// appends to the index — keys that already have hot or cold
1241    /// locators keep them. Returns the number of locators
1242    /// registered.
1243    ///
1244    /// Pre-v5.2 (freezer) this is the only path that adds Cold
1245    /// variants to a PB; post-freezer the background freezer
1246    /// thread produces these as a batch under the engine write
1247    /// lock and this API becomes its in-memory primitive.
1248    ///
1249    /// Errors if `index_name` doesn't exist or names an NSW graph
1250    /// (NSW indices don't carry per-key row locators — they're
1251    /// vector-search structures).
1252    pub fn register_cold_locators<I>(
1253        &mut self,
1254        index_name: &str,
1255        locators: I,
1256    ) -> Result<usize, StorageError>
1257    where
1258        I: IntoIterator<Item = (IndexKey, RowLocator)>,
1259    {
1260        let idx = self
1261            .indices
1262            .iter_mut()
1263            .find(|i| i.name == index_name)
1264            .ok_or_else(|| StorageError::Corrupt(format!("index {index_name:?} not found")))?;
1265        let map = match &mut idx.kind {
1266            IndexKind::BTree(map) => map,
1267            IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
1268                return Err(StorageError::Corrupt(format!(
1269                    "index {index_name:?} is not BTree; cold locators apply only to BTree indices"
1270                )));
1271            }
1272        };
1273        let mut count = 0usize;
1274        for (key, locator) in locators {
1275            let mut entries = map.get(&key).cloned().unwrap_or_default();
1276            entries.push(locator);
1277            map.insert_mut(key, entries);
1278            count += 1;
1279        }
1280        Ok(count)
1281    }
1282
1283    /// v5.2.3: remove every `Cold` locator currently registered on
1284    /// `index_name` under the given `key`. `Hot` locators for the
1285    /// same key are left in place — useful when a row has just been
1286    /// promoted hot-side and the caller wants the old Cold pointer
1287    /// retired without losing the new hot entry.
1288    ///
1289    /// Returns the number of cold locators removed (0 when the key
1290    /// has only hot entries or the key isn't present at all).
1291    /// Errors when the index doesn't exist or isn't a `BTree`.
1292    pub fn remove_cold_locators_for_key(
1293        &mut self,
1294        index_name: &str,
1295        key: &IndexKey,
1296    ) -> Result<usize, StorageError> {
1297        let idx = self
1298            .indices
1299            .iter_mut()
1300            .find(|i| i.name == index_name)
1301            .ok_or_else(|| {
1302                StorageError::Corrupt(format!(
1303                    "remove_cold_locators_for_key: index {index_name:?} not found"
1304                ))
1305            })?;
1306        let map = match &mut idx.kind {
1307            IndexKind::BTree(map) => map,
1308            IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
1309                return Err(StorageError::Corrupt(format!(
1310                    "remove_cold_locators_for_key: index {index_name:?} is not BTree; \
1311                     cold locators apply only to BTree indices"
1312                )));
1313            }
1314        };
1315        let Some(entries) = map.get(key) else {
1316            return Ok(0);
1317        };
1318        let mut kept: Vec<RowLocator> =
1319            entries.iter().copied().filter(RowLocator::is_hot).collect();
1320        let removed = entries.len() - kept.len();
1321        if removed == 0 {
1322            return Ok(0);
1323        }
1324        kept.shrink_to_fit();
1325        // PersistentBTreeMap has no remove API in v5.2; when every
1326        // locator for `key` was Cold, the key keeps an empty Vec
1327        // entry. `Index::lookup_eq` already treats `Some(&[])` and
1328        // `None` as the same empty slice (via `Vec::as_slice`), so
1329        // callers can't distinguish the two. The space cost is one
1330        // empty Vec per shadowed-then-promoted key — bounded and
1331        // recoverable when the future compaction job lands.
1332        map.insert_mut(key.clone(), kept);
1333        Ok(removed)
1334    }
1335
1336    /// v4.4: delete the rows at the given positions in one pass.
1337    /// `positions` must be unique; ordering doesn't matter. Indices
1338    /// are rebuilt from scratch (cheaper than tracking incremental
1339    /// shifts across both B-tree and NSW). Returns the number of
1340    /// rows removed.
1341    pub fn delete_rows(&mut self, positions: &[usize]) -> usize {
1342        if positions.is_empty() {
1343            return 0;
1344        }
1345        // Mark positions; v4.39: PV has no in-place retain, so we rebuild
1346        // a fresh PV by pushing the survivors. Still O(n log₃₂ n); the
1347        // structural-sharing win shows up at `Catalog::clone()`, not here.
1348        let mut to_remove = alloc::vec![false; self.rows.len()];
1349        let mut removed = 0;
1350        for &p in positions {
1351            if p < to_remove.len() && !to_remove[p] {
1352                to_remove[p] = true;
1353                removed += 1;
1354            }
1355        }
1356        let mut new_rows: PersistentVec<Row> = PersistentVec::new();
1357        let mut removed_bytes: u64 = 0;
1358        for (i, row) in self.rows.iter().enumerate() {
1359            if to_remove[i] {
1360                removed_bytes =
1361                    removed_bytes.saturating_add(row_body_encoded_len(row, &self.schema) as u64);
1362            } else {
1363                new_rows.push_mut(row.clone());
1364            }
1365        }
1366        self.rows = new_rows;
1367        self.hot_bytes = self.hot_bytes.saturating_sub(removed_bytes);
1368        self.rebuild_indices();
1369        removed
1370    }
1371
1372    /// v4.4: replace the row at `position` with `new_values` (must
1373    /// match the schema arity + types). Indices are rebuilt for
1374    /// correctness — the affected column might be indexed and its
1375    /// key may have shifted, and a NSW node's vector may have
1376    /// changed, both of which need fresh state.
1377    pub fn update_row(
1378        &mut self,
1379        position: usize,
1380        new_values: Vec<Value>,
1381    ) -> Result<(), StorageError> {
1382        if position >= self.rows.len() {
1383            return Err(StorageError::Corrupt(alloc::format!(
1384                "update_row: position {position} out of bounds (rows={})",
1385                self.rows.len()
1386            )));
1387        }
1388        if new_values.len() != self.schema.columns.len() {
1389            return Err(StorageError::ArityMismatch {
1390                expected: self.schema.columns.len(),
1391                actual: new_values.len(),
1392            });
1393        }
1394        // Reuse the per-cell type-compat validation that `insert`
1395        // applies. The body below mirrors that check intentionally —
1396        // factoring it would be more code than the duplication.
1397        for (i, (val, col)) in new_values.iter().zip(&self.schema.columns).enumerate() {
1398            if val.is_null() {
1399                if !col.nullable {
1400                    return Err(StorageError::NullInNotNull {
1401                        column: col.name.clone(),
1402                    });
1403                }
1404                continue;
1405            }
1406            let actual = val.data_type().expect("non-null");
1407            let compatible = actual == col.ty
1408                || matches!(
1409                    (actual, col.ty),
1410                    (
1411                        DataType::Text,
1412                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json
1413                    ) | (DataType::Json, DataType::Text)
1414                )
1415                || matches!(
1416                    (actual, col.ty),
1417                    (
1418                        DataType::Numeric { scale: a, .. },
1419                        DataType::Numeric { scale: b, .. },
1420                    ) if a == b
1421                );
1422            if !compatible {
1423                return Err(StorageError::TypeMismatch {
1424                    column: col.name.clone(),
1425                    expected: col.ty,
1426                    actual,
1427                    position: i,
1428                });
1429            }
1430        }
1431        let old_row = self
1432            .rows
1433            .get(position)
1434            .expect("position bounds-checked above");
1435        let old_bytes = row_body_encoded_len(old_row, &self.schema) as u64;
1436        let new_row = Row::new(new_values);
1437        let new_bytes = row_body_encoded_len(&new_row, &self.schema) as u64;
1438        self.rows = self
1439            .rows
1440            .set(position, new_row)
1441            .expect("position bounds-checked above");
1442        self.hot_bytes = self
1443            .hot_bytes
1444            .saturating_sub(old_bytes)
1445            .saturating_add(new_bytes);
1446        self.rebuild_indices();
1447        Ok(())
1448    }
1449
1450    /// v4.4 helper used by `delete_rows` / `update_row`: discard all
1451    /// index payloads and rebuild from `self.rows`. Cheap enough
1452    /// for typical SPG scale (catalogs in the docker-compose
1453    /// deployment shape are small); the alternative — incremental
1454    /// shift bookkeeping across B-tree + NSW — would be far more
1455    /// invasive than the savings justify.
1456    fn rebuild_indices(&mut self) {
1457        // v5.2.3: capture every `Cold` locator on every BTree index
1458        // before the rebuild, so the from-rows re-emission below
1459        // (which only produces `Hot` locators) doesn't drop cold-
1460        // tier entries on keys unrelated to the row that changed.
1461        // Pre-v5.2.3 this was a `freeze_oldest_to_cold` worry only
1462        // and the freezer did its own capture-then-reregister; v5.2.3
1463        // promotes that pattern into the base helper because UPDATE
1464        // / DELETE now run rebuild_indices on tables with cold rows.
1465        let preserved_cold: Vec<(String, Vec<(IndexKey, RowLocator)>)> = self
1466            .indices
1467            .iter()
1468            .filter_map(|idx| match &idx.kind {
1469                IndexKind::BTree(map) => {
1470                    let cold: Vec<(IndexKey, RowLocator)> = map
1471                        .iter()
1472                        .flat_map(|(k, locs)| {
1473                            locs.iter()
1474                                .filter(|l| l.is_cold())
1475                                .copied()
1476                                .map(move |l| (k.clone(), l))
1477                        })
1478                        .collect();
1479                    if cold.is_empty() {
1480                        None
1481                    } else {
1482                        Some((idx.name.clone(), cold))
1483                    }
1484                }
1485                // BRIN / NSW carry no key→locator map.
1486                IndexKind::Nsw(_) | IndexKind::Brin { .. } => None,
1487            })
1488            .collect();
1489
1490        // v6.7.1 — descriptor needs to capture index kind so the
1491        // rebuild loop can resurrect BTree / NSW / BRIN exactly as
1492        // they were. (NSW carries m; BRIN carries the column type
1493        // snapshot; BTree needs no extra payload.)
1494        #[derive(Clone)]
1495        enum RebuildKind {
1496            BTree,
1497            Nsw(usize),
1498            Brin(DataType),
1499        }
1500        let descriptors: Vec<(String, usize, RebuildKind)> = self
1501            .indices
1502            .iter()
1503            .map(|idx| {
1504                let kind = match &idx.kind {
1505                    IndexKind::Nsw(g) => RebuildKind::Nsw(g.m),
1506                    IndexKind::Brin { column_type } => RebuildKind::Brin(*column_type),
1507                    IndexKind::BTree(_) => RebuildKind::BTree,
1508                };
1509                (idx.name.clone(), idx.column_position, kind)
1510            })
1511            .collect();
1512        self.indices.clear();
1513        for (name, column_position, rebuild_kind) in descriptors {
1514            match rebuild_kind {
1515                RebuildKind::Nsw(m) => {
1516                    let idx = Index::new_nsw(name, column_position, m);
1517                    self.indices.push(idx);
1518                    let idx_pos = self.indices.len() - 1;
1519                    let row_indices: Vec<usize> = (0..self.rows.len()).collect();
1520                    for row_idx in row_indices {
1521                        nsw_insert_at(self, idx_pos, row_idx);
1522                    }
1523                }
1524                RebuildKind::Brin(column_type) => {
1525                    // BRIN has no in-memory rebuild — the summaries
1526                    // live in cold segments which freeze emits.
1527                    self.indices.push(Index::new_brin(name, column_position, column_type));
1528                }
1529                RebuildKind::BTree => {
1530                    let mut idx = Index::new_btree(name, column_position);
1531                    if let IndexKind::BTree(map) = &mut idx.kind {
1532                        for (i, row) in self.rows.iter().enumerate() {
1533                            if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
1534                                let mut entries = map.get(&key).cloned().unwrap_or_default();
1535                                entries.push(RowLocator::Hot(i));
1536                                map.insert_mut(key, entries);
1537                            }
1538                        }
1539                    }
1540                    self.indices.push(idx);
1541                }
1542            }
1543        }
1544
1545        // Re-attach preserved cold locators after the from-rows
1546        // rebuild. `register_cold_locators` handles the per-key
1547        // entries-vec append; no key collisions arise because the
1548        // rebuild loop above produced only Hot locators.
1549        for (idx_name, locators) in preserved_cold {
1550            // Errors here would only fire if the index disappeared
1551            // between snapshot and rebuild, which can't happen
1552            // because the rebuild restores the same descriptor set.
1553            let _ = self.register_cold_locators(&idx_name, locators);
1554        }
1555    }
1556
1557    fn add_nsw_index_inner(
1558        &mut self,
1559        name: String,
1560        column_name: &str,
1561        m: usize,
1562        restore: Option<NswGraph>,
1563    ) -> Result<(), StorageError> {
1564        if self.indices.iter().any(|i| i.name == name) {
1565            return Err(StorageError::DuplicateIndex { name });
1566        }
1567        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1568            StorageError::ColumnNotFound {
1569                column: column_name.into(),
1570            }
1571        })?;
1572        if !matches!(
1573            self.schema.columns[column_position].ty,
1574            DataType::Vector { .. }
1575        ) {
1576            return Err(StorageError::TypeMismatch {
1577                column: column_name.into(),
1578                expected: DataType::Vector {
1579                    dim: 0,
1580                    encoding: VecEncoding::F32,
1581                },
1582                actual: self.schema.columns[column_position].ty,
1583                position: column_position,
1584            });
1585        }
1586        if let Some(graph) = restore {
1587            self.indices.push(Index {
1588                name,
1589                column_position,
1590                kind: IndexKind::Nsw(graph),
1591                included_columns: Vec::new(),
1592                partial_predicate: None,
1593                expression: None,
1594            });
1595            return Ok(());
1596        }
1597        let idx = Index::new_nsw(name, column_position, m);
1598        self.indices.push(idx);
1599        let idx_pos = self.indices.len() - 1;
1600        // Bulk-build by walking the existing rows in order — each insert
1601        // sees the partial graph and links into it.
1602        let row_indices: Vec<usize> = (0..self.rows.len()).collect();
1603        for row_idx in row_indices {
1604            nsw_insert_at(self, idx_pos, row_idx);
1605        }
1606        Ok(())
1607    }
1608}
1609
1610/// v6.0.4 — re-encode a single cell to the target `VecEncoding`.
1611/// Used by `Table::rebuild_nsw_index` when ALTER INDEX REBUILD
1612/// includes the optional `WITH (encoding = …)` clause. Round-trip
1613/// goes through f32: `current → Vec<f32> → target`, leaving NULL
1614/// cells untouched. Returns `Unsupported` on a non-vector cell —
1615/// the caller should have rejected the schema before reaching this.
1616fn recode_vector_cell(cell: Value, target: VecEncoding) -> Result<Value, StorageError> {
1617    if matches!(cell, Value::Null) {
1618        return Ok(cell);
1619    }
1620    // Step 1 — extract the f32 representation of the source cell.
1621    let as_f32: Vec<f32> = match &cell {
1622        Value::Vector(v) => v.clone(),
1623        Value::Sq8Vector(q) => quantize::dequantize(q),
1624        Value::HalfVector(h) => h.to_f32_vec(),
1625        other => {
1626            return Err(StorageError::Unsupported(format!(
1627                "ALTER INDEX REBUILD: cannot recode non-vector cell {:?}",
1628                other.data_type()
1629            )));
1630        }
1631    };
1632    // Step 2 — encode into the target shape. `F32` is the identity
1633    // path (saves one alloc round-trip when the source is already
1634    // F32 — but `Value::Vector(as_f32)` is the right answer
1635    // regardless).
1636    Ok(match target {
1637        VecEncoding::F32 => Value::Vector(as_f32),
1638        VecEncoding::Sq8 => Value::Sq8Vector(quantize::quantize(&as_f32)),
1639        VecEncoding::F16 => Value::HalfVector(halfvec::HalfVector::from_f32_slice(&as_f32)),
1640    })
1641}
1642
1643/// Insert one row into the HNSW graph held by index slot `idx_pos`.
1644/// No-op when the row's value at the indexed column isn't a vector.
1645/// v6.0.1: handles `Value::Sq8Vector` by dequantising into an f32
1646/// "query" surface — the existing greedy + beam-search machinery
1647/// then uses `cell_to_query_metric_distance` to route every
1648/// distance call through the cell's actual encoding.
1649fn nsw_insert_at(table: &mut Table, idx_pos: usize, new_row_idx: usize) {
1650    let col_pos = table.indices[idx_pos].column_position;
1651    let cell_dim: Option<usize> = match &table.rows[new_row_idx].values[col_pos] {
1652        Value::Vector(v) => Some(v.len()),
1653        Value::Sq8Vector(q) => Some(q.bytes.len()),
1654        Value::HalfVector(h) => Some(h.dim()),
1655        _ => None,
1656    };
1657    let Some(dim) = cell_dim else {
1658        // Even non-vector rows occupy a level slot so per-node Vec
1659        // lengths stay aligned with `table.rows.len()`.
1660        ensure_node_slot(table, idx_pos, new_row_idx, 0);
1661        return;
1662    };
1663    if dim == 0 {
1664        ensure_node_slot(table, idx_pos, new_row_idx, 0);
1665        return;
1666    }
1667    let level = nsw_assign_level(new_row_idx);
1668    ensure_node_slot(table, idx_pos, new_row_idx, level);
1669    let (entry, entry_level, m) = match &table.indices[idx_pos].kind {
1670        IndexKind::Nsw(g) => (g.entry, g.entry_level, g.m),
1671        IndexKind::BTree(_) | IndexKind::Brin { .. } => {
1672            unreachable!("nsw_insert_at on a non-NSW index")
1673        }
1674    };
1675    // First node ever — declare it the entry (it gets its own level).
1676    if entry.is_none() {
1677        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
1678            g.entry = Some(new_row_idx);
1679            g.entry_level = level;
1680            *g.levels
1681                .get_mut(new_row_idx)
1682                .expect("levels slot padded by ensure_node_slot") = level;
1683        }
1684        return;
1685    }
1686    // Set the node's recorded level.
1687    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
1688        *g.levels
1689            .get_mut(new_row_idx)
1690            .expect("levels slot padded by ensure_node_slot") = level;
1691    }
1692    let query = match &table.rows[new_row_idx].values[col_pos] {
1693        Value::Vector(v) => v.clone(),
1694        // v6.0.1: dequantise the inserted SQ8 cell into an f32 query
1695        // surface so the existing greedy / beam machinery can route
1696        // distances through `cell_to_query_metric_distance`. The
1697        // small dequantisation error is what the recall@10 ≥ 0.95
1698        // envelope already accounts for (V6_DESIGN deliberation #3).
1699        Value::Sq8Vector(q) => quantize::dequantize(q),
1700        // v6.0.3: halfvec dequant is bit-exact at the storage layer,
1701        // so the inserted query is a faithful representation.
1702        Value::HalfVector(h) => h.to_f32_vec(),
1703        _ => return,
1704    };
1705    // Phase 1: greedy descend from `entry` down to `level + 1`, keeping
1706    // exactly one current best so the next layer starts from it.
1707    let mut current = entry.expect("entry was Some above");
1708    let mut current_d = vec_l2_sq(table, col_pos, current, &query);
1709    if entry_level > level {
1710        for layer in (level + 1..=entry_level).rev() {
1711            (current, current_d) =
1712                greedy_layer_walk(table, idx_pos, layer, current, current_d, &query);
1713        }
1714    }
1715    // Phase 2: from `min(level, entry_level)` down to 0, beam-search
1716    // `ef_construction` candidates, run the HNSW §4 heuristic neighbour
1717    // selection over them, and connect bidirectionally.
1718    let top = level.min(entry_level);
1719    let ef = (m * 2).max(8);
1720    for layer in (0..=top).rev() {
1721        let cap = if layer == 0 { m * 2 } else { m };
1722        let mut candidates = layer_beam_search(
1723            table,
1724            idx_pos,
1725            layer,
1726            current,
1727            current_d,
1728            &query,
1729            ef,
1730            NswMetric::L2,
1731        );
1732        candidates.retain(|&(_, n)| n != new_row_idx);
1733        // Take the closest as the entry for the next layer down — done
1734        // before heuristic narrowing because the heuristic can reorder.
1735        if let Some(&(d, n)) = candidates.first() {
1736            current = n;
1737            current_d = d;
1738        }
1739        let peers = select_neighbours_heuristic(&candidates, cap, table, col_pos);
1740        connect_at_layer(table, idx_pos, layer, new_row_idx, &peers);
1741    }
1742    // Phase 3: if the new node climbed above the current entry, take
1743    // over as entry so future inserts/searches start from the new top.
1744    if level > entry_level
1745        && let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
1746    {
1747        g.entry = Some(new_row_idx);
1748        g.entry_level = level;
1749    }
1750}
1751
1752/// Make sure `layers[*][new_row_idx]` and `levels[new_row_idx]` exist,
1753/// padding with empty/zero entries as needed. Also grows `layers` to
1754/// accommodate the node's top `level`.
1755fn ensure_node_slot(table: &mut Table, idx_pos: usize, new_row_idx: usize, level: u8) {
1756    let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind else {
1757        unreachable!("ensure_node_slot on a BTree index");
1758    };
1759    while g.layers.len() <= level as usize {
1760        g.layers.push(PersistentVec::new());
1761    }
1762    while g.levels.len() <= new_row_idx {
1763        g.levels.push_mut(0);
1764    }
1765    for layer_vec in &mut g.layers {
1766        while layer_vec.len() <= new_row_idx {
1767            layer_vec.push_mut(Vec::new());
1768        }
1769    }
1770}
1771
1772/// Single-step greedy walk on one layer: from `current` (with cached
1773/// distance `current_d`), inspect that node's neighbours at `layer` and
1774/// hop to the closest if it beats `current_d`. Repeat until no move
1775/// improves the distance. Cheap variant of beam-search used for the
1776/// "descend" phase that only needs one survivor per layer.
1777fn greedy_layer_walk(
1778    table: &Table,
1779    idx_pos: usize,
1780    layer: u8,
1781    mut current: usize,
1782    mut current_d: f32,
1783    query: &[f32],
1784) -> (usize, f32) {
1785    let g = match &table.indices[idx_pos].kind {
1786        IndexKind::Nsw(g) => g,
1787        IndexKind::BTree(_) | IndexKind::Brin { .. } => return (current, current_d),
1788    };
1789    let col_pos = table.indices[idx_pos].column_position;
1790    loop {
1791        let neighbours: &[u32] = g
1792            .layers
1793            .get(layer as usize)
1794            .and_then(|layer_v| layer_v.get(current))
1795            .map_or(&[][..], Vec::as_slice);
1796        let mut best = current;
1797        let mut best_d = current_d;
1798        for &n in neighbours {
1799            let n = n as usize;
1800            let d = vec_l2_sq(table, col_pos, n, query);
1801            if d < best_d {
1802                best = n;
1803                best_d = d;
1804            }
1805        }
1806        if best == current {
1807            return (current, current_d);
1808        }
1809        current = best;
1810        current_d = best_d;
1811    }
1812}
1813
1814/// Beam search on one layer starting from `entry_node` with cached
1815/// `entry_d`. Returns the top `ef` candidates in ascending-distance
1816/// order. Caller picks the closest as the next layer's entry and / or
1817/// trims to M for connection.
1818///
1819/// v3.0.1: uses two `BinaryHeap`s (min-heap for the open frontier,
1820/// max-heap for the working top-`ef` results) and a `Vec<bool>` visited
1821/// bitmap, replacing the v2.x `Vec` + `partition_point` + `BTreeSet`
1822/// implementation. Same algorithm shape (HNSW search algorithm 2 from
1823/// the paper); the data-structure swap cuts per-visit cost from
1824/// `O(ef + log row_count)` to amortised `O(log ef)`.
1825#[allow(clippy::too_many_arguments)] // Beam search threads layer, entry, query, ef, metric — each is intrinsic. Bundling them into a config struct hides the call sites.
1826fn layer_beam_search(
1827    table: &Table,
1828    idx_pos: usize,
1829    layer: u8,
1830    entry_node: usize,
1831    entry_d: f32,
1832    query: &[f32],
1833    ef: usize,
1834    metric: NswMetric,
1835) -> Vec<(f32, usize)> {
1836    let g = match &table.indices[idx_pos].kind {
1837        IndexKind::Nsw(g) => g,
1838        IndexKind::BTree(_) | IndexKind::Brin { .. } => return Vec::new(),
1839    };
1840    let col_pos = table.indices[idx_pos].column_position;
1841    let d0 = if matches!(metric, NswMetric::L2) {
1842        entry_d
1843    } else {
1844        cell_to_query_metric_distance(table, col_pos, entry_node, query, metric)
1845    };
1846    let row_count = table.rows.len();
1847    let mut visited: Vec<bool> = alloc::vec![false; row_count];
1848    if entry_node < row_count {
1849        visited[entry_node] = true;
1850    }
1851    // candidates: min-heap by distance (Closest wrapper) — frontier
1852    // results:    max-heap by distance (Furthest wrapper) — top-ef working set
1853    let mut candidates: alloc::collections::BinaryHeap<NodeClosest> =
1854        alloc::collections::BinaryHeap::with_capacity(ef);
1855    let mut results: alloc::collections::BinaryHeap<NodeFurthest> =
1856        alloc::collections::BinaryHeap::with_capacity(ef);
1857    candidates.push(NodeClosest {
1858        dist: d0,
1859        node: entry_node,
1860    });
1861    results.push(NodeFurthest {
1862        dist: d0,
1863        node: entry_node,
1864    });
1865    while let Some(cur) = candidates.pop() {
1866        let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
1867        if cur.dist > worst && results.len() >= ef {
1868            break;
1869        }
1870        let neighbours: &[u32] = g
1871            .layers
1872            .get(layer as usize)
1873            .and_then(|layer_v| layer_v.get(cur.node))
1874            .map_or(&[][..], Vec::as_slice);
1875        for &n in neighbours {
1876            let n = n as usize;
1877            if n >= row_count || visited[n] {
1878                continue;
1879            }
1880            visited[n] = true;
1881            // v6.0.1: cell-aware distance — F32 cells take the
1882            // existing scalar metric, SQ8 cells route through
1883            // the asymmetric ADC variant for the same metric.
1884            let dn = cell_to_query_metric_distance(table, col_pos, n, query, metric);
1885            if !dn.is_finite() {
1886                continue;
1887            }
1888            let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
1889            if results.len() < ef || dn < worst {
1890                results.push(NodeFurthest { dist: dn, node: n });
1891                if results.len() > ef {
1892                    results.pop();
1893                }
1894                candidates.push(NodeClosest { dist: dn, node: n });
1895            }
1896        }
1897    }
1898    // Drain results (max-heap order) and re-sort ascending so callers
1899    // can take `closest = result[0]` without flipping.
1900    let mut out: Vec<(f32, usize)> = results.into_iter().map(|c| (c.dist, c.node)).collect();
1901    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
1902    out
1903}
1904
/// Min-heap wrapper: smaller `dist` → higher priority in a `BinaryHeap`
/// (which is a max-heap), so the comparison is flipped.
///
/// The ordering is a genuine total order so the heap's `Ord` contract
/// holds:
/// - a NaN `dist` sorts last (lowest priority), whatever its sign bit;
/// - equal distances are tie-broken by `node`, lower index first;
/// - equality is defined through `cmp`, so `==` and `Ord` always agree.
#[derive(Debug, Clone, Copy)]
struct NodeClosest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeClosest {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeClosest {}
impl PartialOrd for NodeClosest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeClosest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // Ascending order on (dist, node), treating NaN as larger than
        // every real distance (`false < true` on the NaN flags).
        let ascending = match (self.dist.is_nan(), other.dist.is_nan()) {
            (false, false) => self
                .dist
                .partial_cmp(&other.dist)
                .unwrap_or(core::cmp::Ordering::Equal),
            (self_nan, other_nan) => self_nan.cmp(&other_nan),
        }
        .then_with(|| self.node.cmp(&other.node));
        // Reversed: smaller dist = greater priority.
        ascending.reverse()
    }
}
1933
/// Max-heap wrapper: larger `dist` sits at the top so the worst result
/// can be evicted in O(log n) when a better candidate arrives.
///
/// The ordering is a genuine total order so the heap's `Ord` contract
/// holds:
/// - a NaN `dist` counts as the furthest possible (evicted first),
///   whatever its sign bit;
/// - equal distances are tie-broken by `node`, higher index on top;
/// - equality is defined through `cmp`, so `==` and `Ord` always agree.
#[derive(Debug, Clone, Copy)]
struct NodeFurthest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeFurthest {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeFurthest {}
impl PartialOrd for NodeFurthest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeFurthest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // Ascending order on (dist, node), treating NaN as larger than
        // every real distance (`false < true` on the NaN flags).
        match (self.dist.is_nan(), other.dist.is_nan()) {
            (false, false) => self
                .dist
                .partial_cmp(&other.dist)
                .unwrap_or(core::cmp::Ordering::Equal),
            (self_nan, other_nan) => self_nan.cmp(&other_nan),
        }
        .then_with(|| self.node.cmp(&other.node))
    }
}
1959
1960/// HNSW paper §4 algorithm 4: pick `m` neighbours from `candidates` so
1961/// that each chosen point isn't already covered by a closer chosen
1962/// point. Improves graph diversity → fewer hops needed at search time.
1963///
1964/// `candidates` arrives sorted ascending by distance-to-query. We walk
1965/// it in order, keeping a candidate only when no already-chosen point
1966/// is closer to it than the query is. Result is a vector of row
1967/// indices (length ≤ `m`).
1968fn select_neighbours_heuristic(
1969    candidates: &[(f32, usize)],
1970    m: usize,
1971    table: &Table,
1972    col_pos: usize,
1973) -> Vec<usize> {
1974    let mut chosen: Vec<usize> = Vec::with_capacity(m);
1975    for &(d_q, e) in candidates {
1976        if chosen.len() >= m {
1977            break;
1978        }
1979        // v6.0.1: works on either `Value::Vector` (F32) or
1980        // `Value::Sq8Vector` (Sq8) cells — `cell_l2_sq` dispatches
1981        // on encoding. A non-vector cell yields `f32::INFINITY`
1982        // which the `< d_q` test will never accept.
1983        if !matches!(
1984            table.rows.get(e).and_then(|r| r.values.get(col_pos)),
1985            Some(Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_))
1986        ) {
1987            continue;
1988        }
1989        let mut covered = false;
1990        for &r in &chosen {
1991            // dist(e, r) measured in the same metric the topology was
1992            // built with (L2). If a chosen `r` is closer to `e` than
1993            // the query is, `r` already "covers" `e` for navigation.
1994            if cell_l2_sq(table, col_pos, e, r) < d_q {
1995                covered = true;
1996                break;
1997            }
1998        }
1999        if !covered {
2000            chosen.push(e);
2001        }
2002    }
2003    chosen
2004}
2005
2006/// Bidirectionally connect `new_row_idx` to each of `peers` at `layer`,
2007/// trimming each endpoint's adjacency to that layer's degree cap by
2008/// keeping only the closest neighbours.
2009fn connect_at_layer(
2010    table: &mut Table,
2011    idx_pos: usize,
2012    layer: u8,
2013    new_row_idx: usize,
2014    peers: &[usize],
2015) {
2016    let col_pos = table.indices[idx_pos].column_position;
2017    let cap = match &table.indices[idx_pos].kind {
2018        IndexKind::Nsw(g) => g.cap_for_layer(layer),
2019        IndexKind::BTree(_) | IndexKind::Brin { .. } => return,
2020    };
2021    // v6.1.x: NSW adjacency stores neighbour row indices as u32 (4 B
2022    // each) rather than usize (8 B on 64-bit). Boundary casts here
2023    // assert the row count fits in u32 — the catalog already enforces
2024    // ≤ 4G rows per table, so the conversion can't lose data.
2025    let new_row_u32 = u32::try_from(new_row_idx).expect("row index fits in u32");
2026    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
2027        let layer_v = &mut g.layers[layer as usize];
2028        if let Some(slot) = layer_v.get_mut(new_row_idx) {
2029            *slot = peers
2030                .iter()
2031                .map(|&p| u32::try_from(p).expect("row index fits in u32"))
2032                .collect();
2033        }
2034    }
2035    for &peer in peers {
2036        // Skip peers whose indexed cell isn't a vector — same fence
2037        // as the F32 path; SQ8 cells flow through `cell_l2_sq`
2038        // below without dequantising.
2039        if !matches!(
2040            &table.rows[peer].values[col_pos],
2041            Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_)
2042        ) {
2043            continue;
2044        }
2045        // 1. add the new node to peer's adjacency
2046        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
2047            let layer_v = &mut g.layers[layer as usize];
2048            if let Some(slot) = layer_v.get_mut(peer)
2049                && !slot.contains(&new_row_u32)
2050            {
2051                slot.push(new_row_u32);
2052            }
2053        }
2054        // 2. if peer is over budget, rebuild its adjacency with the
2055        //    HNSW §4 heuristic — same diversity criterion as the
2056        //    insert path so connectivity stays consistent.
2057        let needs_trim = match &table.indices[idx_pos].kind {
2058            IndexKind::Nsw(g) => g.layers[layer as usize][peer].len() > cap,
2059            IndexKind::BTree(_) | IndexKind::Brin { .. } => false,
2060        };
2061        if needs_trim {
2062            let current_peers: Vec<usize> = match &table.indices[idx_pos].kind {
2063                IndexKind::Nsw(g) => g.layers[layer as usize][peer]
2064                    .iter()
2065                    .map(|&n| n as usize)
2066                    .collect(),
2067                IndexKind::BTree(_) | IndexKind::Brin { .. } => continue,
2068            };
2069            // Sort by distance from `peer`'s cell ascending so the
2070            // heuristic receives candidates closest-first. `cell_l2_sq`
2071            // dispatches on encoding so SQ8 columns trim using
2072            // symmetric ADC.
2073            let mut tagged: Vec<(f32, usize)> = current_peers
2074                .iter()
2075                .map(|&p| (cell_l2_sq(table, col_pos, peer, p), p))
2076                .collect();
2077            tagged.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
2078            let kept = select_neighbours_heuristic(&tagged, cap, table, col_pos);
2079            if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
2080                && let Some(slot) = g.layers[layer as usize].get_mut(peer)
2081            {
2082                *slot = kept
2083                    .into_iter()
2084                    .map(|p| u32::try_from(p).expect("row index fits in u32"))
2085                    .collect();
2086            }
2087        }
2088    }
2089}
2090
2091/// Squared L2 distance from `query` (raw f32) to the cell at
2092/// `(row, col_pos)`. Dispatches on cell encoding: `Value::Vector`
2093/// (F32) uses `l2_distance_sq`; `Value::Sq8Vector` uses
2094/// `sq8_l2_distance_sq_asymmetric` (the v6.0.1 quantised path).
2095/// Returns `f32::INFINITY` for any non-vector cell so callers can
2096/// compare uniformly.
2097fn vec_l2_sq(table: &Table, col_pos: usize, row: usize, query: &[f32]) -> f32 {
2098    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
2099        Some(Value::Vector(v)) if v.len() == query.len() => l2_distance_sq(v, query),
2100        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => {
2101            quantize::sq8_l2_distance_sq_asymmetric(q, query)
2102        }
2103        // v6.0.6: halfvec → fused NEON SIMD kernel; no Vec<f32>
2104        // allocation. v6.0.3 used `to_f32_vec()` + f32 NEON which
2105        // was correct but allocated per call (5× slower than F32).
2106        Some(Value::HalfVector(h)) if h.dim() == query.len() => {
2107            halfvec::half_l2_distance_sq_asymmetric(h, query)
2108        }
2109        _ => f32::INFINITY,
2110    }
2111}
2112
2113/// Squared L2 distance between two stored cells (no f32 query in
2114/// sight). Used during HNSW graph build — both endpoints are
2115/// rows already in the table, so symmetric ADC applies for SQ8
2116/// columns. Mixed-encoding cells within one column are a
2117/// schema-level impossibility (INSERT-time coercion enforces
2118/// uniform encoding), so the catch-all is an abort.
2119fn cell_l2_sq(table: &Table, col_pos: usize, row_a: usize, row_b: usize) -> f32 {
2120    let Some(cell_a) = table.rows.get(row_a).and_then(|r| r.values.get(col_pos)) else {
2121        return f32::INFINITY;
2122    };
2123    let Some(cell_b) = table.rows.get(row_b).and_then(|r| r.values.get(col_pos)) else {
2124        return f32::INFINITY;
2125    };
2126    match (cell_a, cell_b) {
2127        (Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => l2_distance_sq(a, b),
2128        (Value::Sq8Vector(a), Value::Sq8Vector(b)) if a.bytes.len() == b.bytes.len() => {
2129            quantize::sq8_l2_distance_sq(a, b)
2130        }
2131        // v6.0.6: halfvec symmetric NEON — fused SIMD kernel that
2132        // loads both cells' raw u16 bits, expands to f32 lanes
2133        // inline, FMA-accumulates the squared diff. No Vec<f32>
2134        // allocation per call.
2135        (Value::HalfVector(a), Value::HalfVector(b)) if a.dim() == b.dim() => {
2136            halfvec::half_l2_distance_sq(a, b)
2137        }
2138        _ => f32::INFINITY,
2139    }
2140}
2141
2142/// kNN-search-time distance: stored cell → f32 query under the
2143/// caller's metric. Dispatches on cell encoding so SQ8 columns
2144/// take the ADC path with the right asymmetric variant. NaN /
2145/// dim-mismatch / non-vector → `f32::INFINITY`.
2146fn cell_to_query_metric_distance(
2147    table: &Table,
2148    col_pos: usize,
2149    row: usize,
2150    query: &[f32],
2151    metric: NswMetric,
2152) -> f32 {
2153    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
2154        Some(Value::Vector(v)) if v.len() == query.len() => metric_distance(metric, v, query),
2155        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => match metric {
2156            NswMetric::L2 => quantize::sq8_l2_distance_sq_asymmetric(q, query),
2157            NswMetric::InnerProduct => quantize::sq8_inner_product_asymmetric(q, query),
2158            NswMetric::Cosine => quantize::sq8_cosine_distance_asymmetric(q, query),
2159        },
2160        // v6.0.6: halfvec dispatches by metric to fused NEON
2161        // kernels — no Vec<f32> allocation per call.
2162        Some(Value::HalfVector(h)) if h.dim() == query.len() => match metric {
2163            NswMetric::L2 => halfvec::half_l2_distance_sq_asymmetric(h, query),
2164            NswMetric::InnerProduct => halfvec::half_inner_product_asymmetric(h, query),
2165            NswMetric::Cosine => halfvec::half_cosine_distance_asymmetric(h, query),
2166        },
2167        _ => f32::INFINITY,
2168    }
2169}
2170
/// Distance metric used at NSW search time. The graph topology is
/// always built with `L2`; querying with `InnerProduct` / `Cosine`
/// reuses the same edges but ranks candidates by the chosen metric.
/// For the corpus-sized graphs this loses negligible recall vs
/// building separate per-metric graphs.
///
/// All three variants are oriented so that a smaller value means a
/// closer match, which lets callers sort ascending regardless of the
/// metric in use.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NswMetric {
    /// Squared Euclidean — ranks "smaller = closer" (the sqrt is
    /// monotonic so we skip it for ordering).
    L2,
    /// Negated dot product, matching pgvector `<#>` convention so
    /// "smaller = more similar" holds across all three metrics.
    InnerProduct,
    /// Cosine distance `1 - cos(a, b)`. Zero-norm operand yields
    /// `f32::INFINITY` so it sorts last.
    Cosine,
}
2188
2189/// Multi-layer HNSW kNN search: greedy-descend from the entry to layer 0,
2190/// then beam-search there with the requested `ef` to return the top `k`
2191/// results under the caller-chosen metric. Topology was built with L2 —
2192/// upper-layer descent uses L2 as a coarse heuristic; final beam search
2193/// runs in the requested metric so rankings are correct for `<#>` / `<=>`.
2194fn nsw_search(
2195    table: &Table,
2196    idx_pos: usize,
2197    query: &[f32],
2198    k: usize,
2199    ef: usize,
2200    metric: NswMetric,
2201) -> Vec<(f32, usize)> {
2202    let (entry, entry_level) = match &table.indices[idx_pos].kind {
2203        IndexKind::Nsw(g) => (g.entry, g.entry_level),
2204        IndexKind::BTree(_) | IndexKind::Brin { .. } => return Vec::new(),
2205    };
2206    let Some(entry) = entry else {
2207        return Vec::new();
2208    };
2209    let col_pos = table.indices[idx_pos].column_position;
2210    // v6.0.1 step 5: SQ8 columns over-fetch by `SQ8_RERANK_OVER_FETCH`
2211    // so the rerank pass below sees enough candidates to recover
2212    // recall after the ADC re-ordering. F32 + F16 columns skip the
2213    // over-fetch — F32 distances are exact, F16 dequant is
2214    // bit-exact at the storage layer so the beam search already
2215    // ranks under the column's full precision.
2216    let sq8 = matches!(
2217        table.schema.columns.get(col_pos).map(|c| c.ty),
2218        Some(DataType::Vector {
2219            encoding: VecEncoding::Sq8,
2220            ..
2221        })
2222    );
2223    let ef = if sq8 {
2224        ef.max(k).max(k * SQ8_RERANK_OVER_FETCH)
2225    } else {
2226        ef.max(k)
2227    };
2228    // Descend by L2 (the topology metric) so layers prune consistently.
2229    let entry_d = vec_l2_sq(table, col_pos, entry, query);
2230    let mut current = entry;
2231    let mut current_d = entry_d;
2232    for layer in (1..=entry_level).rev() {
2233        (current, current_d) = greedy_layer_walk(table, idx_pos, layer, current, current_d, query);
2234    }
2235    // Final beam search on layer 0 under the caller's metric.
2236    let mut results = layer_beam_search(table, idx_pos, 0, current, current_d, query, ef, metric);
2237    if sq8 {
2238        results = sq8_rerank(table, col_pos, &results, query, metric);
2239    }
2240    results.truncate(k);
2241    results
2242}
2243
2244/// v6.0.1 step 5: re-score ADC top-`K*3` candidates with the
2245/// dequantised cell vs the f32 query, then re-sort. Recovers the
2246/// recall the SQ8 ADC sacrifices for 4× compression — the design's
2247/// "f32 rerank step is on by default" path (deliberation #3).
2248/// `metric` is the same metric the beam search used; the rerank
2249/// arithmetic re-derives the exact distance under that metric.
2250fn sq8_rerank(
2251    table: &Table,
2252    col_pos: usize,
2253    candidates: &[(f32, usize)],
2254    query: &[f32],
2255    metric: NswMetric,
2256) -> Vec<(f32, usize)> {
2257    let mut out: Vec<(f32, usize)> = candidates
2258        .iter()
2259        .filter_map(|&(adc_d, row)| {
2260            let cell = table.rows.get(row).and_then(|r| r.values.get(col_pos))?;
2261            let Value::Sq8Vector(q) = cell else {
2262                // F32 cells shouldn't reach this path (sq8 fence
2263                // above), but stay defensive: pass through with
2264                // the ADC distance unchanged.
2265                return Some((adc_d, row));
2266            };
2267            let deq = quantize::dequantize(q);
2268            if deq.len() != query.len() {
2269                return None;
2270            }
2271            Some((metric_distance(metric, &deq, query), row))
2272        })
2273        .collect();
2274    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
2275    out
2276}
2277
/// Multiplier applied to `k` so the SQ8 rerank pass sees a wider
/// candidate set: for SQ8 columns `nsw_search` raises its beam width
/// to at least `k` times this value before the layer-0 search. 3× is
/// the design-stage value; v6.0.5 sweep work can re-tune once full
/// corpus profiling is in.
const SQ8_RERANK_OVER_FETCH: usize = 3;
2282
2283fn metric_distance(metric: NswMetric, a: &[f32], b: &[f32]) -> f32 {
2284    match metric {
2285        NswMetric::L2 => l2_distance_sq(a, b),
2286        NswMetric::InnerProduct => -inner_product_f32(a, b),
2287        NswMetric::Cosine => {
2288            let (dot, na, nb) = cosine_dot_norms_f32(a, b);
2289            if na == 0.0 || nb == 0.0 {
2290                return f32::INFINITY;
2291            }
2292            // `f32::sqrt` lives in std, so hand-roll Newton-Raphson on
2293            // f64 — same trick the L2 binary op already uses.
2294            let denom = sqrt_newton_f32(na) * sqrt_newton_f32(nb);
2295            1.0 - dot / denom
2296        }
2297    }
2298}
2299
2300/// v6.0.2: dispatch wrapper for the f32 dot product (used by `<#>` +
2301/// the cosine numerator). NEON path when `len % 4 == 0 && len >= 4`,
2302/// scalar fallback otherwise. Returns the positive dot — callers
2303/// negate for the pgvector `<#>` "smaller = closer" convention.
2304///
2305/// Public so perf gates + downstream benches can microbenchmark the
2306/// dispatch directly; not part of the STABILITY contract — internal
2307/// SIMD layout can evolve in any release.
2308#[doc(hidden)]
2309#[inline]
2310pub fn inner_product_f32(a: &[f32], b: &[f32]) -> f32 {
2311    #[cfg(target_arch = "aarch64")]
2312    {
2313        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
2314            // SAFETY: NEON is a baseline aarch64 feature; preconditions
2315            // (matching lengths, ≥ 1 full lane group) are checked above.
2316            return unsafe { inner_product_neon(a, b) };
2317        }
2318    }
2319    inner_product_scalar(a, b)
2320}
2321
/// Portable dot product: sums `a[i] * b[i]` left to right over the
/// common prefix of the two slices (extra elements of the longer
/// slice are ignored). Empty input yields `0.0`.
fn inner_product_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b.iter())
        .fold(0.0_f32, |acc, (lhs, rhs)| acc + lhs * rhs)
}
2329
/// NEON dot product over whole 4-lane groups.
///
/// # Safety
///
/// The caller must guarantee `a.len() == b.len()`; only complete
/// groups of four elements (counted from `a.len()`) are read, so any
/// trailing 1–3 elements would be ignored. `inner_product_f32` only
/// dispatches here for lengths that are a non-zero multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn inner_product_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32,
    };
    let len = a.len();
    let lhs = a.as_ptr();
    let rhs = b.as_ptr();
    unsafe {
        // Two independent accumulators so consecutive FMAs do not
        // form one serial dependency chain.
        let mut sum_lo: float32x4_t = vdupq_n_f32(0.0);
        let mut sum_hi: float32x4_t = vdupq_n_f32(0.0);
        let mut off = 0usize;
        // Main loop: eight floats (two lane groups) per iteration.
        while off + 8 <= len {
            let a_lo = vld1q_f32(lhs.add(off));
            let b_lo = vld1q_f32(rhs.add(off));
            sum_lo = vfmaq_f32(sum_lo, a_lo, b_lo);
            let a_hi = vld1q_f32(lhs.add(off + 4));
            let b_hi = vld1q_f32(rhs.add(off + 4));
            sum_hi = vfmaq_f32(sum_hi, a_hi, b_hi);
            off += 8;
        }
        // At most one leftover group of four.
        while off + 4 <= len {
            let a_grp = vld1q_f32(lhs.add(off));
            let b_grp = vld1q_f32(rhs.add(off));
            sum_lo = vfmaq_f32(sum_lo, a_grp, b_grp);
            off += 4;
        }
        vaddvq_f32(vaddq_f32(sum_lo, sum_hi))
    }
}
2363
2364/// v6.0.2: dispatch wrapper for the three accumulators (`dot`, `||a||²`,
2365/// `||b||²`) cosine needs. Same NEON pre-condition as the L2 / IP
2366/// paths; same scalar fallback shape.
2367///
2368/// Public for benchmarking only (see `inner_product_f32`); not in the
2369/// STABILITY contract.
2370#[doc(hidden)]
2371#[inline]
2372pub fn cosine_dot_norms_f32(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
2373    #[cfg(target_arch = "aarch64")]
2374    {
2375        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
2376            // SAFETY: see `inner_product_neon`.
2377            return unsafe { cosine_dot_norms_neon(a, b) };
2378        }
2379    }
2380    cosine_dot_norms_scalar(a, b)
2381}
2382
/// Portable single-pass accumulation of `(dot, ||a||², ||b||²)` over
/// the common prefix of the two slices. Empty input yields all zeros.
fn cosine_dot_norms_scalar(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    a.iter().zip(b.iter()).fold(
        (0.0_f32, 0.0_f32, 0.0_f32),
        |(dot, norm_a_sq, norm_b_sq), (lhs, rhs)| {
            (
                dot + lhs * rhs,
                norm_a_sq + lhs * lhs,
                norm_b_sq + rhs * rhs,
            )
        },
    )
}
2394
/// NEON kernel returning `(dot, ||a||², ||b||²)` over whole 4-lane
/// groups.
///
/// # Safety
///
/// The caller must guarantee `a.len() == b.len()`; only complete
/// groups of four elements (counted from `a.len()`) are read.
/// `cosine_dot_norms_f32` only dispatches here for lengths that are a
/// non-zero multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
// Short, near-identical register names follow NEON convention.
#[allow(clippy::many_single_char_names, clippy::similar_names)]
unsafe fn cosine_dot_norms_neon(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    use core::arch::aarch64::{float32x4_t, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32};
    let len = a.len();
    let lhs = a.as_ptr();
    let rhs = b.as_ptr();
    unsafe {
        // One vector accumulator per output sum.
        let mut sum_ab: float32x4_t = vdupq_n_f32(0.0);
        let mut sum_aa: float32x4_t = vdupq_n_f32(0.0);
        let mut sum_bb: float32x4_t = vdupq_n_f32(0.0);
        let mut off = 0usize;
        while off + 4 <= len {
            let a_grp = vld1q_f32(lhs.add(off));
            let b_grp = vld1q_f32(rhs.add(off));
            sum_ab = vfmaq_f32(sum_ab, a_grp, b_grp);
            sum_aa = vfmaq_f32(sum_aa, a_grp, a_grp);
            sum_bb = vfmaq_f32(sum_bb, b_grp, b_grp);
            off += 4;
        }
        // Horizontal add collapses each 4-lane accumulator to a scalar.
        (vaddvq_f32(sum_ab), vaddvq_f32(sum_aa), vaddvq_f32(sum_bb))
    }
}
2418
/// Square root for `no_std` builds, where `f32::sqrt` is unavailable.
///
/// Non-positive input (including `-0.0` and `-∞`) returns `0.0`; NaN
/// and `+∞` are returned unchanged. For every other input the result
/// is the f64 Newton-Raphson root narrowed back to `f32`.
///
/// The iteration is seeded by halving the f64 exponent (shift the bit
/// pattern right by one and re-add half the exponent bias), which is
/// within roughly 6% of the true root for any finite positive input.
/// Newton's method converges quadratically from there, so five steps
/// reach f64 precision. Seeding with `x` itself only halves the
/// estimate per step and needs dozens of iterations once `x` is far
/// from 1.
#[allow(clippy::cast_possible_truncation)] // narrowing the converged f64 root to f32 is the point
fn sqrt_newton_f32(x: f32) -> f32 {
    if x <= 0.0 {
        return 0.0;
    }
    if !x.is_finite() {
        // sqrt(+inf) = +inf, sqrt(NaN) = NaN.
        return x;
    }
    let wide = f64::from(x);
    // Every finite positive f32 (subnormals included) is a normal f64,
    // so the exponent trick below is valid across the whole range.
    let seed_bits = (wide.to_bits() >> 1) + (0x3ff0_0000_0000_0000_u64 >> 1);
    let mut g = f64::from_bits(seed_bits);
    for _ in 0..5 {
        g = 0.5 * (g + wide / g);
    }
    g as f32
}
2429
2430/// Squared Euclidean distance — used for ordering inside NSW (the sqrt
2431/// preserves the order). Caller takes sqrt before reporting back to SQL.
2432///
2433/// v3.3.2: aarch64 NEON path for `len % 4 == 0` (which covers every
2434/// HNSW-indexed VECTOR(N) where N is a multiple of 4 — i.e. all
2435/// production-shaped embeddings: 64, 128, 256, 384, 512, 768, 1024,
2436/// 1536, ...). Other shapes fall back to the scalar loop.
2437#[inline]
2438fn l2_distance_sq(a: &[f32], b: &[f32]) -> f32 {
2439    #[cfg(target_arch = "aarch64")]
2440    {
2441        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
2442            // SAFETY: NEON is a baseline aarch64 feature (ARMv8);
2443            // the precondition is checked above (matching lengths,
2444            // multiple of 4, at least one 128-bit lane group).
2445            return unsafe { l2_distance_sq_neon(a, b) };
2446        }
2447    }
2448    l2_distance_sq_scalar(a, b)
2449}
2450
/// Portable squared Euclidean distance: sums `(a[i] - b[i])²` left to
/// right over the common prefix of the two slices. Empty input yields
/// `0.0`.
fn l2_distance_sq_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b.iter()).fold(0.0_f32, |acc, (lhs, rhs)| {
        let diff = *lhs - *rhs;
        acc + diff * diff
    })
}
2459
/// NEON squared Euclidean distance over whole 4-lane groups.
///
/// # Safety
///
/// The caller must guarantee `a.len() == b.len()`; only complete
/// groups of four elements (counted from `a.len()`) are read.
/// `l2_distance_sq` only dispatches here for lengths that are a
/// non-zero multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn l2_distance_sq_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32, vsubq_f32,
    };
    let len = a.len();
    let lhs = a.as_ptr();
    let rhs = b.as_ptr();
    unsafe {
        // Two accumulator registers keep consecutive FMAs independent
        // (each FMA otherwise waits on the previous one).
        let mut sum_lo: float32x4_t = vdupq_n_f32(0.0);
        let mut sum_hi: float32x4_t = vdupq_n_f32(0.0);
        let mut off = 0usize;
        // Eight floats per iteration while two full groups remain.
        while off + 8 <= len {
            let diff_lo = vsubq_f32(vld1q_f32(lhs.add(off)), vld1q_f32(rhs.add(off)));
            sum_lo = vfmaq_f32(sum_lo, diff_lo, diff_lo);
            let diff_hi = vsubq_f32(vld1q_f32(lhs.add(off + 4)), vld1q_f32(rhs.add(off + 4)));
            sum_hi = vfmaq_f32(sum_hi, diff_hi, diff_hi);
            off += 8;
        }
        // A trailing single group of four lands here.
        while off + 4 <= len {
            let diff = vsubq_f32(vld1q_f32(lhs.add(off)), vld1q_f32(rhs.add(off)));
            sum_lo = vfmaq_f32(sum_lo, diff, diff);
            off += 4;
        }
        vaddvq_f32(vaddq_f32(sum_lo, sum_hi))
    }
}
2497
2498/// Public wrapper: run an NSW kNN search and return the top-k row
2499/// indices ordered by ascending distance under the given metric.
2500pub fn nsw_query(
2501    table: &Table,
2502    idx_name: &str,
2503    query: &[f32],
2504    k: usize,
2505    metric: NswMetric,
2506) -> Vec<usize> {
2507    let Some(idx_pos) = table.indices.iter().position(|i| i.name == idx_name) else {
2508        return Vec::new();
2509    };
2510    let ef = (k * 2).max(NSW_DEFAULT_M);
2511    let mut hits = nsw_search(table, idx_pos, query, k, ef, metric);
2512    hits.truncate(k);
2513    hits.into_iter().map(|(_, idx)| idx).collect()
2514}
2515
2516/// Find any NSW index on a column. Used by the planner to decide
2517/// whether an `ORDER BY col <-> literal LIMIT k` query can skip the
2518/// brute-force scan.
2519pub fn nsw_index_on(table: &Table, column_position: usize) -> Option<&Index> {
2520    table
2521        .indices
2522        .iter()
2523        .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_)))
2524}
2525
/// Catalog: insertion-ordered `Vec<Table>` for stable iter / serialize,
/// plus a `BTreeMap<String, usize>` sidecar index so `get` / `get_mut`
/// run in O(log n) instead of the old linear scan with per-element
/// string compares.
///
/// A pure `BTreeMap<String, Table>` was tried in an interim version
/// of v3.1.2 and regressed the single-table catalog benches by ~10%
/// (the per-element `BTreeMap` overhead outweighs the lookup win
/// when n is small). The sidecar shape preserves the insertion-order
/// iteration the on-disk encoding relies on and keeps `last_mut`
/// (used by the deserialize hot path) cheap.
#[derive(Debug, Clone, Default)]
pub struct Catalog {
    /// Tables in insertion order; `by_name` stores positions into
    /// this `Vec`.
    tables: Vec<Table>,
    /// `name → tables[index]`. Kept in lock-step with `tables`.
    /// `create_table` is the only write path.
    by_name: BTreeMap<String, usize>,
    /// v5.1: in-memory cold-tier segments. Side-loaded via
    /// [`Catalog::load_segment_bytes`] — they live outside the
    /// catalog snapshot (caller persists them as separate files
    /// and re-loads on boot, until v5.3's `CatalogManifest` makes
    /// that wiring automatic). `RowLocator::Cold { segment_id, .. }`
    /// indexes this `Vec`. Cleared on `Catalog::new` / fresh
    /// `deserialize`.
    ///
    /// `Arc` wrap keeps `Catalog::clone` at O(N segments) bumps
    /// (rather than O(total segment bytes) memcpy) so the v4.42
    /// group-commit pre-image rollback invariant — clone is
    /// effectively free — survives the cold-tier addition.
    ///
    /// v6.7.3 — slots became `Option<…>` so cold-segment compaction
    /// can tombstone merged sources without breaking the
    /// `segment_id = index_into_vec` contract that on-disk
    /// `RowLocator::Cold { segment_id }` already serialized.
    /// `None` slot = the segment was retired by compaction; the
    /// physical file may still be on disk (next CHECKPOINT writes
    /// a manifest that no longer lists it, and the file becomes
    /// an orphan eligible for offline cleanup).
    cold_segments: Vec<Option<Arc<OwnedSegment>>>,
}
2566
2567impl Catalog {
    /// Creates an empty catalog: no tables, an empty name index and
    /// no cold-tier segment slots. Usable in `const` contexts.
    pub const fn new() -> Self {
        Self {
            tables: Vec::new(),
            by_name: BTreeMap::new(),
            cold_segments: Vec::new(),
        }
    }
2575
2576    pub fn create_table(&mut self, schema: TableSchema) -> Result<(), StorageError> {
2577        if self.by_name.contains_key(&schema.name) {
2578            return Err(StorageError::DuplicateTable {
2579                name: schema.name.clone(),
2580            });
2581        }
2582        let idx = self.tables.len();
2583        let name = schema.name.clone();
2584        self.tables.push(Table::new(schema));
2585        self.by_name.insert(name, idx);
2586        Ok(())
2587    }
2588
2589    pub fn get(&self, name: &str) -> Option<&Table> {
2590        let idx = *self.by_name.get(name)?;
2591        self.tables.get(idx)
2592    }
2593
2594    pub fn get_mut(&mut self, name: &str) -> Option<&mut Table> {
2595        let idx = *self.by_name.get(name)?;
2596        self.tables.get_mut(idx)
2597    }
2598
    /// Number of tables currently held by the catalog.
    pub fn table_count(&self) -> usize {
        self.tables.len()
    }
2602
2603    /// Borrow-free copy of every table's name in catalog order
2604    /// (= insertion order, matching the on-disk encoding).
2605    pub fn table_names(&self) -> Vec<String> {
2606        self.tables.iter().map(|t| t.schema.name.clone()).collect()
2607    }
2608
2609    /// v5.1: register a cold-tier segment that already lives in
2610    /// memory (caller did the file read). Returns the
2611    /// `segment_id` that `RowLocator::Cold { segment_id, .. }`
2612    /// will reference — currently this is just the index into
2613    /// `cold_segments`, but treat it as an opaque token.
2614    ///
2615    /// Storage is `no_std`, so file I/O is the caller's
2616    /// responsibility — `spg-server` reads the file and forwards
2617    /// the bytes here. The bytes stay resident in the catalog
2618    /// for the life of the `Catalog`, parsed only once.
2619    pub fn load_segment_bytes(&mut self, bytes: Vec<u8>) -> Result<u32, StorageError> {
2620        let id = u32::try_from(self.cold_segments.len()).map_err(|_| {
2621            StorageError::Corrupt("cold segment count would exceed u32::MAX".into())
2622        })?;
2623        let seg = OwnedSegment::from_bytes(bytes)
2624            .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
2625        self.cold_segments.push(Some(Arc::new(seg)));
2626        Ok(id)
2627    }
2628
2629    /// v6.7.3 — register a cold-tier segment at a specific id. Used
2630    /// by the spg-server manifest-boot path so segments whose
2631    /// neighbouring ids were retired by compaction still get back
2632    /// the same `segment_id` they had pre-restart (the
2633    /// `RowLocator::Cold { segment_id }` baked into the BTree-index
2634    /// snapshot persists across restart and must continue to
2635    /// resolve).
2636    ///
2637    /// Pads the Vec with `None` slots up to `target_id` if needed.
2638    /// Errors when the target slot is already occupied (would
2639    /// stomp another segment), the parse fails, or `target_id`
2640    /// exceeds `u32::MAX`.
2641    pub fn load_segment_bytes_at(
2642        &mut self,
2643        target_id: u32,
2644        bytes: Vec<u8>,
2645    ) -> Result<(), StorageError> {
2646        let seg = OwnedSegment::from_bytes(bytes)
2647            .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
2648        let idx = target_id as usize;
2649        while self.cold_segments.len() <= idx {
2650            self.cold_segments.push(None);
2651        }
2652        if self.cold_segments[idx].is_some() {
2653            return Err(StorageError::Corrupt(format!(
2654                "load_segment_bytes_at: segment_id {target_id} already occupied"
2655            )));
2656        }
2657        self.cold_segments[idx] = Some(Arc::new(seg));
2658        Ok(())
2659    }
2660
2661    /// v6.7.3 — retire a cold-tier segment slot (compaction-driven).
2662    /// The physical file is the caller's concern (typically kept
2663    /// on disk until the next CHECKPOINT writes a manifest that
2664    /// no longer lists it); this just flips the in-memory slot
2665    /// to `None` so later cold lookups for `segment_id` resolve
2666    /// as "unknown" instead of returning a stale row.
2667    ///
2668    /// No-op when the slot is already `None`. Errors only when
2669    /// `segment_id` is out of bounds.
2670    pub fn tombstone_segment(&mut self, segment_id: u32) -> Result<(), StorageError> {
2671        let idx = segment_id as usize;
2672        if idx >= self.cold_segments.len() {
2673            return Err(StorageError::Corrupt(format!(
2674                "tombstone_segment: segment_id {segment_id} out of bounds (len={})",
2675                self.cold_segments.len()
2676            )));
2677        }
2678        self.cold_segments[idx] = None;
2679        Ok(())
2680    }
2681
2682    /// Number of *active* (non-tombstoned) cold segments.
2683    #[must_use]
2684    pub fn cold_segment_count(&self) -> usize {
2685        self.cold_segments.iter().filter(|s| s.is_some()).count()
2686    }
2687
    /// Slot count including tombstones. This equals the id that the
    /// next call to [`Catalog::load_segment_bytes`] (the variant that
    /// does not take an explicit id) would allocate.
    #[must_use]
    pub fn cold_segment_slot_count(&self) -> usize {
        self.cold_segments.len()
    }
2694
2695    /// v6.2.7 — list every *active* cold-tier segment id known to
2696    /// this catalog (skips compaction tombstones since v6.7.3).
2697    /// Used by EXPLAIN ANALYZE to annotate scan nodes with the
2698    /// segments they could have walked.
2699    #[must_use]
2700    pub fn cold_segment_ids_global(&self) -> Vec<u32> {
2701        self.cold_segments
2702            .iter()
2703            .enumerate()
2704            .filter_map(|(i, s)| s.as_ref().map(|_| i as u32))
2705            .collect()
2706    }
2707
2708    /// v5.2.1: sum of `Table::hot_bytes` across every table. The v5.2
2709    /// freezer compares this against `SPG_HOT_TIER_BYTES` (parsed at
2710    /// server startup; default 4 GiB) and wakes when the budget is
2711    /// crossed. Pre-freezer (v5.2.1) this is measurement-only — the
2712    /// counter exposes whether the budget is being approached without
2713    /// triggering any demotion.
2714    #[must_use]
2715    pub fn hot_tier_bytes(&self) -> u64 {
2716        self.tables
2717            .iter()
2718            .map(Table::hot_bytes)
2719            .fold(0u64, u64::saturating_add)
2720    }
2721
2722    /// v5.2.2: freeze the **first** `max_rows` rows of `table_name`'s
2723    /// hot tier into a brand-new cold-tier segment. The named `BTree`
2724    /// index supplies the per-row PK (its column must be an integer
2725    /// type — v5.2.2 only supports `IndexKey::Int` PKs, matching the
2726    /// `index_key_as_u64` constraint used by the cold-tier lookup
2727    /// path). On success returns a [`FreezeReport`] with the
2728    /// freshly-allocated segment id, the count of rows that moved,
2729    /// the encoded segment bytes (so the caller can persist them to
2730    /// disk for later reload via `SPG_PRELOAD_COLD_SEGMENT`), and the
2731    /// hot-tier byte delta that was reclaimed.
2732    ///
2733    /// **Semantics**:
2734    /// 1. The first `max_rows` rows (by hot-tier position — same as
2735    ///    insertion order under v4.39 `PersistentVec`) are read.
2736    /// 2. Rows are sorted ascending by PK and serialised into a new
2737    ///    segment via [`encode_segment`].
2738    /// 3. The hot rows are dropped via [`Table::delete_rows`]; the
2739    ///    `rebuild_indices` it triggers regenerates `Hot` locators
2740    ///    for every remaining row (their positions shift down by
2741    ///    `max_rows`). Existing `Cold` locators in this index — from
2742    ///    a previous freeze — are also rebuilt **but with empty
2743    ///    payload** since rebuild reads only `self.rows`; this
2744    ///    routine re-registers them at the end of the call so the
2745    ///    user-visible state preserves all prior cold locators.
2746    /// 4. The new segment is loaded into `self.cold_segments` via
2747    ///    [`Catalog::load_segment_bytes`] (allocating a fresh
2748    ///    `segment_id`). New `Cold` locators are registered on the
2749    ///    named index — one per frozen row.
2750    ///
2751    /// **v5.2.2 limits** (relaxed in later sub-versions):
2752    /// - INSERT-only flow: subsequent UPDATE/DELETE on a frozen row
2753    ///   returns a stale-locator error (no promote-on-write until
2754    ///   v5.2.3).
2755    /// - Single-table scope: callers iterate tables themselves.
2756    /// - All-or-nothing: returns `Err` and leaves catalog unchanged
2757    ///   if any step fails before the atomic swap point.
2758    ///
2759    /// Errors:
2760    /// - [`StorageError::Corrupt`] for missing table/index, non-`BTree`
2761    ///   index, non-integer PK column, `max_rows == 0`, or
2762    ///   `max_rows > row_count`.
2763    /// - The encoder's [`SegmentError`] surfaces as `Corrupt` (the
2764    ///   only realistic source is "a single row is larger than the
2765    ///   page size"; SPG schemas don't hit it in practice).
2766    pub fn freeze_oldest_to_cold(
2767        &mut self,
2768        table_name: &str,
2769        index_name: &str,
2770        max_rows: usize,
2771    ) -> Result<FreezeReport, StorageError> {
2772        // --- validation phase: never mutates ---------------------
2773        if max_rows == 0 {
2774            return Err(StorageError::Corrupt(
2775                "freeze_oldest_to_cold: max_rows must be > 0".into(),
2776            ));
2777        }
2778        let table = self.get(table_name).ok_or_else(|| {
2779            StorageError::Corrupt(format!(
2780                "freeze_oldest_to_cold: table {table_name:?} not found"
2781            ))
2782        })?;
2783        if max_rows > table.rows.len() {
2784            return Err(StorageError::Corrupt(format!(
2785                "freeze_oldest_to_cold: max_rows {max_rows} > row_count {}",
2786                table.rows.len()
2787            )));
2788        }
2789        let idx = table
2790            .indices
2791            .iter()
2792            .find(|i| i.name == index_name)
2793            .ok_or_else(|| {
2794                StorageError::Corrupt(format!(
2795                    "freeze_oldest_to_cold: index {index_name:?} not found on {table_name:?}"
2796                ))
2797            })?;
2798        if !matches!(idx.kind, IndexKind::BTree(_)) {
2799            return Err(StorageError::Corrupt(format!(
2800                "freeze_oldest_to_cold: index {index_name:?} is NSW; only BTree indices may freeze"
2801            )));
2802        }
2803        let column_position = idx.column_position;
2804
2805        // --- segment build phase: reads only --------------------
2806        let schema = table.schema.clone();
2807        let mut to_freeze: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(max_rows);
2808        for row_idx in 0..max_rows {
2809            let row = table.rows.get(row_idx).expect("bounds-checked above");
2810            let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
2811                StorageError::Corrupt(format!(
2812                    "freeze_oldest_to_cold: row {row_idx} has NULL / non-key value in index column"
2813                ))
2814            })?;
2815            let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
2816                StorageError::Corrupt(format!(
2817                    "freeze_oldest_to_cold: index {index_name:?} column type is non-integer; \
2818                     v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
2819                ))
2820            })?;
2821            to_freeze.push((pk_u64, encode_row_body_dense(row, &schema), key));
2822        }
2823        // encode_segment requires ascending u64 keys. Sort by PK
2824        // before encoding; the caller's row-position order is not
2825        // necessarily PK order (e.g. workloads that insert random
2826        // PKs).
2827        to_freeze.sort_by_key(|(k, _, _)| *k);
2828        // Reject duplicate PKs — encode_segment also rejects them
2829        // (`SegmentError::UnsortedKey`), but the resulting error
2830        // message there is misleading. Surface a clearer one.
2831        for w in to_freeze.windows(2) {
2832            if w[0].0 == w[1].0 {
2833                return Err(StorageError::Corrupt(format!(
2834                    "freeze_oldest_to_cold: duplicate PK {} in freeze batch",
2835                    w[0].0
2836                )));
2837            }
2838        }
2839        // Snapshot the (key, locator) pairs that will be registered
2840        // post-swap. Cloning the IndexKey out before the move makes
2841        // the registration loop borrow-free.
2842        let post_swap_keys: Vec<IndexKey> = to_freeze.iter().map(|(_, _, k)| k.clone()).collect();
2843        // Segment encode is now infallible w.r.t. ordering. Map the
2844        // `SegmentError` into a `StorageError::Corrupt` so the
2845        // public surface stays one error type.
2846        let seg_rows: Vec<(u64, Vec<u8>)> = to_freeze
2847            .into_iter()
2848            .map(|(k, body, _)| (k, body))
2849            .collect();
2850        let frozen_rows = seg_rows.len();
2851        let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
2852            .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: encode: {e}")))?;
2853
2854        // --- atomic swap phase: mutations only past this point ---
2855        // v5.2.3 made `Table::rebuild_indices` preserve every Cold
2856        // locator across the per-table rebuild, so `delete_rows`
2857        // below no longer wipes prior-freeze cold entries. The pre-
2858        // v5.2.3 capture-then-re-register that used to live here
2859        // was removed in v5.3.1 — keeping it would double-count
2860        // every prior-frozen key's Cold locator on each subsequent
2861        // freeze.
2862        let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
2863        let positions: Vec<usize> = (0..max_rows).collect();
2864        let t_mut = self
2865            .get_mut(table_name)
2866            .expect("just validated; still present");
2867        let removed = t_mut.delete_rows(&positions);
2868        debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
2869        let bytes_after = t_mut.hot_bytes();
2870        let bytes_freed = bytes_before.saturating_sub(bytes_after);
2871
2872        let segment_id = self
2873            .load_segment_bytes(seg_bytes.clone())
2874            .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: load: {e}")))?;
2875        let new_cold = post_swap_keys.into_iter().map(|k| {
2876            (
2877                k,
2878                RowLocator::Cold {
2879                    segment_id,
2880                    page_offset: 0,
2881                },
2882            )
2883        });
2884        let t_mut = self.get_mut(table_name).expect("still present");
2885        t_mut.register_cold_locators(index_name, new_cold)?;
2886
2887        Ok(FreezeReport {
2888            segment_id,
2889            frozen_rows,
2890            bytes_freed,
2891            segment_bytes: seg_bytes,
2892        })
2893    }
2894
2895    /// v5.1: borrow the cold segment at `segment_id`. Used by the
2896    /// spg-server preload path to enumerate (key, locator) pairs
2897    /// after loading a segment, so it can call
2898    /// [`Table::register_cold_locators`] without re-parsing the
2899    /// bytes.
2900    #[must_use]
2901    pub fn cold_segment(&self, segment_id: u32) -> Option<&OwnedSegment> {
2902        self.cold_segments
2903            .get(segment_id as usize)
2904            .and_then(|s| s.as_deref())
2905    }
2906
2907    /// v5.1: resolve a single `RowLocator::Cold` to its underlying
2908    /// `Row`. Decoupled from [`Catalog::lookup_by_pk`] so callers
2909    /// iterating a multi-locator slice (e.g. the engine's index
2910    /// seek path) can dispatch per locator instead of getting back
2911    /// only the first row for a key. Returns `None` when the
2912    /// segment isn't registered, the key isn't `u64`-coercible, or
2913    /// the segment doesn't actually carry the key (bloom or page-
2914    /// index reject).
2915    pub fn resolve_cold_locator(
2916        &self,
2917        table_name: &str,
2918        segment_id: u32,
2919        key: &IndexKey,
2920    ) -> Option<Row> {
2921        let t = self.get(table_name)?;
2922        let u64_key = index_key_as_u64(key)?;
2923        let seg = self.cold_segments.get(segment_id as usize)?.as_ref()?;
2924        let payload = seg.lookup(u64_key)?;
2925        let (row, _) = decode_row_body_dense(&payload, &t.schema).ok()?;
2926        Some(row)
2927    }
2928
2929    /// v5.1: indexed PK lookup that dispatches per locator,
2930    /// returning the first matching row from either the hot tier
2931    /// (`Table::rows`) or a registered cold segment.
2932    ///
2933    /// The cold path requires the index column to be coercible to
2934    /// a `u64` (the segment's PK type) and the segment payload to
2935    /// be a [`encode_row_body_dense`]-encoded row body for the
2936    /// same schema. v5.1 ships this for BIGINT / INT / SMALLINT
2937    /// PKs; other types fall through to hot-only behavior.
2938    ///
2939    /// Returns `None` if (a) the table or index doesn't exist,
2940    /// (b) the key isn't in the index at all, or (c) the key was
2941    /// resolved to a stale locator (Hot index out of range, Cold
2942    /// segment id unknown, segment lookup miss). Does not surface
2943    /// segment-decode errors — those would indicate corrupted
2944    /// cold-tier files and should be caught at
2945    /// [`Catalog::load_segment_bytes`] time.
2946    pub fn lookup_by_pk(&self, table: &str, index_name: &str, key: &IndexKey) -> Option<Row> {
2947        let t = self.get(table)?;
2948        let idx = t.indices.iter().find(|i| i.name == index_name)?;
2949        let locators = idx.lookup_eq(key);
2950        let cold_u64_key = index_key_as_u64(key);
2951        for loc in locators {
2952            match *loc {
2953                RowLocator::Hot(i) => {
2954                    if let Some(row) = t.rows.get(i) {
2955                        return Some(row.clone());
2956                    }
2957                }
2958                RowLocator::Cold {
2959                    segment_id,
2960                    page_offset: _,
2961                } => {
2962                    let Some(u64_key) = cold_u64_key else {
2963                        // Key type not coercible to u64 — cold tier
2964                        // only handles BIGINT/INT/SMALLINT in v5.1.
2965                        continue;
2966                    };
2967                    let Some(seg) = self
2968                        .cold_segments
2969                        .get(segment_id as usize)
2970                        .and_then(|s| s.as_deref())
2971                    else {
2972                        // v6.7.3 — `None` slot = compaction
2973                        // retired this segment; the live locator
2974                        // on a freshly-compacted index points to
2975                        // the merged segment_id, so a Cold hit
2976                        // here against a tombstone means the BTree
2977                        // entry hasn't been swapped yet (mid-
2978                        // compaction reader race) or the caller is
2979                        // looking up a stale snapshot. Skip — the
2980                        // next locator in the list, if any, is
2981                        // typically the merged segment.
2982                        continue;
2983                    };
2984                    let Some(payload) = seg.lookup(u64_key) else {
2985                        continue;
2986                    };
2987                    let (row, _) = decode_row_body_dense(&payload, &t.schema).ok()?;
2988                    return Some(row);
2989                }
2990            }
2991        }
2992        None
2993    }
2994
2995    /// v5.2.3: promote a frozen row back to the hot tier so an
2996    /// UPDATE / DELETE can mutate it. Reads the cold-tier row body
2997    /// (decoded from its registered segment), pushes it into
2998    /// `table.rows` via [`Table::insert`] (which also adds a fresh
2999    /// `Hot(new_idx)` locator on `index_name`), then retires the
3000    /// shadowed `Cold` locator via
3001    /// [`Table::remove_cold_locators_for_key`]. The cold-tier row
3002    /// in the segment file becomes garbage — recoverable when a
3003    /// future cold-segment compaction job lands.
3004    ///
3005    /// Returns:
3006    /// - `Ok(Some(new_hot_idx))` when the key resolved through a
3007    ///   cold locator and the promote completed. `new_hot_idx` is
3008    ///   the position the row now occupies in `table.rows`.
3009    /// - `Ok(None)` when the key has no Cold locator on the index
3010    ///   (already hot, or wasn't present at all). Callers treat this
3011    ///   as "nothing to do here, fall back to the hot-only path".
3012    ///
3013    /// Errors when the table / index doesn't exist, the index isn't
3014    /// `BTree`, the cold segment is missing / can't decode the row,
3015    /// or the inferred row body fails `Table::insert` validation.
3016    pub fn promote_cold_row(
3017        &mut self,
3018        table_name: &str,
3019        index_name: &str,
3020        key: &IndexKey,
3021    ) -> Result<Option<usize>, StorageError> {
3022        let cold_loc = self.find_cold_locator(table_name, index_name, key)?;
3023        let Some((segment_id, _page_offset)) = cold_loc else {
3024            return Ok(None);
3025        };
3026        let u64_key = index_key_as_u64(key).ok_or_else(|| {
3027            StorageError::Corrupt(
3028                "promote_cold_row: key type not coercible to u64 (cold tier requires integer PK)"
3029                    .into(),
3030            )
3031        })?;
3032        // Read the row body from the segment. Borrow the segment +
3033        // schema short-term so we can then take `&mut self` for the
3034        // hot-side insert.
3035        let schema = self
3036            .get(table_name)
3037            .ok_or_else(|| {
3038                StorageError::Corrupt(format!("promote_cold_row: table {table_name:?} not found"))
3039            })?
3040            .schema
3041            .clone();
3042        let seg = self
3043            .cold_segments
3044            .get(segment_id as usize)
3045            .and_then(|s| s.as_ref())
3046            .ok_or_else(|| {
3047                StorageError::Corrupt(format!(
3048                    "promote_cold_row: segment {segment_id} not registered on catalog"
3049                ))
3050            })?;
3051        let payload = seg.lookup(u64_key).ok_or_else(|| {
3052            StorageError::Corrupt(format!(
3053                "promote_cold_row: key {u64_key} resolves to segment {segment_id} \
3054                 but the segment's bloom/page lookup didn't return a row"
3055            ))
3056        })?;
3057        let (row, _consumed) = decode_row_body_dense(&payload, &schema)?;
3058        // Insert the promoted row into the hot tier. `Table::insert`
3059        // appends to `self.rows`, adds a `Hot(new_idx)` locator to
3060        // every BTree index covering the row's keyed columns, and
3061        // increments `hot_bytes`.
3062        let t = self
3063            .get_mut(table_name)
3064            .expect("table existed at lookup time");
3065        t.insert(row)?;
3066        let new_hot_idx =
3067            t.rows.len().checked_sub(1).ok_or_else(|| {
3068                StorageError::Corrupt("promote_cold_row: empty after insert".into())
3069            })?;
3070        // The hot insert added Hot(new_idx) alongside the still-
3071        // present Cold locator. Drop the Cold entry so future
3072        // lookups return only the fresh hot row.
3073        t.remove_cold_locators_for_key(index_name, key)?;
3074        Ok(Some(new_hot_idx))
3075    }
3076
3077    /// v5.2.3: shadow a frozen row's index entry. Used by DELETE
3078    /// when the row to remove lives in a cold-tier segment — the
3079    /// row body stays in the segment file (becoming garbage) but
3080    /// every `Cold` locator for `key` on `index_name` is removed
3081    /// so PK lookups stop returning it.
3082    ///
3083    /// Returns the number of cold locators retired (0 when the key
3084    /// has no cold entries — the DELETE fell on a hot row or a
3085    /// key that was already absent). Errors when the table /
3086    /// index doesn't exist or the index isn't `BTree`.
3087    ///
3088    /// Cold-segment compaction (which merges shadowed-heavy
3089    /// segments and reclaims their disk footprint) lands in a
3090    /// later v5.x sub-version; until then, repeated UPDATE/DELETE
3091    /// of cold rows can amplify cold-segment disk usage by up to
3092    /// 1-2× — still well under typical LSM-tree shadowing because
3093    /// SPG segments are bulk-baked, not write-merged.
3094    pub fn shadow_cold_row(
3095        &mut self,
3096        table_name: &str,
3097        index_name: &str,
3098        key: &IndexKey,
3099    ) -> Result<usize, StorageError> {
3100        let t = self.get_mut(table_name).ok_or_else(|| {
3101            StorageError::Corrupt(format!("shadow_cold_row: table {table_name:?} not found"))
3102        })?;
3103        t.remove_cold_locators_for_key(index_name, key)
3104    }
3105
3106    /// v6.7.4 — read-only slice preparation for the parallel
3107    /// freezer. Walks rows in `row_range`, builds the
3108    /// `(pk_u64, encoded_body, IndexKey)` triples that the
3109    /// coordinator's k-way merge consumes, sorts the slice by
3110    /// `pk_u64`, and returns a [`FreezeSlice`].
3111    ///
3112    /// Caller invariants:
3113    /// - `row_range.end <= table.rows.len()` (caller's job to
3114    ///   compute the partition).
3115    /// - All slices passed to `commit_freeze_slices` must cover a
3116    ///   contiguous half-open range `[0, total_max_rows)` with no
3117    ///   gaps and no overlaps. The coordinator validates this
3118    ///   invariant before committing.
3119    ///
3120    /// `&self`-only — multiple workers can run this concurrently
3121    /// against the same `Catalog` reference under the engine's
3122    /// write lock (workers don't mutate; the coordinator does).
3123    pub fn prepare_freeze_slice(
3124        &self,
3125        table_name: &str,
3126        index_name: &str,
3127        row_range: core::ops::Range<usize>,
3128    ) -> Result<FreezeSlice, StorageError> {
3129        let table = self.get(table_name).ok_or_else(|| {
3130            StorageError::Corrupt(format!(
3131                "prepare_freeze_slice: table {table_name:?} not found"
3132            ))
3133        })?;
3134        let idx = table
3135            .indices
3136            .iter()
3137            .find(|i| i.name == index_name)
3138            .ok_or_else(|| {
3139                StorageError::Corrupt(format!(
3140                    "prepare_freeze_slice: index {index_name:?} not found on {table_name:?}"
3141                ))
3142            })?;
3143        if !matches!(idx.kind, IndexKind::BTree(_)) {
3144            return Err(StorageError::Corrupt(format!(
3145                "prepare_freeze_slice: index {index_name:?} is NSW; only BTree indices may freeze"
3146            )));
3147        }
3148        if row_range.end > table.rows.len() {
3149            return Err(StorageError::Corrupt(format!(
3150                "prepare_freeze_slice: row_range end {} > row_count {}",
3151                row_range.end,
3152                table.rows.len()
3153            )));
3154        }
3155        let column_position = idx.column_position;
3156        let schema = table.schema.clone();
3157        let mut rows: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(row_range.len());
3158        for row_idx in row_range.clone() {
3159            let row = table.rows.get(row_idx).expect("bounds-checked above");
3160            let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
3161                StorageError::Corrupt(format!(
3162                    "prepare_freeze_slice: row {row_idx} has NULL / non-key value in index column"
3163                ))
3164            })?;
3165            let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
3166                StorageError::Corrupt(format!(
3167                    "prepare_freeze_slice: index {index_name:?} column type is non-integer; \
3168                     v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
3169                ))
3170            })?;
3171            rows.push((pk_u64, encode_row_body_dense(row, &schema), key));
3172        }
3173        rows.sort_by_key(|(k, _, _)| *k);
3174        Ok(FreezeSlice { row_range, rows })
3175    }
3176
3177    /// v6.7.4 — coordinator commit step. Merges N
3178    /// [`FreezeSlice`]s into one segment via the standard
3179    /// [`encode_segment`] path, atomically swaps the catalog
3180    /// state (delete the union row range + register Cold
3181    /// locators + load the segment).
3182    ///
3183    /// Validates that the slices cover a contiguous, gap-free,
3184    /// overlap-free half-open range starting at index 0 (the
3185    /// freezer always freezes "oldest first" — same semantics as
3186    /// the single-threaded [`Catalog::freeze_oldest_to_cold`]).
3187    ///
3188    /// Empty `slices` → no-op success (returns a zero-row report
3189    /// without mutating). Total row count = `Σ slice.rows.len()`.
3190    pub fn commit_freeze_slices(
3191        &mut self,
3192        table_name: &str,
3193        index_name: &str,
3194        slices: Vec<FreezeSlice>,
3195    ) -> Result<FreezeReport, StorageError> {
3196        // --- validation phase: never mutates ---------------------
3197        let table = self.get(table_name).ok_or_else(|| {
3198            StorageError::Corrupt(format!(
3199                "commit_freeze_slices: table {table_name:?} not found"
3200            ))
3201        })?;
3202        let idx = table
3203            .indices
3204            .iter()
3205            .find(|i| i.name == index_name)
3206            .ok_or_else(|| {
3207                StorageError::Corrupt(format!(
3208                    "commit_freeze_slices: index {index_name:?} not found on {table_name:?}"
3209                ))
3210            })?;
3211        if !matches!(idx.kind, IndexKind::BTree(_)) {
3212            return Err(StorageError::Corrupt(format!(
3213                "commit_freeze_slices: index {index_name:?} is NSW; only BTree indices may freeze"
3214            )));
3215        }
3216        // Validate slice coverage: contiguous from 0, no gaps, no
3217        // overlaps. Allow the caller to pass slices in any order —
3218        // sort by row_range.start first.
3219        let mut ordered = slices;
3220        ordered.sort_by_key(|s| s.row_range.start);
3221        // Drop fully-empty slices that fell out of an uneven
3222        // partition; they carry no data but contribute to the
3223        // contiguity check, so keep them in line.
3224        let mut expected_start = 0usize;
3225        for s in &ordered {
3226            if s.row_range.start != expected_start {
3227                return Err(StorageError::Corrupt(format!(
3228                    "commit_freeze_slices: gap/overlap at row {}; expected start {}",
3229                    s.row_range.start, expected_start
3230                )));
3231            }
3232            expected_start = s.row_range.end;
3233        }
3234        let max_rows = expected_start;
3235        if max_rows > table.rows.len() {
3236            return Err(StorageError::Corrupt(format!(
3237                "commit_freeze_slices: total row range {} exceeds row_count {}",
3238                max_rows,
3239                table.rows.len()
3240            )));
3241        }
3242        if max_rows == 0 {
3243            return Ok(FreezeReport {
3244                segment_id: u32::MAX,
3245                frozen_rows: 0,
3246                bytes_freed: 0,
3247                segment_bytes: Vec::new(),
3248            });
3249        }
3250
3251        // --- segment build phase: reads only --------------------
3252        // K-way merge of already-sorted slices. Each slice's rows
3253        // are ascending by pk_u64; we keep a per-slice cursor and
3254        // pull the next-smallest head until every cursor drains.
3255        let total_rows: usize = ordered.iter().map(|s| s.rows.len()).sum();
3256        if total_rows != max_rows {
3257            return Err(StorageError::Corrupt(format!(
3258                "commit_freeze_slices: total slice rows {total_rows} ≠ row_range coverage {max_rows}"
3259            )));
3260        }
3261        let mut cursors: Vec<usize> = alloc::vec![0; ordered.len()];
3262        let mut merged: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(total_rows);
3263        loop {
3264            // Pick the slice whose head row has the smallest key
3265            // and isn't yet exhausted.
3266            let mut pick: Option<usize> = None;
3267            for (i, c) in cursors.iter().enumerate() {
3268                let slice = &ordered[i];
3269                if *c >= slice.rows.len() {
3270                    continue;
3271                }
3272                match pick {
3273                    None => pick = Some(i),
3274                    Some(j) => {
3275                        if slice.rows[*c].0 < ordered[j].rows[cursors[j]].0 {
3276                            pick = Some(i);
3277                        }
3278                    }
3279                }
3280            }
3281            let Some(i) = pick else { break };
3282            let row = ordered[i].rows[cursors[i]].clone();
3283            cursors[i] += 1;
3284            merged.push(row);
3285        }
3286        // Reject duplicate PKs — same error as the single-threaded
3287        // path so callers get a uniform surface.
3288        for w in merged.windows(2) {
3289            if w[0].0 == w[1].0 {
3290                return Err(StorageError::Corrupt(format!(
3291                    "commit_freeze_slices: duplicate PK {} across slices",
3292                    w[0].0
3293                )));
3294            }
3295        }
3296        let post_swap_keys: Vec<IndexKey> = merged.iter().map(|(_, _, k)| k.clone()).collect();
3297        let seg_rows: Vec<(u64, Vec<u8>)> = merged
3298            .into_iter()
3299            .map(|(k, body, _)| (k, body))
3300            .collect();
3301        let frozen_rows = seg_rows.len();
3302        let (seg_bytes, _meta) =
3303            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).map_err(|e| {
3304                StorageError::Corrupt(format!("commit_freeze_slices: encode: {e}"))
3305            })?;
3306
3307        // --- atomic swap phase: mutations only past this point ---
3308        let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
3309        let positions: Vec<usize> = (0..max_rows).collect();
3310        let t_mut = self
3311            .get_mut(table_name)
3312            .expect("just validated; still present");
3313        let removed = t_mut.delete_rows(&positions);
3314        debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
3315        let bytes_after = t_mut.hot_bytes();
3316        let bytes_freed = bytes_before.saturating_sub(bytes_after);
3317
3318        let segment_id = self
3319            .load_segment_bytes(seg_bytes.clone())
3320            .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: load: {e}")))?;
3321        let new_cold = post_swap_keys.into_iter().map(|k| {
3322            (
3323                k,
3324                RowLocator::Cold {
3325                    segment_id,
3326                    page_offset: 0,
3327                },
3328            )
3329        });
3330        let t_mut = self.get_mut(table_name).expect("still present");
3331        t_mut.register_cold_locators(index_name, new_cold)?;
3332
3333        Ok(FreezeReport {
3334            segment_id,
3335            frozen_rows,
3336            bytes_freed,
3337            segment_bytes: seg_bytes,
3338        })
3339    }
3340
3341    /// v6.7.3 — compact every cold segment on `(table, index)` whose
3342    /// `OwnedSegment::bytes().len()` is below `target_segment_bytes`
3343    /// into a single larger merged segment. Rows present in source
3344    /// segment payloads but no longer referenced by any
3345    /// `RowLocator::Cold` on the index (DELETE'd + frozen rows
3346    /// retired via [`Catalog::shadow_cold_row`]) are GC'd in the
3347    /// merge.
3348    ///
3349    /// **Semantics**:
3350    /// 1. Walk the BTree index to collect every Cold locator that
3351    ///    targets a small (< threshold) segment. Each such
3352    ///    `(key, segment_id)` becomes a row in the merged segment;
3353    ///    payload is looked up from the source segment in-place.
3354    /// 2. Encode the collected rows into one new segment via
3355    ///    [`encode_segment`]; register it via
3356    ///    [`Catalog::load_segment_bytes`] (allocating a fresh
3357    ///    `merged_segment_id` at the end of `cold_segments`).
3358    /// 3. Rewrite the BTree index in one pass: every
3359    ///    `RowLocator::Cold { segment_id ∈ sources }` becomes
3360    ///    `RowLocator::Cold { segment_id = merged_id, page_offset = 0 }`.
3361    ///    Hot locators are untouched.
3362    /// 4. Tombstone every source slot via
3363    ///    [`Catalog::tombstone_segment`]. Source segment payloads
3364    ///    are no longer reachable through the catalog; the on-disk
3365    ///    files are the caller's concern.
3366    ///
3367    /// On fewer than 2 candidate segments the catalog is **not**
3368    /// mutated and a no-op report (`merged_segment_id: None`,
3369    /// `sources: []`) is returned. This is the routine case — a
3370    /// freshly-frozen table has at most 1 small segment, no merge
3371    /// possible.
3372    ///
3373    /// Atomicity: every mutating step runs after the read-only
3374    /// gather phase, so a panic before the merge encode leaves the
3375    /// catalog unchanged. The mutation block itself (load + rewrite +
3376    /// tombstone) takes only `&mut self` — callers serialise the
3377    /// engine write lock outside this function.
3378    ///
3379    /// Errors when the table / index doesn't exist, the index isn't
3380    /// `BTree`, the index column type isn't u64-coercible (cold-tier
3381    /// pre-condition), or a source segment fails its in-place
3382    /// row-body lookup (would indicate prior catalog corruption).
3383    pub fn compact_cold_segments(
3384        &mut self,
3385        table_name: &str,
3386        index_name: &str,
3387        target_segment_bytes: u64,
3388    ) -> Result<CompactReport, StorageError> {
3389        // --- validation phase ----------------------------------
3390        let t = self.get(table_name).ok_or_else(|| {
3391            StorageError::Corrupt(format!(
3392                "compact_cold_segments: table {table_name:?} not found"
3393            ))
3394        })?;
3395        let idx = t
3396            .indices
3397            .iter()
3398            .find(|i| i.name == index_name)
3399            .ok_or_else(|| {
3400                StorageError::Corrupt(format!(
3401                    "compact_cold_segments: index {index_name:?} not found on {table_name:?}"
3402                ))
3403            })?;
3404        let map = match &idx.kind {
3405            IndexKind::BTree(m) => m,
3406            IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
3407                return Err(StorageError::Corrupt(format!(
3408                    "compact_cold_segments: index {index_name:?} is not BTree; \
3409                     compaction applies only to BTree cold-tier indices"
3410                )));
3411            }
3412        };
3413
3414        // --- gather phase --------------------------------------
3415        // Step A: every segment_id this BTree index Cold-references.
3416        let mut referenced_ids: BTreeSet<u32> = BTreeSet::new();
3417        for (_key, locators) in map.iter() {
3418            for loc in locators {
3419                if let RowLocator::Cold { segment_id, .. } = loc {
3420                    referenced_ids.insert(*segment_id);
3421                }
3422            }
3423        }
3424        // Step B: keep only the small + still-active ones.
3425        let candidate_set: BTreeSet<u32> = referenced_ids
3426            .into_iter()
3427            .filter(|id| {
3428                self.cold_segments
3429                    .get(*id as usize)
3430                    .and_then(|s| s.as_deref())
3431                    .is_some_and(|s| (s.bytes().len() as u64) < target_segment_bytes)
3432            })
3433            .collect();
3434        if candidate_set.len() < 2 {
3435            return Ok(CompactReport {
3436                sources: Vec::new(),
3437                merged_segment_id: None,
3438                merged_segment_bytes: Vec::new(),
3439                merged_rows: 0,
3440                deleted_rows_pruned: 0,
3441                bytes_reclaimed_estimate: 0,
3442            });
3443        }
3444        // Step C: pre-count source rows for the deleted-pruned metric.
3445        let mut source_row_count: usize = 0;
3446        let mut source_byte_total: u64 = 0;
3447        for &id in &candidate_set {
3448            let seg = self.cold_segments[id as usize]
3449                .as_ref()
3450                .expect("candidate selected only when slot is Some");
3451            source_row_count = source_row_count.saturating_add(seg.meta().num_rows as usize);
3452            source_byte_total =
3453                source_byte_total.saturating_add(seg.bytes().len() as u64);
3454        }
3455        // Step D: collect (key, body) pairs from every live Cold
3456        // locator pointing at a candidate. dedupe by key — one
3457        // BTree key resolves to at most one cold payload (the
3458        // freezer + promote/shadow flow keeps Cold locators
3459        // unique per key).
3460        let mut collected: BTreeMap<u64, (Vec<u8>, IndexKey)> = BTreeMap::new();
3461        for (key, locators) in map.iter() {
3462            for loc in locators {
3463                let RowLocator::Cold { segment_id, .. } = loc else {
3464                    continue;
3465                };
3466                if !candidate_set.contains(segment_id) {
3467                    continue;
3468                }
3469                let u64_key = index_key_as_u64(key).ok_or_else(|| {
3470                    StorageError::Corrupt(format!(
3471                        "compact_cold_segments: index {index_name:?} has non-integer Cold key; \
3472                         cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
3473                    ))
3474                })?;
3475                let seg = self.cold_segments[*segment_id as usize]
3476                    .as_ref()
3477                    .expect("candidate slot guaranteed Some above");
3478                let payload = seg.lookup(u64_key).ok_or_else(|| {
3479                    StorageError::Corrupt(format!(
3480                        "compact_cold_segments: BTree {index_name:?} points key={u64_key} \
3481                         at segment {segment_id} but the segment lookup missed"
3482                    ))
3483                })?;
3484                collected.insert(u64_key, (payload, key.clone()));
3485                break;
3486            }
3487        }
3488        let merged_rows = collected.len();
3489        let deleted_rows_pruned = source_row_count.saturating_sub(merged_rows);
3490
3491        // Step E: encode the merged segment. `BTreeMap<u64, _>`
3492        // iteration is ascending by key, which is what
3493        // `encode_segment` requires.
3494        let seg_rows: Vec<(u64, Vec<u8>)> = collected
3495            .iter()
3496            .map(|(k, (body, _))| (*k, body.clone()))
3497            .collect();
3498        let (seg_bytes, _meta) =
3499            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).map_err(|e| {
3500                StorageError::Corrupt(format!("compact_cold_segments: encode: {e}"))
3501            })?;
3502        let merged_bytes_len = seg_bytes.len() as u64;
3503
3504        // --- atomic mutation phase ------------------------------
3505        let merged_segment_id = self
3506            .load_segment_bytes(seg_bytes.clone())
3507            .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: load: {e}")))?;
3508
3509        // Rewrite the BTree index: every Cold locator pointing at
3510        // a candidate source becomes a Cold locator pointing at
3511        // the merged segment. Use a flat collect-then-replace
3512        // pattern so we never hold a `&self` borrow across the
3513        // `&mut self` write.
3514        let entries: Vec<(IndexKey, Vec<RowLocator>)> = {
3515            let t = self
3516                .get(table_name)
3517                .expect("table existed at the start of this fn");
3518            let idx = t
3519                .indices
3520                .iter()
3521                .find(|i| i.name == index_name)
3522                .expect("index existed at the start of this fn");
3523            let IndexKind::BTree(map) = &idx.kind else {
3524                unreachable!("validated above");
3525            };
3526            map.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
3527        };
3528        let t_mut = self
3529            .get_mut(table_name)
3530            .expect("table existed at the start of this fn");
3531        let idx_mut = t_mut
3532            .indices
3533            .iter_mut()
3534            .find(|i| i.name == index_name)
3535            .expect("index existed at the start of this fn");
3536        let IndexKind::BTree(map_mut) = &mut idx_mut.kind else {
3537            unreachable!("validated above");
3538        };
3539        for (key, locators) in entries {
3540            let mut new_locs: Vec<RowLocator> = Vec::with_capacity(locators.len());
3541            let mut changed = false;
3542            for loc in &locators {
3543                match *loc {
3544                    RowLocator::Cold {
3545                        segment_id,
3546                        page_offset: _,
3547                    } if candidate_set.contains(&segment_id) => {
3548                        let replacement = RowLocator::Cold {
3549                            segment_id: merged_segment_id,
3550                            page_offset: 0,
3551                        };
3552                        if !new_locs.contains(&replacement) {
3553                            new_locs.push(replacement);
3554                        }
3555                        changed = true;
3556                    }
3557                    other => new_locs.push(other),
3558                }
3559            }
3560            if changed {
3561                map_mut.insert_mut(key, new_locs);
3562            }
3563        }
3564
3565        // Tombstone every source slot. Last step — failures here
3566        // would leave the segment double-referenced in both
3567        // memory + manifest, but `tombstone_segment` only errors
3568        // on out-of-bounds, which we've already validated.
3569        for &id in &candidate_set {
3570            self.tombstone_segment(id)?;
3571        }
3572
3573        let bytes_reclaimed_estimate = source_byte_total.saturating_sub(merged_bytes_len);
3574        Ok(CompactReport {
3575            sources: candidate_set.into_iter().collect(),
3576            merged_segment_id: Some(merged_segment_id),
3577            merged_segment_bytes: seg_bytes,
3578            merged_rows,
3579            deleted_rows_pruned,
3580            bytes_reclaimed_estimate,
3581        })
3582    }
3583
3584    /// Internal helper: scan `(table, index)` for a `Cold` locator
3585    /// keyed by `key`. Returns `Ok(Some((segment_id, page_offset)))`
3586    /// when found, `Ok(None)` when the key has only hot entries
3587    /// or no entries at all, `Err` on the same input-validation
3588    /// errors as the public `promote_cold_row` / `shadow_cold_row`.
3589    fn find_cold_locator(
3590        &self,
3591        table_name: &str,
3592        index_name: &str,
3593        key: &IndexKey,
3594    ) -> Result<Option<(u32, u32)>, StorageError> {
3595        let t = self.get(table_name).ok_or_else(|| {
3596            StorageError::Corrupt(format!("find_cold_locator: table {table_name:?} not found"))
3597        })?;
3598        let idx = t
3599            .indices
3600            .iter()
3601            .find(|i| i.name == index_name)
3602            .ok_or_else(|| {
3603                StorageError::Corrupt(format!(
3604                    "find_cold_locator: index {index_name:?} not found on {table_name:?}"
3605                ))
3606            })?;
3607        if !matches!(idx.kind, IndexKind::BTree(_)) {
3608            return Err(StorageError::Corrupt(format!(
3609                "find_cold_locator: index {index_name:?} is NSW; promote-on-write only applies to BTree indices"
3610            )));
3611        }
3612        for loc in idx.lookup_eq(key) {
3613            if let RowLocator::Cold {
3614                segment_id,
3615                page_offset,
3616            } = *loc
3617            {
3618                return Ok(Some((segment_id, page_offset)));
3619            }
3620        }
3621        Ok(None)
3622    }
3623}
3624
3625/// Coerce an [`IndexKey`] to the `u64` that v5.1 cold-tier
3626/// segments use as their on-disk PK. Returns `None` for keys that
3627/// aren't representable as `u64` — Text PKs need a hash mapping
3628/// the segment writer baked in (deferred to v5.2+), Bool PKs are
3629/// almost never wide enough to be sharded into a cold tier.
3630fn index_key_as_u64(key: &IndexKey) -> Option<u64> {
3631    match key {
3632        // Reinterpret the i64 bit pattern as u64. Cold-tier segments
3633        // are sorted by this u64 view, so the chosen interpretation
3634        // only has to match between insert (bake_segment / freezer)
3635        // and lookup — using cast_unsigned keeps both sides honest
3636        // and silences clippy::cast_sign_loss.
3637        IndexKey::Int(n) => Some(n.cast_unsigned()),
3638        IndexKey::Text(_) | IndexKey::Bool(_) => None,
3639    }
3640}
3641
/// Errors reported by the storage layer.
///
/// Marked `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum StorageError {
    /// A table with this name already exists.
    DuplicateTable {
        name: String,
    },
    /// No table with this name exists.
    TableNotFound {
        name: String,
    },
    /// A row carried `actual` columns where `expected` were required.
    ArityMismatch {
        expected: usize,
        actual: usize,
    },
    /// The value for `column` (at `position` in the row) has type
    /// `actual` where `expected` was required.
    TypeMismatch {
        column: String,
        expected: DataType,
        actual: DataType,
        position: usize,
    },
    /// A NULL value was supplied for a NOT NULL column.
    NullInNotNull {
        column: String,
    },
    /// Index with this name already exists on the table.
    DuplicateIndex {
        name: String,
    },
    /// Column referenced by an index doesn't exist on the table.
    ColumnNotFound {
        column: String,
    },
    /// On-disk format failed to parse — corrupted file, wrong magic, truncated
    /// payload, or unknown tag bytes.
    Corrupt(String),
    /// v6.0.4 — ALTER INDEX targeted an index name that doesn't
    /// exist on any table in this catalog.
    IndexNotFound {
        name: String,
    },
    /// v6.0.4 — operation requested isn't supported on this index
    /// kind / column type (e.g. ALTER INDEX REBUILD on a `BTree`
    /// index, or REBUILD WITH (encoding=…) on a non-vector column).
    Unsupported(String),
}
3685
3686impl fmt::Display for StorageError {
3687    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
3688        match self {
3689            Self::DuplicateTable { name } => write!(f, "table already exists: {name}"),
3690            Self::TableNotFound { name } => write!(f, "table not found: {name}"),
3691            Self::ArityMismatch { expected, actual } => write!(
3692                f,
3693                "row arity mismatch: expected {expected} columns, got {actual}"
3694            ),
3695            Self::TypeMismatch {
3696                column,
3697                expected,
3698                actual,
3699                position,
3700            } => write!(
3701                f,
3702                "type mismatch in column {column:?} (position {position}): expected {expected}, got {actual}"
3703            ),
3704            Self::NullInNotNull { column } => {
3705                write!(f, "NULL value in NOT NULL column {column:?}")
3706            }
3707            Self::DuplicateIndex { name } => write!(f, "index already exists: {name}"),
3708            Self::ColumnNotFound { column } => write!(f, "column not found: {column}"),
3709            Self::Corrupt(detail) => write!(f, "corrupt on-disk format: {detail}"),
3710            Self::IndexNotFound { name } => write!(f, "index not found: {name}"),
3711            Self::Unsupported(detail) => write!(f, "unsupported: {detail}"),
3712        }
3713    }
3714}
3715
3716impl ColumnSchema {
3717    pub fn new(name: impl Into<String>, ty: DataType, nullable: bool) -> Self {
3718        Self {
3719            name: name.into(),
3720            ty,
3721            nullable,
3722            default: None,
3723            auto_increment: false,
3724        }
3725    }
3726
3727    /// Builder-style helper to attach a default value to an otherwise
3728    /// plain column schema. Used by the engine when CREATE TABLE
3729    /// specifies `column TYPE DEFAULT <expr>`.
3730    #[must_use]
3731    pub fn with_default(mut self, default: Value) -> Self {
3732        self.default = Some(default);
3733        self
3734    }
3735
3736    /// Builder-style helper to mark a column as `AUTO_INCREMENT`.
3737    #[must_use]
3738    pub const fn with_auto_increment(mut self) -> Self {
3739        self.auto_increment = true;
3740        self
3741    }
3742}
3743
3744impl TableSchema {
3745    pub fn new(name: impl Into<String>, columns: Vec<ColumnSchema>) -> Self {
3746        Self {
3747            name: name.into(),
3748            columns,
3749            hot_tier_bytes: None,
3750            foreign_keys: Vec::new(),
3751        }
3752    }
3753}
3754
3755// =========================================================================
3756// Persistent binary format for the catalog.
3757//
3758// Layout (little-endian throughout):
3759//
3760//   [magic "SPGDB001" 8 bytes][version u8]
3761//   [table_count u32]
3762//   for each table:
3763//       [name_len u16][name bytes]
3764//       [col_count u16]
3765//       for each col:
3766//           [name_len u16][name bytes]
3767//           [type_tag u8 + optional payload]
3768//               1=Int 2=BigInt 3=Float 4=Text 5=Bool
3769//               6=Vector(u32 dim)
3770//               7=SmallInt
3771//               8=Varchar(u32 max)
3772//               9=Char(u32 size)
3773//               10=Numeric(u8 precision, u8 scale)
3774//               11=Date
3775//               12=Timestamp
3776//           [nullable u8]   0/1
//           [default_tag u8] 0=none 1=value (followed by [value_tag u8] + bytes)
//           [auto_increment u8] 0/1
3778//       [row_count u32]
3779//       for each row, for each col, one [value_tag u8] + value bytes:
3780//           tag 0 (Null)     → no body
3781//           tag 1 (Int)      → i32 LE
3782//           tag 2 (BigInt)   → i64 LE
3783//           tag 3 (Float)    → f64 LE
3784//           tag 4 (Text)     → u16 LE len + UTF-8 bytes
3785//           tag 5 (Bool)     → u8 0/1
3786//           tag 6 (Vector)   → u32 LE dim + dim×f32 LE
3787//           tag 7 (SmallInt) → i16 LE
3788//           tag 8 (Numeric)  → i128 LE (16 bytes) + u8 scale
3789//           tag 9 (Date)     → i32 LE (days since Unix epoch)
3790//           tag 10 (Timestamp) → i64 LE (microseconds since Unix epoch)
3791//
// Bumped to version 3 when NUMERIC was added; to version 4 when
// AUTO_INCREMENT (per-column flag) + NSW index `kind` byte landed;
// to version 5 when DATE / TIMESTAMP were added; to version 6 when
// NSW graph topology started travelling on disk (v2.7); to version 7
// when the NSW topology became multi-layer HNSW (v2.13); to version 8
// when row encoding switched to schema-driven dense layout (v3.0.2 —
// per-row NULL bitmap + per-column fixed-width body, no per-cell type
// tag). The per-cell value tags listed above therefore describe the
// pre-v8 row layout. Versions 9 and later add serialized index entries
// and per-index / per-table appendices; see the `FILE_VERSION` docs
// below.
3800// =========================================================================
3801
/// Eight-byte magic that opens every serialized catalog snapshot.
const FILE_MAGIC: &[u8; 8] = b"SPGDB001";
/// Current catalog snapshot format version emitted by [`Catalog::serialize`].
///
/// v9 (v5.2) extends v8 by serialising `BTree` index entries directly — every
/// `(IndexKey, Vec<RowLocator>)` pair travels on disk with the v5.1
/// `RowLocator::write_le` tag-prefixed codec. v8 `BTree` indices stored no
/// entries at all (the map was rebuilt from `Table::rows` on load); v9
/// preserves on-disk Cold locators so freezer-produced cold-tier index
/// entries survive a catalog snapshot round-trip. v8 snapshots are still
/// accepted by version dispatch in [`Catalog::deserialize`] — every entry
/// decodes as `RowLocator::Hot(_)` via `add_index` rebuild, identical to
/// v5.1 behaviour.
///
/// v6.7.2 — bumped from 10 to 11 to append per-table
/// `hot_tier_bytes: Option<u64>` after the per-table indices
/// section. v10 catalogs (v6.7.1) load with `hot_tier_bytes =
/// None` for every table (the deserialiser short-circuits when
/// version < 11). v11 snapshots opened by a pre-v6.7.2 binary
/// fail loudly at the version check, matching the v6.1.2 /
/// v6.1.4 / v6.2.0 / v6.7.1 envelope-bump upgrade fences.
///
/// v6.8.0 — bumped from 11 to 12: per-index
/// `included_columns: Vec<u16>` appended at the tail of each
/// index payload. v11 (= v6.7.2) catalogs load with
/// `included_columns = Vec::new()` for every index — same
/// "older readers, append-only extension" pattern as the v6.7.2
/// `hot_tier_bytes` byte.
///
/// v7.6.1 — bumped from 12 to 13: per-table FOREIGN KEY appendix
/// written after the `hot_tier_bytes` byte. v12 and older catalogs
/// carry no such block; the deserialiser only reads it when
/// version >= 13, leaving `foreign_keys` empty otherwise.
const FILE_VERSION: u8 = 13;
/// Oldest format version [`Catalog::deserialize`] still accepts. v8 is the
/// v3.0.2 dense-row layout; pre-v8 catalogs require an offline migration.
const MIN_SUPPORTED_FILE_VERSION: u8 = 8;

// IndexKey wire format (v9):
//   tag 0 = Int  → [i64 LE]
//   tag 1 = Text → [u16 LE len + UTF-8 bytes] (via write_str / read_str)
//   tag 2 = Bool → [u8 0/1]
/// Wire tag for an integer index key.
const INDEX_KEY_TAG_INT: u8 = 0;
/// Wire tag for a text index key.
const INDEX_KEY_TAG_TEXT: u8 = 1;
/// Wire tag for a boolean index key.
const INDEX_KEY_TAG_BOOL: u8 = 2;
3840
3841impl Catalog {
3842    /// Serialize the whole catalog (schema + every row) into a self-contained
3843    /// byte buffer. Format is documented above the impl block.
3844    pub fn serialize(&self) -> Vec<u8> {
3845        let mut out = Vec::with_capacity(64);
3846        out.extend_from_slice(FILE_MAGIC);
3847        out.push(FILE_VERSION);
3848        write_u32(
3849            &mut out,
3850            u32::try_from(self.tables.len()).expect("≤ 4G tables"),
3851        );
3852        for t in &self.tables {
3853            write_str(&mut out, &t.schema.name);
3854            write_u16(
3855                &mut out,
3856                u16::try_from(t.schema.columns.len()).expect("≤ 65k columns/table"),
3857            );
3858            for c in &t.schema.columns {
3859                write_str(&mut out, &c.name);
3860                write_data_type(&mut out, c.ty);
3861                out.push(u8::from(c.nullable));
3862                match &c.default {
3863                    None => out.push(0),
3864                    Some(v) => {
3865                        out.push(1);
3866                        write_value(&mut out, v);
3867                    }
3868                }
3869                out.push(u8::from(c.auto_increment));
3870            }
3871            write_u32(
3872                &mut out,
3873                u32::try_from(t.rows.len()).expect("≤ 4G rows/table"),
3874            );
3875            // v3.0.2 dense row encoding (FILE_VERSION 8): per-row NULL
3876            // bitmap, then tightly-packed bodies. Identical wire format
3877            // as before — extracted into `encode_row_body_dense` so cold-
3878            // tier segments (v5.1+) can share the encoding.
3879            for row in &t.rows {
3880                out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
3881            }
3882            // Index definitions. Per-index payload:
3883            //   [name][col_pos u16][kind u8]
3884            //     kind 0 = B-tree           (no params — rebuilt on load)
3885            //     kind 1 = NSW graph        (u16 M + serialized graph)
3886            // For NSW the graph topology travels on disk so startup
3887            // doesn't re-run the O(n²M) rebuild — see v2.7 notes.
3888            write_u16(
3889                &mut out,
3890                u16::try_from(t.indices.len()).expect("≤ 65k indices/table"),
3891            );
3892            for idx in &t.indices {
3893                write_str(&mut out, &idx.name);
3894                write_u16(
3895                    &mut out,
3896                    u16::try_from(idx.column_position).expect("≤ 65k columns/table"),
3897                );
3898                match &idx.kind {
3899                    IndexKind::BTree(map) => {
3900                        out.push(0);
3901                        // v9: serialise the full PB map. Each entry's
3902                        // RowLocator list travels with the tag-prefixed
3903                        // codec from `row_locator::write_le`, so freezer-
3904                        // produced Cold locators survive a snapshot
3905                        // round-trip. v8 BTree wrote nothing here and
3906                        // rebuilt from rows — v9 readers tolerate v8 by
3907                        // version dispatch in `Catalog::deserialize`.
3908                        write_u32(
3909                            &mut out,
3910                            u32::try_from(map.len()).expect("≤ 4G index entries/index"),
3911                        );
3912                        for (key, locators) in map {
3913                            write_index_key(&mut out, key);
3914                            write_u32(
3915                                &mut out,
3916                                u32::try_from(locators.len()).expect("≤ 4G locators/key"),
3917                            );
3918                            for loc in locators {
3919                                loc.write_le(&mut out);
3920                            }
3921                        }
3922                    }
3923                    IndexKind::Nsw(g) => {
3924                        out.push(1);
3925                        write_u16(&mut out, u16::try_from(g.m).expect("≤ 65k NSW neighbours"));
3926                        write_nsw_graph(&mut out, g);
3927                    }
3928                    IndexKind::Brin { column_type } => {
3929                        // v6.7.1 — tag byte 2 = BRIN. Payload is the
3930                        // column type code (1 byte mapping to the
3931                        // shared DataType numeric encoding); no
3932                        // further data — BRIN summaries live in
3933                        // cold segments, not the catalog.
3934                        out.push(2);
3935                        write_data_type(&mut out, *column_type);
3936                    }
3937                }
3938                // v6.8.0 — included_columns appendix per index.
3939                // Layout: [u16 num_included][num × u16 column_position].
3940                // v11 readers stop before this u16 (deserialise loop
3941                // gated on version >= 12); v12+ readers always
3942                // consume it. Empty Vec serialises as a bare 0u16.
3943                write_u16(
3944                    &mut out,
3945                    u16::try_from(idx.included_columns.len())
3946                        .expect("≤ 65k INCLUDE columns/index"),
3947                );
3948                for col_pos in &idx.included_columns {
3949                    write_u16(
3950                        &mut out,
3951                        u16::try_from(*col_pos).expect("≤ 65k columns/table"),
3952                    );
3953                }
3954                // v6.8.1 — partial_predicate appendix per index.
3955                // Layout: [u8 has_pred][u16 LE len][bytes (if has_pred)].
3956                // Same v12 gate as included_columns.
3957                match &idx.partial_predicate {
3958                    None => out.push(0),
3959                    Some(pred) => {
3960                        out.push(1);
3961                        write_str(&mut out, pred);
3962                    }
3963                }
3964                // v6.8.2 — expression appendix. Same shape as
3965                // partial_predicate.
3966                match &idx.expression {
3967                    None => out.push(0),
3968                    Some(expr) => {
3969                        out.push(1);
3970                        write_str(&mut out, expr);
3971                    }
3972                }
3973            }
3974            // v6.7.2 — per-table hot_tier_bytes Option<u64>.
3975            // Layout: [u8 has_value][u64 LE value (if has_value)].
3976            // v10 readers stop before this byte (deserialise loop
3977            // gated on version >= 11); v11+ readers always
3978            // consume it.
3979            match t.schema.hot_tier_bytes {
3980                None => out.push(0),
3981                Some(n) => {
3982                    out.push(1);
3983                    out.extend_from_slice(&n.to_le_bytes());
3984                }
3985            }
3986            // v7.6.1 — FOREIGN KEY appendix (catalog FILE_VERSION 13+).
3987            // Layout: [u16 LE fk_count]
3988            //   per fk:
3989            //     [u8 has_name] [str name (if has_name)]
3990            //     [u16 LE local_arity] [u16 LE local_pos]*arity
3991            //     [str parent_table]
3992            //     [u16 LE parent_arity] [u16 LE parent_pos]*arity
3993            //     [u8 on_delete_tag] [u8 on_update_tag]
3994            // Older catalogs (v12 and below) skip this block entirely;
3995            // their reader stops before this byte.
3996            write_u16(
3997                &mut out,
3998                u16::try_from(t.schema.foreign_keys.len()).expect("≤ 65k FKs/table"),
3999            );
4000            for fk in &t.schema.foreign_keys {
4001                match &fk.name {
4002                    None => out.push(0),
4003                    Some(n) => {
4004                        out.push(1);
4005                        write_str(&mut out, n);
4006                    }
4007                }
4008                write_u16(
4009                    &mut out,
4010                    u16::try_from(fk.local_columns.len()).expect("≤ 65k FK columns"),
4011                );
4012                for &p in &fk.local_columns {
4013                    write_u16(
4014                        &mut out,
4015                        u16::try_from(p).expect("≤ 65k columns/table"),
4016                    );
4017                }
4018                write_str(&mut out, &fk.parent_table);
4019                write_u16(
4020                    &mut out,
4021                    u16::try_from(fk.parent_columns.len()).expect("≤ 65k FK parent columns"),
4022                );
4023                for &p in &fk.parent_columns {
4024                    write_u16(
4025                        &mut out,
4026                        u16::try_from(p).expect("≤ 65k columns/table"),
4027                    );
4028                }
4029                out.push(fk.on_delete.tag());
4030                out.push(fk.on_update.tag());
4031            }
4032        }
4033        out
4034    }
4035
4036    /// Deserialize a previously-serialized catalog. Rejects bad magic, version
4037    /// mismatch, unknown tags, truncation, and trailing bytes.
4038    pub fn deserialize(buf: &[u8]) -> Result<Self, StorageError> {
4039        let mut cur = Cursor::new(buf);
4040        let magic = cur.take(8)?;
4041        if magic != FILE_MAGIC {
4042            return Err(StorageError::Corrupt(format!(
4043                "bad magic: expected SPGDB001, got {magic:?}"
4044            )));
4045        }
4046        let version = cur.read_u8()?;
4047        if !(MIN_SUPPORTED_FILE_VERSION..=FILE_VERSION).contains(&version) {
4048            return Err(StorageError::Corrupt(format!(
4049                "unsupported file version: {version} (supported: {MIN_SUPPORTED_FILE_VERSION}..={FILE_VERSION})"
4050            )));
4051        }
4052        let table_count = cur.read_u32()? as usize;
4053        let mut cat = Self::new();
4054        for _ in 0..table_count {
4055            deserialize_table(&mut cur, &mut cat, version)?;
4056        }
4057        if cur.pos < buf.len() {
4058            return Err(StorageError::Corrupt(format!(
4059                "trailing bytes: {} unread",
4060                buf.len() - cur.pos
4061            )));
4062        }
4063        Ok(cat)
4064    }
4065}
4066
4067/// Per-table deserialize body — schema, rows, indices. Pulled out of
4068/// `Catalog::deserialize` to keep the latter under the line-budget lint
4069/// and to give the row hot loop its own scope (so the borrow on `t`
4070/// stays scoped here rather than across the whole catalog loop).
4071fn deserialize_table(
4072    cur: &mut Cursor<'_>,
4073    cat: &mut Catalog,
4074    version: u8,
4075) -> Result<(), StorageError> {
4076    let table_name = cur.read_str()?;
4077    let name = table_name.clone();
4078    let col_count = cur.read_u16()? as usize;
4079    let mut cols = Vec::with_capacity(col_count);
4080    for _ in 0..col_count {
4081        let c_name = cur.read_str()?;
4082        let ty = cur.read_data_type()?;
4083        let nullable = cur.read_u8()? != 0;
4084        let default = match cur.read_u8()? {
4085            0 => None,
4086            1 => Some(cur.read_value()?),
4087            other => {
4088                return Err(StorageError::Corrupt(format!(
4089                    "unknown default tag: {other}"
4090                )));
4091            }
4092        };
4093        let auto_increment = cur.read_u8()? != 0;
4094        cols.push(ColumnSchema {
4095            name: c_name,
4096            ty,
4097            nullable,
4098            default,
4099            auto_increment,
4100        });
4101    }
4102    let n_cols = cols.len();
4103    cat.create_table(TableSchema::new(name, cols))?;
4104    // Vec<Table> with insertion-order semantics — the just-pushed
4105    // table is at the end. Sidecar `by_name` is already wired up but
4106    // we skip the map lookup here since we know the position.
4107    let t = cat.tables.last_mut().expect("create_table just pushed");
4108    deserialize_rows(cur, t, n_cols)?;
4109    deserialize_indices(cur, t, version)?;
4110    // v6.7.2 — per-table hot_tier_bytes appendix. v11+ writes
4111    // `[u8 has_value][u64 LE value (if has_value)]`. v10 / v9 / v8
4112    // catalogs skip this entirely (the deserialiser reads no extra
4113    // bytes; the table's hot_tier_bytes stays None from
4114    // TableSchema::new).
4115    if version >= 11 {
4116        let has = cur.read_u8()?;
4117        let hot_tier_bytes = match has {
4118            0 => None,
4119            1 => Some(cur.read_u64()?),
4120            other => {
4121                return Err(StorageError::Corrupt(format!(
4122                    "hot_tier_bytes appendix: unknown has-value byte {other}"
4123                )));
4124            }
4125        };
4126        t.schema_mut().hot_tier_bytes = hot_tier_bytes;
4127    }
4128    // v7.6.1 — FOREIGN KEY appendix (FILE_VERSION 13+). v12 / v11 / …
4129    // catalogs skip this entirely.
4130    if version >= 13 {
4131        let fk_count = cur.read_u16()? as usize;
4132        let mut fks = Vec::with_capacity(fk_count);
4133        for _ in 0..fk_count {
4134            let name = match cur.read_u8()? {
4135                0 => None,
4136                1 => Some(cur.read_str()?),
4137                other => {
4138                    return Err(StorageError::Corrupt(format!(
4139                        "FK appendix: unknown has-name byte {other}"
4140                    )));
4141                }
4142            };
4143            let local_arity = cur.read_u16()? as usize;
4144            let mut local_columns = Vec::with_capacity(local_arity);
4145            for _ in 0..local_arity {
4146                local_columns.push(cur.read_u16()? as usize);
4147            }
4148            let parent_table = cur.read_str()?;
4149            let parent_arity = cur.read_u16()? as usize;
4150            if parent_arity != local_arity {
4151                return Err(StorageError::Corrupt(format!(
4152                    "FK arity mismatch in catalog: local {local_arity} vs parent {parent_arity}"
4153                )));
4154            }
4155            let mut parent_columns = Vec::with_capacity(parent_arity);
4156            for _ in 0..parent_arity {
4157                parent_columns.push(cur.read_u16()? as usize);
4158            }
4159            let on_delete = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
4160                StorageError::Corrupt("FK appendix: unknown on_delete tag".into())
4161            })?;
4162            let on_update = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
4163                StorageError::Corrupt("FK appendix: unknown on_update tag".into())
4164            })?;
4165            fks.push(ForeignKeyConstraint {
4166                name,
4167                local_columns,
4168                parent_table,
4169                parent_columns,
4170                on_delete,
4171                on_update,
4172            });
4173        }
4174        t.schema_mut().foreign_keys = fks;
4175    }
4176    let _ = table_name;
4177    Ok(())
4178}
4179
4180fn deserialize_rows(
4181    cur: &mut Cursor<'_>,
4182    t: &mut Table,
4183    _n_cols: usize,
4184) -> Result<(), StorageError> {
4185    let row_count = cur.read_u32()? as usize;
4186    // v4.39: PV has no `reserve` (the BVT doesn't preallocate a
4187    // contiguous buffer); we just push directly and let the trie
4188    // grow. v5.1: row decode reuses `decode_row_body_dense` so the
4189    // catalog and cold-tier segments share one row codec.
4190    let mut hot_bytes: u64 = 0;
4191    for _ in 0..row_count {
4192        let tail = &cur.buf[cur.pos..];
4193        let (row, consumed) = decode_row_body_dense(tail, &t.schema)?;
4194        cur.pos += consumed;
4195        // v5.2.1: account for hot bytes as we go; the snapshot's row
4196        // block bytes are exactly what `encode_row_body_dense` would
4197        // produce, so `consumed` would do too — but going via the
4198        // helper keeps the counter's definition coupled to the
4199        // encoder rather than the snapshot's row prefix layout.
4200        hot_bytes = hot_bytes.saturating_add(row_body_encoded_len(&row, &t.schema) as u64);
4201        t.rows.push_mut(row);
4202    }
4203    t.hot_bytes = hot_bytes;
4204    Ok(())
4205}
4206
4207fn deserialize_indices(
4208    cur: &mut Cursor<'_>,
4209    t: &mut Table,
4210    version: u8,
4211) -> Result<(), StorageError> {
4212    let index_count = cur.read_u16()? as usize;
4213    for _ in 0..index_count {
4214        let idx_name = cur.read_str()?;
4215        let col_pos = cur.read_u16()? as usize;
4216        let column_name = t
4217            .schema
4218            .columns
4219            .get(col_pos)
4220            .ok_or_else(|| {
4221                StorageError::Corrupt(format!(
4222                    "index {idx_name:?} points at non-existent column position {col_pos}"
4223                ))
4224            })?
4225            .name
4226            .clone();
4227        let kind_tag = cur.read_u8()?;
4228        match kind_tag {
4229            0 => {
4230                if version >= 9 {
4231                    // v9+: BTree entries serialised inline (tag-prefixed
4232                    // locator codec). Restore the map directly so any
4233                    // freezer-produced Cold locators come back exactly
4234                    // as they went out.
4235                    let map = read_btree_map(cur)?;
4236                    t.restore_btree_index(idx_name, &column_name, map)?;
4237                } else {
4238                    // v8: no entries on disk; rebuild from rows. Every
4239                    // entry is materialised as `RowLocator::Hot(i)` —
4240                    // semantically identical to the v5.1 in-memory state
4241                    // since v8 catalogs never produced Cold locators.
4242                    t.add_index(idx_name, &column_name)?;
4243                }
4244            }
4245            1 => {
4246                let m = cur.read_u16()? as usize;
4247                let graph = cur.read_nsw_graph(m)?;
4248                t.restore_nsw_index(idx_name, &column_name, graph)?;
4249            }
4250            2 => {
4251                // v6.7.1 — BRIN tag. Payload is the column type
4252                // tag. No further data — summaries live in cold
4253                // segments.
4254                let column_type = cur.read_data_type()?;
4255                t.restore_brin_index(idx_name, &column_name, column_type)?;
4256            }
4257            other => {
4258                return Err(StorageError::Corrupt(format!(
4259                    "unknown index kind tag: {other}"
4260                )));
4261            }
4262        }
4263        // v6.8.0 — included_columns appendix per index. v11- snapshots
4264        // stop before this u16; v12+ always carries it (possibly 0).
4265        if version >= 12 {
4266            let num_included = cur.read_u16()? as usize;
4267            if num_included > 0 {
4268                let mut included: Vec<usize> = Vec::with_capacity(num_included);
4269                for _ in 0..num_included {
4270                    let cp = cur.read_u16()? as usize;
4271                    if cp >= t.schema.columns.len() {
4272                        return Err(StorageError::Corrupt(format!(
4273                            "INCLUDE column position {cp} out of range \
4274                             ({} schema columns)",
4275                            t.schema.columns.len()
4276                        )));
4277                    }
4278                    included.push(cp);
4279                }
4280                if let Some(last) = t.indices.last_mut() {
4281                    last.included_columns = included;
4282                }
4283            }
4284            // v6.8.1 — partial_predicate appendix.
4285            match cur.read_u8()? {
4286                0 => {}
4287                1 => {
4288                    let pred = cur.read_str()?;
4289                    if let Some(last) = t.indices.last_mut() {
4290                        last.partial_predicate = Some(pred);
4291                    }
4292                }
4293                other => {
4294                    return Err(StorageError::Corrupt(format!(
4295                        "partial_predicate tag: unknown byte {other}"
4296                    )));
4297                }
4298            }
4299            // v6.8.2 — expression appendix.
4300            match cur.read_u8()? {
4301                0 => {}
4302                1 => {
4303                    let expr = cur.read_str()?;
4304                    if let Some(last) = t.indices.last_mut() {
4305                        last.expression = Some(expr);
4306                    }
4307                }
4308                other => {
4309                    return Err(StorageError::Corrupt(format!(
4310                        "expression tag: unknown byte {other}"
4311                    )));
4312                }
4313            }
4314        }
4315    }
4316    Ok(())
4317}
4318
4319/// Parse a v9 `BTree` index payload — `[u32 entry_count]` followed by
4320/// `entry_count` `(IndexKey, Vec<RowLocator>)` pairs. The locator list
4321/// uses the v5.1 tag-prefixed wire format (`RowLocator::read_le`).
4322fn read_btree_map(
4323    cur: &mut Cursor<'_>,
4324) -> Result<PersistentBTreeMap<IndexKey, Vec<RowLocator>>, StorageError> {
4325    let entry_count = cur.read_u32()? as usize;
4326    let mut map = PersistentBTreeMap::new();
4327    for _ in 0..entry_count {
4328        let key = cur.read_index_key()?;
4329        let locator_count = cur.read_u32()? as usize;
4330        let mut locators = Vec::with_capacity(locator_count);
4331        for _ in 0..locator_count {
4332            let tail = &cur.buf[cur.pos..];
4333            let (loc, consumed) = RowLocator::read_le(tail).map_err(|e| {
4334                StorageError::Corrupt(format!("row_locator decode at offset {}: {e}", cur.pos))
4335            })?;
4336            cur.pos += consumed;
4337            locators.push(loc);
4338        }
4339        map.insert_mut(key, locators);
4340    }
4341    Ok(map)
4342}
4343
4344// --- low-level binary helpers ---------------------------------------------
4345
4346/// Write a `DataType` as a tag byte + optional payload (Vector carries its
4347/// `u32` dimension). Inverse: [`read_data_type`].
4348/// Serialize an HNSW graph after the `[kind=1][u16 M]` header (v7).
4349/// Layout:
4350/// - `[u16 m_max_0]`
4351/// - `[entry u32]` — `u32::MAX` means `None`, else the entry node index
4352/// - `[u8 entry_level]`
4353/// - `[node_count u32]`
4354/// - for each node: `[u8 level]`  (top layer for this node)
4355/// - `[layer_count u8]`
4356/// - for each layer `0..layer_count`:
4357///     - `[u32 layer_node_count]` (== `node_count`; per-layer slot)
4358///     - for each node: `[u16 neighbor_count] [u32 neighbor]*`
4359fn write_nsw_graph(out: &mut Vec<u8>, g: &NswGraph) {
4360    let entry = g.entry.map_or(u32::MAX, |e| {
4361        u32::try_from(e).expect("NSW entry fits in u32")
4362    });
4363    write_u16(
4364        out,
4365        u16::try_from(g.m_max_0).expect("HNSW m_max_0 fits in u16"),
4366    );
4367    out.extend_from_slice(&entry.to_le_bytes());
4368    out.push(g.entry_level);
4369    let node_count = g.levels.len();
4370    write_u32(
4371        out,
4372        u32::try_from(node_count).expect("HNSW node count fits in u32"),
4373    );
4374    for &lvl in &g.levels {
4375        out.push(lvl);
4376    }
4377    let layer_count = u8::try_from(g.layers.len()).expect("HNSW layer count ≤ 255");
4378    out.push(layer_count);
4379    for layer in &g.layers {
4380        write_u32(
4381            out,
4382            u32::try_from(layer.len()).expect("HNSW per-layer node count fits in u32"),
4383        );
4384        for neighbors in layer {
4385            write_u16(
4386                out,
4387                u16::try_from(neighbors.len()).expect("HNSW neighbour list fits in u16"),
4388            );
4389            // v6.1.x: neighbour slot is already u32 in memory; just
4390            // emit the raw bytes. (v6.0 stored usize and converted
4391            // here.)
4392            for &peer in neighbors {
4393                write_u32(out, peer);
4394            }
4395        }
4396    }
4397}
4398
4399fn write_data_type(out: &mut Vec<u8>, t: DataType) {
4400    match t {
4401        DataType::Int => out.push(1),
4402        DataType::BigInt => out.push(2),
4403        DataType::Float => out.push(3),
4404        DataType::Text => out.push(4),
4405        DataType::Bool => out.push(5),
4406        DataType::Vector { dim, encoding } => match encoding {
4407            // Tag 6: pre-v6 F32 vector. Layout unchanged; pre-v6
4408            // binaries continue to deserialise this exactly as
4409            // before.
4410            VecEncoding::F32 => {
4411                out.push(6);
4412                out.extend_from_slice(&dim.to_le_bytes());
4413            }
4414            // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
4415            // forward-compat fence story as SQ8 below.
4416            VecEncoding::F16 => {
4417                out.push(15);
4418                out.extend_from_slice(&dim.to_le_bytes());
4419            }
4420            // v6.0.1: new tag 14 for `VECTOR(N) USING SQ8` column
4421            // type. Pre-v6 readers fall through `read_data_type`'s
4422            // catch-all and surface `Corrupt("unknown data type tag")`
4423            // — the explicit forward-compat fence called out in
4424            // V6_DESIGN deliberation #5.
4425            VecEncoding::Sq8 => {
4426                out.push(14);
4427                out.extend_from_slice(&dim.to_le_bytes());
4428            }
4429        },
4430        DataType::SmallInt => out.push(7),
4431        DataType::Varchar(max) => {
4432            out.push(8);
4433            out.extend_from_slice(&max.to_le_bytes());
4434        }
4435        DataType::Char(size) => {
4436            out.push(9);
4437            out.extend_from_slice(&size.to_le_bytes());
4438        }
4439        DataType::Numeric { precision, scale } => {
4440            out.push(10);
4441            out.push(precision);
4442            out.push(scale);
4443        }
4444        DataType::Date => out.push(11),
4445        DataType::Timestamp => out.push(12),
4446        // INTERVAL is runtime-only — CREATE TABLE never produces a
4447        // column with this type, so write_data_type must not be called
4448        // on it. (Disk-format codepoint reserved for a future v3 where
4449        // INTERVAL becomes storable.)
4450        DataType::Interval => {
4451            unreachable!("DataType::Interval has no on-disk encoding in v2.11")
4452        }
4453        DataType::Json => out.push(13),
4454    }
4455}
4456
4457impl Cursor<'_> {
4458    fn read_data_type(&mut self) -> Result<DataType, StorageError> {
4459        let tag = self.read_u8()?;
4460        match tag {
4461            1 => Ok(DataType::Int),
4462            2 => Ok(DataType::BigInt),
4463            3 => Ok(DataType::Float),
4464            4 => Ok(DataType::Text),
4465            5 => Ok(DataType::Bool),
4466            6 => Ok(DataType::Vector {
4467                dim: self.read_u32()?,
4468                encoding: VecEncoding::F32,
4469            }),
4470            7 => Ok(DataType::SmallInt),
4471            8 => Ok(DataType::Varchar(self.read_u32()?)),
4472            9 => Ok(DataType::Char(self.read_u32()?)),
4473            10 => {
4474                let precision = self.read_u8()?;
4475                let scale = self.read_u8()?;
4476                Ok(DataType::Numeric { precision, scale })
4477            }
4478            11 => Ok(DataType::Date),
4479            12 => Ok(DataType::Timestamp),
4480            13 => Ok(DataType::Json),
4481            14 => Ok(DataType::Vector {
4482                dim: self.read_u32()?,
4483                encoding: VecEncoding::Sq8,
4484            }),
4485            // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
4486            // [u32 dim] type-tag payload as F32 / SQ8; the encoding
4487            // lives in the tag byte itself.
4488            15 => Ok(DataType::Vector {
4489                dim: self.read_u32()?,
4490                encoding: VecEncoding::F16,
4491            }),
4492            other => Err(StorageError::Corrupt(format!(
4493                "unknown data type tag: {other}"
4494            ))),
4495        }
4496    }
4497}
4498
4499/// Fast computation of the byte length [`encode_row_body_dense`]
4500/// would produce, without allocating the output buffer. Mirrors the
4501/// encoder's per-column body sizing so the v5.2.1 `Table::hot_bytes`
4502/// incremental counter doesn't pay an alloc-per-insert tax. Returns
4503/// the exact same `usize` as `encode_row_body_dense(row, schema).len()`.
4504pub fn row_body_encoded_len(row: &Row, schema: &TableSchema) -> usize {
4505    debug_assert_eq!(
4506        row.values.len(),
4507        schema.columns.len(),
4508        "row_body_encoded_len: row arity must match schema"
4509    );
4510    let bitmap_bytes = schema.columns.len().div_ceil(8);
4511    let mut n = bitmap_bytes;
4512    for (col_idx, v) in row.values.iter().enumerate() {
4513        if matches!(v, Value::Null) {
4514            continue;
4515        }
4516        n += value_body_encoded_len(v, schema.columns[col_idx].ty);
4517    }
4518    n
4519}
4520
4521/// Byte length a single cell consumes when written by
4522/// `write_value_body`. Used by [`row_body_encoded_len`]; kept in
4523/// lock-step with the encoder. The `_ty` slot is reserved for future
4524/// type-dependent encodings — every variant currently writes a fixed
4525/// body shape regardless of the declared column type.
4526fn value_body_encoded_len(v: &Value, _ty: DataType) -> usize {
4527    match v {
4528        Value::SmallInt(_) => 2,
4529        // 4-byte body: i32 / Date.
4530        Value::Int(_) | Value::Date(_) => 4,
4531        // 8-byte body: i64 / f64 / Timestamp.
4532        Value::BigInt(_) | Value::Float(_) | Value::Timestamp(_) => 8,
4533        Value::Bool(_) => 1,
4534        // Text/Varchar/Char/Json share the [u16 len][utf-8] layout.
4535        Value::Text(s) | Value::Json(s) => 2 + s.len(),
4536        // [u32 dim][f32 * dim]
4537        Value::Vector(vec) => 4 + 4 * vec.len(),
4538        // v6.0.1: SQ8 cell on-disk shape — [u32 dim][f32 min]
4539        // [f32 max][u8 * dim] = 12 + dim bytes. `hot_bytes`
4540        // tracking on `Table::insert` calls this every row, so
4541        // returning the real size now (even though the actual
4542        // `write_value_body` writer lands in step 6) keeps the
4543        // sizing arithmetic honest for in-memory benches.
4544        Value::Sq8Vector(q) => 4 + 4 + 4 + q.bytes.len(),
4545        // v6.0.3: halfvec on-disk shape — [u32 dim][u16 LE * dim]
4546        // = 4 + 2 * dim bytes.
4547        Value::HalfVector(h) => 4 + h.bytes.len(),
4548        // [i128 scaled][u8 scale]
4549        Value::Numeric { .. } => 16 + 1,
4550        // NULL is encoded only in the bitmap, never in the body.
4551        Value::Null => 0,
4552        // INTERVAL has no on-disk encoding (see write_value_body).
4553        Value::Interval { .. } => {
4554            unreachable!("Value::Interval has no on-disk encoding")
4555        }
4556    }
4557}
4558
4559/// Encode one row's body in the v3.0.2 dense format (`FILE_VERSION`
4560/// 8): per-row NULL bitmap (1 bit/col, ceil(cols/8) bytes), then
4561/// each non-NULL cell as `write_value_body`. Same wire shape the
4562/// catalog snapshot writes per row inside its rows-block. Exposed
4563/// pub so v5.1+ cold-tier segment writers can produce row payloads
4564/// that the catalog [`decode_row_body_dense`] decodes 1:1.
4565///
4566/// `row.values.len()` must equal `schema.columns.len()` — the row
4567/// is expected to have been validated by `Table::insert` (the
4568/// engine's INSERT path) before reaching this function.
4569pub fn encode_row_body_dense(row: &Row, schema: &TableSchema) -> Vec<u8> {
4570    debug_assert_eq!(
4571        row.values.len(),
4572        schema.columns.len(),
4573        "dense encode: row arity must match schema"
4574    );
4575    let bitmap_bytes = schema.columns.len().div_ceil(8);
4576    // 8 B per fixed-width cell is a reasonable average; the buffer
4577    // grows past this for variable-width Text/Vector cells.
4578    let mut out = Vec::with_capacity(bitmap_bytes + schema.columns.len() * 8);
4579    let bitmap_offset = out.len();
4580    out.resize(bitmap_offset + bitmap_bytes, 0);
4581    for (i, v) in row.values.iter().enumerate() {
4582        if matches!(v, Value::Null) {
4583            out[bitmap_offset + i / 8] |= 1 << (i % 8);
4584        }
4585    }
4586    for (col_idx, v) in row.values.iter().enumerate() {
4587        if matches!(v, Value::Null) {
4588            continue;
4589        }
4590        write_value_body(&mut out, v, schema.columns[col_idx].ty);
4591    }
4592    out
4593}
4594
4595/// Inverse of [`encode_row_body_dense`]. Reads one row's body from
4596/// `bytes` and returns it plus the number of bytes consumed (so a
4597/// caller decoding a back-to-back stream of rows can advance its
4598/// cursor). Returns `StorageError::Corrupt` on truncation, bad
4599/// UTF-8, or unknown cell tags.
4600pub fn decode_row_body_dense(
4601    bytes: &[u8],
4602    schema: &TableSchema,
4603) -> Result<(Row, usize), StorageError> {
4604    let mut cur = Cursor::new(bytes);
4605    let bitmap_bytes = schema.columns.len().div_ceil(8);
4606    let mut bitmap_buf = [0u8; 32];
4607    if bitmap_bytes > bitmap_buf.len() {
4608        return Err(StorageError::Corrupt(format!(
4609            "row NULL bitmap {bitmap_bytes} B exceeds 32 B cap"
4610        )));
4611    }
4612    let slice = cur.take(bitmap_bytes)?;
4613    bitmap_buf[..bitmap_bytes].copy_from_slice(slice);
4614    let mut values = Vec::with_capacity(schema.columns.len());
4615    for (col_idx, col) in schema.columns.iter().enumerate() {
4616        if (bitmap_buf[col_idx / 8] >> (col_idx % 8)) & 1 == 1 {
4617            values.push(Value::Null);
4618        } else {
4619            values.push(cur.read_value_body(col.ty)?);
4620        }
4621    }
4622    Ok((Row { values }, cur.pos))
4623}
4624
4625/// Schema-driven dense value encoding (`FILE_VERSION` 8). Caller already
4626/// knows the column type and has decided this cell is non-NULL, so we
4627/// skip the per-cell type tag the v7 `write_value` was writing. NULL
4628/// is encoded via the per-row bitmap before this function runs, never
4629/// reaches here. Used only inside the row-encoding hot loop; the
4630/// schema-default path still goes through the legacy `write_value` so
4631/// DEFAULT values keep their self-describing tag and remain decodable
4632/// without consulting a column type.
4633fn write_value_body(out: &mut Vec<u8>, v: &Value, ty: DataType) {
4634    match (v, ty) {
4635        (Value::SmallInt(n), DataType::SmallInt) => out.extend_from_slice(&n.to_le_bytes()),
4636        (Value::Int(n), DataType::Int) => out.extend_from_slice(&n.to_le_bytes()),
4637        (Value::BigInt(n), DataType::BigInt) => out.extend_from_slice(&n.to_le_bytes()),
4638        (Value::Float(x), DataType::Float) => out.extend_from_slice(&x.to_le_bytes()),
4639        (Value::Bool(b), DataType::Bool) => out.push(u8::from(*b)),
4640        (Value::Text(s), DataType::Text | DataType::Varchar(_) | DataType::Char(_)) => {
4641            write_str(out, s);
4642        }
4643        (
4644            Value::Vector(v),
4645            DataType::Vector {
4646                encoding: VecEncoding::F32,
4647                ..
4648            },
4649        ) => {
4650            let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
4651            out.extend_from_slice(&dim.to_le_bytes());
4652            for x in v {
4653                out.extend_from_slice(&x.to_le_bytes());
4654            }
4655        }
4656        // v6.0.1: SQ8 dense body — [u32 dim][f32 min][f32 max]
4657        // [u8 * dim]. Self-describes its length so v6 readers
4658        // walking rows of a v6 catalog stay aligned even if the
4659        // declared column dim drifts (defensive, not normally
4660        // possible since CREATE TABLE pins the dim).
4661        (
4662            Value::Sq8Vector(q),
4663            DataType::Vector {
4664                encoding: VecEncoding::Sq8,
4665                ..
4666            },
4667        ) => {
4668            let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
4669            out.extend_from_slice(&dim.to_le_bytes());
4670            out.extend_from_slice(&q.min.to_le_bytes());
4671            out.extend_from_slice(&q.max.to_le_bytes());
4672            out.extend_from_slice(&q.bytes);
4673        }
4674        // v6.0.3: halfvec dense body — [u32 dim][u16 LE * dim].
4675        // The raw u16 bytes already live in `h.bytes` little-
4676        // endian, so we just splat them.
4677        (
4678            Value::HalfVector(h),
4679            DataType::Vector {
4680                encoding: VecEncoding::F16,
4681                ..
4682            },
4683        ) => {
4684            let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
4685            out.extend_from_slice(&dim.to_le_bytes());
4686            out.extend_from_slice(&h.bytes);
4687        }
4688        (Value::Numeric { scaled, .. }, DataType::Numeric { scale, .. }) => {
4689            out.extend_from_slice(&scaled.to_le_bytes());
4690            out.push(scale);
4691        }
4692        (Value::Date(d), DataType::Date) => out.extend_from_slice(&d.to_le_bytes()),
4693        (Value::Timestamp(t), DataType::Timestamp) => out.extend_from_slice(&t.to_le_bytes()),
4694        // v4.9: JSON stores as length-prefixed text; same shape as
4695        // Text — the type tag lives in the column schema, not the
4696        // per-cell body.
4697        (Value::Json(s), DataType::Json) => write_str(out, s),
4698        // Type mismatch shouldn't happen — `Table::insert` validates
4699        // value type against column type before pushing. Treat as a
4700        // bug, not a runtime error.
4701        (other, ty) => unreachable!(
4702            "schema-driven encode received mismatched value/type pair: \
4703             value tag={:?}, column type={:?}",
4704            other.data_type(),
4705            ty
4706        ),
4707    }
4708}
4709
4710fn write_value(out: &mut Vec<u8>, v: &Value) {
4711    match v {
4712        Value::Null => out.push(0),
4713        Value::SmallInt(n) => {
4714            out.push(7);
4715            out.extend_from_slice(&n.to_le_bytes());
4716        }
4717        Value::Int(n) => {
4718            out.push(1);
4719            out.extend_from_slice(&n.to_le_bytes());
4720        }
4721        Value::BigInt(n) => {
4722            out.push(2);
4723            out.extend_from_slice(&n.to_le_bytes());
4724        }
4725        Value::Float(x) => {
4726            out.push(3);
4727            out.extend_from_slice(&x.to_le_bytes());
4728        }
4729        // v4.9: JSON shares the tag-4 (Text) on-disk encoding —
4730        // schema decides which variant comes back on read. The
4731        // bodies are byte-identical so collapsing the match keeps
4732        // clippy::match_same_arms quiet.
4733        Value::Text(s) | Value::Json(s) => {
4734            out.push(4);
4735            write_str(out, s);
4736        }
4737        Value::Bool(b) => {
4738            out.push(5);
4739            out.push(u8::from(*b));
4740        }
4741        Value::Vector(v) => {
4742            out.push(6);
4743            let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
4744            out.extend_from_slice(&dim.to_le_bytes());
4745            for x in v {
4746                out.extend_from_slice(&x.to_le_bytes());
4747            }
4748        }
4749        // v6.0.1: new tag 11 for an SQ8 cell carried with its full
4750        // header. Layout matches the dense row body shape so a
4751        // round-trip through write_value → read_value bit-equals
4752        // the original `Value::Sq8Vector`.
4753        Value::Sq8Vector(q) => {
4754            out.push(11);
4755            let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
4756            out.extend_from_slice(&dim.to_le_bytes());
4757            out.extend_from_slice(&q.min.to_le_bytes());
4758            out.extend_from_slice(&q.max.to_le_bytes());
4759            out.extend_from_slice(&q.bytes);
4760        }
4761        // v6.0.3: tag 12 for a HalfVector cell.
4762        // Layout: `[u32 dim][u16 LE × dim]` — bit-identical to the
4763        // dense row body so `write_value` / `read_value` bit-equal
4764        // the original `Value::HalfVector`.
4765        Value::HalfVector(h) => {
4766            out.push(12);
4767            let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
4768            out.extend_from_slice(&dim.to_le_bytes());
4769            out.extend_from_slice(&h.bytes);
4770        }
4771        Value::Numeric { scaled, scale } => {
4772            out.push(8);
4773            out.extend_from_slice(&scaled.to_le_bytes());
4774            out.push(*scale);
4775        }
4776        Value::Date(d) => {
4777            out.push(9);
4778            out.extend_from_slice(&d.to_le_bytes());
4779        }
4780        Value::Timestamp(t) => {
4781            out.push(10);
4782            out.extend_from_slice(&t.to_le_bytes());
4783        }
4784        // Interval is a runtime-only value (no on-disk representation in
4785        // v2.11). CREATE TABLE rejects `DataType::Interval` columns, so a
4786        // Value::Interval here would mean the engine bypassed that gate.
4787        Value::Interval { .. } => {
4788            unreachable!(
4789                "Value::Interval has no on-disk encoding; engine must reject it before write"
4790            )
4791        }
4792    }
4793}
4794
/// Append `n` to `out` as two little-endian bytes.
fn write_u16(out: &mut Vec<u8>, n: u16) {
    out.extend(n.to_le_bytes());
}
/// Append `n` to `out` as four little-endian bytes.
fn write_u32(out: &mut Vec<u8>, n: u32) {
    out.extend(n.to_le_bytes());
}
4801fn write_str(out: &mut Vec<u8>, s: &str) {
4802    let len = u16::try_from(s.len()).expect("identifier / text fits in u16");
4803    write_u16(out, len);
4804    out.extend_from_slice(s.as_bytes());
4805}
4806
4807/// Serialise an [`IndexKey`] using the v9 tagged codec. `read_index_key`
4808/// is the inverse. v8 catalogs never wrote index keys (`BTree` entries were
4809/// rebuilt from `Table::rows`), so this codec is v9+ only.
4810fn write_index_key(out: &mut Vec<u8>, key: &IndexKey) {
4811    match key {
4812        IndexKey::Int(n) => {
4813            out.push(INDEX_KEY_TAG_INT);
4814            out.extend_from_slice(&n.to_le_bytes());
4815        }
4816        IndexKey::Text(s) => {
4817            out.push(INDEX_KEY_TAG_TEXT);
4818            write_str(out, s);
4819        }
4820        IndexKey::Bool(b) => {
4821            out.push(INDEX_KEY_TAG_BOOL);
4822            out.push(u8::from(*b));
4823        }
4824    }
4825}
4826
/// Sequential reader over a borrowed byte buffer, used by the catalog
/// and row decoders in this file.
struct Cursor<'a> {
    /// The full input; slices returned by `take` borrow from it.
    buf: &'a [u8],
    /// Offset into `buf` of the next unread byte.
    pos: usize,
}
4831
4832impl<'a> Cursor<'a> {
4833    const fn new(buf: &'a [u8]) -> Self {
4834        Self { buf, pos: 0 }
4835    }
4836
4837    fn take(&mut self, n: usize) -> Result<&'a [u8], StorageError> {
4838        let end = self
4839            .pos
4840            .checked_add(n)
4841            .ok_or_else(|| StorageError::Corrupt(format!("length overflow taking {n} bytes")))?;
4842        if end > self.buf.len() {
4843            return Err(StorageError::Corrupt(format!(
4844                "unexpected EOF at offset {} (wanted {n} more bytes)",
4845                self.pos
4846            )));
4847        }
4848        let s = &self.buf[self.pos..end];
4849        self.pos = end;
4850        Ok(s)
4851    }
4852
4853    fn read_u8(&mut self) -> Result<u8, StorageError> {
4854        Ok(self.take(1)?[0])
4855    }
4856    fn read_u16(&mut self) -> Result<u16, StorageError> {
4857        let s = self.take(2)?;
4858        Ok(u16::from_le_bytes([s[0], s[1]]))
4859    }
4860    fn read_u32(&mut self) -> Result<u32, StorageError> {
4861        let s = self.take(4)?;
4862        Ok(u32::from_le_bytes([s[0], s[1], s[2], s[3]]))
4863    }
4864    fn read_i32(&mut self) -> Result<i32, StorageError> {
4865        let s = self.take(4)?;
4866        Ok(i32::from_le_bytes([s[0], s[1], s[2], s[3]]))
4867    }
4868    /// v6.7.2 — u64 LE read for the per-table `hot_tier_bytes`
4869    /// catalog appendix.
4870    fn read_u64(&mut self) -> Result<u64, StorageError> {
4871        let s = self.take(8)?;
4872        Ok(u64::from_le_bytes([
4873            s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
4874        ]))
4875    }
4876    fn read_i64(&mut self) -> Result<i64, StorageError> {
4877        let s = self.take(8)?;
4878        let arr: [u8; 8] = s.try_into().expect("checked");
4879        Ok(i64::from_le_bytes(arr))
4880    }
4881    fn read_f64(&mut self) -> Result<f64, StorageError> {
4882        let s = self.take(8)?;
4883        let arr: [u8; 8] = s.try_into().expect("checked");
4884        Ok(f64::from_le_bytes(arr))
4885    }
4886    fn read_f32(&mut self) -> Result<f32, StorageError> {
4887        let s = self.take(4)?;
4888        Ok(f32::from_le_bytes([s[0], s[1], s[2], s[3]]))
4889    }
4890    fn read_str(&mut self) -> Result<String, StorageError> {
4891        let len = self.read_u16()? as usize;
4892        let bytes = self.take(len)?;
4893        core::str::from_utf8(bytes)
4894            .map(String::from)
4895            .map_err(|_| StorageError::Corrupt("invalid UTF-8 in identifier or text".into()))
4896    }
4897
4898    /// Parse an [`IndexKey`] emitted by `write_index_key` (v9 tagged
4899    /// codec). Returns `StorageError::Corrupt` on unknown tag or
4900    /// truncated payload.
4901    fn read_index_key(&mut self) -> Result<IndexKey, StorageError> {
4902        let tag = self.read_u8()?;
4903        match tag {
4904            INDEX_KEY_TAG_INT => Ok(IndexKey::Int(self.read_i64()?)),
4905            INDEX_KEY_TAG_TEXT => Ok(IndexKey::Text(self.read_str()?)),
4906            INDEX_KEY_TAG_BOOL => Ok(IndexKey::Bool(self.read_u8()? != 0)),
4907            other => Err(StorageError::Corrupt(format!(
4908                "unknown index key tag: {other}"
4909            ))),
4910        }
4911    }
4912    /// Schema-driven dense value decode (`FILE_VERSION` 8). Caller has
4913    /// already cleared the NULL bit from the row bitmap; we read the
4914    /// fixed-width body for the given column type. Used inside the row
4915    /// hot loop; column defaults still go through `read_value` (which
4916    /// reads its own type tag) so DEFAULT round-trips without a schema.
4917    fn read_value_body(&mut self, ty: DataType) -> Result<Value, StorageError> {
4918        match ty {
4919            DataType::SmallInt => {
4920                let s = self.take(2)?;
4921                Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
4922            }
4923            DataType::Int => Ok(Value::Int(self.read_i32()?)),
4924            DataType::BigInt => Ok(Value::BigInt(self.read_i64()?)),
4925            DataType::Float => Ok(Value::Float(self.read_f64()?)),
4926            DataType::Bool => Ok(Value::Bool(self.read_u8()? != 0)),
4927            DataType::Text | DataType::Varchar(_) | DataType::Char(_) => {
4928                Ok(Value::Text(self.read_str()?))
4929            }
4930            DataType::Vector {
4931                encoding: VecEncoding::F32,
4932                ..
4933            } => {
4934                let dim = self.read_u32()? as usize;
4935                let mut v = Vec::with_capacity(dim);
4936                for _ in 0..dim {
4937                    let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
4938                    v.push(f32::from_le_bytes(bytes));
4939                }
4940                Ok(Value::Vector(v))
4941            }
4942            DataType::Vector {
4943                encoding: VecEncoding::Sq8,
4944                ..
4945            } => {
4946                let dim = self.read_u32()? as usize;
4947                let min = self.read_f32()?;
4948                let max = self.read_f32()?;
4949                let bytes = self.take(dim)?.to_vec();
4950                Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
4951            }
4952            DataType::Vector {
4953                encoding: VecEncoding::F16,
4954                ..
4955            } => {
4956                let dim = self.read_u32()? as usize;
4957                let bytes = self.take(dim * 2)?.to_vec();
4958                Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
4959            }
4960            DataType::Numeric { .. } => {
4961                let s = self.take(16)?;
4962                let arr: [u8; 16] = s.try_into().expect("checked");
4963                let scaled = i128::from_le_bytes(arr);
4964                let scale = self.read_u8()?;
4965                Ok(Value::Numeric { scaled, scale })
4966            }
4967            DataType::Date => Ok(Value::Date(self.read_i32()?)),
4968            DataType::Timestamp => Ok(Value::Timestamp(self.read_i64()?)),
4969            DataType::Interval => {
4970                // Defensive — schema gate (CREATE TABLE rejects Interval
4971                // columns) means this branch can't be hit through normal
4972                // flow; reject corrupt files explicitly rather than
4973                // panic.
4974                Err(StorageError::Corrupt(
4975                    "INTERVAL column found on disk — runtime-only type, v3.0.2 rejects it".into(),
4976                ))
4977            }
4978            DataType::Json => Ok(Value::Json(self.read_str()?)),
4979        }
4980    }
4981
4982    fn read_value(&mut self) -> Result<Value, StorageError> {
4983        let tag = self.read_u8()?;
4984        match tag {
4985            0 => Ok(Value::Null),
4986            1 => Ok(Value::Int(self.read_i32()?)),
4987            2 => Ok(Value::BigInt(self.read_i64()?)),
4988            3 => Ok(Value::Float(self.read_f64()?)),
4989            4 => Ok(Value::Text(self.read_str()?)),
4990            5 => Ok(Value::Bool(self.read_u8()? != 0)),
4991            6 => {
4992                let dim = self.read_u32()? as usize;
4993                let mut v = Vec::with_capacity(dim);
4994                for _ in 0..dim {
4995                    let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
4996                    v.push(f32::from_le_bytes(bytes));
4997                }
4998                Ok(Value::Vector(v))
4999            }
5000            7 => {
5001                let s = self.take(2)?;
5002                Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
5003            }
5004            8 => {
5005                let s = self.take(16)?;
5006                let arr: [u8; 16] = s.try_into().expect("checked");
5007                let scaled = i128::from_le_bytes(arr);
5008                let scale = self.read_u8()?;
5009                Ok(Value::Numeric { scaled, scale })
5010            }
5011            9 => Ok(Value::Date(self.read_i32()?)),
5012            10 => Ok(Value::Timestamp(self.read_i64()?)),
5013            // v6.0.1: tag 11 — Sq8Vector. Pre-v6 readers fall
5014            // through to the catch-all and surface
5015            // `Corrupt("unknown value tag")`, matching the
5016            // forward-compat fence on the column-type side.
5017            11 => {
5018                let dim = self.read_u32()? as usize;
5019                let min = self.read_f32()?;
5020                let max = self.read_f32()?;
5021                let bytes = self.take(dim)?.to_vec();
5022                Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
5023            }
5024            // v6.0.3: tag 12 — HalfVector. Same forward-compat
5025            // fence story as tag 11.
5026            12 => {
5027                let dim = self.read_u32()? as usize;
5028                let bytes = self.take(dim * 2)?.to_vec();
5029                Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
5030            }
5031            other => Err(StorageError::Corrupt(format!("unknown value tag: {other}"))),
5032        }
5033    }
5034
5035    /// Read an NSW graph that was emitted via `write_nsw_graph`. `m`
5036    /// is passed in because it was already consumed from the per-
5037    /// index header. Returns the reconstituted `NswGraph`.
5038    fn read_nsw_graph(&mut self, m: usize) -> Result<NswGraph, StorageError> {
5039        let m_max_0 = self.read_u16()? as usize;
5040        let entry_raw = self.read_u32()?;
5041        let entry = if entry_raw == u32::MAX {
5042            None
5043        } else {
5044            Some(entry_raw as usize)
5045        };
5046        let entry_level = self.read_u8()?;
5047        let node_count = self.read_u32()? as usize;
5048        // v5.5.0: levels/per-layer are PV-backed in memory, but the wire
5049        // format is unchanged — decode element-by-element into a PV via
5050        // push_mut (transient in-place, no per-element path-copy here since
5051        // the freshly-built PV is uniquely owned).
5052        let mut levels: PersistentVec<u8> = PersistentVec::new();
5053        for _ in 0..node_count {
5054            levels.push_mut(self.read_u8()?);
5055        }
5056        let layer_count = self.read_u8()? as usize;
5057        let mut layers: Vec<PersistentVec<Vec<u32>>> = Vec::with_capacity(layer_count);
5058        for _ in 0..layer_count {
5059            let n = self.read_u32()? as usize;
5060            let mut per_layer: PersistentVec<Vec<u32>> = PersistentVec::new();
5061            for _ in 0..n {
5062                let cnt = self.read_u16()? as usize;
5063                let mut row: Vec<u32> = Vec::with_capacity(cnt);
5064                for _ in 0..cnt {
5065                    row.push(self.read_u32()?);
5066                }
5067                per_layer.push_mut(row);
5068            }
5069            layers.push(per_layer);
5070        }
5071        Ok(NswGraph {
5072            m,
5073            m_max_0,
5074            entry,
5075            entry_level,
5076            levels,
5077            layers,
5078        })
5079    }
5080}
5081
5082#[cfg(test)]
5083mod tests {
5084    use super::*;
5085    use alloc::string::ToString;
5086    use alloc::vec;
5087
5088    #[cfg(target_arch = "aarch64")]
5089    #[test]
5090    fn neon_l2_matches_scalar() {
5091        // For every dim that's a multiple of 4 (4, 8, 12, 16, 64,
5092        // 128, 256, 384, 512, 768, 1024, 1536), the NEON impl must
5093        // agree with the scalar reference within tight float
5094        // tolerance (FMA rounding differs from separate * + +).
5095        let dims = [4usize, 8, 12, 16, 64, 128, 256, 384, 512, 768, 1024, 1536];
5096        for &d in &dims {
5097            let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
5098            let mut a = Vec::with_capacity(d);
5099            let mut b = Vec::with_capacity(d);
5100            for _ in 0..d {
5101                state = state
5102                    .wrapping_mul(6_364_136_223_846_793_005)
5103                    .wrapping_add(1);
5104                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5105                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5106                state = state
5107                    .wrapping_mul(6_364_136_223_846_793_005)
5108                    .wrapping_add(1);
5109                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5110                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5111                a.push(x);
5112                b.push(y);
5113            }
5114            let scalar = l2_distance_sq_scalar(&a, &b);
5115            let neon = unsafe { l2_distance_sq_neon(&a, &b) };
5116            let tol = (scalar.abs().max(1e-6)) * 1e-4;
5117            assert!(
5118                (scalar - neon).abs() <= tol,
5119                "dim={d}: scalar={scalar} neon={neon} diff={}",
5120                (scalar - neon).abs()
5121            );
5122        }
5123    }
5124
5125    #[cfg(target_arch = "aarch64")]
5126    #[test]
5127    fn neon_inner_product_matches_scalar() {
5128        // v6.0.2 step 1: NEON IP must agree with scalar across every
5129        // production-shaped dim. FMA rounding differs from
5130        // separate * + +, so the tolerance scales with magnitude.
5131        let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
5132        for &d in &dims {
5133            let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
5134            let mut a = Vec::with_capacity(d);
5135            let mut b = Vec::with_capacity(d);
5136            for _ in 0..d {
5137                state = state
5138                    .wrapping_mul(6_364_136_223_846_793_005)
5139                    .wrapping_add(1);
5140                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5141                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5142                state = state
5143                    .wrapping_mul(6_364_136_223_846_793_005)
5144                    .wrapping_add(1);
5145                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5146                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5147                a.push(x);
5148                b.push(y);
5149            }
5150            let scalar = inner_product_scalar(&a, &b);
5151            let neon = unsafe { inner_product_neon(&a, &b) };
5152            #[allow(clippy::cast_precision_loss)]
5153            let tol = (scalar.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
5154            assert!(
5155                (scalar - neon).abs() <= tol,
5156                "IP dim={d}: scalar={scalar} neon={neon} diff={}",
5157                (scalar - neon).abs()
5158            );
5159        }
5160    }
5161
5162    #[cfg(target_arch = "aarch64")]
5163    #[allow(clippy::similar_names)]
5164    #[test]
5165    fn neon_cosine_dot_norms_matches_scalar() {
5166        let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
5167        for &d in &dims {
5168            let mut state: u64 = (d as u64).wrapping_mul(0xBF58_476D_1CE4_E5B9);
5169            let mut a = Vec::with_capacity(d);
5170            let mut b = Vec::with_capacity(d);
5171            for _ in 0..d {
5172                state = state
5173                    .wrapping_mul(6_364_136_223_846_793_005)
5174                    .wrapping_add(1);
5175                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5176                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5177                state = state
5178                    .wrapping_mul(6_364_136_223_846_793_005)
5179                    .wrapping_add(1);
5180                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5181                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5182                a.push(x);
5183                b.push(y);
5184            }
5185            let (dot_s, na_s, nb_s) = cosine_dot_norms_scalar(&a, &b);
5186            let (dot_n, na_n, nb_n) = unsafe { cosine_dot_norms_neon(&a, &b) };
5187            #[allow(clippy::cast_precision_loss)]
5188            let tol_d = (dot_s.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
5189            #[allow(clippy::cast_precision_loss)]
5190            let tol_n = (na_s.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
5191            assert!(
5192                (dot_s - dot_n).abs() <= tol_d,
5193                "cosine dot dim={d}: scalar={dot_s} neon={dot_n}"
5194            );
5195            assert!(
5196                (na_s - na_n).abs() <= tol_n,
5197                "cosine na dim={d}: scalar={na_s} neon={na_n}"
5198            );
5199            assert!(
5200                (nb_s - nb_n).abs() <= tol_n,
5201                "cosine nb dim={d}: scalar={nb_s} neon={nb_n}"
5202            );
5203        }
5204    }
5205
5206    fn make_users_schema() -> TableSchema {
5207        TableSchema::new(
5208            "users",
5209            vec![
5210                ColumnSchema::new("id", DataType::Int, false),
5211                ColumnSchema::new("name", DataType::Text, false),
5212                ColumnSchema::new("score", DataType::Float, true),
5213            ],
5214        )
5215    }
5216
5217    #[test]
5218    fn value_type_tag_matches_variant() {
5219        assert_eq!(Value::Int(1).data_type(), Some(DataType::Int));
5220        assert_eq!(Value::BigInt(1).data_type(), Some(DataType::BigInt));
5221        assert_eq!(Value::Float(1.0).data_type(), Some(DataType::Float));
5222        assert_eq!(Value::Text("x".into()).data_type(), Some(DataType::Text));
5223        assert_eq!(Value::Bool(true).data_type(), Some(DataType::Bool));
5224        assert_eq!(Value::Null.data_type(), None);
5225        assert!(Value::Null.is_null());
5226        assert!(!Value::Int(0).is_null());
5227    }
5228
5229    #[test]
5230    fn sq8_value_reports_sq8_data_type() {
5231        // v6.0.1: a `Value::Sq8Vector` cell surfaces its dim
5232        // (= bytes.len()) and encoding through `data_type()` so
5233        // INSERT-time column type-checks (step 3) can route on
5234        // both shape and encoding.
5235        let q = crate::quantize::quantize(&[0.0, 0.25, 0.5, 0.75, 1.0]);
5236        let v = Value::Sq8Vector(q);
5237        assert_eq!(
5238            v.data_type(),
5239            Some(DataType::Vector {
5240                dim: 5,
5241                encoding: VecEncoding::Sq8,
5242            }),
5243        );
5244    }
5245
5246    #[test]
5247    fn datatype_display_matches_pg_keyword() {
5248        assert_eq!(DataType::Int.to_string(), "INT");
5249        assert_eq!(DataType::BigInt.to_string(), "BIGINT");
5250        assert_eq!(DataType::Float.to_string(), "FLOAT");
5251        assert_eq!(DataType::Text.to_string(), "TEXT");
5252        assert_eq!(DataType::Bool.to_string(), "BOOL");
5253    }
5254
5255    #[test]
5256    fn row_len_and_emptiness() {
5257        let r = Row::new(vec![Value::Int(1), Value::Null]);
5258        assert_eq!(r.len(), 2);
5259        assert!(!r.is_empty());
5260        assert!(Row::new(Vec::new()).is_empty());
5261    }
5262
5263    #[test]
5264    fn table_schema_column_position() {
5265        let s = make_users_schema();
5266        assert_eq!(s.column_position("id"), Some(0));
5267        assert_eq!(s.column_position("score"), Some(2));
5268        assert_eq!(s.column_position("missing"), None);
5269    }
5270
5271    #[test]
5272    fn catalog_create_table_then_lookup() {
5273        let mut cat = Catalog::new();
5274        cat.create_table(make_users_schema()).unwrap();
5275        assert_eq!(cat.table_count(), 1);
5276        assert!(cat.get("users").is_some());
5277        assert!(cat.get("nope").is_none());
5278    }
5279
5280    #[test]
5281    fn catalog_duplicate_table_is_rejected() {
5282        let mut cat = Catalog::new();
5283        cat.create_table(make_users_schema()).unwrap();
5284        let err = cat.create_table(make_users_schema()).unwrap_err();
5285        assert!(matches!(err, StorageError::DuplicateTable { ref name } if name == "users"));
5286    }
5287
5288    #[test]
5289    fn table_insert_happy_path_appends_row() {
5290        let mut cat = Catalog::new();
5291        cat.create_table(make_users_schema()).unwrap();
5292        let t = cat.get_mut("users").unwrap();
5293        t.insert(Row::new(vec![
5294            Value::Int(1),
5295            Value::Text("alice".into()),
5296            Value::Float(99.5),
5297        ]))
5298        .unwrap();
5299        assert_eq!(t.row_count(), 1);
5300        assert_eq!(t.rows()[0].values[1], Value::Text("alice".into()));
5301    }
5302
5303    #[test]
5304    fn table_insert_arity_mismatch() {
5305        let mut cat = Catalog::new();
5306        cat.create_table(make_users_schema()).unwrap();
5307        let t = cat.get_mut("users").unwrap();
5308        let err = t.insert(Row::new(vec![Value::Int(1)])).unwrap_err();
5309        assert!(matches!(
5310            err,
5311            StorageError::ArityMismatch {
5312                expected: 3,
5313                actual: 1
5314            }
5315        ));
5316        assert_eq!(t.row_count(), 0);
5317    }
5318
5319    #[test]
5320    fn table_insert_type_mismatch_reports_column() {
5321        let mut cat = Catalog::new();
5322        cat.create_table(make_users_schema()).unwrap();
5323        let t = cat.get_mut("users").unwrap();
5324        let err = t
5325            .insert(Row::new(vec![
5326                Value::Int(1),
5327                Value::Int(42), // name expects Text
5328                Value::Float(0.0),
5329            ]))
5330            .unwrap_err();
5331        match err {
5332            StorageError::TypeMismatch {
5333                ref column,
5334                expected,
5335                actual,
5336                position,
5337            } => {
5338                assert_eq!(column, "name");
5339                assert_eq!(expected, DataType::Text);
5340                assert_eq!(actual, DataType::Int);
5341                assert_eq!(position, 1);
5342            }
5343            other => panic!("unexpected: {other:?}"),
5344        }
5345        assert_eq!(t.row_count(), 0);
5346    }
5347
5348    #[test]
5349    fn table_insert_null_into_not_null_rejected() {
5350        let mut cat = Catalog::new();
5351        cat.create_table(make_users_schema()).unwrap();
5352        let t = cat.get_mut("users").unwrap();
5353        let err = t
5354            .insert(Row::new(vec![
5355                Value::Int(1),
5356                Value::Null, // name is NOT NULL
5357                Value::Float(1.0),
5358            ]))
5359            .unwrap_err();
5360        assert!(matches!(err, StorageError::NullInNotNull { ref column } if column == "name"));
5361    }
5362
5363    #[test]
5364    fn table_insert_null_into_nullable_ok() {
5365        let mut cat = Catalog::new();
5366        cat.create_table(make_users_schema()).unwrap();
5367        let t = cat.get_mut("users").unwrap();
5368        t.insert(Row::new(vec![
5369            Value::Int(1),
5370            Value::Text("bob".into()),
5371            Value::Null,
5372        ]))
5373        .unwrap();
5374        assert_eq!(t.row_count(), 1);
5375    }
5376
5377    #[test]
5378    fn catalog_get_mut_independent_per_table() {
5379        let mut cat = Catalog::new();
5380        cat.create_table(TableSchema::new(
5381            "a",
5382            vec![ColumnSchema::new("v", DataType::Int, false)],
5383        ))
5384        .unwrap();
5385        cat.create_table(TableSchema::new(
5386            "b",
5387            vec![ColumnSchema::new("v", DataType::Int, false)],
5388        ))
5389        .unwrap();
5390        cat.get_mut("a")
5391            .unwrap()
5392            .insert(Row::new(vec![Value::Int(1)]))
5393            .unwrap();
5394        assert_eq!(cat.get("a").unwrap().row_count(), 1);
5395        assert_eq!(cat.get("b").unwrap().row_count(), 0);
5396    }
5397
5398    // --- v0.6 persistence round-trips --------------------------------------
5399
5400    fn assert_round_trip(cat: &Catalog) {
5401        let bytes = cat.serialize();
5402        let restored = Catalog::deserialize(&bytes).expect("deserialize");
5403        // Compare semantic state: same tables in same order, same schema +
5404        // rows in each.
5405        assert_eq!(restored.table_count(), cat.table_count());
5406        for (a, b) in cat.tables.iter().zip(restored.tables.iter()) {
5407            assert_eq!(a.schema, b.schema);
5408            assert_eq!(a.rows, b.rows);
5409        }
5410    }
5411
5412    #[test]
5413    fn serialize_empty_catalog_round_trips() {
5414        assert_round_trip(&Catalog::new());
5415    }
5416
5417    #[test]
5418    fn serialize_single_empty_table_round_trips() {
5419        let mut cat = Catalog::new();
5420        cat.create_table(make_users_schema()).unwrap();
5421        assert_round_trip(&cat);
5422    }
5423
5424    #[test]
5425    fn nsw_clone_is_o1() {
5426        // v5.5.0: NswGraph::clone must be O(1) structural sharing, not the
5427        // pre-v5.5 O(N) element copy — it rides on Catalog::clone for every
5428        // group-commit write on a vector table. Build a non-trivial multi-
5429        // layer graph, clone it, and prove the clone shares the very same PV
5430        // storage (root+tail Arc) for `levels` and every `layers[l]`. Sharing
5431        // ⇒ no per-node element copy ⇒ clone cost independent of N (node
5432        // count); only the outer layer Vec (len ≤ 8) is copied, O(1) in
5433        // practice.
5434        let mut cat = Catalog::new();
5435        cat.create_table(TableSchema::new(
5436            "docs",
5437            alloc::vec![
5438                ColumnSchema::new("id", DataType::Int, false),
5439                ColumnSchema::new(
5440                    "v",
5441                    DataType::Vector {
5442                        dim: 3,
5443                        encoding: VecEncoding::F32
5444                    },
5445                    true
5446                ),
5447            ],
5448        ))
5449        .unwrap();
5450        let t = cat.get_mut("docs").unwrap();
5451        for i in 0..1500_i32 {
5452            #[allow(clippy::cast_precision_loss)] // 0..1500 — no precision lost
5453            let base = (i as f32) * 0.01;
5454            t.insert(Row::new(alloc::vec![
5455                Value::Int(i),
5456                Value::Vector(alloc::vec![base, base + 0.05, base + 0.1]),
5457            ]))
5458            .unwrap();
5459        }
5460        t.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
5461            .unwrap();
5462        let g = match &cat.get("docs").unwrap().indices()[0].kind {
5463            IndexKind::Nsw(g) => g,
5464            IndexKind::BTree(_) | IndexKind::Brin { .. } => panic!("expected NSW"),
5465        };
5466        // Non-trivial graph: one level slot per row, and the geometric level
5467        // distribution puts some nodes above layer 0.
5468        assert_eq!(g.levels.len(), 1500, "one level slot per inserted row");
5469        assert!(
5470            g.layers.len() >= 2,
5471            "1500 nodes should populate at least two HNSW layers, got {}",
5472            g.layers.len()
5473        );
5474
5475        let cloned = g.clone();
5476
5477        assert!(
5478            g.levels.shares_storage_with(&cloned.levels),
5479            "levels PV not shared after clone — clone copied elements (O(N))"
5480        );
5481        assert_eq!(g.layers.len(), cloned.layers.len());
5482        for (l, (orig, cl)) in g.layers.iter().zip(cloned.layers.iter()).enumerate() {
5483            assert!(
5484                orig.shares_storage_with(cl),
5485                "layer {l} PV not shared after clone — clone copied elements (O(N))"
5486            );
5487        }
5488    }
5489
5490    #[test]
5491    fn sq8_catalog_serialise_roundtrip_preserves_cells_and_index() {
5492        // v6.0.1 step 6 verify: a catalog with an `VECTOR(N)
5493        // USING SQ8` column + NSW index survives a full
5494        // serialise → deserialise cycle. Cells re-decode bit-
5495        // identically (per-vector affine triple), the NSW
5496        // topology stays intact, and kNN search still routes
5497        // through the SQ8 ADC dispatcher after the catalog hop.
5498        let mut cat = Catalog::new();
5499        cat.create_table(TableSchema::new(
5500            "vecs",
5501            alloc::vec![
5502                ColumnSchema::new("id", DataType::Int, false),
5503                ColumnSchema::new(
5504                    "v",
5505                    DataType::Vector {
5506                        dim: 8,
5507                        encoding: VecEncoding::Sq8,
5508                    },
5509                    false,
5510                ),
5511            ],
5512        ))
5513        .unwrap();
5514        let t = cat.get_mut("vecs").unwrap();
5515        for i in 0..32_i32 {
5516            #[allow(clippy::cast_precision_loss)]
5517            let base = (i as f32) * 0.03;
5518            let v: Vec<f32> = (0..8_i32)
5519                .map(|j| {
5520                    #[allow(clippy::cast_precision_loss)]
5521                    let off = (j as f32) * 0.01;
5522                    base + off
5523                })
5524                .collect();
5525            t.insert(Row::new(alloc::vec![
5526                Value::Int(i),
5527                Value::Sq8Vector(quantize::quantize(&v)),
5528            ]))
5529            .unwrap();
5530        }
5531        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5532        // Capture a pre-serialise reference cell + nsw hits to
5533        // compare against the restored catalog.
5534        let query = alloc::vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
5535        let (before_cell, before_ty, before_hits) = {
5536            let t_ref = cat.get("vecs").unwrap();
5537            (
5538                t_ref.rows()[5].values[1].clone(),
5539                t_ref.schema().columns[1].ty,
5540                nsw_query(t_ref, "v_idx", &query, 5, NswMetric::L2),
5541            )
5542        };
5543
5544        let bytes = cat.serialize();
5545        let restored = Catalog::deserialize(&bytes).expect("deserialize ok");
5546        let rt = restored.get("vecs").unwrap();
5547        assert_eq!(rt.schema().columns[1].ty, before_ty);
5548        assert_eq!(rt.rows()[5].values[1], before_cell);
5549        let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
5550        assert_eq!(before_hits, after_hits);
5551    }
5552
5553    #[test]
5554    fn half_catalog_serialise_roundtrip_preserves_cells_and_index() {
5555        // v6.0.3 step 4 verify: a catalog with a `VECTOR(N) USING
5556        // HALF` column + NSW index survives a full serialise →
5557        // deserialise cycle. Cells re-decode bit-identically (raw
5558        // u16 LE bytes), the NSW topology stays intact, and kNN
5559        // search still returns the same hit IDs against the
5560        // restored catalog.
5561        use crate::halfvec;
5562        let mut cat = Catalog::new();
5563        cat.create_table(TableSchema::new(
5564            "vecs",
5565            alloc::vec![
5566                ColumnSchema::new("id", DataType::Int, false),
5567                ColumnSchema::new(
5568                    "v",
5569                    DataType::Vector {
5570                        dim: 8,
5571                        encoding: VecEncoding::F16,
5572                    },
5573                    false,
5574                ),
5575            ],
5576        ))
5577        .unwrap();
5578        let t = cat.get_mut("vecs").unwrap();
5579        for i in 0..32_i32 {
5580            #[allow(clippy::cast_precision_loss)]
5581            let base = (i as f32) * 0.03;
5582            let v: Vec<f32> = (0..8_i32)
5583                .map(|j| {
5584                    #[allow(clippy::cast_precision_loss)]
5585                    let off = (j as f32) * 0.01;
5586                    base + off
5587                })
5588                .collect();
5589            t.insert(Row::new(alloc::vec![
5590                Value::Int(i),
5591                Value::HalfVector(halfvec::HalfVector::from_f32_slice(&v)),
5592            ]))
5593            .unwrap();
5594        }
5595        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5596        let query = alloc::vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
5597        let (before_cell, before_ty, before_hits) = {
5598            let t_ref = cat.get("vecs").unwrap();
5599            (
5600                t_ref.rows()[5].values[1].clone(),
5601                t_ref.schema().columns[1].ty,
5602                nsw_query(t_ref, "v_idx", &query, 5, NswMetric::L2),
5603            )
5604        };
5605        let bytes = cat.serialize();
5606        let restored = Catalog::deserialize(&bytes).expect("deserialize ok");
5607        let rt = restored.get("vecs").unwrap();
5608        assert_eq!(rt.schema().columns[1].ty, before_ty);
5609        assert_eq!(rt.rows()[5].values[1], before_cell);
5610        let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
5611        assert_eq!(before_hits, after_hits);
5612    }
5613
5614    #[test]
5615    #[allow(clippy::similar_names)]
5616    fn hnsw_half_recall_at_10_matches_f32_groundtruth() {
5617        // v6.0.3 step 3 verify: HALF column NSW retrieves ≥ 95%
5618        // top-10 overlap vs brute-force F32 ground truth.
5619        // Half-precision dequantises bit-exactly at the storage
5620        // layer (no rerank pass), so the recall floor is tighter
5621        // than the SQ8 case — only the rounding noise from f32 →
5622        // f16 quantisation contributes.
5623        use crate::halfvec;
5624        fn next(state: &mut u64) -> f32 {
5625            *state = state
5626                .wrapping_add(0x9E37_79B9_7F4A_7C15)
5627                .wrapping_mul(0xBF58_476D_1CE4_E5B9);
5628            #[allow(clippy::cast_precision_loss)]
5629            let u = ((*state >> 32) as u32 as f32) / (u32::MAX as f32);
5630            2.0 * u - 1.0
5631        }
5632        let dim: u32 = 32;
5633        let n: usize = 512;
5634        let dim_us = dim as usize;
5635        let mut seed: u64 = 0xF16_F16_F16_F16_u64;
5636        let corpus: Vec<Vec<f32>> = (0..n)
5637            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
5638            .collect();
5639        let queries: Vec<Vec<f32>> = (0..32)
5640            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
5641            .collect();
5642        let exact_top10: Vec<Vec<usize>> = queries
5643            .iter()
5644            .map(|q| {
5645                let mut scored: Vec<(f32, usize)> = corpus
5646                    .iter()
5647                    .enumerate()
5648                    .map(|(i, v)| (l2_distance_sq(v, q), i))
5649                    .collect();
5650                scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
5651                scored.into_iter().take(10).map(|(_, i)| i).collect()
5652            })
5653            .collect();
5654        let mut cat = Catalog::new();
5655        cat.create_table(TableSchema::new(
5656            "vecs",
5657            alloc::vec![
5658                ColumnSchema::new("id", DataType::Int, false),
5659                ColumnSchema::new(
5660                    "v",
5661                    DataType::Vector {
5662                        dim,
5663                        encoding: VecEncoding::F16,
5664                    },
5665                    false,
5666                ),
5667            ],
5668        ))
5669        .unwrap();
5670        let t = cat.get_mut("vecs").unwrap();
5671        for (i, v) in corpus.iter().enumerate() {
5672            t.insert(Row::new(alloc::vec![
5673                Value::Int(i32::try_from(i).unwrap()),
5674                Value::HalfVector(halfvec::HalfVector::from_f32_slice(v)),
5675            ]))
5676            .unwrap();
5677        }
5678        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5679        let table = cat.get("vecs").unwrap();
5680        let mut total_overlap = 0_usize;
5681        for (q, exact) in queries.iter().zip(exact_top10.iter()) {
5682            let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
5683            for h in &hits {
5684                if exact.contains(h) {
5685                    total_overlap += 1;
5686                }
5687            }
5688        }
5689        #[allow(clippy::cast_precision_loss)]
5690        let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
5691        assert!(
5692            recall >= 0.95,
5693            "HALF HNSW recall@10 = {recall:.3}, below floor 0.95 — \
5694             check halfvec dispatch in `cell_to_query_metric_distance`"
5695        );
5696    }
5697
5698    #[test]
5699    fn hnsw_sq8_recall_at_10_above_0_95_vs_f32_groundtruth() {
5700        // v6.0.1 step 5 verify: build TWO catalogs over the same
5701        // corpus — one F32, one SQ8 — and confirm SQ8 NSW + f32
5702        // rerank retrieves ≥ 95% top-10 overlap vs brute-force F32
5703        // ground truth. The rerank pass (sq8_rerank) re-scores ADC
5704        // candidates with dequantised cells, recovering recall the
5705        // raw ADC sacrifices for 4× compression.
5706        use crate::quantize;
5707        // Deterministic Gaussian-ish corpus via splitmix64. Vectors
5708        // get normalised so SQ8's per-vector `(min, max)` lives in
5709        // a sensible range; matches the v6.0.0 fuzz harness.
5710        fn next(state: &mut u64) -> f32 {
5711            *state = state
5712                .wrapping_add(0x9E37_79B9_7F4A_7C15)
5713                .wrapping_mul(0xBF58_476D_1CE4_E5B9);
5714            #[allow(clippy::cast_precision_loss)]
5715            let u = ((*state >> 32) as u32 as f32) / (u32::MAX as f32);
5716            2.0 * u - 1.0
5717        }
5718        let dim: u32 = 32;
5719        let n: usize = 512;
5720        let dim_us = dim as usize;
5721        let mut seed: u64 = 0xCAFE_BABE_DEAD_BEEFu64;
5722        let corpus: Vec<Vec<f32>> = (0..n)
5723            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
5724            .collect();
5725        let queries: Vec<Vec<f32>> = (0..32)
5726            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
5727            .collect();
5728        // F32 ground truth — pure exact arithmetic, brute force.
5729        let exact_top10: Vec<Vec<usize>> = queries
5730            .iter()
5731            .map(|q| {
5732                let mut scored: Vec<(f32, usize)> = corpus
5733                    .iter()
5734                    .enumerate()
5735                    .map(|(i, v)| (l2_distance_sq(v, q), i))
5736                    .collect();
5737                scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
5738                scored.into_iter().take(10).map(|(_, i)| i).collect()
5739            })
5740            .collect();
5741        // SQ8 catalog — INSERTs land as `Value::Sq8Vector` cells;
5742        // HNSW build uses the ADC path verified in step 4.
5743        let mut cat = Catalog::new();
5744        cat.create_table(TableSchema::new(
5745            "vecs",
5746            alloc::vec![
5747                ColumnSchema::new("id", DataType::Int, false),
5748                ColumnSchema::new(
5749                    "v",
5750                    DataType::Vector {
5751                        dim,
5752                        encoding: VecEncoding::Sq8,
5753                    },
5754                    false,
5755                ),
5756            ],
5757        ))
5758        .unwrap();
5759        let t = cat.get_mut("vecs").unwrap();
5760        for (i, v) in corpus.iter().enumerate() {
5761            t.insert(Row::new(alloc::vec![
5762                Value::Int(i32::try_from(i).unwrap()),
5763                Value::Sq8Vector(quantize::quantize(v)),
5764            ]))
5765            .unwrap();
5766        }
5767        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5768        let table = cat.get("vecs").unwrap();
5769        let mut total_overlap = 0_usize;
5770        for (q, exact) in queries.iter().zip(exact_top10.iter()) {
5771            let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
5772            for h in &hits {
5773                if exact.contains(h) {
5774                    total_overlap += 1;
5775                }
5776            }
5777        }
5778        #[allow(clippy::cast_precision_loss)]
5779        let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
5780        assert!(
5781            recall >= 0.95,
5782            "SQ8 HNSW recall@10 = {recall:.3}, below floor 0.95 — \
5783             check `sq8_rerank` is wired in `nsw_search` for SQ8 columns"
5784        );
5785    }
5786
5787    #[test]
5788    fn nsw_index_topology_persists_through_round_trip() {
5789        // Build an NSW index, capture its (entry, neighbors) tuple, do
5790        // a full serialize → deserialize, and verify the restored
5791        // graph is byte-for-byte identical. The point of v2.7 is that
5792        // startup skips the rebuild, so the topology has to survive
5793        // the disk hop.
5794        let mut cat = Catalog::new();
5795        cat.create_table(TableSchema::new(
5796            "docs",
5797            alloc::vec![
5798                ColumnSchema::new("id", DataType::Int, false),
5799                ColumnSchema::new(
5800                    "v",
5801                    DataType::Vector {
5802                        dim: 3,
5803                        encoding: VecEncoding::F32
5804                    },
5805                    true
5806                ),
5807            ],
5808        ))
5809        .unwrap();
5810        let t = cat.get_mut("docs").unwrap();
5811        for i in 0..6_i32 {
5812            #[allow(clippy::cast_precision_loss)] // 0..6 — no precision lost
5813            let base = (i as f32) * 0.1;
5814            let row = Row::new(alloc::vec![
5815                Value::Int(i),
5816                Value::Vector(alloc::vec![base, base + 0.05, base + 0.1]),
5817            ]);
5818            t.insert(row).unwrap();
5819        }
5820        t.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
5821            .unwrap();
5822        let original = match &cat.get("docs").unwrap().indices()[0].kind {
5823            IndexKind::Nsw(g) => g.clone(),
5824            IndexKind::BTree(_) | IndexKind::Brin { .. } => panic!("expected NSW"),
5825        };
5826        let bytes = cat.serialize();
5827        let restored = Catalog::deserialize(&bytes).expect("deserialize");
5828        let restored_graph = match &restored.get("docs").unwrap().indices()[0].kind {
5829            IndexKind::Nsw(g) => g.clone(),
5830            IndexKind::BTree(_) | IndexKind::Brin { .. } => panic!("expected NSW"),
5831        };
5832        assert_eq!(restored_graph.m, original.m);
5833        assert_eq!(restored_graph.m_max_0, original.m_max_0);
5834        assert_eq!(restored_graph.entry, original.entry);
5835        assert_eq!(restored_graph.entry_level, original.entry_level);
5836        assert_eq!(restored_graph.levels, original.levels);
5837        assert_eq!(restored_graph.layers, original.layers);
5838    }
5839
5840    #[test]
5841    fn hnsw_level_assignment_is_deterministic() {
5842        // Same row index always produces the same level — the topology
5843        // must be reproducible (matters for serialize round-trip).
5844        for i in 0..32usize {
5845            assert_eq!(nsw_assign_level(i), nsw_assign_level(i));
5846        }
5847    }
5848
5849    #[test]
5850    fn hnsw_layer_0_dominates_population() {
5851        // Sanity: out of N inserts, the vast majority should land on
5852        // layer 0. The 4-bit-clear promotion rule gives roughly 1/16
5853        // promotion to layer ≥ 1, so under 50 nodes we expect ~3 on
5854        // layer ≥ 1 and the rest on layer 0.
5855        let on_zero = (0..200usize).filter(|&i| nsw_assign_level(i) == 0).count();
5856        assert!(on_zero > 150, "level-0 nodes too few: {on_zero}");
5857    }
5858
5859    #[test]
5860    fn hnsw_search_matches_brute_force_for_l2_top1() {
5861        // Build a small dataset, query it, and confirm the top result
5862        // matches the brute-force nearest by L2. Topology variability
5863        // shouldn't break recall at k=1 for well-separated vectors.
5864        let mut cat = Catalog::new();
5865        cat.create_table(TableSchema::new(
5866            "vecs",
5867            alloc::vec![
5868                ColumnSchema::new("id", DataType::Int, false),
5869                ColumnSchema::new(
5870                    "v",
5871                    DataType::Vector {
5872                        dim: 3,
5873                        encoding: VecEncoding::F32
5874                    },
5875                    true
5876                ),
5877            ],
5878        ))
5879        .unwrap();
5880        let t = cat.get_mut("vecs").unwrap();
5881        let dataset: alloc::vec::Vec<(i32, [f32; 3])> = alloc::vec![
5882            (1, [0.0, 0.0, 0.0]),
5883            (2, [1.0, 0.0, 0.0]),
5884            (3, [0.0, 1.0, 0.0]),
5885            (4, [0.0, 0.0, 1.0]),
5886            (5, [1.0, 1.0, 0.0]),
5887            (6, [1.0, 0.0, 1.0]),
5888            (7, [0.0, 1.0, 1.0]),
5889            (8, [1.0, 1.0, 1.0]),
5890            (9, [0.5, 0.5, 0.5]),
5891            (10, [0.2, 0.8, 0.5]),
5892        ];
5893        for &(id, v) in &dataset {
5894            t.insert(Row::new(alloc::vec![
5895                Value::Int(id),
5896                Value::Vector(alloc::vec![v[0], v[1], v[2]]),
5897            ]))
5898            .unwrap();
5899        }
5900        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5901        let idx_pos = cat
5902            .get("vecs")
5903            .unwrap()
5904            .indices()
5905            .iter()
5906            .position(|i| i.name == "v_idx")
5907            .unwrap();
5908        for query in [[0.4, 0.4, 0.4], [0.9, 0.1, 0.0], [0.0, 0.9, 0.9]] {
5909            let table = cat.get("vecs").unwrap();
5910            let hnsw_top = nsw_search(table, idx_pos, &query, 1, 16, NswMetric::L2);
5911            let mut brute: alloc::vec::Vec<(f32, usize)> = (0..table.rows.len())
5912                .map(|i| {
5913                    let Value::Vector(v) = &table.rows[i].values[1] else {
5914                        return (f32::INFINITY, i);
5915                    };
5916                    (l2_distance_sq(v, &query), i)
5917                })
5918                .collect();
5919            brute.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
5920            assert!(!hnsw_top.is_empty(), "HNSW returned no results");
5921            assert_eq!(
5922                hnsw_top[0].1, brute[0].1,
5923                "HNSW top-1 != brute-force top-1 for {query:?}"
5924            );
5925        }
5926    }
5927
5928    #[test]
5929    fn serialize_table_with_rows_round_trips() {
5930        let mut cat = Catalog::new();
5931        cat.create_table(make_users_schema()).unwrap();
5932        let t = cat.get_mut("users").unwrap();
5933        t.insert(Row::new(vec![
5934            Value::Int(1),
5935            Value::Text("alice".into()),
5936            Value::Float(95.5),
5937        ]))
5938        .unwrap();
5939        t.insert(Row::new(vec![
5940            Value::Int(2),
5941            Value::Text("bob".into()),
5942            Value::Null,
5943        ]))
5944        .unwrap();
5945        assert_round_trip(&cat);
5946    }
5947
5948    #[test]
5949    fn serialize_multiple_tables_round_trips() {
5950        let mut cat = Catalog::new();
5951        cat.create_table(make_users_schema()).unwrap();
5952        cat.create_table(TableSchema::new(
5953            "flags",
5954            vec![
5955                ColumnSchema::new("id", DataType::BigInt, false),
5956                ColumnSchema::new("active", DataType::Bool, false),
5957            ],
5958        ))
5959        .unwrap();
5960        cat.get_mut("flags")
5961            .unwrap()
5962            .insert(Row::new(vec![Value::BigInt(7), Value::Bool(true)]))
5963            .unwrap();
5964        assert_round_trip(&cat);
5965    }
5966
5967    #[test]
5968    fn deserialize_rejects_bad_magic() {
5969        let mut buf = b"BADMAGIC".to_vec();
5970        buf.push(FILE_VERSION);
5971        buf.extend_from_slice(&0u32.to_le_bytes());
5972        let err = Catalog::deserialize(&buf).unwrap_err();
5973        assert!(matches!(err, StorageError::Corrupt(_)));
5974    }
5975
5976    #[test]
5977    fn deserialize_rejects_unsupported_version() {
5978        let mut buf = FILE_MAGIC.to_vec();
5979        buf.push(99); // future version
5980        buf.extend_from_slice(&0u32.to_le_bytes());
5981        let err = Catalog::deserialize(&buf).unwrap_err();
5982        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("version")));
5983    }
5984
5985    #[test]
5986    fn deserialize_rejects_truncated_file() {
5987        let mut cat = Catalog::new();
5988        cat.create_table(make_users_schema()).unwrap();
5989        let bytes = cat.serialize();
5990        // Drop the last byte to simulate truncation.
5991        let truncated = &bytes[..bytes.len() - 1];
5992        assert!(matches!(
5993            Catalog::deserialize(truncated),
5994            Err(StorageError::Corrupt(_))
5995        ));
5996    }
5997
5998    #[test]
5999    fn deserialize_rejects_trailing_garbage() {
6000        let cat = Catalog::new();
6001        let mut bytes = cat.serialize();
6002        bytes.push(0xFF);
6003        assert!(matches!(
6004            Catalog::deserialize(&bytes),
6005            Err(StorageError::Corrupt(ref s)) if s.contains("trailing")
6006        ));
6007    }
6008
6009    // --- v0.8 indices ------------------------------------------------------
6010
6011    fn populated_users() -> Catalog {
6012        let mut cat = Catalog::new();
6013        cat.create_table(make_users_schema()).unwrap();
6014        let t = cat.get_mut("users").unwrap();
6015        for (id, name, score) in [
6016            (1, "alice", Some(90.0)),
6017            (2, "bob", None),
6018            (3, "alice", Some(70.0)), // duplicate name → maps to two row idxs
6019        ] {
6020            t.insert(Row::new(vec![
6021                Value::Int(id),
6022                Value::Text(name.into()),
6023                score.map_or(Value::Null, Value::Float),
6024            ]))
6025            .unwrap();
6026        }
6027        cat
6028    }
6029
6030    #[test]
6031    fn add_index_builds_from_existing_rows() {
6032        let mut cat = populated_users();
6033        cat.get_mut("users")
6034            .unwrap()
6035            .add_index("by_id".into(), "id")
6036            .unwrap();
6037        let t = cat.get("users").unwrap();
6038        let idx = t.index_on(0).expect("index_on(0)");
6039        assert_eq!(idx.lookup_eq(&IndexKey::Int(2)), &[RowLocator::Hot(1)]);
6040        assert_eq!(idx.lookup_eq(&IndexKey::Int(99)), &[] as &[RowLocator]);
6041    }
6042
6043    #[test]
6044    fn add_index_dup_name_rejected() {
6045        let mut cat = populated_users();
6046        let t = cat.get_mut("users").unwrap();
6047        t.add_index("ix".into(), "id").unwrap();
6048        let err = t.add_index("ix".into(), "name").unwrap_err();
6049        assert!(matches!(err, StorageError::DuplicateIndex { ref name } if name == "ix"));
6050    }
6051
6052    #[test]
6053    fn add_index_unknown_column_rejected() {
6054        let mut cat = populated_users();
6055        let err = cat
6056            .get_mut("users")
6057            .unwrap()
6058            .add_index("ix".into(), "ghost")
6059            .unwrap_err();
6060        assert!(matches!(err, StorageError::ColumnNotFound { ref column } if column == "ghost"));
6061    }
6062
6063    #[test]
6064    fn insert_after_create_index_updates_it() {
6065        let mut cat = populated_users();
6066        let t = cat.get_mut("users").unwrap();
6067        t.add_index("by_name".into(), "name").unwrap();
6068        t.insert(Row::new(vec![
6069            Value::Int(4),
6070            Value::Text("dave".into()),
6071            Value::Null,
6072        ]))
6073        .unwrap();
6074        let idx = t.index_on(1).unwrap();
6075        assert_eq!(
6076            idx.lookup_eq(&IndexKey::Text("dave".into())),
6077            &[RowLocator::Hot(3)]
6078        );
6079        // Pre-existing duplicates remain mapped to the two original row idxs.
6080        assert_eq!(
6081            idx.lookup_eq(&IndexKey::Text("alice".into())),
6082            &[RowLocator::Hot(0), RowLocator::Hot(2)]
6083        );
6084    }
6085
6086    #[test]
6087    fn null_or_float_values_are_not_indexed() {
6088        let mut cat = populated_users();
6089        let t = cat.get_mut("users").unwrap();
6090        t.add_index("by_score".into(), "score").unwrap();
6091        let idx = t.index_on(2).unwrap();
6092        // bob's score is NULL → no entry for bob.
6093        // Score is Float → the spec says we don't index NaN-prone columns,
6094        // so even the present scores are absent. Lookups via IndexKey::Int(90)
6095        // mis-match the column type and trivially find nothing.
6096        assert_eq!(idx.lookup_eq(&IndexKey::Int(90)), &[] as &[RowLocator]);
6097    }
6098
6099    // --- v0.11 vector type -------------------------------------------------
6100
6101    #[test]
6102    fn vector_value_data_type_carries_dim() {
6103        let v = Value::Vector(vec![1.0, 2.0, 3.0]);
6104        assert_eq!(
6105            v.data_type(),
6106            Some(DataType::Vector {
6107                dim: 3,
6108                encoding: VecEncoding::F32
6109            })
6110        );
6111    }
6112
6113    #[test]
6114    fn vector_column_insert_matching_dim_ok() {
6115        let mut cat = Catalog::new();
6116        cat.create_table(TableSchema::new(
6117            "emb",
6118            vec![ColumnSchema::new(
6119                "v",
6120                DataType::Vector {
6121                    dim: 3,
6122                    encoding: VecEncoding::F32,
6123                },
6124                false,
6125            )],
6126        ))
6127        .unwrap();
6128        cat.get_mut("emb")
6129            .unwrap()
6130            .insert(Row::new(vec![Value::Vector(vec![1.0, 2.0, 3.0])]))
6131            .unwrap();
6132    }
6133
6134    #[test]
6135    fn vector_column_insert_dim_mismatch_rejected() {
6136        let mut cat = Catalog::new();
6137        cat.create_table(TableSchema::new(
6138            "emb",
6139            vec![ColumnSchema::new(
6140                "v",
6141                DataType::Vector {
6142                    dim: 3,
6143                    encoding: VecEncoding::F32,
6144                },
6145                false,
6146            )],
6147        ))
6148        .unwrap();
6149        let err = cat
6150            .get_mut("emb")
6151            .unwrap()
6152            .insert(Row::new(vec![Value::Vector(vec![1.0, 2.0])]))
6153            .unwrap_err();
6154        assert!(matches!(err, StorageError::TypeMismatch { .. }));
6155    }
6156
6157    #[test]
6158    fn vector_value_survives_catalog_round_trip() {
6159        let mut cat = Catalog::new();
6160        cat.create_table(TableSchema::new(
6161            "emb",
6162            vec![
6163                ColumnSchema::new("id", DataType::Int, false),
6164                ColumnSchema::new(
6165                    "v",
6166                    DataType::Vector {
6167                        dim: 4,
6168                        encoding: VecEncoding::F32,
6169                    },
6170                    false,
6171                ),
6172            ],
6173        ))
6174        .unwrap();
6175        cat.get_mut("emb")
6176            .unwrap()
6177            .insert(Row::new(vec![
6178                Value::Int(1),
6179                Value::Vector(vec![0.5, -1.25, 3.0, 7.0]),
6180            ]))
6181            .unwrap();
6182        let restored = Catalog::deserialize(&cat.serialize()).expect("round-trip");
6183        let table = restored.get("emb").unwrap();
6184        assert_eq!(
6185            table.schema().columns[1].ty,
6186            DataType::Vector {
6187                dim: 4,
6188                encoding: VecEncoding::F32
6189            }
6190        );
6191        assert_eq!(
6192            table.rows()[0].values[1],
6193            Value::Vector(vec![0.5, -1.25, 3.0, 7.0])
6194        );
6195    }
6196
6197    #[test]
6198    fn index_survives_serialize_deserialize_round_trip() {
6199        let mut cat = populated_users();
6200        cat.get_mut("users")
6201            .unwrap()
6202            .add_index("by_name".into(), "name")
6203            .unwrap();
6204        let restored = Catalog::deserialize(&cat.serialize()).unwrap();
6205        let idx = restored
6206            .get("users")
6207            .unwrap()
6208            .index_on(1)
6209            .expect("index_on(1) after restore");
6210        assert_eq!(idx.name, "by_name");
6211        // Data was rebuilt from rows, not deserialized directly.
6212        assert_eq!(
6213            idx.lookup_eq(&IndexKey::Text("alice".into())),
6214            &[RowLocator::Hot(0), RowLocator::Hot(2)]
6215        );
6216    }
6217
6218    // --- v5.1 cold-tier integration tests ----------------------
6219
6220    /// Schema with a BIGINT PK column matching what the v5.1 cold-
6221    /// tier path supports (`IndexKey::Int` → `u64` cast).
6222    fn bigint_pk_users_schema() -> TableSchema {
6223        TableSchema::new(
6224            "users",
6225            vec![
6226                ColumnSchema::new("id", DataType::BigInt, false),
6227                ColumnSchema::new("name", DataType::Text, false),
6228            ],
6229        )
6230    }
6231
6232    fn make_user_row(id: i64, name: &str) -> Row {
6233        Row::new(vec![Value::BigInt(id), Value::Text(name.into())])
6234    }
6235
6236    #[test]
6237    fn lookup_by_pk_finds_row_via_hot_index() {
6238        let mut cat = Catalog::new();
6239        cat.create_table(bigint_pk_users_schema()).unwrap();
6240        let t = cat.get_mut("users").unwrap();
6241        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
6242            t.insert(make_user_row(id, name)).unwrap();
6243        }
6244        t.add_index("by_id".into(), "id").unwrap();
6245        // All locators are Hot; cold_segments is empty.
6246        let got = cat
6247            .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
6248            .unwrap();
6249        assert_eq!(got, make_user_row(2, "bob"));
6250        assert_eq!(cat.cold_segment_count(), 0);
6251    }
6252
6253    #[test]
6254    fn lookup_by_pk_returns_none_when_key_missing() {
6255        let mut cat = Catalog::new();
6256        cat.create_table(bigint_pk_users_schema()).unwrap();
6257        let t = cat.get_mut("users").unwrap();
6258        t.insert(make_user_row(1, "alice")).unwrap();
6259        t.add_index("by_id".into(), "id").unwrap();
6260        assert!(
6261            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(999))
6262                .is_none()
6263        );
6264        // Also: unknown table / unknown index name.
6265        assert!(
6266            cat.lookup_by_pk("other_table", "by_id", &IndexKey::Int(1))
6267                .is_none()
6268        );
6269        assert!(
6270            cat.lookup_by_pk("users", "no_such_index", &IndexKey::Int(1))
6271                .is_none()
6272        );
6273    }
6274
6275    #[test]
6276    fn lookup_by_pk_resolves_cold_locator_via_loaded_segment() {
6277        // Build a cold-tier segment whose payloads are dense-encoded
6278        // BIGINT rows. Wire each PK into the BTree index as a Cold
6279        // locator. The hot tier carries no rows for those PKs.
6280        let mut cat = Catalog::new();
6281        cat.create_table(bigint_pk_users_schema()).unwrap();
6282        let t = cat.get_mut("users").unwrap();
6283        t.add_index("by_id".into(), "id").unwrap();
6284        let schema = t.schema.clone();
6285
6286        let cold_rows: Vec<(i64, &str)> =
6287            vec![(100, "ivy"), (200, "joe"), (300, "kim"), (400, "lin")];
6288        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
6289            .iter()
6290            .map(|(id, name)| {
6291                let row = make_user_row(*id, name);
6292                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
6293            })
6294            .collect();
6295        let (seg_bytes, _meta) =
6296            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
6297        let seg_id = cat.load_segment_bytes(seg_bytes).unwrap();
6298        assert_eq!(seg_id, 0);
6299        assert_eq!(cat.cold_segment_count(), 1);
6300
6301        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
6302            .iter()
6303            .map(|(id, _)| {
6304                (
6305                    IndexKey::Int(*id),
6306                    RowLocator::Cold {
6307                        segment_id: seg_id,
6308                        page_offset: 0,
6309                    },
6310                )
6311            })
6312            .collect();
6313        let registered = cat
6314            .get_mut("users")
6315            .unwrap()
6316            .register_cold_locators("by_id", pairs)
6317            .unwrap();
6318        assert_eq!(registered, 4);
6319
6320        for (id, name) in &cold_rows {
6321            let got = cat
6322                .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
6323                .unwrap_or_else(|| panic!("cold key {id} not found"));
6324            assert_eq!(got, make_user_row(*id, name));
6325        }
6326        // Cold key that isn't in the segment must return None.
6327        assert!(
6328            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(999))
6329                .is_none()
6330        );
6331    }
6332
6333    #[test]
6334    fn lookup_by_pk_mixes_hot_and_cold_tiers() {
6335        // Half the rows live in the hot tier (Table::rows + add_index
6336        // produces Hot locators); half live in a cold segment and have
6337        // Cold locators wired manually. Each lookup hits the right tier.
6338        let mut cat = Catalog::new();
6339        cat.create_table(bigint_pk_users_schema()).unwrap();
6340        let t = cat.get_mut("users").unwrap();
6341        for (id, name) in [(1i64, "alice"), (2, "bob")] {
6342            t.insert(make_user_row(id, name)).unwrap();
6343        }
6344        t.add_index("by_id".into(), "id").unwrap();
6345        let schema = t.schema.clone();
6346
6347        let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe")];
6348        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
6349            .iter()
6350            .map(|(id, name)| {
6351                let row = make_user_row(*id, name);
6352                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
6353            })
6354            .collect();
6355        let (seg_bytes, _) =
6356            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
6357        let seg_id = cat.load_segment_bytes(seg_bytes).unwrap();
6358        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
6359            .iter()
6360            .map(|(id, _)| {
6361                (
6362                    IndexKey::Int(*id),
6363                    RowLocator::Cold {
6364                        segment_id: seg_id,
6365                        page_offset: 0,
6366                    },
6367                )
6368            })
6369            .collect();
6370        cat.get_mut("users")
6371            .unwrap()
6372            .register_cold_locators("by_id", pairs)
6373            .unwrap();
6374
6375        // Hot tier hits.
6376        assert_eq!(
6377            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
6378                .unwrap(),
6379            make_user_row(1, "alice")
6380        );
6381        assert_eq!(
6382            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
6383                .unwrap(),
6384            make_user_row(2, "bob")
6385        );
6386        // Cold tier hits.
6387        assert_eq!(
6388            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(100))
6389                .unwrap(),
6390            make_user_row(100, "ivy")
6391        );
6392        assert_eq!(
6393            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(200))
6394                .unwrap(),
6395            make_user_row(200, "joe")
6396        );
6397        // Miss in both tiers.
6398        assert!(
6399            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(50))
6400                .is_none()
6401        );
6402    }
6403
6404    #[test]
6405    fn register_cold_locators_rejects_nsw_index() {
6406        let mut cat = Catalog::new();
6407        cat.create_table(TableSchema::new(
6408            "vecs",
6409            vec![
6410                ColumnSchema::new("id", DataType::Int, false),
6411                ColumnSchema::new(
6412                    "v",
6413                    DataType::Vector {
6414                        dim: 4,
6415                        encoding: VecEncoding::F32,
6416                    },
6417                    false,
6418                ),
6419            ],
6420        ))
6421        .unwrap();
6422        let t = cat.get_mut("vecs").unwrap();
6423        t.insert(Row::new(vec![
6424            Value::Int(1),
6425            Value::Vector(vec![1.0, 0.0, 0.0, 0.0]),
6426        ]))
6427        .unwrap();
6428        t.add_nsw_index("by_v".into(), "v", NSW_DEFAULT_M).unwrap();
6429        let err = t
6430            .register_cold_locators(
6431                "by_v",
6432                vec![(
6433                    IndexKey::Int(1),
6434                    RowLocator::Cold {
6435                        segment_id: 0,
6436                        page_offset: 0,
6437                    },
6438                )],
6439            )
6440            .unwrap_err();
6441        // v6.7.1: message switched from "is NSW" to "is not BTree"
6442        // when the Brin variant was added.
6443        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("not BTree")));
6444    }
6445
6446    #[test]
6447    fn load_segment_bytes_rejects_garbage() {
6448        let mut cat = Catalog::new();
6449        let err = cat.load_segment_bytes(vec![0u8; 10]).unwrap_err();
6450        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("segment")));
6451        // Loader doesn't mutate state on error.
6452        assert_eq!(cat.cold_segment_count(), 0);
6453    }
6454
6455    #[test]
6456    fn load_segment_bytes_returns_sequential_ids() {
6457        let mut cat = Catalog::new();
6458        cat.create_table(bigint_pk_users_schema()).unwrap();
6459        let schema = cat.get("users").unwrap().schema.clone();
6460        for batch in 0u32..3 {
6461            let rows: Vec<(u64, Vec<u8>)> = (0u64..4)
6462                .map(|i| {
6463                    let id = u64::from(batch) * 100 + i;
6464                    let row = make_user_row(id.cast_signed(), "x");
6465                    (id, encode_row_body_dense(&row, &schema))
6466                })
6467                .collect();
6468            let (bytes, _) = encode_segment(rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
6469            assert_eq!(cat.load_segment_bytes(bytes).unwrap(), batch);
6470        }
6471        assert_eq!(cat.cold_segment_count(), 3);
6472    }
6473
6474    // --- v5.2 catalog format v9 ----------------------------------
6475
6476    /// Hand-craft a v8 catalog byte stream and confirm the v9 reader
6477    /// accepts it and surfaces every `BTree` entry as a Hot locator.
6478    /// Guards the backward-compat read path: existing v3.0.2 / v4.x
6479    /// snapshots on disk must keep loading after the v5.2 bump.
6480    #[test]
6481    fn v8_catalog_decodes_as_all_hot_under_v9_reader() {
6482        // Build a populated catalog in memory, snapshot it with the
6483        // v9 serializer, then patch the version byte back to 8 and
6484        // strip the v9 BTree payload bytes so the layout matches what
6485        // a real v8 snapshot would have produced on disk. The v9
6486        // reader's version dispatch path then rebuilds the index
6487        // from rows (every locator becomes Hot).
6488        let mut cat = populated_users();
6489        cat.get_mut("users")
6490            .unwrap()
6491            .add_index("by_name".into(), "name")
6492            .unwrap();
6493
6494        // To produce a faithful v8 byte stream we re-encode the same
6495        // catalog with the v8 layout: identical bytes up to (and
6496        // including) the per-index kind tag, but no inline BTree
6497        // entries.
6498        let v8_bytes = encode_as_v8(&cat);
6499        assert_eq!(v8_bytes[FILE_MAGIC.len()], 8, "version byte must be 8");
6500
6501        let restored = Catalog::deserialize(&v8_bytes).expect("v9 reader accepts v8 stream");
6502        let idx = restored
6503            .get("users")
6504            .unwrap()
6505            .index_on(1)
6506            .expect("index_on(1) after restore");
6507        // v8 path always materialises Hot locators (no cold tier
6508        // existed pre-v5.2).
6509        assert_eq!(
6510            idx.lookup_eq(&IndexKey::Text("alice".into())),
6511            &[RowLocator::Hot(0), RowLocator::Hot(2)]
6512        );
6513        // No accidental Cold leak.
6514        for entry in idx.lookup_eq(&IndexKey::Text("alice".into())) {
6515            assert!(entry.is_hot(), "v8 → v9 read must yield Hot only");
6516        }
6517    }
6518
6519    /// Encode `cat` using the v8 layout (no inline `BTree` entries,
6520    /// version byte = 8). Pure test helper — duplicates just enough
6521    /// of `Catalog::serialize` to produce a faithful v8 stream that
6522    /// real v3.0.2 / v4.x deployments wrote.
6523    fn encode_as_v8(cat: &Catalog) -> Vec<u8> {
6524        let mut out = Vec::with_capacity(64);
6525        out.extend_from_slice(FILE_MAGIC);
6526        out.push(8u8);
6527        write_u32(&mut out, u32::try_from(cat.tables.len()).unwrap());
6528        for t in &cat.tables {
6529            write_str(&mut out, &t.schema.name);
6530            write_u16(&mut out, u16::try_from(t.schema.columns.len()).unwrap());
6531            for c in &t.schema.columns {
6532                write_str(&mut out, &c.name);
6533                write_data_type(&mut out, c.ty);
6534                out.push(u8::from(c.nullable));
6535                match &c.default {
6536                    None => out.push(0),
6537                    Some(v) => {
6538                        out.push(1);
6539                        write_value(&mut out, v);
6540                    }
6541                }
6542                out.push(u8::from(c.auto_increment));
6543            }
6544            write_u32(&mut out, u32::try_from(t.rows.len()).unwrap());
6545            for row in &t.rows {
6546                out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
6547            }
6548            write_u16(&mut out, u16::try_from(t.indices.len()).unwrap());
6549            for idx in &t.indices {
6550                write_str(&mut out, &idx.name);
6551                write_u16(&mut out, u16::try_from(idx.column_position).unwrap());
6552                match &idx.kind {
6553                    // v8 BTree wrote only the kind tag; entries
6554                    // rebuild from rows on read.
6555                    IndexKind::BTree(_) => out.push(0),
6556                    IndexKind::Nsw(g) => {
6557                        out.push(1);
6558                        write_u16(&mut out, u16::try_from(g.m).unwrap());
6559                        write_nsw_graph(&mut out, g);
6560                    }
6561                    // v8 had no BRIN; this test-only writer can't
6562                    // serialise BRIN into the legacy format.
6563                    IndexKind::Brin { .. } => panic!(
6564                        "v8 catalog writer cannot serialise BRIN — \
6565                         tests with BRIN indices must use the current writer"
6566                    ),
6567                }
6568            }
6569        }
6570        out
6571    }
6572
6573    /// Build a catalog that carries both hot and cold locators on a
6574    /// `BTree` index, snapshot it through `serialize`, then deserialise
6575    /// and confirm every Cold locator round-trips byte-identical and
6576    /// `lookup_by_pk` resolves through the rebuilt cold-segment
6577    /// registry.
6578    #[test]
6579    fn v9_catalog_round_trip_preserves_cold_locators() {
6580        let mut cat = Catalog::new();
6581        cat.create_table(bigint_pk_users_schema()).unwrap();
6582        let t = cat.get_mut("users").unwrap();
6583        // Hot rows: 1, 2
6584        for (id, name) in [(1i64, "alice"), (2, "bob")] {
6585            t.insert(make_user_row(id, name)).unwrap();
6586        }
6587        t.add_index("by_id".into(), "id").unwrap();
6588        let schema = t.schema.clone();
6589
6590        // Cold rows: 100, 200, 300 — sit in a single segment.
6591        let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe"), (300, "kim")];
6592        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
6593            .iter()
6594            .map(|(id, name)| {
6595                let row = make_user_row(*id, name);
6596                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
6597            })
6598            .collect();
6599        let (seg_bytes, _) =
6600            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
6601        let seg_id = cat.load_segment_bytes(seg_bytes.clone()).unwrap();
6602        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
6603            .iter()
6604            .map(|(id, _)| {
6605                (
6606                    IndexKey::Int(*id),
6607                    RowLocator::Cold {
6608                        segment_id: seg_id,
6609                        page_offset: 0,
6610                    },
6611                )
6612            })
6613            .collect();
6614        cat.get_mut("users")
6615            .unwrap()
6616            .register_cold_locators("by_id", pairs)
6617            .unwrap();
6618
6619        // Snapshot + restore via the v9 codec.
6620        let bytes = cat.serialize();
6621        assert_eq!(bytes[FILE_MAGIC.len()], FILE_VERSION);
6622        let mut restored = Catalog::deserialize(&bytes).expect("v9 round-trip parses");
6623
6624        // Catalog::serialize does not yet emit cold segment file
6625        // bytes (v5.3 manifest is the future home for that). For
6626        // this v9 test the caller side-loads the segment again so
6627        // lookup_by_pk can resolve the Cold locator. The point of
6628        // this assertion is that the locator metadata survived the
6629        // catalog round-trip.
6630        let restored_seg_id = restored.load_segment_bytes(seg_bytes).unwrap();
6631        assert_eq!(restored_seg_id, seg_id);
6632
6633        let idx = restored.get("users").unwrap().index_on(0).unwrap();
6634        // Hot locators round-trip.
6635        assert_eq!(idx.lookup_eq(&IndexKey::Int(1)), &[RowLocator::Hot(0)]);
6636        assert_eq!(idx.lookup_eq(&IndexKey::Int(2)), &[RowLocator::Hot(1)]);
6637        // Cold locators round-trip byte-identical.
6638        for (id, _) in &cold_rows {
6639            assert_eq!(
6640                idx.lookup_eq(&IndexKey::Int(*id)),
6641                &[RowLocator::Cold {
6642                    segment_id: seg_id,
6643                    page_offset: 0,
6644                }]
6645            );
6646        }
6647        // End-to-end: lookup_by_pk resolves both tiers.
6648        assert_eq!(
6649            restored
6650                .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
6651                .unwrap(),
6652            make_user_row(2, "bob")
6653        );
6654        for (id, name) in &cold_rows {
6655            assert_eq!(
6656                restored
6657                    .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
6658                    .unwrap(),
6659                make_user_row(*id, name)
6660            );
6661        }
6662    }
6663
6664    // --- v5.2.1 hot tier byte tracking ---------------------------
6665
6666    /// `row_body_encoded_len` is the perf-critical fast path; pin it
6667    /// against `encode_row_body_dense(...).len()` for every
6668    /// representative cell type so an encoder change can't silently
6669    /// desync the counter.
6670    #[test]
6671    fn row_body_encoded_len_matches_actual_encode_for_all_types() {
6672        let schema = TableSchema::new(
6673            "wide",
6674            vec![
6675                ColumnSchema::new("a", DataType::SmallInt, true),
6676                ColumnSchema::new("b", DataType::Int, false),
6677                ColumnSchema::new("c", DataType::BigInt, false),
6678                ColumnSchema::new("d", DataType::Float, false),
6679                ColumnSchema::new("e", DataType::Bool, false),
6680                ColumnSchema::new("f", DataType::Text, false),
6681                ColumnSchema::new(
6682                    "g",
6683                    DataType::Vector {
6684                        dim: 3,
6685                        encoding: VecEncoding::F32,
6686                    },
6687                    false,
6688                ),
6689                ColumnSchema::new(
6690                    "h",
6691                    DataType::Numeric {
6692                        precision: 18,
6693                        scale: 2,
6694                    },
6695                    false,
6696                ),
6697                ColumnSchema::new("i", DataType::Date, false),
6698                ColumnSchema::new("j", DataType::Timestamp, false),
6699            ],
6700        );
6701        let cases: &[Row] = &[
6702            Row::new(vec![
6703                Value::SmallInt(7),
6704                Value::Int(42),
6705                Value::BigInt(1_000_000),
6706                Value::Float(1.5),
6707                Value::Bool(true),
6708                Value::Text("hello".into()),
6709                Value::Vector(vec![1.0, 2.0, 3.0]),
6710                Value::Numeric {
6711                    scaled: 12345,
6712                    scale: 2,
6713                },
6714                Value::Date(20_000),
6715                Value::Timestamp(1_700_000_000_000_000),
6716            ]),
6717            // NULL in the bitmap, varied text length.
6718            Row::new(vec![
6719                Value::Null,
6720                Value::Int(0),
6721                Value::BigInt(0),
6722                Value::Float(0.0),
6723                Value::Bool(false),
6724                Value::Text(String::new()),
6725                Value::Vector(vec![]),
6726                Value::Numeric {
6727                    scaled: 0,
6728                    scale: 2,
6729                },
6730                Value::Date(0),
6731                Value::Timestamp(0),
6732            ]),
6733            Row::new(vec![
6734                Value::SmallInt(-1),
6735                Value::Int(-1),
6736                Value::BigInt(-1),
6737                Value::Float(-0.5),
6738                Value::Bool(true),
6739                Value::Text("a much longer payload here".into()),
6740                Value::Vector(vec![0.1, 0.2, 0.3]),
6741                Value::Numeric {
6742                    scaled: -999_999_999,
6743                    scale: 2,
6744                },
6745                Value::Date(-1),
6746                Value::Timestamp(-1),
6747            ]),
6748        ];
6749        for row in cases {
6750            let actual = encode_row_body_dense(row, &schema).len();
6751            let fast = row_body_encoded_len(row, &schema);
6752            assert_eq!(actual, fast, "row {row:?}");
6753        }
6754    }
6755
6756    #[test]
6757    fn hot_bytes_grows_on_insert_and_matches_encoded_sum() {
6758        let mut cat = Catalog::new();
6759        cat.create_table(bigint_pk_users_schema()).unwrap();
6760        let t = cat.get_mut("users").unwrap();
6761        assert_eq!(t.hot_bytes(), 0);
6762        let mut expected: u64 = 0;
6763        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
6764            let row = make_user_row(id, name);
6765            expected += encode_row_body_dense(&row, &t.schema).len() as u64;
6766            t.insert(row).unwrap();
6767        }
6768        assert_eq!(t.hot_bytes(), expected);
6769        assert_eq!(cat.hot_tier_bytes(), expected);
6770    }
6771
6772    #[test]
6773    fn hot_bytes_shrinks_on_delete() {
6774        let mut cat = Catalog::new();
6775        cat.create_table(bigint_pk_users_schema()).unwrap();
6776        let t = cat.get_mut("users").unwrap();
6777        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
6778            t.insert(make_user_row(id, name)).unwrap();
6779        }
6780        let before = t.hot_bytes();
6781        // Delete row at position 1 (bob).
6782        let bob_row = make_user_row(2, "bob");
6783        let bob_bytes = encode_row_body_dense(&bob_row, &t.schema).len() as u64;
6784        let removed = t.delete_rows(&[1]);
6785        assert_eq!(removed, 1);
6786        assert_eq!(t.hot_bytes(), before - bob_bytes);
6787    }
6788
6789    #[test]
6790    fn hot_bytes_diffs_on_update_for_variable_width_columns() {
6791        let mut cat = Catalog::new();
6792        cat.create_table(bigint_pk_users_schema()).unwrap();
6793        let t = cat.get_mut("users").unwrap();
6794        t.insert(make_user_row(1, "alice")).unwrap();
6795        let after_insert = t.hot_bytes();
6796        // Update with a longer text payload — bytes must grow exactly
6797        // by the text-length delta.
6798        let new_row = make_user_row(1, "alice-the-longer-name");
6799        let old_len = encode_row_body_dense(&make_user_row(1, "alice"), &t.schema).len() as u64;
6800        let new_len = encode_row_body_dense(&new_row, &t.schema).len() as u64;
6801        t.update_row(0, new_row.values).unwrap();
6802        assert_eq!(t.hot_bytes(), after_insert - old_len + new_len);
6803        assert!(t.hot_bytes() > after_insert, "longer text grew the counter");
6804    }
6805
6806    #[test]
6807    fn hot_bytes_round_trips_through_serialize_deserialize() {
6808        let mut cat = Catalog::new();
6809        cat.create_table(bigint_pk_users_schema()).unwrap();
6810        let t = cat.get_mut("users").unwrap();
6811        for i in 0..10 {
6812            t.insert(make_user_row(i, &alloc::format!("name-{i}")))
6813                .unwrap();
6814        }
6815        let pre = cat.hot_tier_bytes();
6816        let restored = Catalog::deserialize(&cat.serialize()).unwrap();
6817        assert_eq!(restored.hot_tier_bytes(), pre);
6818        assert_eq!(restored.get("users").unwrap().hot_bytes(), pre);
6819    }
6820
6821    // --- v5.2.2 freezer atomic swap -------------------------------
6822
6823    /// Happy path: freeze the first half of a populated hot tier,
6824    /// confirm row counts shift, `hot_bytes` shrinks, and every frozen
6825    /// PK still resolves via `lookup_by_pk` (now through the cold
6826    /// segment registered by the freeze).
6827    #[test]
6828    fn freeze_oldest_to_cold_moves_rows_and_keeps_lookups_working() {
6829        let mut cat = Catalog::new();
6830        cat.create_table(bigint_pk_users_schema()).unwrap();
6831        let t = cat.get_mut("users").unwrap();
6832        for id in 0..10i64 {
6833            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
6834                .unwrap();
6835        }
6836        t.add_index("by_id".into(), "id").unwrap();
6837        let total_bytes_before = t.hot_bytes();
6838
6839        let report = cat
6840            .freeze_oldest_to_cold("users", "by_id", 6)
6841            .expect("freeze succeeds");
6842        assert_eq!(report.frozen_rows, 6);
6843        assert_eq!(report.segment_id, 0);
6844        assert!(report.bytes_freed > 0);
6845        assert!(!report.segment_bytes.is_empty());
6846
6847        let t = cat.get("users").unwrap();
6848        assert_eq!(t.row_count(), 4, "4 hot rows remain (10 - 6 frozen)");
6849        assert_eq!(cat.cold_segment_count(), 1);
6850        // Hot bytes shrank by exactly the freed amount.
6851        assert_eq!(
6852            t.hot_bytes(),
6853            total_bytes_before - report.bytes_freed,
6854            "hot_bytes accounting matches FreezeReport"
6855        );
6856
6857        // Every original PK still resolves — frozen ones via the
6858        // cold segment, kept ones via the (renumbered) hot tier.
6859        for id in 0..10i64 {
6860            let got = cat
6861                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
6862                .unwrap_or_else(|| panic!("PK {id} disappeared after freeze"));
6863            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
6864        }
6865    }
6866
6867    /// Two successive freezes on the same index must preserve the
6868    /// first batch's cold locators when the second freeze runs.
6869    /// Catches the `rebuild_indices` wipe-Cold-on-delete bug that
6870    /// `collect_cold_locators` / re-register guards against.
6871    #[test]
6872    fn freeze_twice_preserves_prior_cold_locators() {
6873        let mut cat = Catalog::new();
6874        cat.create_table(bigint_pk_users_schema()).unwrap();
6875        let t = cat.get_mut("users").unwrap();
6876        for id in 0..12i64 {
6877            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
6878                .unwrap();
6879        }
6880        t.add_index("by_id".into(), "id").unwrap();
6881
6882        cat.freeze_oldest_to_cold("users", "by_id", 4)
6883            .expect("first freeze ok");
6884        cat.freeze_oldest_to_cold("users", "by_id", 4)
6885            .expect("second freeze ok");
6886
6887        assert_eq!(cat.get("users").unwrap().row_count(), 4);
6888        assert_eq!(cat.cold_segment_count(), 2);
6889        // All 12 PKs still resolve — first 4 via segment 0,
6890        // next 4 via segment 1, last 4 still hot.
6891        for id in 0..12i64 {
6892            let got = cat
6893                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
6894                .unwrap_or_else(|| panic!("PK {id} not resolvable after two freezes"));
6895            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
6896        }
6897    }
6898
6899    /// Validation guard tests. Each must return `Err` and **not
6900    /// mutate the catalog** — the API is all-or-nothing.
6901    #[test]
6902    fn freeze_oldest_to_cold_rejects_invalid_input() {
6903        let mut cat = Catalog::new();
6904        cat.create_table(bigint_pk_users_schema()).unwrap();
6905        let t = cat.get_mut("users").unwrap();
6906        for id in 0..3i64 {
6907            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
6908                .unwrap();
6909        }
6910        t.add_index("by_id".into(), "id").unwrap();
6911
6912        // max_rows == 0
6913        assert!(matches!(
6914            cat.freeze_oldest_to_cold("users", "by_id", 0),
6915            Err(StorageError::Corrupt(_))
6916        ));
6917        // table missing
6918        assert!(matches!(
6919            cat.freeze_oldest_to_cold("missing", "by_id", 1),
6920            Err(StorageError::Corrupt(_))
6921        ));
6922        // index missing
6923        assert!(matches!(
6924            cat.freeze_oldest_to_cold("users", "no_such_index", 1),
6925            Err(StorageError::Corrupt(_))
6926        ));
6927        // max_rows > row_count
6928        assert!(matches!(
6929            cat.freeze_oldest_to_cold("users", "by_id", 999),
6930            Err(StorageError::Corrupt(_))
6931        ));
6932        // Catalog still untouched.
6933        assert_eq!(cat.get("users").unwrap().row_count(), 3);
6934        assert_eq!(cat.cold_segment_count(), 0);
6935    }
6936
6937    /// Freeze with a non-integer PK column must surface a clear
6938    /// error (Text PKs land in v5.5+).
6939    #[test]
6940    fn freeze_oldest_to_cold_rejects_non_integer_pk() {
6941        let mut cat = Catalog::new();
6942        cat.create_table(TableSchema::new(
6943            "by_name",
6944            vec![
6945                ColumnSchema::new("name", DataType::Text, false),
6946                ColumnSchema::new("payload", DataType::BigInt, false),
6947            ],
6948        ))
6949        .unwrap();
6950        let t = cat.get_mut("by_name").unwrap();
6951        t.insert(Row::new(vec![Value::Text("a".into()), Value::BigInt(1)]))
6952            .unwrap();
6953        t.add_index("by_n".into(), "name").unwrap();
6954        let err = cat
6955            .freeze_oldest_to_cold("by_name", "by_n", 1)
6956            .expect_err("non-integer PK rejected");
6957        match err {
6958            StorageError::Corrupt(s) => assert!(
6959                s.contains("non-integer"),
6960                "error message names the constraint: {s}"
6961            ),
6962            other => panic!("expected Corrupt, got {other:?}"),
6963        }
6964        // Catalog untouched.
6965        assert_eq!(cat.get("by_name").unwrap().row_count(), 1);
6966        assert_eq!(cat.cold_segment_count(), 0);
6967    }
6968
6969    /// Hot-tier rows after the freeze must keep their secondary-
6970    /// index lookups working — `delete_rows` shifts positions, and
6971    /// `rebuild_indices` must regenerate Hot locators at the new
6972    /// indices.
6973    #[test]
6974    fn freeze_keeps_remaining_hot_rows_addressable_via_secondary_index() {
6975        let mut cat = Catalog::new();
6976        cat.create_table(bigint_pk_users_schema()).unwrap();
6977        let t = cat.get_mut("users").unwrap();
6978        for id in 0..6i64 {
6979            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
6980                .unwrap();
6981        }
6982        t.add_index("by_id".into(), "id").unwrap();
6983        t.add_index("by_name".into(), "name").unwrap();
6984
6985        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
6986
6987        // Remaining hot rows: id 3, 4, 5. They moved to positions
6988        // 0, 1, 2 inside `self.rows`; the `by_name` index must now
6989        // resolve them via fresh Hot locators.
6990        let idx = cat.get("users").unwrap().index_on(1).unwrap();
6991        let got = idx.lookup_eq(&IndexKey::Text("u-4".into()));
6992        assert_eq!(got.len(), 1);
6993        assert!(got[0].is_hot(), "kept-hot rows still surface as Hot");
6994        match got[0] {
6995            RowLocator::Hot(i) => {
6996                // The 4th-inserted row was at position 4; after
6997                // dropping positions 0..3 it sits at position 1.
6998                assert_eq!(i, 1);
6999            }
7000            RowLocator::Cold { .. } => unreachable!(),
7001        }
7002    }
7003
7004    // --- v5.2.3 promote-on-write primitives ----------------------
7005
7006    /// Build a populated catalog with the first N rows frozen, then
7007    /// run `promote_cold_row` and verify the row crossed tiers
7008    /// correctly: the cold locator is retired, a fresh Hot locator
7009    /// appears, `lookup_by_pk` returns the row from the hot tier, and
7010    /// `hot_bytes` grew by the row's encoded byte length.
7011    #[test]
7012    fn promote_cold_row_pulls_frozen_row_back_to_hot_tier() {
7013        let mut cat = Catalog::new();
7014        cat.create_table(bigint_pk_users_schema()).unwrap();
7015        let t = cat.get_mut("users").unwrap();
7016        for id in 0..6i64 {
7017            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7018                .unwrap();
7019        }
7020        t.add_index("by_id".into(), "id").unwrap();
7021        // Freeze first 4 rows (ids 0..3). After: hot rows = 4, 5 at
7022        // positions 0, 1; cold locators for keys 0..3.
7023        cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
7024        let hot_bytes_before = cat.get("users").unwrap().hot_bytes();
7025
7026        // Promote PK=2 — it lives in segment 0 as a cold row.
7027        let new_idx = cat
7028            .promote_cold_row("users", "by_id", &IndexKey::Int(2))
7029            .expect("promote ok")
7030            .expect("PK 2 was cold");
7031        assert_eq!(
7032            new_idx, 2,
7033            "promoted row appended after the 2 surviving hot rows"
7034        );
7035
7036        let t = cat.get("users").unwrap();
7037        assert_eq!(t.row_count(), 3, "hot tier grew from 2 to 3");
7038        // Hot-bytes climbed by exactly one row's encoded length.
7039        let row = make_user_row(2, "u-2");
7040        let row_len = encode_row_body_dense(&row, &t.schema).len() as u64;
7041        assert_eq!(t.hot_bytes(), hot_bytes_before + row_len);
7042
7043        // The index now reports a Hot locator (the freshly inserted
7044        // row) — no Cold locator left for PK 2.
7045        let entries = t.index_on(0).unwrap().lookup_eq(&IndexKey::Int(2));
7046        assert_eq!(entries.len(), 1, "exactly one locator per key");
7047        assert!(entries[0].is_hot(), "promote retired the Cold locator");
7048        // End-to-end: lookup_by_pk still returns the row body.
7049        assert_eq!(
7050            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
7051                .unwrap(),
7052            row
7053        );
7054        // Other cold rows untouched — still resolvable through the
7055        // segment.
7056        assert_eq!(
7057            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(0))
7058                .unwrap(),
7059            make_user_row(0, "u-0")
7060        );
7061    }
7062
7063    /// `promote_cold_row` on a key that's already hot (or absent)
7064    /// returns `Ok(None)` — not an error. The caller falls back to
7065    /// the hot-only update/delete path.
7066    #[test]
7067    fn promote_cold_row_returns_none_when_key_is_not_cold() {
7068        let mut cat = Catalog::new();
7069        cat.create_table(bigint_pk_users_schema()).unwrap();
7070        let t = cat.get_mut("users").unwrap();
7071        t.insert(make_user_row(7, "alice")).unwrap();
7072        t.add_index("by_id".into(), "id").unwrap();
7073
7074        // Hot-only key.
7075        assert!(
7076            cat.promote_cold_row("users", "by_id", &IndexKey::Int(7))
7077                .unwrap()
7078                .is_none()
7079        );
7080        // Absent key.
7081        assert!(
7082            cat.promote_cold_row("users", "by_id", &IndexKey::Int(99))
7083                .unwrap()
7084                .is_none()
7085        );
7086        // Catalog untouched on both no-op paths.
7087        assert_eq!(cat.get("users").unwrap().row_count(), 1);
7088        assert_eq!(cat.cold_segment_count(), 0);
7089    }
7090
7091    /// `shadow_cold_row` removes every Cold locator for a key on a
7092    /// `BTree` index. After the shadow, `lookup_by_pk` for that key
7093    /// returns None (the row data still sits in the segment file,
7094    /// but it's now garbage; compaction will reclaim it later).
7095    #[test]
7096    fn shadow_cold_row_removes_cold_locators_and_drops_lookup() {
7097        let mut cat = Catalog::new();
7098        cat.create_table(bigint_pk_users_schema()).unwrap();
7099        let t = cat.get_mut("users").unwrap();
7100        for id in 0..5i64 {
7101            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7102                .unwrap();
7103        }
7104        t.add_index("by_id".into(), "id").unwrap();
7105        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7106
7107        // Shadow PK=1 — pre-shadow lookup hits the cold tier.
7108        assert!(
7109            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
7110                .is_some(),
7111            "frozen PK resolves before shadow"
7112        );
7113        let removed = cat
7114            .shadow_cold_row("users", "by_id", &IndexKey::Int(1))
7115            .unwrap();
7116        assert_eq!(removed, 1, "exactly one cold locator retired");
7117
7118        // Post-shadow: lookup misses, even though the row still
7119        // exists in segment 0.
7120        assert!(
7121            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
7122                .is_none(),
7123            "shadowed key no longer resolves"
7124        );
7125        // Other cold keys still resolve.
7126        assert_eq!(
7127            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(0))
7128                .unwrap(),
7129            make_user_row(0, "u-0")
7130        );
7131        assert_eq!(
7132            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
7133                .unwrap(),
7134            make_user_row(2, "u-2")
7135        );
7136    }
7137
7138    /// `shadow_cold_row` returns 0 (not Err) for keys with only Hot
7139    /// entries or no entries — the engine's DELETE path uses this
7140    /// signal to decide whether the cold-tier shadow path consumed
7141    /// the work.
7142    #[test]
7143    fn shadow_cold_row_returns_zero_when_key_is_not_cold() {
7144        let mut cat = Catalog::new();
7145        cat.create_table(bigint_pk_users_schema()).unwrap();
7146        let t = cat.get_mut("users").unwrap();
7147        t.insert(make_user_row(1, "alice")).unwrap();
7148        t.add_index("by_id".into(), "id").unwrap();
7149        assert_eq!(
7150            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(1))
7151                .unwrap(),
7152            0,
7153            "hot-only key drops no cold locators"
7154        );
7155        assert_eq!(
7156            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(999))
7157                .unwrap(),
7158            0,
7159            "absent key drops no cold locators"
7160        );
7161        assert_eq!(cat.get("users").unwrap().row_count(), 1);
7162    }
7163
7164    /// Validation guards on both promote / shadow primitives.
7165    #[test]
7166    fn promote_and_shadow_reject_invalid_inputs() {
7167        let mut cat = Catalog::new();
7168        cat.create_table(bigint_pk_users_schema()).unwrap();
7169        let t = cat.get_mut("users").unwrap();
7170        t.insert(make_user_row(1, "alice")).unwrap();
7171        t.add_index("by_id".into(), "id").unwrap();
7172
7173        // Missing table.
7174        assert!(matches!(
7175            cat.promote_cold_row("missing", "by_id", &IndexKey::Int(1)),
7176            Err(StorageError::Corrupt(_))
7177        ));
7178        assert!(matches!(
7179            cat.shadow_cold_row("missing", "by_id", &IndexKey::Int(1)),
7180            Err(StorageError::Corrupt(_))
7181        ));
7182        // Missing index.
7183        assert!(matches!(
7184            cat.promote_cold_row("users", "no_such_index", &IndexKey::Int(1)),
7185            Err(StorageError::Corrupt(_))
7186        ));
7187        assert!(matches!(
7188            cat.shadow_cold_row("users", "no_such_index", &IndexKey::Int(1)),
7189            Err(StorageError::Corrupt(_))
7190        ));
7191    }
7192
7193    // --- v6.7.4 parallel-freezer slice/commit API -----------------
7194
7195    /// One slice covering the entire freeze produces the same
7196    /// catalog state as the single-threaded `freeze_oldest_to_cold`
7197    /// — segment id, frozen row count, hot byte delta, and every
7198    /// post-freeze PK lookup match exactly.
7199    #[test]
7200    fn commit_freeze_slices_single_slice_matches_freeze_oldest() {
7201        let mut a = Catalog::new();
7202        let mut b = Catalog::new();
7203        for cat in [&mut a, &mut b] {
7204            cat.create_table(bigint_pk_users_schema()).unwrap();
7205            let t = cat.get_mut("users").unwrap();
7206            for id in 0..10i64 {
7207                t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7208                    .unwrap();
7209            }
7210            t.add_index("by_id".into(), "id").unwrap();
7211        }
7212        let single = a.freeze_oldest_to_cold("users", "by_id", 6).unwrap();
7213        let slice = b
7214            .prepare_freeze_slice("users", "by_id", 0..6)
7215            .expect("prepare");
7216        let parallel = b
7217            .commit_freeze_slices("users", "by_id", alloc::vec![slice])
7218            .expect("commit");
7219        assert_eq!(single.segment_id, parallel.segment_id);
7220        assert_eq!(single.frozen_rows, parallel.frozen_rows);
7221        assert_eq!(single.bytes_freed, parallel.bytes_freed);
7222        assert_eq!(single.segment_bytes, parallel.segment_bytes);
7223        // Same post-freeze lookup behaviour on both catalogs.
7224        for id in 0..10i64 {
7225            assert_eq!(
7226                a.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
7227                b.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
7228                "PK {id} differs after single vs slice freeze"
7229            );
7230        }
7231    }
7232
7233    /// Two slices covering disjoint halves of the freeze produce
7234    /// the same merged segment as one slice covering the full
7235    /// range. The k-way merge preserves PK ordering even when
7236    /// slice halves alternate.
7237    #[test]
7238    fn commit_freeze_slices_two_slices_match_single_slice() {
7239        let mut a = Catalog::new();
7240        let mut b = Catalog::new();
7241        for cat in [&mut a, &mut b] {
7242            cat.create_table(bigint_pk_users_schema()).unwrap();
7243            let t = cat.get_mut("users").unwrap();
7244            // Random-ish PKs so the per-slice sort actually has
7245            // work to do (and slice halves carry interleaved keys).
7246            for id in [3, 7, 1, 9, 5, 0, 8, 4, 2, 6].iter().copied() {
7247                t.insert(make_user_row(id as i64, &alloc::format!("u-{id}")))
7248                    .unwrap();
7249            }
7250            t.add_index("by_id".into(), "id").unwrap();
7251        }
7252        let single = a
7253            .prepare_freeze_slice("users", "by_id", 0..8)
7254            .expect("prepare");
7255        let one = a
7256            .commit_freeze_slices("users", "by_id", alloc::vec![single])
7257            .expect("commit one");
7258        let s1 = b
7259            .prepare_freeze_slice("users", "by_id", 0..4)
7260            .expect("prepare s1");
7261        let s2 = b
7262            .prepare_freeze_slice("users", "by_id", 4..8)
7263            .expect("prepare s2");
7264        let two = b
7265            .commit_freeze_slices("users", "by_id", alloc::vec![s1, s2])
7266            .expect("commit two");
7267        assert_eq!(one.segment_bytes, two.segment_bytes);
7268        assert_eq!(one.frozen_rows, two.frozen_rows);
7269        // Every PK that survived freeze (hot or cold) resolves on
7270        // both catalogs.
7271        for id in 0..10i64 {
7272            assert_eq!(
7273                a.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
7274                b.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
7275                "PK {id} differs after one-slice vs two-slice freeze"
7276            );
7277        }
7278    }
7279
7280    /// Gap between slices → error before any mutation lands.
7281    #[test]
7282    fn commit_freeze_slices_rejects_gap() {
7283        let mut cat = Catalog::new();
7284        cat.create_table(bigint_pk_users_schema()).unwrap();
7285        let t = cat.get_mut("users").unwrap();
7286        for id in 0..6i64 {
7287            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7288                .unwrap();
7289        }
7290        t.add_index("by_id".into(), "id").unwrap();
7291        let s1 = cat.prepare_freeze_slice("users", "by_id", 0..2).unwrap();
7292        let s2 = cat.prepare_freeze_slice("users", "by_id", 3..5).unwrap();
7293        assert!(matches!(
7294            cat.commit_freeze_slices("users", "by_id", alloc::vec![s1, s2]),
7295            Err(StorageError::Corrupt(_))
7296        ));
7297        // Catalog untouched.
7298        assert_eq!(cat.cold_segment_count(), 0);
7299        assert_eq!(cat.get("users").unwrap().row_count(), 6);
7300    }
7301
7302    /// Empty slice list → no-op success, catalog untouched.
7303    #[test]
7304    fn commit_freeze_slices_empty_is_noop() {
7305        let mut cat = Catalog::new();
7306        cat.create_table(bigint_pk_users_schema()).unwrap();
7307        let t = cat.get_mut("users").unwrap();
7308        for id in 0..3i64 {
7309            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7310                .unwrap();
7311        }
7312        t.add_index("by_id".into(), "id").unwrap();
7313        let report = cat
7314            .commit_freeze_slices("users", "by_id", Vec::new())
7315            .unwrap();
7316        assert_eq!(report.frozen_rows, 0);
7317        assert_eq!(cat.cold_segment_count(), 0);
7318        assert_eq!(cat.get("users").unwrap().row_count(), 3);
7319    }
7320
7321    // --- v6.7.3 cold-segment compaction ---------------------------
7322
7323    /// Two small cold segments merge into a single larger one. The
7324    /// merged segment carries every cold-resident row; the source
7325    /// slots are tombstoned; every PK still resolves through the
7326    /// new merged segment via `lookup_by_pk`.
7327    #[test]
7328    fn compact_merges_small_segments_storage_unit() {
7329        let mut cat = Catalog::new();
7330        cat.create_table(bigint_pk_users_schema()).unwrap();
7331        let t = cat.get_mut("users").unwrap();
7332        for id in 0..8i64 {
7333            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7334                .unwrap();
7335        }
7336        t.add_index("by_id".into(), "id").unwrap();
7337        // Two freezes of 3 rows each → two small cold segments.
7338        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7339        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7340        assert_eq!(cat.cold_segment_count(), 2);
7341        assert_eq!(cat.cold_segment_slot_count(), 2);
7342
7343        // Pick a threshold larger than either segment's size so
7344        // both qualify.
7345        let max_seg_bytes = cat
7346            .cold_segment_ids_global()
7347            .iter()
7348            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
7349            .max()
7350            .unwrap();
7351        let target = max_seg_bytes + 1;
7352
7353        let report = cat
7354            .compact_cold_segments("users", "by_id", target)
7355            .expect("compact succeeds");
7356        assert_eq!(report.sources.len(), 2);
7357        let merged_id = report.merged_segment_id.expect("merge happened");
7358        assert_eq!(report.merged_rows, 6);
7359        assert_eq!(report.deleted_rows_pruned, 0);
7360        assert!(!report.merged_segment_bytes.is_empty());
7361
7362        // Active count drops back to 1; slot count grew to 3
7363        // (2 sources tombstoned + 1 merged appended).
7364        assert_eq!(cat.cold_segment_count(), 1);
7365        assert_eq!(cat.cold_segment_slot_count(), 3);
7366        assert_eq!(cat.cold_segment_ids_global(), alloc::vec![merged_id]);
7367
7368        // Every PK that was frozen still resolves (via the merged
7369        // segment); the 2 hot rows still resolve too.
7370        for id in 0..8i64 {
7371            let got = cat
7372                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
7373                .unwrap_or_else(|| panic!("PK {id} lost after compaction"));
7374            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
7375        }
7376    }
7377
7378    /// DELETE'd-but-frozen rows are dropped during the merge. Set
7379    /// up two small segments, then shadow one row in each; the
7380    /// merged segment must NOT carry the shadowed rows.
7381    #[test]
7382    fn compact_drops_shadowed_cold_rows() {
7383        let mut cat = Catalog::new();
7384        cat.create_table(bigint_pk_users_schema()).unwrap();
7385        let t = cat.get_mut("users").unwrap();
7386        for id in 0..6i64 {
7387            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7388                .unwrap();
7389        }
7390        t.add_index("by_id".into(), "id").unwrap();
7391        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7392        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7393        // Shadow PK 1 (in seg 0) + PK 4 (in seg 1).
7394        assert_eq!(
7395            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(1))
7396                .unwrap(),
7397            1
7398        );
7399        assert_eq!(
7400            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(4))
7401                .unwrap(),
7402            1
7403        );
7404
7405        let max_seg_bytes = cat
7406            .cold_segment_ids_global()
7407            .iter()
7408            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
7409            .max()
7410            .unwrap();
7411        let report = cat
7412            .compact_cold_segments("users", "by_id", max_seg_bytes + 1)
7413            .expect("compact succeeds");
7414        assert_eq!(report.sources.len(), 2);
7415        assert_eq!(report.merged_rows, 4, "6 frozen − 2 shadowed = 4 live");
7416        assert_eq!(report.deleted_rows_pruned, 2);
7417
7418        // PK 1 and 4 stay invisible after compact.
7419        for shadowed in [1i64, 4i64] {
7420            assert!(
7421                cat.lookup_by_pk("users", "by_id", &IndexKey::Int(shadowed))
7422                    .is_none(),
7423                "shadowed PK {shadowed} must remain invisible after compact"
7424            );
7425        }
7426        // The other 4 frozen rows resolve.
7427        for live in [0i64, 2, 3, 5] {
7428            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(live))
7429                .unwrap_or_else(|| panic!("live PK {live} lost after compact"));
7430        }
7431    }
7432
7433    /// No-op cases: 0 or 1 candidate segment under the threshold
7434    /// leaves the catalog untouched.
7435    #[test]
7436    fn compact_is_noop_below_two_candidates() {
7437        let mut cat = Catalog::new();
7438        cat.create_table(bigint_pk_users_schema()).unwrap();
7439        let t = cat.get_mut("users").unwrap();
7440        for id in 0..6i64 {
7441            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7442                .unwrap();
7443        }
7444        t.add_index("by_id".into(), "id").unwrap();
7445        // 0 cold segments.
7446        let report = cat
7447            .compact_cold_segments("users", "by_id", 1 << 30)
7448            .expect("noop ok");
7449        assert!(report.merged_segment_id.is_none());
7450        assert!(report.sources.is_empty());
7451
7452        // 1 cold segment — still a no-op (need ≥2 to merge).
7453        cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
7454        let report = cat
7455            .compact_cold_segments("users", "by_id", 1 << 30)
7456            .expect("noop ok");
7457        assert!(report.merged_segment_id.is_none());
7458        assert_eq!(cat.cold_segment_count(), 1);
7459
7460        // Threshold too small to cover the single segment → still
7461        // no-op.
7462        let report = cat
7463            .compact_cold_segments("users", "by_id", 1)
7464            .expect("noop ok");
7465        assert!(report.merged_segment_id.is_none());
7466        assert_eq!(cat.cold_segment_count(), 1);
7467    }
7468
7469    /// Manifest-style atomicity: a Catalog snapshot taken AFTER
7470    /// `compact_cold_segments` returns must round-trip with the
7471    /// post-compact BTree state, while the cold-tier registry is
7472    /// re-derived from the source-of-truth manifest (=
7473    /// `load_segment_bytes_at` with the merged id + the still-on-
7474    /// disk merged bytes). This mirrors the boot path: catalog
7475    /// snapshot + cold-segment files = full state.
7476    #[test]
7477    fn compact_swap_survives_catalog_roundtrip_via_load_at() {
7478        let mut cat = Catalog::new();
7479        cat.create_table(bigint_pk_users_schema()).unwrap();
7480        let t = cat.get_mut("users").unwrap();
7481        for id in 0..6i64 {
7482            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7483                .unwrap();
7484        }
7485        t.add_index("by_id".into(), "id").unwrap();
7486        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7487        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7488        let max_seg_bytes = cat
7489            .cold_segment_ids_global()
7490            .iter()
7491            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
7492            .max()
7493            .unwrap();
7494        let report = cat
7495            .compact_cold_segments("users", "by_id", max_seg_bytes + 1)
7496            .expect("compact ok");
7497        let merged_id = report.merged_segment_id.unwrap();
7498
7499        // Serialise the catalog (BTree index points at merged_id
7500        // now) and the merged segment bytes; pretend to crash; on
7501        // restart, re-hydrate the catalog and reload only the
7502        // merged segment at its baked-in id.
7503        let cat_bytes = cat.serialize();
7504        let merged_bytes = report.merged_segment_bytes.clone();
7505
7506        let mut restored = Catalog::deserialize(&cat_bytes).expect("deserialize ok");
7507        restored
7508            .load_segment_bytes_at(merged_id, merged_bytes)
7509            .expect("reload merged ok");
7510
7511        // All 6 PKs still resolve through the restored merged segment.
7512        for id in 0..6i64 {
7513            let got = restored
7514                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
7515                .unwrap_or_else(|| panic!("PK {id} lost across roundtrip"));
7516            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
7517        }
7518        // No source slot ever rehydrates — confirmed by
7519        // `cold_segment_count` matching only the merged segment.
7520        assert_eq!(restored.cold_segment_count(), 1);
7521    }
7522
7523    /// `load_segment_bytes_at` refuses to stomp an occupied slot
7524    /// and pads with `None` when the target id is past the end.
7525    #[test]
7526    fn load_segment_bytes_at_pads_and_rejects_collision() {
7527        let mut cat = Catalog::new();
7528        cat.create_table(bigint_pk_users_schema()).unwrap();
7529        let t = cat.get_mut("users").unwrap();
7530        for id in 0..4i64 {
7531            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7532                .unwrap();
7533        }
7534        t.add_index("by_id".into(), "id").unwrap();
7535        let report = cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
7536        let bytes_seg0 = report.segment_bytes.clone();
7537
7538        // Pad to id=5 (slots 1..5 are None, slot 5 holds the
7539        // segment loaded back). The slot count jumps, the active
7540        // count is now 2 (seg 0 + seg 5).
7541        cat.load_segment_bytes_at(5, bytes_seg0.clone())
7542            .expect("pad + load ok");
7543        assert_eq!(cat.cold_segment_slot_count(), 6);
7544        assert_eq!(cat.cold_segment_count(), 2);
7545
7546        // Re-loading at the same id collides.
7547        assert!(matches!(
7548            cat.load_segment_bytes_at(5, bytes_seg0.clone()),
7549            Err(StorageError::Corrupt(_))
7550        ));
7551        // Re-loading at id 0 (already occupied) also collides.
7552        assert!(matches!(
7553            cat.load_segment_bytes_at(0, bytes_seg0),
7554            Err(StorageError::Corrupt(_))
7555        ));
7556    }
7557
7558    /// Round trip: freeze → promote → re-freeze. The same PK can
7559    /// migrate hot ↔ cold multiple times. After two cycles only the
7560    /// final Hot locator should be live.
7561    #[test]
7562    fn promote_then_refreeze_does_not_leave_orphan_locators() {
7563        let mut cat = Catalog::new();
7564        cat.create_table(bigint_pk_users_schema()).unwrap();
7565        let t = cat.get_mut("users").unwrap();
7566        for id in 0..4i64 {
7567            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7568                .unwrap();
7569        }
7570        t.add_index("by_id".into(), "id").unwrap();
7571
7572        // Cycle 1: freeze first 2 rows, then promote PK 0.
7573        cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
7574        let promoted = cat
7575            .promote_cold_row("users", "by_id", &IndexKey::Int(0))
7576            .unwrap();
7577        assert!(promoted.is_some());
7578        let entries_after_promote = cat
7579            .get("users")
7580            .unwrap()
7581            .index_on(0)
7582            .unwrap()
7583            .lookup_eq(&IndexKey::Int(0))
7584            .to_vec();
7585        assert_eq!(entries_after_promote.len(), 1);
7586        assert!(entries_after_promote[0].is_hot());
7587
7588        // Cycle 2: freeze the front rows again. PK 0 is now at
7589        // position 2 (after the survivors); it could still go cold
7590        // again on a future freeze depending on policy, but the
7591        // current "first N positions" policy leaves it alone here.
7592        // What matters: prior cold locators for PKs 0..1 are gone,
7593        // PKs 2..3 still resolve through their original segments.
7594        for id in [2i64, 3] {
7595            assert_eq!(
7596                cat.lookup_by_pk("users", "by_id", &IndexKey::Int(id))
7597                    .unwrap(),
7598                make_user_row(id, &alloc::format!("u-{id}"))
7599            );
7600        }
7601    }
7602}