Skip to main content

spg_storage/
lib.rs

1//! In-memory storage primitives.
2//!
3//! v0.3 is intentionally simple: a flat catalog of tables, each holding rows
4//! as `Vec<Value>` (positional, matching the table's `TableSchema`). No MVCC,
5//! no on-disk format — those land in later milestones.
6#![no_std]
7// v3.3.2 NEON path for l2_distance_sq (aarch64 only). Scoped allow:
8// `unsafe_code = "deny"` at workspace level stays in force for every
9// other crate.
10#![cfg_attr(target_arch = "aarch64", allow(unsafe_code))]
11
12extern crate alloc;
13
14pub mod bloom;
15pub mod halfvec;
16pub mod persistent;
17pub mod persistent_btree;
18pub mod quantize;
19pub mod row_locator;
20pub mod segment;
21
22pub use self::bloom::{BloomError, BloomFilter};
23pub use self::row_locator::{RowLocator, RowLocatorError};
24pub use self::segment::{
25    BRIN_SIDECAR_MAGIC, BrinSummary, OwnedSegment, SEGMENT_COMPRESS_ALGO_LZSS,
26    SEGMENT_COMPRESS_ALGO_NONE, SEGMENT_MAGIC, SEGMENT_MAGIC_V2, SEGMENT_PAGE_BYTES, SegmentError,
27    SegmentMeta, SegmentReader, derive_brin_summaries, encode_segment, wrap_v2_envelope,
28    wrap_v2_envelope_with_brin,
29};
30
31use alloc::collections::{BTreeMap, BTreeSet};
32use alloc::format;
33use alloc::string::String;
34use alloc::sync::Arc;
35use alloc::vec::Vec;
36use core::fmt;
37
38use self::persistent::PersistentVec;
39use self::persistent_btree::PersistentBTreeMap;
40
/// In-cell encoding for `DataType::Vector`. Mirrors
/// `spg_sql::ast::VecEncoding` — kept here so storage stays
/// dep-free of `spg-sql`. The engine bridges between the two
/// at DDL-execution time.
///
/// `F32` is the `Default` encoding (see the `#[default]` marker below).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum VecEncoding {
    /// The pre-v6 default: each cell holds a raw `Vec<f32>`.
    #[default]
    F32,
    /// v6.0.1: each cell stores `Sq8Vector { min, max, bytes: Vec<u8> }`;
    /// 4× compression vs `F32` with recall@10 ≥ 0.95 on natural
    /// embeddings (Gaussian / unit-sphere corpora).
    Sq8,
    /// v6.0.3 (DDL keyword `HALF`): each element is stored as IEEE-754
    /// binary16; 2× compression and bit-exact dequantise.
    F16,
}
59
60impl fmt::Display for VecEncoding {
61    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
62        match self {
63            Self::F32 => f.write_str("F32"),
64            Self::Sq8 => f.write_str("SQ8"),
65            Self::F16 => f.write_str("HALF"),
66        }
67    }
68}
69
/// Runtime type tags. `Vector { dim, encoding }` / `Varchar(max)` /
/// `Char(size)` are parameterised; the parameter travels with both
/// the column schema and the on-wire serialised representation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataType {
    /// 16-bit signed. Backed by `Value::SmallInt(i16)`; arithmetic that
    /// would overflow surfaces as a type error at INSERT time.
    SmallInt,
    /// 32-bit signed.
    Int,
    /// 64-bit signed.
    BigInt,
    /// `f64` (PG double precision).
    Float,
    /// Unbounded text.
    Text,
    /// `VARCHAR(n)` — same byte representation as `Text`, but INSERT
    /// rejects values longer than `n` Unicode characters.
    Varchar(u32),
    /// `CHAR(n)` — same representation as `Text`, but INSERT right-pads
    /// with U+0020 to exactly `n` Unicode characters (or rejects when
    /// the input is already longer).
    Char(u32),
    /// Boolean.
    Bool,
    /// pgvector-style fixed-dimension vector. `encoding` selects
    /// the in-cell representation (`F32` = pre-v6 raw f32 buffer;
    /// `Sq8` = v6.0.1 8-bit scalar-quantised; `F16` = v6.0.3
    /// IEEE-754 binary16, DDL keyword `HALF`). The DDL grammar
    /// surfaces encoding via the optional `USING <encoding>`
    /// clause: `VECTOR(128) USING SQ8` / `VECTOR(128) USING HALF`.
    Vector {
        dim: u32,
        encoding: VecEncoding,
    },
    /// `NUMERIC(precision, scale)` — exact fixed-point decimal stored as
    /// a scaled `i128`. `precision` caps total decimal digits, `scale`
    /// fixes digits after the decimal point. v1.12 supports up to
    /// precision 38 (the i128-safe ceiling). `NUMERIC` and `NUMERIC(p)`
    /// surface as `Numeric { precision: p, scale: 0 }`.
    Numeric {
        precision: u8,
        scale: u8,
    },
    /// `DATE` — calendar date with day precision, stored as `i32` days
    /// since the Unix epoch (1970-01-01).
    Date,
    /// `TIMESTAMP` (a.k.a. `MySQL` `DATETIME`) — instant with microsecond
    /// precision, stored as `i64` microseconds since the Unix epoch.
    Timestamp,
    /// v7.9.2 `TIMESTAMPTZ` — bit-identical to `Timestamp` on disk
    /// (i64 microseconds, UTC by convention). Carried as a distinct
    /// type tag so the PG-wire layer can advertise OID 1184 (PG's
    /// `timestamp with time zone`) and `sqlx`/`pgx`/JDBC clients
    /// decode into their TZ-aware datetime types. The internal
    /// semantics are unchanged: SPG never stored per-row offsets,
    /// and neither did PG — `TIMESTAMPTZ` in PG is also UTC i64.
    Timestamptz,
    /// `INTERVAL` — calendar-aware span (months + microseconds). v2.11
    /// supports INTERVAL only as a runtime intermediate (literals,
    /// arithmetic results); on-disk encoding is rejected so this branch
    /// can't appear in a `ColumnSchema`.
    Interval,
    /// v4.9: `JSON` — text-backed JSON document. We don't parse
    /// the content (no path operators or jsonb functions yet) —
    /// the column accepts any TEXT-compatible value and round-trips
    /// it verbatim. PG OID 114 on the wire.
    Json,
    /// v7.9.0: `JSONB` — semantically identical to `Json` on
    /// the storage side (same `Value::Json` cells, same
    /// row codec), but advertised as PG OID 3802 on the wire
    /// so `sqlx`-style clients that bind `jsonb` columns
    /// decode correctly. mailrs migration blocker #3.
    Jsonb,
}
139
140impl fmt::Display for DataType {
141    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
142        match self {
143            Self::SmallInt => f.write_str("SMALLINT"),
144            Self::Int => f.write_str("INT"),
145            Self::BigInt => f.write_str("BIGINT"),
146            Self::Float => f.write_str("FLOAT"),
147            Self::Text => f.write_str("TEXT"),
148            Self::Varchar(n) => write!(f, "VARCHAR({n})"),
149            Self::Char(n) => write!(f, "CHAR({n})"),
150            Self::Bool => f.write_str("BOOL"),
151            Self::Vector { dim, encoding } => match encoding {
152                VecEncoding::F32 => write!(f, "VECTOR({dim})"),
153                VecEncoding::Sq8 => write!(f, "VECTOR({dim}) USING SQ8"),
154                VecEncoding::F16 => write!(f, "VECTOR({dim}) USING HALF"),
155            },
156            Self::Numeric { precision, scale } => {
157                if *scale == 0 {
158                    write!(f, "NUMERIC({precision})")
159                } else {
160                    write!(f, "NUMERIC({precision}, {scale})")
161                }
162            }
163            Self::Date => f.write_str("DATE"),
164            Self::Timestamp => f.write_str("TIMESTAMP"),
165            Self::Timestamptz => f.write_str("TIMESTAMPTZ"),
166            Self::Interval => f.write_str("INTERVAL"),
167            Self::Json => f.write_str("JSON"),
168            Self::Jsonb => f.write_str("JSONB"),
169        }
170    }
171}
172
/// A row-cell value, including SQL `NULL`. `Float` uses `f64`; NaN compares
/// non-equal to itself (PG behaviour) — `PartialEq` is derived so callers
/// must opt into NaN-aware comparison if they need stronger guarantees.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum Value {
    /// 16-bit signed integer cell.
    SmallInt(i16),
    /// 32-bit signed integer cell.
    Int(i32),
    /// 64-bit signed integer cell.
    BigInt(i64),
    /// `f64` cell.
    Float(f64),
    /// Text cell. Covers unbounded `TEXT` as well as bounded
    /// `VARCHAR`/`CHAR` — the bound lives on the column schema.
    Text(String),
    /// Boolean cell.
    Bool(bool),
    /// Raw `f32` vector cell (reported as the `F32` encoding by
    /// [`Value::data_type`]).
    Vector(Vec<f32>),
    /// v6.0.1: 8-bit scalar-quantised vector cell. Lives in
    /// columns declared `VECTOR(N) USING SQ8`. Layout per cell:
    /// `Sq8Vector { min: f32, max: f32, bytes: Vec<u8> }` —
    /// 4× compression vs `Vector(Vec<f32>)`. The wire layer
    /// dequantises to `f32` on SELECT; INSERT path quantises
    /// incoming `Vector(Vec<f32>)` cells into this variant.
    Sq8Vector(crate::quantize::Sq8Vector),
    /// v6.0.3: IEEE-754 binary16 vector cell. Lives in columns
    /// declared `VECTOR(N) USING HALF`. Stores raw u16 LE bits
    /// (2× compression vs `Vector(Vec<f32>)`). Wire / display
    /// paths dequantise to f32 bit-exactly; INSERT path converts
    /// incoming f32 vectors at the engine boundary.
    HalfVector(crate::halfvec::HalfVector),
    /// Exact fixed-point decimal. `scaled` holds the value as
    /// `actual * 10^scale` so the storage type is always integral —
    /// arithmetic never falls back to floating-point.
    Numeric {
        scaled: i128,
        scale: u8,
    },
    /// Days since the Unix epoch (1970-01-01). Negative for earlier dates.
    Date(i32),
    /// Microseconds since the Unix epoch (1970-01-01T00:00:00Z).
    Timestamp(i64),
    /// Calendar span: `months` (variable-length) + `micros` (fixed-length).
    /// Runtime-only — cannot appear in a stored row in v2.11.
    Interval {
        months: i32,
        micros: i64,
    },
    /// v4.9 `JSON` — raw JSON text. No structural validation
    /// happens at the storage layer; whatever the parser hands us
    /// round-trips verbatim. Equality is byte-wise.
    Json(String),
    /// SQL `NULL`. Carries no type of its own — [`Value::data_type`]
    /// returns `None` for it.
    Null,
}
222
223impl Value {
224    /// Type tag, or `None` for `NULL` (unknown at value level).
225    pub fn data_type(&self) -> Option<DataType> {
226        match self {
227            Self::SmallInt(_) => Some(DataType::SmallInt),
228            Self::Int(_) => Some(DataType::Int),
229            Self::BigInt(_) => Some(DataType::BigInt),
230            Self::Float(_) => Some(DataType::Float),
231            // `Text` covers both unbounded TEXT and bounded VARCHAR/CHAR
232            // — the constraint lives on the column schema, not the value.
233            Self::Text(_) => Some(DataType::Text),
234            Self::Bool(_) => Some(DataType::Bool),
235            Self::Vector(v) => Some(DataType::Vector {
236                dim: u32::try_from(v.len()).expect("vector dim ≤ u32"),
237                encoding: VecEncoding::F32,
238            }),
239            Self::Sq8Vector(q) => Some(DataType::Vector {
240                dim: u32::try_from(q.bytes.len()).expect("vector dim ≤ u32"),
241                encoding: VecEncoding::Sq8,
242            }),
243            Self::HalfVector(h) => Some(DataType::Vector {
244                dim: u32::try_from(h.dim()).expect("vector dim ≤ u32"),
245                encoding: VecEncoding::F16,
246            }),
247            // `Value::Numeric` doesn't carry its precision (the column
248            // schema does); we surface precision=0 as "unknown" and let
249            // the engine reconcile against the column type at coercion
250            // time.
251            Self::Numeric { scale, .. } => Some(DataType::Numeric {
252                precision: 0,
253                scale: *scale,
254            }),
255            Self::Date(_) => Some(DataType::Date),
256            Self::Timestamp(_) => Some(DataType::Timestamp),
257            Self::Interval { .. } => Some(DataType::Interval),
258            Self::Json(_) => Some(DataType::Json),
259            Self::Null => None,
260        }
261    }
262
263    pub const fn is_null(&self) -> bool {
264        matches!(self, Self::Null)
265    }
266}
267
/// One table row — values are positional and must match
/// `TableSchema.columns` in length and (modulo NULL) in `DataType`.
#[derive(Debug, Clone, PartialEq)]
pub struct Row {
    /// Cell values, one per column, in the table's column order.
    pub values: Vec<Value>,
}
274
275impl Row {
276    pub const fn new(values: Vec<Value>) -> Self {
277        Self { values }
278    }
279
280    pub fn len(&self) -> usize {
281        self.values.len()
282    }
283
284    pub fn is_empty(&self) -> bool {
285        self.values.is_empty()
286    }
287}
288
/// Schema of a single table column, as listed in `TableSchema.columns`.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnSchema {
    /// Column name. `TableSchema::column_position` matches on it exactly.
    pub name: String,
    /// Declared type of the column.
    pub ty: DataType,
    /// Whether the column admits `NULL`; `false` corresponds to a
    /// `NOT NULL` column.
    pub nullable: bool,
    /// Optional `DEFAULT` value, frozen at CREATE TABLE time. `None`
    /// means "no default" (so omitted columns become NULL, or error
    /// out when the column is NOT NULL).
    pub default: Option<Value>,
    /// MySQL-style `AUTO_INCREMENT`. When set, an INSERT that leaves
    /// this column unbound (or sets it to NULL) gets the next integer
    /// computed from the column's current max + 1.
    pub auto_increment: bool,
}
303
/// Schema of one table: its name, ordered columns, and table-level
/// settings / constraints.
#[derive(Debug, Clone, PartialEq)]
pub struct TableSchema {
    /// Table name.
    pub name: String,
    /// Columns in declaration order. Row values are positional against
    /// this list.
    pub columns: Vec<ColumnSchema>,
    /// v6.7.2 — per-table hot-tier byte budget override. `None`
    /// falls through to the global `SPG_HOT_TIER_BYTES` setting;
    /// `Some(n)` overrides it for this specific table. Set via
    /// `ALTER TABLE t SET hot_tier_bytes = X`. Persisted in
    /// catalog FILE_VERSION 11+.
    pub hot_tier_bytes: Option<u64>,
    /// v7.6.1 — FOREIGN KEY constraints declared on this table.
    /// Engine maintains this in lock-step with `spg-sql`'s parser
    /// AST; the storage layer carries the on-disk shape so a
    /// catalog snapshot round-trips without external mapping.
    /// Persisted in catalog FILE_VERSION 13+. Older catalogs
    /// deserialise with an empty vec.
    pub foreign_keys: Vec<ForeignKeyConstraint>,
}
322
/// v7.6.1 — Storage-layer mirror of `spg_sql::ast::ForeignKeyConstraint`.
/// The engine's CREATE TABLE path translates between the two; keeping
/// them separate preserves the no-deps boundary between
/// `spg-storage` and `spg-sql`.
///
/// Column references are stored as positions (indices into the
/// respective table's column list), not as names.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ForeignKeyConstraint {
    /// Optional user-supplied constraint name (`CONSTRAINT <name>`
    /// prefix). Used by `ALTER TABLE DROP CONSTRAINT <name>` in
    /// v7.6.8; ignored by enforcement.
    pub name: Option<String>,
    /// Positions of local columns in this table's column list.
    /// Same arity as `parent_columns`.
    pub local_columns: Vec<usize>,
    /// Referenced parent table name.
    pub parent_table: String,
    /// Positions of parent columns in the parent's column list.
    /// Engine resolves these at CREATE TABLE time (after the parent
    /// schema is known) so enforcement paths can skip the name
    /// lookup on every row.
    pub parent_columns: Vec<usize>,
    /// Referential action when a parent row is deleted.
    pub on_delete: FkAction,
    /// Referential action when a parent row's referenced columns
    /// are updated.
    pub on_update: FkAction,
}
349
/// v7.6.1 — referential action tag. Mirrors `spg_sql::ast::FkAction`.
///
/// Each variant has a fixed on-disk tag byte (see `FkAction::tag` /
/// `FkAction::from_tag`); the byte is noted on each variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FkAction {
    /// On-disk tag `0`.
    Restrict,
    /// On-disk tag `1`.
    Cascade,
    /// On-disk tag `2`.
    SetNull,
    /// On-disk tag `3`.
    SetDefault,
    /// On-disk tag `4`.
    NoAction,
}
359
360impl FkAction {
361    /// On-disk tag byte (v13 catalog appendix).
362    pub const fn tag(self) -> u8 {
363        match self {
364            Self::Restrict => 0,
365            Self::Cascade => 1,
366            Self::SetNull => 2,
367            Self::SetDefault => 3,
368            Self::NoAction => 4,
369        }
370    }
371    pub const fn from_tag(b: u8) -> Option<Self> {
372        Some(match b {
373            0 => Self::Restrict,
374            1 => Self::Cascade,
375            2 => Self::SetNull,
376            3 => Self::SetDefault,
377            4 => Self::NoAction,
378            _ => return None,
379        })
380    }
381}
382
383impl TableSchema {
384    pub fn column_position(&self, name: &str) -> Option<usize> {
385        self.columns.iter().position(|c| c.name == name)
386    }
387}
388
/// Key type accepted by secondary indices. Float / NULL / Vector values
/// can't participate in a B-tree index — `f64` is only `PartialOrd`, NULL
/// has SQL-three-valued semantics, and Vector belongs to the (future) HNSW
/// path. Index lookups on those columns fall back to full scan.
///
/// [`IndexKey::from_value`] likewise yields no key for quantised / half
/// vectors, `Numeric`, `Interval` and `Json` cells.
///
/// Ordering is derived: keys compare by variant first (in declaration
/// order), then by payload.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum IndexKey {
    /// Integer-like key. `SmallInt` / `Int` / `BigInt` cells, plus
    /// `Date` and `Timestamp` via their integer storage repr, all land
    /// here widened to `i64`.
    Int(i64),
    /// Text key (an owned copy of the cell's string).
    Text(String),
    /// Boolean key.
    Bool(bool),
}
399
400impl IndexKey {
401    pub fn from_value(v: &Value) -> Option<Self> {
402        match v {
403            Value::SmallInt(n) => Some(Self::Int(i64::from(*n))),
404            Value::Int(n) => Some(Self::Int(i64::from(*n))),
405            Value::BigInt(n) => Some(Self::Int(*n)),
406            Value::Text(s) => Some(Self::Text(s.clone())),
407            Value::Bool(b) => Some(Self::Bool(*b)),
408            // Date/Timestamp use their integer storage repr as the
409            // index key — same order semantics, same comparison.
410            Value::Date(d) => Some(Self::Int(i64::from(*d))),
411            Value::Timestamp(t) => Some(Self::Int(*t)),
412            // Numeric isn't (yet) indexable — exact-decimal index keys
413            // would need a stable scale-normalised representation.
414            // Interval isn't index-eligible either (and can't reach this
415            // path through column storage anyway).
416            Value::Null
417            | Value::Float(_)
418            | Value::Vector(_)
419            | Value::Sq8Vector(_)
420            | Value::HalfVector(_)
421            | Value::Numeric { .. }
422            | Value::Interval { .. }
423            | Value::Json(_) => None,
424        }
425    }
426}
427
/// A single-column secondary index. The backing structure is chosen by
/// `kind`: a B-tree map (the default — used for equality / range lookups
/// on scalar columns), a navigable-small-world graph (used for kNN over
/// vector columns), or a metadata-only BRIN marker (v6.7.1).
#[derive(Debug, Clone)]
pub struct Index {
    /// Index name.
    pub name: String,
    /// Position of the indexed column in the table's column list.
    pub column_position: usize,
    /// Backing structure; see [`IndexKind`].
    pub kind: IndexKind,
    /// v6.8.0 — column positions of `INCLUDE (col1, col2, …)`
    /// non-key columns. Carries the planner's "this query is
    /// covered by the index" signal; lookup paths still resolve
    /// via the `RowLocator` to fetch the row body, but EXPLAIN
    /// surfaces the covered-scan annotation so operators can
    /// confirm the planner sees the coverage.
    ///
    /// Empty `Vec` = no `INCLUDE` clause (the legacy shape). v12
    /// catalog snapshots deserialise with an empty vec.
    pub included_columns: Vec<usize>,
    /// v6.8.1 — partial-index predicate stored as its canonical
    /// Display form (the engine re-parses it on the maintenance
    /// path). `None` = unconditional index (the legacy shape).
    /// Persisted as `[u8 has_pred][u16 LE len][bytes]` on the
    /// catalog snapshot (FILE_VERSION 12, appended after
    /// `included_columns`).
    pub partial_predicate: Option<String>,
    /// v6.8.2 — expression-index key, stored as the expression's
    /// canonical Display form. `None` = bare column-reference
    /// index (the legacy shape). Persisted alongside
    /// `partial_predicate` on the v12 catalog snapshot.
    pub expression: Option<String>,
}
460
/// Default neighbour degree (`M`) for the NSW graph. Picked at
/// construction time and persisted with the index.
pub const NSW_DEFAULT_M: usize = 16;
464
/// v5.2.2: outcome of a successful [`Catalog::freeze_oldest_to_cold`]
/// call. The catalog state has already been mutated by the time this
/// is returned (hot rows dropped + segment registered + Cold locators
/// flipped).
///
/// The caller's only remaining concern is `segment_bytes`: persist them
/// to disk under `<db>.spg/segments/seg_<id>.spg` so a future restart
/// can reload via the v5.1 `SPG_PRELOAD_COLD_SEGMENT` path. (v5.3's
/// manifest will subsume this manual step.)
#[derive(Debug, Clone)]
pub struct FreezeReport {
    /// Id allocated by [`Catalog::load_segment_bytes`] for the new
    /// cold-tier segment. Stable across the call's success path.
    pub segment_id: u32,
    /// Number of rows that moved hot → cold. Equals the `max_rows`
    /// the caller asked for (the API is strict on the count).
    pub frozen_rows: usize,
    /// Hot-tier bytes reclaimed by the freeze — the
    /// [`Table::hot_bytes`] delta before vs after. Useful to feed
    /// back into the freezer's budget check on the next tick.
    pub bytes_freed: u64,
    /// Encoded segment bytes, byte-identical to what
    /// [`encode_segment`] produced. The catalog already owns a
    /// copy inside `cold_segments`; this hand-off lets the caller
    /// persist them without re-encoding.
    pub segment_bytes: Vec<u8>,
}
490
/// v6.7.4 — read-only output of [`Catalog::prepare_freeze_slice`].
/// Carries every row body + key in a contiguous hot-row range,
/// already encoded and sorted by PK so the coordinator's merge
/// step is a k-way merge over already-sorted streams.
///
/// A `Vec<FreezeSlice>` from N independent workers feeds
/// [`Catalog::commit_freeze_slices`], which concatenates the slices,
/// encodes the merged segment, and atomically swaps the catalog state.
#[derive(Debug, Clone)]
pub struct FreezeSlice {
    /// Hot-row index range this slice covered (half-open, in the
    /// table's `rows: PersistentVec` ordering at call time). The
    /// commit step uses this to compute the union range that
    /// gets passed to [`Table::delete_rows`].
    pub row_range: core::ops::Range<usize>,
    /// `(pk_u64, encoded_row_body, IndexKey)` triples, sorted
    /// ascending by `pk_u64`. Per-slice sort happens inside
    /// `prepare_freeze_slice`; the coordinator does only a
    /// k-way merge to reach the global PK ordering
    /// [`encode_segment`] requires.
    pub rows: Vec<(u64, Vec<u8>, IndexKey)>,
}
513
/// v6.7.3 — outcome of a [`Catalog::compact_cold_segments`] call.
/// The catalog state has already been mutated when this is returned:
/// the merged segment is loaded into `cold_segments`, the source
/// segment slots are tombstoned (`None`), and every BTree-index
/// `RowLocator::Cold` that previously pointed at a source now
/// points at the merged segment.
///
/// The caller's remaining job is to persist `merged_segment_bytes`
/// under `<db>.spg/segments/seg_<merged_segment_id>.spg` and update
/// the in-memory `segment_id → path` map (remove the source ids, add
/// the merged id) so the next CHECKPOINT writes a manifest that
/// no longer lists the retired sources.
///
/// On a no-op (fewer than 2 candidate segments under the threshold),
/// `merged_segment_id` is `None` and `sources` is empty; the
/// catalog was not mutated.
#[derive(Debug, Clone)]
pub struct CompactReport {
    /// Source segment ids that were merged + tombstoned.
    pub sources: Vec<u32>,
    /// Id allocated for the merged segment. `None` on no-op.
    pub merged_segment_id: Option<u32>,
    /// Encoded merged-segment bytes (empty on no-op).
    pub merged_segment_bytes: Vec<u8>,
    /// Number of rows that landed in the merged segment.
    pub merged_rows: usize,
    /// `Σ source.num_rows − merged_rows`. Rows present in source
    /// segment payloads but unreferenced by any live BTree
    /// `Cold` locator — DELETE'd-but-still-frozen rows that
    /// compaction GC'd during the merge.
    pub deleted_rows_pruned: usize,
    /// `Σ source.bytes() − merged.bytes()`. Estimate of on-disk
    /// space the merge will reclaim once the source segment files
    /// are GC'd. Saturating subtract — never negative.
    pub bytes_reclaimed_estimate: u64,
}
549
/// Backing structure of an [`Index`] (stored in `Index::kind`).
#[derive(Debug, Clone)]
pub enum IndexKind {
    /// v4.40: structural-sharing B-tree over `IndexKey`. Replaces the v0.8
    /// `BTreeMap<IndexKey, Vec<usize>>` — `Index::clone` is now an `Arc`
    /// bump regardless of index size, so `Catalog::clone` inside the
    /// v4.34 auto-commit wrap stays O(1) even for tables with secondary
    /// indices (the case that bottlenecked v4.39 at 1M rows in the
    /// sweep).
    ///
    /// v5.1: value type widened from `Vec<usize>` to `Vec<RowLocator>` so
    /// a single key can point to a mix of hot-tier rows (`RowLocator::Hot`,
    /// equivalent to the pre-v5 `usize` row index) and cold-tier rows
    /// (`RowLocator::Cold { segment_id, page_offset }`) once the v5.2
    /// freezer starts producing them. Pre-v5.2 only `Hot` entries appear
    /// — the on-disk encoding stays at `FILE_VERSION` 8 (raw u64 row index)
    /// because every locator round-trips through `RowLocator::from_legacy_v8_u64`
    /// without information loss. `FILE_VERSION` 9 with tagged encoding lands
    /// alongside the first freezer commit (v5.1 step 2b / v5.2).
    BTree(PersistentBTreeMap<IndexKey, Vec<RowLocator>>),
    /// Navigable-small-world graph for vector kNN search.
    Nsw(NswGraph),
    /// v6.7.1 — BRIN (Block Range INdex). Pure metadata: BRIN
    /// indexes carry NO in-memory key→locator map. The (min,
    /// max) summaries live in each cold-tier segment's v2
    /// envelope sidecar; the BRIN entry in `Table.indices` only
    /// records THAT a BRIN index exists on this column so the
    /// segment encoder + planner can opt into the summary path.
    Brin {
        /// The cell type at `column_position` at CREATE INDEX time.
        /// Used by the planner to type-check WHERE-clause range
        /// predicates against the BRIN-indexed column.
        column_type: DataType,
    },
}
584
/// Multi-layer HNSW graph (v2.13). Each node is assigned a `top_level`;
/// it appears in layers `0..=top_level`. Higher layers are sparser, so
/// search starts from the entry at the top layer, greedy-descends to
/// layer 0, and beam-searches there. Layer 0 keeps a larger neighbour
/// budget (`m_max_0 = 2 * m` per the HNSW paper); upper layers cap at
/// `m`. The struct name stays `NswGraph` so external users / on-disk
/// callers don't have to track a rename — the algorithm changed, the
/// data slot didn't.
#[derive(Debug, Clone)]
pub struct NswGraph {
    /// Max neighbours per node on layers ≥ 1.
    pub m: usize,
    /// Max neighbours on layer 0 (the dense bottom layer). HNSW
    /// convention: `m_max_0 = 2 * m` (the constructor computes it with
    /// a saturating multiply).
    pub m_max_0: usize,
    /// Entry point — the node that sits on the topmost layer. Search
    /// always starts here. `None` for a freshly constructed graph.
    pub entry: Option<usize>,
    /// Top layer of the entry node (== `layers.len() - 1` when populated).
    pub entry_level: u8,
    /// `levels[i]` = top layer of node `i`. Nodes whose vector cell is
    /// NULL / non-Vector have `levels[i] = 0` and no neighbour entries.
    ///
    /// v5.5.0: backed by `PersistentVec` so `NswGraph::clone` (and the
    /// `Catalog::clone` on every group-commit write that contains it) is O(1)
    /// structural-sharing instead of an O(N) element copy.
    pub levels: PersistentVec<u8>,
    /// `layers[l][i]` = neighbours of node `i` at layer `l`. Inner vec
    /// is empty when node `i` doesn't reach layer `l`.
    ///
    /// v5.5.0: the per-node middle dimension (the O(N) one) is a
    /// `PersistentVec`; the outer layer dimension stays a plain `Vec`
    /// (layer count ≤ 8, so its clone is O(1) in practice) and the inner
    /// neighbour list stays a `Vec` (bounded by `m_max_0`).
    ///
    /// v6.1.x: neighbour slot widened from `usize` (8 B on 64-bit) to
    /// `u32` (4 B). Row indices are catalog-bounded by `u32::MAX` (4G
    /// rows per table); the cast at the NSW boundary asserts this. At
    /// 1M dim-128 SQ8, layer 0 adjacency alone shrinks by ~128 MiB
    /// — the largest single contribution to the v6.0.5-measured
    /// 624 MiB ambition gap. On-disk format already used u32 LE, so
    /// this is a pure in-memory layout change; no `FILE_VERSION` bump.
    pub layers: Vec<PersistentVec<Vec<u32>>>,
}
629
630impl NswGraph {
631    fn new(m: usize) -> Self {
632        Self {
633            m,
634            m_max_0: m.saturating_mul(2),
635            entry: None,
636            entry_level: 0,
637            levels: PersistentVec::new(),
638            layers: alloc::vec![PersistentVec::new()],
639        }
640    }
641
642    /// Max-neighbour budget for layer `l`.
643    pub const fn cap_for_layer(&self, layer: u8) -> usize {
644        if layer == 0 { self.m_max_0 } else { self.m }
645    }
646}
647
/// Deterministic level assignment, seeded on the row index so the same
/// insert order reproduces the same topology. Distribution is roughly
/// HNSW-flavoured with `mL ≈ 1/ln(M) ≈ 0.36` for M=16: each 4-bit
/// chunk that comes up zero promotes the node one layer (so P(level ≥
/// L) ≈ (1/16)^L).
///
/// The result is always in `0..=7`. Note that `row_idx == 0` mixes to
/// an all-zero word (the mixer only multiplies and xor-shifts), so row 0
/// always lands on the cap.
#[allow(clippy::verbose_bit_mask)] // clippy suggests trailing_zeros(); we need an explicit MAX cap and a stable distribution shape.
pub fn nsw_assign_level(row_idx: usize) -> u8 {
    const MAX_LEVEL: u8 = 7; // 7 ⇒ ~16^7 ≈ 2.7e8 expected nodes between promotions; ample.

    // SplitMix-style mixer — cheap and seedable. Each stage is an
    // xor-shift followed by (or preceded by) a wrapping multiply.
    let seeded = (row_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
    let stage_a = (seeded ^ (seeded >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    let stage_b = (stage_a ^ (stage_a >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    let mut bits = stage_b ^ (stage_b >> 31);

    // Count contiguous low-end zero nibbles (4-bit chunks), stopping at
    // the first non-zero nibble or at the cap, whichever comes first.
    for level in 0..MAX_LEVEL {
        if bits & 0xF == 0 {
            bits >>= 4;
        } else {
            return level;
        }
    }
    MAX_LEVEL
}
674
675impl Index {
676    fn new_btree(name: String, column_position: usize) -> Self {
677        Self {
678            name,
679            column_position,
680            kind: IndexKind::BTree(PersistentBTreeMap::new()),
681            included_columns: Vec::new(),
682            partial_predicate: None,
683            expression: None,
684        }
685    }
686
687    fn new_nsw(name: String, column_position: usize, m: usize) -> Self {
688        Self {
689            name,
690            column_position,
691            kind: IndexKind::Nsw(NswGraph::new(m)),
692            included_columns: Vec::new(),
693            partial_predicate: None,
694            expression: None,
695        }
696    }
697
698    /// v6.7.1 — BRIN index constructor. BRIN carries no in-memory
699    /// data; the `column_type` snapshot is used by the segment
700    /// encoder + planner for type-checking range predicates.
701    fn new_brin(name: String, column_position: usize, column_type: DataType) -> Self {
702        Self {
703            name,
704            column_position,
705            kind: IndexKind::Brin { column_type },
706            included_columns: Vec::new(),
707            partial_predicate: None,
708            expression: None,
709        }
710    }
711
712    /// Look up the locators stored under `key` (B-tree only). Returns
713    /// an empty slice when the key is absent or the index is an NSW
714    /// graph — callers can treat both cases uniformly.
715    ///
716    /// v5.1: return type widened from `&[usize]` to `&[RowLocator]`.
717    /// Pre-v5.2 callers can read the slice and `.as_hot().unwrap()`
718    /// each entry (no `Cold` variants exist until the freezer lands);
719    /// post-v5.2 callers dispatch hot vs. cold per locator.
720    pub fn lookup_eq(&self, key: &IndexKey) -> &[RowLocator] {
721        match &self.kind {
722            IndexKind::BTree(m) => m.get(key).map_or(&[][..], Vec::as_slice),
723            // BRIN/Nsw have no key→locator map; lookup is a no-op.
724            IndexKind::Nsw(_) | IndexKind::Brin { .. } => &[][..],
725        }
726    }
727
728    /// Borrow the NSW graph (if this is an NSW index). Callers that need
729    /// the graph for a kNN search go through here.
730    pub const fn nsw(&self) -> Option<&NswGraph> {
731        match &self.kind {
732            IndexKind::Nsw(g) => Some(g),
733            IndexKind::BTree(_) | IndexKind::Brin { .. } => None,
734        }
735    }
736
737    /// v6.7.1 — true when this index is a BRIN (block range) index.
738    /// Used by the segment encoder to opt into BRIN sidecar emission
739    /// at freeze time, and by the planner to opt into page-skipping
740    /// on range predicates.
741    pub const fn is_brin(&self) -> bool {
742        matches!(self.kind, IndexKind::Brin { .. })
743    }
744}
745
/// In-memory table: schema + a persistent row vector + secondary indices,
/// plus the tiering bookkeeping (`hot_bytes`, cached cold-row count).
///
/// v4.39: `rows` is a [`PersistentVec`] (Bitmapped Vector Trie, 32-way) so
/// `Table::clone()` is `O(1)` — the whole reason for v4.39's existence is
/// to make `Catalog::clone()` cheap inside the v4.34 auto-commit wrap.
///
/// v5.2.1: `hot_bytes` tracks the encoded byte size of every row currently
/// in [`Self::rows`], summed over rows. Updated incrementally by `insert`
/// (+= encoded row size), `delete_rows` (-= removed rows' encoded sizes),
/// and `update_row` (-= old size, += new size). The value is what the
/// v5.2 freezer reads to decide when to demote cold rows — when the
/// catalog-wide sum crosses `SPG_HOT_TIER_BYTES` (default 4 GiB) the
/// freezer thread wakes. v5.2.1 ships measurement only; the freezer
/// itself lands in v5.2.2. Stored as `u64` so a single field clone in
/// `Catalog::clone` stays at the O(1) invariant v4.39 built.
#[derive(Debug, Clone)]
pub struct Table {
    /// Column definitions; every stored row is positional against this.
    schema: TableSchema,
    /// Hot-tier rows. Positions in this vector are what
    /// `RowLocator::Hot` entries in the B-tree indices refer to.
    rows: PersistentVec<Row>,
    /// Secondary indices (B-tree, NSW, BRIN) in the order they were
    /// added or restored.
    indices: Vec<Index>,
    /// Running sum of the encoded sizes of the rows in `rows`; see the
    /// struct docs for the maintenance contract.
    hot_bytes: u64,
    /// v6.7.0 — cached count of rows currently materialised in the
    /// cold tier via `RowLocator::Cold` entries across THIS table's
    /// indices. Populated by `ANALYZE` (walks every BTree index and
    /// counts Cold locators); the count survives until the next
    /// ANALYZE recomputes it. Surfaced via `spg_statistic.cold_row_count`
    /// and `spg_stat_segment.table_name`.
    ///
    /// Honest scope: this is a CACHED count, not a live one.
    /// Freezer / promote / DELETE don't currently update the cache
    /// incrementally — they invalidate it by setting the
    /// `cold_row_count_stale` flag, and the next ANALYZE re-walks.
    /// Incremental maintenance is a v6.7.x candidate if observation
    /// shows the ANALYZE walk cost dominates.
    cold_row_count: u64,
    /// v6.7.0 — set when the cached `cold_row_count` may be wrong
    /// because rows moved into / out of the cold tier since the last
    /// ANALYZE. Cleared again by `set_cold_row_count`. The
    /// virtual-table surface reports the cached value regardless
    /// (operators run ANALYZE to refresh).
    cold_row_count_stale: bool,
}
787
788impl Table {
    /// Create an empty table for `schema`: no rows, no indices, a zero
    /// hot-tier byte counter, and a zero cold-row cache that is not
    /// marked stale.
    pub fn new(schema: TableSchema) -> Self {
        Self {
            schema,
            rows: PersistentVec::new(),
            indices: Vec::new(),
            hot_bytes: 0,
            cold_row_count: 0,
            cold_row_count_stale: false,
        }
    }
799
    /// Total encoded byte size of every row currently in the hot tier
    /// (`self.rows`), as tracked incrementally by `insert`,
    /// `delete_rows` and `update_row`. See struct docs for the
    /// maintenance contract. A freshly created table reports 0.
    #[must_use]
    pub const fn hot_bytes(&self) -> u64 {
        self.hot_bytes
    }
807
    /// v6.7.0 — cached count of cold-tier rows, i.e. whatever the last
    /// `set_cold_row_count` call stored (0 if it was never called).
    /// This is not recomputed on read; see struct field docs for the
    /// staleness contract.
    #[must_use]
    pub const fn cold_row_count(&self) -> u64 {
        self.cold_row_count
    }
814
    /// v6.7.0 — overwrite the cached cold-row count with `n` and clear
    /// the stale flag, since the cache has just been refreshed. Called
    /// by the engine's `analyze_one_table` after walking the indices.
    pub fn set_cold_row_count(&mut self, n: u64) {
        self.cold_row_count = n;
        self.cold_row_count_stale = false;
    }
821
    /// v6.7.0 — mark the cached count as potentially out of date.
    /// Called by freezer / promote / DELETE paths so a subsequent
    /// `spg_statistic` read knows the number may not reflect the
    /// current state. The cached value itself is left untouched; only
    /// `set_cold_row_count` clears the flag again.
    pub fn mark_cold_row_count_stale(&mut self) {
        self.cold_row_count_stale = true;
    }
829
    /// v6.7.0 — report whether the cached count is known to be out
    /// of date, i.e. `mark_cold_row_count_stale` ran after the most
    /// recent `set_cold_row_count`. Exposed for completeness; the
    /// virtual table surface returns the cached value regardless.
    #[must_use]
    pub const fn cold_row_count_stale(&self) -> bool {
        self.cold_row_count_stale
    }
837
838    /// v6.7.0 — walk every BTree index and count `RowLocator::Cold`
839    /// entries; return the MAX across indices. The freeze path
840    /// (`freeze_oldest_to_cold`) writes cold locators to ONE
841    /// designated index — that index ends up with the full per-row
842    /// count. MAX-across-indices yields the precise count when a
843    /// PK-style index exists; for multi-index tables without a
844    /// covering index it's a lower bound (rare in practice).
845    /// Caller responsibility: only invoke under `engine.write()`
846    /// or after taking ownership; the walk is O(N) over every
847    /// (key, locator) pair.
848    #[must_use]
849    pub fn count_cold_locators(&self) -> u64 {
850        let mut best: u64 = 0;
851        for idx in &self.indices {
852            if let IndexKind::BTree(map) = &idx.kind {
853                let n: u64 = map
854                    .iter()
855                    .map(|(_, locs)| locs.iter().filter(|l| l.is_cold()).count() as u64)
856                    .sum();
857                if n > best {
858                    best = n;
859                }
860            }
861        }
862        best
863    }
864
    /// Borrow the table's schema.
    pub const fn schema(&self) -> &TableSchema {
        &self.schema
    }
868
    /// v6.7.2 — mutable schema accessor for ALTER TABLE paths.
    /// Used by `Engine::exec_alter_table` to flip per-table
    /// settings like `hot_tier_bytes`.
    ///
    /// This hands out the schema as-is: nothing here re-validates
    /// stored rows or touches indices after the caller's edit.
    pub const fn schema_mut(&mut self) -> &mut TableSchema {
        &mut self.schema
    }
875
    /// v4.39: returns the persistent (hot-tier) row vector by
    /// reference. Callers that used to take `&[Row]` should switch to
    /// `.iter()` (via `IntoIterator for &PersistentVec`) or `.get(i)`
    /// for indexing.
    pub const fn rows(&self) -> &PersistentVec<Row> {
        &self.rows
    }
882
    /// Number of rows in the hot-tier row vector (`self.rows.len()`).
    /// Only `self.rows` is counted.
    pub const fn row_count(&self) -> usize {
        self.rows.len()
    }
886
    /// v6.8.0 — exposed for the engine layer to patch
    /// `Index::included_columns` post-creation. Could fold into
    /// `add_index` once the engine's IF-NOT-EXISTS guard moves up,
    /// but the patch shape is the minimal change for v6.8.0.
    ///
    /// Returns a mutable slice, so callers can edit existing indices
    /// in place but cannot add or remove entries through it.
    pub fn indices_mut(&mut self) -> &mut [Index] {
        &mut self.indices
    }
894
    /// Borrow every index defined on this table, in the order they
    /// sit in the internal list.
    pub fn indices(&self) -> &[Index] {
        &self.indices
    }
898
899    /// Compute the next `AUTO_INCREMENT` value for the column at
900    /// `col_pos`. Defined as `max(existing) + 1`, falling back to `1`
901    /// when the column currently holds no integer values. NULL / non-
902    /// integer cells are skipped. Returns `None` when the column isn't
903    /// an integer type.
904    pub fn next_auto_value(&self, col_pos: usize) -> Option<i64> {
905        let ty = self.schema.columns.get(col_pos)?.ty;
906        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
907            return None;
908        }
909        let mut max: Option<i64> = None;
910        for row in &self.rows {
911            match row.values.get(col_pos) {
912                Some(Value::SmallInt(n)) => {
913                    let v = i64::from(*n);
914                    max = Some(max.map_or(v, |m| m.max(v)));
915                }
916                Some(Value::Int(n)) => {
917                    let v = i64::from(*n);
918                    max = Some(max.map_or(v, |m| m.max(v)));
919                }
920                Some(Value::BigInt(n)) => {
921                    max = Some(max.map_or(*n, |m| m.max(*n)));
922                }
923                _ => {}
924            }
925        }
926        Some(max.map_or(1, |m| m + 1))
927    }
928
929    /// Return the first index defined over `column_position`, if any.
930    /// (`v0.8` supports at most one index per column logically; the search
931    /// just picks the first match.)
932    pub fn index_on(&self, column_position: usize) -> Option<&Index> {
933        // v6.7.1 — prefer BTree (has the key→locator map needed
934        // for `lookup_eq`) over BRIN (metadata-only). When only a
935        // BRIN exists on the column, return None so the executor
936        // falls back to the hot-tier row scan instead of trying
937        // to use BRIN for an equality lookup (which would always
938        // return an empty slice and look like "no rows matched").
939        self.indices
940            .iter()
941            .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::BTree(_)))
942            .or_else(|| {
943                self.indices
944                    .iter()
945                    .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_)))
946            })
947    }
948
    /// Insert one row after validating it matches the schema (length + type).
    /// Returns `StorageError` on mismatch — the table is left unchanged.
    ///
    /// On success the row is appended to `self.rows`, `hot_bytes` grows
    /// by the row's encoded size, every BTree index gains a `Hot`
    /// locator for the new position (cells for which
    /// `IndexKey::from_value` yields `None` are not indexed), and every
    /// NSW index is handed the new row via `nsw_insert_at`. BRIN
    /// indices are not touched here.
    ///
    /// # Errors
    /// - `ArityMismatch` when `row.len()` differs from the column count.
    /// - `NullInNotNull` when a NULL cell targets a non-nullable column.
    /// - `TypeMismatch` when a non-NULL cell's type is neither equal to
    ///   the column type nor one of the accepted compatible pairings.
    pub fn insert(&mut self, row: Row) -> Result<(), StorageError> {
        if row.len() != self.schema.columns.len() {
            return Err(StorageError::ArityMismatch {
                expected: self.schema.columns.len(),
                actual: row.len(),
            });
        }
        // Validation pass: every early return happens in this loop,
        // before anything on `self` is mutated.
        for (i, (val, col)) in row.values.iter().zip(&self.schema.columns).enumerate() {
            if val.is_null() {
                if !col.nullable {
                    return Err(StorageError::NullInNotNull {
                        column: col.name.clone(),
                    });
                }
                continue;
            }
            // NULL was handled above, so a data type is expected here.
            let actual = val.data_type().expect("non-null");
            // Vector columns require both that the value's variant be Vector
            // *and* its dimension match. `actual == col.ty` already encodes
            // both because DataType::Vector carries the dim.
            //
            // VARCHAR(n) / CHAR(n) are storage-equivalent to TEXT — the
            // length / padding contract is enforced upstream by
            // `coerce_value`. Accept a `Text` value into either.
            //
            // NUMERIC's `Value::Numeric` carries its actual scale but the
            // column declares the *expected* scale (a scale-rescaled
            // Value::Numeric is produced upstream by `coerce_value`); the
            // structural check here only verifies "value is Numeric and
            // its scale equals the column scale".
            let compatible = actual == col.ty
                || matches!(
                    (actual, col.ty),
                    (
                        DataType::Text,
                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json | DataType::Jsonb
                    ) | (DataType::Json | DataType::Jsonb, DataType::Text)
                      | (DataType::Json, DataType::Jsonb) | (DataType::Jsonb, DataType::Json)
                      | (DataType::Timestamp, DataType::Timestamptz)
                      | (DataType::Timestamptz, DataType::Timestamp)
                )
                || matches!(
                    (actual, col.ty),
                    (
                        DataType::Numeric { scale: a, .. },
                        DataType::Numeric { scale: b, .. },
                    ) if a == b
                );
            if !compatible {
                return Err(StorageError::TypeMismatch {
                    column: col.name.clone(),
                    expected: col.ty,
                    actual,
                    position: i,
                });
            }
        }
        // Position the row will occupy once pushed below.
        let new_row_idx = self.rows.len();
        // BTree indices are updated before the push (they only need the
        // key and the future position). For NSW we defer the graph
        // update to *after* the row is pushed so the kNN search can see
        // it in `self.rows`.
        for idx in &mut self.indices {
            if let IndexKind::BTree(map) = &mut idx.kind
                && let Some(key) = IndexKey::from_value(&row.values[idx.column_position])
            {
                // v4.40: PersistentBTreeMap has no in-place entry-or-default.
                // Clone-then-insert keeps the same semantics — for typical
                // unique-key schemas the Vec is 1-element so the clone is
                // O(1). For dup-heavy columns it's O(M) per insert, traded
                // for the structural-sharing win at clone time.
                let mut entries = map.get(&key).cloned().unwrap_or_default();
                entries.push(RowLocator::Hot(new_row_idx));
                map.insert_mut(key, entries);
            }
        }
        // v5.2.1: maintain incremental hot-tier byte counter. Computed
        // before the move so we don't need to borrow `row` after push.
        self.hot_bytes = self
            .hot_bytes
            .saturating_add(row_body_encoded_len(&row, &self.schema) as u64);
        // v4.39.1: push_mut keeps streaming inserts at Vec::push speed when
        // the table is uniquely owned (the spg-embedded path); inside a TX
        // wrap where a Catalog snapshot exists, push_mut path-copies the
        // tail just like push() and the snapshot stays valid.
        self.rows.push_mut(row);
        // NSW updates after the push so the new row is visible to the
        // greedy search used during connect. This re-derives the same
        // position computed before the push.
        let new_row_idx = self.rows.len() - 1;
        // Collect the NSW slots first: `nsw_insert_at` takes the whole
        // table mutably, so it cannot run while `self.indices` is
        // borrowed by the iterator.
        let nsw_targets: Vec<usize> = self
            .indices
            .iter()
            .enumerate()
            .filter_map(|(i, idx)| {
                if matches!(idx.kind, IndexKind::Nsw(_)) {
                    Some(i)
                } else {
                    None
                }
            })
            .collect();
        for idx_pos in nsw_targets {
            nsw_insert_at(self, idx_pos, new_row_idx);
        }
        Ok(())
    }
1057
1058    /// Build a new B-tree index over the named column. Rebuilds from
1059    /// existing rows. Errors if `column_name` doesn't exist or the index
1060    /// name is taken.
1061    pub fn add_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1062        if self.indices.iter().any(|i| i.name == name) {
1063            return Err(StorageError::DuplicateIndex { name });
1064        }
1065        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1066            StorageError::ColumnNotFound {
1067                column: column_name.into(),
1068            }
1069        })?;
1070        let mut idx = Index::new_btree(name, column_position);
1071        if let IndexKind::BTree(map) = &mut idx.kind {
1072            for (i, row) in self.rows.iter().enumerate() {
1073                if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
1074                    let mut entries = map.get(&key).cloned().unwrap_or_default();
1075                    entries.push(RowLocator::Hot(i));
1076                    map.insert_mut(key, entries);
1077                }
1078            }
1079        }
1080        self.indices.push(idx);
1081        Ok(())
1082    }
1083
    /// Build a new NSW (HNSW-flavoured) index over the named column.
    /// Required for `ORDER BY col <-> literal LIMIT k` to plan as a
    /// graph traversal instead of a full scan. Column must be a Vector
    /// type. `m` is the maximum number of neighbours per node.
    ///
    /// Delegates to `add_nsw_index_inner` with no pre-built graph;
    /// any error comes from that helper.
    pub fn add_nsw_index(
        &mut self,
        name: String,
        column_name: &str,
        m: usize,
    ) -> Result<(), StorageError> {
        self.add_nsw_index_inner(name, column_name, m, None)
    }
1096
1097    /// v6.0.4 — synchronous rebuild of the named NSW index. If
1098    /// `new_encoding` is `Some(target)` and differs from the column's
1099    /// current encoding, every stored cell at the indexed column is
1100    /// re-coded into the target encoding before the new graph
1101    /// builds. Returns `IndexNotFound` if no index by that name exists
1102    /// and `Unsupported` for non-NSW indexes (`BTree` REBUILD is a no-op
1103    /// the engine layer rejects, not a storage-level concept).
1104    ///
1105    /// Holds the caller's `&mut self` for the duration — no
1106    /// concurrency / staging / WAL-replay machinery in v6.0.4. The
1107    /// "live" optimisation lands as v6.0.4.1.
1108    pub fn rebuild_nsw_index(
1109        &mut self,
1110        name: &str,
1111        new_encoding: Option<VecEncoding>,
1112    ) -> Result<(), StorageError> {
1113        let idx_pos = self
1114            .indices
1115            .iter()
1116            .position(|i| i.name == name)
1117            .ok_or_else(|| StorageError::IndexNotFound {
1118                name: String::from(name),
1119            })?;
1120        let col_pos = self.indices[idx_pos].column_position;
1121        let m = match &self.indices[idx_pos].kind {
1122            IndexKind::Nsw(g) => g.m,
1123            IndexKind::BTree(_) | IndexKind::Brin { .. } => {
1124                return Err(StorageError::Unsupported(format!(
1125                    "ALTER INDEX REBUILD on non-NSW index {name:?} — only NSW indexes can rebuild"
1126                )));
1127            }
1128        };
1129        let col_name = self.schema.columns[col_pos].name.clone();
1130        // 1. Optional re-encoding pass. Done first so the cells
1131        //    match the schema before the graph rebuild walks them.
1132        if let Some(target) = new_encoding {
1133            let current = match self.schema.columns[col_pos].ty {
1134                DataType::Vector { encoding, .. } => encoding,
1135                ref other => {
1136                    return Err(StorageError::Unsupported(format!(
1137                        "ALTER INDEX REBUILD WITH (encoding=…) on non-vector column type {other:?}"
1138                    )));
1139                }
1140            };
1141            if target != current {
1142                let DataType::Vector { dim, .. } = self.schema.columns[col_pos].ty else {
1143                    unreachable!("checked above")
1144                };
1145                let n = self.rows.len();
1146                for i in 0..n {
1147                    let row = self
1148                        .rows
1149                        .get_mut(i)
1150                        .expect("row index in bounds (we iterated up to len())");
1151                    let cell = core::mem::replace(&mut row.values[col_pos], Value::Null);
1152                    let recoded = recode_vector_cell(cell, target)?;
1153                    row.values[col_pos] = recoded;
1154                }
1155                self.schema.columns[col_pos].ty = DataType::Vector {
1156                    dim,
1157                    encoding: target,
1158                };
1159            }
1160        }
1161        // 2. Drop the existing index slot + rebuild from row payload.
1162        self.indices.remove(idx_pos);
1163        self.add_nsw_index_inner(String::from(name), &col_name, m, None)?;
1164        Ok(())
1165    }
1166
    /// Restore an NSW index from a pre-built graph (used on
    /// deserialize). Skips the bulk-build pass since the topology is
    /// already known. Returns `DuplicateIndex` or `ColumnNotFound` on
    /// schema mismatch as usual.
    ///
    /// Delegates to `add_nsw_index_inner`, passing the graph's own
    /// `m` alongside the graph itself.
    pub fn restore_nsw_index(
        &mut self,
        name: String,
        column_name: &str,
        graph: NswGraph,
    ) -> Result<(), StorageError> {
        self.add_nsw_index_inner(name, column_name, graph.m, Some(graph))
    }
1179
1180    /// Restore a `BTree` index from a pre-built `(IndexKey, Vec<RowLocator>)`
1181    /// map. Used by [`Catalog::deserialize`] when reading a v9 (or later)
1182    /// catalog snapshot — the map travels on disk so cold-tier locators
1183    /// survive a round-trip, instead of being rebuilt from `self.rows`
1184    /// (which would lose every Cold entry). Same error contract as
1185    /// [`Table::add_index`].
1186    pub fn restore_btree_index(
1187        &mut self,
1188        name: String,
1189        column_name: &str,
1190        map: PersistentBTreeMap<IndexKey, Vec<RowLocator>>,
1191    ) -> Result<(), StorageError> {
1192        if self.indices.iter().any(|i| i.name == name) {
1193            return Err(StorageError::DuplicateIndex { name });
1194        }
1195        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1196            StorageError::ColumnNotFound {
1197                column: column_name.into(),
1198            }
1199        })?;
1200        self.indices.push(Index {
1201            name,
1202            column_position,
1203            kind: IndexKind::BTree(map),
1204            included_columns: Vec::new(),
1205            partial_predicate: None,
1206            expression: None,
1207        });
1208        Ok(())
1209    }
1210
1211    /// v6.7.1 — public restore counterpart for BRIN indices. Used
1212    /// by `Catalog::deserialize` when a v10 snapshot carries a
1213    /// BRIN index entry. BRIN carries no in-memory data — only the
1214    /// `column_type` snapshot is restored.
1215    pub fn restore_brin_index(
1216        &mut self,
1217        name: String,
1218        column_name: &str,
1219        column_type: DataType,
1220    ) -> Result<(), StorageError> {
1221        if self.indices.iter().any(|i| i.name == name) {
1222            return Err(StorageError::DuplicateIndex { name });
1223        }
1224        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1225            StorageError::ColumnNotFound {
1226                column: column_name.into(),
1227            }
1228        })?;
1229        self.indices.push(Index::new_brin(name, column_position, column_type));
1230        Ok(())
1231    }
1232
1233    /// v6.7.1 — public CREATE INDEX counterpart for BRIN. Creates
1234    /// the index entry with a snapshot of the indexed column's
1235    /// current `DataType`.
1236    pub fn add_brin_index(
1237        &mut self,
1238        name: String,
1239        column_name: &str,
1240    ) -> Result<(), StorageError> {
1241        if self.indices.iter().any(|i| i.name == name) {
1242            return Err(StorageError::DuplicateIndex { name });
1243        }
1244        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1245            StorageError::ColumnNotFound {
1246                column: column_name.into(),
1247            }
1248        })?;
1249        let column_type = self.schema.columns[column_position].ty;
1250        self.indices.push(Index::new_brin(name, column_position, column_type));
1251        Ok(())
1252    }
1253
1254    /// v5.1: register cold-tier locators on a `BTree` index. Used
1255    /// after [`Catalog::load_segment_bytes`] to wire every cold-
1256    /// tier row's PK back to its segment so
1257    /// [`Catalog::lookup_by_pk`] can resolve it. Each call
1258    /// appends to the index — keys that already have hot or cold
1259    /// locators keep them. Returns the number of locators
1260    /// registered.
1261    ///
1262    /// Pre-v5.2 (freezer) this is the only path that adds Cold
1263    /// variants to a PB; post-freezer the background freezer
1264    /// thread produces these as a batch under the engine write
1265    /// lock and this API becomes its in-memory primitive.
1266    ///
1267    /// Errors if `index_name` doesn't exist or names an NSW graph
1268    /// (NSW indices don't carry per-key row locators — they're
1269    /// vector-search structures).
1270    pub fn register_cold_locators<I>(
1271        &mut self,
1272        index_name: &str,
1273        locators: I,
1274    ) -> Result<usize, StorageError>
1275    where
1276        I: IntoIterator<Item = (IndexKey, RowLocator)>,
1277    {
1278        let idx = self
1279            .indices
1280            .iter_mut()
1281            .find(|i| i.name == index_name)
1282            .ok_or_else(|| StorageError::Corrupt(format!("index {index_name:?} not found")))?;
1283        let map = match &mut idx.kind {
1284            IndexKind::BTree(map) => map,
1285            IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
1286                return Err(StorageError::Corrupt(format!(
1287                    "index {index_name:?} is not BTree; cold locators apply only to BTree indices"
1288                )));
1289            }
1290        };
1291        let mut count = 0usize;
1292        for (key, locator) in locators {
1293            let mut entries = map.get(&key).cloned().unwrap_or_default();
1294            entries.push(locator);
1295            map.insert_mut(key, entries);
1296            count += 1;
1297        }
1298        Ok(count)
1299    }
1300
1301    /// v5.2.3: remove every `Cold` locator currently registered on
1302    /// `index_name` under the given `key`. `Hot` locators for the
1303    /// same key are left in place — useful when a row has just been
1304    /// promoted hot-side and the caller wants the old Cold pointer
1305    /// retired without losing the new hot entry.
1306    ///
1307    /// Returns the number of cold locators removed (0 when the key
1308    /// has only hot entries or the key isn't present at all).
1309    /// Errors when the index doesn't exist or isn't a `BTree`.
1310    pub fn remove_cold_locators_for_key(
1311        &mut self,
1312        index_name: &str,
1313        key: &IndexKey,
1314    ) -> Result<usize, StorageError> {
1315        let idx = self
1316            .indices
1317            .iter_mut()
1318            .find(|i| i.name == index_name)
1319            .ok_or_else(|| {
1320                StorageError::Corrupt(format!(
1321                    "remove_cold_locators_for_key: index {index_name:?} not found"
1322                ))
1323            })?;
1324        let map = match &mut idx.kind {
1325            IndexKind::BTree(map) => map,
1326            IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
1327                return Err(StorageError::Corrupt(format!(
1328                    "remove_cold_locators_for_key: index {index_name:?} is not BTree; \
1329                     cold locators apply only to BTree indices"
1330                )));
1331            }
1332        };
1333        let Some(entries) = map.get(key) else {
1334            return Ok(0);
1335        };
1336        let mut kept: Vec<RowLocator> =
1337            entries.iter().copied().filter(RowLocator::is_hot).collect();
1338        let removed = entries.len() - kept.len();
1339        if removed == 0 {
1340            return Ok(0);
1341        }
1342        kept.shrink_to_fit();
1343        // PersistentBTreeMap has no remove API in v5.2; when every
1344        // locator for `key` was Cold, the key keeps an empty Vec
1345        // entry. `Index::lookup_eq` already treats `Some(&[])` and
1346        // `None` as the same empty slice (via `Vec::as_slice`), so
1347        // callers can't distinguish the two. The space cost is one
1348        // empty Vec per shadowed-then-promoted key — bounded and
1349        // recoverable when the future compaction job lands.
1350        map.insert_mut(key.clone(), kept);
1351        Ok(removed)
1352    }
1353
1354    /// v4.4: delete the rows at the given positions in one pass.
1355    /// `positions` must be unique; ordering doesn't matter. Indices
1356    /// are rebuilt from scratch (cheaper than tracking incremental
1357    /// shifts across both B-tree and NSW). Returns the number of
1358    /// rows removed.
1359    pub fn delete_rows(&mut self, positions: &[usize]) -> usize {
1360        if positions.is_empty() {
1361            return 0;
1362        }
1363        // Mark positions; v4.39: PV has no in-place retain, so we rebuild
1364        // a fresh PV by pushing the survivors. Still O(n log₃₂ n); the
1365        // structural-sharing win shows up at `Catalog::clone()`, not here.
1366        let mut to_remove = alloc::vec![false; self.rows.len()];
1367        let mut removed = 0;
1368        for &p in positions {
1369            if p < to_remove.len() && !to_remove[p] {
1370                to_remove[p] = true;
1371                removed += 1;
1372            }
1373        }
1374        let mut new_rows: PersistentVec<Row> = PersistentVec::new();
1375        let mut removed_bytes: u64 = 0;
1376        for (i, row) in self.rows.iter().enumerate() {
1377            if to_remove[i] {
1378                removed_bytes =
1379                    removed_bytes.saturating_add(row_body_encoded_len(row, &self.schema) as u64);
1380            } else {
1381                new_rows.push_mut(row.clone());
1382            }
1383        }
1384        self.rows = new_rows;
1385        self.hot_bytes = self.hot_bytes.saturating_sub(removed_bytes);
1386        self.rebuild_indices();
1387        removed
1388    }
1389
1390    /// v4.4: replace the row at `position` with `new_values` (must
1391    /// match the schema arity + types). Indices are rebuilt for
1392    /// correctness — the affected column might be indexed and its
1393    /// key may have shifted, and a NSW node's vector may have
1394    /// changed, both of which need fresh state.
1395    pub fn update_row(
1396        &mut self,
1397        position: usize,
1398        new_values: Vec<Value>,
1399    ) -> Result<(), StorageError> {
1400        if position >= self.rows.len() {
1401            return Err(StorageError::Corrupt(alloc::format!(
1402                "update_row: position {position} out of bounds (rows={})",
1403                self.rows.len()
1404            )));
1405        }
1406        if new_values.len() != self.schema.columns.len() {
1407            return Err(StorageError::ArityMismatch {
1408                expected: self.schema.columns.len(),
1409                actual: new_values.len(),
1410            });
1411        }
1412        // Reuse the per-cell type-compat validation that `insert`
1413        // applies. The body below mirrors that check intentionally —
1414        // factoring it would be more code than the duplication.
1415        for (i, (val, col)) in new_values.iter().zip(&self.schema.columns).enumerate() {
1416            if val.is_null() {
1417                if !col.nullable {
1418                    return Err(StorageError::NullInNotNull {
1419                        column: col.name.clone(),
1420                    });
1421                }
1422                continue;
1423            }
1424            let actual = val.data_type().expect("non-null");
1425            let compatible = actual == col.ty
1426                || matches!(
1427                    (actual, col.ty),
1428                    (
1429                        DataType::Text,
1430                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json | DataType::Jsonb
1431                    ) | (DataType::Json | DataType::Jsonb, DataType::Text)
1432                      | (DataType::Json, DataType::Jsonb) | (DataType::Jsonb, DataType::Json)
1433                      | (DataType::Timestamp, DataType::Timestamptz)
1434                      | (DataType::Timestamptz, DataType::Timestamp)
1435                )
1436                || matches!(
1437                    (actual, col.ty),
1438                    (
1439                        DataType::Numeric { scale: a, .. },
1440                        DataType::Numeric { scale: b, .. },
1441                    ) if a == b
1442                );
1443            if !compatible {
1444                return Err(StorageError::TypeMismatch {
1445                    column: col.name.clone(),
1446                    expected: col.ty,
1447                    actual,
1448                    position: i,
1449                });
1450            }
1451        }
1452        let old_row = self
1453            .rows
1454            .get(position)
1455            .expect("position bounds-checked above");
1456        let old_bytes = row_body_encoded_len(old_row, &self.schema) as u64;
1457        let new_row = Row::new(new_values);
1458        let new_bytes = row_body_encoded_len(&new_row, &self.schema) as u64;
1459        self.rows = self
1460            .rows
1461            .set(position, new_row)
1462            .expect("position bounds-checked above");
1463        self.hot_bytes = self
1464            .hot_bytes
1465            .saturating_sub(old_bytes)
1466            .saturating_add(new_bytes);
1467        self.rebuild_indices();
1468        Ok(())
1469    }
1470
1471    /// v4.4 helper used by `delete_rows` / `update_row`: discard all
1472    /// index payloads and rebuild from `self.rows`. Cheap enough
1473    /// for typical SPG scale (catalogs in the docker-compose
1474    /// deployment shape are small); the alternative — incremental
1475    /// shift bookkeeping across B-tree + NSW — would be far more
1476    /// invasive than the savings justify.
1477    fn rebuild_indices(&mut self) {
1478        // v5.2.3: capture every `Cold` locator on every BTree index
1479        // before the rebuild, so the from-rows re-emission below
1480        // (which only produces `Hot` locators) doesn't drop cold-
1481        // tier entries on keys unrelated to the row that changed.
1482        // Pre-v5.2.3 this was a `freeze_oldest_to_cold` worry only
1483        // and the freezer did its own capture-then-reregister; v5.2.3
1484        // promotes that pattern into the base helper because UPDATE
1485        // / DELETE now run rebuild_indices on tables with cold rows.
1486        let preserved_cold: Vec<(String, Vec<(IndexKey, RowLocator)>)> = self
1487            .indices
1488            .iter()
1489            .filter_map(|idx| match &idx.kind {
1490                IndexKind::BTree(map) => {
1491                    let cold: Vec<(IndexKey, RowLocator)> = map
1492                        .iter()
1493                        .flat_map(|(k, locs)| {
1494                            locs.iter()
1495                                .filter(|l| l.is_cold())
1496                                .copied()
1497                                .map(move |l| (k.clone(), l))
1498                        })
1499                        .collect();
1500                    if cold.is_empty() {
1501                        None
1502                    } else {
1503                        Some((idx.name.clone(), cold))
1504                    }
1505                }
1506                // BRIN / NSW carry no key→locator map.
1507                IndexKind::Nsw(_) | IndexKind::Brin { .. } => None,
1508            })
1509            .collect();
1510
1511        // v6.7.1 — descriptor needs to capture index kind so the
1512        // rebuild loop can resurrect BTree / NSW / BRIN exactly as
1513        // they were. (NSW carries m; BRIN carries the column type
1514        // snapshot; BTree needs no extra payload.)
1515        #[derive(Clone)]
1516        enum RebuildKind {
1517            BTree,
1518            Nsw(usize),
1519            Brin(DataType),
1520        }
1521        let descriptors: Vec<(String, usize, RebuildKind)> = self
1522            .indices
1523            .iter()
1524            .map(|idx| {
1525                let kind = match &idx.kind {
1526                    IndexKind::Nsw(g) => RebuildKind::Nsw(g.m),
1527                    IndexKind::Brin { column_type } => RebuildKind::Brin(*column_type),
1528                    IndexKind::BTree(_) => RebuildKind::BTree,
1529                };
1530                (idx.name.clone(), idx.column_position, kind)
1531            })
1532            .collect();
1533        self.indices.clear();
1534        for (name, column_position, rebuild_kind) in descriptors {
1535            match rebuild_kind {
1536                RebuildKind::Nsw(m) => {
1537                    let idx = Index::new_nsw(name, column_position, m);
1538                    self.indices.push(idx);
1539                    let idx_pos = self.indices.len() - 1;
1540                    let row_indices: Vec<usize> = (0..self.rows.len()).collect();
1541                    for row_idx in row_indices {
1542                        nsw_insert_at(self, idx_pos, row_idx);
1543                    }
1544                }
1545                RebuildKind::Brin(column_type) => {
1546                    // BRIN has no in-memory rebuild — the summaries
1547                    // live in cold segments which freeze emits.
1548                    self.indices.push(Index::new_brin(name, column_position, column_type));
1549                }
1550                RebuildKind::BTree => {
1551                    let mut idx = Index::new_btree(name, column_position);
1552                    if let IndexKind::BTree(map) = &mut idx.kind {
1553                        for (i, row) in self.rows.iter().enumerate() {
1554                            if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
1555                                let mut entries = map.get(&key).cloned().unwrap_or_default();
1556                                entries.push(RowLocator::Hot(i));
1557                                map.insert_mut(key, entries);
1558                            }
1559                        }
1560                    }
1561                    self.indices.push(idx);
1562                }
1563            }
1564        }
1565
1566        // Re-attach preserved cold locators after the from-rows
1567        // rebuild. `register_cold_locators` handles the per-key
1568        // entries-vec append; no key collisions arise because the
1569        // rebuild loop above produced only Hot locators.
1570        for (idx_name, locators) in preserved_cold {
1571            // Errors here would only fire if the index disappeared
1572            // between snapshot and rebuild, which can't happen
1573            // because the rebuild restores the same descriptor set.
1574            let _ = self.register_cold_locators(&idx_name, locators);
1575        }
1576    }
1577
1578    fn add_nsw_index_inner(
1579        &mut self,
1580        name: String,
1581        column_name: &str,
1582        m: usize,
1583        restore: Option<NswGraph>,
1584    ) -> Result<(), StorageError> {
1585        if self.indices.iter().any(|i| i.name == name) {
1586            return Err(StorageError::DuplicateIndex { name });
1587        }
1588        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1589            StorageError::ColumnNotFound {
1590                column: column_name.into(),
1591            }
1592        })?;
1593        if !matches!(
1594            self.schema.columns[column_position].ty,
1595            DataType::Vector { .. }
1596        ) {
1597            return Err(StorageError::TypeMismatch {
1598                column: column_name.into(),
1599                expected: DataType::Vector {
1600                    dim: 0,
1601                    encoding: VecEncoding::F32,
1602                },
1603                actual: self.schema.columns[column_position].ty,
1604                position: column_position,
1605            });
1606        }
1607        if let Some(graph) = restore {
1608            self.indices.push(Index {
1609                name,
1610                column_position,
1611                kind: IndexKind::Nsw(graph),
1612                included_columns: Vec::new(),
1613                partial_predicate: None,
1614                expression: None,
1615            });
1616            return Ok(());
1617        }
1618        let idx = Index::new_nsw(name, column_position, m);
1619        self.indices.push(idx);
1620        let idx_pos = self.indices.len() - 1;
1621        // Bulk-build by walking the existing rows in order — each insert
1622        // sees the partial graph and links into it.
1623        let row_indices: Vec<usize> = (0..self.rows.len()).collect();
1624        for row_idx in row_indices {
1625            nsw_insert_at(self, idx_pos, row_idx);
1626        }
1627        Ok(())
1628    }
1629}
1630
1631/// v6.0.4 — re-encode a single cell to the target `VecEncoding`.
1632/// Used by `Table::rebuild_nsw_index` when ALTER INDEX REBUILD
1633/// includes the optional `WITH (encoding = …)` clause. Round-trip
1634/// goes through f32: `current → Vec<f32> → target`, leaving NULL
1635/// cells untouched. Returns `Unsupported` on a non-vector cell —
1636/// the caller should have rejected the schema before reaching this.
1637fn recode_vector_cell(cell: Value, target: VecEncoding) -> Result<Value, StorageError> {
1638    if matches!(cell, Value::Null) {
1639        return Ok(cell);
1640    }
1641    // Step 1 — extract the f32 representation of the source cell.
1642    let as_f32: Vec<f32> = match &cell {
1643        Value::Vector(v) => v.clone(),
1644        Value::Sq8Vector(q) => quantize::dequantize(q),
1645        Value::HalfVector(h) => h.to_f32_vec(),
1646        other => {
1647            return Err(StorageError::Unsupported(format!(
1648                "ALTER INDEX REBUILD: cannot recode non-vector cell {:?}",
1649                other.data_type()
1650            )));
1651        }
1652    };
1653    // Step 2 — encode into the target shape. `F32` is the identity
1654    // path (saves one alloc round-trip when the source is already
1655    // F32 — but `Value::Vector(as_f32)` is the right answer
1656    // regardless).
1657    Ok(match target {
1658        VecEncoding::F32 => Value::Vector(as_f32),
1659        VecEncoding::Sq8 => Value::Sq8Vector(quantize::quantize(&as_f32)),
1660        VecEncoding::F16 => Value::HalfVector(halfvec::HalfVector::from_f32_slice(&as_f32)),
1661    })
1662}
1663
1664/// Insert one row into the HNSW graph held by index slot `idx_pos`.
1665/// No-op when the row's value at the indexed column isn't a vector.
1666/// v6.0.1: handles `Value::Sq8Vector` by dequantising into an f32
1667/// "query" surface — the existing greedy + beam-search machinery
1668/// then uses `cell_to_query_metric_distance` to route every
1669/// distance call through the cell's actual encoding.
1670fn nsw_insert_at(table: &mut Table, idx_pos: usize, new_row_idx: usize) {
1671    let col_pos = table.indices[idx_pos].column_position;
1672    let cell_dim: Option<usize> = match &table.rows[new_row_idx].values[col_pos] {
1673        Value::Vector(v) => Some(v.len()),
1674        Value::Sq8Vector(q) => Some(q.bytes.len()),
1675        Value::HalfVector(h) => Some(h.dim()),
1676        _ => None,
1677    };
1678    let Some(dim) = cell_dim else {
1679        // Even non-vector rows occupy a level slot so per-node Vec
1680        // lengths stay aligned with `table.rows.len()`.
1681        ensure_node_slot(table, idx_pos, new_row_idx, 0);
1682        return;
1683    };
1684    if dim == 0 {
1685        ensure_node_slot(table, idx_pos, new_row_idx, 0);
1686        return;
1687    }
1688    let level = nsw_assign_level(new_row_idx);
1689    ensure_node_slot(table, idx_pos, new_row_idx, level);
1690    let (entry, entry_level, m) = match &table.indices[idx_pos].kind {
1691        IndexKind::Nsw(g) => (g.entry, g.entry_level, g.m),
1692        IndexKind::BTree(_) | IndexKind::Brin { .. } => {
1693            unreachable!("nsw_insert_at on a non-NSW index")
1694        }
1695    };
1696    // First node ever — declare it the entry (it gets its own level).
1697    if entry.is_none() {
1698        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
1699            g.entry = Some(new_row_idx);
1700            g.entry_level = level;
1701            *g.levels
1702                .get_mut(new_row_idx)
1703                .expect("levels slot padded by ensure_node_slot") = level;
1704        }
1705        return;
1706    }
1707    // Set the node's recorded level.
1708    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
1709        *g.levels
1710            .get_mut(new_row_idx)
1711            .expect("levels slot padded by ensure_node_slot") = level;
1712    }
1713    let query = match &table.rows[new_row_idx].values[col_pos] {
1714        Value::Vector(v) => v.clone(),
1715        // v6.0.1: dequantise the inserted SQ8 cell into an f32 query
1716        // surface so the existing greedy / beam machinery can route
1717        // distances through `cell_to_query_metric_distance`. The
1718        // small dequantisation error is what the recall@10 ≥ 0.95
1719        // envelope already accounts for (V6_DESIGN deliberation #3).
1720        Value::Sq8Vector(q) => quantize::dequantize(q),
1721        // v6.0.3: halfvec dequant is bit-exact at the storage layer,
1722        // so the inserted query is a faithful representation.
1723        Value::HalfVector(h) => h.to_f32_vec(),
1724        _ => return,
1725    };
1726    // Phase 1: greedy descend from `entry` down to `level + 1`, keeping
1727    // exactly one current best so the next layer starts from it.
1728    let mut current = entry.expect("entry was Some above");
1729    let mut current_d = vec_l2_sq(table, col_pos, current, &query);
1730    if entry_level > level {
1731        for layer in (level + 1..=entry_level).rev() {
1732            (current, current_d) =
1733                greedy_layer_walk(table, idx_pos, layer, current, current_d, &query);
1734        }
1735    }
1736    // Phase 2: from `min(level, entry_level)` down to 0, beam-search
1737    // `ef_construction` candidates, run the HNSW §4 heuristic neighbour
1738    // selection over them, and connect bidirectionally.
1739    let top = level.min(entry_level);
1740    let ef = (m * 2).max(8);
1741    for layer in (0..=top).rev() {
1742        let cap = if layer == 0 { m * 2 } else { m };
1743        let mut candidates = layer_beam_search(
1744            table,
1745            idx_pos,
1746            layer,
1747            current,
1748            current_d,
1749            &query,
1750            ef,
1751            NswMetric::L2,
1752        );
1753        candidates.retain(|&(_, n)| n != new_row_idx);
1754        // Take the closest as the entry for the next layer down — done
1755        // before heuristic narrowing because the heuristic can reorder.
1756        if let Some(&(d, n)) = candidates.first() {
1757            current = n;
1758            current_d = d;
1759        }
1760        let peers = select_neighbours_heuristic(&candidates, cap, table, col_pos);
1761        connect_at_layer(table, idx_pos, layer, new_row_idx, &peers);
1762    }
1763    // Phase 3: if the new node climbed above the current entry, take
1764    // over as entry so future inserts/searches start from the new top.
1765    if level > entry_level
1766        && let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
1767    {
1768        g.entry = Some(new_row_idx);
1769        g.entry_level = level;
1770    }
1771}
1772
1773/// Make sure `layers[*][new_row_idx]` and `levels[new_row_idx]` exist,
1774/// padding with empty/zero entries as needed. Also grows `layers` to
1775/// accommodate the node's top `level`.
1776fn ensure_node_slot(table: &mut Table, idx_pos: usize, new_row_idx: usize, level: u8) {
1777    let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind else {
1778        unreachable!("ensure_node_slot on a BTree index");
1779    };
1780    while g.layers.len() <= level as usize {
1781        g.layers.push(PersistentVec::new());
1782    }
1783    while g.levels.len() <= new_row_idx {
1784        g.levels.push_mut(0);
1785    }
1786    for layer_vec in &mut g.layers {
1787        while layer_vec.len() <= new_row_idx {
1788            layer_vec.push_mut(Vec::new());
1789        }
1790    }
1791}
1792
1793/// Single-step greedy walk on one layer: from `current` (with cached
1794/// distance `current_d`), inspect that node's neighbours at `layer` and
1795/// hop to the closest if it beats `current_d`. Repeat until no move
1796/// improves the distance. Cheap variant of beam-search used for the
1797/// "descend" phase that only needs one survivor per layer.
1798fn greedy_layer_walk(
1799    table: &Table,
1800    idx_pos: usize,
1801    layer: u8,
1802    mut current: usize,
1803    mut current_d: f32,
1804    query: &[f32],
1805) -> (usize, f32) {
1806    let g = match &table.indices[idx_pos].kind {
1807        IndexKind::Nsw(g) => g,
1808        IndexKind::BTree(_) | IndexKind::Brin { .. } => return (current, current_d),
1809    };
1810    let col_pos = table.indices[idx_pos].column_position;
1811    loop {
1812        let neighbours: &[u32] = g
1813            .layers
1814            .get(layer as usize)
1815            .and_then(|layer_v| layer_v.get(current))
1816            .map_or(&[][..], Vec::as_slice);
1817        let mut best = current;
1818        let mut best_d = current_d;
1819        for &n in neighbours {
1820            let n = n as usize;
1821            let d = vec_l2_sq(table, col_pos, n, query);
1822            if d < best_d {
1823                best = n;
1824                best_d = d;
1825            }
1826        }
1827        if best == current {
1828            return (current, current_d);
1829        }
1830        current = best;
1831        current_d = best_d;
1832    }
1833}
1834
1835/// Beam search on one layer starting from `entry_node` with cached
1836/// `entry_d`. Returns the top `ef` candidates in ascending-distance
1837/// order. Caller picks the closest as the next layer's entry and / or
1838/// trims to M for connection.
1839///
1840/// v3.0.1: uses two `BinaryHeap`s (min-heap for the open frontier,
1841/// max-heap for the working top-`ef` results) and a `Vec<bool>` visited
1842/// bitmap, replacing the v2.x `Vec` + `partition_point` + `BTreeSet`
1843/// implementation. Same algorithm shape (HNSW search algorithm 2 from
1844/// the paper); the data-structure swap cuts per-visit cost from
1845/// `O(ef + log row_count)` to amortised `O(log ef)`.
1846#[allow(clippy::too_many_arguments)] // Beam search threads layer, entry, query, ef, metric — each is intrinsic. Bundling them into a config struct hides the call sites.
1847fn layer_beam_search(
1848    table: &Table,
1849    idx_pos: usize,
1850    layer: u8,
1851    entry_node: usize,
1852    entry_d: f32,
1853    query: &[f32],
1854    ef: usize,
1855    metric: NswMetric,
1856) -> Vec<(f32, usize)> {
1857    let g = match &table.indices[idx_pos].kind {
1858        IndexKind::Nsw(g) => g,
1859        IndexKind::BTree(_) | IndexKind::Brin { .. } => return Vec::new(),
1860    };
1861    let col_pos = table.indices[idx_pos].column_position;
1862    let d0 = if matches!(metric, NswMetric::L2) {
1863        entry_d
1864    } else {
1865        cell_to_query_metric_distance(table, col_pos, entry_node, query, metric)
1866    };
1867    let row_count = table.rows.len();
1868    let mut visited: Vec<bool> = alloc::vec![false; row_count];
1869    if entry_node < row_count {
1870        visited[entry_node] = true;
1871    }
1872    // candidates: min-heap by distance (Closest wrapper) — frontier
1873    // results:    max-heap by distance (Furthest wrapper) — top-ef working set
1874    let mut candidates: alloc::collections::BinaryHeap<NodeClosest> =
1875        alloc::collections::BinaryHeap::with_capacity(ef);
1876    let mut results: alloc::collections::BinaryHeap<NodeFurthest> =
1877        alloc::collections::BinaryHeap::with_capacity(ef);
1878    candidates.push(NodeClosest {
1879        dist: d0,
1880        node: entry_node,
1881    });
1882    results.push(NodeFurthest {
1883        dist: d0,
1884        node: entry_node,
1885    });
1886    while let Some(cur) = candidates.pop() {
1887        let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
1888        if cur.dist > worst && results.len() >= ef {
1889            break;
1890        }
1891        let neighbours: &[u32] = g
1892            .layers
1893            .get(layer as usize)
1894            .and_then(|layer_v| layer_v.get(cur.node))
1895            .map_or(&[][..], Vec::as_slice);
1896        for &n in neighbours {
1897            let n = n as usize;
1898            if n >= row_count || visited[n] {
1899                continue;
1900            }
1901            visited[n] = true;
1902            // v6.0.1: cell-aware distance — F32 cells take the
1903            // existing scalar metric, SQ8 cells route through
1904            // the asymmetric ADC variant for the same metric.
1905            let dn = cell_to_query_metric_distance(table, col_pos, n, query, metric);
1906            if !dn.is_finite() {
1907                continue;
1908            }
1909            let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
1910            if results.len() < ef || dn < worst {
1911                results.push(NodeFurthest { dist: dn, node: n });
1912                if results.len() > ef {
1913                    results.pop();
1914                }
1915                candidates.push(NodeClosest { dist: dn, node: n });
1916            }
1917        }
1918    }
1919    // Drain results (max-heap order) and re-sort ascending so callers
1920    // can take `closest = result[0]` without flipping.
1921    let mut out: Vec<(f32, usize)> = results.into_iter().map(|c| (c.dist, c.node)).collect();
1922    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
1923    out
1924}
1925
/// Min-heap wrapper: smaller `dist` → higher priority in a `BinaryHeap`
/// (which is a max-heap), so the comparison is flipped.
///
/// A NaN `dist` has the lowest priority of all, so it is popped last.
/// Two NaN distances compare equal. This makes the ordering a proper
/// total preorder on `dist`, which `BinaryHeap` relies on.
///
/// Equality is defined by the ordering (i.e. by `dist` only, not by
/// `node`) so that `PartialEq`/`Eq` agree with `Ord`. The order among
/// non-NaN distances, including how ties are treated, is unchanged.
#[derive(Debug, Clone, Copy)]
struct NodeClosest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeClosest {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeClosest {}
impl PartialOrd for NodeClosest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeClosest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        match (self.dist.is_nan(), other.dist.is_nan()) {
            (true, true) => core::cmp::Ordering::Equal,
            // NaN loses to every real distance.
            (true, false) => core::cmp::Ordering::Less,
            (false, true) => core::cmp::Ordering::Greater,
            // Reversed: smaller dist = greater priority. Neither side is
            // NaN here, so `partial_cmp` always yields `Some`.
            (false, false) => other
                .dist
                .partial_cmp(&self.dist)
                .unwrap_or(core::cmp::Ordering::Equal),
        }
    }
}
1954
/// Max-heap wrapper: larger `dist` sits at the top so the worst result
/// can be evicted in O(log n) when a better candidate arrives.
///
/// A NaN `dist` ranks as the furthest value of all, so it sits at the
/// top of the heap. Two NaN distances compare equal. This makes the
/// ordering a proper total preorder on `dist`, which `BinaryHeap`
/// relies on.
///
/// Equality is defined by the ordering (i.e. by `dist` only, not by
/// `node`) so that `PartialEq`/`Eq` agree with `Ord`. The order among
/// non-NaN distances, including how ties are treated, is unchanged.
#[derive(Debug, Clone, Copy)]
struct NodeFurthest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeFurthest {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeFurthest {}
impl PartialOrd for NodeFurthest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeFurthest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        match (self.dist.is_nan(), other.dist.is_nan()) {
            (true, true) => core::cmp::Ordering::Equal,
            // NaN counts as further away than every real distance.
            (true, false) => core::cmp::Ordering::Greater,
            (false, true) => core::cmp::Ordering::Less,
            // Neither side is NaN here, so `partial_cmp` always yields
            // `Some`.
            (false, false) => self
                .dist
                .partial_cmp(&other.dist)
                .unwrap_or(core::cmp::Ordering::Equal),
        }
    }
}
1980
1981/// HNSW paper §4 algorithm 4: pick `m` neighbours from `candidates` so
1982/// that each chosen point isn't already covered by a closer chosen
1983/// point. Improves graph diversity → fewer hops needed at search time.
1984///
1985/// `candidates` arrives sorted ascending by distance-to-query. We walk
1986/// it in order, keeping a candidate only when no already-chosen point
1987/// is closer to it than the query is. Result is a vector of row
1988/// indices (length ≤ `m`).
1989fn select_neighbours_heuristic(
1990    candidates: &[(f32, usize)],
1991    m: usize,
1992    table: &Table,
1993    col_pos: usize,
1994) -> Vec<usize> {
1995    let mut chosen: Vec<usize> = Vec::with_capacity(m);
1996    for &(d_q, e) in candidates {
1997        if chosen.len() >= m {
1998            break;
1999        }
2000        // v6.0.1: works on either `Value::Vector` (F32) or
2001        // `Value::Sq8Vector` (Sq8) cells — `cell_l2_sq` dispatches
2002        // on encoding. A non-vector cell yields `f32::INFINITY`
2003        // which the `< d_q` test will never accept.
2004        if !matches!(
2005            table.rows.get(e).and_then(|r| r.values.get(col_pos)),
2006            Some(Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_))
2007        ) {
2008            continue;
2009        }
2010        let mut covered = false;
2011        for &r in &chosen {
2012            // dist(e, r) measured in the same metric the topology was
2013            // built with (L2). If a chosen `r` is closer to `e` than
2014            // the query is, `r` already "covers" `e` for navigation.
2015            if cell_l2_sq(table, col_pos, e, r) < d_q {
2016                covered = true;
2017                break;
2018            }
2019        }
2020        if !covered {
2021            chosen.push(e);
2022        }
2023    }
2024    chosen
2025}
2026
2027/// Bidirectionally connect `new_row_idx` to each of `peers` at `layer`,
2028/// trimming each endpoint's adjacency to that layer's degree cap by
2029/// keeping only the closest neighbours.
2030fn connect_at_layer(
2031    table: &mut Table,
2032    idx_pos: usize,
2033    layer: u8,
2034    new_row_idx: usize,
2035    peers: &[usize],
2036) {
2037    let col_pos = table.indices[idx_pos].column_position;
2038    let cap = match &table.indices[idx_pos].kind {
2039        IndexKind::Nsw(g) => g.cap_for_layer(layer),
2040        IndexKind::BTree(_) | IndexKind::Brin { .. } => return,
2041    };
2042    // v6.1.x: NSW adjacency stores neighbour row indices as u32 (4 B
2043    // each) rather than usize (8 B on 64-bit). Boundary casts here
2044    // assert the row count fits in u32 — the catalog already enforces
2045    // ≤ 4G rows per table, so the conversion can't lose data.
2046    let new_row_u32 = u32::try_from(new_row_idx).expect("row index fits in u32");
2047    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
2048        let layer_v = &mut g.layers[layer as usize];
2049        if let Some(slot) = layer_v.get_mut(new_row_idx) {
2050            *slot = peers
2051                .iter()
2052                .map(|&p| u32::try_from(p).expect("row index fits in u32"))
2053                .collect();
2054        }
2055    }
2056    for &peer in peers {
2057        // Skip peers whose indexed cell isn't a vector — same fence
2058        // as the F32 path; SQ8 cells flow through `cell_l2_sq`
2059        // below without dequantising.
2060        if !matches!(
2061            &table.rows[peer].values[col_pos],
2062            Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_)
2063        ) {
2064            continue;
2065        }
2066        // 1. add the new node to peer's adjacency
2067        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
2068            let layer_v = &mut g.layers[layer as usize];
2069            if let Some(slot) = layer_v.get_mut(peer)
2070                && !slot.contains(&new_row_u32)
2071            {
2072                slot.push(new_row_u32);
2073            }
2074        }
2075        // 2. if peer is over budget, rebuild its adjacency with the
2076        //    HNSW §4 heuristic — same diversity criterion as the
2077        //    insert path so connectivity stays consistent.
2078        let needs_trim = match &table.indices[idx_pos].kind {
2079            IndexKind::Nsw(g) => g.layers[layer as usize][peer].len() > cap,
2080            IndexKind::BTree(_) | IndexKind::Brin { .. } => false,
2081        };
2082        if needs_trim {
2083            let current_peers: Vec<usize> = match &table.indices[idx_pos].kind {
2084                IndexKind::Nsw(g) => g.layers[layer as usize][peer]
2085                    .iter()
2086                    .map(|&n| n as usize)
2087                    .collect(),
2088                IndexKind::BTree(_) | IndexKind::Brin { .. } => continue,
2089            };
2090            // Sort by distance from `peer`'s cell ascending so the
2091            // heuristic receives candidates closest-first. `cell_l2_sq`
2092            // dispatches on encoding so SQ8 columns trim using
2093            // symmetric ADC.
2094            let mut tagged: Vec<(f32, usize)> = current_peers
2095                .iter()
2096                .map(|&p| (cell_l2_sq(table, col_pos, peer, p), p))
2097                .collect();
2098            tagged.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
2099            let kept = select_neighbours_heuristic(&tagged, cap, table, col_pos);
2100            if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
2101                && let Some(slot) = g.layers[layer as usize].get_mut(peer)
2102            {
2103                *slot = kept
2104                    .into_iter()
2105                    .map(|p| u32::try_from(p).expect("row index fits in u32"))
2106                    .collect();
2107            }
2108        }
2109    }
2110}
2111
2112/// Squared L2 distance from `query` (raw f32) to the cell at
2113/// `(row, col_pos)`. Dispatches on cell encoding: `Value::Vector`
2114/// (F32) uses `l2_distance_sq`; `Value::Sq8Vector` uses
2115/// `sq8_l2_distance_sq_asymmetric` (the v6.0.1 quantised path).
2116/// Returns `f32::INFINITY` for any non-vector cell so callers can
2117/// compare uniformly.
2118fn vec_l2_sq(table: &Table, col_pos: usize, row: usize, query: &[f32]) -> f32 {
2119    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
2120        Some(Value::Vector(v)) if v.len() == query.len() => l2_distance_sq(v, query),
2121        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => {
2122            quantize::sq8_l2_distance_sq_asymmetric(q, query)
2123        }
2124        // v6.0.6: halfvec → fused NEON SIMD kernel; no Vec<f32>
2125        // allocation. v6.0.3 used `to_f32_vec()` + f32 NEON which
2126        // was correct but allocated per call (5× slower than F32).
2127        Some(Value::HalfVector(h)) if h.dim() == query.len() => {
2128            halfvec::half_l2_distance_sq_asymmetric(h, query)
2129        }
2130        _ => f32::INFINITY,
2131    }
2132}
2133
2134/// Squared L2 distance between two stored cells (no f32 query in
2135/// sight). Used during HNSW graph build — both endpoints are
2136/// rows already in the table, so symmetric ADC applies for SQ8
2137/// columns. Mixed-encoding cells within one column are a
2138/// schema-level impossibility (INSERT-time coercion enforces
2139/// uniform encoding), so the catch-all is an abort.
2140fn cell_l2_sq(table: &Table, col_pos: usize, row_a: usize, row_b: usize) -> f32 {
2141    let Some(cell_a) = table.rows.get(row_a).and_then(|r| r.values.get(col_pos)) else {
2142        return f32::INFINITY;
2143    };
2144    let Some(cell_b) = table.rows.get(row_b).and_then(|r| r.values.get(col_pos)) else {
2145        return f32::INFINITY;
2146    };
2147    match (cell_a, cell_b) {
2148        (Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => l2_distance_sq(a, b),
2149        (Value::Sq8Vector(a), Value::Sq8Vector(b)) if a.bytes.len() == b.bytes.len() => {
2150            quantize::sq8_l2_distance_sq(a, b)
2151        }
2152        // v6.0.6: halfvec symmetric NEON — fused SIMD kernel that
2153        // loads both cells' raw u16 bits, expands to f32 lanes
2154        // inline, FMA-accumulates the squared diff. No Vec<f32>
2155        // allocation per call.
2156        (Value::HalfVector(a), Value::HalfVector(b)) if a.dim() == b.dim() => {
2157            halfvec::half_l2_distance_sq(a, b)
2158        }
2159        _ => f32::INFINITY,
2160    }
2161}
2162
2163/// kNN-search-time distance: stored cell → f32 query under the
2164/// caller's metric. Dispatches on cell encoding so SQ8 columns
2165/// take the ADC path with the right asymmetric variant. NaN /
2166/// dim-mismatch / non-vector → `f32::INFINITY`.
2167fn cell_to_query_metric_distance(
2168    table: &Table,
2169    col_pos: usize,
2170    row: usize,
2171    query: &[f32],
2172    metric: NswMetric,
2173) -> f32 {
2174    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
2175        Some(Value::Vector(v)) if v.len() == query.len() => metric_distance(metric, v, query),
2176        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => match metric {
2177            NswMetric::L2 => quantize::sq8_l2_distance_sq_asymmetric(q, query),
2178            NswMetric::InnerProduct => quantize::sq8_inner_product_asymmetric(q, query),
2179            NswMetric::Cosine => quantize::sq8_cosine_distance_asymmetric(q, query),
2180        },
2181        // v6.0.6: halfvec dispatches by metric to fused NEON
2182        // kernels — no Vec<f32> allocation per call.
2183        Some(Value::HalfVector(h)) if h.dim() == query.len() => match metric {
2184            NswMetric::L2 => halfvec::half_l2_distance_sq_asymmetric(h, query),
2185            NswMetric::InnerProduct => halfvec::half_inner_product_asymmetric(h, query),
2186            NswMetric::Cosine => halfvec::half_cosine_distance_asymmetric(h, query),
2187        },
2188        _ => f32::INFINITY,
2189    }
2190}
2191
/// Distance metric used to rank candidates at NSW search time.
///
/// The graph topology is always built with `L2`; querying with
/// `InnerProduct` / `Cosine` reuses the same edges but ranks candidates
/// by the chosen metric. For the corpus-sized graphs this loses
/// negligible recall vs building separate per-metric graphs.
///
/// Every variant follows the "smaller = closer" convention, so results
/// under any metric can be sorted ascending.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NswMetric {
    /// Squared Euclidean distance — ranks "smaller = closer" (the sqrt
    /// is monotonic, so it is skipped for ordering purposes).
    L2,
    /// Negated dot product, matching the pgvector `<#>` convention so
    /// "smaller = more similar" holds across all three metrics.
    InnerProduct,
    /// Cosine distance `1 - cos(a, b)`. A zero-norm operand yields
    /// `f32::INFINITY` so it sorts last.
    Cosine,
}
2209
2210/// Multi-layer HNSW kNN search: greedy-descend from the entry to layer 0,
2211/// then beam-search there with the requested `ef` to return the top `k`
2212/// results under the caller-chosen metric. Topology was built with L2 —
2213/// upper-layer descent uses L2 as a coarse heuristic; final beam search
2214/// runs in the requested metric so rankings are correct for `<#>` / `<=>`.
2215fn nsw_search(
2216    table: &Table,
2217    idx_pos: usize,
2218    query: &[f32],
2219    k: usize,
2220    ef: usize,
2221    metric: NswMetric,
2222) -> Vec<(f32, usize)> {
2223    let (entry, entry_level) = match &table.indices[idx_pos].kind {
2224        IndexKind::Nsw(g) => (g.entry, g.entry_level),
2225        IndexKind::BTree(_) | IndexKind::Brin { .. } => return Vec::new(),
2226    };
2227    let Some(entry) = entry else {
2228        return Vec::new();
2229    };
2230    let col_pos = table.indices[idx_pos].column_position;
2231    // v6.0.1 step 5: SQ8 columns over-fetch by `SQ8_RERANK_OVER_FETCH`
2232    // so the rerank pass below sees enough candidates to recover
2233    // recall after the ADC re-ordering. F32 + F16 columns skip the
2234    // over-fetch — F32 distances are exact, F16 dequant is
2235    // bit-exact at the storage layer so the beam search already
2236    // ranks under the column's full precision.
2237    let sq8 = matches!(
2238        table.schema.columns.get(col_pos).map(|c| c.ty),
2239        Some(DataType::Vector {
2240            encoding: VecEncoding::Sq8,
2241            ..
2242        })
2243    );
2244    let ef = if sq8 {
2245        ef.max(k).max(k * SQ8_RERANK_OVER_FETCH)
2246    } else {
2247        ef.max(k)
2248    };
2249    // Descend by L2 (the topology metric) so layers prune consistently.
2250    let entry_d = vec_l2_sq(table, col_pos, entry, query);
2251    let mut current = entry;
2252    let mut current_d = entry_d;
2253    for layer in (1..=entry_level).rev() {
2254        (current, current_d) = greedy_layer_walk(table, idx_pos, layer, current, current_d, query);
2255    }
2256    // Final beam search on layer 0 under the caller's metric.
2257    let mut results = layer_beam_search(table, idx_pos, 0, current, current_d, query, ef, metric);
2258    if sq8 {
2259        results = sq8_rerank(table, col_pos, &results, query, metric);
2260    }
2261    results.truncate(k);
2262    results
2263}
2264
2265/// v6.0.1 step 5: re-score ADC top-`K*3` candidates with the
2266/// dequantised cell vs the f32 query, then re-sort. Recovers the
2267/// recall the SQ8 ADC sacrifices for 4× compression — the design's
2268/// "f32 rerank step is on by default" path (deliberation #3).
2269/// `metric` is the same metric the beam search used; the rerank
2270/// arithmetic re-derives the exact distance under that metric.
2271fn sq8_rerank(
2272    table: &Table,
2273    col_pos: usize,
2274    candidates: &[(f32, usize)],
2275    query: &[f32],
2276    metric: NswMetric,
2277) -> Vec<(f32, usize)> {
2278    let mut out: Vec<(f32, usize)> = candidates
2279        .iter()
2280        .filter_map(|&(adc_d, row)| {
2281            let cell = table.rows.get(row).and_then(|r| r.values.get(col_pos))?;
2282            let Value::Sq8Vector(q) = cell else {
2283                // F32 cells shouldn't reach this path (sq8 fence
2284                // above), but stay defensive: pass through with
2285                // the ADC distance unchanged.
2286                return Some((adc_d, row));
2287            };
2288            let deq = quantize::dequantize(q);
2289            if deq.len() != query.len() {
2290                return None;
2291            }
2292            Some((metric_distance(metric, &deq, query), row))
2293        })
2294        .collect();
2295    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
2296    out
2297}
2298
/// Multiplier applied to `k` so the SQ8 rerank pass sees a wider
/// candidate set: for SQ8 columns `nsw_search` raises its beam width
/// to at least `k` times this value before reranking. 3× is the
/// design-stage value; v6.0.5 sweep work can re-tune once full corpus
/// profiling is in.
const SQ8_RERANK_OVER_FETCH: usize = 3;
2303
2304fn metric_distance(metric: NswMetric, a: &[f32], b: &[f32]) -> f32 {
2305    match metric {
2306        NswMetric::L2 => l2_distance_sq(a, b),
2307        NswMetric::InnerProduct => -inner_product_f32(a, b),
2308        NswMetric::Cosine => {
2309            let (dot, na, nb) = cosine_dot_norms_f32(a, b);
2310            if na == 0.0 || nb == 0.0 {
2311                return f32::INFINITY;
2312            }
2313            // `f32::sqrt` lives in std, so hand-roll Newton-Raphson on
2314            // f64 — same trick the L2 binary op already uses.
2315            let denom = sqrt_newton_f32(na) * sqrt_newton_f32(nb);
2316            1.0 - dot / denom
2317        }
2318    }
2319}
2320
2321/// v6.0.2: dispatch wrapper for the f32 dot product (used by `<#>` +
2322/// the cosine numerator). NEON path when `len % 4 == 0 && len >= 4`,
2323/// scalar fallback otherwise. Returns the positive dot — callers
2324/// negate for the pgvector `<#>` "smaller = closer" convention.
2325///
2326/// Public so perf gates + downstream benches can microbenchmark the
2327/// dispatch directly; not part of the STABILITY contract — internal
2328/// SIMD layout can evolve in any release.
2329#[doc(hidden)]
2330#[inline]
2331pub fn inner_product_f32(a: &[f32], b: &[f32]) -> f32 {
2332    #[cfg(target_arch = "aarch64")]
2333    {
2334        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
2335            // SAFETY: NEON is a baseline aarch64 feature; preconditions
2336            // (matching lengths, ≥ 1 full lane group) are checked above.
2337            return unsafe { inner_product_neon(a, b) };
2338        }
2339    }
2340    inner_product_scalar(a, b)
2341}
2342
/// Portable dot product: left-to-right sum of the element-wise products
/// of `a` and `b`. If the slices differ in length, the surplus elements
/// of the longer one are ignored (`zip` stops at the shorter slice);
/// empty input yields `0.0`.
fn inner_product_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b)
        .fold(0.0_f32, |running, (lhs, rhs)| running + lhs * rhs)
}
2350
/// NEON dot product of `a` and `b`, four f32 lanes at a time.
///
/// # Safety
///
/// Loads are issued from both slices at every index below `a.len()`,
/// in groups of 4, so `b` must contain at least as many elements as
/// `a`. Elements of `a` past the last full group of 4 are not read and
/// do not contribute to the result, so callers wanting the full dot
/// product must pass a length that is a multiple of 4
/// (`inner_product_f32` checks all of this before dispatching here).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn inner_product_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32,
    };
    unsafe {
        // Two parallel accumulators (same trick as L2 NEON) so the
        // FMA dependency chain doesn't serialise.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc0 = zero;
        let mut acc1 = zero;
        let n = a.len();
        let mut i = 0usize;
        // Main loop: 8 floats per iteration, one 4-lane group per
        // accumulator.
        while i + 8 <= n {
            let av0 = vld1q_f32(a.as_ptr().add(i));
            let bv0 = vld1q_f32(b.as_ptr().add(i));
            acc0 = vfmaq_f32(acc0, av0, bv0);
            let av1 = vld1q_f32(a.as_ptr().add(i + 4));
            let bv1 = vld1q_f32(b.as_ptr().add(i + 4));
            acc1 = vfmaq_f32(acc1, av1, bv1);
            i += 8;
        }
        // Tail: at most one remaining group of 4 when `n` is a
        // multiple of 4 but not of 8.
        while i + 4 <= n {
            let av = vld1q_f32(a.as_ptr().add(i));
            let bv = vld1q_f32(b.as_ptr().add(i));
            acc0 = vfmaq_f32(acc0, av, bv);
            i += 4;
        }
        // Merge the two accumulators lane-wise, then sum across lanes.
        vaddvq_f32(vaddq_f32(acc0, acc1))
    }
}
2384
2385/// v6.0.2: dispatch wrapper for the three accumulators (`dot`, `||a||²`,
2386/// `||b||²`) cosine needs. Same NEON pre-condition as the L2 / IP
2387/// paths; same scalar fallback shape.
2388///
2389/// Public for benchmarking only (see `inner_product_f32`); not in the
2390/// STABILITY contract.
2391#[doc(hidden)]
2392#[inline]
2393pub fn cosine_dot_norms_f32(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
2394    #[cfg(target_arch = "aarch64")]
2395    {
2396        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
2397            // SAFETY: see `inner_product_neon`.
2398            return unsafe { cosine_dot_norms_neon(a, b) };
2399        }
2400    }
2401    cosine_dot_norms_scalar(a, b)
2402}
2403
/// Portable single-pass computation of `(a·b, ‖a‖², ‖b‖²)`. Only the
/// overlapping prefix of the two slices is visited (`zip` stops at the
/// shorter one); empty input yields `(0.0, 0.0, 0.0)`.
fn cosine_dot_norms_scalar(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    a.iter()
        .zip(b)
        .fold((0.0_f32, 0.0_f32, 0.0_f32), |(dot, na, nb), (x, y)| {
            (dot + x * y, na + x * x, nb + y * y)
        })
}
2415
/// NEON computation of `(a·b, ‖a‖², ‖b‖²)` in a single pass, four f32
/// lanes at a time, with one accumulator register per output.
///
/// # Safety
///
/// Loads are issued from both slices at every index below `a.len()`,
/// in groups of 4, so `b` must contain at least as many elements as
/// `a`. Elements past the last full group of 4 are not read, so callers
/// wanting exact sums must pass a length that is a multiple of 4
/// (`cosine_dot_norms_f32` checks all of this before dispatching here).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
// Short register-style names (`av`, `bv`, `acc_na`, `acc_nb`) mirror the
// other NEON kernels in this file.
#[allow(clippy::many_single_char_names, clippy::similar_names)]
unsafe fn cosine_dot_norms_neon(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    use core::arch::aarch64::{float32x4_t, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32};
    unsafe {
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc_dot = zero;
        let mut acc_na = zero;
        let mut acc_nb = zero;
        let n = a.len();
        let mut i = 0usize;
        while i + 4 <= n {
            // One load per operand feeds all three accumulators.
            let av = vld1q_f32(a.as_ptr().add(i));
            let bv = vld1q_f32(b.as_ptr().add(i));
            acc_dot = vfmaq_f32(acc_dot, av, bv);
            acc_na = vfmaq_f32(acc_na, av, av);
            acc_nb = vfmaq_f32(acc_nb, bv, bv);
            i += 4;
        }
        // Horizontal sum of each accumulator's four lanes.
        (vaddvq_f32(acc_dot), vaddvq_f32(acc_na), vaddvq_f32(acc_nb))
    }
}
2439
/// `no_std` square root for `f32` (there is no `f32::sqrt` in `core`).
///
/// Returns `0.0` for zero and negative inputs, and returns NaN / `+∞`
/// unchanged. For every other input the result is accurate to about
/// one unit in the last place.
///
/// The previous implementation ran a fixed ten Newton steps starting
/// from `g = x`. While the guess is far from the root each step only
/// halves it, so inputs outside roughly `[1e-5, 1e5]` never converged
/// (e.g. `x = 1e6` produced ≈ 1296 instead of 1000), which skewed
/// cosine distances for un-normalised vectors. This version seeds the
/// iteration by halving the exponent in the bit pattern (within a few
/// percent of the root for normal floats) and then iterates until the
/// estimate stops changing.
fn sqrt_newton_f32(x: f32) -> f32 {
    // Upper bound on Newton steps. Normal inputs settle in 3–4 steps;
    // subnormal inputs start further away and need roughly 15. The cap
    // also ends the rare case where the estimate flips between two
    // adjacent floats.
    const MAX_STEPS: usize = 32;

    if x <= 0.0 {
        return 0.0;
    }
    if !x.is_finite() {
        return x;
    }
    // Shifting the bits right by one halves the biased exponent; adding
    // `127 << 22` restores the half of the bias the shift removed. The
    // sum cannot overflow for finite positive inputs.
    let mut guess = f32::from_bits((x.to_bits() >> 1) + 0x1fc0_0000);
    for _ in 0..MAX_STEPS {
        let refined = 0.5 * (guess + x / guess);
        if refined == guess {
            break;
        }
        guess = refined;
    }
    guess
}
2450
2451/// Squared Euclidean distance — used for ordering inside NSW (the sqrt
2452/// preserves the order). Caller takes sqrt before reporting back to SQL.
2453///
2454/// v3.3.2: aarch64 NEON path for `len % 4 == 0` (which covers every
2455/// HNSW-indexed VECTOR(N) where N is a multiple of 4 — i.e. all
2456/// production-shaped embeddings: 64, 128, 256, 384, 512, 768, 1024,
2457/// 1536, ...). Other shapes fall back to the scalar loop.
2458#[inline]
2459fn l2_distance_sq(a: &[f32], b: &[f32]) -> f32 {
2460    #[cfg(target_arch = "aarch64")]
2461    {
2462        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
2463            // SAFETY: NEON is a baseline aarch64 feature (ARMv8);
2464            // the precondition is checked above (matching lengths,
2465            // multiple of 4, at least one 128-bit lane group).
2466            return unsafe { l2_distance_sq_neon(a, b) };
2467        }
2468    }
2469    l2_distance_sq_scalar(a, b)
2470}
2471
/// Portable squared Euclidean distance: left-to-right sum of the
/// squared element-wise differences. Only the overlapping prefix of the
/// two slices is visited (`zip` stops at the shorter one); empty input
/// yields `0.0`.
fn l2_distance_sq_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).fold(0.0_f32, |acc, (lhs, rhs)| {
        let diff = *lhs - *rhs;
        acc + diff * diff
    })
}
2480
/// NEON squared Euclidean distance between `a` and `b`, four f32 lanes
/// at a time.
///
/// # Safety
///
/// Loads are issued from both slices at every index below `a.len()`,
/// in groups of 4, so `b` must contain at least as many elements as
/// `a`. Elements past the last full group of 4 are not read, so callers
/// wanting the exact distance must pass a length that is a multiple of
/// 4 (`l2_distance_sq` checks all of this before dispatching here).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn l2_distance_sq_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32, vsubq_f32,
    };
    unsafe {
        // Two independent accumulator registers so the FMA dependency
        // chain doesn't serialise (each FMA depends on prior FMA).
        // Pre-conditions checked by caller: `a.len() == b.len()`,
        // `a.len() % 4 == 0`, `a.len() >= 4`.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc0 = zero;
        let mut acc1 = zero;
        let n = a.len();
        let mut i = 0usize;
        // Process 8 floats per iter when available (two parallel
        // accumulators). Tail of 4 falls into the second loop.
        while i + 8 <= n {
            // Lane-wise difference, then accumulate its square.
            let d0 = vsubq_f32(vld1q_f32(a.as_ptr().add(i)), vld1q_f32(b.as_ptr().add(i)));
            acc0 = vfmaq_f32(acc0, d0, d0);
            let d1 = vsubq_f32(
                vld1q_f32(a.as_ptr().add(i + 4)),
                vld1q_f32(b.as_ptr().add(i + 4)),
            );
            acc1 = vfmaq_f32(acc1, d1, d1);
            i += 8;
        }
        while i + 4 <= n {
            let d = vsubq_f32(vld1q_f32(a.as_ptr().add(i)), vld1q_f32(b.as_ptr().add(i)));
            acc0 = vfmaq_f32(acc0, d, d);
            i += 4;
        }
        // Merge the two accumulators lane-wise, then sum across lanes.
        vaddvq_f32(vaddq_f32(acc0, acc1))
    }
}
2518
2519/// Public wrapper: run an NSW kNN search and return the top-k row
2520/// indices ordered by ascending distance under the given metric.
2521pub fn nsw_query(
2522    table: &Table,
2523    idx_name: &str,
2524    query: &[f32],
2525    k: usize,
2526    metric: NswMetric,
2527) -> Vec<usize> {
2528    let Some(idx_pos) = table.indices.iter().position(|i| i.name == idx_name) else {
2529        return Vec::new();
2530    };
2531    let ef = (k * 2).max(NSW_DEFAULT_M);
2532    let mut hits = nsw_search(table, idx_pos, query, k, ef, metric);
2533    hits.truncate(k);
2534    hits.into_iter().map(|(_, idx)| idx).collect()
2535}
2536
2537/// Find any NSW index on a column. Used by the planner to decide
2538/// whether an `ORDER BY col <-> literal LIMIT k` query can skip the
2539/// brute-force scan.
2540pub fn nsw_index_on(table: &Table, column_position: usize) -> Option<&Index> {
2541    table
2542        .indices
2543        .iter()
2544        .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_)))
2545}
2546
/// Catalog: insertion-ordered `Vec<Table>` for stable iter / serialize,
/// plus a `BTreeMap<String, usize>` sidecar index so `get` / `get_mut`
/// run in O(log n) instead of the old linear scan with per-element
/// string compares.
///
/// A pure `BTreeMap<String, Table>` was tried in an interim version
/// of v3.1.2 and regressed the single-table catalog benches by ~10%
/// (the per-element `BTreeMap` overhead outweighs the lookup win
/// when n is small). The sidecar shape preserves the insertion-order
/// iteration the on-disk encoding relies on and keeps `last_mut`
/// (used by the deserialize hot path) cheap.
#[derive(Debug, Clone, Default)]
pub struct Catalog {
    /// Tables in creation order; `create_table` appends here.
    tables: Vec<Table>,
    /// `name → tables[index]`. Kept in lock-step with `tables`.
    /// `create_table` is the only write path.
    by_name: BTreeMap<String, usize>,
    /// v5.1: in-memory cold-tier segments. Side-loaded via
    /// [`Catalog::load_segment_bytes`] — they live outside the
    /// catalog snapshot (caller persists them as separate files
    /// and re-loads on boot, until v5.3's `CatalogManifest` makes
    /// that wiring automatic). `RowLocator::Cold { segment_id, .. }`
    /// indexes this `Vec`. Empty after `Catalog::new` / fresh
    /// `deserialize`.
    ///
    /// `Arc` wrap keeps `Catalog::clone` at O(N segments) refcount
    /// bumps (rather than O(total segment bytes) memcpy) so the v4.42
    /// group-commit pre-image rollback invariant — clone is
    /// effectively free — survives the cold-tier addition.
    ///
    /// v6.7.3 — slots became `Option<…>` so cold-segment compaction
    /// can tombstone merged sources without breaking the
    /// `segment_id = index_into_vec` contract that on-disk
    /// `RowLocator::Cold { segment_id }` already serialized.
    /// `None` slot = the segment was retired by compaction; the
    /// physical file may still be on disk (next CHECKPOINT writes
    /// a manifest that no longer lists it, and the file becomes
    /// an orphan eligible for offline cleanup).
    cold_segments: Vec<Option<Arc<OwnedSegment>>>,
}
2587
2588impl Catalog {
    /// Creates an empty catalog: no tables, no name-index entries and
    /// no cold-segment slots. Being a `const fn`, it is usable in const
    /// contexts.
    pub const fn new() -> Self {
        Self {
            tables: Vec::new(),
            by_name: BTreeMap::new(),
            cold_segments: Vec::new(),
        }
    }
2596
2597    pub fn create_table(&mut self, schema: TableSchema) -> Result<(), StorageError> {
2598        if self.by_name.contains_key(&schema.name) {
2599            return Err(StorageError::DuplicateTable {
2600                name: schema.name.clone(),
2601            });
2602        }
2603        let idx = self.tables.len();
2604        let name = schema.name.clone();
2605        self.tables.push(Table::new(schema));
2606        self.by_name.insert(name, idx);
2607        Ok(())
2608    }
2609
2610    pub fn get(&self, name: &str) -> Option<&Table> {
2611        let idx = *self.by_name.get(name)?;
2612        self.tables.get(idx)
2613    }
2614
2615    pub fn get_mut(&mut self, name: &str) -> Option<&mut Table> {
2616        let idx = *self.by_name.get(name)?;
2617        self.tables.get_mut(idx)
2618    }
2619
    /// Number of tables currently registered in the catalog.
    pub fn table_count(&self) -> usize {
        self.tables.len()
    }
2623
2624    /// Borrow-free copy of every table's name in catalog order
2625    /// (= insertion order, matching the on-disk encoding).
2626    pub fn table_names(&self) -> Vec<String> {
2627        self.tables.iter().map(|t| t.schema.name.clone()).collect()
2628    }
2629
2630    /// v5.1: register a cold-tier segment that already lives in
2631    /// memory (caller did the file read). Returns the
2632    /// `segment_id` that `RowLocator::Cold { segment_id, .. }`
2633    /// will reference — currently this is just the index into
2634    /// `cold_segments`, but treat it as an opaque token.
2635    ///
2636    /// Storage is `no_std`, so file I/O is the caller's
2637    /// responsibility — `spg-server` reads the file and forwards
2638    /// the bytes here. The bytes stay resident in the catalog
2639    /// for the life of the `Catalog`, parsed only once.
2640    pub fn load_segment_bytes(&mut self, bytes: Vec<u8>) -> Result<u32, StorageError> {
2641        let id = u32::try_from(self.cold_segments.len()).map_err(|_| {
2642            StorageError::Corrupt("cold segment count would exceed u32::MAX".into())
2643        })?;
2644        let seg = OwnedSegment::from_bytes(bytes)
2645            .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
2646        self.cold_segments.push(Some(Arc::new(seg)));
2647        Ok(id)
2648    }
2649
2650    /// v6.7.3 — register a cold-tier segment at a specific id. Used
2651    /// by the spg-server manifest-boot path so segments whose
2652    /// neighbouring ids were retired by compaction still get back
2653    /// the same `segment_id` they had pre-restart (the
2654    /// `RowLocator::Cold { segment_id }` baked into the BTree-index
2655    /// snapshot persists across restart and must continue to
2656    /// resolve).
2657    ///
2658    /// Pads the Vec with `None` slots up to `target_id` if needed.
2659    /// Errors when the target slot is already occupied (would
2660    /// stomp another segment), the parse fails, or `target_id`
2661    /// exceeds `u32::MAX`.
2662    pub fn load_segment_bytes_at(
2663        &mut self,
2664        target_id: u32,
2665        bytes: Vec<u8>,
2666    ) -> Result<(), StorageError> {
2667        let seg = OwnedSegment::from_bytes(bytes)
2668            .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
2669        let idx = target_id as usize;
2670        while self.cold_segments.len() <= idx {
2671            self.cold_segments.push(None);
2672        }
2673        if self.cold_segments[idx].is_some() {
2674            return Err(StorageError::Corrupt(format!(
2675                "load_segment_bytes_at: segment_id {target_id} already occupied"
2676            )));
2677        }
2678        self.cold_segments[idx] = Some(Arc::new(seg));
2679        Ok(())
2680    }
2681
2682    /// v6.7.3 — retire a cold-tier segment slot (compaction-driven).
2683    /// The physical file is the caller's concern (typically kept
2684    /// on disk until the next CHECKPOINT writes a manifest that
2685    /// no longer lists it); this just flips the in-memory slot
2686    /// to `None` so later cold lookups for `segment_id` resolve
2687    /// as "unknown" instead of returning a stale row.
2688    ///
2689    /// No-op when the slot is already `None`. Errors only when
2690    /// `segment_id` is out of bounds.
2691    pub fn tombstone_segment(&mut self, segment_id: u32) -> Result<(), StorageError> {
2692        let idx = segment_id as usize;
2693        if idx >= self.cold_segments.len() {
2694            return Err(StorageError::Corrupt(format!(
2695                "tombstone_segment: segment_id {segment_id} out of bounds (len={})",
2696                self.cold_segments.len()
2697            )));
2698        }
2699        self.cold_segments[idx] = None;
2700        Ok(())
2701    }
2702
2703    /// Number of *active* (non-tombstoned) cold segments.
2704    #[must_use]
2705    pub fn cold_segment_count(&self) -> usize {
2706        self.cold_segments.iter().filter(|s| s.is_some()).count()
2707    }
2708
    /// Total number of cold-segment slots, tombstones included. This is
    /// also the next id the id-less `load_segment_bytes` would allocate,
    /// since that method appends at the end of the slot list.
    #[must_use]
    pub fn cold_segment_slot_count(&self) -> usize {
        self.cold_segments.len()
    }
2715
2716    /// v6.2.7 — list every *active* cold-tier segment id known to
2717    /// this catalog (skips compaction tombstones since v6.7.3).
2718    /// Used by EXPLAIN ANALYZE to annotate scan nodes with the
2719    /// segments they could have walked.
2720    #[must_use]
2721    pub fn cold_segment_ids_global(&self) -> Vec<u32> {
2722        self.cold_segments
2723            .iter()
2724            .enumerate()
2725            .filter_map(|(i, s)| s.as_ref().map(|_| i as u32))
2726            .collect()
2727    }
2728
2729    /// v5.2.1: sum of `Table::hot_bytes` across every table. The v5.2
2730    /// freezer compares this against `SPG_HOT_TIER_BYTES` (parsed at
2731    /// server startup; default 4 GiB) and wakes when the budget is
2732    /// crossed. Pre-freezer (v5.2.1) this is measurement-only — the
2733    /// counter exposes whether the budget is being approached without
2734    /// triggering any demotion.
2735    #[must_use]
2736    pub fn hot_tier_bytes(&self) -> u64 {
2737        self.tables
2738            .iter()
2739            .map(Table::hot_bytes)
2740            .fold(0u64, u64::saturating_add)
2741    }
2742
2743    /// v5.2.2: freeze the **first** `max_rows` rows of `table_name`'s
2744    /// hot tier into a brand-new cold-tier segment. The named `BTree`
2745    /// index supplies the per-row PK (its column must be an integer
2746    /// type — v5.2.2 only supports `IndexKey::Int` PKs, matching the
2747    /// `index_key_as_u64` constraint used by the cold-tier lookup
2748    /// path). On success returns a [`FreezeReport`] with the
2749    /// freshly-allocated segment id, the count of rows that moved,
2750    /// the encoded segment bytes (so the caller can persist them to
2751    /// disk for later reload via `SPG_PRELOAD_COLD_SEGMENT`), and the
2752    /// hot-tier byte delta that was reclaimed.
2753    ///
2754    /// **Semantics**:
2755    /// 1. The first `max_rows` rows (by hot-tier position — same as
2756    ///    insertion order under v4.39 `PersistentVec`) are read.
2757    /// 2. Rows are sorted ascending by PK and serialised into a new
2758    ///    segment via [`encode_segment`].
2759    /// 3. The hot rows are dropped via [`Table::delete_rows`]; the
2760    ///    `rebuild_indices` it triggers regenerates `Hot` locators
2761    ///    for every remaining row (their positions shift down by
2762    ///    `max_rows`). Existing `Cold` locators in this index — from
2763    ///    a previous freeze — are also rebuilt **but with empty
2764    ///    payload** since rebuild reads only `self.rows`; this
2765    ///    routine re-registers them at the end of the call so the
2766    ///    user-visible state preserves all prior cold locators.
2767    /// 4. The new segment is loaded into `self.cold_segments` via
2768    ///    [`Catalog::load_segment_bytes`] (allocating a fresh
2769    ///    `segment_id`). New `Cold` locators are registered on the
2770    ///    named index — one per frozen row.
2771    ///
2772    /// **v5.2.2 limits** (relaxed in later sub-versions):
2773    /// - INSERT-only flow: subsequent UPDATE/DELETE on a frozen row
2774    ///   returns a stale-locator error (no promote-on-write until
2775    ///   v5.2.3).
2776    /// - Single-table scope: callers iterate tables themselves.
2777    /// - All-or-nothing: returns `Err` and leaves catalog unchanged
2778    ///   if any step fails before the atomic swap point.
2779    ///
2780    /// Errors:
2781    /// - [`StorageError::Corrupt`] for missing table/index, non-`BTree`
2782    ///   index, non-integer PK column, `max_rows == 0`, or
2783    ///   `max_rows > row_count`.
2784    /// - The encoder's [`SegmentError`] surfaces as `Corrupt` (the
2785    ///   only realistic source is "a single row is larger than the
2786    ///   page size"; SPG schemas don't hit it in practice).
2787    pub fn freeze_oldest_to_cold(
2788        &mut self,
2789        table_name: &str,
2790        index_name: &str,
2791        max_rows: usize,
2792    ) -> Result<FreezeReport, StorageError> {
2793        // --- validation phase: never mutates ---------------------
2794        if max_rows == 0 {
2795            return Err(StorageError::Corrupt(
2796                "freeze_oldest_to_cold: max_rows must be > 0".into(),
2797            ));
2798        }
2799        let table = self.get(table_name).ok_or_else(|| {
2800            StorageError::Corrupt(format!(
2801                "freeze_oldest_to_cold: table {table_name:?} not found"
2802            ))
2803        })?;
2804        if max_rows > table.rows.len() {
2805            return Err(StorageError::Corrupt(format!(
2806                "freeze_oldest_to_cold: max_rows {max_rows} > row_count {}",
2807                table.rows.len()
2808            )));
2809        }
2810        let idx = table
2811            .indices
2812            .iter()
2813            .find(|i| i.name == index_name)
2814            .ok_or_else(|| {
2815                StorageError::Corrupt(format!(
2816                    "freeze_oldest_to_cold: index {index_name:?} not found on {table_name:?}"
2817                ))
2818            })?;
2819        if !matches!(idx.kind, IndexKind::BTree(_)) {
2820            return Err(StorageError::Corrupt(format!(
2821                "freeze_oldest_to_cold: index {index_name:?} is NSW; only BTree indices may freeze"
2822            )));
2823        }
2824        let column_position = idx.column_position;
2825
2826        // --- segment build phase: reads only --------------------
2827        let schema = table.schema.clone();
2828        let mut to_freeze: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(max_rows);
2829        for row_idx in 0..max_rows {
2830            let row = table.rows.get(row_idx).expect("bounds-checked above");
2831            let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
2832                StorageError::Corrupt(format!(
2833                    "freeze_oldest_to_cold: row {row_idx} has NULL / non-key value in index column"
2834                ))
2835            })?;
2836            let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
2837                StorageError::Corrupt(format!(
2838                    "freeze_oldest_to_cold: index {index_name:?} column type is non-integer; \
2839                     v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
2840                ))
2841            })?;
2842            to_freeze.push((pk_u64, encode_row_body_dense(row, &schema), key));
2843        }
2844        // encode_segment requires ascending u64 keys. Sort by PK
2845        // before encoding; the caller's row-position order is not
2846        // necessarily PK order (e.g. workloads that insert random
2847        // PKs).
2848        to_freeze.sort_by_key(|(k, _, _)| *k);
2849        // Reject duplicate PKs — encode_segment also rejects them
2850        // (`SegmentError::UnsortedKey`), but the resulting error
2851        // message there is misleading. Surface a clearer one.
2852        for w in to_freeze.windows(2) {
2853            if w[0].0 == w[1].0 {
2854                return Err(StorageError::Corrupt(format!(
2855                    "freeze_oldest_to_cold: duplicate PK {} in freeze batch",
2856                    w[0].0
2857                )));
2858            }
2859        }
2860        // Snapshot the (key, locator) pairs that will be registered
2861        // post-swap. Cloning the IndexKey out before the move makes
2862        // the registration loop borrow-free.
2863        let post_swap_keys: Vec<IndexKey> = to_freeze.iter().map(|(_, _, k)| k.clone()).collect();
2864        // Segment encode is now infallible w.r.t. ordering. Map the
2865        // `SegmentError` into a `StorageError::Corrupt` so the
2866        // public surface stays one error type.
2867        let seg_rows: Vec<(u64, Vec<u8>)> = to_freeze
2868            .into_iter()
2869            .map(|(k, body, _)| (k, body))
2870            .collect();
2871        let frozen_rows = seg_rows.len();
2872        let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
2873            .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: encode: {e}")))?;
2874
2875        // --- atomic swap phase: mutations only past this point ---
2876        // v5.2.3 made `Table::rebuild_indices` preserve every Cold
2877        // locator across the per-table rebuild, so `delete_rows`
2878        // below no longer wipes prior-freeze cold entries. The pre-
2879        // v5.2.3 capture-then-re-register that used to live here
2880        // was removed in v5.3.1 — keeping it would double-count
2881        // every prior-frozen key's Cold locator on each subsequent
2882        // freeze.
2883        let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
2884        let positions: Vec<usize> = (0..max_rows).collect();
2885        let t_mut = self
2886            .get_mut(table_name)
2887            .expect("just validated; still present");
2888        let removed = t_mut.delete_rows(&positions);
2889        debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
2890        let bytes_after = t_mut.hot_bytes();
2891        let bytes_freed = bytes_before.saturating_sub(bytes_after);
2892
2893        let segment_id = self
2894            .load_segment_bytes(seg_bytes.clone())
2895            .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: load: {e}")))?;
2896        let new_cold = post_swap_keys.into_iter().map(|k| {
2897            (
2898                k,
2899                RowLocator::Cold {
2900                    segment_id,
2901                    page_offset: 0,
2902                },
2903            )
2904        });
2905        let t_mut = self.get_mut(table_name).expect("still present");
2906        t_mut.register_cold_locators(index_name, new_cold)?;
2907
2908        Ok(FreezeReport {
2909            segment_id,
2910            frozen_rows,
2911            bytes_freed,
2912            segment_bytes: seg_bytes,
2913        })
2914    }
2915
2916    /// v5.1: borrow the cold segment at `segment_id`. Used by the
2917    /// spg-server preload path to enumerate (key, locator) pairs
2918    /// after loading a segment, so it can call
2919    /// [`Table::register_cold_locators`] without re-parsing the
2920    /// bytes.
2921    #[must_use]
2922    pub fn cold_segment(&self, segment_id: u32) -> Option<&OwnedSegment> {
2923        self.cold_segments
2924            .get(segment_id as usize)
2925            .and_then(|s| s.as_deref())
2926    }
2927
2928    /// v5.1: resolve a single `RowLocator::Cold` to its underlying
2929    /// `Row`. Decoupled from [`Catalog::lookup_by_pk`] so callers
2930    /// iterating a multi-locator slice (e.g. the engine's index
2931    /// seek path) can dispatch per locator instead of getting back
2932    /// only the first row for a key. Returns `None` when the
2933    /// segment isn't registered, the key isn't `u64`-coercible, or
2934    /// the segment doesn't actually carry the key (bloom or page-
2935    /// index reject).
2936    pub fn resolve_cold_locator(
2937        &self,
2938        table_name: &str,
2939        segment_id: u32,
2940        key: &IndexKey,
2941    ) -> Option<Row> {
2942        let t = self.get(table_name)?;
2943        let u64_key = index_key_as_u64(key)?;
2944        let seg = self.cold_segments.get(segment_id as usize)?.as_ref()?;
2945        let payload = seg.lookup(u64_key)?;
2946        let (row, _) = decode_row_body_dense(&payload, &t.schema).ok()?;
2947        Some(row)
2948    }
2949
2950    /// v5.1: indexed PK lookup that dispatches per locator,
2951    /// returning the first matching row from either the hot tier
2952    /// (`Table::rows`) or a registered cold segment.
2953    ///
2954    /// The cold path requires the index column to be coercible to
2955    /// a `u64` (the segment's PK type) and the segment payload to
2956    /// be a [`encode_row_body_dense`]-encoded row body for the
2957    /// same schema. v5.1 ships this for BIGINT / INT / SMALLINT
2958    /// PKs; other types fall through to hot-only behavior.
2959    ///
2960    /// Returns `None` if (a) the table or index doesn't exist,
2961    /// (b) the key isn't in the index at all, or (c) the key was
2962    /// resolved to a stale locator (Hot index out of range, Cold
2963    /// segment id unknown, segment lookup miss). Does not surface
2964    /// segment-decode errors — those would indicate corrupted
2965    /// cold-tier files and should be caught at
2966    /// [`Catalog::load_segment_bytes`] time.
2967    pub fn lookup_by_pk(&self, table: &str, index_name: &str, key: &IndexKey) -> Option<Row> {
2968        let t = self.get(table)?;
2969        let idx = t.indices.iter().find(|i| i.name == index_name)?;
2970        let locators = idx.lookup_eq(key);
2971        let cold_u64_key = index_key_as_u64(key);
2972        for loc in locators {
2973            match *loc {
2974                RowLocator::Hot(i) => {
2975                    if let Some(row) = t.rows.get(i) {
2976                        return Some(row.clone());
2977                    }
2978                }
2979                RowLocator::Cold {
2980                    segment_id,
2981                    page_offset: _,
2982                } => {
2983                    let Some(u64_key) = cold_u64_key else {
2984                        // Key type not coercible to u64 — cold tier
2985                        // only handles BIGINT/INT/SMALLINT in v5.1.
2986                        continue;
2987                    };
2988                    let Some(seg) = self
2989                        .cold_segments
2990                        .get(segment_id as usize)
2991                        .and_then(|s| s.as_deref())
2992                    else {
2993                        // v6.7.3 — `None` slot = compaction
2994                        // retired this segment; the live locator
2995                        // on a freshly-compacted index points to
2996                        // the merged segment_id, so a Cold hit
2997                        // here against a tombstone means the BTree
2998                        // entry hasn't been swapped yet (mid-
2999                        // compaction reader race) or the caller is
3000                        // looking up a stale snapshot. Skip — the
3001                        // next locator in the list, if any, is
3002                        // typically the merged segment.
3003                        continue;
3004                    };
3005                    let Some(payload) = seg.lookup(u64_key) else {
3006                        continue;
3007                    };
3008                    let (row, _) = decode_row_body_dense(&payload, &t.schema).ok()?;
3009                    return Some(row);
3010                }
3011            }
3012        }
3013        None
3014    }
3015
3016    /// v5.2.3: promote a frozen row back to the hot tier so an
3017    /// UPDATE / DELETE can mutate it. Reads the cold-tier row body
3018    /// (decoded from its registered segment), pushes it into
3019    /// `table.rows` via [`Table::insert`] (which also adds a fresh
3020    /// `Hot(new_idx)` locator on `index_name`), then retires the
3021    /// shadowed `Cold` locator via
3022    /// [`Table::remove_cold_locators_for_key`]. The cold-tier row
3023    /// in the segment file becomes garbage — recoverable when a
3024    /// future cold-segment compaction job lands.
3025    ///
3026    /// Returns:
3027    /// - `Ok(Some(new_hot_idx))` when the key resolved through a
3028    ///   cold locator and the promote completed. `new_hot_idx` is
3029    ///   the position the row now occupies in `table.rows`.
3030    /// - `Ok(None)` when the key has no Cold locator on the index
3031    ///   (already hot, or wasn't present at all). Callers treat this
3032    ///   as "nothing to do here, fall back to the hot-only path".
3033    ///
3034    /// Errors when the table / index doesn't exist, the index isn't
3035    /// `BTree`, the cold segment is missing / can't decode the row,
3036    /// or the inferred row body fails `Table::insert` validation.
3037    pub fn promote_cold_row(
3038        &mut self,
3039        table_name: &str,
3040        index_name: &str,
3041        key: &IndexKey,
3042    ) -> Result<Option<usize>, StorageError> {
3043        let cold_loc = self.find_cold_locator(table_name, index_name, key)?;
3044        let Some((segment_id, _page_offset)) = cold_loc else {
3045            return Ok(None);
3046        };
3047        let u64_key = index_key_as_u64(key).ok_or_else(|| {
3048            StorageError::Corrupt(
3049                "promote_cold_row: key type not coercible to u64 (cold tier requires integer PK)"
3050                    .into(),
3051            )
3052        })?;
3053        // Read the row body from the segment. Borrow the segment +
3054        // schema short-term so we can then take `&mut self` for the
3055        // hot-side insert.
3056        let schema = self
3057            .get(table_name)
3058            .ok_or_else(|| {
3059                StorageError::Corrupt(format!("promote_cold_row: table {table_name:?} not found"))
3060            })?
3061            .schema
3062            .clone();
3063        let seg = self
3064            .cold_segments
3065            .get(segment_id as usize)
3066            .and_then(|s| s.as_ref())
3067            .ok_or_else(|| {
3068                StorageError::Corrupt(format!(
3069                    "promote_cold_row: segment {segment_id} not registered on catalog"
3070                ))
3071            })?;
3072        let payload = seg.lookup(u64_key).ok_or_else(|| {
3073            StorageError::Corrupt(format!(
3074                "promote_cold_row: key {u64_key} resolves to segment {segment_id} \
3075                 but the segment's bloom/page lookup didn't return a row"
3076            ))
3077        })?;
3078        let (row, _consumed) = decode_row_body_dense(&payload, &schema)?;
3079        // Insert the promoted row into the hot tier. `Table::insert`
3080        // appends to `self.rows`, adds a `Hot(new_idx)` locator to
3081        // every BTree index covering the row's keyed columns, and
3082        // increments `hot_bytes`.
3083        let t = self
3084            .get_mut(table_name)
3085            .expect("table existed at lookup time");
3086        t.insert(row)?;
3087        let new_hot_idx =
3088            t.rows.len().checked_sub(1).ok_or_else(|| {
3089                StorageError::Corrupt("promote_cold_row: empty after insert".into())
3090            })?;
3091        // The hot insert added Hot(new_idx) alongside the still-
3092        // present Cold locator. Drop the Cold entry so future
3093        // lookups return only the fresh hot row.
3094        t.remove_cold_locators_for_key(index_name, key)?;
3095        Ok(Some(new_hot_idx))
3096    }
3097
3098    /// v5.2.3: shadow a frozen row's index entry. Used by DELETE
3099    /// when the row to remove lives in a cold-tier segment — the
3100    /// row body stays in the segment file (becoming garbage) but
3101    /// every `Cold` locator for `key` on `index_name` is removed
3102    /// so PK lookups stop returning it.
3103    ///
3104    /// Returns the number of cold locators retired (0 when the key
3105    /// has no cold entries — the DELETE fell on a hot row or a
3106    /// key that was already absent). Errors when the table /
3107    /// index doesn't exist or the index isn't `BTree`.
3108    ///
3109    /// Cold-segment compaction (which merges shadowed-heavy
3110    /// segments and reclaims their disk footprint) lands in a
3111    /// later v5.x sub-version; until then, repeated UPDATE/DELETE
3112    /// of cold rows can amplify cold-segment disk usage by up to
3113    /// 1-2× — still well under typical LSM-tree shadowing because
3114    /// SPG segments are bulk-baked, not write-merged.
3115    pub fn shadow_cold_row(
3116        &mut self,
3117        table_name: &str,
3118        index_name: &str,
3119        key: &IndexKey,
3120    ) -> Result<usize, StorageError> {
3121        let t = self.get_mut(table_name).ok_or_else(|| {
3122            StorageError::Corrupt(format!("shadow_cold_row: table {table_name:?} not found"))
3123        })?;
3124        t.remove_cold_locators_for_key(index_name, key)
3125    }
3126
3127    /// v6.7.4 — read-only slice preparation for the parallel
3128    /// freezer. Walks rows in `row_range`, builds the
3129    /// `(pk_u64, encoded_body, IndexKey)` triples that the
3130    /// coordinator's k-way merge consumes, sorts the slice by
3131    /// `pk_u64`, and returns a [`FreezeSlice`].
3132    ///
3133    /// Caller invariants:
3134    /// - `row_range.end <= table.rows.len()` (caller's job to
3135    ///   compute the partition).
3136    /// - All slices passed to `commit_freeze_slices` must cover a
3137    ///   contiguous half-open range `[0, total_max_rows)` with no
3138    ///   gaps and no overlaps. The coordinator validates this
3139    ///   invariant before committing.
3140    ///
3141    /// `&self`-only — multiple workers can run this concurrently
3142    /// against the same `Catalog` reference under the engine's
3143    /// write lock (workers don't mutate; the coordinator does).
3144    pub fn prepare_freeze_slice(
3145        &self,
3146        table_name: &str,
3147        index_name: &str,
3148        row_range: core::ops::Range<usize>,
3149    ) -> Result<FreezeSlice, StorageError> {
3150        let table = self.get(table_name).ok_or_else(|| {
3151            StorageError::Corrupt(format!(
3152                "prepare_freeze_slice: table {table_name:?} not found"
3153            ))
3154        })?;
3155        let idx = table
3156            .indices
3157            .iter()
3158            .find(|i| i.name == index_name)
3159            .ok_or_else(|| {
3160                StorageError::Corrupt(format!(
3161                    "prepare_freeze_slice: index {index_name:?} not found on {table_name:?}"
3162                ))
3163            })?;
3164        if !matches!(idx.kind, IndexKind::BTree(_)) {
3165            return Err(StorageError::Corrupt(format!(
3166                "prepare_freeze_slice: index {index_name:?} is NSW; only BTree indices may freeze"
3167            )));
3168        }
3169        if row_range.end > table.rows.len() {
3170            return Err(StorageError::Corrupt(format!(
3171                "prepare_freeze_slice: row_range end {} > row_count {}",
3172                row_range.end,
3173                table.rows.len()
3174            )));
3175        }
3176        let column_position = idx.column_position;
3177        let schema = table.schema.clone();
3178        let mut rows: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(row_range.len());
3179        for row_idx in row_range.clone() {
3180            let row = table.rows.get(row_idx).expect("bounds-checked above");
3181            let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
3182                StorageError::Corrupt(format!(
3183                    "prepare_freeze_slice: row {row_idx} has NULL / non-key value in index column"
3184                ))
3185            })?;
3186            let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
3187                StorageError::Corrupt(format!(
3188                    "prepare_freeze_slice: index {index_name:?} column type is non-integer; \
3189                     v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
3190                ))
3191            })?;
3192            rows.push((pk_u64, encode_row_body_dense(row, &schema), key));
3193        }
3194        rows.sort_by_key(|(k, _, _)| *k);
3195        Ok(FreezeSlice { row_range, rows })
3196    }
3197
3198    /// v6.7.4 — coordinator commit step. Merges N
3199    /// [`FreezeSlice`]s into one segment via the standard
3200    /// [`encode_segment`] path, atomically swaps the catalog
3201    /// state (delete the union row range + register Cold
3202    /// locators + load the segment).
3203    ///
3204    /// Validates that the slices cover a contiguous, gap-free,
3205    /// overlap-free half-open range starting at index 0 (the
3206    /// freezer always freezes "oldest first" — same semantics as
3207    /// the single-threaded [`Catalog::freeze_oldest_to_cold`]).
3208    ///
3209    /// Empty `slices` → no-op success (returns a zero-row report
3210    /// without mutating). Total row count = `Σ slice.rows.len()`.
3211    pub fn commit_freeze_slices(
3212        &mut self,
3213        table_name: &str,
3214        index_name: &str,
3215        slices: Vec<FreezeSlice>,
3216    ) -> Result<FreezeReport, StorageError> {
3217        // --- validation phase: never mutates ---------------------
3218        let table = self.get(table_name).ok_or_else(|| {
3219            StorageError::Corrupt(format!(
3220                "commit_freeze_slices: table {table_name:?} not found"
3221            ))
3222        })?;
3223        let idx = table
3224            .indices
3225            .iter()
3226            .find(|i| i.name == index_name)
3227            .ok_or_else(|| {
3228                StorageError::Corrupt(format!(
3229                    "commit_freeze_slices: index {index_name:?} not found on {table_name:?}"
3230                ))
3231            })?;
3232        if !matches!(idx.kind, IndexKind::BTree(_)) {
3233            return Err(StorageError::Corrupt(format!(
3234                "commit_freeze_slices: index {index_name:?} is NSW; only BTree indices may freeze"
3235            )));
3236        }
3237        // Validate slice coverage: contiguous from 0, no gaps, no
3238        // overlaps. Allow the caller to pass slices in any order —
3239        // sort by row_range.start first.
3240        let mut ordered = slices;
3241        ordered.sort_by_key(|s| s.row_range.start);
3242        // Drop fully-empty slices that fell out of an uneven
3243        // partition; they carry no data but contribute to the
3244        // contiguity check, so keep them in line.
3245        let mut expected_start = 0usize;
3246        for s in &ordered {
3247            if s.row_range.start != expected_start {
3248                return Err(StorageError::Corrupt(format!(
3249                    "commit_freeze_slices: gap/overlap at row {}; expected start {}",
3250                    s.row_range.start, expected_start
3251                )));
3252            }
3253            expected_start = s.row_range.end;
3254        }
3255        let max_rows = expected_start;
3256        if max_rows > table.rows.len() {
3257            return Err(StorageError::Corrupt(format!(
3258                "commit_freeze_slices: total row range {} exceeds row_count {}",
3259                max_rows,
3260                table.rows.len()
3261            )));
3262        }
3263        if max_rows == 0 {
3264            return Ok(FreezeReport {
3265                segment_id: u32::MAX,
3266                frozen_rows: 0,
3267                bytes_freed: 0,
3268                segment_bytes: Vec::new(),
3269            });
3270        }
3271
3272        // --- segment build phase: reads only --------------------
3273        // K-way merge of already-sorted slices. Each slice's rows
3274        // are ascending by pk_u64; we keep a per-slice cursor and
3275        // pull the next-smallest head until every cursor drains.
3276        let total_rows: usize = ordered.iter().map(|s| s.rows.len()).sum();
3277        if total_rows != max_rows {
3278            return Err(StorageError::Corrupt(format!(
3279                "commit_freeze_slices: total slice rows {total_rows} ≠ row_range coverage {max_rows}"
3280            )));
3281        }
3282        let mut cursors: Vec<usize> = alloc::vec![0; ordered.len()];
3283        let mut merged: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(total_rows);
3284        loop {
3285            // Pick the slice whose head row has the smallest key
3286            // and isn't yet exhausted.
3287            let mut pick: Option<usize> = None;
3288            for (i, c) in cursors.iter().enumerate() {
3289                let slice = &ordered[i];
3290                if *c >= slice.rows.len() {
3291                    continue;
3292                }
3293                match pick {
3294                    None => pick = Some(i),
3295                    Some(j) => {
3296                        if slice.rows[*c].0 < ordered[j].rows[cursors[j]].0 {
3297                            pick = Some(i);
3298                        }
3299                    }
3300                }
3301            }
3302            let Some(i) = pick else { break };
3303            let row = ordered[i].rows[cursors[i]].clone();
3304            cursors[i] += 1;
3305            merged.push(row);
3306        }
3307        // Reject duplicate PKs — same error as the single-threaded
3308        // path so callers get a uniform surface.
3309        for w in merged.windows(2) {
3310            if w[0].0 == w[1].0 {
3311                return Err(StorageError::Corrupt(format!(
3312                    "commit_freeze_slices: duplicate PK {} across slices",
3313                    w[0].0
3314                )));
3315            }
3316        }
3317        let post_swap_keys: Vec<IndexKey> = merged.iter().map(|(_, _, k)| k.clone()).collect();
3318        let seg_rows: Vec<(u64, Vec<u8>)> = merged
3319            .into_iter()
3320            .map(|(k, body, _)| (k, body))
3321            .collect();
3322        let frozen_rows = seg_rows.len();
3323        let (seg_bytes, _meta) =
3324            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).map_err(|e| {
3325                StorageError::Corrupt(format!("commit_freeze_slices: encode: {e}"))
3326            })?;
3327
3328        // --- atomic swap phase: mutations only past this point ---
3329        let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
3330        let positions: Vec<usize> = (0..max_rows).collect();
3331        let t_mut = self
3332            .get_mut(table_name)
3333            .expect("just validated; still present");
3334        let removed = t_mut.delete_rows(&positions);
3335        debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
3336        let bytes_after = t_mut.hot_bytes();
3337        let bytes_freed = bytes_before.saturating_sub(bytes_after);
3338
3339        let segment_id = self
3340            .load_segment_bytes(seg_bytes.clone())
3341            .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: load: {e}")))?;
3342        let new_cold = post_swap_keys.into_iter().map(|k| {
3343            (
3344                k,
3345                RowLocator::Cold {
3346                    segment_id,
3347                    page_offset: 0,
3348                },
3349            )
3350        });
3351        let t_mut = self.get_mut(table_name).expect("still present");
3352        t_mut.register_cold_locators(index_name, new_cold)?;
3353
3354        Ok(FreezeReport {
3355            segment_id,
3356            frozen_rows,
3357            bytes_freed,
3358            segment_bytes: seg_bytes,
3359        })
3360    }
3361
3362    /// v6.7.3 — compact every cold segment on `(table, index)` whose
3363    /// `OwnedSegment::bytes().len()` is below `target_segment_bytes`
3364    /// into a single larger merged segment. Rows present in source
3365    /// segment payloads but no longer referenced by any
3366    /// `RowLocator::Cold` on the index (DELETE'd + frozen rows
3367    /// retired via [`Catalog::shadow_cold_row`]) are GC'd in the
3368    /// merge.
3369    ///
3370    /// **Semantics**:
3371    /// 1. Walk the BTree index to collect every Cold locator that
3372    ///    targets a small (< threshold) segment. Each such
3373    ///    `(key, segment_id)` becomes a row in the merged segment;
3374    ///    payload is looked up from the source segment in-place.
3375    /// 2. Encode the collected rows into one new segment via
3376    ///    [`encode_segment`]; register it via
3377    ///    [`Catalog::load_segment_bytes`] (allocating a fresh
3378    ///    `merged_segment_id` at the end of `cold_segments`).
3379    /// 3. Rewrite the BTree index in one pass: every
3380    ///    `RowLocator::Cold { segment_id ∈ sources }` becomes
3381    ///    `RowLocator::Cold { segment_id = merged_id, page_offset = 0 }`.
3382    ///    Hot locators are untouched.
3383    /// 4. Tombstone every source slot via
3384    ///    [`Catalog::tombstone_segment`]. Source segment payloads
3385    ///    are no longer reachable through the catalog; the on-disk
3386    ///    files are the caller's concern.
3387    ///
3388    /// On fewer than 2 candidate segments the catalog is **not**
3389    /// mutated and a no-op report (`merged_segment_id: None`,
3390    /// `sources: []`) is returned. This is the routine case — a
3391    /// freshly-frozen table has at most 1 small segment, no merge
3392    /// possible.
3393    ///
3394    /// Atomicity: every mutating step runs after the read-only
3395    /// gather phase, so a panic before the merge encode leaves the
3396    /// catalog unchanged. The mutation block itself (load + rewrite +
3397    /// tombstone) takes only `&mut self` — callers serialise the
3398    /// engine write lock outside this function.
3399    ///
3400    /// Errors when the table / index doesn't exist, the index isn't
3401    /// `BTree`, the index column type isn't u64-coercible (cold-tier
3402    /// pre-condition), or a source segment fails its in-place
3403    /// row-body lookup (would indicate prior catalog corruption).
3404    pub fn compact_cold_segments(
3405        &mut self,
3406        table_name: &str,
3407        index_name: &str,
3408        target_segment_bytes: u64,
3409    ) -> Result<CompactReport, StorageError> {
3410        // --- validation phase ----------------------------------
3411        let t = self.get(table_name).ok_or_else(|| {
3412            StorageError::Corrupt(format!(
3413                "compact_cold_segments: table {table_name:?} not found"
3414            ))
3415        })?;
3416        let idx = t
3417            .indices
3418            .iter()
3419            .find(|i| i.name == index_name)
3420            .ok_or_else(|| {
3421                StorageError::Corrupt(format!(
3422                    "compact_cold_segments: index {index_name:?} not found on {table_name:?}"
3423                ))
3424            })?;
3425        let map = match &idx.kind {
3426            IndexKind::BTree(m) => m,
3427            IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
3428                return Err(StorageError::Corrupt(format!(
3429                    "compact_cold_segments: index {index_name:?} is not BTree; \
3430                     compaction applies only to BTree cold-tier indices"
3431                )));
3432            }
3433        };
3434
3435        // --- gather phase --------------------------------------
3436        // Step A: every segment_id this BTree index Cold-references.
3437        let mut referenced_ids: BTreeSet<u32> = BTreeSet::new();
3438        for (_key, locators) in map.iter() {
3439            for loc in locators {
3440                if let RowLocator::Cold { segment_id, .. } = loc {
3441                    referenced_ids.insert(*segment_id);
3442                }
3443            }
3444        }
3445        // Step B: keep only the small + still-active ones.
3446        let candidate_set: BTreeSet<u32> = referenced_ids
3447            .into_iter()
3448            .filter(|id| {
3449                self.cold_segments
3450                    .get(*id as usize)
3451                    .and_then(|s| s.as_deref())
3452                    .is_some_and(|s| (s.bytes().len() as u64) < target_segment_bytes)
3453            })
3454            .collect();
3455        if candidate_set.len() < 2 {
3456            return Ok(CompactReport {
3457                sources: Vec::new(),
3458                merged_segment_id: None,
3459                merged_segment_bytes: Vec::new(),
3460                merged_rows: 0,
3461                deleted_rows_pruned: 0,
3462                bytes_reclaimed_estimate: 0,
3463            });
3464        }
3465        // Step C: pre-count source rows for the deleted-pruned metric.
3466        let mut source_row_count: usize = 0;
3467        let mut source_byte_total: u64 = 0;
3468        for &id in &candidate_set {
3469            let seg = self.cold_segments[id as usize]
3470                .as_ref()
3471                .expect("candidate selected only when slot is Some");
3472            source_row_count = source_row_count.saturating_add(seg.meta().num_rows as usize);
3473            source_byte_total =
3474                source_byte_total.saturating_add(seg.bytes().len() as u64);
3475        }
3476        // Step D: collect (key, body) pairs from every live Cold
3477        // locator pointing at a candidate. dedupe by key — one
3478        // BTree key resolves to at most one cold payload (the
3479        // freezer + promote/shadow flow keeps Cold locators
3480        // unique per key).
3481        let mut collected: BTreeMap<u64, (Vec<u8>, IndexKey)> = BTreeMap::new();
3482        for (key, locators) in map.iter() {
3483            for loc in locators {
3484                let RowLocator::Cold { segment_id, .. } = loc else {
3485                    continue;
3486                };
3487                if !candidate_set.contains(segment_id) {
3488                    continue;
3489                }
3490                let u64_key = index_key_as_u64(key).ok_or_else(|| {
3491                    StorageError::Corrupt(format!(
3492                        "compact_cold_segments: index {index_name:?} has non-integer Cold key; \
3493                         cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
3494                    ))
3495                })?;
3496                let seg = self.cold_segments[*segment_id as usize]
3497                    .as_ref()
3498                    .expect("candidate slot guaranteed Some above");
3499                let payload = seg.lookup(u64_key).ok_or_else(|| {
3500                    StorageError::Corrupt(format!(
3501                        "compact_cold_segments: BTree {index_name:?} points key={u64_key} \
3502                         at segment {segment_id} but the segment lookup missed"
3503                    ))
3504                })?;
3505                collected.insert(u64_key, (payload, key.clone()));
3506                break;
3507            }
3508        }
3509        let merged_rows = collected.len();
3510        let deleted_rows_pruned = source_row_count.saturating_sub(merged_rows);
3511
3512        // Step E: encode the merged segment. `BTreeMap<u64, _>`
3513        // iteration is ascending by key, which is what
3514        // `encode_segment` requires.
3515        let seg_rows: Vec<(u64, Vec<u8>)> = collected
3516            .iter()
3517            .map(|(k, (body, _))| (*k, body.clone()))
3518            .collect();
3519        let (seg_bytes, _meta) =
3520            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).map_err(|e| {
3521                StorageError::Corrupt(format!("compact_cold_segments: encode: {e}"))
3522            })?;
3523        let merged_bytes_len = seg_bytes.len() as u64;
3524
3525        // --- atomic mutation phase ------------------------------
3526        let merged_segment_id = self
3527            .load_segment_bytes(seg_bytes.clone())
3528            .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: load: {e}")))?;
3529
3530        // Rewrite the BTree index: every Cold locator pointing at
3531        // a candidate source becomes a Cold locator pointing at
3532        // the merged segment. Use a flat collect-then-replace
3533        // pattern so we never hold a `&self` borrow across the
3534        // `&mut self` write.
3535        let entries: Vec<(IndexKey, Vec<RowLocator>)> = {
3536            let t = self
3537                .get(table_name)
3538                .expect("table existed at the start of this fn");
3539            let idx = t
3540                .indices
3541                .iter()
3542                .find(|i| i.name == index_name)
3543                .expect("index existed at the start of this fn");
3544            let IndexKind::BTree(map) = &idx.kind else {
3545                unreachable!("validated above");
3546            };
3547            map.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
3548        };
3549        let t_mut = self
3550            .get_mut(table_name)
3551            .expect("table existed at the start of this fn");
3552        let idx_mut = t_mut
3553            .indices
3554            .iter_mut()
3555            .find(|i| i.name == index_name)
3556            .expect("index existed at the start of this fn");
3557        let IndexKind::BTree(map_mut) = &mut idx_mut.kind else {
3558            unreachable!("validated above");
3559        };
3560        for (key, locators) in entries {
3561            let mut new_locs: Vec<RowLocator> = Vec::with_capacity(locators.len());
3562            let mut changed = false;
3563            for loc in &locators {
3564                match *loc {
3565                    RowLocator::Cold {
3566                        segment_id,
3567                        page_offset: _,
3568                    } if candidate_set.contains(&segment_id) => {
3569                        let replacement = RowLocator::Cold {
3570                            segment_id: merged_segment_id,
3571                            page_offset: 0,
3572                        };
3573                        if !new_locs.contains(&replacement) {
3574                            new_locs.push(replacement);
3575                        }
3576                        changed = true;
3577                    }
3578                    other => new_locs.push(other),
3579                }
3580            }
3581            if changed {
3582                map_mut.insert_mut(key, new_locs);
3583            }
3584        }
3585
3586        // Tombstone every source slot. Last step — failures here
3587        // would leave the segment double-referenced in both
3588        // memory + manifest, but `tombstone_segment` only errors
3589        // on out-of-bounds, which we've already validated.
3590        for &id in &candidate_set {
3591            self.tombstone_segment(id)?;
3592        }
3593
3594        let bytes_reclaimed_estimate = source_byte_total.saturating_sub(merged_bytes_len);
3595        Ok(CompactReport {
3596            sources: candidate_set.into_iter().collect(),
3597            merged_segment_id: Some(merged_segment_id),
3598            merged_segment_bytes: seg_bytes,
3599            merged_rows,
3600            deleted_rows_pruned,
3601            bytes_reclaimed_estimate,
3602        })
3603    }
3604
3605    /// Internal helper: scan `(table, index)` for a `Cold` locator
3606    /// keyed by `key`. Returns `Ok(Some((segment_id, page_offset)))`
3607    /// when found, `Ok(None)` when the key has only hot entries
3608    /// or no entries at all, `Err` on the same input-validation
3609    /// errors as the public `promote_cold_row` / `shadow_cold_row`.
3610    fn find_cold_locator(
3611        &self,
3612        table_name: &str,
3613        index_name: &str,
3614        key: &IndexKey,
3615    ) -> Result<Option<(u32, u32)>, StorageError> {
3616        let t = self.get(table_name).ok_or_else(|| {
3617            StorageError::Corrupt(format!("find_cold_locator: table {table_name:?} not found"))
3618        })?;
3619        let idx = t
3620            .indices
3621            .iter()
3622            .find(|i| i.name == index_name)
3623            .ok_or_else(|| {
3624                StorageError::Corrupt(format!(
3625                    "find_cold_locator: index {index_name:?} not found on {table_name:?}"
3626                ))
3627            })?;
3628        if !matches!(idx.kind, IndexKind::BTree(_)) {
3629            return Err(StorageError::Corrupt(format!(
3630                "find_cold_locator: index {index_name:?} is NSW; promote-on-write only applies to BTree indices"
3631            )));
3632        }
3633        for loc in idx.lookup_eq(key) {
3634            if let RowLocator::Cold {
3635                segment_id,
3636                page_offset,
3637            } = *loc
3638            {
3639                return Ok(Some((segment_id, page_offset)));
3640            }
3641        }
3642        Ok(None)
3643    }
3644}
3645
3646/// Coerce an [`IndexKey`] to the `u64` that v5.1 cold-tier
3647/// segments use as their on-disk PK. Returns `None` for keys that
3648/// aren't representable as `u64` — Text PKs need a hash mapping
3649/// the segment writer baked in (deferred to v5.2+), Bool PKs are
3650/// almost never wide enough to be sharded into a cold tier.
3651fn index_key_as_u64(key: &IndexKey) -> Option<u64> {
3652    match key {
3653        // Reinterpret the i64 bit pattern as u64. Cold-tier segments
3654        // are sorted by this u64 view, so the chosen interpretation
3655        // only has to match between insert (bake_segment / freezer)
3656        // and lookup — using cast_unsigned keeps both sides honest
3657        // and silences clippy::cast_sign_loss.
3658        IndexKey::Int(n) => Some(n.cast_unsigned()),
3659        IndexKey::Text(_) | IndexKey::Bool(_) => None,
3660    }
3661}
3662
/// Errors reported by the storage layer: catalog / schema validation,
/// row validation, index operations, and snapshot decoding.
///
/// Marked `#[non_exhaustive]`, so downstream crates must keep a
/// wildcard arm when matching.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum StorageError {
    /// A table with this name already exists in the catalog.
    DuplicateTable {
        name: String,
    },
    /// No table with this name exists in the catalog.
    TableNotFound {
        name: String,
    },
    /// A row carried `actual` values where the table expects `expected`
    /// columns.
    ArityMismatch {
        expected: usize,
        actual: usize,
    },
    /// A value's type (`actual`) differs from the column's declared type
    /// (`expected`). `column` and `position` identify the column in the
    /// rendered message.
    TypeMismatch {
        column: String,
        expected: DataType,
        actual: DataType,
        position: usize,
    },
    /// A NULL was supplied for a column declared NOT NULL.
    NullInNotNull {
        column: String,
    },
    /// Index with this name already exists on the table.
    DuplicateIndex {
        name: String,
    },
    /// Column referenced by an index doesn't exist on the table.
    ColumnNotFound {
        column: String,
    },
    /// On-disk format failed to parse — corrupted file, wrong magic, truncated
    /// payload, or unknown tag bytes.
    ///
    /// Also used in this file for internal consistency failures that are
    /// not strictly parse errors (e.g. `find_cold_locator` reports a
    /// missing table or index through this variant).
    Corrupt(String),
    /// v6.0.4 — ALTER INDEX targeted an index name that doesn't
    /// exist on any table in this catalog.
    IndexNotFound {
        name: String,
    },
    /// v6.0.4 — operation requested isn't supported on this index
    /// kind / column type (e.g. ALTER INDEX REBUILD on a `BTree`
    /// index, or REBUILD WITH (encoding=…) on a non-vector column).
    Unsupported(String),
}
3706
3707impl fmt::Display for StorageError {
3708    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
3709        match self {
3710            Self::DuplicateTable { name } => write!(f, "table already exists: {name}"),
3711            Self::TableNotFound { name } => write!(f, "table not found: {name}"),
3712            Self::ArityMismatch { expected, actual } => write!(
3713                f,
3714                "row arity mismatch: expected {expected} columns, got {actual}"
3715            ),
3716            Self::TypeMismatch {
3717                column,
3718                expected,
3719                actual,
3720                position,
3721            } => write!(
3722                f,
3723                "type mismatch in column {column:?} (position {position}): expected {expected}, got {actual}"
3724            ),
3725            Self::NullInNotNull { column } => {
3726                write!(f, "NULL value in NOT NULL column {column:?}")
3727            }
3728            Self::DuplicateIndex { name } => write!(f, "index already exists: {name}"),
3729            Self::ColumnNotFound { column } => write!(f, "column not found: {column}"),
3730            Self::Corrupt(detail) => write!(f, "corrupt on-disk format: {detail}"),
3731            Self::IndexNotFound { name } => write!(f, "index not found: {name}"),
3732            Self::Unsupported(detail) => write!(f, "unsupported: {detail}"),
3733        }
3734    }
3735}
3736
3737impl ColumnSchema {
3738    pub fn new(name: impl Into<String>, ty: DataType, nullable: bool) -> Self {
3739        Self {
3740            name: name.into(),
3741            ty,
3742            nullable,
3743            default: None,
3744            auto_increment: false,
3745        }
3746    }
3747
3748    /// Builder-style helper to attach a default value to an otherwise
3749    /// plain column schema. Used by the engine when CREATE TABLE
3750    /// specifies `column TYPE DEFAULT <expr>`.
3751    #[must_use]
3752    pub fn with_default(mut self, default: Value) -> Self {
3753        self.default = Some(default);
3754        self
3755    }
3756
3757    /// Builder-style helper to mark a column as `AUTO_INCREMENT`.
3758    #[must_use]
3759    pub const fn with_auto_increment(mut self) -> Self {
3760        self.auto_increment = true;
3761        self
3762    }
3763}
3764
3765impl TableSchema {
3766    pub fn new(name: impl Into<String>, columns: Vec<ColumnSchema>) -> Self {
3767        Self {
3768            name: name.into(),
3769            columns,
3770            hot_tier_bytes: None,
3771            foreign_keys: Vec::new(),
3772        }
3773    }
3774}
3775
3776// =========================================================================
3777// Persistent binary format for the catalog.
3778//
3779// Layout (little-endian throughout):
3780//
3781//   [magic "SPGDB001" 8 bytes][version u8]
3782//   [table_count u32]
3783//   for each table:
3784//       [name_len u16][name bytes]
3785//       [col_count u16]
3786//       for each col:
3787//           [name_len u16][name bytes]
3788//           [type_tag u8 + optional payload]
3789//               1=Int 2=BigInt 3=Float 4=Text 5=Bool
3790//               6=Vector(u32 dim)
3791//               7=SmallInt
3792//               8=Varchar(u32 max)
3793//               9=Char(u32 size)
3794//               10=Numeric(u8 precision, u8 scale)
3795//               11=Date
3796//               12=Timestamp
3797//           [nullable u8]   0/1
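//           [default_tag u8] 0=none 1=value (followed by [value_tag u8] + bytes)
//           [auto_increment u8] 0/1
//       [row_count u32]
//       for each row (pre-v8 layout; v8+ rows use the dense layout noted
//       at the end of this comment), for each col, one [value_tag u8] +
//       value bytes: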
3801//           tag 0 (Null)     → no body
3802//           tag 1 (Int)      → i32 LE
3803//           tag 2 (BigInt)   → i64 LE
3804//           tag 3 (Float)    → f64 LE
3805//           tag 4 (Text)     → u16 LE len + UTF-8 bytes
3806//           tag 5 (Bool)     → u8 0/1
3807//           tag 6 (Vector)   → u32 LE dim + dim×f32 LE
3808//           tag 7 (SmallInt) → i16 LE
3809//           tag 8 (Numeric)  → i128 LE (16 bytes) + u8 scale
3810//           tag 9 (Date)     → i32 LE (days since Unix epoch)
3811//           tag 10 (Timestamp) → i64 LE (microseconds since Unix epoch)
3812//
3813// Bumped to version 3 when NUMERIC was added; to version 4 when
3814// AUTO_INCREMENT (per-column flag) + NSW index `kind` byte landed;
3815// to version 5 when DATE / TIMESTAMP were added; to version 6 when
3816// NSW graph topology started travelling on disk (v2.7); to version 7
3817// when the NSW topology became multi-layer HNSW (v2.13); to version 8
3818// when row encoding switched to schema-driven dense layout (v3.0.2 —
3819// per-row NULL bitmap + per-column fixed-width body, no per-cell type
3820// tag).
3821// =========================================================================
3822
/// Eight-byte magic prefix written at the start of every catalog snapshot
/// by [`Catalog::serialize`] and checked first by [`Catalog::deserialize`].
const FILE_MAGIC: &[u8; 8] = b"SPGDB001";
/// Current catalog snapshot format version emitted by [`Catalog::serialize`].
///
/// v9 (v5.2) extends v8 by serialising `BTree` index entries directly — every
/// `(IndexKey, Vec<RowLocator>)` pair travels on disk with the v5.1
/// `RowLocator::write_le` tag-prefixed codec. v8 `BTree` indices stored no
/// entries at all (the map was rebuilt from `Table::rows` on load); v9
/// preserves on-disk Cold locators so freezer-produced cold-tier index
/// entries survive a catalog snapshot round-trip. v8 snapshots are accepted
/// by version dispatch in [`Catalog::deserialize`] — every entry decodes
/// as `RowLocator::Hot(_)` via `add_index` rebuild, identical to v5.1
/// behaviour.
///
/// v6.7.2 — bumped from 10 to 11 to append per-table
/// `hot_tier_bytes: Option<u64>` after the per-table indices
/// section. v10 catalogs (v6.7.1) load with `hot_tier_bytes =
/// None` for every table (the deserialiser short-circuits when
/// version < 11). v11 snapshots opened by a pre-v6.7.2 binary
/// fail loudly at the version check, matching the v6.1.2 /
/// v6.1.4 / v6.2.0 / v6.7.1 envelope-bump upgrade fences.
///
/// v6.8.0 — bumped from 11 to 12: per-index
/// `included_columns: Vec<u16>` appended at the tail of each
/// index payload. v11 (= v6.7.2) catalogs load with
/// `included_columns = Vec::new()` for every index — same
/// "older readers, append-only extension" pattern as the v6.7.2
/// hot_tier_bytes byte.
///
/// NOTE(review): this history stops at 12 while the constant is 14.
/// Version 13 gates the per-table FOREIGN KEY appendix (see the
/// `version >= 13` check in `deserialize_table`); what changed in 14 is
/// not recorded here — TODO confirm and document.
const FILE_VERSION: u8 = 14;
/// Oldest format version [`Catalog::deserialize`] still accepts. v8 is the
/// v3.0.2 dense-row layout; pre-v8 catalogs require an offline migration.
const MIN_SUPPORTED_FILE_VERSION: u8 = 8;

// IndexKey wire format (v9): one tag byte followed by the payload.
//   tag 0 = Int  → [i64 LE]
//   tag 1 = Text → [u16 LE len + UTF-8 bytes] (via write_str / read_str)
//   tag 2 = Bool → [u8 0/1]
const INDEX_KEY_TAG_INT: u8 = 0;
const INDEX_KEY_TAG_TEXT: u8 = 1;
const INDEX_KEY_TAG_BOOL: u8 = 2;
3861
3862impl Catalog {
3863    /// Serialize the whole catalog (schema + every row) into a self-contained
3864    /// byte buffer. Format is documented above the impl block.
3865    pub fn serialize(&self) -> Vec<u8> {
3866        let mut out = Vec::with_capacity(64);
3867        out.extend_from_slice(FILE_MAGIC);
3868        out.push(FILE_VERSION);
3869        write_u32(
3870            &mut out,
3871            u32::try_from(self.tables.len()).expect("≤ 4G tables"),
3872        );
3873        for t in &self.tables {
3874            write_str(&mut out, &t.schema.name);
3875            write_u16(
3876                &mut out,
3877                u16::try_from(t.schema.columns.len()).expect("≤ 65k columns/table"),
3878            );
3879            for c in &t.schema.columns {
3880                write_str(&mut out, &c.name);
3881                write_data_type(&mut out, c.ty);
3882                out.push(u8::from(c.nullable));
3883                match &c.default {
3884                    None => out.push(0),
3885                    Some(v) => {
3886                        out.push(1);
3887                        write_value(&mut out, v);
3888                    }
3889                }
3890                out.push(u8::from(c.auto_increment));
3891            }
3892            write_u32(
3893                &mut out,
3894                u32::try_from(t.rows.len()).expect("≤ 4G rows/table"),
3895            );
3896            // v3.0.2 dense row encoding (FILE_VERSION 8): per-row NULL
3897            // bitmap, then tightly-packed bodies. Identical wire format
3898            // as before — extracted into `encode_row_body_dense` so cold-
3899            // tier segments (v5.1+) can share the encoding.
3900            for row in &t.rows {
3901                out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
3902            }
3903            // Index definitions. Per-index payload:
3904            //   [name][col_pos u16][kind u8]
3905            //     kind 0 = B-tree           (no params — rebuilt on load)
3906            //     kind 1 = NSW graph        (u16 M + serialized graph)
3907            // For NSW the graph topology travels on disk so startup
3908            // doesn't re-run the O(n²M) rebuild — see v2.7 notes.
3909            write_u16(
3910                &mut out,
3911                u16::try_from(t.indices.len()).expect("≤ 65k indices/table"),
3912            );
3913            for idx in &t.indices {
3914                write_str(&mut out, &idx.name);
3915                write_u16(
3916                    &mut out,
3917                    u16::try_from(idx.column_position).expect("≤ 65k columns/table"),
3918                );
3919                match &idx.kind {
3920                    IndexKind::BTree(map) => {
3921                        out.push(0);
3922                        // v9: serialise the full PB map. Each entry's
3923                        // RowLocator list travels with the tag-prefixed
3924                        // codec from `row_locator::write_le`, so freezer-
3925                        // produced Cold locators survive a snapshot
3926                        // round-trip. v8 BTree wrote nothing here and
3927                        // rebuilt from rows — v9 readers tolerate v8 by
3928                        // version dispatch in `Catalog::deserialize`.
3929                        write_u32(
3930                            &mut out,
3931                            u32::try_from(map.len()).expect("≤ 4G index entries/index"),
3932                        );
3933                        for (key, locators) in map {
3934                            write_index_key(&mut out, key);
3935                            write_u32(
3936                                &mut out,
3937                                u32::try_from(locators.len()).expect("≤ 4G locators/key"),
3938                            );
3939                            for loc in locators {
3940                                loc.write_le(&mut out);
3941                            }
3942                        }
3943                    }
3944                    IndexKind::Nsw(g) => {
3945                        out.push(1);
3946                        write_u16(&mut out, u16::try_from(g.m).expect("≤ 65k NSW neighbours"));
3947                        write_nsw_graph(&mut out, g);
3948                    }
3949                    IndexKind::Brin { column_type } => {
3950                        // v6.7.1 — tag byte 2 = BRIN. Payload is the
3951                        // column type code (1 byte mapping to the
3952                        // shared DataType numeric encoding); no
3953                        // further data — BRIN summaries live in
3954                        // cold segments, not the catalog.
3955                        out.push(2);
3956                        write_data_type(&mut out, *column_type);
3957                    }
3958                }
3959                // v6.8.0 — included_columns appendix per index.
3960                // Layout: [u16 num_included][num × u16 column_position].
3961                // v11 readers stop before this u16 (deserialise loop
3962                // gated on version >= 12); v12+ readers always
3963                // consume it. Empty Vec serialises as a bare 0u16.
3964                write_u16(
3965                    &mut out,
3966                    u16::try_from(idx.included_columns.len())
3967                        .expect("≤ 65k INCLUDE columns/index"),
3968                );
3969                for col_pos in &idx.included_columns {
3970                    write_u16(
3971                        &mut out,
3972                        u16::try_from(*col_pos).expect("≤ 65k columns/table"),
3973                    );
3974                }
3975                // v6.8.1 — partial_predicate appendix per index.
3976                // Layout: [u8 has_pred][u16 LE len][bytes (if has_pred)].
3977                // Same v12 gate as included_columns.
3978                match &idx.partial_predicate {
3979                    None => out.push(0),
3980                    Some(pred) => {
3981                        out.push(1);
3982                        write_str(&mut out, pred);
3983                    }
3984                }
3985                // v6.8.2 — expression appendix. Same shape as
3986                // partial_predicate.
3987                match &idx.expression {
3988                    None => out.push(0),
3989                    Some(expr) => {
3990                        out.push(1);
3991                        write_str(&mut out, expr);
3992                    }
3993                }
3994            }
3995            // v6.7.2 — per-table hot_tier_bytes Option<u64>.
3996            // Layout: [u8 has_value][u64 LE value (if has_value)].
3997            // v10 readers stop before this byte (deserialise loop
3998            // gated on version >= 11); v11+ readers always
3999            // consume it.
4000            match t.schema.hot_tier_bytes {
4001                None => out.push(0),
4002                Some(n) => {
4003                    out.push(1);
4004                    out.extend_from_slice(&n.to_le_bytes());
4005                }
4006            }
4007            // v7.6.1 — FOREIGN KEY appendix (catalog FILE_VERSION 13+).
4008            // Layout: [u16 LE fk_count]
4009            //   per fk:
4010            //     [u8 has_name] [str name (if has_name)]
4011            //     [u16 LE local_arity] [u16 LE local_pos]*arity
4012            //     [str parent_table]
4013            //     [u16 LE parent_arity] [u16 LE parent_pos]*arity
4014            //     [u8 on_delete_tag] [u8 on_update_tag]
4015            // Older catalogs (v12 and below) skip this block entirely;
4016            // their reader stops before this byte.
4017            write_u16(
4018                &mut out,
4019                u16::try_from(t.schema.foreign_keys.len()).expect("≤ 65k FKs/table"),
4020            );
4021            for fk in &t.schema.foreign_keys {
4022                match &fk.name {
4023                    None => out.push(0),
4024                    Some(n) => {
4025                        out.push(1);
4026                        write_str(&mut out, n);
4027                    }
4028                }
4029                write_u16(
4030                    &mut out,
4031                    u16::try_from(fk.local_columns.len()).expect("≤ 65k FK columns"),
4032                );
4033                for &p in &fk.local_columns {
4034                    write_u16(
4035                        &mut out,
4036                        u16::try_from(p).expect("≤ 65k columns/table"),
4037                    );
4038                }
4039                write_str(&mut out, &fk.parent_table);
4040                write_u16(
4041                    &mut out,
4042                    u16::try_from(fk.parent_columns.len()).expect("≤ 65k FK parent columns"),
4043                );
4044                for &p in &fk.parent_columns {
4045                    write_u16(
4046                        &mut out,
4047                        u16::try_from(p).expect("≤ 65k columns/table"),
4048                    );
4049                }
4050                out.push(fk.on_delete.tag());
4051                out.push(fk.on_update.tag());
4052            }
4053        }
4054        out
4055    }
4056
4057    /// Deserialize a previously-serialized catalog. Rejects bad magic, version
4058    /// mismatch, unknown tags, truncation, and trailing bytes.
4059    pub fn deserialize(buf: &[u8]) -> Result<Self, StorageError> {
4060        let mut cur = Cursor::new(buf);
4061        let magic = cur.take(8)?;
4062        if magic != FILE_MAGIC {
4063            return Err(StorageError::Corrupt(format!(
4064                "bad magic: expected SPGDB001, got {magic:?}"
4065            )));
4066        }
4067        let version = cur.read_u8()?;
4068        if !(MIN_SUPPORTED_FILE_VERSION..=FILE_VERSION).contains(&version) {
4069            return Err(StorageError::Corrupt(format!(
4070                "unsupported file version: {version} (supported: {MIN_SUPPORTED_FILE_VERSION}..={FILE_VERSION})"
4071            )));
4072        }
4073        let table_count = cur.read_u32()? as usize;
4074        let mut cat = Self::new();
4075        for _ in 0..table_count {
4076            deserialize_table(&mut cur, &mut cat, version)?;
4077        }
4078        if cur.pos < buf.len() {
4079            return Err(StorageError::Corrupt(format!(
4080                "trailing bytes: {} unread",
4081                buf.len() - cur.pos
4082            )));
4083        }
4084        Ok(cat)
4085    }
4086}
4087
4088/// Per-table deserialize body — schema, rows, indices. Pulled out of
4089/// `Catalog::deserialize` to keep the latter under the line-budget lint
4090/// and to give the row hot loop its own scope (so the borrow on `t`
4091/// stays scoped here rather than across the whole catalog loop).
4092fn deserialize_table(
4093    cur: &mut Cursor<'_>,
4094    cat: &mut Catalog,
4095    version: u8,
4096) -> Result<(), StorageError> {
4097    let table_name = cur.read_str()?;
4098    let name = table_name.clone();
4099    let col_count = cur.read_u16()? as usize;
4100    let mut cols = Vec::with_capacity(col_count);
4101    for _ in 0..col_count {
4102        let c_name = cur.read_str()?;
4103        let ty = cur.read_data_type()?;
4104        let nullable = cur.read_u8()? != 0;
4105        let default = match cur.read_u8()? {
4106            0 => None,
4107            1 => Some(cur.read_value()?),
4108            other => {
4109                return Err(StorageError::Corrupt(format!(
4110                    "unknown default tag: {other}"
4111                )));
4112            }
4113        };
4114        let auto_increment = cur.read_u8()? != 0;
4115        cols.push(ColumnSchema {
4116            name: c_name,
4117            ty,
4118            nullable,
4119            default,
4120            auto_increment,
4121        });
4122    }
4123    let n_cols = cols.len();
4124    cat.create_table(TableSchema::new(name, cols))?;
4125    // Vec<Table> with insertion-order semantics — the just-pushed
4126    // table is at the end. Sidecar `by_name` is already wired up but
4127    // we skip the map lookup here since we know the position.
4128    let t = cat.tables.last_mut().expect("create_table just pushed");
4129    deserialize_rows(cur, t, n_cols)?;
4130    deserialize_indices(cur, t, version)?;
4131    // v6.7.2 — per-table hot_tier_bytes appendix. v11+ writes
4132    // `[u8 has_value][u64 LE value (if has_value)]`. v10 / v9 / v8
4133    // catalogs skip this entirely (the deserialiser reads no extra
4134    // bytes; the table's hot_tier_bytes stays None from
4135    // TableSchema::new).
4136    if version >= 11 {
4137        let has = cur.read_u8()?;
4138        let hot_tier_bytes = match has {
4139            0 => None,
4140            1 => Some(cur.read_u64()?),
4141            other => {
4142                return Err(StorageError::Corrupt(format!(
4143                    "hot_tier_bytes appendix: unknown has-value byte {other}"
4144                )));
4145            }
4146        };
4147        t.schema_mut().hot_tier_bytes = hot_tier_bytes;
4148    }
4149    // v7.6.1 — FOREIGN KEY appendix (FILE_VERSION 13+). v12 / v11 / …
4150    // catalogs skip this entirely.
4151    if version >= 13 {
4152        let fk_count = cur.read_u16()? as usize;
4153        let mut fks = Vec::with_capacity(fk_count);
4154        for _ in 0..fk_count {
4155            let name = match cur.read_u8()? {
4156                0 => None,
4157                1 => Some(cur.read_str()?),
4158                other => {
4159                    return Err(StorageError::Corrupt(format!(
4160                        "FK appendix: unknown has-name byte {other}"
4161                    )));
4162                }
4163            };
4164            let local_arity = cur.read_u16()? as usize;
4165            let mut local_columns = Vec::with_capacity(local_arity);
4166            for _ in 0..local_arity {
4167                local_columns.push(cur.read_u16()? as usize);
4168            }
4169            let parent_table = cur.read_str()?;
4170            let parent_arity = cur.read_u16()? as usize;
4171            if parent_arity != local_arity {
4172                return Err(StorageError::Corrupt(format!(
4173                    "FK arity mismatch in catalog: local {local_arity} vs parent {parent_arity}"
4174                )));
4175            }
4176            let mut parent_columns = Vec::with_capacity(parent_arity);
4177            for _ in 0..parent_arity {
4178                parent_columns.push(cur.read_u16()? as usize);
4179            }
4180            let on_delete = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
4181                StorageError::Corrupt("FK appendix: unknown on_delete tag".into())
4182            })?;
4183            let on_update = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
4184                StorageError::Corrupt("FK appendix: unknown on_update tag".into())
4185            })?;
4186            fks.push(ForeignKeyConstraint {
4187                name,
4188                local_columns,
4189                parent_table,
4190                parent_columns,
4191                on_delete,
4192                on_update,
4193            });
4194        }
4195        t.schema_mut().foreign_keys = fks;
4196    }
4197    let _ = table_name;
4198    Ok(())
4199}
4200
4201fn deserialize_rows(
4202    cur: &mut Cursor<'_>,
4203    t: &mut Table,
4204    _n_cols: usize,
4205) -> Result<(), StorageError> {
4206    let row_count = cur.read_u32()? as usize;
4207    // v4.39: PV has no `reserve` (the BVT doesn't preallocate a
4208    // contiguous buffer); we just push directly and let the trie
4209    // grow. v5.1: row decode reuses `decode_row_body_dense` so the
4210    // catalog and cold-tier segments share one row codec.
4211    let mut hot_bytes: u64 = 0;
4212    for _ in 0..row_count {
4213        let tail = &cur.buf[cur.pos..];
4214        let (row, consumed) = decode_row_body_dense(tail, &t.schema)?;
4215        cur.pos += consumed;
4216        // v5.2.1: account for hot bytes as we go; the snapshot's row
4217        // block bytes are exactly what `encode_row_body_dense` would
4218        // produce, so `consumed` would do too — but going via the
4219        // helper keeps the counter's definition coupled to the
4220        // encoder rather than the snapshot's row prefix layout.
4221        hot_bytes = hot_bytes.saturating_add(row_body_encoded_len(&row, &t.schema) as u64);
4222        t.rows.push_mut(row);
4223    }
4224    t.hot_bytes = hot_bytes;
4225    Ok(())
4226}
4227
4228fn deserialize_indices(
4229    cur: &mut Cursor<'_>,
4230    t: &mut Table,
4231    version: u8,
4232) -> Result<(), StorageError> {
4233    let index_count = cur.read_u16()? as usize;
4234    for _ in 0..index_count {
4235        let idx_name = cur.read_str()?;
4236        let col_pos = cur.read_u16()? as usize;
4237        let column_name = t
4238            .schema
4239            .columns
4240            .get(col_pos)
4241            .ok_or_else(|| {
4242                StorageError::Corrupt(format!(
4243                    "index {idx_name:?} points at non-existent column position {col_pos}"
4244                ))
4245            })?
4246            .name
4247            .clone();
4248        let kind_tag = cur.read_u8()?;
4249        match kind_tag {
4250            0 => {
4251                if version >= 9 {
4252                    // v9+: BTree entries serialised inline (tag-prefixed
4253                    // locator codec). Restore the map directly so any
4254                    // freezer-produced Cold locators come back exactly
4255                    // as they went out.
4256                    let map = read_btree_map(cur)?;
4257                    t.restore_btree_index(idx_name, &column_name, map)?;
4258                } else {
4259                    // v8: no entries on disk; rebuild from rows. Every
4260                    // entry is materialised as `RowLocator::Hot(i)` —
4261                    // semantically identical to the v5.1 in-memory state
4262                    // since v8 catalogs never produced Cold locators.
4263                    t.add_index(idx_name, &column_name)?;
4264                }
4265            }
4266            1 => {
4267                let m = cur.read_u16()? as usize;
4268                let graph = cur.read_nsw_graph(m)?;
4269                t.restore_nsw_index(idx_name, &column_name, graph)?;
4270            }
4271            2 => {
4272                // v6.7.1 — BRIN tag. Payload is the column type
4273                // tag. No further data — summaries live in cold
4274                // segments.
4275                let column_type = cur.read_data_type()?;
4276                t.restore_brin_index(idx_name, &column_name, column_type)?;
4277            }
4278            other => {
4279                return Err(StorageError::Corrupt(format!(
4280                    "unknown index kind tag: {other}"
4281                )));
4282            }
4283        }
4284        // v6.8.0 — included_columns appendix per index. v11- snapshots
4285        // stop before this u16; v12+ always carries it (possibly 0).
4286        if version >= 12 {
4287            let num_included = cur.read_u16()? as usize;
4288            if num_included > 0 {
4289                let mut included: Vec<usize> = Vec::with_capacity(num_included);
4290                for _ in 0..num_included {
4291                    let cp = cur.read_u16()? as usize;
4292                    if cp >= t.schema.columns.len() {
4293                        return Err(StorageError::Corrupt(format!(
4294                            "INCLUDE column position {cp} out of range \
4295                             ({} schema columns)",
4296                            t.schema.columns.len()
4297                        )));
4298                    }
4299                    included.push(cp);
4300                }
4301                if let Some(last) = t.indices.last_mut() {
4302                    last.included_columns = included;
4303                }
4304            }
4305            // v6.8.1 — partial_predicate appendix.
4306            match cur.read_u8()? {
4307                0 => {}
4308                1 => {
4309                    let pred = cur.read_str()?;
4310                    if let Some(last) = t.indices.last_mut() {
4311                        last.partial_predicate = Some(pred);
4312                    }
4313                }
4314                other => {
4315                    return Err(StorageError::Corrupt(format!(
4316                        "partial_predicate tag: unknown byte {other}"
4317                    )));
4318                }
4319            }
4320            // v6.8.2 — expression appendix.
4321            match cur.read_u8()? {
4322                0 => {}
4323                1 => {
4324                    let expr = cur.read_str()?;
4325                    if let Some(last) = t.indices.last_mut() {
4326                        last.expression = Some(expr);
4327                    }
4328                }
4329                other => {
4330                    return Err(StorageError::Corrupt(format!(
4331                        "expression tag: unknown byte {other}"
4332                    )));
4333                }
4334            }
4335        }
4336    }
4337    Ok(())
4338}
4339
4340/// Parse a v9 `BTree` index payload — `[u32 entry_count]` followed by
4341/// `entry_count` `(IndexKey, Vec<RowLocator>)` pairs. The locator list
4342/// uses the v5.1 tag-prefixed wire format (`RowLocator::read_le`).
4343fn read_btree_map(
4344    cur: &mut Cursor<'_>,
4345) -> Result<PersistentBTreeMap<IndexKey, Vec<RowLocator>>, StorageError> {
4346    let entry_count = cur.read_u32()? as usize;
4347    let mut map = PersistentBTreeMap::new();
4348    for _ in 0..entry_count {
4349        let key = cur.read_index_key()?;
4350        let locator_count = cur.read_u32()? as usize;
4351        let mut locators = Vec::with_capacity(locator_count);
4352        for _ in 0..locator_count {
4353            let tail = &cur.buf[cur.pos..];
4354            let (loc, consumed) = RowLocator::read_le(tail).map_err(|e| {
4355                StorageError::Corrupt(format!("row_locator decode at offset {}: {e}", cur.pos))
4356            })?;
4357            cur.pos += consumed;
4358            locators.push(loc);
4359        }
4360        map.insert_mut(key, locators);
4361    }
4362    Ok(map)
4363}
4364
4365// --- low-level binary helpers ---------------------------------------------
4366
4367/// Write a `DataType` as a tag byte + optional payload (Vector carries its
4368/// `u32` dimension). Inverse: [`read_data_type`].
4369/// Serialize an HNSW graph after the `[kind=1][u16 M]` header (v7).
4370/// Layout:
4371/// - `[u16 m_max_0]`
4372/// - `[entry u32]` — `u32::MAX` means `None`, else the entry node index
4373/// - `[u8 entry_level]`
4374/// - `[node_count u32]`
4375/// - for each node: `[u8 level]`  (top layer for this node)
4376/// - `[layer_count u8]`
4377/// - for each layer `0..layer_count`:
4378///     - `[u32 layer_node_count]` (== `node_count`; per-layer slot)
4379///     - for each node: `[u16 neighbor_count] [u32 neighbor]*`
4380fn write_nsw_graph(out: &mut Vec<u8>, g: &NswGraph) {
4381    let entry = g.entry.map_or(u32::MAX, |e| {
4382        u32::try_from(e).expect("NSW entry fits in u32")
4383    });
4384    write_u16(
4385        out,
4386        u16::try_from(g.m_max_0).expect("HNSW m_max_0 fits in u16"),
4387    );
4388    out.extend_from_slice(&entry.to_le_bytes());
4389    out.push(g.entry_level);
4390    let node_count = g.levels.len();
4391    write_u32(
4392        out,
4393        u32::try_from(node_count).expect("HNSW node count fits in u32"),
4394    );
4395    for &lvl in &g.levels {
4396        out.push(lvl);
4397    }
4398    let layer_count = u8::try_from(g.layers.len()).expect("HNSW layer count ≤ 255");
4399    out.push(layer_count);
4400    for layer in &g.layers {
4401        write_u32(
4402            out,
4403            u32::try_from(layer.len()).expect("HNSW per-layer node count fits in u32"),
4404        );
4405        for neighbors in layer {
4406            write_u16(
4407                out,
4408                u16::try_from(neighbors.len()).expect("HNSW neighbour list fits in u16"),
4409            );
4410            // v6.1.x: neighbour slot is already u32 in memory; just
4411            // emit the raw bytes. (v6.0 stored usize and converted
4412            // here.)
4413            for &peer in neighbors {
4414                write_u32(out, peer);
4415            }
4416        }
4417    }
4418}
4419
4420fn write_data_type(out: &mut Vec<u8>, t: DataType) {
4421    match t {
4422        DataType::Int => out.push(1),
4423        DataType::BigInt => out.push(2),
4424        DataType::Float => out.push(3),
4425        DataType::Text => out.push(4),
4426        DataType::Bool => out.push(5),
4427        DataType::Vector { dim, encoding } => match encoding {
4428            // Tag 6: pre-v6 F32 vector. Layout unchanged; pre-v6
4429            // binaries continue to deserialise this exactly as
4430            // before.
4431            VecEncoding::F32 => {
4432                out.push(6);
4433                out.extend_from_slice(&dim.to_le_bytes());
4434            }
4435            // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
4436            // forward-compat fence story as SQ8 below.
4437            VecEncoding::F16 => {
4438                out.push(15);
4439                out.extend_from_slice(&dim.to_le_bytes());
4440            }
4441            // v6.0.1: new tag 14 for `VECTOR(N) USING SQ8` column
4442            // type. Pre-v6 readers fall through `read_data_type`'s
4443            // catch-all and surface `Corrupt("unknown data type tag")`
4444            // — the explicit forward-compat fence called out in
4445            // V6_DESIGN deliberation #5.
4446            VecEncoding::Sq8 => {
4447                out.push(14);
4448                out.extend_from_slice(&dim.to_le_bytes());
4449            }
4450        },
4451        DataType::SmallInt => out.push(7),
4452        DataType::Varchar(max) => {
4453            out.push(8);
4454            out.extend_from_slice(&max.to_le_bytes());
4455        }
4456        DataType::Char(size) => {
4457            out.push(9);
4458            out.extend_from_slice(&size.to_le_bytes());
4459        }
4460        DataType::Numeric { precision, scale } => {
4461            out.push(10);
4462            out.push(precision);
4463            out.push(scale);
4464        }
4465        DataType::Date => out.push(11),
4466        DataType::Timestamp => out.push(12),
4467        // v7.9.2 — tag 17 for TIMESTAMPTZ. Body = i64 microseconds
4468        // UTC, identical to tag 12. Only the schema-side type tag
4469        // differs (for wire OID advertisement).
4470        DataType::Timestamptz => out.push(17),
4471        // INTERVAL is runtime-only — CREATE TABLE never produces a
4472        // column with this type, so write_data_type must not be called
4473        // on it. (Disk-format codepoint reserved for a future v3 where
4474        // INTERVAL becomes storable.)
4475        DataType::Interval => {
4476            unreachable!("DataType::Interval has no on-disk encoding in v2.11")
4477        }
4478        DataType::Json => out.push(13),
4479        // v7.9.0: tag 16 for `JSONB`. Same on-disk layout as
4480        // tag 13 — only the wire OID differs.
4481        DataType::Jsonb => out.push(16),
4482    }
4483}
4484
4485impl Cursor<'_> {
4486    fn read_data_type(&mut self) -> Result<DataType, StorageError> {
4487        let tag = self.read_u8()?;
4488        match tag {
4489            1 => Ok(DataType::Int),
4490            2 => Ok(DataType::BigInt),
4491            3 => Ok(DataType::Float),
4492            4 => Ok(DataType::Text),
4493            5 => Ok(DataType::Bool),
4494            6 => Ok(DataType::Vector {
4495                dim: self.read_u32()?,
4496                encoding: VecEncoding::F32,
4497            }),
4498            7 => Ok(DataType::SmallInt),
4499            8 => Ok(DataType::Varchar(self.read_u32()?)),
4500            9 => Ok(DataType::Char(self.read_u32()?)),
4501            10 => {
4502                let precision = self.read_u8()?;
4503                let scale = self.read_u8()?;
4504                Ok(DataType::Numeric { precision, scale })
4505            }
4506            11 => Ok(DataType::Date),
4507            12 => Ok(DataType::Timestamp),
4508            13 => Ok(DataType::Json),
4509            14 => Ok(DataType::Vector {
4510                dim: self.read_u32()?,
4511                encoding: VecEncoding::Sq8,
4512            }),
4513            // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
4514            // [u32 dim] type-tag payload as F32 / SQ8; the encoding
4515            // lives in the tag byte itself.
4516            15 => Ok(DataType::Vector {
4517                dim: self.read_u32()?,
4518                encoding: VecEncoding::F16,
4519            }),
4520            // v7.9.0: tag 16 for `JSONB`. Storage shape == Json;
4521            // we only carry the type tag so the wire layer can
4522            // emit PG OID 3802 instead of 114.
4523            16 => Ok(DataType::Jsonb),
4524            // v7.9.2: tag 17 for `TIMESTAMPTZ`. Storage shape ==
4525            // Timestamp (i64 microseconds UTC); only the wire OID
4526            // (1184) differs.
4527            17 => Ok(DataType::Timestamptz),
4528            other => Err(StorageError::Corrupt(format!(
4529                "unknown data type tag: {other}"
4530            ))),
4531        }
4532    }
4533}
4534
4535/// Fast computation of the byte length [`encode_row_body_dense`]
4536/// would produce, without allocating the output buffer. Mirrors the
4537/// encoder's per-column body sizing so the v5.2.1 `Table::hot_bytes`
4538/// incremental counter doesn't pay an alloc-per-insert tax. Returns
4539/// the exact same `usize` as `encode_row_body_dense(row, schema).len()`.
4540pub fn row_body_encoded_len(row: &Row, schema: &TableSchema) -> usize {
4541    debug_assert_eq!(
4542        row.values.len(),
4543        schema.columns.len(),
4544        "row_body_encoded_len: row arity must match schema"
4545    );
4546    let bitmap_bytes = schema.columns.len().div_ceil(8);
4547    let mut n = bitmap_bytes;
4548    for (col_idx, v) in row.values.iter().enumerate() {
4549        if matches!(v, Value::Null) {
4550            continue;
4551        }
4552        n += value_body_encoded_len(v, schema.columns[col_idx].ty);
4553    }
4554    n
4555}
4556
4557/// Byte length a single cell consumes when written by
4558/// `write_value_body`. Used by [`row_body_encoded_len`]; kept in
4559/// lock-step with the encoder. The `_ty` slot is reserved for future
4560/// type-dependent encodings — every variant currently writes a fixed
4561/// body shape regardless of the declared column type.
4562fn value_body_encoded_len(v: &Value, _ty: DataType) -> usize {
4563    match v {
4564        Value::SmallInt(_) => 2,
4565        // 4-byte body: i32 / Date.
4566        Value::Int(_) | Value::Date(_) => 4,
4567        // 8-byte body: i64 / f64 / Timestamp.
4568        Value::BigInt(_) | Value::Float(_) | Value::Timestamp(_) => 8,
4569        Value::Bool(_) => 1,
4570        // Text/Varchar/Char/Json share the [u16 len][utf-8] layout.
4571        Value::Text(s) | Value::Json(s) => 2 + s.len(),
4572        // [u32 dim][f32 * dim]
4573        Value::Vector(vec) => 4 + 4 * vec.len(),
4574        // v6.0.1: SQ8 cell on-disk shape — [u32 dim][f32 min]
4575        // [f32 max][u8 * dim] = 12 + dim bytes. `hot_bytes`
4576        // tracking on `Table::insert` calls this every row, so
4577        // returning the real size now (even though the actual
4578        // `write_value_body` writer lands in step 6) keeps the
4579        // sizing arithmetic honest for in-memory benches.
4580        Value::Sq8Vector(q) => 4 + 4 + 4 + q.bytes.len(),
4581        // v6.0.3: halfvec on-disk shape — [u32 dim][u16 LE * dim]
4582        // = 4 + 2 * dim bytes.
4583        Value::HalfVector(h) => 4 + h.bytes.len(),
4584        // [i128 scaled][u8 scale]
4585        Value::Numeric { .. } => 16 + 1,
4586        // NULL is encoded only in the bitmap, never in the body.
4587        Value::Null => 0,
4588        // INTERVAL has no on-disk encoding (see write_value_body).
4589        Value::Interval { .. } => {
4590            unreachable!("Value::Interval has no on-disk encoding")
4591        }
4592    }
4593}
4594
4595/// Encode one row's body in the v3.0.2 dense format (`FILE_VERSION`
4596/// 8): per-row NULL bitmap (1 bit/col, ceil(cols/8) bytes), then
4597/// each non-NULL cell as `write_value_body`. Same wire shape the
4598/// catalog snapshot writes per row inside its rows-block. Exposed
4599/// pub so v5.1+ cold-tier segment writers can produce row payloads
4600/// that the catalog [`decode_row_body_dense`] decodes 1:1.
4601///
4602/// `row.values.len()` must equal `schema.columns.len()` — the row
4603/// is expected to have been validated by `Table::insert` (the
4604/// engine's INSERT path) before reaching this function.
4605pub fn encode_row_body_dense(row: &Row, schema: &TableSchema) -> Vec<u8> {
4606    debug_assert_eq!(
4607        row.values.len(),
4608        schema.columns.len(),
4609        "dense encode: row arity must match schema"
4610    );
4611    let bitmap_bytes = schema.columns.len().div_ceil(8);
4612    // 8 B per fixed-width cell is a reasonable average; the buffer
4613    // grows past this for variable-width Text/Vector cells.
4614    let mut out = Vec::with_capacity(bitmap_bytes + schema.columns.len() * 8);
4615    let bitmap_offset = out.len();
4616    out.resize(bitmap_offset + bitmap_bytes, 0);
4617    for (i, v) in row.values.iter().enumerate() {
4618        if matches!(v, Value::Null) {
4619            out[bitmap_offset + i / 8] |= 1 << (i % 8);
4620        }
4621    }
4622    for (col_idx, v) in row.values.iter().enumerate() {
4623        if matches!(v, Value::Null) {
4624            continue;
4625        }
4626        write_value_body(&mut out, v, schema.columns[col_idx].ty);
4627    }
4628    out
4629}
4630
4631/// Inverse of [`encode_row_body_dense`]. Reads one row's body from
4632/// `bytes` and returns it plus the number of bytes consumed (so a
4633/// caller decoding a back-to-back stream of rows can advance its
4634/// cursor). Returns `StorageError::Corrupt` on truncation, bad
4635/// UTF-8, or unknown cell tags.
4636pub fn decode_row_body_dense(
4637    bytes: &[u8],
4638    schema: &TableSchema,
4639) -> Result<(Row, usize), StorageError> {
4640    let mut cur = Cursor::new(bytes);
4641    let bitmap_bytes = schema.columns.len().div_ceil(8);
4642    let mut bitmap_buf = [0u8; 32];
4643    if bitmap_bytes > bitmap_buf.len() {
4644        return Err(StorageError::Corrupt(format!(
4645            "row NULL bitmap {bitmap_bytes} B exceeds 32 B cap"
4646        )));
4647    }
4648    let slice = cur.take(bitmap_bytes)?;
4649    bitmap_buf[..bitmap_bytes].copy_from_slice(slice);
4650    let mut values = Vec::with_capacity(schema.columns.len());
4651    for (col_idx, col) in schema.columns.iter().enumerate() {
4652        if (bitmap_buf[col_idx / 8] >> (col_idx % 8)) & 1 == 1 {
4653            values.push(Value::Null);
4654        } else {
4655            values.push(cur.read_value_body(col.ty)?);
4656        }
4657    }
4658    Ok((Row { values }, cur.pos))
4659}
4660
4661/// Schema-driven dense value encoding (`FILE_VERSION` 8). Caller already
4662/// knows the column type and has decided this cell is non-NULL, so we
4663/// skip the per-cell type tag the v7 `write_value` was writing. NULL
4664/// is encoded via the per-row bitmap before this function runs, never
4665/// reaches here. Used only inside the row-encoding hot loop; the
4666/// schema-default path still goes through the legacy `write_value` so
4667/// DEFAULT values keep their self-describing tag and remain decodable
4668/// without consulting a column type.
4669fn write_value_body(out: &mut Vec<u8>, v: &Value, ty: DataType) {
4670    match (v, ty) {
4671        (Value::SmallInt(n), DataType::SmallInt) => out.extend_from_slice(&n.to_le_bytes()),
4672        (Value::Int(n), DataType::Int) => out.extend_from_slice(&n.to_le_bytes()),
4673        (Value::BigInt(n), DataType::BigInt) => out.extend_from_slice(&n.to_le_bytes()),
4674        (Value::Float(x), DataType::Float) => out.extend_from_slice(&x.to_le_bytes()),
4675        (Value::Bool(b), DataType::Bool) => out.push(u8::from(*b)),
4676        (Value::Text(s), DataType::Text | DataType::Varchar(_) | DataType::Char(_)) => {
4677            write_str(out, s);
4678        }
4679        (
4680            Value::Vector(v),
4681            DataType::Vector {
4682                encoding: VecEncoding::F32,
4683                ..
4684            },
4685        ) => {
4686            let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
4687            out.extend_from_slice(&dim.to_le_bytes());
4688            for x in v {
4689                out.extend_from_slice(&x.to_le_bytes());
4690            }
4691        }
4692        // v6.0.1: SQ8 dense body — [u32 dim][f32 min][f32 max]
4693        // [u8 * dim]. Self-describes its length so v6 readers
4694        // walking rows of a v6 catalog stay aligned even if the
4695        // declared column dim drifts (defensive, not normally
4696        // possible since CREATE TABLE pins the dim).
4697        (
4698            Value::Sq8Vector(q),
4699            DataType::Vector {
4700                encoding: VecEncoding::Sq8,
4701                ..
4702            },
4703        ) => {
4704            let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
4705            out.extend_from_slice(&dim.to_le_bytes());
4706            out.extend_from_slice(&q.min.to_le_bytes());
4707            out.extend_from_slice(&q.max.to_le_bytes());
4708            out.extend_from_slice(&q.bytes);
4709        }
4710        // v6.0.3: halfvec dense body — [u32 dim][u16 LE * dim].
4711        // The raw u16 bytes already live in `h.bytes` little-
4712        // endian, so we just splat them.
4713        (
4714            Value::HalfVector(h),
4715            DataType::Vector {
4716                encoding: VecEncoding::F16,
4717                ..
4718            },
4719        ) => {
4720            let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
4721            out.extend_from_slice(&dim.to_le_bytes());
4722            out.extend_from_slice(&h.bytes);
4723        }
4724        (Value::Numeric { scaled, .. }, DataType::Numeric { scale, .. }) => {
4725            out.extend_from_slice(&scaled.to_le_bytes());
4726            out.push(scale);
4727        }
4728        (Value::Date(d), DataType::Date) => out.extend_from_slice(&d.to_le_bytes()),
4729        (Value::Timestamp(t), DataType::Timestamp | DataType::Timestamptz) => {
4730            out.extend_from_slice(&t.to_le_bytes())
4731        }
4732        // v4.9: JSON stores as length-prefixed text; same shape as
4733        // Text — the type tag lives in the column schema, not the
4734        // per-cell body.
4735        (Value::Json(s), DataType::Json | DataType::Jsonb) => write_str(out, s),
4736        // Type mismatch shouldn't happen — `Table::insert` validates
4737        // value type against column type before pushing. Treat as a
4738        // bug, not a runtime error.
4739        (other, ty) => unreachable!(
4740            "schema-driven encode received mismatched value/type pair: \
4741             value tag={:?}, column type={:?}",
4742            other.data_type(),
4743            ty
4744        ),
4745    }
4746}
4747
4748fn write_value(out: &mut Vec<u8>, v: &Value) {
4749    match v {
4750        Value::Null => out.push(0),
4751        Value::SmallInt(n) => {
4752            out.push(7);
4753            out.extend_from_slice(&n.to_le_bytes());
4754        }
4755        Value::Int(n) => {
4756            out.push(1);
4757            out.extend_from_slice(&n.to_le_bytes());
4758        }
4759        Value::BigInt(n) => {
4760            out.push(2);
4761            out.extend_from_slice(&n.to_le_bytes());
4762        }
4763        Value::Float(x) => {
4764            out.push(3);
4765            out.extend_from_slice(&x.to_le_bytes());
4766        }
4767        // v4.9: JSON shares the tag-4 (Text) on-disk encoding —
4768        // schema decides which variant comes back on read. The
4769        // bodies are byte-identical so collapsing the match keeps
4770        // clippy::match_same_arms quiet.
4771        Value::Text(s) | Value::Json(s) => {
4772            out.push(4);
4773            write_str(out, s);
4774        }
4775        Value::Bool(b) => {
4776            out.push(5);
4777            out.push(u8::from(*b));
4778        }
4779        Value::Vector(v) => {
4780            out.push(6);
4781            let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
4782            out.extend_from_slice(&dim.to_le_bytes());
4783            for x in v {
4784                out.extend_from_slice(&x.to_le_bytes());
4785            }
4786        }
4787        // v6.0.1: new tag 11 for an SQ8 cell carried with its full
4788        // header. Layout matches the dense row body shape so a
4789        // round-trip through write_value → read_value bit-equals
4790        // the original `Value::Sq8Vector`.
4791        Value::Sq8Vector(q) => {
4792            out.push(11);
4793            let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
4794            out.extend_from_slice(&dim.to_le_bytes());
4795            out.extend_from_slice(&q.min.to_le_bytes());
4796            out.extend_from_slice(&q.max.to_le_bytes());
4797            out.extend_from_slice(&q.bytes);
4798        }
4799        // v6.0.3: tag 12 for a HalfVector cell.
4800        // Layout: `[u32 dim][u16 LE × dim]` — bit-identical to the
4801        // dense row body so `write_value` / `read_value` bit-equal
4802        // the original `Value::HalfVector`.
4803        Value::HalfVector(h) => {
4804            out.push(12);
4805            let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
4806            out.extend_from_slice(&dim.to_le_bytes());
4807            out.extend_from_slice(&h.bytes);
4808        }
4809        Value::Numeric { scaled, scale } => {
4810            out.push(8);
4811            out.extend_from_slice(&scaled.to_le_bytes());
4812            out.push(*scale);
4813        }
4814        Value::Date(d) => {
4815            out.push(9);
4816            out.extend_from_slice(&d.to_le_bytes());
4817        }
4818        Value::Timestamp(t) => {
4819            out.push(10);
4820            out.extend_from_slice(&t.to_le_bytes());
4821        }
4822        // Interval is a runtime-only value (no on-disk representation in
4823        // v2.11). CREATE TABLE rejects `DataType::Interval` columns, so a
4824        // Value::Interval here would mean the engine bypassed that gate.
4825        Value::Interval { .. } => {
4826            unreachable!(
4827                "Value::Interval has no on-disk encoding; engine must reject it before write"
4828            )
4829        }
4830    }
4831}
4832
/// Append `n` to `out` as two little-endian bytes.
fn write_u16(out: &mut Vec<u8>, n: u16) {
    let le = n.to_le_bytes();
    out.extend_from_slice(&le);
}
/// Append `n` to `out` as four little-endian bytes.
fn write_u32(out: &mut Vec<u8>, n: u32) {
    let le = n.to_le_bytes();
    out.extend_from_slice(&le);
}
4839fn write_str(out: &mut Vec<u8>, s: &str) {
4840    let len = u16::try_from(s.len()).expect("identifier / text fits in u16");
4841    write_u16(out, len);
4842    out.extend_from_slice(s.as_bytes());
4843}
4844
4845/// Serialise an [`IndexKey`] using the v9 tagged codec. `read_index_key`
4846/// is the inverse. v8 catalogs never wrote index keys (`BTree` entries were
4847/// rebuilt from `Table::rows`), so this codec is v9+ only.
4848fn write_index_key(out: &mut Vec<u8>, key: &IndexKey) {
4849    match key {
4850        IndexKey::Int(n) => {
4851            out.push(INDEX_KEY_TAG_INT);
4852            out.extend_from_slice(&n.to_le_bytes());
4853        }
4854        IndexKey::Text(s) => {
4855            out.push(INDEX_KEY_TAG_TEXT);
4856            write_str(out, s);
4857        }
4858        IndexKey::Bool(b) => {
4859            out.push(INDEX_KEY_TAG_BOOL);
4860            out.push(u8::from(*b));
4861        }
4862    }
4863}
4864
/// Read position over a borrowed byte buffer; the decoding helpers in
/// this file consume bytes from `buf` starting at `pos`.
struct Cursor<'a> {
    /// The complete input being decoded.
    buf: &'a [u8],
    /// Offset into `buf` of the next unread byte.
    pos: usize,
}
4869
4870impl<'a> Cursor<'a> {
4871    const fn new(buf: &'a [u8]) -> Self {
4872        Self { buf, pos: 0 }
4873    }
4874
4875    fn take(&mut self, n: usize) -> Result<&'a [u8], StorageError> {
4876        let end = self
4877            .pos
4878            .checked_add(n)
4879            .ok_or_else(|| StorageError::Corrupt(format!("length overflow taking {n} bytes")))?;
4880        if end > self.buf.len() {
4881            return Err(StorageError::Corrupt(format!(
4882                "unexpected EOF at offset {} (wanted {n} more bytes)",
4883                self.pos
4884            )));
4885        }
4886        let s = &self.buf[self.pos..end];
4887        self.pos = end;
4888        Ok(s)
4889    }
4890
4891    fn read_u8(&mut self) -> Result<u8, StorageError> {
4892        Ok(self.take(1)?[0])
4893    }
4894    fn read_u16(&mut self) -> Result<u16, StorageError> {
4895        let s = self.take(2)?;
4896        Ok(u16::from_le_bytes([s[0], s[1]]))
4897    }
4898    fn read_u32(&mut self) -> Result<u32, StorageError> {
4899        let s = self.take(4)?;
4900        Ok(u32::from_le_bytes([s[0], s[1], s[2], s[3]]))
4901    }
4902    fn read_i32(&mut self) -> Result<i32, StorageError> {
4903        let s = self.take(4)?;
4904        Ok(i32::from_le_bytes([s[0], s[1], s[2], s[3]]))
4905    }
4906    /// v6.7.2 — u64 LE read for the per-table `hot_tier_bytes`
4907    /// catalog appendix.
4908    fn read_u64(&mut self) -> Result<u64, StorageError> {
4909        let s = self.take(8)?;
4910        Ok(u64::from_le_bytes([
4911            s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
4912        ]))
4913    }
4914    fn read_i64(&mut self) -> Result<i64, StorageError> {
4915        let s = self.take(8)?;
4916        let arr: [u8; 8] = s.try_into().expect("checked");
4917        Ok(i64::from_le_bytes(arr))
4918    }
4919    fn read_f64(&mut self) -> Result<f64, StorageError> {
4920        let s = self.take(8)?;
4921        let arr: [u8; 8] = s.try_into().expect("checked");
4922        Ok(f64::from_le_bytes(arr))
4923    }
4924    fn read_f32(&mut self) -> Result<f32, StorageError> {
4925        let s = self.take(4)?;
4926        Ok(f32::from_le_bytes([s[0], s[1], s[2], s[3]]))
4927    }
4928    fn read_str(&mut self) -> Result<String, StorageError> {
4929        let len = self.read_u16()? as usize;
4930        let bytes = self.take(len)?;
4931        core::str::from_utf8(bytes)
4932            .map(String::from)
4933            .map_err(|_| StorageError::Corrupt("invalid UTF-8 in identifier or text".into()))
4934    }
4935
4936    /// Parse an [`IndexKey`] emitted by `write_index_key` (v9 tagged
4937    /// codec). Returns `StorageError::Corrupt` on unknown tag or
4938    /// truncated payload.
4939    fn read_index_key(&mut self) -> Result<IndexKey, StorageError> {
4940        let tag = self.read_u8()?;
4941        match tag {
4942            INDEX_KEY_TAG_INT => Ok(IndexKey::Int(self.read_i64()?)),
4943            INDEX_KEY_TAG_TEXT => Ok(IndexKey::Text(self.read_str()?)),
4944            INDEX_KEY_TAG_BOOL => Ok(IndexKey::Bool(self.read_u8()? != 0)),
4945            other => Err(StorageError::Corrupt(format!(
4946                "unknown index key tag: {other}"
4947            ))),
4948        }
4949    }
4950    /// Schema-driven dense value decode (`FILE_VERSION` 8). Caller has
4951    /// already cleared the NULL bit from the row bitmap; we read the
4952    /// fixed-width body for the given column type. Used inside the row
4953    /// hot loop; column defaults still go through `read_value` (which
4954    /// reads its own type tag) so DEFAULT round-trips without a schema.
4955    fn read_value_body(&mut self, ty: DataType) -> Result<Value, StorageError> {
4956        match ty {
4957            DataType::SmallInt => {
4958                let s = self.take(2)?;
4959                Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
4960            }
4961            DataType::Int => Ok(Value::Int(self.read_i32()?)),
4962            DataType::BigInt => Ok(Value::BigInt(self.read_i64()?)),
4963            DataType::Float => Ok(Value::Float(self.read_f64()?)),
4964            DataType::Bool => Ok(Value::Bool(self.read_u8()? != 0)),
4965            DataType::Text | DataType::Varchar(_) | DataType::Char(_) => {
4966                Ok(Value::Text(self.read_str()?))
4967            }
4968            DataType::Vector {
4969                encoding: VecEncoding::F32,
4970                ..
4971            } => {
4972                let dim = self.read_u32()? as usize;
4973                let mut v = Vec::with_capacity(dim);
4974                for _ in 0..dim {
4975                    let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
4976                    v.push(f32::from_le_bytes(bytes));
4977                }
4978                Ok(Value::Vector(v))
4979            }
4980            DataType::Vector {
4981                encoding: VecEncoding::Sq8,
4982                ..
4983            } => {
4984                let dim = self.read_u32()? as usize;
4985                let min = self.read_f32()?;
4986                let max = self.read_f32()?;
4987                let bytes = self.take(dim)?.to_vec();
4988                Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
4989            }
4990            DataType::Vector {
4991                encoding: VecEncoding::F16,
4992                ..
4993            } => {
4994                let dim = self.read_u32()? as usize;
4995                let bytes = self.take(dim * 2)?.to_vec();
4996                Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
4997            }
4998            DataType::Numeric { .. } => {
4999                let s = self.take(16)?;
5000                let arr: [u8; 16] = s.try_into().expect("checked");
5001                let scaled = i128::from_le_bytes(arr);
5002                let scale = self.read_u8()?;
5003                Ok(Value::Numeric { scaled, scale })
5004            }
5005            DataType::Date => Ok(Value::Date(self.read_i32()?)),
5006            DataType::Timestamp => Ok(Value::Timestamp(self.read_i64()?)),
5007            DataType::Timestamptz => Ok(Value::Timestamp(self.read_i64()?)),
5008            DataType::Jsonb => Ok(Value::Json(self.read_str()?)),
5009            DataType::Interval => {
5010                // Defensive — schema gate (CREATE TABLE rejects Interval
5011                // columns) means this branch can't be hit through normal
5012                // flow; reject corrupt files explicitly rather than
5013                // panic.
5014                Err(StorageError::Corrupt(
5015                    "INTERVAL column found on disk — runtime-only type, v3.0.2 rejects it".into(),
5016                ))
5017            }
5018            DataType::Json => Ok(Value::Json(self.read_str()?)),
5019        }
5020    }
5021
5022    fn read_value(&mut self) -> Result<Value, StorageError> {
5023        let tag = self.read_u8()?;
5024        match tag {
5025            0 => Ok(Value::Null),
5026            1 => Ok(Value::Int(self.read_i32()?)),
5027            2 => Ok(Value::BigInt(self.read_i64()?)),
5028            3 => Ok(Value::Float(self.read_f64()?)),
5029            4 => Ok(Value::Text(self.read_str()?)),
5030            5 => Ok(Value::Bool(self.read_u8()? != 0)),
5031            6 => {
5032                let dim = self.read_u32()? as usize;
5033                let mut v = Vec::with_capacity(dim);
5034                for _ in 0..dim {
5035                    let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
5036                    v.push(f32::from_le_bytes(bytes));
5037                }
5038                Ok(Value::Vector(v))
5039            }
5040            7 => {
5041                let s = self.take(2)?;
5042                Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
5043            }
5044            8 => {
5045                let s = self.take(16)?;
5046                let arr: [u8; 16] = s.try_into().expect("checked");
5047                let scaled = i128::from_le_bytes(arr);
5048                let scale = self.read_u8()?;
5049                Ok(Value::Numeric { scaled, scale })
5050            }
5051            9 => Ok(Value::Date(self.read_i32()?)),
5052            10 => Ok(Value::Timestamp(self.read_i64()?)),
5053            // v6.0.1: tag 11 — Sq8Vector. Pre-v6 readers fall
5054            // through to the catch-all and surface
5055            // `Corrupt("unknown value tag")`, matching the
5056            // forward-compat fence on the column-type side.
5057            11 => {
5058                let dim = self.read_u32()? as usize;
5059                let min = self.read_f32()?;
5060                let max = self.read_f32()?;
5061                let bytes = self.take(dim)?.to_vec();
5062                Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
5063            }
5064            // v6.0.3: tag 12 — HalfVector. Same forward-compat
5065            // fence story as tag 11.
5066            12 => {
5067                let dim = self.read_u32()? as usize;
5068                let bytes = self.take(dim * 2)?.to_vec();
5069                Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
5070            }
5071            other => Err(StorageError::Corrupt(format!("unknown value tag: {other}"))),
5072        }
5073    }
5074
5075    /// Read an NSW graph that was emitted via `write_nsw_graph`. `m`
5076    /// is passed in because it was already consumed from the per-
5077    /// index header. Returns the reconstituted `NswGraph`.
5078    fn read_nsw_graph(&mut self, m: usize) -> Result<NswGraph, StorageError> {
5079        let m_max_0 = self.read_u16()? as usize;
5080        let entry_raw = self.read_u32()?;
5081        let entry = if entry_raw == u32::MAX {
5082            None
5083        } else {
5084            Some(entry_raw as usize)
5085        };
5086        let entry_level = self.read_u8()?;
5087        let node_count = self.read_u32()? as usize;
5088        // v5.5.0: levels/per-layer are PV-backed in memory, but the wire
5089        // format is unchanged — decode element-by-element into a PV via
5090        // push_mut (transient in-place, no per-element path-copy here since
5091        // the freshly-built PV is uniquely owned).
5092        let mut levels: PersistentVec<u8> = PersistentVec::new();
5093        for _ in 0..node_count {
5094            levels.push_mut(self.read_u8()?);
5095        }
5096        let layer_count = self.read_u8()? as usize;
5097        let mut layers: Vec<PersistentVec<Vec<u32>>> = Vec::with_capacity(layer_count);
5098        for _ in 0..layer_count {
5099            let n = self.read_u32()? as usize;
5100            let mut per_layer: PersistentVec<Vec<u32>> = PersistentVec::new();
5101            for _ in 0..n {
5102                let cnt = self.read_u16()? as usize;
5103                let mut row: Vec<u32> = Vec::with_capacity(cnt);
5104                for _ in 0..cnt {
5105                    row.push(self.read_u32()?);
5106                }
5107                per_layer.push_mut(row);
5108            }
5109            layers.push(per_layer);
5110        }
5111        Ok(NswGraph {
5112            m,
5113            m_max_0,
5114            entry,
5115            entry_level,
5116            levels,
5117            layers,
5118        })
5119    }
5120}
5121
5122#[cfg(test)]
5123mod tests {
5124    use super::*;
5125    use alloc::string::ToString;
5126    use alloc::vec;
5127
5128    #[cfg(target_arch = "aarch64")]
5129    #[test]
5130    fn neon_l2_matches_scalar() {
5131        // For every dim that's a multiple of 4 (4, 8, 12, 16, 64,
5132        // 128, 256, 384, 512, 768, 1024, 1536), the NEON impl must
5133        // agree with the scalar reference within tight float
5134        // tolerance (FMA rounding differs from separate * + +).
5135        let dims = [4usize, 8, 12, 16, 64, 128, 256, 384, 512, 768, 1024, 1536];
5136        for &d in &dims {
5137            let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
5138            let mut a = Vec::with_capacity(d);
5139            let mut b = Vec::with_capacity(d);
5140            for _ in 0..d {
5141                state = state
5142                    .wrapping_mul(6_364_136_223_846_793_005)
5143                    .wrapping_add(1);
5144                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5145                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5146                state = state
5147                    .wrapping_mul(6_364_136_223_846_793_005)
5148                    .wrapping_add(1);
5149                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5150                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5151                a.push(x);
5152                b.push(y);
5153            }
5154            let scalar = l2_distance_sq_scalar(&a, &b);
5155            let neon = unsafe { l2_distance_sq_neon(&a, &b) };
5156            let tol = (scalar.abs().max(1e-6)) * 1e-4;
5157            assert!(
5158                (scalar - neon).abs() <= tol,
5159                "dim={d}: scalar={scalar} neon={neon} diff={}",
5160                (scalar - neon).abs()
5161            );
5162        }
5163    }
5164
5165    #[cfg(target_arch = "aarch64")]
5166    #[test]
5167    fn neon_inner_product_matches_scalar() {
5168        // v6.0.2 step 1: NEON IP must agree with scalar across every
5169        // production-shaped dim. FMA rounding differs from
5170        // separate * + +, so the tolerance scales with magnitude.
5171        let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
5172        for &d in &dims {
5173            let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
5174            let mut a = Vec::with_capacity(d);
5175            let mut b = Vec::with_capacity(d);
5176            for _ in 0..d {
5177                state = state
5178                    .wrapping_mul(6_364_136_223_846_793_005)
5179                    .wrapping_add(1);
5180                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5181                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5182                state = state
5183                    .wrapping_mul(6_364_136_223_846_793_005)
5184                    .wrapping_add(1);
5185                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5186                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5187                a.push(x);
5188                b.push(y);
5189            }
5190            let scalar = inner_product_scalar(&a, &b);
5191            let neon = unsafe { inner_product_neon(&a, &b) };
5192            #[allow(clippy::cast_precision_loss)]
5193            let tol = (scalar.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
5194            assert!(
5195                (scalar - neon).abs() <= tol,
5196                "IP dim={d}: scalar={scalar} neon={neon} diff={}",
5197                (scalar - neon).abs()
5198            );
5199        }
5200    }
5201
5202    #[cfg(target_arch = "aarch64")]
5203    #[allow(clippy::similar_names)]
5204    #[test]
5205    fn neon_cosine_dot_norms_matches_scalar() {
5206        let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
5207        for &d in &dims {
5208            let mut state: u64 = (d as u64).wrapping_mul(0xBF58_476D_1CE4_E5B9);
5209            let mut a = Vec::with_capacity(d);
5210            let mut b = Vec::with_capacity(d);
5211            for _ in 0..d {
5212                state = state
5213                    .wrapping_mul(6_364_136_223_846_793_005)
5214                    .wrapping_add(1);
5215                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5216                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5217                state = state
5218                    .wrapping_mul(6_364_136_223_846_793_005)
5219                    .wrapping_add(1);
5220                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5221                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
5222                a.push(x);
5223                b.push(y);
5224            }
5225            let (dot_s, na_s, nb_s) = cosine_dot_norms_scalar(&a, &b);
5226            let (dot_n, na_n, nb_n) = unsafe { cosine_dot_norms_neon(&a, &b) };
5227            #[allow(clippy::cast_precision_loss)]
5228            let tol_d = (dot_s.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
5229            #[allow(clippy::cast_precision_loss)]
5230            let tol_n = (na_s.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
5231            assert!(
5232                (dot_s - dot_n).abs() <= tol_d,
5233                "cosine dot dim={d}: scalar={dot_s} neon={dot_n}"
5234            );
5235            assert!(
5236                (na_s - na_n).abs() <= tol_n,
5237                "cosine na dim={d}: scalar={na_s} neon={na_n}"
5238            );
5239            assert!(
5240                (nb_s - nb_n).abs() <= tol_n,
5241                "cosine nb dim={d}: scalar={nb_s} neon={nb_n}"
5242            );
5243        }
5244    }
5245
5246    fn make_users_schema() -> TableSchema {
5247        TableSchema::new(
5248            "users",
5249            vec![
5250                ColumnSchema::new("id", DataType::Int, false),
5251                ColumnSchema::new("name", DataType::Text, false),
5252                ColumnSchema::new("score", DataType::Float, true),
5253            ],
5254        )
5255    }
5256
5257    #[test]
5258    fn value_type_tag_matches_variant() {
5259        assert_eq!(Value::Int(1).data_type(), Some(DataType::Int));
5260        assert_eq!(Value::BigInt(1).data_type(), Some(DataType::BigInt));
5261        assert_eq!(Value::Float(1.0).data_type(), Some(DataType::Float));
5262        assert_eq!(Value::Text("x".into()).data_type(), Some(DataType::Text));
5263        assert_eq!(Value::Bool(true).data_type(), Some(DataType::Bool));
5264        assert_eq!(Value::Null.data_type(), None);
5265        assert!(Value::Null.is_null());
5266        assert!(!Value::Int(0).is_null());
5267    }
5268
5269    #[test]
5270    fn sq8_value_reports_sq8_data_type() {
5271        // v6.0.1: a `Value::Sq8Vector` cell surfaces its dim
5272        // (= bytes.len()) and encoding through `data_type()` so
5273        // INSERT-time column type-checks (step 3) can route on
5274        // both shape and encoding.
5275        let q = crate::quantize::quantize(&[0.0, 0.25, 0.5, 0.75, 1.0]);
5276        let v = Value::Sq8Vector(q);
5277        assert_eq!(
5278            v.data_type(),
5279            Some(DataType::Vector {
5280                dim: 5,
5281                encoding: VecEncoding::Sq8,
5282            }),
5283        );
5284    }
5285
5286    #[test]
5287    fn datatype_display_matches_pg_keyword() {
5288        assert_eq!(DataType::Int.to_string(), "INT");
5289        assert_eq!(DataType::BigInt.to_string(), "BIGINT");
5290        assert_eq!(DataType::Float.to_string(), "FLOAT");
5291        assert_eq!(DataType::Text.to_string(), "TEXT");
5292        assert_eq!(DataType::Bool.to_string(), "BOOL");
5293    }
5294
5295    #[test]
5296    fn row_len_and_emptiness() {
5297        let r = Row::new(vec![Value::Int(1), Value::Null]);
5298        assert_eq!(r.len(), 2);
5299        assert!(!r.is_empty());
5300        assert!(Row::new(Vec::new()).is_empty());
5301    }
5302
5303    #[test]
5304    fn table_schema_column_position() {
5305        let s = make_users_schema();
5306        assert_eq!(s.column_position("id"), Some(0));
5307        assert_eq!(s.column_position("score"), Some(2));
5308        assert_eq!(s.column_position("missing"), None);
5309    }
5310
5311    #[test]
5312    fn catalog_create_table_then_lookup() {
5313        let mut cat = Catalog::new();
5314        cat.create_table(make_users_schema()).unwrap();
5315        assert_eq!(cat.table_count(), 1);
5316        assert!(cat.get("users").is_some());
5317        assert!(cat.get("nope").is_none());
5318    }
5319
5320    #[test]
5321    fn catalog_duplicate_table_is_rejected() {
5322        let mut cat = Catalog::new();
5323        cat.create_table(make_users_schema()).unwrap();
5324        let err = cat.create_table(make_users_schema()).unwrap_err();
5325        assert!(matches!(err, StorageError::DuplicateTable { ref name } if name == "users"));
5326    }
5327
5328    #[test]
5329    fn table_insert_happy_path_appends_row() {
5330        let mut cat = Catalog::new();
5331        cat.create_table(make_users_schema()).unwrap();
5332        let t = cat.get_mut("users").unwrap();
5333        t.insert(Row::new(vec![
5334            Value::Int(1),
5335            Value::Text("alice".into()),
5336            Value::Float(99.5),
5337        ]))
5338        .unwrap();
5339        assert_eq!(t.row_count(), 1);
5340        assert_eq!(t.rows()[0].values[1], Value::Text("alice".into()));
5341    }
5342
5343    #[test]
5344    fn table_insert_arity_mismatch() {
5345        let mut cat = Catalog::new();
5346        cat.create_table(make_users_schema()).unwrap();
5347        let t = cat.get_mut("users").unwrap();
5348        let err = t.insert(Row::new(vec![Value::Int(1)])).unwrap_err();
5349        assert!(matches!(
5350            err,
5351            StorageError::ArityMismatch {
5352                expected: 3,
5353                actual: 1
5354            }
5355        ));
5356        assert_eq!(t.row_count(), 0);
5357    }
5358
5359    #[test]
5360    fn table_insert_type_mismatch_reports_column() {
5361        let mut cat = Catalog::new();
5362        cat.create_table(make_users_schema()).unwrap();
5363        let t = cat.get_mut("users").unwrap();
5364        let err = t
5365            .insert(Row::new(vec![
5366                Value::Int(1),
5367                Value::Int(42), // name expects Text
5368                Value::Float(0.0),
5369            ]))
5370            .unwrap_err();
5371        match err {
5372            StorageError::TypeMismatch {
5373                ref column,
5374                expected,
5375                actual,
5376                position,
5377            } => {
5378                assert_eq!(column, "name");
5379                assert_eq!(expected, DataType::Text);
5380                assert_eq!(actual, DataType::Int);
5381                assert_eq!(position, 1);
5382            }
5383            other => panic!("unexpected: {other:?}"),
5384        }
5385        assert_eq!(t.row_count(), 0);
5386    }
5387
5388    #[test]
5389    fn table_insert_null_into_not_null_rejected() {
5390        let mut cat = Catalog::new();
5391        cat.create_table(make_users_schema()).unwrap();
5392        let t = cat.get_mut("users").unwrap();
5393        let err = t
5394            .insert(Row::new(vec![
5395                Value::Int(1),
5396                Value::Null, // name is NOT NULL
5397                Value::Float(1.0),
5398            ]))
5399            .unwrap_err();
5400        assert!(matches!(err, StorageError::NullInNotNull { ref column } if column == "name"));
5401    }
5402
5403    #[test]
5404    fn table_insert_null_into_nullable_ok() {
5405        let mut cat = Catalog::new();
5406        cat.create_table(make_users_schema()).unwrap();
5407        let t = cat.get_mut("users").unwrap();
5408        t.insert(Row::new(vec![
5409            Value::Int(1),
5410            Value::Text("bob".into()),
5411            Value::Null,
5412        ]))
5413        .unwrap();
5414        assert_eq!(t.row_count(), 1);
5415    }
5416
5417    #[test]
5418    fn catalog_get_mut_independent_per_table() {
5419        let mut cat = Catalog::new();
5420        cat.create_table(TableSchema::new(
5421            "a",
5422            vec![ColumnSchema::new("v", DataType::Int, false)],
5423        ))
5424        .unwrap();
5425        cat.create_table(TableSchema::new(
5426            "b",
5427            vec![ColumnSchema::new("v", DataType::Int, false)],
5428        ))
5429        .unwrap();
5430        cat.get_mut("a")
5431            .unwrap()
5432            .insert(Row::new(vec![Value::Int(1)]))
5433            .unwrap();
5434        assert_eq!(cat.get("a").unwrap().row_count(), 1);
5435        assert_eq!(cat.get("b").unwrap().row_count(), 0);
5436    }
5437
5438    // --- v0.6 persistence round-trips --------------------------------------
5439
5440    fn assert_round_trip(cat: &Catalog) {
5441        let bytes = cat.serialize();
5442        let restored = Catalog::deserialize(&bytes).expect("deserialize");
5443        // Compare semantic state: same tables in same order, same schema +
5444        // rows in each.
5445        assert_eq!(restored.table_count(), cat.table_count());
5446        for (a, b) in cat.tables.iter().zip(restored.tables.iter()) {
5447            assert_eq!(a.schema, b.schema);
5448            assert_eq!(a.rows, b.rows);
5449        }
5450    }
5451
5452    #[test]
5453    fn serialize_empty_catalog_round_trips() {
5454        assert_round_trip(&Catalog::new());
5455    }
5456
5457    #[test]
5458    fn serialize_single_empty_table_round_trips() {
5459        let mut cat = Catalog::new();
5460        cat.create_table(make_users_schema()).unwrap();
5461        assert_round_trip(&cat);
5462    }
5463
5464    #[test]
5465    fn nsw_clone_is_o1() {
5466        // v5.5.0: NswGraph::clone must be O(1) structural sharing, not the
5467        // pre-v5.5 O(N) element copy — it rides on Catalog::clone for every
5468        // group-commit write on a vector table. Build a non-trivial multi-
5469        // layer graph, clone it, and prove the clone shares the very same PV
5470        // storage (root+tail Arc) for `levels` and every `layers[l]`. Sharing
5471        // ⇒ no per-node element copy ⇒ clone cost independent of N (node
5472        // count); only the outer layer Vec (len ≤ 8) is copied, O(1) in
5473        // practice.
5474        let mut cat = Catalog::new();
5475        cat.create_table(TableSchema::new(
5476            "docs",
5477            alloc::vec![
5478                ColumnSchema::new("id", DataType::Int, false),
5479                ColumnSchema::new(
5480                    "v",
5481                    DataType::Vector {
5482                        dim: 3,
5483                        encoding: VecEncoding::F32
5484                    },
5485                    true
5486                ),
5487            ],
5488        ))
5489        .unwrap();
5490        let t = cat.get_mut("docs").unwrap();
5491        for i in 0..1500_i32 {
5492            #[allow(clippy::cast_precision_loss)] // 0..1500 — no precision lost
5493            let base = (i as f32) * 0.01;
5494            t.insert(Row::new(alloc::vec![
5495                Value::Int(i),
5496                Value::Vector(alloc::vec![base, base + 0.05, base + 0.1]),
5497            ]))
5498            .unwrap();
5499        }
5500        t.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
5501            .unwrap();
5502        let g = match &cat.get("docs").unwrap().indices()[0].kind {
5503            IndexKind::Nsw(g) => g,
5504            IndexKind::BTree(_) | IndexKind::Brin { .. } => panic!("expected NSW"),
5505        };
5506        // Non-trivial graph: one level slot per row, and the geometric level
5507        // distribution puts some nodes above layer 0.
5508        assert_eq!(g.levels.len(), 1500, "one level slot per inserted row");
5509        assert!(
5510            g.layers.len() >= 2,
5511            "1500 nodes should populate at least two HNSW layers, got {}",
5512            g.layers.len()
5513        );
5514
5515        let cloned = g.clone();
5516
5517        assert!(
5518            g.levels.shares_storage_with(&cloned.levels),
5519            "levels PV not shared after clone — clone copied elements (O(N))"
5520        );
5521        assert_eq!(g.layers.len(), cloned.layers.len());
5522        for (l, (orig, cl)) in g.layers.iter().zip(cloned.layers.iter()).enumerate() {
5523            assert!(
5524                orig.shares_storage_with(cl),
5525                "layer {l} PV not shared after clone — clone copied elements (O(N))"
5526            );
5527        }
5528    }
5529
5530    #[test]
5531    fn sq8_catalog_serialise_roundtrip_preserves_cells_and_index() {
5532        // v6.0.1 step 6 verify: a catalog with an `VECTOR(N)
5533        // USING SQ8` column + NSW index survives a full
5534        // serialise → deserialise cycle. Cells re-decode bit-
5535        // identically (per-vector affine triple), the NSW
5536        // topology stays intact, and kNN search still routes
5537        // through the SQ8 ADC dispatcher after the catalog hop.
5538        let mut cat = Catalog::new();
5539        cat.create_table(TableSchema::new(
5540            "vecs",
5541            alloc::vec![
5542                ColumnSchema::new("id", DataType::Int, false),
5543                ColumnSchema::new(
5544                    "v",
5545                    DataType::Vector {
5546                        dim: 8,
5547                        encoding: VecEncoding::Sq8,
5548                    },
5549                    false,
5550                ),
5551            ],
5552        ))
5553        .unwrap();
5554        let t = cat.get_mut("vecs").unwrap();
5555        for i in 0..32_i32 {
5556            #[allow(clippy::cast_precision_loss)]
5557            let base = (i as f32) * 0.03;
5558            let v: Vec<f32> = (0..8_i32)
5559                .map(|j| {
5560                    #[allow(clippy::cast_precision_loss)]
5561                    let off = (j as f32) * 0.01;
5562                    base + off
5563                })
5564                .collect();
5565            t.insert(Row::new(alloc::vec![
5566                Value::Int(i),
5567                Value::Sq8Vector(quantize::quantize(&v)),
5568            ]))
5569            .unwrap();
5570        }
5571        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5572        // Capture a pre-serialise reference cell + nsw hits to
5573        // compare against the restored catalog.
5574        let query = alloc::vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
5575        let (before_cell, before_ty, before_hits) = {
5576            let t_ref = cat.get("vecs").unwrap();
5577            (
5578                t_ref.rows()[5].values[1].clone(),
5579                t_ref.schema().columns[1].ty,
5580                nsw_query(t_ref, "v_idx", &query, 5, NswMetric::L2),
5581            )
5582        };
5583
5584        let bytes = cat.serialize();
5585        let restored = Catalog::deserialize(&bytes).expect("deserialize ok");
5586        let rt = restored.get("vecs").unwrap();
5587        assert_eq!(rt.schema().columns[1].ty, before_ty);
5588        assert_eq!(rt.rows()[5].values[1], before_cell);
5589        let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
5590        assert_eq!(before_hits, after_hits);
5591    }
5592
5593    #[test]
5594    fn half_catalog_serialise_roundtrip_preserves_cells_and_index() {
5595        // v6.0.3 step 4 verify: a catalog with a `VECTOR(N) USING
5596        // HALF` column + NSW index survives a full serialise →
5597        // deserialise cycle. Cells re-decode bit-identically (raw
5598        // u16 LE bytes), the NSW topology stays intact, and kNN
5599        // search still returns the same hit IDs against the
5600        // restored catalog.
5601        use crate::halfvec;
5602        let mut cat = Catalog::new();
5603        cat.create_table(TableSchema::new(
5604            "vecs",
5605            alloc::vec![
5606                ColumnSchema::new("id", DataType::Int, false),
5607                ColumnSchema::new(
5608                    "v",
5609                    DataType::Vector {
5610                        dim: 8,
5611                        encoding: VecEncoding::F16,
5612                    },
5613                    false,
5614                ),
5615            ],
5616        ))
5617        .unwrap();
5618        let t = cat.get_mut("vecs").unwrap();
5619        for i in 0..32_i32 {
5620            #[allow(clippy::cast_precision_loss)]
5621            let base = (i as f32) * 0.03;
5622            let v: Vec<f32> = (0..8_i32)
5623                .map(|j| {
5624                    #[allow(clippy::cast_precision_loss)]
5625                    let off = (j as f32) * 0.01;
5626                    base + off
5627                })
5628                .collect();
5629            t.insert(Row::new(alloc::vec![
5630                Value::Int(i),
5631                Value::HalfVector(halfvec::HalfVector::from_f32_slice(&v)),
5632            ]))
5633            .unwrap();
5634        }
5635        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5636        let query = alloc::vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
5637        let (before_cell, before_ty, before_hits) = {
5638            let t_ref = cat.get("vecs").unwrap();
5639            (
5640                t_ref.rows()[5].values[1].clone(),
5641                t_ref.schema().columns[1].ty,
5642                nsw_query(t_ref, "v_idx", &query, 5, NswMetric::L2),
5643            )
5644        };
5645        let bytes = cat.serialize();
5646        let restored = Catalog::deserialize(&bytes).expect("deserialize ok");
5647        let rt = restored.get("vecs").unwrap();
5648        assert_eq!(rt.schema().columns[1].ty, before_ty);
5649        assert_eq!(rt.rows()[5].values[1], before_cell);
5650        let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
5651        assert_eq!(before_hits, after_hits);
5652    }
5653
5654    #[test]
5655    #[allow(clippy::similar_names)]
5656    fn hnsw_half_recall_at_10_matches_f32_groundtruth() {
5657        // v6.0.3 step 3 verify: HALF column NSW retrieves ≥ 95%
5658        // top-10 overlap vs brute-force F32 ground truth.
5659        // Half-precision dequantises bit-exactly at the storage
5660        // layer (no rerank pass), so the recall floor is tighter
5661        // than the SQ8 case — only the rounding noise from f32 →
5662        // f16 quantisation contributes.
5663        use crate::halfvec;
5664        fn next(state: &mut u64) -> f32 {
5665            *state = state
5666                .wrapping_add(0x9E37_79B9_7F4A_7C15)
5667                .wrapping_mul(0xBF58_476D_1CE4_E5B9);
5668            #[allow(clippy::cast_precision_loss)]
5669            let u = ((*state >> 32) as u32 as f32) / (u32::MAX as f32);
5670            2.0 * u - 1.0
5671        }
5672        let dim: u32 = 32;
5673        let n: usize = 512;
5674        let dim_us = dim as usize;
5675        let mut seed: u64 = 0xF16_F16_F16_F16_u64;
5676        let corpus: Vec<Vec<f32>> = (0..n)
5677            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
5678            .collect();
5679        let queries: Vec<Vec<f32>> = (0..32)
5680            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
5681            .collect();
5682        let exact_top10: Vec<Vec<usize>> = queries
5683            .iter()
5684            .map(|q| {
5685                let mut scored: Vec<(f32, usize)> = corpus
5686                    .iter()
5687                    .enumerate()
5688                    .map(|(i, v)| (l2_distance_sq(v, q), i))
5689                    .collect();
5690                scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
5691                scored.into_iter().take(10).map(|(_, i)| i).collect()
5692            })
5693            .collect();
5694        let mut cat = Catalog::new();
5695        cat.create_table(TableSchema::new(
5696            "vecs",
5697            alloc::vec![
5698                ColumnSchema::new("id", DataType::Int, false),
5699                ColumnSchema::new(
5700                    "v",
5701                    DataType::Vector {
5702                        dim,
5703                        encoding: VecEncoding::F16,
5704                    },
5705                    false,
5706                ),
5707            ],
5708        ))
5709        .unwrap();
5710        let t = cat.get_mut("vecs").unwrap();
5711        for (i, v) in corpus.iter().enumerate() {
5712            t.insert(Row::new(alloc::vec![
5713                Value::Int(i32::try_from(i).unwrap()),
5714                Value::HalfVector(halfvec::HalfVector::from_f32_slice(v)),
5715            ]))
5716            .unwrap();
5717        }
5718        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5719        let table = cat.get("vecs").unwrap();
5720        let mut total_overlap = 0_usize;
5721        for (q, exact) in queries.iter().zip(exact_top10.iter()) {
5722            let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
5723            for h in &hits {
5724                if exact.contains(h) {
5725                    total_overlap += 1;
5726                }
5727            }
5728        }
5729        #[allow(clippy::cast_precision_loss)]
5730        let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
5731        assert!(
5732            recall >= 0.95,
5733            "HALF HNSW recall@10 = {recall:.3}, below floor 0.95 — \
5734             check halfvec dispatch in `cell_to_query_metric_distance`"
5735        );
5736    }
5737
5738    #[test]
5739    fn hnsw_sq8_recall_at_10_above_0_95_vs_f32_groundtruth() {
5740        // v6.0.1 step 5 verify: build TWO catalogs over the same
5741        // corpus — one F32, one SQ8 — and confirm SQ8 NSW + f32
5742        // rerank retrieves ≥ 95% top-10 overlap vs brute-force F32
5743        // ground truth. The rerank pass (sq8_rerank) re-scores ADC
5744        // candidates with dequantised cells, recovering recall the
5745        // raw ADC sacrifices for 4× compression.
5746        use crate::quantize;
5747        // Deterministic Gaussian-ish corpus via splitmix64. Vectors
5748        // get normalised so SQ8's per-vector `(min, max)` lives in
5749        // a sensible range; matches the v6.0.0 fuzz harness.
5750        fn next(state: &mut u64) -> f32 {
5751            *state = state
5752                .wrapping_add(0x9E37_79B9_7F4A_7C15)
5753                .wrapping_mul(0xBF58_476D_1CE4_E5B9);
5754            #[allow(clippy::cast_precision_loss)]
5755            let u = ((*state >> 32) as u32 as f32) / (u32::MAX as f32);
5756            2.0 * u - 1.0
5757        }
5758        let dim: u32 = 32;
5759        let n: usize = 512;
5760        let dim_us = dim as usize;
5761        let mut seed: u64 = 0xCAFE_BABE_DEAD_BEEFu64;
5762        let corpus: Vec<Vec<f32>> = (0..n)
5763            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
5764            .collect();
5765        let queries: Vec<Vec<f32>> = (0..32)
5766            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
5767            .collect();
5768        // F32 ground truth — pure exact arithmetic, brute force.
5769        let exact_top10: Vec<Vec<usize>> = queries
5770            .iter()
5771            .map(|q| {
5772                let mut scored: Vec<(f32, usize)> = corpus
5773                    .iter()
5774                    .enumerate()
5775                    .map(|(i, v)| (l2_distance_sq(v, q), i))
5776                    .collect();
5777                scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
5778                scored.into_iter().take(10).map(|(_, i)| i).collect()
5779            })
5780            .collect();
5781        // SQ8 catalog — INSERTs land as `Value::Sq8Vector` cells;
5782        // HNSW build uses the ADC path verified in step 4.
5783        let mut cat = Catalog::new();
5784        cat.create_table(TableSchema::new(
5785            "vecs",
5786            alloc::vec![
5787                ColumnSchema::new("id", DataType::Int, false),
5788                ColumnSchema::new(
5789                    "v",
5790                    DataType::Vector {
5791                        dim,
5792                        encoding: VecEncoding::Sq8,
5793                    },
5794                    false,
5795                ),
5796            ],
5797        ))
5798        .unwrap();
5799        let t = cat.get_mut("vecs").unwrap();
5800        for (i, v) in corpus.iter().enumerate() {
5801            t.insert(Row::new(alloc::vec![
5802                Value::Int(i32::try_from(i).unwrap()),
5803                Value::Sq8Vector(quantize::quantize(v)),
5804            ]))
5805            .unwrap();
5806        }
5807        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5808        let table = cat.get("vecs").unwrap();
5809        let mut total_overlap = 0_usize;
5810        for (q, exact) in queries.iter().zip(exact_top10.iter()) {
5811            let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
5812            for h in &hits {
5813                if exact.contains(h) {
5814                    total_overlap += 1;
5815                }
5816            }
5817        }
5818        #[allow(clippy::cast_precision_loss)]
5819        let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
5820        assert!(
5821            recall >= 0.95,
5822            "SQ8 HNSW recall@10 = {recall:.3}, below floor 0.95 — \
5823             check `sq8_rerank` is wired in `nsw_search` for SQ8 columns"
5824        );
5825    }
5826
5827    #[test]
5828    fn nsw_index_topology_persists_through_round_trip() {
5829        // Build an NSW index, capture its (entry, neighbors) tuple, do
5830        // a full serialize → deserialize, and verify the restored
5831        // graph is byte-for-byte identical. The point of v2.7 is that
5832        // startup skips the rebuild, so the topology has to survive
5833        // the disk hop.
5834        let mut cat = Catalog::new();
5835        cat.create_table(TableSchema::new(
5836            "docs",
5837            alloc::vec![
5838                ColumnSchema::new("id", DataType::Int, false),
5839                ColumnSchema::new(
5840                    "v",
5841                    DataType::Vector {
5842                        dim: 3,
5843                        encoding: VecEncoding::F32
5844                    },
5845                    true
5846                ),
5847            ],
5848        ))
5849        .unwrap();
5850        let t = cat.get_mut("docs").unwrap();
5851        for i in 0..6_i32 {
5852            #[allow(clippy::cast_precision_loss)] // 0..6 — no precision lost
5853            let base = (i as f32) * 0.1;
5854            let row = Row::new(alloc::vec![
5855                Value::Int(i),
5856                Value::Vector(alloc::vec![base, base + 0.05, base + 0.1]),
5857            ]);
5858            t.insert(row).unwrap();
5859        }
5860        t.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
5861            .unwrap();
5862        let original = match &cat.get("docs").unwrap().indices()[0].kind {
5863            IndexKind::Nsw(g) => g.clone(),
5864            IndexKind::BTree(_) | IndexKind::Brin { .. } => panic!("expected NSW"),
5865        };
5866        let bytes = cat.serialize();
5867        let restored = Catalog::deserialize(&bytes).expect("deserialize");
5868        let restored_graph = match &restored.get("docs").unwrap().indices()[0].kind {
5869            IndexKind::Nsw(g) => g.clone(),
5870            IndexKind::BTree(_) | IndexKind::Brin { .. } => panic!("expected NSW"),
5871        };
5872        assert_eq!(restored_graph.m, original.m);
5873        assert_eq!(restored_graph.m_max_0, original.m_max_0);
5874        assert_eq!(restored_graph.entry, original.entry);
5875        assert_eq!(restored_graph.entry_level, original.entry_level);
5876        assert_eq!(restored_graph.levels, original.levels);
5877        assert_eq!(restored_graph.layers, original.layers);
5878    }
5879
5880    #[test]
5881    fn hnsw_level_assignment_is_deterministic() {
5882        // Same row index always produces the same level — the topology
5883        // must be reproducible (matters for serialize round-trip).
5884        for i in 0..32usize {
5885            assert_eq!(nsw_assign_level(i), nsw_assign_level(i));
5886        }
5887    }
5888
5889    #[test]
5890    fn hnsw_layer_0_dominates_population() {
5891        // Sanity: out of N inserts, the vast majority should land on
5892        // layer 0. The 4-bit-clear promotion rule gives roughly 1/16
5893        // promotion to layer ≥ 1, so under 50 nodes we expect ~3 on
5894        // layer ≥ 1 and the rest on layer 0.
5895        let on_zero = (0..200usize).filter(|&i| nsw_assign_level(i) == 0).count();
5896        assert!(on_zero > 150, "level-0 nodes too few: {on_zero}");
5897    }
5898
5899    #[test]
5900    fn hnsw_search_matches_brute_force_for_l2_top1() {
5901        // Build a small dataset, query it, and confirm the top result
5902        // matches the brute-force nearest by L2. Topology variability
5903        // shouldn't break recall at k=1 for well-separated vectors.
5904        let mut cat = Catalog::new();
5905        cat.create_table(TableSchema::new(
5906            "vecs",
5907            alloc::vec![
5908                ColumnSchema::new("id", DataType::Int, false),
5909                ColumnSchema::new(
5910                    "v",
5911                    DataType::Vector {
5912                        dim: 3,
5913                        encoding: VecEncoding::F32
5914                    },
5915                    true
5916                ),
5917            ],
5918        ))
5919        .unwrap();
5920        let t = cat.get_mut("vecs").unwrap();
5921        let dataset: alloc::vec::Vec<(i32, [f32; 3])> = alloc::vec![
5922            (1, [0.0, 0.0, 0.0]),
5923            (2, [1.0, 0.0, 0.0]),
5924            (3, [0.0, 1.0, 0.0]),
5925            (4, [0.0, 0.0, 1.0]),
5926            (5, [1.0, 1.0, 0.0]),
5927            (6, [1.0, 0.0, 1.0]),
5928            (7, [0.0, 1.0, 1.0]),
5929            (8, [1.0, 1.0, 1.0]),
5930            (9, [0.5, 0.5, 0.5]),
5931            (10, [0.2, 0.8, 0.5]),
5932        ];
5933        for &(id, v) in &dataset {
5934            t.insert(Row::new(alloc::vec![
5935                Value::Int(id),
5936                Value::Vector(alloc::vec![v[0], v[1], v[2]]),
5937            ]))
5938            .unwrap();
5939        }
5940        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
5941        let idx_pos = cat
5942            .get("vecs")
5943            .unwrap()
5944            .indices()
5945            .iter()
5946            .position(|i| i.name == "v_idx")
5947            .unwrap();
5948        for query in [[0.4, 0.4, 0.4], [0.9, 0.1, 0.0], [0.0, 0.9, 0.9]] {
5949            let table = cat.get("vecs").unwrap();
5950            let hnsw_top = nsw_search(table, idx_pos, &query, 1, 16, NswMetric::L2);
5951            let mut brute: alloc::vec::Vec<(f32, usize)> = (0..table.rows.len())
5952                .map(|i| {
5953                    let Value::Vector(v) = &table.rows[i].values[1] else {
5954                        return (f32::INFINITY, i);
5955                    };
5956                    (l2_distance_sq(v, &query), i)
5957                })
5958                .collect();
5959            brute.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
5960            assert!(!hnsw_top.is_empty(), "HNSW returned no results");
5961            assert_eq!(
5962                hnsw_top[0].1, brute[0].1,
5963                "HNSW top-1 != brute-force top-1 for {query:?}"
5964            );
5965        }
5966    }
5967
5968    #[test]
5969    fn serialize_table_with_rows_round_trips() {
5970        let mut cat = Catalog::new();
5971        cat.create_table(make_users_schema()).unwrap();
5972        let t = cat.get_mut("users").unwrap();
5973        t.insert(Row::new(vec![
5974            Value::Int(1),
5975            Value::Text("alice".into()),
5976            Value::Float(95.5),
5977        ]))
5978        .unwrap();
5979        t.insert(Row::new(vec![
5980            Value::Int(2),
5981            Value::Text("bob".into()),
5982            Value::Null,
5983        ]))
5984        .unwrap();
5985        assert_round_trip(&cat);
5986    }
5987
5988    #[test]
5989    fn serialize_multiple_tables_round_trips() {
5990        let mut cat = Catalog::new();
5991        cat.create_table(make_users_schema()).unwrap();
5992        cat.create_table(TableSchema::new(
5993            "flags",
5994            vec![
5995                ColumnSchema::new("id", DataType::BigInt, false),
5996                ColumnSchema::new("active", DataType::Bool, false),
5997            ],
5998        ))
5999        .unwrap();
6000        cat.get_mut("flags")
6001            .unwrap()
6002            .insert(Row::new(vec![Value::BigInt(7), Value::Bool(true)]))
6003            .unwrap();
6004        assert_round_trip(&cat);
6005    }
6006
6007    #[test]
6008    fn deserialize_rejects_bad_magic() {
6009        let mut buf = b"BADMAGIC".to_vec();
6010        buf.push(FILE_VERSION);
6011        buf.extend_from_slice(&0u32.to_le_bytes());
6012        let err = Catalog::deserialize(&buf).unwrap_err();
6013        assert!(matches!(err, StorageError::Corrupt(_)));
6014    }
6015
6016    #[test]
6017    fn deserialize_rejects_unsupported_version() {
6018        let mut buf = FILE_MAGIC.to_vec();
6019        buf.push(99); // future version
6020        buf.extend_from_slice(&0u32.to_le_bytes());
6021        let err = Catalog::deserialize(&buf).unwrap_err();
6022        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("version")));
6023    }
6024
6025    #[test]
6026    fn deserialize_rejects_truncated_file() {
6027        let mut cat = Catalog::new();
6028        cat.create_table(make_users_schema()).unwrap();
6029        let bytes = cat.serialize();
6030        // Drop the last byte to simulate truncation.
6031        let truncated = &bytes[..bytes.len() - 1];
6032        assert!(matches!(
6033            Catalog::deserialize(truncated),
6034            Err(StorageError::Corrupt(_))
6035        ));
6036    }
6037
6038    #[test]
6039    fn deserialize_rejects_trailing_garbage() {
6040        let cat = Catalog::new();
6041        let mut bytes = cat.serialize();
6042        bytes.push(0xFF);
6043        assert!(matches!(
6044            Catalog::deserialize(&bytes),
6045            Err(StorageError::Corrupt(ref s)) if s.contains("trailing")
6046        ));
6047    }
6048
6049    // --- v0.8 indices ------------------------------------------------------
6050
6051    fn populated_users() -> Catalog {
6052        let mut cat = Catalog::new();
6053        cat.create_table(make_users_schema()).unwrap();
6054        let t = cat.get_mut("users").unwrap();
6055        for (id, name, score) in [
6056            (1, "alice", Some(90.0)),
6057            (2, "bob", None),
6058            (3, "alice", Some(70.0)), // duplicate name → maps to two row idxs
6059        ] {
6060            t.insert(Row::new(vec![
6061                Value::Int(id),
6062                Value::Text(name.into()),
6063                score.map_or(Value::Null, Value::Float),
6064            ]))
6065            .unwrap();
6066        }
6067        cat
6068    }
6069
6070    #[test]
6071    fn add_index_builds_from_existing_rows() {
6072        let mut cat = populated_users();
6073        cat.get_mut("users")
6074            .unwrap()
6075            .add_index("by_id".into(), "id")
6076            .unwrap();
6077        let t = cat.get("users").unwrap();
6078        let idx = t.index_on(0).expect("index_on(0)");
6079        assert_eq!(idx.lookup_eq(&IndexKey::Int(2)), &[RowLocator::Hot(1)]);
6080        assert_eq!(idx.lookup_eq(&IndexKey::Int(99)), &[] as &[RowLocator]);
6081    }
6082
6083    #[test]
6084    fn add_index_dup_name_rejected() {
6085        let mut cat = populated_users();
6086        let t = cat.get_mut("users").unwrap();
6087        t.add_index("ix".into(), "id").unwrap();
6088        let err = t.add_index("ix".into(), "name").unwrap_err();
6089        assert!(matches!(err, StorageError::DuplicateIndex { ref name } if name == "ix"));
6090    }
6091
6092    #[test]
6093    fn add_index_unknown_column_rejected() {
6094        let mut cat = populated_users();
6095        let err = cat
6096            .get_mut("users")
6097            .unwrap()
6098            .add_index("ix".into(), "ghost")
6099            .unwrap_err();
6100        assert!(matches!(err, StorageError::ColumnNotFound { ref column } if column == "ghost"));
6101    }
6102
6103    #[test]
6104    fn insert_after_create_index_updates_it() {
6105        let mut cat = populated_users();
6106        let t = cat.get_mut("users").unwrap();
6107        t.add_index("by_name".into(), "name").unwrap();
6108        t.insert(Row::new(vec![
6109            Value::Int(4),
6110            Value::Text("dave".into()),
6111            Value::Null,
6112        ]))
6113        .unwrap();
6114        let idx = t.index_on(1).unwrap();
6115        assert_eq!(
6116            idx.lookup_eq(&IndexKey::Text("dave".into())),
6117            &[RowLocator::Hot(3)]
6118        );
6119        // Pre-existing duplicates remain mapped to the two original row idxs.
6120        assert_eq!(
6121            idx.lookup_eq(&IndexKey::Text("alice".into())),
6122            &[RowLocator::Hot(0), RowLocator::Hot(2)]
6123        );
6124    }
6125
6126    #[test]
6127    fn null_or_float_values_are_not_indexed() {
6128        let mut cat = populated_users();
6129        let t = cat.get_mut("users").unwrap();
6130        t.add_index("by_score".into(), "score").unwrap();
6131        let idx = t.index_on(2).unwrap();
6132        // bob's score is NULL → no entry for bob.
6133        // Score is Float → the spec says we don't index NaN-prone columns,
6134        // so even the present scores are absent. Lookups via IndexKey::Int(90)
6135        // mis-match the column type and trivially find nothing.
6136        assert_eq!(idx.lookup_eq(&IndexKey::Int(90)), &[] as &[RowLocator]);
6137    }
6138
6139    // --- v0.11 vector type -------------------------------------------------
6140
6141    #[test]
6142    fn vector_value_data_type_carries_dim() {
6143        let v = Value::Vector(vec![1.0, 2.0, 3.0]);
6144        assert_eq!(
6145            v.data_type(),
6146            Some(DataType::Vector {
6147                dim: 3,
6148                encoding: VecEncoding::F32
6149            })
6150        );
6151    }
6152
6153    #[test]
6154    fn vector_column_insert_matching_dim_ok() {
6155        let mut cat = Catalog::new();
6156        cat.create_table(TableSchema::new(
6157            "emb",
6158            vec![ColumnSchema::new(
6159                "v",
6160                DataType::Vector {
6161                    dim: 3,
6162                    encoding: VecEncoding::F32,
6163                },
6164                false,
6165            )],
6166        ))
6167        .unwrap();
6168        cat.get_mut("emb")
6169            .unwrap()
6170            .insert(Row::new(vec![Value::Vector(vec![1.0, 2.0, 3.0])]))
6171            .unwrap();
6172    }
6173
6174    #[test]
6175    fn vector_column_insert_dim_mismatch_rejected() {
6176        let mut cat = Catalog::new();
6177        cat.create_table(TableSchema::new(
6178            "emb",
6179            vec![ColumnSchema::new(
6180                "v",
6181                DataType::Vector {
6182                    dim: 3,
6183                    encoding: VecEncoding::F32,
6184                },
6185                false,
6186            )],
6187        ))
6188        .unwrap();
6189        let err = cat
6190            .get_mut("emb")
6191            .unwrap()
6192            .insert(Row::new(vec![Value::Vector(vec![1.0, 2.0])]))
6193            .unwrap_err();
6194        assert!(matches!(err, StorageError::TypeMismatch { .. }));
6195    }
6196
6197    #[test]
6198    fn vector_value_survives_catalog_round_trip() {
6199        let mut cat = Catalog::new();
6200        cat.create_table(TableSchema::new(
6201            "emb",
6202            vec![
6203                ColumnSchema::new("id", DataType::Int, false),
6204                ColumnSchema::new(
6205                    "v",
6206                    DataType::Vector {
6207                        dim: 4,
6208                        encoding: VecEncoding::F32,
6209                    },
6210                    false,
6211                ),
6212            ],
6213        ))
6214        .unwrap();
6215        cat.get_mut("emb")
6216            .unwrap()
6217            .insert(Row::new(vec![
6218                Value::Int(1),
6219                Value::Vector(vec![0.5, -1.25, 3.0, 7.0]),
6220            ]))
6221            .unwrap();
6222        let restored = Catalog::deserialize(&cat.serialize()).expect("round-trip");
6223        let table = restored.get("emb").unwrap();
6224        assert_eq!(
6225            table.schema().columns[1].ty,
6226            DataType::Vector {
6227                dim: 4,
6228                encoding: VecEncoding::F32
6229            }
6230        );
6231        assert_eq!(
6232            table.rows()[0].values[1],
6233            Value::Vector(vec![0.5, -1.25, 3.0, 7.0])
6234        );
6235    }
6236
6237    #[test]
6238    fn index_survives_serialize_deserialize_round_trip() {
6239        let mut cat = populated_users();
6240        cat.get_mut("users")
6241            .unwrap()
6242            .add_index("by_name".into(), "name")
6243            .unwrap();
6244        let restored = Catalog::deserialize(&cat.serialize()).unwrap();
6245        let idx = restored
6246            .get("users")
6247            .unwrap()
6248            .index_on(1)
6249            .expect("index_on(1) after restore");
6250        assert_eq!(idx.name, "by_name");
6251        // Data was rebuilt from rows, not deserialized directly.
6252        assert_eq!(
6253            idx.lookup_eq(&IndexKey::Text("alice".into())),
6254            &[RowLocator::Hot(0), RowLocator::Hot(2)]
6255        );
6256    }
6257
6258    // --- v5.1 cold-tier integration tests ----------------------
6259
6260    /// Schema with a BIGINT PK column matching what the v5.1 cold-
6261    /// tier path supports (`IndexKey::Int` → `u64` cast).
6262    fn bigint_pk_users_schema() -> TableSchema {
6263        TableSchema::new(
6264            "users",
6265            vec![
6266                ColumnSchema::new("id", DataType::BigInt, false),
6267                ColumnSchema::new("name", DataType::Text, false),
6268            ],
6269        )
6270    }
6271
6272    fn make_user_row(id: i64, name: &str) -> Row {
6273        Row::new(vec![Value::BigInt(id), Value::Text(name.into())])
6274    }
6275
6276    #[test]
6277    fn lookup_by_pk_finds_row_via_hot_index() {
6278        let mut cat = Catalog::new();
6279        cat.create_table(bigint_pk_users_schema()).unwrap();
6280        let t = cat.get_mut("users").unwrap();
6281        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
6282            t.insert(make_user_row(id, name)).unwrap();
6283        }
6284        t.add_index("by_id".into(), "id").unwrap();
6285        // All locators are Hot; cold_segments is empty.
6286        let got = cat
6287            .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
6288            .unwrap();
6289        assert_eq!(got, make_user_row(2, "bob"));
6290        assert_eq!(cat.cold_segment_count(), 0);
6291    }
6292
6293    #[test]
6294    fn lookup_by_pk_returns_none_when_key_missing() {
6295        let mut cat = Catalog::new();
6296        cat.create_table(bigint_pk_users_schema()).unwrap();
6297        let t = cat.get_mut("users").unwrap();
6298        t.insert(make_user_row(1, "alice")).unwrap();
6299        t.add_index("by_id".into(), "id").unwrap();
6300        assert!(
6301            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(999))
6302                .is_none()
6303        );
6304        // Also: unknown table / unknown index name.
6305        assert!(
6306            cat.lookup_by_pk("other_table", "by_id", &IndexKey::Int(1))
6307                .is_none()
6308        );
6309        assert!(
6310            cat.lookup_by_pk("users", "no_such_index", &IndexKey::Int(1))
6311                .is_none()
6312        );
6313    }
6314
6315    #[test]
6316    fn lookup_by_pk_resolves_cold_locator_via_loaded_segment() {
6317        // Build a cold-tier segment whose payloads are dense-encoded
6318        // BIGINT rows. Wire each PK into the BTree index as a Cold
6319        // locator. The hot tier carries no rows for those PKs.
6320        let mut cat = Catalog::new();
6321        cat.create_table(bigint_pk_users_schema()).unwrap();
6322        let t = cat.get_mut("users").unwrap();
6323        t.add_index("by_id".into(), "id").unwrap();
6324        let schema = t.schema.clone();
6325
6326        let cold_rows: Vec<(i64, &str)> =
6327            vec![(100, "ivy"), (200, "joe"), (300, "kim"), (400, "lin")];
6328        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
6329            .iter()
6330            .map(|(id, name)| {
6331                let row = make_user_row(*id, name);
6332                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
6333            })
6334            .collect();
6335        let (seg_bytes, _meta) =
6336            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
6337        let seg_id = cat.load_segment_bytes(seg_bytes).unwrap();
6338        assert_eq!(seg_id, 0);
6339        assert_eq!(cat.cold_segment_count(), 1);
6340
6341        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
6342            .iter()
6343            .map(|(id, _)| {
6344                (
6345                    IndexKey::Int(*id),
6346                    RowLocator::Cold {
6347                        segment_id: seg_id,
6348                        page_offset: 0,
6349                    },
6350                )
6351            })
6352            .collect();
6353        let registered = cat
6354            .get_mut("users")
6355            .unwrap()
6356            .register_cold_locators("by_id", pairs)
6357            .unwrap();
6358        assert_eq!(registered, 4);
6359
6360        for (id, name) in &cold_rows {
6361            let got = cat
6362                .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
6363                .unwrap_or_else(|| panic!("cold key {id} not found"));
6364            assert_eq!(got, make_user_row(*id, name));
6365        }
6366        // Cold key that isn't in the segment must return None.
6367        assert!(
6368            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(999))
6369                .is_none()
6370        );
6371    }
6372
6373    #[test]
6374    fn lookup_by_pk_mixes_hot_and_cold_tiers() {
6375        // Half the rows live in the hot tier (Table::rows + add_index
6376        // produces Hot locators); half live in a cold segment and have
6377        // Cold locators wired manually. Each lookup hits the right tier.
6378        let mut cat = Catalog::new();
6379        cat.create_table(bigint_pk_users_schema()).unwrap();
6380        let t = cat.get_mut("users").unwrap();
6381        for (id, name) in [(1i64, "alice"), (2, "bob")] {
6382            t.insert(make_user_row(id, name)).unwrap();
6383        }
6384        t.add_index("by_id".into(), "id").unwrap();
6385        let schema = t.schema.clone();
6386
6387        let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe")];
6388        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
6389            .iter()
6390            .map(|(id, name)| {
6391                let row = make_user_row(*id, name);
6392                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
6393            })
6394            .collect();
6395        let (seg_bytes, _) =
6396            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
6397        let seg_id = cat.load_segment_bytes(seg_bytes).unwrap();
6398        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
6399            .iter()
6400            .map(|(id, _)| {
6401                (
6402                    IndexKey::Int(*id),
6403                    RowLocator::Cold {
6404                        segment_id: seg_id,
6405                        page_offset: 0,
6406                    },
6407                )
6408            })
6409            .collect();
6410        cat.get_mut("users")
6411            .unwrap()
6412            .register_cold_locators("by_id", pairs)
6413            .unwrap();
6414
6415        // Hot tier hits.
6416        assert_eq!(
6417            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
6418                .unwrap(),
6419            make_user_row(1, "alice")
6420        );
6421        assert_eq!(
6422            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
6423                .unwrap(),
6424            make_user_row(2, "bob")
6425        );
6426        // Cold tier hits.
6427        assert_eq!(
6428            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(100))
6429                .unwrap(),
6430            make_user_row(100, "ivy")
6431        );
6432        assert_eq!(
6433            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(200))
6434                .unwrap(),
6435            make_user_row(200, "joe")
6436        );
6437        // Miss in both tiers.
6438        assert!(
6439            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(50))
6440                .is_none()
6441        );
6442    }
6443
6444    #[test]
6445    fn register_cold_locators_rejects_nsw_index() {
6446        let mut cat = Catalog::new();
6447        cat.create_table(TableSchema::new(
6448            "vecs",
6449            vec![
6450                ColumnSchema::new("id", DataType::Int, false),
6451                ColumnSchema::new(
6452                    "v",
6453                    DataType::Vector {
6454                        dim: 4,
6455                        encoding: VecEncoding::F32,
6456                    },
6457                    false,
6458                ),
6459            ],
6460        ))
6461        .unwrap();
6462        let t = cat.get_mut("vecs").unwrap();
6463        t.insert(Row::new(vec![
6464            Value::Int(1),
6465            Value::Vector(vec![1.0, 0.0, 0.0, 0.0]),
6466        ]))
6467        .unwrap();
6468        t.add_nsw_index("by_v".into(), "v", NSW_DEFAULT_M).unwrap();
6469        let err = t
6470            .register_cold_locators(
6471                "by_v",
6472                vec![(
6473                    IndexKey::Int(1),
6474                    RowLocator::Cold {
6475                        segment_id: 0,
6476                        page_offset: 0,
6477                    },
6478                )],
6479            )
6480            .unwrap_err();
6481        // v6.7.1: message switched from "is NSW" to "is not BTree"
6482        // when the Brin variant was added.
6483        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("not BTree")));
6484    }
6485
6486    #[test]
6487    fn load_segment_bytes_rejects_garbage() {
6488        let mut cat = Catalog::new();
6489        let err = cat.load_segment_bytes(vec![0u8; 10]).unwrap_err();
6490        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("segment")));
6491        // Loader doesn't mutate state on error.
6492        assert_eq!(cat.cold_segment_count(), 0);
6493    }
6494
6495    #[test]
6496    fn load_segment_bytes_returns_sequential_ids() {
6497        let mut cat = Catalog::new();
6498        cat.create_table(bigint_pk_users_schema()).unwrap();
6499        let schema = cat.get("users").unwrap().schema.clone();
6500        for batch in 0u32..3 {
6501            let rows: Vec<(u64, Vec<u8>)> = (0u64..4)
6502                .map(|i| {
6503                    let id = u64::from(batch) * 100 + i;
6504                    let row = make_user_row(id.cast_signed(), "x");
6505                    (id, encode_row_body_dense(&row, &schema))
6506                })
6507                .collect();
6508            let (bytes, _) = encode_segment(rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
6509            assert_eq!(cat.load_segment_bytes(bytes).unwrap(), batch);
6510        }
6511        assert_eq!(cat.cold_segment_count(), 3);
6512    }
6513
6514    // --- v5.2 catalog format v9 ----------------------------------
6515
6516    /// Hand-craft a v8 catalog byte stream and confirm the v9 reader
6517    /// accepts it and surfaces every `BTree` entry as a Hot locator.
6518    /// Guards the backward-compat read path: existing v3.0.2 / v4.x
6519    /// snapshots on disk must keep loading after the v5.2 bump.
6520    #[test]
6521    fn v8_catalog_decodes_as_all_hot_under_v9_reader() {
6522        // Build a populated catalog in memory, snapshot it with the
6523        // v9 serializer, then patch the version byte back to 8 and
6524        // strip the v9 BTree payload bytes so the layout matches what
6525        // a real v8 snapshot would have produced on disk. The v9
6526        // reader's version dispatch path then rebuilds the index
6527        // from rows (every locator becomes Hot).
6528        let mut cat = populated_users();
6529        cat.get_mut("users")
6530            .unwrap()
6531            .add_index("by_name".into(), "name")
6532            .unwrap();
6533
6534        // To produce a faithful v8 byte stream we re-encode the same
6535        // catalog with the v8 layout: identical bytes up to (and
6536        // including) the per-index kind tag, but no inline BTree
6537        // entries.
6538        let v8_bytes = encode_as_v8(&cat);
6539        assert_eq!(v8_bytes[FILE_MAGIC.len()], 8, "version byte must be 8");
6540
6541        let restored = Catalog::deserialize(&v8_bytes).expect("v9 reader accepts v8 stream");
6542        let idx = restored
6543            .get("users")
6544            .unwrap()
6545            .index_on(1)
6546            .expect("index_on(1) after restore");
6547        // v8 path always materialises Hot locators (no cold tier
6548        // existed pre-v5.2).
6549        assert_eq!(
6550            idx.lookup_eq(&IndexKey::Text("alice".into())),
6551            &[RowLocator::Hot(0), RowLocator::Hot(2)]
6552        );
6553        // No accidental Cold leak.
6554        for entry in idx.lookup_eq(&IndexKey::Text("alice".into())) {
6555            assert!(entry.is_hot(), "v8 → v9 read must yield Hot only");
6556        }
6557    }
6558
6559    /// Encode `cat` using the v8 layout (no inline `BTree` entries,
6560    /// version byte = 8). Pure test helper — duplicates just enough
6561    /// of `Catalog::serialize` to produce a faithful v8 stream that
6562    /// real v3.0.2 / v4.x deployments wrote.
6563    fn encode_as_v8(cat: &Catalog) -> Vec<u8> {
6564        let mut out = Vec::with_capacity(64);
6565        out.extend_from_slice(FILE_MAGIC);
6566        out.push(8u8);
6567        write_u32(&mut out, u32::try_from(cat.tables.len()).unwrap());
6568        for t in &cat.tables {
6569            write_str(&mut out, &t.schema.name);
6570            write_u16(&mut out, u16::try_from(t.schema.columns.len()).unwrap());
6571            for c in &t.schema.columns {
6572                write_str(&mut out, &c.name);
6573                write_data_type(&mut out, c.ty);
6574                out.push(u8::from(c.nullable));
6575                match &c.default {
6576                    None => out.push(0),
6577                    Some(v) => {
6578                        out.push(1);
6579                        write_value(&mut out, v);
6580                    }
6581                }
6582                out.push(u8::from(c.auto_increment));
6583            }
6584            write_u32(&mut out, u32::try_from(t.rows.len()).unwrap());
6585            for row in &t.rows {
6586                out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
6587            }
6588            write_u16(&mut out, u16::try_from(t.indices.len()).unwrap());
6589            for idx in &t.indices {
6590                write_str(&mut out, &idx.name);
6591                write_u16(&mut out, u16::try_from(idx.column_position).unwrap());
6592                match &idx.kind {
6593                    // v8 BTree wrote only the kind tag; entries
6594                    // rebuild from rows on read.
6595                    IndexKind::BTree(_) => out.push(0),
6596                    IndexKind::Nsw(g) => {
6597                        out.push(1);
6598                        write_u16(&mut out, u16::try_from(g.m).unwrap());
6599                        write_nsw_graph(&mut out, g);
6600                    }
6601                    // v8 had no BRIN; this test-only writer can't
6602                    // serialise BRIN into the legacy format.
6603                    IndexKind::Brin { .. } => panic!(
6604                        "v8 catalog writer cannot serialise BRIN — \
6605                         tests with BRIN indices must use the current writer"
6606                    ),
6607                }
6608            }
6609        }
6610        out
6611    }
6612
6613    /// Build a catalog that carries both hot and cold locators on a
6614    /// `BTree` index, snapshot it through `serialize`, then deserialise
6615    /// and confirm every Cold locator round-trips byte-identical and
6616    /// `lookup_by_pk` resolves through the rebuilt cold-segment
6617    /// registry.
6618    #[test]
6619    fn v9_catalog_round_trip_preserves_cold_locators() {
6620        let mut cat = Catalog::new();
6621        cat.create_table(bigint_pk_users_schema()).unwrap();
6622        let t = cat.get_mut("users").unwrap();
6623        // Hot rows: 1, 2
6624        for (id, name) in [(1i64, "alice"), (2, "bob")] {
6625            t.insert(make_user_row(id, name)).unwrap();
6626        }
6627        t.add_index("by_id".into(), "id").unwrap();
6628        let schema = t.schema.clone();
6629
6630        // Cold rows: 100, 200, 300 — sit in a single segment.
6631        let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe"), (300, "kim")];
6632        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
6633            .iter()
6634            .map(|(id, name)| {
6635                let row = make_user_row(*id, name);
6636                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
6637            })
6638            .collect();
6639        let (seg_bytes, _) =
6640            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
6641        let seg_id = cat.load_segment_bytes(seg_bytes.clone()).unwrap();
6642        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
6643            .iter()
6644            .map(|(id, _)| {
6645                (
6646                    IndexKey::Int(*id),
6647                    RowLocator::Cold {
6648                        segment_id: seg_id,
6649                        page_offset: 0,
6650                    },
6651                )
6652            })
6653            .collect();
6654        cat.get_mut("users")
6655            .unwrap()
6656            .register_cold_locators("by_id", pairs)
6657            .unwrap();
6658
6659        // Snapshot + restore via the v9 codec.
6660        let bytes = cat.serialize();
6661        assert_eq!(bytes[FILE_MAGIC.len()], FILE_VERSION);
6662        let mut restored = Catalog::deserialize(&bytes).expect("v9 round-trip parses");
6663
6664        // Catalog::serialize does not yet emit cold segment file
6665        // bytes (v5.3 manifest is the future home for that). For
6666        // this v9 test the caller side-loads the segment again so
6667        // lookup_by_pk can resolve the Cold locator. The point of
6668        // this assertion is that the locator metadata survived the
6669        // catalog round-trip.
6670        let restored_seg_id = restored.load_segment_bytes(seg_bytes).unwrap();
6671        assert_eq!(restored_seg_id, seg_id);
6672
6673        let idx = restored.get("users").unwrap().index_on(0).unwrap();
6674        // Hot locators round-trip.
6675        assert_eq!(idx.lookup_eq(&IndexKey::Int(1)), &[RowLocator::Hot(0)]);
6676        assert_eq!(idx.lookup_eq(&IndexKey::Int(2)), &[RowLocator::Hot(1)]);
6677        // Cold locators round-trip byte-identical.
6678        for (id, _) in &cold_rows {
6679            assert_eq!(
6680                idx.lookup_eq(&IndexKey::Int(*id)),
6681                &[RowLocator::Cold {
6682                    segment_id: seg_id,
6683                    page_offset: 0,
6684                }]
6685            );
6686        }
6687        // End-to-end: lookup_by_pk resolves both tiers.
6688        assert_eq!(
6689            restored
6690                .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
6691                .unwrap(),
6692            make_user_row(2, "bob")
6693        );
6694        for (id, name) in &cold_rows {
6695            assert_eq!(
6696                restored
6697                    .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
6698                    .unwrap(),
6699                make_user_row(*id, name)
6700            );
6701        }
6702    }
6703
6704    // --- v5.2.1 hot tier byte tracking ---------------------------
6705
6706    /// `row_body_encoded_len` is the perf-critical fast path; pin it
6707    /// against `encode_row_body_dense(...).len()` for every
6708    /// representative cell type so an encoder change can't silently
6709    /// desync the counter.
6710    #[test]
6711    fn row_body_encoded_len_matches_actual_encode_for_all_types() {
6712        let schema = TableSchema::new(
6713            "wide",
6714            vec![
6715                ColumnSchema::new("a", DataType::SmallInt, true),
6716                ColumnSchema::new("b", DataType::Int, false),
6717                ColumnSchema::new("c", DataType::BigInt, false),
6718                ColumnSchema::new("d", DataType::Float, false),
6719                ColumnSchema::new("e", DataType::Bool, false),
6720                ColumnSchema::new("f", DataType::Text, false),
6721                ColumnSchema::new(
6722                    "g",
6723                    DataType::Vector {
6724                        dim: 3,
6725                        encoding: VecEncoding::F32,
6726                    },
6727                    false,
6728                ),
6729                ColumnSchema::new(
6730                    "h",
6731                    DataType::Numeric {
6732                        precision: 18,
6733                        scale: 2,
6734                    },
6735                    false,
6736                ),
6737                ColumnSchema::new("i", DataType::Date, false),
6738                ColumnSchema::new("j", DataType::Timestamp, false),
6739            ],
6740        );
6741        let cases: &[Row] = &[
6742            Row::new(vec![
6743                Value::SmallInt(7),
6744                Value::Int(42),
6745                Value::BigInt(1_000_000),
6746                Value::Float(1.5),
6747                Value::Bool(true),
6748                Value::Text("hello".into()),
6749                Value::Vector(vec![1.0, 2.0, 3.0]),
6750                Value::Numeric {
6751                    scaled: 12345,
6752                    scale: 2,
6753                },
6754                Value::Date(20_000),
6755                Value::Timestamp(1_700_000_000_000_000),
6756            ]),
6757            // NULL in the bitmap, varied text length.
6758            Row::new(vec![
6759                Value::Null,
6760                Value::Int(0),
6761                Value::BigInt(0),
6762                Value::Float(0.0),
6763                Value::Bool(false),
6764                Value::Text(String::new()),
6765                Value::Vector(vec![]),
6766                Value::Numeric {
6767                    scaled: 0,
6768                    scale: 2,
6769                },
6770                Value::Date(0),
6771                Value::Timestamp(0),
6772            ]),
6773            Row::new(vec![
6774                Value::SmallInt(-1),
6775                Value::Int(-1),
6776                Value::BigInt(-1),
6777                Value::Float(-0.5),
6778                Value::Bool(true),
6779                Value::Text("a much longer payload here".into()),
6780                Value::Vector(vec![0.1, 0.2, 0.3]),
6781                Value::Numeric {
6782                    scaled: -999_999_999,
6783                    scale: 2,
6784                },
6785                Value::Date(-1),
6786                Value::Timestamp(-1),
6787            ]),
6788        ];
6789        for row in cases {
6790            let actual = encode_row_body_dense(row, &schema).len();
6791            let fast = row_body_encoded_len(row, &schema);
6792            assert_eq!(actual, fast, "row {row:?}");
6793        }
6794    }
6795
6796    #[test]
6797    fn hot_bytes_grows_on_insert_and_matches_encoded_sum() {
6798        let mut cat = Catalog::new();
6799        cat.create_table(bigint_pk_users_schema()).unwrap();
6800        let t = cat.get_mut("users").unwrap();
6801        assert_eq!(t.hot_bytes(), 0);
6802        let mut expected: u64 = 0;
6803        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
6804            let row = make_user_row(id, name);
6805            expected += encode_row_body_dense(&row, &t.schema).len() as u64;
6806            t.insert(row).unwrap();
6807        }
6808        assert_eq!(t.hot_bytes(), expected);
6809        assert_eq!(cat.hot_tier_bytes(), expected);
6810    }
6811
6812    #[test]
6813    fn hot_bytes_shrinks_on_delete() {
6814        let mut cat = Catalog::new();
6815        cat.create_table(bigint_pk_users_schema()).unwrap();
6816        let t = cat.get_mut("users").unwrap();
6817        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
6818            t.insert(make_user_row(id, name)).unwrap();
6819        }
6820        let before = t.hot_bytes();
6821        // Delete row at position 1 (bob).
6822        let bob_row = make_user_row(2, "bob");
6823        let bob_bytes = encode_row_body_dense(&bob_row, &t.schema).len() as u64;
6824        let removed = t.delete_rows(&[1]);
6825        assert_eq!(removed, 1);
6826        assert_eq!(t.hot_bytes(), before - bob_bytes);
6827    }
6828
6829    #[test]
6830    fn hot_bytes_diffs_on_update_for_variable_width_columns() {
6831        let mut cat = Catalog::new();
6832        cat.create_table(bigint_pk_users_schema()).unwrap();
6833        let t = cat.get_mut("users").unwrap();
6834        t.insert(make_user_row(1, "alice")).unwrap();
6835        let after_insert = t.hot_bytes();
6836        // Update with a longer text payload — bytes must grow exactly
6837        // by the text-length delta.
6838        let new_row = make_user_row(1, "alice-the-longer-name");
6839        let old_len = encode_row_body_dense(&make_user_row(1, "alice"), &t.schema).len() as u64;
6840        let new_len = encode_row_body_dense(&new_row, &t.schema).len() as u64;
6841        t.update_row(0, new_row.values).unwrap();
6842        assert_eq!(t.hot_bytes(), after_insert - old_len + new_len);
6843        assert!(t.hot_bytes() > after_insert, "longer text grew the counter");
6844    }
6845
6846    #[test]
6847    fn hot_bytes_round_trips_through_serialize_deserialize() {
6848        let mut cat = Catalog::new();
6849        cat.create_table(bigint_pk_users_schema()).unwrap();
6850        let t = cat.get_mut("users").unwrap();
6851        for i in 0..10 {
6852            t.insert(make_user_row(i, &alloc::format!("name-{i}")))
6853                .unwrap();
6854        }
6855        let pre = cat.hot_tier_bytes();
6856        let restored = Catalog::deserialize(&cat.serialize()).unwrap();
6857        assert_eq!(restored.hot_tier_bytes(), pre);
6858        assert_eq!(restored.get("users").unwrap().hot_bytes(), pre);
6859    }
6860
6861    // --- v5.2.2 freezer atomic swap -------------------------------
6862
6863    /// Happy path: freeze the first half of a populated hot tier,
6864    /// confirm row counts shift, `hot_bytes` shrinks, and every frozen
6865    /// PK still resolves via `lookup_by_pk` (now through the cold
6866    /// segment registered by the freeze).
6867    #[test]
6868    fn freeze_oldest_to_cold_moves_rows_and_keeps_lookups_working() {
6869        let mut cat = Catalog::new();
6870        cat.create_table(bigint_pk_users_schema()).unwrap();
6871        let t = cat.get_mut("users").unwrap();
6872        for id in 0..10i64 {
6873            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
6874                .unwrap();
6875        }
6876        t.add_index("by_id".into(), "id").unwrap();
6877        let total_bytes_before = t.hot_bytes();
6878
6879        let report = cat
6880            .freeze_oldest_to_cold("users", "by_id", 6)
6881            .expect("freeze succeeds");
6882        assert_eq!(report.frozen_rows, 6);
6883        assert_eq!(report.segment_id, 0);
6884        assert!(report.bytes_freed > 0);
6885        assert!(!report.segment_bytes.is_empty());
6886
6887        let t = cat.get("users").unwrap();
6888        assert_eq!(t.row_count(), 4, "4 hot rows remain (10 - 6 frozen)");
6889        assert_eq!(cat.cold_segment_count(), 1);
6890        // Hot bytes shrank by exactly the freed amount.
6891        assert_eq!(
6892            t.hot_bytes(),
6893            total_bytes_before - report.bytes_freed,
6894            "hot_bytes accounting matches FreezeReport"
6895        );
6896
6897        // Every original PK still resolves — frozen ones via the
6898        // cold segment, kept ones via the (renumbered) hot tier.
6899        for id in 0..10i64 {
6900            let got = cat
6901                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
6902                .unwrap_or_else(|| panic!("PK {id} disappeared after freeze"));
6903            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
6904        }
6905    }
6906
6907    /// Two successive freezes on the same index must preserve the
6908    /// first batch's cold locators when the second freeze runs.
6909    /// Catches the `rebuild_indices` wipe-Cold-on-delete bug that
6910    /// `collect_cold_locators` / re-register guards against.
6911    #[test]
6912    fn freeze_twice_preserves_prior_cold_locators() {
6913        let mut cat = Catalog::new();
6914        cat.create_table(bigint_pk_users_schema()).unwrap();
6915        let t = cat.get_mut("users").unwrap();
6916        for id in 0..12i64 {
6917            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
6918                .unwrap();
6919        }
6920        t.add_index("by_id".into(), "id").unwrap();
6921
6922        cat.freeze_oldest_to_cold("users", "by_id", 4)
6923            .expect("first freeze ok");
6924        cat.freeze_oldest_to_cold("users", "by_id", 4)
6925            .expect("second freeze ok");
6926
6927        assert_eq!(cat.get("users").unwrap().row_count(), 4);
6928        assert_eq!(cat.cold_segment_count(), 2);
6929        // All 12 PKs still resolve — first 4 via segment 0,
6930        // next 4 via segment 1, last 4 still hot.
6931        for id in 0..12i64 {
6932            let got = cat
6933                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
6934                .unwrap_or_else(|| panic!("PK {id} not resolvable after two freezes"));
6935            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
6936        }
6937    }
6938
6939    /// Validation guard tests. Each must return `Err` and **not
6940    /// mutate the catalog** — the API is all-or-nothing.
6941    #[test]
6942    fn freeze_oldest_to_cold_rejects_invalid_input() {
6943        let mut cat = Catalog::new();
6944        cat.create_table(bigint_pk_users_schema()).unwrap();
6945        let t = cat.get_mut("users").unwrap();
6946        for id in 0..3i64 {
6947            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
6948                .unwrap();
6949        }
6950        t.add_index("by_id".into(), "id").unwrap();
6951
6952        // max_rows == 0
6953        assert!(matches!(
6954            cat.freeze_oldest_to_cold("users", "by_id", 0),
6955            Err(StorageError::Corrupt(_))
6956        ));
6957        // table missing
6958        assert!(matches!(
6959            cat.freeze_oldest_to_cold("missing", "by_id", 1),
6960            Err(StorageError::Corrupt(_))
6961        ));
6962        // index missing
6963        assert!(matches!(
6964            cat.freeze_oldest_to_cold("users", "no_such_index", 1),
6965            Err(StorageError::Corrupt(_))
6966        ));
6967        // max_rows > row_count
6968        assert!(matches!(
6969            cat.freeze_oldest_to_cold("users", "by_id", 999),
6970            Err(StorageError::Corrupt(_))
6971        ));
6972        // Catalog still untouched.
6973        assert_eq!(cat.get("users").unwrap().row_count(), 3);
6974        assert_eq!(cat.cold_segment_count(), 0);
6975    }
6976
6977    /// Freeze with a non-integer PK column must surface a clear
6978    /// error (Text PKs land in v5.5+).
6979    #[test]
6980    fn freeze_oldest_to_cold_rejects_non_integer_pk() {
6981        let mut cat = Catalog::new();
6982        cat.create_table(TableSchema::new(
6983            "by_name",
6984            vec![
6985                ColumnSchema::new("name", DataType::Text, false),
6986                ColumnSchema::new("payload", DataType::BigInt, false),
6987            ],
6988        ))
6989        .unwrap();
6990        let t = cat.get_mut("by_name").unwrap();
6991        t.insert(Row::new(vec![Value::Text("a".into()), Value::BigInt(1)]))
6992            .unwrap();
6993        t.add_index("by_n".into(), "name").unwrap();
6994        let err = cat
6995            .freeze_oldest_to_cold("by_name", "by_n", 1)
6996            .expect_err("non-integer PK rejected");
6997        match err {
6998            StorageError::Corrupt(s) => assert!(
6999                s.contains("non-integer"),
7000                "error message names the constraint: {s}"
7001            ),
7002            other => panic!("expected Corrupt, got {other:?}"),
7003        }
7004        // Catalog untouched.
7005        assert_eq!(cat.get("by_name").unwrap().row_count(), 1);
7006        assert_eq!(cat.cold_segment_count(), 0);
7007    }
7008
7009    /// Hot-tier rows after the freeze must keep their secondary-
7010    /// index lookups working — `delete_rows` shifts positions, and
7011    /// `rebuild_indices` must regenerate Hot locators at the new
7012    /// indices.
7013    #[test]
7014    fn freeze_keeps_remaining_hot_rows_addressable_via_secondary_index() {
7015        let mut cat = Catalog::new();
7016        cat.create_table(bigint_pk_users_schema()).unwrap();
7017        let t = cat.get_mut("users").unwrap();
7018        for id in 0..6i64 {
7019            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7020                .unwrap();
7021        }
7022        t.add_index("by_id".into(), "id").unwrap();
7023        t.add_index("by_name".into(), "name").unwrap();
7024
7025        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7026
7027        // Remaining hot rows: id 3, 4, 5. They moved to positions
7028        // 0, 1, 2 inside `self.rows`; the `by_name` index must now
7029        // resolve them via fresh Hot locators.
7030        let idx = cat.get("users").unwrap().index_on(1).unwrap();
7031        let got = idx.lookup_eq(&IndexKey::Text("u-4".into()));
7032        assert_eq!(got.len(), 1);
7033        assert!(got[0].is_hot(), "kept-hot rows still surface as Hot");
7034        match got[0] {
7035            RowLocator::Hot(i) => {
7036                // The 4th-inserted row was at position 4; after
7037                // dropping positions 0..3 it sits at position 1.
7038                assert_eq!(i, 1);
7039            }
7040            RowLocator::Cold { .. } => unreachable!(),
7041        }
7042    }
7043
7044    // --- v5.2.3 promote-on-write primitives ----------------------
7045
7046    /// Build a populated catalog with the first N rows frozen, then
7047    /// run `promote_cold_row` and verify the row crossed tiers
7048    /// correctly: the cold locator is retired, a fresh Hot locator
7049    /// appears, `lookup_by_pk` returns the row from the hot tier, and
7050    /// `hot_bytes` grew by the row's encoded byte length.
7051    #[test]
7052    fn promote_cold_row_pulls_frozen_row_back_to_hot_tier() {
7053        let mut cat = Catalog::new();
7054        cat.create_table(bigint_pk_users_schema()).unwrap();
7055        let t = cat.get_mut("users").unwrap();
7056        for id in 0..6i64 {
7057            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7058                .unwrap();
7059        }
7060        t.add_index("by_id".into(), "id").unwrap();
7061        // Freeze first 4 rows (ids 0..3). After: hot rows = 4, 5 at
7062        // positions 0, 1; cold locators for keys 0..3.
7063        cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
7064        let hot_bytes_before = cat.get("users").unwrap().hot_bytes();
7065
7066        // Promote PK=2 — it lives in segment 0 as a cold row.
7067        let new_idx = cat
7068            .promote_cold_row("users", "by_id", &IndexKey::Int(2))
7069            .expect("promote ok")
7070            .expect("PK 2 was cold");
7071        assert_eq!(
7072            new_idx, 2,
7073            "promoted row appended after the 2 surviving hot rows"
7074        );
7075
7076        let t = cat.get("users").unwrap();
7077        assert_eq!(t.row_count(), 3, "hot tier grew from 2 to 3");
7078        // Hot-bytes climbed by exactly one row's encoded length.
7079        let row = make_user_row(2, "u-2");
7080        let row_len = encode_row_body_dense(&row, &t.schema).len() as u64;
7081        assert_eq!(t.hot_bytes(), hot_bytes_before + row_len);
7082
7083        // The index now reports a Hot locator (the freshly inserted
7084        // row) — no Cold locator left for PK 2.
7085        let entries = t.index_on(0).unwrap().lookup_eq(&IndexKey::Int(2));
7086        assert_eq!(entries.len(), 1, "exactly one locator per key");
7087        assert!(entries[0].is_hot(), "promote retired the Cold locator");
7088        // End-to-end: lookup_by_pk still returns the row body.
7089        assert_eq!(
7090            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
7091                .unwrap(),
7092            row
7093        );
7094        // Other cold rows untouched — still resolvable through the
7095        // segment.
7096        assert_eq!(
7097            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(0))
7098                .unwrap(),
7099            make_user_row(0, "u-0")
7100        );
7101    }
7102
7103    /// `promote_cold_row` on a key that's already hot (or absent)
7104    /// returns `Ok(None)` — not an error. The caller falls back to
7105    /// the hot-only update/delete path.
7106    #[test]
7107    fn promote_cold_row_returns_none_when_key_is_not_cold() {
7108        let mut cat = Catalog::new();
7109        cat.create_table(bigint_pk_users_schema()).unwrap();
7110        let t = cat.get_mut("users").unwrap();
7111        t.insert(make_user_row(7, "alice")).unwrap();
7112        t.add_index("by_id".into(), "id").unwrap();
7113
7114        // Hot-only key.
7115        assert!(
7116            cat.promote_cold_row("users", "by_id", &IndexKey::Int(7))
7117                .unwrap()
7118                .is_none()
7119        );
7120        // Absent key.
7121        assert!(
7122            cat.promote_cold_row("users", "by_id", &IndexKey::Int(99))
7123                .unwrap()
7124                .is_none()
7125        );
7126        // Catalog untouched on both no-op paths.
7127        assert_eq!(cat.get("users").unwrap().row_count(), 1);
7128        assert_eq!(cat.cold_segment_count(), 0);
7129    }
7130
7131    /// `shadow_cold_row` removes every Cold locator for a key on a
7132    /// `BTree` index. After the shadow, `lookup_by_pk` for that key
7133    /// returns None (the row data still sits in the segment file,
7134    /// but it's now garbage; compaction will reclaim it later).
7135    #[test]
7136    fn shadow_cold_row_removes_cold_locators_and_drops_lookup() {
7137        let mut cat = Catalog::new();
7138        cat.create_table(bigint_pk_users_schema()).unwrap();
7139        let t = cat.get_mut("users").unwrap();
7140        for id in 0..5i64 {
7141            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
7142                .unwrap();
7143        }
7144        t.add_index("by_id".into(), "id").unwrap();
7145        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
7146
7147        // Shadow PK=1 — pre-shadow lookup hits the cold tier.
7148        assert!(
7149            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
7150                .is_some(),
7151            "frozen PK resolves before shadow"
7152        );
7153        let removed = cat
7154            .shadow_cold_row("users", "by_id", &IndexKey::Int(1))
7155            .unwrap();
7156        assert_eq!(removed, 1, "exactly one cold locator retired");
7157
7158        // Post-shadow: lookup misses, even though the row still
7159        // exists in segment 0.
7160        assert!(
7161            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
7162                .is_none(),
7163            "shadowed key no longer resolves"
7164        );
7165        // Other cold keys still resolve.
7166        assert_eq!(
7167            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(0))
7168                .unwrap(),
7169            make_user_row(0, "u-0")
7170        );
7171        assert_eq!(
7172            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
7173                .unwrap(),
7174            make_user_row(2, "u-2")
7175        );
7176    }
7177
7178    /// `shadow_cold_row` returns 0 (not Err) for keys with only Hot
7179    /// entries or no entries — the engine's DELETE path uses this
7180    /// signal to decide whether the cold-tier shadow path consumed
7181    /// the work.
7182    #[test]
7183    fn shadow_cold_row_returns_zero_when_key_is_not_cold() {
7184        let mut cat = Catalog::new();
7185        cat.create_table(bigint_pk_users_schema()).unwrap();
7186        let t = cat.get_mut("users").unwrap();
7187        t.insert(make_user_row(1, "alice")).unwrap();
7188        t.add_index("by_id".into(), "id").unwrap();
7189        assert_eq!(
7190            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(1))
7191                .unwrap(),
7192            0,
7193            "hot-only key drops no cold locators"
7194        );
7195        assert_eq!(
7196            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(999))
7197                .unwrap(),
7198            0,
7199            "absent key drops no cold locators"
7200        );
7201        assert_eq!(cat.get("users").unwrap().row_count(), 1);
7202    }
7203
7204    /// Validation guards on both promote / shadow primitives.
7205    #[test]
7206    fn promote_and_shadow_reject_invalid_inputs() {
7207        let mut cat = Catalog::new();
7208        cat.create_table(bigint_pk_users_schema()).unwrap();
7209        let t = cat.get_mut("users").unwrap();
7210        t.insert(make_user_row(1, "alice")).unwrap();
7211        t.add_index("by_id".into(), "id").unwrap();
7212
7213        // Missing table.
7214        assert!(matches!(
7215            cat.promote_cold_row("missing", "by_id", &IndexKey::Int(1)),
7216            Err(StorageError::Corrupt(_))
7217        ));
7218        assert!(matches!(
7219            cat.shadow_cold_row("missing", "by_id", &IndexKey::Int(1)),
7220            Err(StorageError::Corrupt(_))
7221        ));
7222        // Missing index.
7223        assert!(matches!(
7224            cat.promote_cold_row("users", "no_such_index", &IndexKey::Int(1)),
7225            Err(StorageError::Corrupt(_))
7226        ));
7227        assert!(matches!(
7228            cat.shadow_cold_row("users", "no_such_index", &IndexKey::Int(1)),
7229            Err(StorageError::Corrupt(_))
7230        ));
7231    }
7232
7233    // --- v6.7.4 parallel-freezer slice/commit API -----------------
7234
/// A single slice spanning the whole freeze range must be
/// indistinguishable from the single-threaded
/// `freeze_oldest_to_cold`: the two reports agree on segment id,
/// frozen row count, freed bytes and segment bytes, and every PK
/// lookup returns the same result on both catalogs afterwards.
#[test]
fn commit_freeze_slices_single_slice_matches_freeze_oldest() {
    // Builds a `users` catalog with PKs 0..10 and a `by_id` index.
    let seeded = || {
        let mut cat = Catalog::new();
        cat.create_table(bigint_pk_users_schema()).unwrap();
        {
            let users = cat.get_mut("users").unwrap();
            for id in 0i64..10 {
                users
                    .insert(make_user_row(id, &alloc::format!("u-{id}")))
                    .unwrap();
            }
            users.add_index("by_id".into(), "id").unwrap();
        }
        cat
    };
    let mut serial_cat = seeded();
    let mut sliced_cat = seeded();

    let single = serial_cat
        .freeze_oldest_to_cold("users", "by_id", 6)
        .unwrap();
    let whole_range = sliced_cat
        .prepare_freeze_slice("users", "by_id", 0..6)
        .expect("prepare");
    let parallel = sliced_cat
        .commit_freeze_slices("users", "by_id", alloc::vec![whole_range])
        .expect("commit");

    assert_eq!(single.segment_id, parallel.segment_id);
    assert_eq!(single.frozen_rows, parallel.frozen_rows);
    assert_eq!(single.bytes_freed, parallel.bytes_freed);
    assert_eq!(single.segment_bytes, parallel.segment_bytes);

    // Lookups must behave the same on both catalogs after the freeze.
    for id in 0i64..10 {
        let key = IndexKey::Int(id);
        assert_eq!(
            serial_cat.lookup_by_pk("users", "by_id", &key),
            sliced_cat.lookup_by_pk("users", "by_id", &key),
            "PK {id} differs after single vs slice freeze"
        );
    }
}
7272
/// Committing two slices that cover disjoint halves (0..4 and 4..8)
/// must report the same segment bytes and frozen row count as
/// committing one slice over 0..8, and leave PK lookups identical
/// on both catalogs.
#[test]
fn commit_freeze_slices_two_slices_match_single_slice() {
    // Rows go in with shuffled PKs so neither slice sees its keys
    // already in order.
    let seeded = || {
        let mut cat = Catalog::new();
        cat.create_table(bigint_pk_users_schema()).unwrap();
        {
            let users = cat.get_mut("users").unwrap();
            for id in [3i64, 7, 1, 9, 5, 0, 8, 4, 2, 6] {
                users
                    .insert(make_user_row(id, &alloc::format!("u-{id}")))
                    .unwrap();
            }
            users.add_index("by_id".into(), "id").unwrap();
        }
        cat
    };
    let mut whole_cat = seeded();
    let mut split_cat = seeded();

    // Reference: one slice over the full range.
    let full = whole_cat
        .prepare_freeze_slice("users", "by_id", 0..8)
        .expect("prepare");
    let one = whole_cat
        .commit_freeze_slices("users", "by_id", alloc::vec![full])
        .expect("commit one");

    // Candidate: the same range split into two adjacent slices.
    let lower = split_cat
        .prepare_freeze_slice("users", "by_id", 0..4)
        .expect("prepare s1");
    let upper = split_cat
        .prepare_freeze_slice("users", "by_id", 4..8)
        .expect("prepare s2");
    let two = split_cat
        .commit_freeze_slices("users", "by_id", alloc::vec![lower, upper])
        .expect("commit two");

    assert_eq!(one.segment_bytes, two.segment_bytes);
    assert_eq!(one.frozen_rows, two.frozen_rows);

    // Each PK, hot or cold, must give the same lookup result on
    // both catalogs.
    for id in 0i64..10 {
        let key = IndexKey::Int(id);
        assert_eq!(
            whole_cat.lookup_by_pk("users", "by_id", &key),
            split_cat.lookup_by_pk("users", "by_id", &key),
            "PK {id} differs after one-slice vs two-slice freeze"
        );
    }
}
7319
/// Slices 0..2 and 3..5 leave position 2 uncovered; the commit must
/// fail with `StorageError::Corrupt` and leave the catalog as it
/// was (no cold segment, all 6 rows still counted).
#[test]
fn commit_freeze_slices_rejects_gap() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    {
        let users = catalog.get_mut("users").unwrap();
        for id in 0i64..6 {
            users
                .insert(make_user_row(id, &alloc::format!("u-{id}")))
                .unwrap();
        }
        users.add_index("by_id".into(), "id").unwrap();
    }

    let before_gap = catalog
        .prepare_freeze_slice("users", "by_id", 0..2)
        .unwrap();
    let after_gap = catalog
        .prepare_freeze_slice("users", "by_id", 3..5)
        .unwrap();
    assert!(matches!(
        catalog.commit_freeze_slices("users", "by_id", alloc::vec![before_gap, after_gap]),
        Err(StorageError::Corrupt(_))
    ));

    // The failed commit must not have changed anything.
    assert_eq!(catalog.cold_segment_count(), 0);
    assert_eq!(catalog.get("users").unwrap().row_count(), 6);
}
7341
/// Committing an empty slice list succeeds, reports zero frozen
/// rows, and leaves the catalog untouched.
#[test]
fn commit_freeze_slices_empty_is_noop() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    {
        let users = catalog.get_mut("users").unwrap();
        for id in 0i64..3 {
            users
                .insert(make_user_row(id, &alloc::format!("u-{id}")))
                .unwrap();
        }
        users.add_index("by_id".into(), "id").unwrap();
    }

    let outcome = catalog
        .commit_freeze_slices("users", "by_id", Vec::new())
        .unwrap();
    assert_eq!(outcome.frozen_rows, 0);

    // No cold segment was created and all 3 rows are still counted.
    assert_eq!(catalog.cold_segment_count(), 0);
    assert_eq!(catalog.get("users").unwrap().row_count(), 3);
}
7360
7361    // --- v6.7.3 cold-segment compaction ---------------------------
7362
/// Compaction folds two small cold segments into one. The report
/// lists both sources and 6 merged rows with nothing pruned; the
/// registry ends with one active segment across three slots; and
/// every PK (frozen or still hot) resolves through `lookup_by_pk`.
#[test]
fn compact_merges_small_segments_storage_unit() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    {
        let users = catalog.get_mut("users").unwrap();
        for id in 0i64..8 {
            users
                .insert(make_user_row(id, &alloc::format!("u-{id}")))
                .unwrap();
        }
        users.add_index("by_id".into(), "id").unwrap();
    }

    // Freeze 3 rows twice so two small cold segments exist.
    for _ in 0..2 {
        catalog.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
    }
    assert_eq!(catalog.cold_segment_count(), 2);
    assert_eq!(catalog.cold_segment_slot_count(), 2);

    // A threshold one byte above the largest segment makes both
    // segments eligible.
    let segment_ids = catalog.cold_segment_ids_global();
    let largest = segment_ids
        .iter()
        .map(|&seg| catalog.cold_segment(seg).unwrap().bytes().len() as u64)
        .max()
        .unwrap();
    let target = largest + 1;

    let report = catalog
        .compact_cold_segments("users", "by_id", target)
        .expect("compact succeeds");
    assert_eq!(report.sources.len(), 2);
    let merged_id = report.merged_segment_id.expect("merge happened");
    assert_eq!(report.merged_rows, 6);
    assert_eq!(report.deleted_rows_pruned, 0);
    assert!(!report.merged_segment_bytes.is_empty());

    // One active segment remains; the slot count is 3 because the
    // two source slots are kept as tombstones and the merged
    // segment was appended.
    assert_eq!(catalog.cold_segment_count(), 1);
    assert_eq!(catalog.cold_segment_slot_count(), 3);
    assert_eq!(catalog.cold_segment_ids_global(), alloc::vec![merged_id]);

    // All 8 PKs, the 6 frozen ones and the 2 still hot, must
    // resolve to their original rows.
    for id in 0i64..8 {
        let expected = make_user_row(id, &alloc::format!("u-{id}"));
        let got = catalog
            .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
            .unwrap_or_else(|| panic!("PK {id} lost after compaction"));
        assert_eq!(got, expected);
    }
}
7417
/// Rows that were frozen and then shadowed must not survive a
/// merge. One row is shadowed in each of two small segments; the
/// compaction report must show 4 merged rows and 2 pruned, the
/// shadowed PKs must stay invisible, and the rest must resolve.
#[test]
fn compact_drops_shadowed_cold_rows() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    {
        let users = catalog.get_mut("users").unwrap();
        for id in 0i64..6 {
            users
                .insert(make_user_row(id, &alloc::format!("u-{id}")))
                .unwrap();
        }
        users.add_index("by_id".into(), "id").unwrap();
    }
    for _ in 0..2 {
        catalog.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
    }

    // Shadow PK 1 (first segment) and PK 4 (second segment); each
    // call must return 1.
    for victim in [1i64, 4] {
        let dropped = catalog
            .shadow_cold_row("users", "by_id", &IndexKey::Int(victim))
            .unwrap();
        assert_eq!(dropped, 1);
    }

    // A threshold just above the largest segment makes both eligible.
    let segment_ids = catalog.cold_segment_ids_global();
    let largest = segment_ids
        .iter()
        .map(|&seg| catalog.cold_segment(seg).unwrap().bytes().len() as u64)
        .max()
        .unwrap();
    let report = catalog
        .compact_cold_segments("users", "by_id", largest + 1)
        .expect("compact succeeds");
    assert_eq!(report.sources.len(), 2);
    assert_eq!(report.merged_rows, 4, "6 frozen − 2 shadowed = 4 live");
    assert_eq!(report.deleted_rows_pruned, 2);

    // The shadowed PKs must not reappear after the merge.
    for shadowed in [1i64, 4i64] {
        let hit = catalog.lookup_by_pk("users", "by_id", &IndexKey::Int(shadowed));
        assert!(
            hit.is_none(),
            "shadowed PK {shadowed} must remain invisible after compact"
        );
    }
    // The four rows that were never shadowed must still resolve.
    for live in [0i64, 2, 3, 5] {
        assert!(
            catalog
                .lookup_by_pk("users", "by_id", &IndexKey::Int(live))
                .is_some(),
            "live PK {live} lost after compact"
        );
    }
}
7472
/// Compaction needs at least two candidate segments under the
/// threshold. With zero or one candidate it returns a report whose
/// `merged_segment_id` is `None` and leaves the cold-segment count
/// unchanged.
#[test]
fn compact_is_noop_below_two_candidates() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    {
        let users = catalog.get_mut("users").unwrap();
        for id in 0i64..6 {
            users
                .insert(make_user_row(id, &alloc::format!("u-{id}")))
                .unwrap();
        }
        users.add_index("by_id".into(), "id").unwrap();
    }

    // Case 1: no cold segments exist yet.
    let nothing_frozen = catalog
        .compact_cold_segments("users", "by_id", 1 << 30)
        .expect("noop ok");
    assert!(nothing_frozen.merged_segment_id.is_none());
    assert!(nothing_frozen.sources.is_empty());

    // Case 2: exactly one cold segment. A merge needs at least two.
    catalog.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
    let lone_segment = catalog
        .compact_cold_segments("users", "by_id", 1 << 30)
        .expect("noop ok");
    assert!(lone_segment.merged_segment_id.is_none());
    assert_eq!(catalog.cold_segment_count(), 1);

    // Case 3: a 1-byte threshold excludes even that single segment.
    let tiny_threshold = catalog
        .compact_cold_segments("users", "by_id", 1)
        .expect("noop ok");
    assert!(tiny_threshold.merged_segment_id.is_none());
    assert_eq!(catalog.cold_segment_count(), 1);
}
7508
/// Restart simulation after a compaction. The catalog is serialised
/// once `compact_cold_segments` has returned, deserialised into a
/// fresh instance, and only the merged segment is reloaded through
/// `load_segment_bytes_at` at the id the compaction reported. All
/// frozen PKs must resolve on the restored catalog, and it must
/// hold exactly one cold segment.
#[test]
fn compact_swap_survives_catalog_roundtrip_via_load_at() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    {
        let users = catalog.get_mut("users").unwrap();
        for id in 0i64..6 {
            users
                .insert(make_user_row(id, &alloc::format!("u-{id}")))
                .unwrap();
        }
        users.add_index("by_id".into(), "id").unwrap();
    }
    for _ in 0..2 {
        catalog.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
    }

    // A threshold just above the largest segment makes both eligible.
    let segment_ids = catalog.cold_segment_ids_global();
    let largest = segment_ids
        .iter()
        .map(|&seg| catalog.cold_segment(seg).unwrap().bytes().len() as u64)
        .max()
        .unwrap();
    let report = catalog
        .compact_cold_segments("users", "by_id", largest + 1)
        .expect("compact ok");
    let merged_id = report.merged_segment_id.unwrap();

    // Capture what a restart would find: the serialised catalog
    // and the bytes of the merged segment.
    let snapshot = catalog.serialize();
    let merged_bytes = report.merged_segment_bytes.clone();

    // "Restart": rebuild the catalog from the snapshot, then load
    // only the merged segment back at its reported id.
    let mut restored = Catalog::deserialize(&snapshot).expect("deserialize ok");
    restored
        .load_segment_bytes_at(merged_id, merged_bytes)
        .expect("reload merged ok");

    // Each of the 6 PKs must come back with its original row.
    for id in 0i64..6 {
        let expected = make_user_row(id, &alloc::format!("u-{id}"));
        let got = restored
            .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
            .unwrap_or_else(|| panic!("PK {id} lost across roundtrip"));
        assert_eq!(got, expected);
    }
    // Only the merged segment is active; no source segment was
    // reloaded.
    assert_eq!(restored.cold_segment_count(), 1);
}
7562
/// `load_segment_bytes_at` extends the slot list when the target
/// id lies past the end, and returns `StorageError::Corrupt` when
/// the target slot is already occupied.
#[test]
fn load_segment_bytes_at_pads_and_rejects_collision() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    {
        let users = catalog.get_mut("users").unwrap();
        for id in 0i64..4 {
            users
                .insert(make_user_row(id, &alloc::format!("u-{id}")))
                .unwrap();
        }
        users.add_index("by_id".into(), "id").unwrap();
    }
    let frozen = catalog
        .freeze_oldest_to_cold("users", "by_id", 2)
        .unwrap();
    let seg_bytes = frozen.segment_bytes.clone();

    // Loading at id 5 grows the slot list to 6 entries; the active
    // count becomes 2 (segment 0 plus the copy at slot 5).
    catalog
        .load_segment_bytes_at(5, seg_bytes.clone())
        .expect("pad + load ok");
    assert_eq!(catalog.cold_segment_slot_count(), 6);
    assert_eq!(catalog.cold_segment_count(), 2);

    // Slot 5 (just loaded) and slot 0 (the original freeze) are
    // both occupied, so loading into either must be refused.
    for occupied in [5, 0] {
        assert!(matches!(
            catalog.load_segment_bytes_at(occupied, seg_bytes.clone()),
            Err(StorageError::Corrupt(_))
        ));
    }
}
7597
/// Round trip: freeze → promote → re-freeze. After a PK has been
/// promoted back to hot and a further freeze has run, the index
/// must hold exactly one locator for that PK, and it must be hot.
/// Every PK in the table must still resolve to its original row.
#[test]
fn promote_then_refreeze_does_not_leave_orphan_locators() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let t = cat.get_mut("users").unwrap();
    for id in 0..4i64 {
        t.insert(make_user_row(id, &alloc::format!("u-{id}")))
            .unwrap();
    }
    t.add_index("by_id".into(), "id").unwrap();

    // Reads the index entries currently stored for PK 0. It takes
    // the catalog as a parameter so it holds no borrow across the
    // mutating calls below.
    let pk0_locators = |c: &Catalog| {
        c.get("users")
            .unwrap()
            .index_on(0)
            .unwrap()
            .lookup_eq(&IndexKey::Int(0))
            .to_vec()
    };

    // Cycle 1: freeze first 2 rows, then promote PK 0.
    cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
    let promoted = cat
        .promote_cold_row("users", "by_id", &IndexKey::Int(0))
        .unwrap();
    assert!(promoted.is_some());
    let entries_after_promote = pk0_locators(&cat);
    assert_eq!(entries_after_promote.len(), 1);
    assert!(entries_after_promote[0].is_hot());

    // Cycle 2: freeze the front rows again. This assumes the
    // promoted PK 0 sits behind the two surviving hot rows (PKs 2
    // and 3), so a freeze of 2 sends those two cold and leaves
    // PK 0 hot. NOTE(review): confirm the placement of promoted
    // rows if the freeze policy changes.
    cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();

    // The re-freeze must not add a second locator for PK 0.
    let entries_after_refreeze = pk0_locators(&cat);
    assert_eq!(
        entries_after_refreeze.len(),
        1,
        "PK 0 must keep exactly one locator after re-freeze"
    );
    assert!(
        entries_after_refreeze[0].is_hot(),
        "PK 0 must still be hot after re-freeze"
    );

    // Every PK must resolve to its original row after both cycles.
    for id in 0..4i64 {
        let got = cat
            .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
            .unwrap_or_else(|| panic!("PK {id} lost after promote + re-freeze"));
        assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
    }
}
7642}