Skip to main content

spg_storage/
lib.rs

1//! In-memory storage primitives.
2//!
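//! The core model is intentionally simple: a flat catalog of tables, each
//! holding rows as `Vec<Value>` (positional, matching the table's
//! `TableSchema`). This summary dates from v0.3, which had no MVCC and no
//! on-disk format; later milestones added the `segment` and `persistent`
//! modules declared below, so consult the item-level docs for current
//! behaviour.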
6#![no_std]
7// v3.3.2 NEON path for l2_distance_sq (aarch64 only). Scoped allow:
8// `unsafe_code = "deny"` at workspace level stays in force for every
9// other crate.
10#![cfg_attr(target_arch = "aarch64", allow(unsafe_code))]
11
12extern crate alloc;
13
14pub mod bloom;
15pub mod halfvec;
16pub mod persistent;
17pub mod persistent_btree;
18pub mod quantize;
19pub mod row_locator;
20pub mod segment;
21pub mod trgm;
22
23pub use self::bloom::{BloomError, BloomFilter};
24pub use self::row_locator::{RowLocator, RowLocatorError};
25pub use self::segment::{
26    BRIN_SIDECAR_MAGIC, BrinSummary, OwnedSegment, SEGMENT_COMPRESS_ALGO_LZSS,
27    SEGMENT_COMPRESS_ALGO_NONE, SEGMENT_MAGIC, SEGMENT_MAGIC_V2, SEGMENT_PAGE_BYTES, SegmentError,
28    SegmentMeta, SegmentReader, derive_brin_summaries, encode_segment, wrap_v2_envelope,
29    wrap_v2_envelope_with_brin,
30};
31
32use alloc::boxed::Box;
33use alloc::collections::{BTreeMap, BTreeSet};
34use alloc::format;
35use alloc::string::{String, ToString};
36use alloc::sync::Arc;
37use alloc::vec::Vec;
38use core::fmt;
39
40use self::persistent::PersistentVec;
41use self::persistent_btree::PersistentBTreeMap;
42
/// In-cell encoding for `DataType::Vector`. Storage-side mirror of
/// `spg_sql::ast::VecEncoding`, duplicated here so this crate carries
/// no dependency on `spg-sql`; the engine bridges between the two at
/// DDL-execution time.
///
/// * `F32` — the pre-v6 default: each cell holds a raw `Vec<f32>`.
/// * `Sq8` (v6.0.1) — each cell holds
///   `Sq8Vector { min, max, bytes: Vec<u8> }`; 4× compression vs `F32`
///   with recall@10 ≥ 0.95 on natural embeddings (Gaussian /
///   unit-sphere corpora).
/// * `F16` (v6.0.3, DDL keyword `HALF`) — each element is stored as
///   IEEE-754 binary16; 2× compression and bit-exact dequantise.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum VecEncoding {
    #[default]
    F32,
    Sq8,
    F16,
}

impl fmt::Display for VecEncoding {
    /// Writes the encoding keyword: `F32`, `SQ8`, or `HALF` (the DDL
    /// spelling of `F16`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let keyword = match self {
            Self::F32 => "F32",
            Self::Sq8 => "SQ8",
            Self::F16 => "HALF",
        };
        f.write_str(keyword)
    }
}
71
/// Runtime type tags. `Vector { dim, encoding }` / `Varchar(max)` /
/// `Char(size)` are parameterised; the parameter travels with both
/// the column schema and the on-wire serialised representation.
///
/// Several tags share a cell representation and differ only in the
/// constraint or wire OID they advertise (`Text` / `Varchar` / `Char`,
/// `Timestamp` / `Timestamptz`, `Json` / `Jsonb`); see the per-variant
/// notes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataType {
    /// 16-bit signed. Backed by `Value::SmallInt(i16)`; arithmetic that
    /// would overflow surfaces as a type error at INSERT time.
    SmallInt,
    /// 32-bit signed integer.
    Int,
    /// 64-bit signed integer.
    BigInt,
    /// `f64` (PG double precision).
    Float,
    /// Unbounded text. `Value::data_type` reports this tag for every
    /// `Value::Text` cell, including cells stored in `Varchar` / `Char`
    /// columns.
    Text,
    /// `VARCHAR(n)` — same byte representation as `Text`, but INSERT
    /// rejects values longer than `n` Unicode characters.
    Varchar(u32),
    /// `CHAR(n)` — same representation as `Text`, but INSERT right-pads
    /// with U+0020 to exactly `n` Unicode characters (or rejects when
    /// the input is already longer).
    Char(u32),
    /// Boolean.
    Bool,
    /// pgvector-style fixed-dimension vector. `encoding` selects
    /// the in-cell representation (`F32` = pre-v6 raw f32 buffer;
    /// `Sq8` = v6.0.1 8-bit scalar-quantised; `F16` = v6.0.3
    /// IEEE-754 binary16). The DDL grammar surfaces encoding via the
    /// optional `USING <encoding>` clause: `VECTOR(128) USING SQ8`.
    /// The `Display` impl omits the clause for `F32` and spells `F16`
    /// as `USING HALF`.
    Vector {
        dim: u32,
        encoding: VecEncoding,
    },
    /// `NUMERIC(precision, scale)` — exact fixed-point decimal stored as
    /// a scaled `i128`. `precision` caps total decimal digits, `scale`
    /// fixes digits after the decimal point. v1.12 supports up to
    /// precision 38 (the i128-safe ceiling). `NUMERIC` and `NUMERIC(p)`
    /// surface as `Numeric { precision: p, scale: 0 }`.
    ///
    /// `Value::data_type` reports `precision: 0` for numeric cells
    /// because the value does not carry its precision. The `Display`
    /// impl prints `NUMERIC(p)` when `scale` is 0 and `NUMERIC(p, s)`
    /// otherwise.
    Numeric {
        precision: u8,
        scale: u8,
    },
    /// `DATE` — calendar date with day precision, stored as `i32` days
    /// since the Unix epoch (1970-01-01).
    Date,
    /// `TIMESTAMP` (a.k.a. `MySQL` `DATETIME`) — instant with microsecond
    /// precision, stored as `i64` microseconds since the Unix epoch.
    Timestamp,
    /// v7.9.2 `TIMESTAMPTZ` — bit-identical to `Timestamp` on disk
    /// (i64 microseconds, UTC by convention). Carried as a distinct
    /// type tag so the PG-wire layer can advertise OID 1184 (PG's
    /// `timestamp with time zone`) and `sqlx`/`pgx`/JDBC clients
    /// decode into their TZ-aware datetime types. The internal
    /// semantics are unchanged: SPG never stored per-row offsets,
    /// and neither did PG — `TIMESTAMPTZ` in PG is also UTC i64.
    ///
    /// `Value::data_type` never reports this tag: timestamp cells are
    /// `Value::Timestamp` and map to `DataType::Timestamp`.
    Timestamptz,
    /// `INTERVAL` — calendar-aware span (months + microseconds). v2.11
    /// supports INTERVAL only as a runtime intermediate (literals,
    /// arithmetic results); on-disk encoding is rejected so this branch
    /// can't appear in a `ColumnSchema`.
    Interval,
    /// v4.9: `JSON` — text-backed JSON document. We don't parse
    /// the content (no path operators or jsonb functions yet) —
    /// the column accepts any TEXT-compatible value and round-trips
    /// it verbatim. PG OID 114 on the wire.
    Json,
    /// v7.9.0: `JSONB` — semantically identical to `Json` on
    /// the storage side (same `Value::Json` cells, same
    /// row codec), but advertised as PG OID 3802 on the wire
    /// so `sqlx`-style clients that bind `jsonb` columns
    /// decode correctly. mailrs migration blocker #3.
    ///
    /// Because the cells are `Value::Json`, `Value::data_type`
    /// reports `DataType::Json` for them, never `Jsonb`.
    Jsonb,
    /// v7.10.4: `BYTES` / `BYTEA` — variable-length raw binary.
    /// Backed by `Value::Bytes(Vec<u8>)`. PG wire OID 17. Literal
    /// forms accepted by parser/engine: PG hex form `'\xDEADBEEF'`
    /// (case-insensitive hex pairs) and escape form
    /// `'foo\\000bar'` (the latter decoded at coercion time when
    /// the target column is BYTEA — TEXT columns leave the
    /// backslash sequence verbatim).
    Bytes,
    /// v7.10.9: `TEXT[]` — single-dimension TEXT array. Elements
    /// may be NULL (PG semantics). PG wire OID 1009. Literal
    /// forms: `ARRAY['a', 'b', NULL]` and the PG external form
    /// `'{a,b,NULL}'::TEXT[]`. Engine implements `= ANY(arr)`,
    /// `<> ALL(arr)`, and 1-based indexing `arr[i]`. Catalog
    /// FILE_VERSION 18+; older snapshots reject this DataType
    /// (forward-only by design — TEXT[] columns aren't readable
    /// on a pre-v7.10 binary).
    TextArray,
    /// v7.11.12: `INT[]` — single-dimension i32 array. PG wire
    /// OID 1007 (_int4). Same `ARRAY[...]` / `'{1,2,3}'::INT[]`
    /// literal surface as TEXT[]. Catalog FILE_VERSION 19+.
    IntArray,
    /// v7.11.12: `BIGINT[]` — single-dimension i64 array. PG
    /// wire OID 1016 (_int8). Catalog FILE_VERSION 19+.
    BigIntArray,
    /// v7.12.0: PG `tsvector` — ordered, deduplicated set of
    /// `(lexeme, positions, weight)` tuples. PG wire OID 3614.
    /// Catalog FILE_VERSION 20+. Storage shape is row-codec
    /// tag 22; the schema-agnostic `write_value` path emits tag
    /// 18. Literal: `'foo:1 bar:2,3'::tsvector` (PG external
    /// form). G-CRIT-3 entry — v7.12.0 only ships the type +
    /// codec; matching `@@` lands in v7.12.2.
    // NOTE(review): the `Value::Bytes` docs also name tag 18 — confirm
    // the two tag spaces are distinct or correct whichever is stale.
    TsVector,
    /// v7.12.0: PG `tsquery` — parse tree of lexemes joined by
    /// `&` `|` `!` and phrase operators. PG wire OID 3615.
    /// Catalog FILE_VERSION 20+.
    TsQuery,
}
177
178impl fmt::Display for DataType {
179    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
180        match self {
181            Self::SmallInt => f.write_str("SMALLINT"),
182            Self::Int => f.write_str("INT"),
183            Self::BigInt => f.write_str("BIGINT"),
184            Self::Float => f.write_str("FLOAT"),
185            Self::Text => f.write_str("TEXT"),
186            Self::Varchar(n) => write!(f, "VARCHAR({n})"),
187            Self::Char(n) => write!(f, "CHAR({n})"),
188            Self::Bool => f.write_str("BOOL"),
189            Self::Vector { dim, encoding } => match encoding {
190                VecEncoding::F32 => write!(f, "VECTOR({dim})"),
191                VecEncoding::Sq8 => write!(f, "VECTOR({dim}) USING SQ8"),
192                VecEncoding::F16 => write!(f, "VECTOR({dim}) USING HALF"),
193            },
194            Self::Numeric { precision, scale } => {
195                if *scale == 0 {
196                    write!(f, "NUMERIC({precision})")
197                } else {
198                    write!(f, "NUMERIC({precision}, {scale})")
199                }
200            }
201            Self::Date => f.write_str("DATE"),
202            Self::Timestamp => f.write_str("TIMESTAMP"),
203            Self::Timestamptz => f.write_str("TIMESTAMPTZ"),
204            Self::Interval => f.write_str("INTERVAL"),
205            Self::Json => f.write_str("JSON"),
206            Self::Jsonb => f.write_str("JSONB"),
207            Self::Bytes => f.write_str("BYTEA"),
208            Self::TextArray => f.write_str("TEXT[]"),
209            Self::IntArray => f.write_str("INT[]"),
210            Self::BigIntArray => f.write_str("BIGINT[]"),
211            Self::TsVector => f.write_str("TSVECTOR"),
212            Self::TsQuery => f.write_str("TSQUERY"),
213        }
214    }
215}
216
/// v7.12.0 — one entry in a `Value::TsVector`. The lexeme is the
/// (already-tokenised + stemmed in v7.12.1+) word; `positions` is
/// a strictly-ascending list of 1-based positions; `weight` is the
/// PG weight letter (A=3, B=2, C=1, D=0) — v7.12.0 defaults every
/// lexeme to D, the v7.12.2 ranking path consumes the weight.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TsLexeme {
    /// The lexeme text.
    pub word: String,
    /// 1-based positions of the lexeme, strictly ascending.
    pub positions: Vec<u16>,
    /// Numeric weight code: A=3, B=2, C=1, D=0.
    pub weight: u8,
}
228
/// v7.12.0 — parse tree for a PG `tsquery`. v7.12.0 ships the
/// type + codec only; the `to_tsquery` / `plainto_tsquery` lexer
/// lands in v7.12.1 and the `@@` evaluator in v7.12.2.
///
/// Child nodes are boxed because the type is recursive.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TsQueryAst {
    /// Single lexeme term. The `weight_mask` is the PG-style
    /// bitmask of accepted weights (`A=1<<3`, `B=1<<2`, `C=1<<1`,
    /// `D=1<<0`); `0` = any weight. v7.12.0 always sets it to 0.
    Term {
        word: String,
        weight_mask: u8,
    },
    /// Boolean AND of two sub-queries.
    And(Box<TsQueryAst>, Box<TsQueryAst>),
    /// Boolean OR of two sub-queries.
    Or(Box<TsQueryAst>, Box<TsQueryAst>),
    /// Boolean negation of a sub-query.
    Not(Box<TsQueryAst>),
    /// `phrase <distance> phrase`. v7.12.0 only persists this; the
    /// match semantics arrive in v7.12.2 alongside `@@`.
    Phrase {
        left: Box<TsQueryAst>,
        right: Box<TsQueryAst>,
        distance: u16,
    },
}
252
/// A row-cell value, including SQL `NULL`. `Float` uses `f64`; NaN compares
/// non-equal to itself (PG behaviour) — `PartialEq` is derived so callers
/// must opt into NaN-aware comparison if they need stronger guarantees.
///
/// The derive also means `Value::Null == Value::Null` is `true` under
/// Rust `==`; SQL three-valued comparison is not modelled by this impl.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum Value {
    /// 16-bit signed integer cell.
    SmallInt(i16),
    /// 32-bit signed integer cell.
    Int(i32),
    /// 64-bit signed integer cell.
    BigInt(i64),
    /// `f64` cell.
    Float(f64),
    /// Text cell. Also the cell shape for `VARCHAR` / `CHAR` columns —
    /// the length constraint lives on the column schema.
    Text(String),
    /// Boolean cell.
    Bool(bool),
    /// Raw `f32` vector cell (the `F32` encoding).
    Vector(Vec<f32>),
    /// v6.0.1: 8-bit scalar-quantised vector cell. Lives in
    /// columns declared `VECTOR(N) USING SQ8`. Layout per cell:
    /// `Sq8Vector { min: f32, max: f32, bytes: Vec<u8> }` —
    /// 4× compression vs `Vector(Vec<f32>)`. The wire layer
    /// dequantises to `f32` on SELECT; INSERT path quantises
    /// incoming `Vector(Vec<f32>)` cells into this variant.
    Sq8Vector(crate::quantize::Sq8Vector),
    /// v6.0.3: IEEE-754 binary16 vector cell. Lives in columns
    /// declared `VECTOR(N) USING HALF`. Stores raw u16 LE bits
    /// (2× compression vs `Vector(Vec<f32>)`). Wire / display
    /// paths dequantise to f32 bit-exactly; INSERT path converts
    /// incoming f32 vectors at the engine boundary.
    HalfVector(crate::halfvec::HalfVector),
    /// Exact fixed-point decimal. `scaled` holds the value as
    /// `actual * 10^scale` so the storage type is always integral —
    /// arithmetic never falls back to floating-point.
    Numeric {
        scaled: i128,
        scale: u8,
    },
    /// Days since the Unix epoch (1970-01-01). Negative for earlier dates.
    Date(i32),
    /// Microseconds since the Unix epoch (1970-01-01T00:00:00Z).
    Timestamp(i64),
    /// Calendar span: `months` (variable-length) + `micros` (fixed-length).
    /// Runtime-only — cannot appear in a stored row in v2.11.
    Interval {
        months: i32,
        micros: i64,
    },
    /// v4.9 `JSON` — raw JSON text. No structural validation
    /// happens at the storage layer; whatever the parser hands us
    /// round-trips verbatim. Equality is byte-wise.
    Json(String),
    /// v7.10.4 `BYTEA` — raw binary blob. Equality is byte-wise.
    /// Layout matches `Text`'s length-prefixed shape (`[u32 LE
    /// len][bytes]`) under tag 18; the engine accepts PG hex
    /// literals (`'\xDEADBEEF'`) and escape literals at the
    /// coercion boundary.
    Bytes(Vec<u8>),
    /// v7.10.9 `TEXT[]` — single-dimension TEXT array with
    /// optional NULL elements. Equality is element-wise. PG's
    /// NULL-element comparison semantics: NULL ≠ NULL inside
    /// arrays under `=`, so `[NULL] != [NULL]` (the engine
    /// honours this).
    ///
    /// Note: the `PartialEq` derived on `Value` compares `None`
    /// elements as equal, so Rust `==` on two `TextArray` values does
    /// NOT implement the NULL-aware comparison described above; that
    /// behaviour has to come from the engine's own comparison path.
    TextArray(Vec<Option<String>>),
    /// v7.11.12 `INT[]` — single-dimension i32 array with optional
    /// NULL elements. Codec mirrors TextArray with i32 LE per
    /// element instead of length-prefixed UTF-8.
    IntArray(Vec<Option<i32>>),
    /// v7.11.12 `BIGINT[]` — single-dimension i64 array with optional
    /// NULL elements.
    BigIntArray(Vec<Option<i64>>),
    /// v7.12.0 `tsvector` — sorted-by-word, deduped lexeme set with
    /// positions + weights. The engine enforces sort/dedup on
    /// construction; consumers can rely on `lexemes.windows(2)`
    /// being strictly ascending by `word`.
    TsVector(Vec<TsLexeme>),
    /// v7.12.0 `tsquery` — boolean / phrase parse tree over
    /// lexemes. Engine builds via `to_tsquery` family.
    TsQuery(TsQueryAst),
    /// SQL `NULL`. Carries no type; `data_type` returns `None` for it.
    Null,
}
329
330impl Value {
331    /// Type tag, or `None` for `NULL` (unknown at value level).
332    pub fn data_type(&self) -> Option<DataType> {
333        match self {
334            Self::SmallInt(_) => Some(DataType::SmallInt),
335            Self::Int(_) => Some(DataType::Int),
336            Self::BigInt(_) => Some(DataType::BigInt),
337            Self::Float(_) => Some(DataType::Float),
338            // `Text` covers both unbounded TEXT and bounded VARCHAR/CHAR
339            // — the constraint lives on the column schema, not the value.
340            Self::Text(_) => Some(DataType::Text),
341            Self::Bool(_) => Some(DataType::Bool),
342            Self::Vector(v) => Some(DataType::Vector {
343                dim: u32::try_from(v.len()).expect("vector dim ≤ u32"),
344                encoding: VecEncoding::F32,
345            }),
346            Self::Sq8Vector(q) => Some(DataType::Vector {
347                dim: u32::try_from(q.bytes.len()).expect("vector dim ≤ u32"),
348                encoding: VecEncoding::Sq8,
349            }),
350            Self::HalfVector(h) => Some(DataType::Vector {
351                dim: u32::try_from(h.dim()).expect("vector dim ≤ u32"),
352                encoding: VecEncoding::F16,
353            }),
354            // `Value::Numeric` doesn't carry its precision (the column
355            // schema does); we surface precision=0 as "unknown" and let
356            // the engine reconcile against the column type at coercion
357            // time.
358            Self::Numeric { scale, .. } => Some(DataType::Numeric {
359                precision: 0,
360                scale: *scale,
361            }),
362            Self::Date(_) => Some(DataType::Date),
363            Self::Timestamp(_) => Some(DataType::Timestamp),
364            Self::Interval { .. } => Some(DataType::Interval),
365            Self::Json(_) => Some(DataType::Json),
366            Self::Bytes(_) => Some(DataType::Bytes),
367            Self::TextArray(_) => Some(DataType::TextArray),
368            Self::IntArray(_) => Some(DataType::IntArray),
369            Self::BigIntArray(_) => Some(DataType::BigIntArray),
370            Self::TsVector(_) => Some(DataType::TsVector),
371            Self::TsQuery(_) => Some(DataType::TsQuery),
372            Self::Null => None,
373        }
374    }
375
376    pub const fn is_null(&self) -> bool {
377        matches!(self, Self::Null)
378    }
379}
380
/// One table row — values are positional and must match
/// `TableSchema.columns` in length and (modulo NULL) in `DataType`.
///
/// The struct itself performs no validation; `values` is a public
/// field and `Row::new` stores whatever it is given.
#[derive(Debug, Clone, PartialEq)]
pub struct Row {
    /// Cell values in column order.
    pub values: Vec<Value>,
}
387
388impl Row {
389    pub const fn new(values: Vec<Value>) -> Self {
390        Self { values }
391    }
392
393    pub fn len(&self) -> usize {
394        self.values.len()
395    }
396
397    pub fn is_empty(&self) -> bool {
398        self.values.is_empty()
399    }
400}
401
/// Schema of a single table column: its name, type tag, nullability,
/// default-value configuration and auto-increment flag.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnSchema {
    /// Column name; `TableSchema::column_position` matches against it
    /// with exact string equality.
    pub name: String,
    /// Declared type of the column.
    pub ty: DataType,
    /// Whether the column admits NULL (presumably `false` for a
    /// `NOT NULL` column — enforcement is not in this part of the file).
    pub nullable: bool,
    /// Optional `DEFAULT` value, frozen at CREATE TABLE time. `None`
    /// means "no default" (so omitted columns become NULL, or error
    /// out when the column is NOT NULL). Literal defaults take this
    /// path.
    pub default: Option<Value>,
    /// v7.9.21 — for DEFAULT expressions that need INSERT-time
    /// evaluation (e.g. `DEFAULT now()`, `DEFAULT CURRENT_TIMESTAMP`),
    /// the Display form of the expression. The engine re-parses
    /// it on each INSERT default-fill, evaluates against an empty
    /// row context, and coerces to the column type. mailrs G4.
    /// Persisted in catalog FILE_VERSION 15+; older catalogs
    /// deserialise with None.
    pub runtime_default: Option<String>,
    /// MySQL-style `AUTO_INCREMENT`. When set, an INSERT that leaves
    /// this column unbound (or sets it to NULL) gets the next integer
    /// computed from the column's current max + 1.
    pub auto_increment: bool,
}
425
/// Schema of one table: its name, ordered columns, and the table-level
/// options and constraints that are persisted with the catalog.
#[derive(Debug, Clone, PartialEq)]
pub struct TableSchema {
    /// Table name.
    pub name: String,
    /// Columns in positional order; `Row::values` is documented to
    /// line up with this list index-for-index.
    pub columns: Vec<ColumnSchema>,
    /// v6.7.2 — per-table hot-tier byte budget override. `None`
    /// falls through to the global `SPG_HOT_TIER_BYTES` setting;
    /// `Some(n)` overrides it for this specific table. Set via
    /// `ALTER TABLE t SET hot_tier_bytes = X`. Persisted in
    /// catalog FILE_VERSION 11+.
    pub hot_tier_bytes: Option<u64>,
    /// v7.6.1 — FOREIGN KEY constraints declared on this table.
    /// Engine maintains this in lock-step with `spg-sql`'s parser
    /// AST; the storage layer carries the on-disk shape so a
    /// catalog snapshot round-trips without external mapping.
    /// Persisted in catalog FILE_VERSION 13+. Older catalogs
    /// deserialise with an empty vec.
    pub foreign_keys: Vec<ForeignKeyConstraint>,
    /// v7.9.19 — composite UNIQUE / PRIMARY KEY constraints
    /// declared at the table level. Each entry's leading column
    /// has a BTree index (created via the constraint), and the
    /// INSERT path enforces full-tuple uniqueness via a scan keyed
    /// by the leading column. Persisted in catalog FILE_VERSION
    /// 15+. Older catalogs (≤ 14) deserialise with an empty vec.
    pub uniqueness_constraints: Vec<UniquenessConstraint>,
    /// v7.13.0 — `CHECK (<expr>)` predicates declared on this
    /// table. Both column-level inline `CHECK (…)` and
    /// table-level `CHECK (…)` fold into this list. Each entry
    /// is the AST Expr's `Display` form, re-parsed on every
    /// INSERT/UPDATE and evaluated against the candidate row.
    /// A false / NULL result rejects the mutation (PG semantics).
    /// Persisted in catalog FILE_VERSION 23+. Older catalogs
    /// deserialise with an empty vec.
    pub checks: Vec<String>,
}
460
/// v7.9.19 — composite UNIQUE / PRIMARY KEY constraint persisted
/// on the table schema. The leading column always has a BTree
/// index (created at CREATE TABLE time); INSERT enforcement
/// scans that index for collisions on the full column tuple.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UniquenessConstraint {
    /// `true` when this constraint was declared as `PRIMARY KEY`
    /// (vs `UNIQUE`). Semantically PK implies NOT NULL on all
    /// referenced columns; the engine enforces that at CREATE
    /// TABLE time.
    pub is_primary_key: bool,
    /// Column positions on the table that owns this constraint.
    /// ≥ 1 element. For single-column UNIQUE this is exactly one
    /// position; the BTree index alone enforces it.
    pub columns: Vec<usize>,
    /// v7.13.0 — `UNIQUE NULLS NOT DISTINCT` modifier
    /// (mailrs round-5 G10; PG 15+ surface). When `true`, two
    /// rows whose constrained columns are all NULL collide on
    /// the constraint. Default (`false`) is the SQL-standard
    /// `NULLS DISTINCT` behaviour where any NULL passes.
    /// Persisted in catalog FILE_VERSION 23+.
    pub nulls_not_distinct: bool,
}
484
/// v7.6.1 — Storage-layer mirror of `spg_sql::ast::ForeignKeyConstraint`.
/// The engine's CREATE TABLE path translates between the two; keeping
/// them separate preserves the no-deps boundary between
/// `spg-storage` and `spg-sql`.
///
/// Columns are stored as positions rather than names on both the
/// local and the parent side.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ForeignKeyConstraint {
    /// Optional user-supplied constraint name (`CONSTRAINT <name>`
    /// prefix). Used by `ALTER TABLE DROP CONSTRAINT <name>` in
    /// v7.6.8; ignored by enforcement.
    pub name: Option<String>,
    /// Positions of local columns in this table's column list.
    /// Same arity as `parent_columns`.
    pub local_columns: Vec<usize>,
    /// Referenced parent table name.
    pub parent_table: String,
    /// Positions of parent columns in the parent's column list.
    /// Engine resolves these at CREATE TABLE time (after the parent
    /// schema is known) so enforcement paths can skip the name
    /// lookup on every row.
    pub parent_columns: Vec<usize>,
    /// Referential action when a parent row is deleted.
    pub on_delete: FkAction,
    /// Referential action when a parent row's referenced columns
    /// are updated.
    pub on_update: FkAction,
}
511
/// v7.6.1 — referential action tag. Mirrors `spg_sql::ast::FkAction`.
///
/// Each variant has a fixed on-disk tag byte; see [`FkAction::tag`]
/// and [`FkAction::from_tag`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FkAction {
    Restrict,
    Cascade,
    SetNull,
    SetDefault,
    NoAction,
}

impl FkAction {
    /// On-disk tag byte (v13 catalog appendix): `Restrict` = 0,
    /// `Cascade` = 1, `SetNull` = 2, `SetDefault` = 3, `NoAction` = 4.
    pub const fn tag(self) -> u8 {
        // Spelled out per variant so the persisted byte never depends
        // on the enum's declaration order.
        let byte: u8 = match self {
            Self::Restrict => 0,
            Self::Cascade => 1,
            Self::SetNull => 2,
            Self::SetDefault => 3,
            Self::NoAction => 4,
        };
        byte
    }

    /// Inverse of [`FkAction::tag`]. Returns `None` for any byte that
    /// is not one of the five known tags.
    pub const fn from_tag(b: u8) -> Option<Self> {
        match b {
            0 => Some(Self::Restrict),
            1 => Some(Self::Cascade),
            2 => Some(Self::SetNull),
            3 => Some(Self::SetDefault),
            4 => Some(Self::NoAction),
            _ => None,
        }
    }
}
544
545impl TableSchema {
546    pub fn column_position(&self, name: &str) -> Option<usize> {
547        self.columns.iter().position(|c| c.name == name)
548    }
549}
550
/// Key type accepted by secondary indices. Float / NULL / Vector values
/// can't participate in a B-tree index — `f64` is only `PartialOrd`, NULL
/// has SQL-three-valued semantics, and Vector belongs to the (future) HNSW
/// path. Index lookups on those columns fall back to full scan.
///
/// `Ord` is derived, so keys of different variants order by declaration
/// order (every `Int` sorts before every `Text`, which sorts before
/// every `Bool`); within a variant the payload's own ordering applies.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum IndexKey {
    /// Integer-like key. `from_value` widens `SmallInt` / `Int` /
    /// `BigInt` to `i64` and also maps `Date` and `Timestamp` here via
    /// their integer storage representation.
    Int(i64),
    /// Text key, compared with `String`'s ordering.
    Text(String),
    /// Boolean key (`false` sorts before `true`).
    Bool(bool),
}
561
562impl IndexKey {
563    pub fn from_value(v: &Value) -> Option<Self> {
564        match v {
565            Value::SmallInt(n) => Some(Self::Int(i64::from(*n))),
566            Value::Int(n) => Some(Self::Int(i64::from(*n))),
567            Value::BigInt(n) => Some(Self::Int(*n)),
568            Value::Text(s) => Some(Self::Text(s.clone())),
569            Value::Bool(b) => Some(Self::Bool(*b)),
570            // Date/Timestamp use their integer storage repr as the
571            // index key — same order semantics, same comparison.
572            Value::Date(d) => Some(Self::Int(i64::from(*d))),
573            Value::Timestamp(t) => Some(Self::Int(*t)),
574            // Numeric isn't (yet) indexable — exact-decimal index keys
575            // would need a stable scale-normalised representation.
576            // Interval isn't index-eligible either (and can't reach this
577            // path through column storage anyway).
578            Value::Null
579            | Value::Float(_)
580            | Value::Vector(_)
581            | Value::Sq8Vector(_)
582            | Value::HalfVector(_)
583            | Value::Numeric { .. }
584            | Value::Interval { .. }
585            | Value::Json(_)
586            | Value::Bytes(_)
587            | Value::TextArray(_)
588            | Value::IntArray(_)
589            | Value::BigIntArray(_)
590            | Value::TsVector(_)
591            | Value::TsQuery(_) => None,
592        }
593    }
594}
595
/// A single-column secondary index. v2.0 carries either a B-tree map
/// (the default — used for equality / range lookups on scalar columns)
/// or a navigable-small-world graph (used for kNN over vector
/// columns).
///
/// Multi-column indexes are expressed as a leading `column_position`
/// plus `extra_column_positions`.
#[derive(Debug, Clone)]
pub struct Index {
    /// Index name.
    pub name: String,
    /// Position of the indexed (leading) column in the table's column
    /// list.
    pub column_position: usize,
    /// Backing structure of the index.
    pub kind: IndexKind,
    /// v6.8.0 — column positions of `INCLUDE (col1, col2, …)`
    /// non-key columns. Carries the planner's "this query is
    /// covered by the index" signal; lookup paths still resolve
    /// via the `RowLocator` to fetch the row body, but EXPLAIN
    /// surfaces the covered-scan annotation so operators can
    /// confirm the planner sees the coverage.
    ///
    /// Empty `Vec` = no `INCLUDE` clause (the legacy shape). v12
    /// catalog snapshots deserialise with an empty vec.
    pub included_columns: Vec<usize>,
    /// v6.8.1 — partial-index predicate stored as its canonical
    /// Display form (the engine re-parses it on the maintenance
    /// path). `None` = unconditional index (the legacy shape).
    /// Persisted as `[u8 has_pred][u16 LE len][bytes]` on the
    /// catalog snapshot (FILE_VERSION 12, appended after
    /// `included_columns`).
    pub partial_predicate: Option<String>,
    /// v6.8.2 — expression-index key, stored as the expression's
    /// canonical Display form. `None` = bare column-reference
    /// index (the legacy shape). Persisted alongside
    /// `partial_predicate` on the v12 catalog snapshot.
    pub expression: Option<String>,
    /// v7.9.29 — `CREATE UNIQUE INDEX …`. When true the engine
    /// rejects INSERTs whose key already appears in this index
    /// (combined with `partial_predicate` when present — only
    /// rows matching the predicate enter the uniqueness check).
    /// Catalog FILE_VERSION 16+; older snapshots deserialise
    /// with `false`. mailrs K1.
    pub is_unique: bool,
    /// v7.9.29 — extra (non-leading) column positions for
    /// multi-column indexes (`CREATE INDEX … (a, b, c)`). The
    /// planner today still only uses the leading
    /// `column_position` for index seeks, but UNIQUE INDEX
    /// enforcement walks the full tuple so partial-unique
    /// invariants like CalDAV `(calendar_id, uid,
    /// recurrence_id)` are enforced correctly. Catalog
    /// FILE_VERSION 16+; older snapshots deserialise empty.
    pub extra_column_positions: Vec<usize>,
}
644
/// Default neighbor degree (M) for the NSW (navigable-small-world)
/// graph. Picked at construction time and persisted with the index.
pub const NSW_DEFAULT_M: usize = 16;
648
/// v5.2.2: outcome of a successful [`Catalog::freeze_oldest_to_cold`]
/// call. The catalog state has already been mutated by the time this
/// is returned (hot rows dropped + segment registered + Cold locators
/// flipped). The caller's only remaining concern is `segment_bytes` —
/// persist them to disk under `<db>.spg/segments/seg_<id>.spg` so a
/// future restart can reload via the v5.1 `SPG_PRELOAD_COLD_SEGMENT`
/// path. (v5.3's manifest will subsume this manual step.)
#[derive(Debug, Clone)]
pub struct FreezeReport {
    /// Id allocated by [`Catalog::load_segment_bytes`] for the new
    /// cold-tier segment. Stable across the call's success path.
    pub segment_id: u32,
    /// Number of rows that moved hot → cold. Equals the `max_rows`
    /// the caller asked for (the API is strict on the count).
    pub frozen_rows: usize,
    /// Hot-tier bytes reclaimed by the freeze — the
    /// [`Table::hot_bytes`] delta before vs after. Useful to feed
    /// back into the freezer's budget check on the next tick.
    pub bytes_freed: u64,
    /// Encoded segment bytes, byte-identical to what
    /// [`encode_segment`] produced. The catalog already owns a
    /// copy inside `cold_segments`; this hand-off lets the caller
    /// persist them without re-encoding.
    pub segment_bytes: Vec<u8>,
}
674
/// v6.7.4 — read-only output of [`Catalog::prepare_freeze_slice`].
/// Carries every row body + key in a contiguous hot-row range,
/// already encoded and sorted by PK so the coordinator's merge
/// step is a k-way merge over already-sorted streams.
///
/// `Vec<FreezeSlice>` from N independent workers feeds
/// [`Catalog::commit_freeze_slices`], which concatenates + encodes
/// the merged segment + atomically swaps the catalog state.
#[derive(Debug, Clone)]
pub struct FreezeSlice {
    /// Hot-row index range this slice covered (half-open, in the
    /// table's `rows: PersistentVec` ordering at call time). The
    /// commit step uses this to compute the union range that
    /// gets passed to [`Table::delete_rows`].
    pub row_range: core::ops::Range<usize>,
    /// `(pk_u64, encoded_row_body, IndexKey)` triples, sorted
    /// ascending by `pk_u64`. Per-slice sort happens inside
    /// `prepare_freeze_slice`; the coordinator does only a
    /// k-way merge to reach the global PK ordering
    /// [`encode_segment`] requires.
    pub rows: Vec<(u64, Vec<u8>, IndexKey)>,
}
697
/// v6.7.3 — outcome of a [`Catalog::compact_cold_segments`] call.
/// The catalog state has already been mutated when this is returned:
/// the merged segment is loaded into `cold_segments`, the source
/// segment slots are tombstoned (`None`), and every BTree-index
/// `RowLocator::Cold` that previously pointed at a source now
/// points at the merged segment. The caller's remaining job is to
/// persist `merged_segment_bytes` under
/// `<db>.spg/segments/seg_<merged_segment_id>.spg` and update the
/// in-memory `segment_id → path` map (remove the source ids, add
/// the merged id) so the next CHECKPOINT writes a manifest that
/// no longer lists the retired sources.
///
/// On a no-op (fewer than 2 candidate segments under the threshold),
/// `merged_segment_id` is `None` and `sources` is empty; the
/// catalog was not mutated.
#[derive(Debug, Clone)]
pub struct CompactReport {
    /// Source segment ids that were merged + tombstoned. Empty on
    /// no-op.
    pub sources: Vec<u32>,
    /// Id allocated for the merged segment. `None` on no-op.
    pub merged_segment_id: Option<u32>,
    /// Encoded merged-segment bytes (empty on no-op).
    pub merged_segment_bytes: Vec<u8>,
    /// Number of rows that landed in the merged segment.
    pub merged_rows: usize,
    /// `Σ source.num_rows − merged_rows`. Rows present in source
    /// segment payloads but unreferenced by any live BTree
    /// `Cold` locator — DELETE'd-but-still-frozen rows that
    /// compaction GC'd during the merge.
    pub deleted_rows_pruned: usize,
    /// `Σ source.bytes() − merged.bytes()`. Estimate of on-disk
    /// space the merge will reclaim once the source segment files
    /// are GC'd. Saturating subtract — never negative.
    pub bytes_reclaimed_estimate: u64,
}
733
734#[derive(Debug, Clone)]
735pub enum IndexKind {
736    /// v4.40: structural-sharing B-tree over `IndexKey`. Replaces the v0.8
737    /// `BTreeMap<IndexKey, Vec<usize>>` — `Index::clone` is now an `Arc`
738    /// bump regardless of index size, so `Catalog::clone` inside the
739    /// v4.34 auto-commit wrap stays O(1) even for tables with secondary
740    /// indices (the case that bottlenecked v4.39 at 1M rows in the
741    /// sweep).
742    ///
743    /// v5.1: value type widened from `Vec<usize>` to `Vec<RowLocator>` so
744    /// a single key can point to a mix of hot-tier rows (`RowLocator::Hot`,
745    /// equivalent to the pre-v5 `usize` row index) and cold-tier rows
746    /// (`RowLocator::Cold { segment_id, page_offset }`) once the v5.2
747    /// freezer starts producing them. Pre-v5.2 only `Hot` entries appear
748    /// — the on-disk encoding stays at `FILE_VERSION` 8 (raw u64 row index)
749    /// because every locator round-trips through `RowLocator::from_legacy_v8_u64`
750    /// without information loss. `FILE_VERSION` 9 with tagged encoding lands
751    /// alongside the first freezer commit (v5.1 step 2b / v5.2).
752    BTree(PersistentBTreeMap<IndexKey, Vec<RowLocator>>),
753    /// Navigable-small-world graph for vector kNN search.
754    Nsw(NswGraph),
755    /// v6.7.1 — BRIN (Block Range INdex). Pure metadata: BRIN
756    /// indexes carry NO in-memory key→locator map. The (min,
757    /// max) summaries live in each cold-tier segment's v2
758    /// envelope sidecar; the BRIN entry in `Table.indices` only
759    /// records THAT a BRIN index exists on this column so the
760    /// segment encoder + planner can opt into the summary path.
761    Brin {
762        /// The cell type at `column_position` at CREATE INDEX time.
763        /// Used by the planner to type-check WHERE-clause range
764        /// predicates against the BRIN-indexed column.
765        column_type: DataType,
766    },
767    /// v7.12.3 — GIN inverted index over a `tsvector` column.
768    ///
769    /// Storage shape: `lexeme word → Vec<RowLocator>`. The posting
770    /// list per word is appended in row-order, so range scans are
771    /// O(matching rows) once the per-word lookup is done. Multi-
772    /// term queries intersect / union posting lists.
773    ///
774    /// `IndexKey::from_value(TsVector)` returns `None` — GIN doesn't
775    /// participate in `try_index_seek` (which is BTree-equality-keyed).
776    /// The engine consults this index through `try_gin_lookup` on
777    /// `WHERE col @@ tsquery` predicates instead.
778    ///
779    /// Backed by a `PersistentBTreeMap` so `Catalog::clone` (the
780    /// per-write snapshot) stays O(1) — same structural-sharing
781    /// invariant as BTree.
782    Gin(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
783    /// v7.15.0 — `USING gin (col gin_trgm_ops)` over a `TEXT`
784    /// column. Posting lists map `trigram` (PG-compatible 3-byte
785    /// shingle on the lower-cased + space-padded input) to row
786    /// locators. The planner uses this index to accelerate
787    /// `WHERE col LIKE '…'` / `ILIKE '…'` / `similarity(col, q) >
788    /// t` — every literal run of length ≥ 1 in the pattern
789    /// produces a trigram set, the engine intersects the posting
790    /// lists, and the LIKE / similarity predicate is re-evaluated
791    /// per candidate row to filter the over-approximation.
792    /// Persisted via tag-4 index payload in `FILE_VERSION` 24+.
793    GinTrgm(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
794}
795
796/// Multi-layer HNSW graph (v2.13). Each node is assigned a `top_level`;
797/// it appears in layers `0..=top_level`. Higher layers are sparser, so
798/// search starts from the entry at the top layer, greedy-descends to
799/// layer 0, and beam-searches there. Layer 0 keeps a larger neighbour
800/// budget (`m_max_0 = 2 * m` per the HNSW paper); upper layers cap at
801/// `m`. The struct name stays `NswGraph` so external users / on-disk
802/// callers don't have to track a rename — the algorithm changed, the
803/// data slot didn't.
804#[derive(Debug, Clone)]
805pub struct NswGraph {
806    /// Max neighbours per node on layers ≥ 1.
807    pub m: usize,
808    /// Max neighbours on layer 0 (the dense bottom layer). HNSW
809    /// convention: `m_max_0 = 2 * m`.
810    pub m_max_0: usize,
811    /// Entry point — the node that sits on the topmost layer. Search
812    /// always starts here.
813    pub entry: Option<usize>,
814    /// Top layer of the entry node (== `layers.len() - 1` when populated).
815    pub entry_level: u8,
816    /// `levels[i]` = top layer of node `i`. Nodes whose vector cell is
817    /// NULL / non-Vector have `levels[i] = 0` and no neighbour entries.
818    ///
819    /// v5.5.0: backed by `PersistentVec` so `NswGraph::clone` (and the
820    /// `Catalog::clone` on every group-commit write that contains it) is O(1)
821    /// structural-sharing instead of an O(N) element copy.
822    pub levels: PersistentVec<u8>,
823    /// `layers[l][i]` = neighbours of node `i` at layer `l`. Inner vec
824    /// is empty when node `i` doesn't reach layer `l`.
825    ///
826    /// v5.5.0: the per-node middle dimension (the O(N) one) is a
827    /// `PersistentVec`; the outer layer dimension stays a plain `Vec`
828    /// (layer count ≤ 8, so its clone is O(1) in practice) and the inner
829    /// neighbour list stays a `Vec` (bounded by `m_max_0`).
830    ///
831    /// v6.1.x: neighbour slot widened from `usize` (8 B on 64-bit) to
832    /// `u32` (4 B). Row indices are catalog-bounded by `u32::MAX` (4G
833    /// rows per table); the cast at the NSW boundary asserts this. At
834    /// 1M dim-128 SQ8, layer 0 adjacency alone shrinks by ~128 MiB
835    /// — the largest single contribution to the v6.0.5-measured
836    /// 624 MiB ambition gap. On-disk format already used u32 LE, so
837    /// this is a pure in-memory layout change; no `FILE_VERSION` bump.
838    pub layers: Vec<PersistentVec<Vec<u32>>>,
839}
840
841impl NswGraph {
842    fn new(m: usize) -> Self {
843        Self {
844            m,
845            m_max_0: m.saturating_mul(2),
846            entry: None,
847            entry_level: 0,
848            levels: PersistentVec::new(),
849            layers: alloc::vec![PersistentVec::new()],
850        }
851    }
852
853    /// Max-neighbour budget for layer `l`.
854    pub const fn cap_for_layer(&self, layer: u8) -> usize {
855        if layer == 0 { self.m_max_0 } else { self.m }
856    }
857}
858
/// Deterministic HNSW level for the node at `row_idx`.
///
/// The level is derived purely from the row index, so replaying the
/// same insert order rebuilds the same topology. The shape is roughly
/// HNSW's with `mL ≈ 1/ln(M) ≈ 0.36` for M=16: the index is hashed,
/// and every low-end 4-bit chunk of the hash that is zero promotes
/// the node by one layer, giving P(level ≥ L) ≈ (1/16)^L.
///
/// The result is capped at 7 (16^7 ≈ 2.7e8 expected nodes between
/// promotions at the cap; ample).
#[allow(clippy::verbose_bit_mask)] // the explicit nibble mask documents the 1/16 promotion odds; the cap is applied by the bounded range below.
pub fn nsw_assign_level(row_idx: usize) -> u8 {
    const MAX_LEVEL: u8 = 7;
    const NIBBLE_BITS: u32 = 4;

    // SplitMix-style mixer — cheap and seedable.
    let seed = (row_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
    let stage_one = (seed ^ (seed >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    let stage_two = (stage_one ^ (stage_one >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    let mixed = stage_two ^ (stage_two >> 31);

    // Count the contiguous zero nibbles at the low end, looking at no
    // more than MAX_LEVEL of them. An all-zero hash therefore lands
    // exactly on the cap.
    let zero_nibbles = (0..u32::from(MAX_LEVEL))
        .take_while(|&nibble| (mixed >> (NIBBLE_BITS * nibble)) & 0xF == 0)
        .count();
    // `zero_nibbles <= MAX_LEVEL`, so the narrowing cast is lossless.
    zero_nibbles as u8
}
885
886impl Index {
887    fn new_btree(name: String, column_position: usize) -> Self {
888        Self {
889            name,
890            column_position,
891            kind: IndexKind::BTree(PersistentBTreeMap::new()),
892            included_columns: Vec::new(),
893            partial_predicate: None,
894            expression: None,
895            is_unique: false,
896            extra_column_positions: Vec::new(),
897        }
898    }
899
900    fn new_nsw(name: String, column_position: usize, m: usize) -> Self {
901        Self {
902            name,
903            column_position,
904            kind: IndexKind::Nsw(NswGraph::new(m)),
905            included_columns: Vec::new(),
906            partial_predicate: None,
907            expression: None,
908            is_unique: false,
909            extra_column_positions: Vec::new(),
910        }
911    }
912
913    /// v6.7.1 — BRIN index constructor. BRIN carries no in-memory
914    /// data; the `column_type` snapshot is used by the segment
915    /// encoder + planner for type-checking range predicates.
916    fn new_brin(name: String, column_position: usize, column_type: DataType) -> Self {
917        Self {
918            name,
919            column_position,
920            kind: IndexKind::Brin { column_type },
921            included_columns: Vec::new(),
922            partial_predicate: None,
923            expression: None,
924            is_unique: false,
925            extra_column_positions: Vec::new(),
926        }
927    }
928
929    /// v7.12.3 — GIN inverted-index constructor. Empty posting-list
930    /// map; caller (typically [`Table::add_gin_index`] or
931    /// [`Table::restore_gin_index`]) populates it from existing rows
932    /// or from a deserialised snapshot.
933    fn new_gin(name: String, column_position: usize) -> Self {
934        Self {
935            name,
936            column_position,
937            kind: IndexKind::Gin(PersistentBTreeMap::new()),
938            included_columns: Vec::new(),
939            partial_predicate: None,
940            expression: None,
941            is_unique: false,
942            extra_column_positions: Vec::new(),
943        }
944    }
945
946    /// v7.15.0 — `gin_trgm_ops`-flavoured GIN constructor. Same
947    /// shape as `new_gin` but the posting-list keys are 3-byte
948    /// trigram shingles (`pg_trgm`-compatible) and the column
949    /// type is `TEXT` / `VARCHAR` (not `TSVECTOR`).
950    fn new_gin_trgm(name: String, column_position: usize) -> Self {
951        Self {
952            name,
953            column_position,
954            kind: IndexKind::GinTrgm(PersistentBTreeMap::new()),
955            included_columns: Vec::new(),
956            partial_predicate: None,
957            expression: None,
958            is_unique: false,
959            extra_column_positions: Vec::new(),
960        }
961    }
962
963    /// Look up the locators stored under `key` (B-tree only). Returns
964    /// an empty slice when the key is absent or the index isn't a
965    /// BTree — callers can treat both cases uniformly.
966    ///
967    /// v5.1: return type widened from `&[usize]` to `&[RowLocator]`.
968    /// Pre-v5.2 callers can read the slice and `.as_hot().unwrap()`
969    /// each entry (no `Cold` variants exist until the freezer lands);
970    /// post-v5.2 callers dispatch hot vs. cold per locator.
971    pub fn lookup_eq(&self, key: &IndexKey) -> &[RowLocator] {
972        match &self.kind {
973            IndexKind::BTree(m) => m.get(key).map_or(&[][..], Vec::as_slice),
974            // BRIN / NSW / GIN / trigram-GIN have no IndexKey-keyed
975            // map; lookup is a no-op. GIN uses
976            // [`Index::gin_lookup_word`] instead.
977            IndexKind::Nsw(_)
978            | IndexKind::Brin { .. }
979            | IndexKind::Gin(_)
980            | IndexKind::GinTrgm(_) => &[][..],
981        }
982    }
983
984    /// v7.12.3 — GIN posting-list lookup. Returns the row locators
985    /// whose `tsvector` cell contains `word`. Empty when the word is
986    /// absent from the index or this isn't a GIN index.
987    pub fn gin_lookup_word(&self, word: &str) -> &[RowLocator] {
988        match &self.kind {
989            IndexKind::Gin(m) => m.get(&String::from(word)).map_or(&[][..], Vec::as_slice),
990            IndexKind::BTree(_)
991            | IndexKind::Nsw(_)
992            | IndexKind::Brin { .. }
993            | IndexKind::GinTrgm(_) => &[][..],
994        }
995    }
996
997    /// v7.15.0 — trigram-GIN posting-list lookup. Returns the row
998    /// locators whose indexed `TEXT` cell contains the trigram
999    /// `tri`. Empty when the trigram is absent or this isn't a
1000    /// trigram-GIN index.
1001    pub fn gin_trgm_lookup(&self, tri: &str) -> &[RowLocator] {
1002        match &self.kind {
1003            IndexKind::GinTrgm(m) => m.get(&String::from(tri)).map_or(&[][..], Vec::as_slice),
1004            IndexKind::BTree(_)
1005            | IndexKind::Nsw(_)
1006            | IndexKind::Brin { .. }
1007            | IndexKind::Gin(_) => &[][..],
1008        }
1009    }
1010
1011    /// Borrow the NSW graph (if this is an NSW index). Callers that need
1012    /// the graph for a kNN search go through here.
1013    pub const fn nsw(&self) -> Option<&NswGraph> {
1014        match &self.kind {
1015            IndexKind::Nsw(g) => Some(g),
1016            IndexKind::BTree(_)
1017            | IndexKind::Brin { .. }
1018            | IndexKind::Gin(_)
1019            | IndexKind::GinTrgm(_) => None,
1020        }
1021    }
1022
1023    /// v6.7.1 — true when this index is a BRIN (block range) index.
1024    /// Used by the segment encoder to opt into BRIN sidecar emission
1025    /// at freeze time, and by the planner to opt into page-skipping
1026    /// on range predicates.
1027    pub const fn is_brin(&self) -> bool {
1028        matches!(self.kind, IndexKind::Brin { .. })
1029    }
1030
1031    /// v7.15.0 — true when this index is a trigram GIN
1032    /// (`gin_trgm_ops`-flavoured). Used by the LIKE planner to
1033    /// opt into trigram acceleration.
1034    pub const fn is_gin_trgm(&self) -> bool {
1035        matches!(self.kind, IndexKind::GinTrgm(_))
1036    }
1037
1038    /// v7.12.3 — true when this index is a GIN inverted index.
1039    /// Used by the planner to opt into posting-list acceleration on
1040    /// `WHERE col @@ tsquery` predicates.
1041    pub const fn is_gin(&self) -> bool {
1042        matches!(self.kind, IndexKind::Gin(_))
1043    }
1044}
1045
1046/// In-memory table: schema + a persistent row vector + secondary indices.
1047///
1048/// v4.39: `rows` is a [`PersistentVec`] (Bitmapped Vector Trie, 32-way) so
1049/// `Table::clone()` is `O(1)` — the whole reason for v4.39's existence is
1050/// to make `Catalog::clone()` cheap inside the v4.34 auto-commit wrap.
1051///
1052/// v5.2.1: `hot_bytes` tracks the encoded byte size of every row currently
1053/// in [`Self::rows`], summed over rows. Updated incrementally by `insert`
1054/// (+= encoded row size), `delete_rows` (-= removed rows' encoded sizes),
1055/// and `update_row` (-= old size, += new size). The value is what the
1056/// v5.2 freezer reads to decide when to demote cold rows — when the
1057/// catalog-wide sum crosses `SPG_HOT_TIER_BYTES` (default 4 GiB) the
1058/// freezer thread wakes. v5.2.1 ships measurement only; the freezer
1059/// itself lands in v5.2.2. Stored as `u64` so a single field clone in
1060/// `Catalog::clone` stays at the O(1) invariant v4.39 built.
1061#[derive(Debug, Clone)]
1062pub struct Table {
1063    schema: TableSchema,
1064    rows: PersistentVec<Row>,
1065    indices: Vec<Index>,
1066    hot_bytes: u64,
1067    /// v6.7.0 — cached count of rows currently materialised in the
1068    /// cold tier via `RowLocator::Cold` entries across THIS table's
1069    /// indices. Populated by `ANALYZE` (walks every BTree index and
1070    /// counts Cold locators); the count survives until the next
1071    /// ANALYZE recomputes it. Surfaced via `spg_statistic.cold_row_count`
1072    /// and `spg_stat_segment.table_name`.
1073    ///
1074    /// Honest scope: this is a CACHED count, not a live one.
1075    /// Freezer / promote / DELETE don't currently update the cache
1076    /// incrementally — they invalidate it by setting the
1077    /// `cold_row_count_stale` flag, and the next ANALYZE re-walks.
1078    /// Incremental maintenance is a v6.7.x candidate if observation
1079    /// shows the ANALYZE walk cost dominates.
1080    cold_row_count: u64,
1081    /// v6.7.0 — set when the cached `cold_row_count` may be wrong
1082    /// because rows moved into / out of the cold tier since the last
1083    /// ANALYZE. The virtual-table surface reports the cached value
1084    /// regardless (operators run ANALYZE to refresh).
1085    cold_row_count_stale: bool,
1086}
1087
1088impl Table {
1089    pub fn new(schema: TableSchema) -> Self {
1090        Self {
1091            schema,
1092            rows: PersistentVec::new(),
1093            indices: Vec::new(),
1094            hot_bytes: 0,
1095            cold_row_count: 0,
1096            cold_row_count_stale: false,
1097        }
1098    }
1099
1100    /// Total encoded byte size of every row currently in the hot tier
1101    /// (`self.rows`). See struct docs for the maintenance contract.
1102    /// Returns 0 for an empty table.
1103    #[must_use]
1104    pub const fn hot_bytes(&self) -> u64 {
1105        self.hot_bytes
1106    }
1107
1108    /// v6.7.0 — cached count of cold-tier rows. See struct field
1109    /// docs for the staleness contract.
1110    #[must_use]
1111    pub const fn cold_row_count(&self) -> u64 {
1112        self.cold_row_count
1113    }
1114
1115    /// v6.7.0 — overwrite the cached count. Called by the engine's
1116    /// `analyze_one_table` after walking the indices.
1117    pub fn set_cold_row_count(&mut self, n: u64) {
1118        self.cold_row_count = n;
1119        self.cold_row_count_stale = false;
1120    }
1121
1122    /// v6.7.0 — mark the cached count as potentially out of date.
1123    /// Called by freezer / promote / DELETE paths so a subsequent
1124    /// `spg_statistic` read knows the number may not reflect the
1125    /// current state.
1126    pub fn mark_cold_row_count_stale(&mut self) {
1127        self.cold_row_count_stale = true;
1128    }
1129
1130    /// v6.7.0 — report whether the cached count is known to be out
1131    /// of date. Exposed for completeness; the virtual table surface
1132    /// returns the cached value regardless.
1133    #[must_use]
1134    pub const fn cold_row_count_stale(&self) -> bool {
1135        self.cold_row_count_stale
1136    }
1137
1138    /// v6.7.0 — walk every BTree index and count `RowLocator::Cold`
1139    /// entries; return the MAX across indices. The freeze path
1140    /// (`freeze_oldest_to_cold`) writes cold locators to ONE
1141    /// designated index — that index ends up with the full per-row
1142    /// count. MAX-across-indices yields the precise count when a
1143    /// PK-style index exists; for multi-index tables without a
1144    /// covering index it's a lower bound (rare in practice).
1145    /// Caller responsibility: only invoke under `engine.write()`
1146    /// or after taking ownership; the walk is O(N) over every
1147    /// (key, locator) pair.
1148    #[must_use]
1149    pub fn count_cold_locators(&self) -> u64 {
1150        let mut best: u64 = 0;
1151        for idx in &self.indices {
1152            if let IndexKind::BTree(map) = &idx.kind {
1153                let n: u64 = map
1154                    .iter()
1155                    .map(|(_, locs)| locs.iter().filter(|l| l.is_cold()).count() as u64)
1156                    .sum();
1157                if n > best {
1158                    best = n;
1159                }
1160            }
1161        }
1162        best
1163    }
1164
1165    pub const fn schema(&self) -> &TableSchema {
1166        &self.schema
1167    }
1168
1169    /// v6.7.2 — mutable schema accessor for ALTER TABLE paths.
1170    /// Used by `Engine::exec_alter_table` to flip per-table
1171    /// settings like `hot_tier_bytes`.
1172    pub const fn schema_mut(&mut self) -> &mut TableSchema {
1173        &mut self.schema
1174    }
1175
1176    /// v4.39: returns the persistent row vector by reference. Callers that
1177    /// used to take `&[Row]` should switch to `.iter()` (via
1178    /// `IntoIterator for &PersistentVec`) or `.get(i)` for indexing.
1179    pub const fn rows(&self) -> &PersistentVec<Row> {
1180        &self.rows
1181    }
1182
1183    pub const fn row_count(&self) -> usize {
1184        self.rows.len()
1185    }
1186
1187    /// v6.8.0 — exposed for the engine layer to patch
1188    /// `Index::included_columns` post-creation. Could fold into
1189    /// `add_index` once the engine's IF-NOT-EXISTS guard moves up,
1190    /// but the patch shape is the minimal change for v6.8.0.
1191    pub fn indices_mut(&mut self) -> &mut [Index] {
1192        &mut self.indices
1193    }
1194
1195    pub fn indices(&self) -> &[Index] {
1196        &self.indices
1197    }
1198
1199    /// Compute the next `AUTO_INCREMENT` value for the column at
1200    /// `col_pos`. Defined as `max(existing) + 1`, falling back to `1`
1201    /// when the column currently holds no integer values. NULL / non-
1202    /// integer cells are skipped. Returns `None` when the column isn't
1203    /// an integer type.
1204    pub fn next_auto_value(&self, col_pos: usize) -> Option<i64> {
1205        let ty = self.schema.columns.get(col_pos)?.ty;
1206        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
1207            return None;
1208        }
1209        let mut max: Option<i64> = None;
1210        for row in &self.rows {
1211            match row.values.get(col_pos) {
1212                Some(Value::SmallInt(n)) => {
1213                    let v = i64::from(*n);
1214                    max = Some(max.map_or(v, |m| m.max(v)));
1215                }
1216                Some(Value::Int(n)) => {
1217                    let v = i64::from(*n);
1218                    max = Some(max.map_or(v, |m| m.max(v)));
1219                }
1220                Some(Value::BigInt(n)) => {
1221                    max = Some(max.map_or(*n, |m| m.max(*n)));
1222                }
1223                _ => {}
1224            }
1225        }
1226        Some(max.map_or(1, |m| m + 1))
1227    }
1228
1229    /// Return the first index defined over `column_position`, if any.
1230    /// (`v0.8` supports at most one index per column logically; the search
1231    /// just picks the first match.)
1232    pub fn index_on(&self, column_position: usize) -> Option<&Index> {
1233        // v6.7.1 — prefer BTree (has the key→locator map needed
1234        // for `lookup_eq`) over BRIN (metadata-only). When only a
1235        // BRIN exists on the column, return None so the executor
1236        // falls back to the hot-tier row scan instead of trying
1237        // to use BRIN for an equality lookup (which would always
1238        // return an empty slice and look like "no rows matched").
1239        self.indices
1240            .iter()
1241            .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::BTree(_)))
1242            .or_else(|| {
1243                self.indices.iter().find(|i| {
1244                    i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_))
1245                })
1246            })
1247    }
1248
1249    /// Insert one row after validating it matches the schema (length + type).
1250    /// Returns `StorageError` on mismatch — the table is left unchanged.
1251    /// Updates every defined index with the new row's key.
1252    pub fn insert(&mut self, row: Row) -> Result<(), StorageError> {
1253        if row.len() != self.schema.columns.len() {
1254            return Err(StorageError::ArityMismatch {
1255                expected: self.schema.columns.len(),
1256                actual: row.len(),
1257            });
1258        }
1259        for (i, (val, col)) in row.values.iter().zip(&self.schema.columns).enumerate() {
1260            if val.is_null() {
1261                if !col.nullable {
1262                    return Err(StorageError::NullInNotNull {
1263                        column: col.name.clone(),
1264                    });
1265                }
1266                continue;
1267            }
1268            let actual = val.data_type().expect("non-null");
1269            // Vector columns require both that the value's variant be Vector
1270            // *and* its dimension match. `actual == col.ty` already encodes
1271            // both because DataType::Vector carries the dim.
1272            //
1273            // VARCHAR(n) / CHAR(n) are storage-equivalent to TEXT — the
1274            // length / padding contract is enforced upstream by
1275            // `coerce_value`. Accept a `Text` value into either.
1276            //
1277            // NUMERIC's `Value::Numeric` carries its actual scale but the
1278            // column declares the *expected* scale (a scale-rescaled
1279            // Value::Numeric is produced upstream by `coerce_value`); the
1280            // structural check here only verifies "value is Numeric and
1281            // its scale equals the column scale".
1282            let compatible = actual == col.ty
1283                || matches!(
1284                    (actual, col.ty),
1285                    (
1286                        DataType::Text,
1287                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json | DataType::Jsonb
1288                    ) | (DataType::Json | DataType::Jsonb, DataType::Text)
1289                        | (DataType::Json, DataType::Jsonb)
1290                        | (DataType::Jsonb, DataType::Json)
1291                        | (DataType::Timestamp, DataType::Timestamptz)
1292                        | (DataType::Timestamptz, DataType::Timestamp)
1293                )
1294                || matches!(
1295                    (actual, col.ty),
1296                    (
1297                        DataType::Numeric { scale: a, .. },
1298                        DataType::Numeric { scale: b, .. },
1299                    ) if a == b
1300                );
1301            if !compatible {
1302                return Err(StorageError::TypeMismatch {
1303                    column: col.name.clone(),
1304                    expected: col.ty,
1305                    actual,
1306                    position: i,
1307                });
1308            }
1309        }
1310        let new_row_idx = self.rows.len();
1311        // Pre-validate before mutating: ensure indices receive an IndexKey.
1312        // For NSW we defer the graph update to *after* the row is pushed
1313        // so the kNN search can see it in `self.rows`.
1314        for idx in &mut self.indices {
1315            match &mut idx.kind {
1316                IndexKind::BTree(map) => {
1317                    if let Some(key) = IndexKey::from_value(&row.values[idx.column_position]) {
1318                        // v4.40: PersistentBTreeMap has no in-place entry-or-default.
1319                        // Clone-then-insert keeps the same semantics — for typical
1320                        // unique-key schemas the Vec is 1-element so the clone is
1321                        // O(1). For dup-heavy columns it's O(M) per insert, traded
1322                        // for the structural-sharing win at clone time.
1323                        let mut entries = map.get(&key).cloned().unwrap_or_default();
1324                        entries.push(RowLocator::Hot(new_row_idx));
1325                        map.insert_mut(key, entries);
1326                    }
1327                }
1328                IndexKind::Gin(map) => {
1329                    // v7.12.3 — extend posting list per lexeme word.
1330                    // NULL or non-TsVector cell → no-op (cell carries
1331                    // no lexemes to index).
1332                    if let Value::TsVector(lexemes) = &row.values[idx.column_position] {
1333                        for lex in lexemes {
1334                            let mut entries = map.get(&lex.word).cloned().unwrap_or_default();
1335                            entries.push(RowLocator::Hot(new_row_idx));
1336                            map.insert_mut(lex.word.clone(), entries);
1337                        }
1338                    }
1339                }
1340                IndexKind::GinTrgm(map) => {
1341                    // v7.15.0 — trigram GIN. Shingle the TEXT cell
1342                    // into PG-compatible 3-byte trigrams and extend
1343                    // each trigram's posting list.
1344                    if let Value::Text(s) = &row.values[idx.column_position] {
1345                        for tri in trgm::extract_trigrams(s) {
1346                            let mut entries = map.get(&tri).cloned().unwrap_or_default();
1347                            entries.push(RowLocator::Hot(new_row_idx));
1348                            map.insert_mut(tri, entries);
1349                        }
1350                    }
1351                }
1352                // NSW handled below after the row push (so the new row
1353                // is visible to the kNN-graph connect step). BRIN
1354                // carries no per-row state.
1355                IndexKind::Nsw(_) | IndexKind::Brin { .. } => {}
1356            }
1357        }
1358        // v5.2.1: maintain incremental hot-tier byte counter. Computed
1359        // before the move so we don't need to borrow `row` after push.
1360        self.hot_bytes = self
1361            .hot_bytes
1362            .saturating_add(row_body_encoded_len(&row, &self.schema) as u64);
1363        // v4.39.1: push_mut keeps streaming inserts at Vec::push speed when
1364        // the table is uniquely owned (the spg-embedded path); inside a TX
1365        // wrap where a Catalog snapshot exists, push_mut path-copies the
1366        // tail just like push() and the snapshot stays valid.
1367        self.rows.push_mut(row);
1368        // NSW updates after the push so the new row is visible to the
1369        // greedy search used during connect.
1370        let new_row_idx = self.rows.len() - 1;
1371        let nsw_targets: Vec<usize> = self
1372            .indices
1373            .iter()
1374            .enumerate()
1375            .filter_map(|(i, idx)| {
1376                if matches!(idx.kind, IndexKind::Nsw(_)) {
1377                    Some(i)
1378                } else {
1379                    None
1380                }
1381            })
1382            .collect();
1383        for idx_pos in nsw_targets {
1384            nsw_insert_at(self, idx_pos, new_row_idx);
1385        }
1386        Ok(())
1387    }
1388
1389    /// Build a new B-tree index over the named column. Rebuilds from
1390    /// existing rows. Errors if `column_name` doesn't exist or the index
1391    /// name is taken.
1392    pub fn add_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1393        if self.indices.iter().any(|i| i.name == name) {
1394            return Err(StorageError::DuplicateIndex { name });
1395        }
1396        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1397            StorageError::ColumnNotFound {
1398                column: column_name.into(),
1399            }
1400        })?;
1401        let mut idx = Index::new_btree(name, column_position);
1402        if let IndexKind::BTree(map) = &mut idx.kind {
1403            for (i, row) in self.rows.iter().enumerate() {
1404                if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
1405                    let mut entries = map.get(&key).cloned().unwrap_or_default();
1406                    entries.push(RowLocator::Hot(i));
1407                    map.insert_mut(key, entries);
1408                }
1409            }
1410        }
1411        self.indices.push(idx);
1412        Ok(())
1413    }
1414
1415    /// Build a new NSW (HNSW-flavoured) index over the named column.
1416    /// Required for `ORDER BY col <-> literal LIMIT k` to plan as a
1417    /// graph traversal instead of a full scan. Column must be a Vector
1418    /// type. `m` is the maximum number of neighbours per node.
1419    pub fn add_nsw_index(
1420        &mut self,
1421        name: String,
1422        column_name: &str,
1423        m: usize,
1424    ) -> Result<(), StorageError> {
1425        self.add_nsw_index_inner(name, column_name, m, None)
1426    }
1427
1428    /// v6.0.4 — synchronous rebuild of the named NSW index. If
1429    /// `new_encoding` is `Some(target)` and differs from the column's
1430    /// current encoding, every stored cell at the indexed column is
1431    /// re-coded into the target encoding before the new graph
1432    /// builds. Returns `IndexNotFound` if no index by that name exists
1433    /// and `Unsupported` for non-NSW indexes (`BTree` REBUILD is a no-op
1434    /// the engine layer rejects, not a storage-level concept).
1435    ///
1436    /// Holds the caller's `&mut self` for the duration — no
1437    /// concurrency / staging / WAL-replay machinery in v6.0.4. The
1438    /// "live" optimisation lands as v6.0.4.1.
1439    pub fn rebuild_nsw_index(
1440        &mut self,
1441        name: &str,
1442        new_encoding: Option<VecEncoding>,
1443    ) -> Result<(), StorageError> {
1444        let idx_pos = self
1445            .indices
1446            .iter()
1447            .position(|i| i.name == name)
1448            .ok_or_else(|| StorageError::IndexNotFound {
1449                name: String::from(name),
1450            })?;
1451        let col_pos = self.indices[idx_pos].column_position;
1452        let m = match &self.indices[idx_pos].kind {
1453            IndexKind::Nsw(g) => g.m,
1454            IndexKind::BTree(_)
1455            | IndexKind::Brin { .. }
1456            | IndexKind::Gin(_)
1457            | IndexKind::GinTrgm(_) => {
1458                return Err(StorageError::Unsupported(format!(
1459                    "ALTER INDEX REBUILD on non-NSW index {name:?} — only NSW indexes can rebuild"
1460                )));
1461            }
1462        };
1463        let col_name = self.schema.columns[col_pos].name.clone();
1464        // 1. Optional re-encoding pass. Done first so the cells
1465        //    match the schema before the graph rebuild walks them.
1466        if let Some(target) = new_encoding {
1467            let current = match self.schema.columns[col_pos].ty {
1468                DataType::Vector { encoding, .. } => encoding,
1469                ref other => {
1470                    return Err(StorageError::Unsupported(format!(
1471                        "ALTER INDEX REBUILD WITH (encoding=…) on non-vector column type {other:?}"
1472                    )));
1473                }
1474            };
1475            if target != current {
1476                let DataType::Vector { dim, .. } = self.schema.columns[col_pos].ty else {
1477                    unreachable!("checked above")
1478                };
1479                let n = self.rows.len();
1480                for i in 0..n {
1481                    let row = self
1482                        .rows
1483                        .get_mut(i)
1484                        .expect("row index in bounds (we iterated up to len())");
1485                    let cell = core::mem::replace(&mut row.values[col_pos], Value::Null);
1486                    let recoded = recode_vector_cell(cell, target)?;
1487                    row.values[col_pos] = recoded;
1488                }
1489                self.schema.columns[col_pos].ty = DataType::Vector {
1490                    dim,
1491                    encoding: target,
1492                };
1493            }
1494        }
1495        // 2. Drop the existing index slot + rebuild from row payload.
1496        self.indices.remove(idx_pos);
1497        self.add_nsw_index_inner(String::from(name), &col_name, m, None)?;
1498        Ok(())
1499    }
1500
1501    /// Restore an NSW index from a pre-built graph (used on
1502    /// deserialize). Skips the bulk-build pass since the topology is
1503    /// already known. Returns `DuplicateIndex` or `ColumnNotFound` on
1504    /// schema mismatch as usual.
1505    pub fn restore_nsw_index(
1506        &mut self,
1507        name: String,
1508        column_name: &str,
1509        graph: NswGraph,
1510    ) -> Result<(), StorageError> {
1511        self.add_nsw_index_inner(name, column_name, graph.m, Some(graph))
1512    }
1513
1514    /// Restore a `BTree` index from a pre-built `(IndexKey, Vec<RowLocator>)`
1515    /// map. Used by [`Catalog::deserialize`] when reading a v9 (or later)
1516    /// catalog snapshot — the map travels on disk so cold-tier locators
1517    /// survive a round-trip, instead of being rebuilt from `self.rows`
1518    /// (which would lose every Cold entry). Same error contract as
1519    /// [`Table::add_index`].
1520    pub fn restore_btree_index(
1521        &mut self,
1522        name: String,
1523        column_name: &str,
1524        map: PersistentBTreeMap<IndexKey, Vec<RowLocator>>,
1525    ) -> Result<(), StorageError> {
1526        if self.indices.iter().any(|i| i.name == name) {
1527            return Err(StorageError::DuplicateIndex { name });
1528        }
1529        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1530            StorageError::ColumnNotFound {
1531                column: column_name.into(),
1532            }
1533        })?;
1534        self.indices.push(Index {
1535            name,
1536            column_position,
1537            kind: IndexKind::BTree(map),
1538            included_columns: Vec::new(),
1539            partial_predicate: None,
1540            expression: None,
1541            is_unique: false,
1542            extra_column_positions: Vec::new(),
1543        });
1544        Ok(())
1545    }
1546
1547    /// v6.7.1 — public restore counterpart for BRIN indices. Used
1548    /// by `Catalog::deserialize` when a v10 snapshot carries a
1549    /// BRIN index entry. BRIN carries no in-memory data — only the
1550    /// `column_type` snapshot is restored.
1551    pub fn restore_brin_index(
1552        &mut self,
1553        name: String,
1554        column_name: &str,
1555        column_type: DataType,
1556    ) -> Result<(), StorageError> {
1557        if self.indices.iter().any(|i| i.name == name) {
1558            return Err(StorageError::DuplicateIndex { name });
1559        }
1560        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1561            StorageError::ColumnNotFound {
1562                column: column_name.into(),
1563            }
1564        })?;
1565        self.indices
1566            .push(Index::new_brin(name, column_position, column_type));
1567        Ok(())
1568    }
1569
1570    /// v6.7.1 — public CREATE INDEX counterpart for BRIN. Creates
1571    /// the index entry with a snapshot of the indexed column's
1572    /// current `DataType`.
1573    pub fn add_brin_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1574        if self.indices.iter().any(|i| i.name == name) {
1575            return Err(StorageError::DuplicateIndex { name });
1576        }
1577        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1578            StorageError::ColumnNotFound {
1579                column: column_name.into(),
1580            }
1581        })?;
1582        let column_type = self.schema.columns[column_position].ty;
1583        self.indices
1584            .push(Index::new_brin(name, column_position, column_type));
1585        Ok(())
1586    }
1587
1588    /// v7.12.3 — Build a new GIN inverted index over a `tsvector`
1589    /// column. Populates posting lists from existing rows. Errors
1590    /// if the column doesn't exist, isn't `TsVector`, or the index
1591    /// name is taken.
1592    pub fn add_gin_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1593        if self.indices.iter().any(|i| i.name == name) {
1594            return Err(StorageError::DuplicateIndex { name });
1595        }
1596        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1597            StorageError::ColumnNotFound {
1598                column: column_name.into(),
1599            }
1600        })?;
1601        if self.schema.columns[column_position].ty != DataType::TsVector {
1602            return Err(StorageError::Corrupt(format!(
1603                "GIN index {name:?} requires a tsvector column; \
1604                 {column_name:?} is {:?}",
1605                self.schema.columns[column_position].ty
1606            )));
1607        }
1608        let mut idx = Index::new_gin(name, column_position);
1609        if let IndexKind::Gin(map) = &mut idx.kind {
1610            for (i, row) in self.rows.iter().enumerate() {
1611                if let Value::TsVector(lexemes) = &row.values[column_position] {
1612                    for lex in lexemes {
1613                        let mut entries = map.get(&lex.word).cloned().unwrap_or_default();
1614                        entries.push(RowLocator::Hot(i));
1615                        map.insert_mut(lex.word.clone(), entries);
1616                    }
1617                }
1618            }
1619        }
1620        self.indices.push(idx);
1621        Ok(())
1622    }
1623
1624    /// v7.12.3 — Restore a GIN index from a deserialised snapshot.
1625    /// Mirrors [`Self::restore_btree_index`] but takes the GIN's
1626    /// `word → Vec<RowLocator>` posting-list map (already populated
1627    /// from the catalog stream) instead of an `IndexKey` map.
1628    pub fn restore_gin_index(
1629        &mut self,
1630        name: String,
1631        column_name: &str,
1632        map: PersistentBTreeMap<String, Vec<RowLocator>>,
1633    ) -> Result<(), StorageError> {
1634        if self.indices.iter().any(|i| i.name == name) {
1635            return Err(StorageError::DuplicateIndex { name });
1636        }
1637        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1638            StorageError::ColumnNotFound {
1639                column: column_name.into(),
1640            }
1641        })?;
1642        let mut idx = Index::new_gin(name, column_position);
1643        idx.kind = IndexKind::Gin(map);
1644        self.indices.push(idx);
1645        Ok(())
1646    }
1647
1648    /// v7.15.0 — `gin_trgm_ops` GIN over a TEXT column. Walks
1649    /// every row, shingles the cell into PG-compatible trigrams,
1650    /// and builds the posting-list map. NULL / non-TEXT cells
1651    /// contribute nothing (no trigrams).
1652    pub fn add_gin_trgm_index(
1653        &mut self,
1654        name: String,
1655        column_name: &str,
1656    ) -> Result<(), StorageError> {
1657        if self.indices.iter().any(|i| i.name == name) {
1658            return Err(StorageError::DuplicateIndex { name });
1659        }
1660        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1661            StorageError::ColumnNotFound {
1662                column: column_name.into(),
1663            }
1664        })?;
1665        if !matches!(
1666            self.schema.columns[column_position].ty,
1667            DataType::Text | DataType::Varchar(_)
1668        ) {
1669            return Err(StorageError::Corrupt(format!(
1670                "trigram-GIN index {name:?} requires a TEXT/VARCHAR column; \
1671                 {column_name:?} is {:?}",
1672                self.schema.columns[column_position].ty
1673            )));
1674        }
1675        let mut idx = Index::new_gin_trgm(name, column_position);
1676        if let IndexKind::GinTrgm(map) = &mut idx.kind {
1677            for (i, row) in self.rows.iter().enumerate() {
1678                if let Value::Text(s) = &row.values[column_position] {
1679                    for tri in trgm::extract_trigrams(s) {
1680                        let mut entries = map.get(&tri).cloned().unwrap_or_default();
1681                        entries.push(RowLocator::Hot(i));
1682                        map.insert_mut(tri, entries);
1683                    }
1684                }
1685            }
1686        }
1687        self.indices.push(idx);
1688        Ok(())
1689    }
1690
1691    /// v7.15.0 — restore a trigram-GIN from its catalog snapshot
1692    /// payload. Mirrors [`Self::restore_gin_index`].
1693    pub fn restore_gin_trgm_index(
1694        &mut self,
1695        name: String,
1696        column_name: &str,
1697        map: PersistentBTreeMap<String, Vec<RowLocator>>,
1698    ) -> Result<(), StorageError> {
1699        if self.indices.iter().any(|i| i.name == name) {
1700            return Err(StorageError::DuplicateIndex { name });
1701        }
1702        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1703            StorageError::ColumnNotFound {
1704                column: column_name.into(),
1705            }
1706        })?;
1707        let mut idx = Index::new_gin_trgm(name, column_position);
1708        idx.kind = IndexKind::GinTrgm(map);
1709        self.indices.push(idx);
1710        Ok(())
1711    }
1712
1713    /// v5.1: register cold-tier locators on a `BTree` index. Used
1714    /// after [`Catalog::load_segment_bytes`] to wire every cold-
1715    /// tier row's PK back to its segment so
1716    /// [`Catalog::lookup_by_pk`] can resolve it. Each call
1717    /// appends to the index — keys that already have hot or cold
1718    /// locators keep them. Returns the number of locators
1719    /// registered.
1720    ///
1721    /// Pre-v5.2 (freezer) this is the only path that adds Cold
1722    /// variants to a PB; post-freezer the background freezer
1723    /// thread produces these as a batch under the engine write
1724    /// lock and this API becomes its in-memory primitive.
1725    ///
1726    /// Errors if `index_name` doesn't exist or names an NSW graph
1727    /// (NSW indices don't carry per-key row locators — they're
1728    /// vector-search structures).
1729    pub fn register_cold_locators<I>(
1730        &mut self,
1731        index_name: &str,
1732        locators: I,
1733    ) -> Result<usize, StorageError>
1734    where
1735        I: IntoIterator<Item = (IndexKey, RowLocator)>,
1736    {
1737        let idx = self
1738            .indices
1739            .iter_mut()
1740            .find(|i| i.name == index_name)
1741            .ok_or_else(|| StorageError::Corrupt(format!("index {index_name:?} not found")))?;
1742        let map = match &mut idx.kind {
1743            IndexKind::BTree(map) => map,
1744            IndexKind::Nsw(_)
1745            | IndexKind::Brin { .. }
1746            | IndexKind::Gin(_)
1747            | IndexKind::GinTrgm(_) => {
1748                return Err(StorageError::Corrupt(format!(
1749                    "index {index_name:?} is not BTree; cold locators apply only to BTree indices"
1750                )));
1751            }
1752        };
1753        let mut count = 0usize;
1754        for (key, locator) in locators {
1755            let mut entries = map.get(&key).cloned().unwrap_or_default();
1756            entries.push(locator);
1757            map.insert_mut(key, entries);
1758            count += 1;
1759        }
1760        Ok(count)
1761    }
1762
1763    /// v7.12.3 — GIN-side parallel to [`Self::register_cold_locators`].
1764    /// Re-attaches `word → cold RowLocator` posting-list entries after
1765    /// the from-rows rebuild loop. Errors when the index doesn't
1766    /// exist or isn't a GIN. Both tsvector-GIN and trigram-GIN
1767    /// variants share posting-list shape (`String → Vec<RowLocator>`),
1768    /// so this helper accepts either.
1769    pub fn register_gin_cold_locators<I>(
1770        &mut self,
1771        index_name: &str,
1772        locators: I,
1773    ) -> Result<usize, StorageError>
1774    where
1775        I: IntoIterator<Item = (String, RowLocator)>,
1776    {
1777        let idx = self
1778            .indices
1779            .iter_mut()
1780            .find(|i| i.name == index_name)
1781            .ok_or_else(|| StorageError::Corrupt(format!("index {index_name:?} not found")))?;
1782        let map = match &mut idx.kind {
1783            IndexKind::Gin(map) | IndexKind::GinTrgm(map) => map,
1784            IndexKind::BTree(_) | IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
1785                return Err(StorageError::Corrupt(format!(
1786                    "register_gin_cold_locators: index {index_name:?} is not GIN"
1787                )));
1788            }
1789        };
1790        let mut count = 0usize;
1791        for (word, locator) in locators {
1792            let mut entries = map.get(&word).cloned().unwrap_or_default();
1793            entries.push(locator);
1794            map.insert_mut(word, entries);
1795            count += 1;
1796        }
1797        Ok(count)
1798    }
1799
1800    /// v5.2.3: remove every `Cold` locator currently registered on
1801    /// `index_name` under the given `key`. `Hot` locators for the
1802    /// same key are left in place — useful when a row has just been
1803    /// promoted hot-side and the caller wants the old Cold pointer
1804    /// retired without losing the new hot entry.
1805    ///
1806    /// Returns the number of cold locators removed (0 when the key
1807    /// has only hot entries or the key isn't present at all).
1808    /// Errors when the index doesn't exist or isn't a `BTree`.
1809    pub fn remove_cold_locators_for_key(
1810        &mut self,
1811        index_name: &str,
1812        key: &IndexKey,
1813    ) -> Result<usize, StorageError> {
1814        let idx = self
1815            .indices
1816            .iter_mut()
1817            .find(|i| i.name == index_name)
1818            .ok_or_else(|| {
1819                StorageError::Corrupt(format!(
1820                    "remove_cold_locators_for_key: index {index_name:?} not found"
1821                ))
1822            })?;
1823        let map = match &mut idx.kind {
1824            IndexKind::BTree(map) => map,
1825            IndexKind::Nsw(_)
1826            | IndexKind::Brin { .. }
1827            | IndexKind::Gin(_)
1828            | IndexKind::GinTrgm(_) => {
1829                return Err(StorageError::Corrupt(format!(
1830                    "remove_cold_locators_for_key: index {index_name:?} is not BTree; \
1831                     cold locators apply only to BTree indices"
1832                )));
1833            }
1834        };
1835        let Some(entries) = map.get(key) else {
1836            return Ok(0);
1837        };
1838        let mut kept: Vec<RowLocator> =
1839            entries.iter().copied().filter(RowLocator::is_hot).collect();
1840        let removed = entries.len() - kept.len();
1841        if removed == 0 {
1842            return Ok(0);
1843        }
1844        kept.shrink_to_fit();
1845        // PersistentBTreeMap has no remove API in v5.2; when every
1846        // locator for `key` was Cold, the key keeps an empty Vec
1847        // entry. `Index::lookup_eq` already treats `Some(&[])` and
1848        // `None` as the same empty slice (via `Vec::as_slice`), so
1849        // callers can't distinguish the two. The space cost is one
1850        // empty Vec per shadowed-then-promoted key — bounded and
1851        // recoverable when the future compaction job lands.
1852        map.insert_mut(key.clone(), kept);
1853        Ok(removed)
1854    }
1855
1856    /// v7.13.0 — append a new column to the schema and back-fill
1857    /// every existing row with `fill_value`. Used by the engine's
1858    /// `ALTER TABLE t ADD COLUMN …` handler (mailrs round-5 G1).
1859    /// Indices on existing columns keep working — column positions
1860    /// don't shift since the new column lands at the end — so no
1861    /// index rebuild is needed.
1862    pub fn add_column(&mut self, col: ColumnSchema, fill_value: Value) {
1863        self.schema.columns.push(col);
1864        let mut new_rows: PersistentVec<Row> = PersistentVec::new();
1865        for row in self.rows.iter() {
1866            let mut values = row.values.clone();
1867            values.push(fill_value.clone());
1868            new_rows.push_mut(Row::new(values));
1869        }
1870        self.rows = new_rows;
1871    }
1872
1873    /// v7.15.0 — replace the partial-index predicate source on
1874    /// the index at slot `idx`. Used by `ALTER TABLE … RENAME
1875    /// COLUMN` after the engine rewrites column-identifier
1876    /// references in the predicate source text. Pure metadata
1877    /// edit; index rows are unaffected (they're keyed by
1878    /// column position, not predicate text).
1879    pub fn set_partial_predicate(&mut self, idx: usize, pred: Option<String>) {
1880        debug_assert!(idx < self.indices.len());
1881        self.indices[idx].partial_predicate = pred;
1882    }
1883
1884    /// v7.15.0 — rename the column at `col_pos` to `new_name`.
1885    /// The on-disk row encoding is positional, so no row rewrite
1886    /// is needed; only the schema's column name changes. Indices,
1887    /// UCs, FKs all key off column positions and are unaffected.
1888    /// Source-text references that hold the column name (CHECK
1889    /// predicates, partial-index predicates, runtime DEFAULT
1890    /// expressions, trigger `UPDATE OF` lists) are rewritten by
1891    /// the engine before this helper is called — the storage
1892    /// layer doesn't depend on `spg-sql` and so can't re-parse the
1893    /// predicate sources itself.
1894    pub fn rename_column(&mut self, col_pos: usize, new_name: &str) {
1895        debug_assert!(col_pos < self.schema.columns.len());
1896        self.schema.columns[col_pos].name = new_name.to_string();
1897    }
1898
1899    /// v7.13.3 — drop the column at `col_pos`. Removes the entry
1900    /// from the schema, the value from every row, any index that
1901    /// references the column (pure drop, not shift), and shifts
1902    /// every remaining index/UC/FK column position that pointed
1903    /// past `col_pos` down by one. Used by `ALTER TABLE t DROP
1904    /// COLUMN <c>` (mailrs round-7 S8). FK dependents on this
1905    /// column must already have been removed by the caller (CASCADE
1906    /// path); the helper assumes only same-column index removal is
1907    /// needed.
1908    pub fn drop_column(&mut self, col_pos: usize) {
1909        debug_assert!(col_pos < self.schema.columns.len());
1910        // Strip the column from the schema.
1911        self.schema.columns.remove(col_pos);
1912        // Rewrite every row to omit the cell at col_pos.
1913        let mut new_rows: PersistentVec<Row> = PersistentVec::new();
1914        for row in self.rows.iter() {
1915            let mut values = row.values.clone();
1916            if col_pos < values.len() {
1917                values.remove(col_pos);
1918            }
1919            new_rows.push_mut(Row::new(values));
1920        }
1921        self.rows = new_rows;
1922        // Drop indices on the column outright; shift the rest.
1923        self.indices.retain(|idx| idx.column_position != col_pos);
1924        for idx in &mut self.indices {
1925            if idx.column_position > col_pos {
1926                idx.column_position -= 1;
1927            }
1928            // Same shift for any included-columns reference.
1929            for inc in &mut idx.included_columns {
1930                if *inc > col_pos {
1931                    *inc -= 1;
1932                }
1933            }
1934        }
1935        // Shift uniqueness-constraint column positions (and drop
1936        // entries that lose all columns, though that shouldn't
1937        // happen in practice — caller has already CASCADE-removed
1938        // FKs and there's no general CASCADE for UCs).
1939        let mut surviving_ucs: Vec<UniquenessConstraint> = Vec::new();
1940        for mut uc in core::mem::take(&mut self.schema.uniqueness_constraints) {
1941            uc.columns.retain(|&c| c != col_pos);
1942            if uc.columns.is_empty() {
1943                continue;
1944            }
1945            for c in &mut uc.columns {
1946                if *c > col_pos {
1947                    *c -= 1;
1948                }
1949            }
1950            surviving_ucs.push(uc);
1951        }
1952        self.schema.uniqueness_constraints = surviving_ucs;
1953        // Shift FK local_columns (parent-pointing column positions
1954        // are off-table and untouched).
1955        for fk in &mut self.schema.foreign_keys {
1956            for c in &mut fk.local_columns {
1957                if *c > col_pos {
1958                    *c -= 1;
1959                }
1960            }
1961        }
1962        // Rebuild remaining indices' payload — the column-position
1963        // shift means existing IndexKey entries are still keyed by
1964        // the same column data but the position numbers changed;
1965        // existing key→locator maps stay valid because they're
1966        // keyed by Value not position. The rebuild is conservative
1967        // — same pattern delete_rows uses post-mutation.
1968        self.rebuild_indices();
1969    }
1970
1971    /// v4.4: delete the rows at the given positions in one pass.
1972    /// `positions` must be unique; ordering doesn't matter. Indices
1973    /// are rebuilt from scratch (cheaper than tracking incremental
1974    /// shifts across both B-tree and NSW). Returns the number of
1975    /// rows removed.
1976    pub fn delete_rows(&mut self, positions: &[usize]) -> usize {
1977        if positions.is_empty() {
1978            return 0;
1979        }
1980        // Mark positions; v4.39: PV has no in-place retain, so we rebuild
1981        // a fresh PV by pushing the survivors. Still O(n log₃₂ n); the
1982        // structural-sharing win shows up at `Catalog::clone()`, not here.
1983        let mut to_remove = alloc::vec![false; self.rows.len()];
1984        let mut removed = 0;
1985        for &p in positions {
1986            if p < to_remove.len() && !to_remove[p] {
1987                to_remove[p] = true;
1988                removed += 1;
1989            }
1990        }
1991        let mut new_rows: PersistentVec<Row> = PersistentVec::new();
1992        let mut removed_bytes: u64 = 0;
1993        for (i, row) in self.rows.iter().enumerate() {
1994            if to_remove[i] {
1995                removed_bytes =
1996                    removed_bytes.saturating_add(row_body_encoded_len(row, &self.schema) as u64);
1997            } else {
1998                new_rows.push_mut(row.clone());
1999            }
2000        }
2001        self.rows = new_rows;
2002        self.hot_bytes = self.hot_bytes.saturating_sub(removed_bytes);
2003        self.rebuild_indices();
2004        removed
2005    }
2006
2007    /// v4.4: replace the row at `position` with `new_values` (must
2008    /// match the schema arity + types). Indices are rebuilt for
2009    /// correctness — the affected column might be indexed and its
2010    /// key may have shifted, and a NSW node's vector may have
2011    /// changed, both of which need fresh state.
2012    pub fn update_row(
2013        &mut self,
2014        position: usize,
2015        new_values: Vec<Value>,
2016    ) -> Result<(), StorageError> {
2017        if position >= self.rows.len() {
2018            return Err(StorageError::Corrupt(alloc::format!(
2019                "update_row: position {position} out of bounds (rows={})",
2020                self.rows.len()
2021            )));
2022        }
2023        if new_values.len() != self.schema.columns.len() {
2024            return Err(StorageError::ArityMismatch {
2025                expected: self.schema.columns.len(),
2026                actual: new_values.len(),
2027            });
2028        }
2029        // Reuse the per-cell type-compat validation that `insert`
2030        // applies. The body below mirrors that check intentionally —
2031        // factoring it would be more code than the duplication.
2032        for (i, (val, col)) in new_values.iter().zip(&self.schema.columns).enumerate() {
2033            if val.is_null() {
2034                if !col.nullable {
2035                    return Err(StorageError::NullInNotNull {
2036                        column: col.name.clone(),
2037                    });
2038                }
2039                continue;
2040            }
2041            let actual = val.data_type().expect("non-null");
2042            let compatible = actual == col.ty
2043                || matches!(
2044                    (actual, col.ty),
2045                    (
2046                        DataType::Text,
2047                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json | DataType::Jsonb
2048                    ) | (DataType::Json | DataType::Jsonb, DataType::Text)
2049                        | (DataType::Json, DataType::Jsonb)
2050                        | (DataType::Jsonb, DataType::Json)
2051                        | (DataType::Timestamp, DataType::Timestamptz)
2052                        | (DataType::Timestamptz, DataType::Timestamp)
2053                )
2054                || matches!(
2055                    (actual, col.ty),
2056                    (
2057                        DataType::Numeric { scale: a, .. },
2058                        DataType::Numeric { scale: b, .. },
2059                    ) if a == b
2060                );
2061            if !compatible {
2062                return Err(StorageError::TypeMismatch {
2063                    column: col.name.clone(),
2064                    expected: col.ty,
2065                    actual,
2066                    position: i,
2067                });
2068            }
2069        }
2070        let old_row = self
2071            .rows
2072            .get(position)
2073            .expect("position bounds-checked above");
2074        let old_bytes = row_body_encoded_len(old_row, &self.schema) as u64;
2075        let new_row = Row::new(new_values);
2076        let new_bytes = row_body_encoded_len(&new_row, &self.schema) as u64;
2077        self.rows = self
2078            .rows
2079            .set(position, new_row)
2080            .expect("position bounds-checked above");
2081        self.hot_bytes = self
2082            .hot_bytes
2083            .saturating_sub(old_bytes)
2084            .saturating_add(new_bytes);
2085        self.rebuild_indices();
2086        Ok(())
2087    }
2088
2089    /// v4.4 helper used by `delete_rows` / `update_row`: discard all
2090    /// index payloads and rebuild from `self.rows`. Cheap enough
2091    /// for typical SPG scale (catalogs in the docker-compose
2092    /// deployment shape are small); the alternative — incremental
2093    /// shift bookkeeping across B-tree + NSW — would be far more
2094    /// invasive than the savings justify.
2095    fn rebuild_indices(&mut self) {
2096        // v5.2.3: capture every `Cold` locator on every BTree index
2097        // before the rebuild, so the from-rows re-emission below
2098        // (which only produces `Hot` locators) doesn't drop cold-
2099        // tier entries on keys unrelated to the row that changed.
2100        // Pre-v5.2.3 this was a `freeze_oldest_to_cold` worry only
2101        // and the freezer did its own capture-then-reregister; v5.2.3
2102        // promotes that pattern into the base helper because UPDATE
2103        // / DELETE now run rebuild_indices on tables with cold rows.
2104        let preserved_cold: Vec<(String, Vec<(IndexKey, RowLocator)>)> = self
2105            .indices
2106            .iter()
2107            .filter_map(|idx| match &idx.kind {
2108                IndexKind::BTree(map) => {
2109                    let cold: Vec<(IndexKey, RowLocator)> = map
2110                        .iter()
2111                        .flat_map(|(k, locs)| {
2112                            locs.iter()
2113                                .filter(|l| l.is_cold())
2114                                .copied()
2115                                .map(move |l| (k.clone(), l))
2116                        })
2117                        .collect();
2118                    if cold.is_empty() {
2119                        None
2120                    } else {
2121                        Some((idx.name.clone(), cold))
2122                    }
2123                }
2124                // BRIN / NSW carry no key→locator map. GIN handles
2125                // its own cold preservation below in `preserved_gin_cold`.
2126                IndexKind::Nsw(_)
2127                | IndexKind::Brin { .. }
2128                | IndexKind::Gin(_)
2129                | IndexKind::GinTrgm(_) => None,
2130            })
2131            .collect();
2132
2133        // v7.12.3 — same cold-preservation pattern for GIN's
2134        // `word → Vec<RowLocator>` posting lists. Parallel to the
2135        // BTree pass above (different key type so a separate vec is
2136        // cleaner than a generic merge). v7.15.0: trigram-GIN
2137        // (`gin_trgm_ops`) shares the same posting-list shape, so
2138        // one pass handles both — the `RebuildKind` carries the
2139        // kind tag to drive resurrection.
2140        let preserved_gin_cold: Vec<(String, Vec<(String, RowLocator)>)> = self
2141            .indices
2142            .iter()
2143            .filter_map(|idx| match &idx.kind {
2144                IndexKind::Gin(map) | IndexKind::GinTrgm(map) => {
2145                    let cold: Vec<(String, RowLocator)> = map
2146                        .iter()
2147                        .flat_map(|(w, locs)| {
2148                            locs.iter()
2149                                .filter(|l| l.is_cold())
2150                                .copied()
2151                                .map(move |l| (w.clone(), l))
2152                        })
2153                        .collect();
2154                    if cold.is_empty() {
2155                        None
2156                    } else {
2157                        Some((idx.name.clone(), cold))
2158                    }
2159                }
2160                IndexKind::BTree(_) | IndexKind::Nsw(_) | IndexKind::Brin { .. } => None,
2161            })
2162            .collect();
2163
2164        // v6.7.1 — descriptor needs to capture index kind so the
2165        // rebuild loop can resurrect BTree / NSW / BRIN / GIN exactly
2166        // as they were. (NSW carries m; BRIN carries the column type
2167        // snapshot; BTree / GIN need no extra payload.)
2168        #[derive(Clone)]
2169        enum RebuildKind {
2170            BTree,
2171            Nsw(usize),
2172            Brin(DataType),
2173            Gin,
2174            GinTrgm,
2175        }
2176        let descriptors: Vec<(String, usize, RebuildKind)> = self
2177            .indices
2178            .iter()
2179            .map(|idx| {
2180                let kind = match &idx.kind {
2181                    IndexKind::Nsw(g) => RebuildKind::Nsw(g.m),
2182                    IndexKind::Brin { column_type } => RebuildKind::Brin(*column_type),
2183                    IndexKind::BTree(_) => RebuildKind::BTree,
2184                    IndexKind::Gin(_) => RebuildKind::Gin,
2185                    IndexKind::GinTrgm(_) => RebuildKind::GinTrgm,
2186                };
2187                (idx.name.clone(), idx.column_position, kind)
2188            })
2189            .collect();
2190        self.indices.clear();
2191        for (name, column_position, rebuild_kind) in descriptors {
2192            match rebuild_kind {
2193                RebuildKind::Nsw(m) => {
2194                    let idx = Index::new_nsw(name, column_position, m);
2195                    self.indices.push(idx);
2196                    let idx_pos = self.indices.len() - 1;
2197                    let row_indices: Vec<usize> = (0..self.rows.len()).collect();
2198                    for row_idx in row_indices {
2199                        nsw_insert_at(self, idx_pos, row_idx);
2200                    }
2201                }
2202                RebuildKind::Brin(column_type) => {
2203                    // BRIN has no in-memory rebuild — the summaries
2204                    // live in cold segments which freeze emits.
2205                    self.indices
2206                        .push(Index::new_brin(name, column_position, column_type));
2207                }
2208                RebuildKind::BTree => {
2209                    let mut idx = Index::new_btree(name, column_position);
2210                    if let IndexKind::BTree(map) = &mut idx.kind {
2211                        for (i, row) in self.rows.iter().enumerate() {
2212                            if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
2213                                let mut entries = map.get(&key).cloned().unwrap_or_default();
2214                                entries.push(RowLocator::Hot(i));
2215                                map.insert_mut(key, entries);
2216                            }
2217                        }
2218                    }
2219                    self.indices.push(idx);
2220                }
2221                RebuildKind::Gin => {
2222                    let mut idx = Index::new_gin(name, column_position);
2223                    if let IndexKind::Gin(map) = &mut idx.kind {
2224                        for (i, row) in self.rows.iter().enumerate() {
2225                            if let Value::TsVector(lexemes) = &row.values[column_position] {
2226                                for lex in lexemes {
2227                                    let mut entries =
2228                                        map.get(&lex.word).cloned().unwrap_or_default();
2229                                    entries.push(RowLocator::Hot(i));
2230                                    map.insert_mut(lex.word.clone(), entries);
2231                                }
2232                            }
2233                        }
2234                    }
2235                    self.indices.push(idx);
2236                }
2237                RebuildKind::GinTrgm => {
2238                    let mut idx = Index::new_gin_trgm(name, column_position);
2239                    if let IndexKind::GinTrgm(map) = &mut idx.kind {
2240                        for (i, row) in self.rows.iter().enumerate() {
2241                            if let Value::Text(s) = &row.values[column_position] {
2242                                for tri in trgm::extract_trigrams(s) {
2243                                    let mut entries = map.get(&tri).cloned().unwrap_or_default();
2244                                    entries.push(RowLocator::Hot(i));
2245                                    map.insert_mut(tri, entries);
2246                                }
2247                            }
2248                        }
2249                    }
2250                    self.indices.push(idx);
2251                }
2252            }
2253        }
2254
2255        // Re-attach preserved cold locators after the from-rows
2256        // rebuild. `register_cold_locators` handles the per-key
2257        // entries-vec append; no key collisions arise because the
2258        // rebuild loop above produced only Hot locators.
2259        for (idx_name, locators) in preserved_cold {
2260            // Errors here would only fire if the index disappeared
2261            // between snapshot and rebuild, which can't happen
2262            // because the rebuild restores the same descriptor set.
2263            let _ = self.register_cold_locators(&idx_name, locators);
2264        }
2265        // v7.12.3 — same for GIN posting-list cold locators.
2266        for (idx_name, locators) in preserved_gin_cold {
2267            let _ = self.register_gin_cold_locators(&idx_name, locators);
2268        }
2269    }
2270
2271    fn add_nsw_index_inner(
2272        &mut self,
2273        name: String,
2274        column_name: &str,
2275        m: usize,
2276        restore: Option<NswGraph>,
2277    ) -> Result<(), StorageError> {
2278        if self.indices.iter().any(|i| i.name == name) {
2279            return Err(StorageError::DuplicateIndex { name });
2280        }
2281        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2282            StorageError::ColumnNotFound {
2283                column: column_name.into(),
2284            }
2285        })?;
2286        if !matches!(
2287            self.schema.columns[column_position].ty,
2288            DataType::Vector { .. }
2289        ) {
2290            return Err(StorageError::TypeMismatch {
2291                column: column_name.into(),
2292                expected: DataType::Vector {
2293                    dim: 0,
2294                    encoding: VecEncoding::F32,
2295                },
2296                actual: self.schema.columns[column_position].ty,
2297                position: column_position,
2298            });
2299        }
2300        if let Some(graph) = restore {
2301            self.indices.push(Index {
2302                name,
2303                column_position,
2304                kind: IndexKind::Nsw(graph),
2305                included_columns: Vec::new(),
2306                partial_predicate: None,
2307                expression: None,
2308                is_unique: false,
2309                extra_column_positions: Vec::new(),
2310            });
2311            return Ok(());
2312        }
2313        let idx = Index::new_nsw(name, column_position, m);
2314        self.indices.push(idx);
2315        let idx_pos = self.indices.len() - 1;
2316        // Bulk-build by walking the existing rows in order — each insert
2317        // sees the partial graph and links into it.
2318        let row_indices: Vec<usize> = (0..self.rows.len()).collect();
2319        for row_idx in row_indices {
2320            nsw_insert_at(self, idx_pos, row_idx);
2321        }
2322        Ok(())
2323    }
2324}
2325
2326/// v6.0.4 — re-encode a single cell to the target `VecEncoding`.
2327/// Used by `Table::rebuild_nsw_index` when ALTER INDEX REBUILD
2328/// includes the optional `WITH (encoding = …)` clause. Round-trip
2329/// goes through f32: `current → Vec<f32> → target`, leaving NULL
2330/// cells untouched. Returns `Unsupported` on a non-vector cell —
2331/// the caller should have rejected the schema before reaching this.
2332fn recode_vector_cell(cell: Value, target: VecEncoding) -> Result<Value, StorageError> {
2333    if matches!(cell, Value::Null) {
2334        return Ok(cell);
2335    }
2336    // Step 1 — extract the f32 representation of the source cell.
2337    let as_f32: Vec<f32> = match &cell {
2338        Value::Vector(v) => v.clone(),
2339        Value::Sq8Vector(q) => quantize::dequantize(q),
2340        Value::HalfVector(h) => h.to_f32_vec(),
2341        other => {
2342            return Err(StorageError::Unsupported(format!(
2343                "ALTER INDEX REBUILD: cannot recode non-vector cell {:?}",
2344                other.data_type()
2345            )));
2346        }
2347    };
2348    // Step 2 — encode into the target shape. `F32` is the identity
2349    // path (saves one alloc round-trip when the source is already
2350    // F32 — but `Value::Vector(as_f32)` is the right answer
2351    // regardless).
2352    Ok(match target {
2353        VecEncoding::F32 => Value::Vector(as_f32),
2354        VecEncoding::Sq8 => Value::Sq8Vector(quantize::quantize(&as_f32)),
2355        VecEncoding::F16 => Value::HalfVector(halfvec::HalfVector::from_f32_slice(&as_f32)),
2356    })
2357}
2358
2359/// Insert one row into the HNSW graph held by index slot `idx_pos`.
2360/// No-op when the row's value at the indexed column isn't a vector.
2361/// v6.0.1: handles `Value::Sq8Vector` by dequantising into an f32
2362/// "query" surface — the existing greedy + beam-search machinery
2363/// then uses `cell_to_query_metric_distance` to route every
2364/// distance call through the cell's actual encoding.
2365fn nsw_insert_at(table: &mut Table, idx_pos: usize, new_row_idx: usize) {
2366    let col_pos = table.indices[idx_pos].column_position;
2367    let cell_dim: Option<usize> = match &table.rows[new_row_idx].values[col_pos] {
2368        Value::Vector(v) => Some(v.len()),
2369        Value::Sq8Vector(q) => Some(q.bytes.len()),
2370        Value::HalfVector(h) => Some(h.dim()),
2371        _ => None,
2372    };
2373    let Some(dim) = cell_dim else {
2374        // Even non-vector rows occupy a level slot so per-node Vec
2375        // lengths stay aligned with `table.rows.len()`.
2376        ensure_node_slot(table, idx_pos, new_row_idx, 0);
2377        return;
2378    };
2379    if dim == 0 {
2380        ensure_node_slot(table, idx_pos, new_row_idx, 0);
2381        return;
2382    }
2383    let level = nsw_assign_level(new_row_idx);
2384    ensure_node_slot(table, idx_pos, new_row_idx, level);
2385    let (entry, entry_level, m) = match &table.indices[idx_pos].kind {
2386        IndexKind::Nsw(g) => (g.entry, g.entry_level, g.m),
2387        IndexKind::BTree(_)
2388        | IndexKind::Brin { .. }
2389        | IndexKind::Gin(_)
2390        | IndexKind::GinTrgm(_) => {
2391            unreachable!("nsw_insert_at on a non-NSW index")
2392        }
2393    };
2394    // First node ever — declare it the entry (it gets its own level).
2395    if entry.is_none() {
2396        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
2397            g.entry = Some(new_row_idx);
2398            g.entry_level = level;
2399            *g.levels
2400                .get_mut(new_row_idx)
2401                .expect("levels slot padded by ensure_node_slot") = level;
2402        }
2403        return;
2404    }
2405    // Set the node's recorded level.
2406    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
2407        *g.levels
2408            .get_mut(new_row_idx)
2409            .expect("levels slot padded by ensure_node_slot") = level;
2410    }
2411    let query = match &table.rows[new_row_idx].values[col_pos] {
2412        Value::Vector(v) => v.clone(),
2413        // v6.0.1: dequantise the inserted SQ8 cell into an f32 query
2414        // surface so the existing greedy / beam machinery can route
2415        // distances through `cell_to_query_metric_distance`. The
2416        // small dequantisation error is what the recall@10 ≥ 0.95
2417        // envelope already accounts for (V6_DESIGN deliberation #3).
2418        Value::Sq8Vector(q) => quantize::dequantize(q),
2419        // v6.0.3: halfvec dequant is bit-exact at the storage layer,
2420        // so the inserted query is a faithful representation.
2421        Value::HalfVector(h) => h.to_f32_vec(),
2422        _ => return,
2423    };
2424    // Phase 1: greedy descend from `entry` down to `level + 1`, keeping
2425    // exactly one current best so the next layer starts from it.
2426    let mut current = entry.expect("entry was Some above");
2427    let mut current_d = vec_l2_sq(table, col_pos, current, &query);
2428    if entry_level > level {
2429        for layer in (level + 1..=entry_level).rev() {
2430            (current, current_d) =
2431                greedy_layer_walk(table, idx_pos, layer, current, current_d, &query);
2432        }
2433    }
2434    // Phase 2: from `min(level, entry_level)` down to 0, beam-search
2435    // `ef_construction` candidates, run the HNSW §4 heuristic neighbour
2436    // selection over them, and connect bidirectionally.
2437    let top = level.min(entry_level);
2438    let ef = (m * 2).max(8);
2439    for layer in (0..=top).rev() {
2440        let cap = if layer == 0 { m * 2 } else { m };
2441        let mut candidates = layer_beam_search(
2442            table,
2443            idx_pos,
2444            layer,
2445            current,
2446            current_d,
2447            &query,
2448            ef,
2449            NswMetric::L2,
2450        );
2451        candidates.retain(|&(_, n)| n != new_row_idx);
2452        // Take the closest as the entry for the next layer down — done
2453        // before heuristic narrowing because the heuristic can reorder.
2454        if let Some(&(d, n)) = candidates.first() {
2455            current = n;
2456            current_d = d;
2457        }
2458        let peers = select_neighbours_heuristic(&candidates, cap, table, col_pos);
2459        connect_at_layer(table, idx_pos, layer, new_row_idx, &peers);
2460    }
2461    // Phase 3: if the new node climbed above the current entry, take
2462    // over as entry so future inserts/searches start from the new top.
2463    if level > entry_level
2464        && let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
2465    {
2466        g.entry = Some(new_row_idx);
2467        g.entry_level = level;
2468    }
2469}
2470
2471/// Make sure `layers[*][new_row_idx]` and `levels[new_row_idx]` exist,
2472/// padding with empty/zero entries as needed. Also grows `layers` to
2473/// accommodate the node's top `level`.
2474fn ensure_node_slot(table: &mut Table, idx_pos: usize, new_row_idx: usize, level: u8) {
2475    let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind else {
2476        unreachable!("ensure_node_slot on a BTree index");
2477    };
2478    while g.layers.len() <= level as usize {
2479        g.layers.push(PersistentVec::new());
2480    }
2481    while g.levels.len() <= new_row_idx {
2482        g.levels.push_mut(0);
2483    }
2484    for layer_vec in &mut g.layers {
2485        while layer_vec.len() <= new_row_idx {
2486            layer_vec.push_mut(Vec::new());
2487        }
2488    }
2489}
2490
2491/// Single-step greedy walk on one layer: from `current` (with cached
2492/// distance `current_d`), inspect that node's neighbours at `layer` and
2493/// hop to the closest if it beats `current_d`. Repeat until no move
2494/// improves the distance. Cheap variant of beam-search used for the
2495/// "descend" phase that only needs one survivor per layer.
2496fn greedy_layer_walk(
2497    table: &Table,
2498    idx_pos: usize,
2499    layer: u8,
2500    mut current: usize,
2501    mut current_d: f32,
2502    query: &[f32],
2503) -> (usize, f32) {
2504    let g = match &table.indices[idx_pos].kind {
2505        IndexKind::Nsw(g) => g,
2506        IndexKind::BTree(_)
2507        | IndexKind::Brin { .. }
2508        | IndexKind::Gin(_)
2509        | IndexKind::GinTrgm(_) => {
2510            return (current, current_d);
2511        }
2512    };
2513    let col_pos = table.indices[idx_pos].column_position;
2514    loop {
2515        let neighbours: &[u32] = g
2516            .layers
2517            .get(layer as usize)
2518            .and_then(|layer_v| layer_v.get(current))
2519            .map_or(&[][..], Vec::as_slice);
2520        let mut best = current;
2521        let mut best_d = current_d;
2522        for &n in neighbours {
2523            let n = n as usize;
2524            let d = vec_l2_sq(table, col_pos, n, query);
2525            if d < best_d {
2526                best = n;
2527                best_d = d;
2528            }
2529        }
2530        if best == current {
2531            return (current, current_d);
2532        }
2533        current = best;
2534        current_d = best_d;
2535    }
2536}
2537
2538/// Beam search on one layer starting from `entry_node` with cached
2539/// `entry_d`. Returns the top `ef` candidates in ascending-distance
2540/// order. Caller picks the closest as the next layer's entry and / or
2541/// trims to M for connection.
2542///
2543/// v3.0.1: uses two `BinaryHeap`s (min-heap for the open frontier,
2544/// max-heap for the working top-`ef` results) and a `Vec<bool>` visited
2545/// bitmap, replacing the v2.x `Vec` + `partition_point` + `BTreeSet`
2546/// implementation. Same algorithm shape (HNSW search algorithm 2 from
2547/// the paper); the data-structure swap cuts per-visit cost from
2548/// `O(ef + log row_count)` to amortised `O(log ef)`.
2549#[allow(clippy::too_many_arguments)] // Beam search threads layer, entry, query, ef, metric — each is intrinsic. Bundling them into a config struct hides the call sites.
2550fn layer_beam_search(
2551    table: &Table,
2552    idx_pos: usize,
2553    layer: u8,
2554    entry_node: usize,
2555    entry_d: f32,
2556    query: &[f32],
2557    ef: usize,
2558    metric: NswMetric,
2559) -> Vec<(f32, usize)> {
2560    let g = match &table.indices[idx_pos].kind {
2561        IndexKind::Nsw(g) => g,
2562        IndexKind::BTree(_)
2563        | IndexKind::Brin { .. }
2564        | IndexKind::Gin(_)
2565        | IndexKind::GinTrgm(_) => return Vec::new(),
2566    };
2567    let col_pos = table.indices[idx_pos].column_position;
2568    let d0 = if matches!(metric, NswMetric::L2) {
2569        entry_d
2570    } else {
2571        cell_to_query_metric_distance(table, col_pos, entry_node, query, metric)
2572    };
2573    let row_count = table.rows.len();
2574    let mut visited: Vec<bool> = alloc::vec![false; row_count];
2575    if entry_node < row_count {
2576        visited[entry_node] = true;
2577    }
2578    // candidates: min-heap by distance (Closest wrapper) — frontier
2579    // results:    max-heap by distance (Furthest wrapper) — top-ef working set
2580    let mut candidates: alloc::collections::BinaryHeap<NodeClosest> =
2581        alloc::collections::BinaryHeap::with_capacity(ef);
2582    let mut results: alloc::collections::BinaryHeap<NodeFurthest> =
2583        alloc::collections::BinaryHeap::with_capacity(ef);
2584    candidates.push(NodeClosest {
2585        dist: d0,
2586        node: entry_node,
2587    });
2588    results.push(NodeFurthest {
2589        dist: d0,
2590        node: entry_node,
2591    });
2592    while let Some(cur) = candidates.pop() {
2593        let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
2594        if cur.dist > worst && results.len() >= ef {
2595            break;
2596        }
2597        let neighbours: &[u32] = g
2598            .layers
2599            .get(layer as usize)
2600            .and_then(|layer_v| layer_v.get(cur.node))
2601            .map_or(&[][..], Vec::as_slice);
2602        for &n in neighbours {
2603            let n = n as usize;
2604            if n >= row_count || visited[n] {
2605                continue;
2606            }
2607            visited[n] = true;
2608            // v6.0.1: cell-aware distance — F32 cells take the
2609            // existing scalar metric, SQ8 cells route through
2610            // the asymmetric ADC variant for the same metric.
2611            let dn = cell_to_query_metric_distance(table, col_pos, n, query, metric);
2612            if !dn.is_finite() {
2613                continue;
2614            }
2615            let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
2616            if results.len() < ef || dn < worst {
2617                results.push(NodeFurthest { dist: dn, node: n });
2618                if results.len() > ef {
2619                    results.pop();
2620                }
2621                candidates.push(NodeClosest { dist: dn, node: n });
2622            }
2623        }
2624    }
2625    // Drain results (max-heap order) and re-sort ascending so callers
2626    // can take `closest = result[0]` without flipping.
2627    let mut out: Vec<(f32, usize)> = results.into_iter().map(|c| (c.dist, c.node)).collect();
2628    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
2629    out
2630}
2631
/// Min-heap wrapper: smaller `dist` → higher priority in a `BinaryHeap`
/// (which is a max-heap), so the comparison is flipped.
///
/// The ordering is total and consistent with equality:
/// * NaN is treated as the largest possible distance whatever its sign
///   bit, so it always has the lowest priority;
/// * equal distances are tie-broken on `node` (the smaller index wins),
///   which keeps `cmp == Equal` in agreement with `==` and makes the
///   pop order deterministic.
#[derive(Debug, Clone, Copy)]
struct NodeClosest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeClosest {
    fn eq(&self, other: &Self) -> bool {
        // Derived from `cmp` so `Eq` stays reflexive even for NaN.
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeClosest {}
impl PartialOrd for NodeClosest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeClosest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // Ascending order by (distance, node), with NaN sorted after
        // every real distance: `partial_cmp` only fails when at least
        // one side is NaN, and `is_nan` then ranks the NaN side higher.
        let ascending = self
            .dist
            .partial_cmp(&other.dist)
            .unwrap_or_else(|| self.dist.is_nan().cmp(&other.dist.is_nan()))
            .then_with(|| self.node.cmp(&other.node));
        // Reversed: smaller dist = greater priority.
        ascending.reverse()
    }
}
2660
/// Max-heap wrapper: larger `dist` sits at the top so the worst result
/// can be evicted in O(log n) when a better candidate arrives.
///
/// The ordering is total and consistent with equality:
/// * NaN is treated as the largest possible distance whatever its sign
///   bit, so it is the first entry to be evicted;
/// * equal distances are tie-broken on `node` (the larger index sits
///   higher), which keeps `cmp == Equal` in agreement with `==` and
///   makes the eviction order deterministic.
#[derive(Debug, Clone, Copy)]
struct NodeFurthest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeFurthest {
    fn eq(&self, other: &Self) -> bool {
        // Derived from `cmp` so `Eq` stays reflexive even for NaN.
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeFurthest {}
impl PartialOrd for NodeFurthest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeFurthest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // Ascending order by (distance, node), with NaN sorted after
        // every real distance: `partial_cmp` only fails when at least
        // one side is NaN, and `is_nan` then ranks the NaN side higher.
        self.dist
            .partial_cmp(&other.dist)
            .unwrap_or_else(|| self.dist.is_nan().cmp(&other.dist.is_nan()))
            .then_with(|| self.node.cmp(&other.node))
    }
}
2686
2687/// HNSW paper §4 algorithm 4: pick `m` neighbours from `candidates` so
2688/// that each chosen point isn't already covered by a closer chosen
2689/// point. Improves graph diversity → fewer hops needed at search time.
2690///
2691/// `candidates` arrives sorted ascending by distance-to-query. We walk
2692/// it in order, keeping a candidate only when no already-chosen point
2693/// is closer to it than the query is. Result is a vector of row
2694/// indices (length ≤ `m`).
2695fn select_neighbours_heuristic(
2696    candidates: &[(f32, usize)],
2697    m: usize,
2698    table: &Table,
2699    col_pos: usize,
2700) -> Vec<usize> {
2701    let mut chosen: Vec<usize> = Vec::with_capacity(m);
2702    for &(d_q, e) in candidates {
2703        if chosen.len() >= m {
2704            break;
2705        }
2706        // v6.0.1: works on either `Value::Vector` (F32) or
2707        // `Value::Sq8Vector` (Sq8) cells — `cell_l2_sq` dispatches
2708        // on encoding. A non-vector cell yields `f32::INFINITY`
2709        // which the `< d_q` test will never accept.
2710        if !matches!(
2711            table.rows.get(e).and_then(|r| r.values.get(col_pos)),
2712            Some(Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_))
2713        ) {
2714            continue;
2715        }
2716        let mut covered = false;
2717        for &r in &chosen {
2718            // dist(e, r) measured in the same metric the topology was
2719            // built with (L2). If a chosen `r` is closer to `e` than
2720            // the query is, `r` already "covers" `e` for navigation.
2721            if cell_l2_sq(table, col_pos, e, r) < d_q {
2722                covered = true;
2723                break;
2724            }
2725        }
2726        if !covered {
2727            chosen.push(e);
2728        }
2729    }
2730    chosen
2731}
2732
2733/// Bidirectionally connect `new_row_idx` to each of `peers` at `layer`,
2734/// trimming each endpoint's adjacency to that layer's degree cap by
2735/// keeping only the closest neighbours.
2736fn connect_at_layer(
2737    table: &mut Table,
2738    idx_pos: usize,
2739    layer: u8,
2740    new_row_idx: usize,
2741    peers: &[usize],
2742) {
2743    let col_pos = table.indices[idx_pos].column_position;
2744    let cap = match &table.indices[idx_pos].kind {
2745        IndexKind::Nsw(g) => g.cap_for_layer(layer),
2746        IndexKind::BTree(_)
2747        | IndexKind::Brin { .. }
2748        | IndexKind::Gin(_)
2749        | IndexKind::GinTrgm(_) => return,
2750    };
2751    // v6.1.x: NSW adjacency stores neighbour row indices as u32 (4 B
2752    // each) rather than usize (8 B on 64-bit). Boundary casts here
2753    // assert the row count fits in u32 — the catalog already enforces
2754    // ≤ 4G rows per table, so the conversion can't lose data.
2755    let new_row_u32 = u32::try_from(new_row_idx).expect("row index fits in u32");
2756    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
2757        let layer_v = &mut g.layers[layer as usize];
2758        if let Some(slot) = layer_v.get_mut(new_row_idx) {
2759            *slot = peers
2760                .iter()
2761                .map(|&p| u32::try_from(p).expect("row index fits in u32"))
2762                .collect();
2763        }
2764    }
2765    for &peer in peers {
2766        // Skip peers whose indexed cell isn't a vector — same fence
2767        // as the F32 path; SQ8 cells flow through `cell_l2_sq`
2768        // below without dequantising.
2769        if !matches!(
2770            &table.rows[peer].values[col_pos],
2771            Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_)
2772        ) {
2773            continue;
2774        }
2775        // 1. add the new node to peer's adjacency
2776        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
2777            let layer_v = &mut g.layers[layer as usize];
2778            if let Some(slot) = layer_v.get_mut(peer)
2779                && !slot.contains(&new_row_u32)
2780            {
2781                slot.push(new_row_u32);
2782            }
2783        }
2784        // 2. if peer is over budget, rebuild its adjacency with the
2785        //    HNSW §4 heuristic — same diversity criterion as the
2786        //    insert path so connectivity stays consistent.
2787        let needs_trim = match &table.indices[idx_pos].kind {
2788            IndexKind::Nsw(g) => g.layers[layer as usize][peer].len() > cap,
2789            IndexKind::BTree(_)
2790            | IndexKind::Brin { .. }
2791            | IndexKind::Gin(_)
2792            | IndexKind::GinTrgm(_) => false,
2793        };
2794        if needs_trim {
2795            let current_peers: Vec<usize> = match &table.indices[idx_pos].kind {
2796                IndexKind::Nsw(g) => g.layers[layer as usize][peer]
2797                    .iter()
2798                    .map(|&n| n as usize)
2799                    .collect(),
2800                IndexKind::BTree(_)
2801                | IndexKind::Brin { .. }
2802                | IndexKind::Gin(_)
2803                | IndexKind::GinTrgm(_) => continue,
2804            };
2805            // Sort by distance from `peer`'s cell ascending so the
2806            // heuristic receives candidates closest-first. `cell_l2_sq`
2807            // dispatches on encoding so SQ8 columns trim using
2808            // symmetric ADC.
2809            let mut tagged: Vec<(f32, usize)> = current_peers
2810                .iter()
2811                .map(|&p| (cell_l2_sq(table, col_pos, peer, p), p))
2812                .collect();
2813            tagged.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
2814            let kept = select_neighbours_heuristic(&tagged, cap, table, col_pos);
2815            if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
2816                && let Some(slot) = g.layers[layer as usize].get_mut(peer)
2817            {
2818                *slot = kept
2819                    .into_iter()
2820                    .map(|p| u32::try_from(p).expect("row index fits in u32"))
2821                    .collect();
2822            }
2823        }
2824    }
2825}
2826
2827/// Squared L2 distance from `query` (raw f32) to the cell at
2828/// `(row, col_pos)`. Dispatches on cell encoding: `Value::Vector`
2829/// (F32) uses `l2_distance_sq`; `Value::Sq8Vector` uses
2830/// `sq8_l2_distance_sq_asymmetric` (the v6.0.1 quantised path).
2831/// Returns `f32::INFINITY` for any non-vector cell so callers can
2832/// compare uniformly.
2833fn vec_l2_sq(table: &Table, col_pos: usize, row: usize, query: &[f32]) -> f32 {
2834    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
2835        Some(Value::Vector(v)) if v.len() == query.len() => l2_distance_sq(v, query),
2836        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => {
2837            quantize::sq8_l2_distance_sq_asymmetric(q, query)
2838        }
2839        // v6.0.6: halfvec → fused NEON SIMD kernel; no Vec<f32>
2840        // allocation. v6.0.3 used `to_f32_vec()` + f32 NEON which
2841        // was correct but allocated per call (5× slower than F32).
2842        Some(Value::HalfVector(h)) if h.dim() == query.len() => {
2843            halfvec::half_l2_distance_sq_asymmetric(h, query)
2844        }
2845        _ => f32::INFINITY,
2846    }
2847}
2848
2849/// Squared L2 distance between two stored cells (no f32 query in
2850/// sight). Used during HNSW graph build — both endpoints are
2851/// rows already in the table, so symmetric ADC applies for SQ8
2852/// columns. Mixed-encoding cells within one column are a
2853/// schema-level impossibility (INSERT-time coercion enforces
2854/// uniform encoding), so the catch-all is an abort.
2855fn cell_l2_sq(table: &Table, col_pos: usize, row_a: usize, row_b: usize) -> f32 {
2856    let Some(cell_a) = table.rows.get(row_a).and_then(|r| r.values.get(col_pos)) else {
2857        return f32::INFINITY;
2858    };
2859    let Some(cell_b) = table.rows.get(row_b).and_then(|r| r.values.get(col_pos)) else {
2860        return f32::INFINITY;
2861    };
2862    match (cell_a, cell_b) {
2863        (Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => l2_distance_sq(a, b),
2864        (Value::Sq8Vector(a), Value::Sq8Vector(b)) if a.bytes.len() == b.bytes.len() => {
2865            quantize::sq8_l2_distance_sq(a, b)
2866        }
2867        // v6.0.6: halfvec symmetric NEON — fused SIMD kernel that
2868        // loads both cells' raw u16 bits, expands to f32 lanes
2869        // inline, FMA-accumulates the squared diff. No Vec<f32>
2870        // allocation per call.
2871        (Value::HalfVector(a), Value::HalfVector(b)) if a.dim() == b.dim() => {
2872            halfvec::half_l2_distance_sq(a, b)
2873        }
2874        _ => f32::INFINITY,
2875    }
2876}
2877
2878/// kNN-search-time distance: stored cell → f32 query under the
2879/// caller's metric. Dispatches on cell encoding so SQ8 columns
2880/// take the ADC path with the right asymmetric variant. NaN /
2881/// dim-mismatch / non-vector → `f32::INFINITY`.
2882fn cell_to_query_metric_distance(
2883    table: &Table,
2884    col_pos: usize,
2885    row: usize,
2886    query: &[f32],
2887    metric: NswMetric,
2888) -> f32 {
2889    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
2890        Some(Value::Vector(v)) if v.len() == query.len() => metric_distance(metric, v, query),
2891        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => match metric {
2892            NswMetric::L2 => quantize::sq8_l2_distance_sq_asymmetric(q, query),
2893            NswMetric::InnerProduct => quantize::sq8_inner_product_asymmetric(q, query),
2894            NswMetric::Cosine => quantize::sq8_cosine_distance_asymmetric(q, query),
2895        },
2896        // v6.0.6: halfvec dispatches by metric to fused NEON
2897        // kernels — no Vec<f32> allocation per call.
2898        Some(Value::HalfVector(h)) if h.dim() == query.len() => match metric {
2899            NswMetric::L2 => halfvec::half_l2_distance_sq_asymmetric(h, query),
2900            NswMetric::InnerProduct => halfvec::half_inner_product_asymmetric(h, query),
2901            NswMetric::Cosine => halfvec::half_cosine_distance_asymmetric(h, query),
2902        },
2903        _ => f32::INFINITY,
2904    }
2905}
2906
/// Distance metric applied when ranking candidates at NSW search time.
///
/// The graph topology is always built with `L2`; querying with
/// `InnerProduct` / `Cosine` walks the same edges but ranks candidates by
/// the chosen metric. For corpus-sized graphs this loses negligible
/// recall compared with building a separate graph per metric.
///
/// All three variants follow the "smaller = closer" convention so
/// results can be sorted ascending regardless of metric.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NswMetric {
    /// Squared Euclidean distance. The square root is monotonic, so it
    /// is skipped when only the ordering matters.
    L2,
    /// Negated dot product, matching the pgvector `<#>` convention so
    /// that "smaller = more similar" holds across all three metrics.
    InnerProduct,
    /// Cosine distance `1 - cos(a, b)`. A zero-norm operand yields
    /// `f32::INFINITY` so it sorts last.
    Cosine,
}
2924
2925/// Multi-layer HNSW kNN search: greedy-descend from the entry to layer 0,
2926/// then beam-search there with the requested `ef` to return the top `k`
2927/// results under the caller-chosen metric. Topology was built with L2 —
2928/// upper-layer descent uses L2 as a coarse heuristic; final beam search
2929/// runs in the requested metric so rankings are correct for `<#>` / `<=>`.
2930fn nsw_search(
2931    table: &Table,
2932    idx_pos: usize,
2933    query: &[f32],
2934    k: usize,
2935    ef: usize,
2936    metric: NswMetric,
2937) -> Vec<(f32, usize)> {
2938    let (entry, entry_level) = match &table.indices[idx_pos].kind {
2939        IndexKind::Nsw(g) => (g.entry, g.entry_level),
2940        IndexKind::BTree(_)
2941        | IndexKind::Brin { .. }
2942        | IndexKind::Gin(_)
2943        | IndexKind::GinTrgm(_) => return Vec::new(),
2944    };
2945    let Some(entry) = entry else {
2946        return Vec::new();
2947    };
2948    let col_pos = table.indices[idx_pos].column_position;
2949    // v6.0.1 step 5: SQ8 columns over-fetch by `SQ8_RERANK_OVER_FETCH`
2950    // so the rerank pass below sees enough candidates to recover
2951    // recall after the ADC re-ordering. F32 + F16 columns skip the
2952    // over-fetch — F32 distances are exact, F16 dequant is
2953    // bit-exact at the storage layer so the beam search already
2954    // ranks under the column's full precision.
2955    let sq8 = matches!(
2956        table.schema.columns.get(col_pos).map(|c| c.ty),
2957        Some(DataType::Vector {
2958            encoding: VecEncoding::Sq8,
2959            ..
2960        })
2961    );
2962    let ef = if sq8 {
2963        ef.max(k).max(k * SQ8_RERANK_OVER_FETCH)
2964    } else {
2965        ef.max(k)
2966    };
2967    // Descend by L2 (the topology metric) so layers prune consistently.
2968    let entry_d = vec_l2_sq(table, col_pos, entry, query);
2969    let mut current = entry;
2970    let mut current_d = entry_d;
2971    for layer in (1..=entry_level).rev() {
2972        (current, current_d) = greedy_layer_walk(table, idx_pos, layer, current, current_d, query);
2973    }
2974    // Final beam search on layer 0 under the caller's metric.
2975    let mut results = layer_beam_search(table, idx_pos, 0, current, current_d, query, ef, metric);
2976    if sq8 {
2977        results = sq8_rerank(table, col_pos, &results, query, metric);
2978    }
2979    results.truncate(k);
2980    results
2981}
2982
2983/// v6.0.1 step 5: re-score ADC top-`K*3` candidates with the
2984/// dequantised cell vs the f32 query, then re-sort. Recovers the
2985/// recall the SQ8 ADC sacrifices for 4× compression — the design's
2986/// "f32 rerank step is on by default" path (deliberation #3).
2987/// `metric` is the same metric the beam search used; the rerank
2988/// arithmetic re-derives the exact distance under that metric.
2989fn sq8_rerank(
2990    table: &Table,
2991    col_pos: usize,
2992    candidates: &[(f32, usize)],
2993    query: &[f32],
2994    metric: NswMetric,
2995) -> Vec<(f32, usize)> {
2996    let mut out: Vec<(f32, usize)> = candidates
2997        .iter()
2998        .filter_map(|&(adc_d, row)| {
2999            let cell = table.rows.get(row).and_then(|r| r.values.get(col_pos))?;
3000            let Value::Sq8Vector(q) = cell else {
3001                // F32 cells shouldn't reach this path (sq8 fence
3002                // above), but stay defensive: pass through with
3003                // the ADC distance unchanged.
3004                return Some((adc_d, row));
3005            };
3006            let deq = quantize::dequantize(q);
3007            if deq.len() != query.len() {
3008                return None;
3009            }
3010            Some((metric_distance(metric, &deq, query), row))
3011        })
3012        .collect();
3013    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
3014    out
3015}
3016
/// Multiplier applied to `k` so the SQ8 rerank pass sees a wider
/// candidate set: `nsw_search` raises the beam width to at least
/// `k * SQ8_RERANK_OVER_FETCH` for SQ8 columns. 3× is the design-stage
/// value; v6.0.5 sweep work can re-tune it once full corpus profiling
/// is in.
const SQ8_RERANK_OVER_FETCH: usize = 3;
3021
3022fn metric_distance(metric: NswMetric, a: &[f32], b: &[f32]) -> f32 {
3023    match metric {
3024        NswMetric::L2 => l2_distance_sq(a, b),
3025        NswMetric::InnerProduct => -inner_product_f32(a, b),
3026        NswMetric::Cosine => {
3027            let (dot, na, nb) = cosine_dot_norms_f32(a, b);
3028            if na == 0.0 || nb == 0.0 {
3029                return f32::INFINITY;
3030            }
3031            // `f32::sqrt` lives in std, so hand-roll Newton-Raphson on
3032            // f64 — same trick the L2 binary op already uses.
3033            let denom = sqrt_newton_f32(na) * sqrt_newton_f32(nb);
3034            1.0 - dot / denom
3035        }
3036    }
3037}
3038
3039/// v6.0.2: dispatch wrapper for the f32 dot product (used by `<#>` +
3040/// the cosine numerator). NEON path when `len % 4 == 0 && len >= 4`,
3041/// scalar fallback otherwise. Returns the positive dot — callers
3042/// negate for the pgvector `<#>` "smaller = closer" convention.
3043///
3044/// Public so perf gates + downstream benches can microbenchmark the
3045/// dispatch directly; not part of the STABILITY contract — internal
3046/// SIMD layout can evolve in any release.
3047#[doc(hidden)]
3048#[inline]
3049pub fn inner_product_f32(a: &[f32], b: &[f32]) -> f32 {
3050    #[cfg(target_arch = "aarch64")]
3051    {
3052        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3053            // SAFETY: NEON is a baseline aarch64 feature; preconditions
3054            // (matching lengths, ≥ 1 full lane group) are checked above.
3055            return unsafe { inner_product_neon(a, b) };
3056        }
3057    }
3058    inner_product_scalar(a, b)
3059}
3060
/// Scalar fallback for the f32 dot product. Pairs are accumulated left
/// to right starting from `0.0`; when the slices differ in length the
/// surplus tail of the longer one is ignored.
fn inner_product_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b.iter())
        .fold(0.0_f32, |acc, (x, y)| acc + x * y)
}
3068
/// NEON dot-product kernel behind `inner_product_f32`.
///
/// Walks the slices in groups of four f32 lanes, fusing multiply and add
/// into two accumulator registers, then sums the lanes horizontally.
/// Elements past the last full group of four in `a` are not accumulated.
///
/// # Safety
///
/// `b` is read through raw pointers at every index that is read from
/// `a`, so `b.len()` must be at least `a.len()`. The dispatcher only
/// calls this with equal lengths that are a non-zero multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn inner_product_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32,
    };
    unsafe {
        // Two parallel accumulators (same trick as L2 NEON) so the
        // FMA dependency chain doesn't serialise.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc0 = zero;
        let mut acc1 = zero;
        let n = a.len();
        let mut i = 0usize;
        // Main loop: eight floats per iteration, four into each accumulator.
        while i + 8 <= n {
            let av0 = vld1q_f32(a.as_ptr().add(i));
            let bv0 = vld1q_f32(b.as_ptr().add(i));
            acc0 = vfmaq_f32(acc0, av0, bv0);
            let av1 = vld1q_f32(a.as_ptr().add(i + 4));
            let bv1 = vld1q_f32(b.as_ptr().add(i + 4));
            acc1 = vfmaq_f32(acc1, av1, bv1);
            i += 8;
        }
        // Tail: a remaining group of four goes into the first accumulator.
        while i + 4 <= n {
            let av = vld1q_f32(a.as_ptr().add(i));
            let bv = vld1q_f32(b.as_ptr().add(i));
            acc0 = vfmaq_f32(acc0, av, bv);
            i += 4;
        }
        // Merge the two accumulators, then reduce the four lanes to a scalar.
        vaddvq_f32(vaddq_f32(acc0, acc1))
    }
}
3102
3103/// v6.0.2: dispatch wrapper for the three accumulators (`dot`, `||a||²`,
3104/// `||b||²`) cosine needs. Same NEON pre-condition as the L2 / IP
3105/// paths; same scalar fallback shape.
3106///
3107/// Public for benchmarking only (see `inner_product_f32`); not in the
3108/// STABILITY contract.
3109#[doc(hidden)]
3110#[inline]
3111pub fn cosine_dot_norms_f32(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
3112    #[cfg(target_arch = "aarch64")]
3113    {
3114        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3115            // SAFETY: see `inner_product_neon`.
3116            return unsafe { cosine_dot_norms_neon(a, b) };
3117        }
3118    }
3119    cosine_dot_norms_scalar(a, b)
3120}
3121
/// Scalar fallback returning `(dot, ‖a‖², ‖b‖²)`. All three sums are
/// accumulated over the zipped pairs, so when the slices differ in
/// length only the common prefix contributes — to the norms as well.
fn cosine_dot_norms_scalar(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    a.iter().zip(b.iter()).fold(
        (0.0_f32, 0.0_f32, 0.0_f32),
        |(dot, norm_a_sq, norm_b_sq), (x, y)| {
            (dot + x * y, norm_a_sq + x * x, norm_b_sq + y * y)
        },
    )
}
3133
/// NEON kernel behind `cosine_dot_norms_f32`: returns
/// `(dot, ‖a‖², ‖b‖²)` accumulated over groups of four f32 lanes.
/// Elements past the last full group of four in `a` are not accumulated.
///
/// # Safety
///
/// `b` is read through raw pointers at every index that is read from
/// `a`, so `b.len()` must be at least `a.len()`. The dispatcher only
/// calls this with equal lengths that are a non-zero multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names, clippy::similar_names)]
unsafe fn cosine_dot_norms_neon(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    use core::arch::aarch64::{float32x4_t, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32};
    unsafe {
        // One accumulator register per output sum.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc_dot = zero;
        let mut acc_na = zero;
        let mut acc_nb = zero;
        let n = a.len();
        let mut i = 0usize;
        while i + 4 <= n {
            // Each pair of loads feeds all three fused multiply-adds.
            let av = vld1q_f32(a.as_ptr().add(i));
            let bv = vld1q_f32(b.as_ptr().add(i));
            acc_dot = vfmaq_f32(acc_dot, av, bv);
            acc_na = vfmaq_f32(acc_na, av, av);
            acc_nb = vfmaq_f32(acc_nb, bv, bv);
            i += 4;
        }
        // Reduce each accumulator's four lanes to a scalar.
        (vaddvq_f32(acc_dot), vaddvq_f32(acc_na), vaddvq_f32(acc_nb))
    }
}
3157
/// Square root for this `no_std` crate (`f32::sqrt` lives in std).
///
/// * `x <= 0.0` (including `-0.0` and negatives) returns `0.0`.
/// * NaN returns NaN; `+∞` returns `+∞`.
/// * Any other input returns `√x` rounded to f32.
///
/// The iteration runs in f64 and is seeded by halving the input's
/// exponent, which lands within a few percent of the true root for every
/// finite positive f32. Newton-Raphson then roughly squares the relative
/// error each step, so five steps are ample. Seeding with `x` itself
/// would not converge in a fixed small number of steps when `x` is far
/// from 1.
#[allow(clippy::cast_possible_truncation)] // narrowing the converged f64 to f32 is the intended final rounding
fn sqrt_newton_f32(x: f32) -> f32 {
    // Half of the bit pattern of 1.0_f64: re-biases the exponent after
    // the whole pattern has been shifted right by one.
    const SEED_BIAS: u64 = 0x1FF8_0000_0000_0000;

    if x <= 0.0 {
        return 0.0;
    }
    if !x.is_finite() {
        // NaN propagates; +∞ is its own square root.
        return x;
    }
    // Every finite positive f32 (subnormals included) is a normal f64,
    // so the bit-level seed below is always well-formed.
    let wide = f64::from(x);
    let mut g = f64::from_bits((wide.to_bits() >> 1) + SEED_BIAS);
    for _ in 0..5 {
        g = 0.5 * (g + wide / g);
    }
    g as f32
}
3168
3169/// Squared Euclidean distance — used for ordering inside NSW (the sqrt
3170/// preserves the order). Caller takes sqrt before reporting back to SQL.
3171///
3172/// v3.3.2: aarch64 NEON path for `len % 4 == 0` (which covers every
3173/// HNSW-indexed VECTOR(N) where N is a multiple of 4 — i.e. all
3174/// production-shaped embeddings: 64, 128, 256, 384, 512, 768, 1024,
3175/// 1536, ...). Other shapes fall back to the scalar loop.
3176#[inline]
3177fn l2_distance_sq(a: &[f32], b: &[f32]) -> f32 {
3178    #[cfg(target_arch = "aarch64")]
3179    {
3180        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3181            // SAFETY: NEON is a baseline aarch64 feature (ARMv8);
3182            // the precondition is checked above (matching lengths,
3183            // multiple of 4, at least one 128-bit lane group).
3184            return unsafe { l2_distance_sq_neon(a, b) };
3185        }
3186    }
3187    l2_distance_sq_scalar(a, b)
3188}
3189
/// Scalar fallback for squared Euclidean distance. Differences are
/// squared and accumulated left to right from `0.0`; when the slices
/// differ in length only the common prefix is compared.
fn l2_distance_sq_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b.iter()).fold(0.0_f32, |acc, (x, y)| {
        let diff = x - y;
        acc + diff * diff
    })
}
3198
/// NEON kernel behind `l2_distance_sq`: sums the squared lane-wise
/// differences of `a` and `b` over groups of four f32 lanes. Elements
/// past the last full group of four in `a` are not accumulated.
///
/// # Safety
///
/// `b` is read through raw pointers at every index that is read from
/// `a`, so `b.len()` must be at least `a.len()`. The dispatcher only
/// calls this with equal lengths that are a non-zero multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn l2_distance_sq_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32, vsubq_f32,
    };
    unsafe {
        // Two independent accumulator registers so the FMA dependency
        // chain doesn't serialise (each FMA depends on prior FMA).
        // Pre-conditions checked by caller: `a.len() == b.len()`,
        // `a.len() % 4 == 0`, `a.len() >= 4`.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc0 = zero;
        let mut acc1 = zero;
        let n = a.len();
        let mut i = 0usize;
        // Process 8 floats per iter when available (two parallel
        // accumulators). Tail of 4 falls into the second loop.
        while i + 8 <= n {
            let d0 = vsubq_f32(vld1q_f32(a.as_ptr().add(i)), vld1q_f32(b.as_ptr().add(i)));
            acc0 = vfmaq_f32(acc0, d0, d0);
            let d1 = vsubq_f32(
                vld1q_f32(a.as_ptr().add(i + 4)),
                vld1q_f32(b.as_ptr().add(i + 4)),
            );
            acc1 = vfmaq_f32(acc1, d1, d1);
            i += 8;
        }
        while i + 4 <= n {
            let d = vsubq_f32(vld1q_f32(a.as_ptr().add(i)), vld1q_f32(b.as_ptr().add(i)));
            acc0 = vfmaq_f32(acc0, d, d);
            i += 4;
        }
        // Merge the two accumulators, then reduce the four lanes to a scalar.
        vaddvq_f32(vaddq_f32(acc0, acc1))
    }
}
3236
3237/// Public wrapper: run an NSW kNN search and return the top-k row
3238/// indices ordered by ascending distance under the given metric.
3239pub fn nsw_query(
3240    table: &Table,
3241    idx_name: &str,
3242    query: &[f32],
3243    k: usize,
3244    metric: NswMetric,
3245) -> Vec<usize> {
3246    let Some(idx_pos) = table.indices.iter().position(|i| i.name == idx_name) else {
3247        return Vec::new();
3248    };
3249    let ef = (k * 2).max(NSW_DEFAULT_M);
3250    let mut hits = nsw_search(table, idx_pos, query, k, ef, metric);
3251    hits.truncate(k);
3252    hits.into_iter().map(|(_, idx)| idx).collect()
3253}
3254
3255/// Find any NSW index on a column. Used by the planner to decide
3256/// whether an `ORDER BY col <-> literal LIMIT k` query can skip the
3257/// brute-force scan.
3258pub fn nsw_index_on(table: &Table, column_position: usize) -> Option<&Index> {
3259    table
3260        .indices
3261        .iter()
3262        .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_)))
3263}
3264
/// Catalog: a `Vec<Table>` for stable iteration / serialization, plus a
/// `BTreeMap<String, usize>` sidecar index so `get` / `get_mut` run in
/// O(log n) instead of the old linear scan with per-element string
/// compares.
///
/// A pure `BTreeMap<String, Table>` was tried in an interim version
/// of v3.1.2 and regressed the single-table catalog benches by ~10%
/// (the per-element `BTreeMap` overhead outweighs the lookup win
/// when n is small). The sidecar shape preserves the insertion-order
/// iteration the on-disk encoding relies on and keeps `last_mut`
/// (used by the deserialize hot path) cheap.
///
/// NOTE(review): `drop_table` uses `swap_remove`, which moves the last
/// table into the vacated slot, so the vector is in strict insertion
/// order only while no table has been dropped — confirm the on-disk
/// encoding tolerates that.
#[derive(Debug, Clone, Default)]
pub struct Catalog {
    /// Table storage; `by_name` maps each table name to its slot here.
    tables: Vec<Table>,
    /// `name → tables[index]`. Kept in lock-step with `tables`;
    /// written by `create_table`, `drop_table` and `rename_table`.
    by_name: BTreeMap<String, usize>,
    /// v5.1: in-memory cold-tier segments. Side-loaded via
    /// [`Catalog::load_segment_bytes`] — they live outside the
    /// catalog snapshot (caller persists them as separate files
    /// and re-loads on boot, until v5.3's `CatalogManifest` makes
    /// that wiring automatic). `RowLocator::Cold { segment_id, .. }`
    /// indexes this `Vec`. Cleared on `Catalog::new` / fresh
    /// `deserialize`.
    ///
    /// `Arc` wrap keeps `Catalog::clone` at O(N segments) bumps
    /// (rather than O(total segment bytes) memcpy) so the v4.42
    /// group-commit pre-image rollback invariant — clone is
    /// effectively free — survives the cold-tier addition.
    ///
    /// v6.7.3 — slots became `Option<…>` so cold-segment compaction
    /// can tombstone merged sources without breaking the
    /// `segment_id = index_into_vec` contract that on-disk
    /// `RowLocator::Cold { segment_id }` already serialized.
    /// `None` slot = the segment was retired by compaction; the
    /// physical file may still be on disk (next CHECKPOINT writes
    /// a manifest that no longer lists it, and the file becomes
    /// an orphan eligible for offline cleanup).
    cold_segments: Vec<Option<Arc<OwnedSegment>>>,
    /// v7.12.4 — user-defined functions (PL/pgSQL + SQL).
    /// Keyed by function name (PG overloading is out of scope).
    /// Bodies are stored as the raw source text the parser saw
    /// between `$$ ... $$`; the engine re-parses on each
    /// invocation. This keeps `spg-storage` free of `spg-sql`
    /// dependency — same pattern as partial-index predicates.
    functions: BTreeMap<String, FunctionDef>,
    /// v7.12.4 — triggers in insertion order. Multiple triggers
    /// per table / event fire in this order. (PG fires them
    /// alphabetically by name; this catalog simply keeps insertion
    /// order for now.)
    triggers: Vec<TriggerDef>,
}
3317
/// v7.12.4 — catalogued user-defined function. `body` is the raw
/// source text between `$$ ... $$`; the engine re-parses it on
/// invocation. This keeps the storage codec stable when the
/// PL/pgSQL surface grows (no breaking-change risk on the disk
/// format).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FunctionDef {
    /// Function name; also the key under which the catalog stores it.
    pub name: String,
    /// Display form of the argument list, e.g.
    /// `"(name TEXT, ts TIMESTAMP)"`. Empty `"()"` for the trigger
    /// function shape. Parser-side canonicalised before storage.
    pub args_repr: String,
    /// Display form of the return type, e.g. `"TRIGGER"` /
    /// `"INT"` / `"SETOF text"`. The engine special-cases
    /// `"TRIGGER"` (case-insensitive) to gate trigger-only
    /// semantics (NEW/OLD).
    pub returns: String,
    /// `LANGUAGE` clause, lowercased. `"plpgsql"` / `"sql"`.
    pub language: String,
    /// Source body of the function. PL/pgSQL: includes the
    /// surrounding `BEGIN ... END;`. SQL: includes the
    /// statement(s). The engine re-parses on invocation; bad
    /// bodies surface as a parse error at CALL time, not CREATE.
    pub body: String,
}
3343
/// v7.12.4 — catalogued trigger. References its function by
/// name; the function must exist at TRIGGER creation time
/// (forward references are deferred to v7.12.5+).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TriggerDef {
    /// Trigger name. Unique per table, not globally: the catalog
    /// identifies a trigger by the `(name, table)` pair.
    pub name: String,
    /// Watched table. Trigger is dropped when the table drops.
    /// NOTE(review): `Catalog::drop_table` does not remove triggers
    /// itself — presumably the engine does; confirm.
    pub table: String,
    /// `"BEFORE"` / `"AFTER"` / `"INSTEAD OF"`. Stored as the
    /// uppercased keyword so deserialised catalogs round-trip
    /// without canonicalisation surprises.
    pub timing: String,
    /// Each entry is one of `"INSERT"` / `"UPDATE"` / `"DELETE"`
    /// / `"TRUNCATE"`. `INSERT OR UPDATE` parses to two entries.
    pub events: Vec<String>,
    /// `"ROW"` / `"STATEMENT"`. v7.12.4 ships `"ROW"` only;
    /// `"STATEMENT"` parses and persists but the executor
    /// refuses it at trigger fire time.
    pub for_each: String,
    /// Name of the PL/pgSQL function to invoke.
    pub function: String,
    /// v7.13.0 — `UPDATE OF col, col, …` column-list filter
    /// (mailrs round-5 G7). Non-empty means the trigger fires
    /// only when at least one of these columns appears in the
    /// UPDATE's SET list. Empty = no column filter. Stored in
    /// catalog FILE_VERSION 23+; older catalogs deserialise with
    /// an empty vec.
    pub update_columns: Vec<String>,
    /// v7.16.1 — whether the trigger fires when its watched
    /// event occurs. Toggled by `ALTER TABLE … { ENABLE |
    /// DISABLE } TRIGGER …`; pg_dump --disable-triggers wraps
    /// every data block with a DISABLE/ENABLE pair so rows
    /// already computed in prod don't get rewritten again.
    /// Defaults to `true` at CREATE TRIGGER time. Stored in
    /// catalog FILE_VERSION 25+; older catalogs deserialise
    /// with `enabled = true`.
    pub enabled: bool,
}
3382
3383impl Catalog {
    /// Creates an empty catalog: no tables, no cold segments, no
    /// user-defined functions and no triggers. `const`, so it can
    /// initialise a `static` / `const` item.
    pub const fn new() -> Self {
        Self {
            tables: Vec::new(),
            by_name: BTreeMap::new(),
            cold_segments: Vec::new(),
            functions: BTreeMap::new(),
            triggers: Vec::new(),
        }
    }
3393
    /// v7.12.4 — read-only view of the catalogued user-defined
    /// functions, keyed by function name. Engine callers look up the
    /// function body here before re-parsing it for invocation.
    pub const fn functions(&self) -> &BTreeMap<String, FunctionDef> {
        &self.functions
    }
3400
3401    /// v7.12.4 — register a new user-defined function. With
3402    /// `or_replace = false`, errors if the name is taken. The
3403    /// engine validates the body before passing it here.
3404    pub fn create_function(
3405        &mut self,
3406        def: FunctionDef,
3407        or_replace: bool,
3408    ) -> Result<(), StorageError> {
3409        if !or_replace && self.functions.contains_key(&def.name) {
3410            return Err(StorageError::Corrupt(format!(
3411                "function {:?} already exists (drop or use CREATE OR REPLACE)",
3412                def.name
3413            )));
3414        }
3415        self.functions.insert(def.name.clone(), def);
3416        Ok(())
3417    }
3418
3419    /// v7.12.4 — remove a user-defined function by name. Returns
3420    /// `true` if a function was removed, `false` if none matched.
3421    /// Caller decides whether to surface `if_exists` semantics.
3422    pub fn drop_function(&mut self, name: &str) -> bool {
3423        self.functions.remove(name).is_some()
3424    }
3425
3426    /// v7.12.4 — read-only slice of all catalogued triggers.
3427    /// Engine row-write paths filter this by (table, event,
3428    /// timing) and fire matches in slice order.
3429    pub fn triggers(&self) -> &[TriggerDef] {
3430        &self.triggers
3431    }
3432
    /// v7.15.0 — mutable handle to the trigger list, for
    /// `ALTER TABLE … RENAME COLUMN`, which rewrites every
    /// `update_columns` entry that referenced the renamed
    /// column. The whole `Vec` is exposed, so callers can also
    /// add, remove or reorder triggers through it.
    pub fn triggers_mut(&mut self) -> &mut Vec<TriggerDef> {
        &mut self.triggers
    }
3440
3441    /// v7.12.4 — register a new trigger. With `or_replace = false`,
3442    /// errors when a trigger with the same name already exists on
3443    /// the same table (PG scoping rule — trigger names are
3444    /// per-table, not global). Trigger function must already
3445    /// exist in the catalog at registration time.
3446    pub fn create_trigger(
3447        &mut self,
3448        def: TriggerDef,
3449        or_replace: bool,
3450    ) -> Result<(), StorageError> {
3451        if !self.by_name.contains_key(&def.table) {
3452            return Err(StorageError::TableNotFound {
3453                name: def.table.clone(),
3454            });
3455        }
3456        if !self.functions.contains_key(&def.function) {
3457            return Err(StorageError::Corrupt(format!(
3458                "trigger {:?} references unknown function {:?}",
3459                def.name, def.function
3460            )));
3461        }
3462        let dup = self
3463            .triggers
3464            .iter()
3465            .position(|t| t.name == def.name && t.table == def.table);
3466        match (dup, or_replace) {
3467            (Some(_), false) => Err(StorageError::Corrupt(format!(
3468                "trigger {:?} already exists on table {:?}",
3469                def.name, def.table
3470            ))),
3471            (Some(i), true) => {
3472                self.triggers[i] = def;
3473                Ok(())
3474            }
3475            (None, _) => {
3476                self.triggers.push(def);
3477                Ok(())
3478            }
3479        }
3480    }
3481
3482    /// v7.12.4 — remove a trigger by `(name, table)`. Returns
3483    /// `true` if one was removed.
3484    pub fn drop_trigger(&mut self, name: &str, table: &str) -> bool {
3485        let before = self.triggers.len();
3486        self.triggers
3487            .retain(|t| !(t.name == name && t.table == table));
3488        before != self.triggers.len()
3489    }
3490
3491    pub fn create_table(&mut self, schema: TableSchema) -> Result<(), StorageError> {
3492        if self.by_name.contains_key(&schema.name) {
3493            return Err(StorageError::DuplicateTable {
3494                name: schema.name.clone(),
3495            });
3496        }
3497        let idx = self.tables.len();
3498        let name = schema.name.clone();
3499        self.tables.push(Table::new(schema));
3500        self.by_name.insert(name, idx);
3501        Ok(())
3502    }
3503
3504    pub fn get(&self, name: &str) -> Option<&Table> {
3505        let idx = *self.by_name.get(name)?;
3506        self.tables.get(idx)
3507    }
3508
3509    pub fn get_mut(&mut self, name: &str) -> Option<&mut Table> {
3510        let idx = *self.by_name.get(name)?;
3511        self.tables.get_mut(idx)
3512    }
3513
    /// Number of tables currently held in the catalog.
    pub fn table_count(&self) -> usize {
        self.tables.len()
    }
3517
3518    /// v7.14.0 — remove a table by name. Returns `true` when the
3519    /// table existed (and is now gone), `false` when it didn't.
3520    /// Used by `DROP TABLE` from pg_dump / mysqldump preambles
3521    /// where the dump re-creates schema and starts with
3522    /// `DROP TABLE IF EXISTS`.
3523    pub fn drop_table(&mut self, name: &str) -> bool {
3524        let Some(idx) = self.by_name.remove(name) else {
3525            return false;
3526        };
3527        // swap_remove invalidates the trailing index → rebuild
3528        // by_name for affected entries.
3529        self.tables.swap_remove(idx);
3530        // Re-stamp moved table's index slot in by_name.
3531        if idx < self.tables.len() {
3532            let moved_name = self.tables[idx].schema.name.clone();
3533            self.by_name.insert(moved_name, idx);
3534        }
3535        true
3536    }
3537
3538    /// v7.16.2 — rename a table (mailrs round-10 A.5). Updates
3539    /// the schema name, the catalog name → index map, and
3540    /// rewrites every reference dangling at the table name:
3541    ///   * every FK on every OTHER table whose `parent_table`
3542    ///     pointed at the old name now points at the new
3543    ///     name, so FK enforcement keeps working
3544    ///   * every trigger watching the table updates its `table`
3545    ///     field
3546    /// Returns `Ok` on success; `Err(StorageError::TableNotFound)`
3547    /// when the old name isn't in the catalog and
3548    /// `Err(StorageError::DuplicateTable)` when the new name is
3549    /// already taken.
3550    pub fn rename_table(&mut self, old: &str, new: &str) -> Result<(), StorageError> {
3551        if old == new {
3552            return Ok(());
3553        }
3554        if self.by_name.contains_key(new) {
3555            return Err(StorageError::Corrupt(format!(
3556                "rename_table: target name {new:?} already exists"
3557            )));
3558        }
3559        let idx = self
3560            .by_name
3561            .remove(old)
3562            .ok_or_else(|| StorageError::TableNotFound { name: old.into() })?;
3563        self.tables[idx].schema.name = new.to_string();
3564        self.by_name.insert(new.to_string(), idx);
3565        for t in &mut self.tables {
3566            for fk in &mut t.schema.foreign_keys {
3567                if fk.parent_table == old {
3568                    fk.parent_table = new.to_string();
3569                }
3570            }
3571        }
3572        for trig in &mut self.triggers {
3573            if trig.table == old {
3574                trig.table = new.to_string();
3575            }
3576        }
3577        Ok(())
3578    }
3579
3580    /// v7.16.2 — rename an index by name. Walks every table
3581    /// since the index lives on its owning table; updates the
3582    /// name in place. Errors with `IndexNotFound` when no
3583    /// index matches. mailrs round-10 A.5.
3584    pub fn rename_index(&mut self, old: &str, new: &str) -> Result<(), StorageError> {
3585        if old == new {
3586            return Ok(());
3587        }
3588        // Reject the new name if it already exists anywhere.
3589        for t in &self.tables {
3590            if t.indices.iter().any(|i| i.name == new) {
3591                return Err(StorageError::Corrupt(format!(
3592                    "rename_index: target name {new:?} already exists"
3593                )));
3594            }
3595        }
3596        for t in &mut self.tables {
3597            for i in &mut t.indices {
3598                if i.name == old {
3599                    i.name = new.to_string();
3600                    return Ok(());
3601                }
3602            }
3603        }
3604        Err(StorageError::IndexNotFound { name: old.into() })
3605    }
3606
3607    /// v7.14.0 — remove a named index across the catalog.
3608    /// Returns `true` when found + dropped.
3609    pub fn drop_named_index(&mut self, name: &str) -> bool {
3610        for t in &mut self.tables {
3611            let before = t.indices.len();
3612            t.indices.retain(|i| i.name != name);
3613            if t.indices.len() != before {
3614                return true;
3615            }
3616        }
3617        false
3618    }
3619
3620    /// Borrow-free copy of every table's name in catalog order
3621    /// (= insertion order, matching the on-disk encoding).
3622    pub fn table_names(&self) -> Vec<String> {
3623        self.tables.iter().map(|t| t.schema.name.clone()).collect()
3624    }
3625
3626    /// v5.1: register a cold-tier segment that already lives in
3627    /// memory (caller did the file read). Returns the
3628    /// `segment_id` that `RowLocator::Cold { segment_id, .. }`
3629    /// will reference — currently this is just the index into
3630    /// `cold_segments`, but treat it as an opaque token.
3631    ///
3632    /// Storage is `no_std`, so file I/O is the caller's
3633    /// responsibility — `spg-server` reads the file and forwards
3634    /// the bytes here. The bytes stay resident in the catalog
3635    /// for the life of the `Catalog`, parsed only once.
3636    pub fn load_segment_bytes(&mut self, bytes: Vec<u8>) -> Result<u32, StorageError> {
3637        let id = u32::try_from(self.cold_segments.len()).map_err(|_| {
3638            StorageError::Corrupt("cold segment count would exceed u32::MAX".into())
3639        })?;
3640        let seg = OwnedSegment::from_bytes(bytes)
3641            .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
3642        self.cold_segments.push(Some(Arc::new(seg)));
3643        Ok(id)
3644    }
3645
3646    /// v6.7.3 — register a cold-tier segment at a specific id. Used
3647    /// by the spg-server manifest-boot path so segments whose
3648    /// neighbouring ids were retired by compaction still get back
3649    /// the same `segment_id` they had pre-restart (the
3650    /// `RowLocator::Cold { segment_id }` baked into the BTree-index
3651    /// snapshot persists across restart and must continue to
3652    /// resolve).
3653    ///
3654    /// Pads the Vec with `None` slots up to `target_id` if needed.
3655    /// Errors when the target slot is already occupied (would
3656    /// stomp another segment), the parse fails, or `target_id`
3657    /// exceeds `u32::MAX`.
3658    pub fn load_segment_bytes_at(
3659        &mut self,
3660        target_id: u32,
3661        bytes: Vec<u8>,
3662    ) -> Result<(), StorageError> {
3663        let seg = OwnedSegment::from_bytes(bytes)
3664            .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
3665        let idx = target_id as usize;
3666        while self.cold_segments.len() <= idx {
3667            self.cold_segments.push(None);
3668        }
3669        if self.cold_segments[idx].is_some() {
3670            return Err(StorageError::Corrupt(format!(
3671                "load_segment_bytes_at: segment_id {target_id} already occupied"
3672            )));
3673        }
3674        self.cold_segments[idx] = Some(Arc::new(seg));
3675        Ok(())
3676    }
3677
3678    /// v6.7.3 — retire a cold-tier segment slot (compaction-driven).
3679    /// The physical file is the caller's concern (typically kept
3680    /// on disk until the next CHECKPOINT writes a manifest that
3681    /// no longer lists it); this just flips the in-memory slot
3682    /// to `None` so later cold lookups for `segment_id` resolve
3683    /// as "unknown" instead of returning a stale row.
3684    ///
3685    /// No-op when the slot is already `None`. Errors only when
3686    /// `segment_id` is out of bounds.
3687    pub fn tombstone_segment(&mut self, segment_id: u32) -> Result<(), StorageError> {
3688        let idx = segment_id as usize;
3689        if idx >= self.cold_segments.len() {
3690            return Err(StorageError::Corrupt(format!(
3691                "tombstone_segment: segment_id {segment_id} out of bounds (len={})",
3692                self.cold_segments.len()
3693            )));
3694        }
3695        self.cold_segments[idx] = None;
3696        Ok(())
3697    }
3698
3699    /// Number of *active* (non-tombstoned) cold segments.
3700    #[must_use]
3701    pub fn cold_segment_count(&self) -> usize {
3702        self.cold_segments.iter().filter(|s| s.is_some()).count()
3703    }
3704
3705    /// Slot count including tombstones (= the next id the
3706    /// no-arg `load_segment_bytes` would allocate).
3707    #[must_use]
3708    pub fn cold_segment_slot_count(&self) -> usize {
3709        self.cold_segments.len()
3710    }
3711
3712    /// v6.2.7 — list every *active* cold-tier segment id known to
3713    /// this catalog (skips compaction tombstones since v6.7.3).
3714    /// Used by EXPLAIN ANALYZE to annotate scan nodes with the
3715    /// segments they could have walked.
3716    #[must_use]
3717    pub fn cold_segment_ids_global(&self) -> Vec<u32> {
3718        self.cold_segments
3719            .iter()
3720            .enumerate()
3721            .filter_map(|(i, s)| s.as_ref().map(|_| i as u32))
3722            .collect()
3723    }
3724
3725    /// v5.2.1: sum of `Table::hot_bytes` across every table. The v5.2
3726    /// freezer compares this against `SPG_HOT_TIER_BYTES` (parsed at
3727    /// server startup; default 4 GiB) and wakes when the budget is
3728    /// crossed. Pre-freezer (v5.2.1) this is measurement-only — the
3729    /// counter exposes whether the budget is being approached without
3730    /// triggering any demotion.
3731    #[must_use]
3732    pub fn hot_tier_bytes(&self) -> u64 {
3733        self.tables
3734            .iter()
3735            .map(Table::hot_bytes)
3736            .fold(0u64, u64::saturating_add)
3737    }
3738
3739    /// v5.2.2: freeze the **first** `max_rows` rows of `table_name`'s
3740    /// hot tier into a brand-new cold-tier segment. The named `BTree`
3741    /// index supplies the per-row PK (its column must be an integer
3742    /// type — v5.2.2 only supports `IndexKey::Int` PKs, matching the
3743    /// `index_key_as_u64` constraint used by the cold-tier lookup
3744    /// path). On success returns a [`FreezeReport`] with the
3745    /// freshly-allocated segment id, the count of rows that moved,
3746    /// the encoded segment bytes (so the caller can persist them to
3747    /// disk for later reload via `SPG_PRELOAD_COLD_SEGMENT`), and the
3748    /// hot-tier byte delta that was reclaimed.
3749    ///
3750    /// **Semantics**:
3751    /// 1. The first `max_rows` rows (by hot-tier position — same as
3752    ///    insertion order under v4.39 `PersistentVec`) are read.
3753    /// 2. Rows are sorted ascending by PK and serialised into a new
3754    ///    segment via [`encode_segment`].
3755    /// 3. The hot rows are dropped via [`Table::delete_rows`]; the
3756    ///    `rebuild_indices` it triggers regenerates `Hot` locators
3757    ///    for every remaining row (their positions shift down by
3758    ///    `max_rows`). Existing `Cold` locators in this index — from
3759    ///    a previous freeze — are also rebuilt **but with empty
3760    ///    payload** since rebuild reads only `self.rows`; this
3761    ///    routine re-registers them at the end of the call so the
3762    ///    user-visible state preserves all prior cold locators.
3763    /// 4. The new segment is loaded into `self.cold_segments` via
3764    ///    [`Catalog::load_segment_bytes`] (allocating a fresh
3765    ///    `segment_id`). New `Cold` locators are registered on the
3766    ///    named index — one per frozen row.
3767    ///
3768    /// **v5.2.2 limits** (relaxed in later sub-versions):
3769    /// - INSERT-only flow: subsequent UPDATE/DELETE on a frozen row
3770    ///   returns a stale-locator error (no promote-on-write until
3771    ///   v5.2.3).
3772    /// - Single-table scope: callers iterate tables themselves.
3773    /// - All-or-nothing: returns `Err` and leaves catalog unchanged
3774    ///   if any step fails before the atomic swap point.
3775    ///
3776    /// Errors:
3777    /// - [`StorageError::Corrupt`] for missing table/index, non-`BTree`
3778    ///   index, non-integer PK column, `max_rows == 0`, or
3779    ///   `max_rows > row_count`.
3780    /// - The encoder's [`SegmentError`] surfaces as `Corrupt` (the
3781    ///   only realistic source is "a single row is larger than the
3782    ///   page size"; SPG schemas don't hit it in practice).
3783    pub fn freeze_oldest_to_cold(
3784        &mut self,
3785        table_name: &str,
3786        index_name: &str,
3787        max_rows: usize,
3788    ) -> Result<FreezeReport, StorageError> {
3789        // --- validation phase: never mutates ---------------------
3790        if max_rows == 0 {
3791            return Err(StorageError::Corrupt(
3792                "freeze_oldest_to_cold: max_rows must be > 0".into(),
3793            ));
3794        }
3795        let table = self.get(table_name).ok_or_else(|| {
3796            StorageError::Corrupt(format!(
3797                "freeze_oldest_to_cold: table {table_name:?} not found"
3798            ))
3799        })?;
3800        if max_rows > table.rows.len() {
3801            return Err(StorageError::Corrupt(format!(
3802                "freeze_oldest_to_cold: max_rows {max_rows} > row_count {}",
3803                table.rows.len()
3804            )));
3805        }
3806        let idx = table
3807            .indices
3808            .iter()
3809            .find(|i| i.name == index_name)
3810            .ok_or_else(|| {
3811                StorageError::Corrupt(format!(
3812                    "freeze_oldest_to_cold: index {index_name:?} not found on {table_name:?}"
3813                ))
3814            })?;
3815        if !matches!(idx.kind, IndexKind::BTree(_)) {
3816            return Err(StorageError::Corrupt(format!(
3817                "freeze_oldest_to_cold: index {index_name:?} is NSW; only BTree indices may freeze"
3818            )));
3819        }
3820        let column_position = idx.column_position;
3821
3822        // --- segment build phase: reads only --------------------
3823        let schema = table.schema.clone();
3824        let mut to_freeze: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(max_rows);
3825        for row_idx in 0..max_rows {
3826            let row = table.rows.get(row_idx).expect("bounds-checked above");
3827            let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
3828                StorageError::Corrupt(format!(
3829                    "freeze_oldest_to_cold: row {row_idx} has NULL / non-key value in index column"
3830                ))
3831            })?;
3832            let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
3833                StorageError::Corrupt(format!(
3834                    "freeze_oldest_to_cold: index {index_name:?} column type is non-integer; \
3835                     v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
3836                ))
3837            })?;
3838            to_freeze.push((pk_u64, encode_row_body_dense(row, &schema), key));
3839        }
3840        // encode_segment requires ascending u64 keys. Sort by PK
3841        // before encoding; the caller's row-position order is not
3842        // necessarily PK order (e.g. workloads that insert random
3843        // PKs).
3844        to_freeze.sort_by_key(|(k, _, _)| *k);
3845        // Reject duplicate PKs — encode_segment also rejects them
3846        // (`SegmentError::UnsortedKey`), but the resulting error
3847        // message there is misleading. Surface a clearer one.
3848        for w in to_freeze.windows(2) {
3849            if w[0].0 == w[1].0 {
3850                return Err(StorageError::Corrupt(format!(
3851                    "freeze_oldest_to_cold: duplicate PK {} in freeze batch",
3852                    w[0].0
3853                )));
3854            }
3855        }
3856        // Snapshot the (key, locator) pairs that will be registered
3857        // post-swap. Cloning the IndexKey out before the move makes
3858        // the registration loop borrow-free.
3859        let post_swap_keys: Vec<IndexKey> = to_freeze.iter().map(|(_, _, k)| k.clone()).collect();
3860        // Segment encode is now infallible w.r.t. ordering. Map the
3861        // `SegmentError` into a `StorageError::Corrupt` so the
3862        // public surface stays one error type.
3863        let seg_rows: Vec<(u64, Vec<u8>)> = to_freeze
3864            .into_iter()
3865            .map(|(k, body, _)| (k, body))
3866            .collect();
3867        let frozen_rows = seg_rows.len();
3868        let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
3869            .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: encode: {e}")))?;
3870
3871        // --- atomic swap phase: mutations only past this point ---
3872        // v5.2.3 made `Table::rebuild_indices` preserve every Cold
3873        // locator across the per-table rebuild, so `delete_rows`
3874        // below no longer wipes prior-freeze cold entries. The pre-
3875        // v5.2.3 capture-then-re-register that used to live here
3876        // was removed in v5.3.1 — keeping it would double-count
3877        // every prior-frozen key's Cold locator on each subsequent
3878        // freeze.
3879        let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
3880        let positions: Vec<usize> = (0..max_rows).collect();
3881        let t_mut = self
3882            .get_mut(table_name)
3883            .expect("just validated; still present");
3884        let removed = t_mut.delete_rows(&positions);
3885        debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
3886        let bytes_after = t_mut.hot_bytes();
3887        let bytes_freed = bytes_before.saturating_sub(bytes_after);
3888
3889        let segment_id = self
3890            .load_segment_bytes(seg_bytes.clone())
3891            .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: load: {e}")))?;
3892        let new_cold = post_swap_keys.into_iter().map(|k| {
3893            (
3894                k,
3895                RowLocator::Cold {
3896                    segment_id,
3897                    page_offset: 0,
3898                },
3899            )
3900        });
3901        let t_mut = self.get_mut(table_name).expect("still present");
3902        t_mut.register_cold_locators(index_name, new_cold)?;
3903
3904        Ok(FreezeReport {
3905            segment_id,
3906            frozen_rows,
3907            bytes_freed,
3908            segment_bytes: seg_bytes,
3909        })
3910    }
3911
3912    /// v5.1: borrow the cold segment at `segment_id`. Used by the
3913    /// spg-server preload path to enumerate (key, locator) pairs
3914    /// after loading a segment, so it can call
3915    /// [`Table::register_cold_locators`] without re-parsing the
3916    /// bytes.
3917    #[must_use]
3918    pub fn cold_segment(&self, segment_id: u32) -> Option<&OwnedSegment> {
3919        self.cold_segments
3920            .get(segment_id as usize)
3921            .and_then(|s| s.as_deref())
3922    }
3923
3924    /// v5.1: resolve a single `RowLocator::Cold` to its underlying
3925    /// `Row`. Decoupled from [`Catalog::lookup_by_pk`] so callers
3926    /// iterating a multi-locator slice (e.g. the engine's index
3927    /// seek path) can dispatch per locator instead of getting back
3928    /// only the first row for a key. Returns `None` when the
3929    /// segment isn't registered, the key isn't `u64`-coercible, or
3930    /// the segment doesn't actually carry the key (bloom or page-
3931    /// index reject).
3932    pub fn resolve_cold_locator(
3933        &self,
3934        table_name: &str,
3935        segment_id: u32,
3936        key: &IndexKey,
3937    ) -> Option<Row> {
3938        let t = self.get(table_name)?;
3939        let u64_key = index_key_as_u64(key)?;
3940        let seg = self.cold_segments.get(segment_id as usize)?.as_ref()?;
3941        let payload = seg.lookup(u64_key)?;
3942        let (row, _) = decode_row_body_dense(&payload, &t.schema).ok()?;
3943        Some(row)
3944    }
3945
3946    /// v5.1: indexed PK lookup that dispatches per locator,
3947    /// returning the first matching row from either the hot tier
3948    /// (`Table::rows`) or a registered cold segment.
3949    ///
3950    /// The cold path requires the index column to be coercible to
3951    /// a `u64` (the segment's PK type) and the segment payload to
3952    /// be a [`encode_row_body_dense`]-encoded row body for the
3953    /// same schema. v5.1 ships this for BIGINT / INT / SMALLINT
3954    /// PKs; other types fall through to hot-only behavior.
3955    ///
3956    /// Returns `None` if (a) the table or index doesn't exist,
3957    /// (b) the key isn't in the index at all, or (c) the key was
3958    /// resolved to a stale locator (Hot index out of range, Cold
3959    /// segment id unknown, segment lookup miss). Does not surface
3960    /// segment-decode errors — those would indicate corrupted
3961    /// cold-tier files and should be caught at
3962    /// [`Catalog::load_segment_bytes`] time.
3963    pub fn lookup_by_pk(&self, table: &str, index_name: &str, key: &IndexKey) -> Option<Row> {
3964        let t = self.get(table)?;
3965        let idx = t.indices.iter().find(|i| i.name == index_name)?;
3966        let locators = idx.lookup_eq(key);
3967        let cold_u64_key = index_key_as_u64(key);
3968        for loc in locators {
3969            match *loc {
3970                RowLocator::Hot(i) => {
3971                    if let Some(row) = t.rows.get(i) {
3972                        return Some(row.clone());
3973                    }
3974                }
3975                RowLocator::Cold {
3976                    segment_id,
3977                    page_offset: _,
3978                } => {
3979                    let Some(u64_key) = cold_u64_key else {
3980                        // Key type not coercible to u64 — cold tier
3981                        // only handles BIGINT/INT/SMALLINT in v5.1.
3982                        continue;
3983                    };
3984                    let Some(seg) = self
3985                        .cold_segments
3986                        .get(segment_id as usize)
3987                        .and_then(|s| s.as_deref())
3988                    else {
3989                        // v6.7.3 — `None` slot = compaction
3990                        // retired this segment; the live locator
3991                        // on a freshly-compacted index points to
3992                        // the merged segment_id, so a Cold hit
3993                        // here against a tombstone means the BTree
3994                        // entry hasn't been swapped yet (mid-
3995                        // compaction reader race) or the caller is
3996                        // looking up a stale snapshot. Skip — the
3997                        // next locator in the list, if any, is
3998                        // typically the merged segment.
3999                        continue;
4000                    };
4001                    let Some(payload) = seg.lookup(u64_key) else {
4002                        continue;
4003                    };
4004                    let (row, _) = decode_row_body_dense(&payload, &t.schema).ok()?;
4005                    return Some(row);
4006                }
4007            }
4008        }
4009        None
4010    }
4011
4012    /// v5.2.3: promote a frozen row back to the hot tier so an
4013    /// UPDATE / DELETE can mutate it. Reads the cold-tier row body
4014    /// (decoded from its registered segment), pushes it into
4015    /// `table.rows` via [`Table::insert`] (which also adds a fresh
4016    /// `Hot(new_idx)` locator on `index_name`), then retires the
4017    /// shadowed `Cold` locator via
4018    /// [`Table::remove_cold_locators_for_key`]. The cold-tier row
4019    /// in the segment file becomes garbage — recoverable when a
4020    /// future cold-segment compaction job lands.
4021    ///
4022    /// Returns:
4023    /// - `Ok(Some(new_hot_idx))` when the key resolved through a
4024    ///   cold locator and the promote completed. `new_hot_idx` is
4025    ///   the position the row now occupies in `table.rows`.
4026    /// - `Ok(None)` when the key has no Cold locator on the index
4027    ///   (already hot, or wasn't present at all). Callers treat this
4028    ///   as "nothing to do here, fall back to the hot-only path".
4029    ///
4030    /// Errors when the table / index doesn't exist, the index isn't
4031    /// `BTree`, the cold segment is missing / can't decode the row,
4032    /// or the inferred row body fails `Table::insert` validation.
4033    pub fn promote_cold_row(
4034        &mut self,
4035        table_name: &str,
4036        index_name: &str,
4037        key: &IndexKey,
4038    ) -> Result<Option<usize>, StorageError> {
4039        let cold_loc = self.find_cold_locator(table_name, index_name, key)?;
4040        let Some((segment_id, _page_offset)) = cold_loc else {
4041            return Ok(None);
4042        };
4043        let u64_key = index_key_as_u64(key).ok_or_else(|| {
4044            StorageError::Corrupt(
4045                "promote_cold_row: key type not coercible to u64 (cold tier requires integer PK)"
4046                    .into(),
4047            )
4048        })?;
4049        // Read the row body from the segment. Borrow the segment +
4050        // schema short-term so we can then take `&mut self` for the
4051        // hot-side insert.
4052        let schema = self
4053            .get(table_name)
4054            .ok_or_else(|| {
4055                StorageError::Corrupt(format!("promote_cold_row: table {table_name:?} not found"))
4056            })?
4057            .schema
4058            .clone();
4059        let seg = self
4060            .cold_segments
4061            .get(segment_id as usize)
4062            .and_then(|s| s.as_ref())
4063            .ok_or_else(|| {
4064                StorageError::Corrupt(format!(
4065                    "promote_cold_row: segment {segment_id} not registered on catalog"
4066                ))
4067            })?;
4068        let payload = seg.lookup(u64_key).ok_or_else(|| {
4069            StorageError::Corrupt(format!(
4070                "promote_cold_row: key {u64_key} resolves to segment {segment_id} \
4071                 but the segment's bloom/page lookup didn't return a row"
4072            ))
4073        })?;
4074        let (row, _consumed) = decode_row_body_dense(&payload, &schema)?;
4075        // Insert the promoted row into the hot tier. `Table::insert`
4076        // appends to `self.rows`, adds a `Hot(new_idx)` locator to
4077        // every BTree index covering the row's keyed columns, and
4078        // increments `hot_bytes`.
4079        let t = self
4080            .get_mut(table_name)
4081            .expect("table existed at lookup time");
4082        t.insert(row)?;
4083        let new_hot_idx =
4084            t.rows.len().checked_sub(1).ok_or_else(|| {
4085                StorageError::Corrupt("promote_cold_row: empty after insert".into())
4086            })?;
4087        // The hot insert added Hot(new_idx) alongside the still-
4088        // present Cold locator. Drop the Cold entry so future
4089        // lookups return only the fresh hot row.
4090        t.remove_cold_locators_for_key(index_name, key)?;
4091        Ok(Some(new_hot_idx))
4092    }
4093
4094    /// v5.2.3: shadow a frozen row's index entry. Used by DELETE
4095    /// when the row to remove lives in a cold-tier segment — the
4096    /// row body stays in the segment file (becoming garbage) but
4097    /// every `Cold` locator for `key` on `index_name` is removed
4098    /// so PK lookups stop returning it.
4099    ///
4100    /// Returns the number of cold locators retired (0 when the key
4101    /// has no cold entries — the DELETE fell on a hot row or a
4102    /// key that was already absent). Errors when the table /
4103    /// index doesn't exist or the index isn't `BTree`.
4104    ///
4105    /// Cold-segment compaction (which merges shadowed-heavy
4106    /// segments and reclaims their disk footprint) lands in a
4107    /// later v5.x sub-version; until then, repeated UPDATE/DELETE
4108    /// of cold rows can amplify cold-segment disk usage by up to
4109    /// 1-2× — still well under typical LSM-tree shadowing because
4110    /// SPG segments are bulk-baked, not write-merged.
4111    pub fn shadow_cold_row(
4112        &mut self,
4113        table_name: &str,
4114        index_name: &str,
4115        key: &IndexKey,
4116    ) -> Result<usize, StorageError> {
4117        let t = self.get_mut(table_name).ok_or_else(|| {
4118            StorageError::Corrupt(format!("shadow_cold_row: table {table_name:?} not found"))
4119        })?;
4120        t.remove_cold_locators_for_key(index_name, key)
4121    }
4122
4123    /// v6.7.4 — read-only slice preparation for the parallel
4124    /// freezer. Walks rows in `row_range`, builds the
4125    /// `(pk_u64, encoded_body, IndexKey)` triples that the
4126    /// coordinator's k-way merge consumes, sorts the slice by
4127    /// `pk_u64`, and returns a [`FreezeSlice`].
4128    ///
4129    /// Caller invariants:
4130    /// - `row_range.end <= table.rows.len()` (caller's job to
4131    ///   compute the partition).
4132    /// - All slices passed to `commit_freeze_slices` must cover a
4133    ///   contiguous half-open range `[0, total_max_rows)` with no
4134    ///   gaps and no overlaps. The coordinator validates this
4135    ///   invariant before committing.
4136    ///
4137    /// `&self`-only — multiple workers can run this concurrently
4138    /// against the same `Catalog` reference under the engine's
4139    /// write lock (workers don't mutate; the coordinator does).
4140    pub fn prepare_freeze_slice(
4141        &self,
4142        table_name: &str,
4143        index_name: &str,
4144        row_range: core::ops::Range<usize>,
4145    ) -> Result<FreezeSlice, StorageError> {
4146        let table = self.get(table_name).ok_or_else(|| {
4147            StorageError::Corrupt(format!(
4148                "prepare_freeze_slice: table {table_name:?} not found"
4149            ))
4150        })?;
4151        let idx = table
4152            .indices
4153            .iter()
4154            .find(|i| i.name == index_name)
4155            .ok_or_else(|| {
4156                StorageError::Corrupt(format!(
4157                    "prepare_freeze_slice: index {index_name:?} not found on {table_name:?}"
4158                ))
4159            })?;
4160        if !matches!(idx.kind, IndexKind::BTree(_)) {
4161            return Err(StorageError::Corrupt(format!(
4162                "prepare_freeze_slice: index {index_name:?} is NSW; only BTree indices may freeze"
4163            )));
4164        }
4165        if row_range.end > table.rows.len() {
4166            return Err(StorageError::Corrupt(format!(
4167                "prepare_freeze_slice: row_range end {} > row_count {}",
4168                row_range.end,
4169                table.rows.len()
4170            )));
4171        }
4172        let column_position = idx.column_position;
4173        let schema = table.schema.clone();
4174        let mut rows: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(row_range.len());
4175        for row_idx in row_range.clone() {
4176            let row = table.rows.get(row_idx).expect("bounds-checked above");
4177            let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
4178                StorageError::Corrupt(format!(
4179                    "prepare_freeze_slice: row {row_idx} has NULL / non-key value in index column"
4180                ))
4181            })?;
4182            let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
4183                StorageError::Corrupt(format!(
4184                    "prepare_freeze_slice: index {index_name:?} column type is non-integer; \
4185                     v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
4186                ))
4187            })?;
4188            rows.push((pk_u64, encode_row_body_dense(row, &schema), key));
4189        }
4190        rows.sort_by_key(|(k, _, _)| *k);
4191        Ok(FreezeSlice { row_range, rows })
4192    }
4193
4194    /// v6.7.4 — coordinator commit step. Merges N
4195    /// [`FreezeSlice`]s into one segment via the standard
4196    /// [`encode_segment`] path, atomically swaps the catalog
4197    /// state (delete the union row range + register Cold
4198    /// locators + load the segment).
4199    ///
4200    /// Validates that the slices cover a contiguous, gap-free,
4201    /// overlap-free half-open range starting at index 0 (the
4202    /// freezer always freezes "oldest first" — same semantics as
4203    /// the single-threaded [`Catalog::freeze_oldest_to_cold`]).
4204    ///
4205    /// Empty `slices` → no-op success (returns a zero-row report
4206    /// without mutating). Total row count = `Σ slice.rows.len()`.
4207    pub fn commit_freeze_slices(
4208        &mut self,
4209        table_name: &str,
4210        index_name: &str,
4211        slices: Vec<FreezeSlice>,
4212    ) -> Result<FreezeReport, StorageError> {
4213        // --- validation phase: never mutates ---------------------
4214        let table = self.get(table_name).ok_or_else(|| {
4215            StorageError::Corrupt(format!(
4216                "commit_freeze_slices: table {table_name:?} not found"
4217            ))
4218        })?;
4219        let idx = table
4220            .indices
4221            .iter()
4222            .find(|i| i.name == index_name)
4223            .ok_or_else(|| {
4224                StorageError::Corrupt(format!(
4225                    "commit_freeze_slices: index {index_name:?} not found on {table_name:?}"
4226                ))
4227            })?;
4228        if !matches!(idx.kind, IndexKind::BTree(_)) {
4229            return Err(StorageError::Corrupt(format!(
4230                "commit_freeze_slices: index {index_name:?} is NSW; only BTree indices may freeze"
4231            )));
4232        }
4233        // Validate slice coverage: contiguous from 0, no gaps, no
4234        // overlaps. Allow the caller to pass slices in any order —
4235        // sort by row_range.start first.
4236        let mut ordered = slices;
4237        ordered.sort_by_key(|s| s.row_range.start);
4238        // Drop fully-empty slices that fell out of an uneven
4239        // partition; they carry no data but contribute to the
4240        // contiguity check, so keep them in line.
4241        let mut expected_start = 0usize;
4242        for s in &ordered {
4243            if s.row_range.start != expected_start {
4244                return Err(StorageError::Corrupt(format!(
4245                    "commit_freeze_slices: gap/overlap at row {}; expected start {}",
4246                    s.row_range.start, expected_start
4247                )));
4248            }
4249            expected_start = s.row_range.end;
4250        }
4251        let max_rows = expected_start;
4252        if max_rows > table.rows.len() {
4253            return Err(StorageError::Corrupt(format!(
4254                "commit_freeze_slices: total row range {} exceeds row_count {}",
4255                max_rows,
4256                table.rows.len()
4257            )));
4258        }
4259        if max_rows == 0 {
4260            return Ok(FreezeReport {
4261                segment_id: u32::MAX,
4262                frozen_rows: 0,
4263                bytes_freed: 0,
4264                segment_bytes: Vec::new(),
4265            });
4266        }
4267
4268        // --- segment build phase: reads only --------------------
4269        // K-way merge of already-sorted slices. Each slice's rows
4270        // are ascending by pk_u64; we keep a per-slice cursor and
4271        // pull the next-smallest head until every cursor drains.
4272        let total_rows: usize = ordered.iter().map(|s| s.rows.len()).sum();
4273        if total_rows != max_rows {
4274            return Err(StorageError::Corrupt(format!(
4275                "commit_freeze_slices: total slice rows {total_rows} ≠ row_range coverage {max_rows}"
4276            )));
4277        }
4278        let mut cursors: Vec<usize> = alloc::vec![0; ordered.len()];
4279        let mut merged: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(total_rows);
4280        loop {
4281            // Pick the slice whose head row has the smallest key
4282            // and isn't yet exhausted.
4283            let mut pick: Option<usize> = None;
4284            for (i, c) in cursors.iter().enumerate() {
4285                let slice = &ordered[i];
4286                if *c >= slice.rows.len() {
4287                    continue;
4288                }
4289                match pick {
4290                    None => pick = Some(i),
4291                    Some(j) => {
4292                        if slice.rows[*c].0 < ordered[j].rows[cursors[j]].0 {
4293                            pick = Some(i);
4294                        }
4295                    }
4296                }
4297            }
4298            let Some(i) = pick else { break };
4299            let row = ordered[i].rows[cursors[i]].clone();
4300            cursors[i] += 1;
4301            merged.push(row);
4302        }
4303        // Reject duplicate PKs — same error as the single-threaded
4304        // path so callers get a uniform surface.
4305        for w in merged.windows(2) {
4306            if w[0].0 == w[1].0 {
4307                return Err(StorageError::Corrupt(format!(
4308                    "commit_freeze_slices: duplicate PK {} across slices",
4309                    w[0].0
4310                )));
4311            }
4312        }
4313        let post_swap_keys: Vec<IndexKey> = merged.iter().map(|(_, _, k)| k.clone()).collect();
4314        let seg_rows: Vec<(u64, Vec<u8>)> =
4315            merged.into_iter().map(|(k, body, _)| (k, body)).collect();
4316        let frozen_rows = seg_rows.len();
4317        let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
4318            .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: encode: {e}")))?;
4319
4320        // --- atomic swap phase: mutations only past this point ---
4321        let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
4322        let positions: Vec<usize> = (0..max_rows).collect();
4323        let t_mut = self
4324            .get_mut(table_name)
4325            .expect("just validated; still present");
4326        let removed = t_mut.delete_rows(&positions);
4327        debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
4328        let bytes_after = t_mut.hot_bytes();
4329        let bytes_freed = bytes_before.saturating_sub(bytes_after);
4330
4331        let segment_id = self
4332            .load_segment_bytes(seg_bytes.clone())
4333            .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: load: {e}")))?;
4334        let new_cold = post_swap_keys.into_iter().map(|k| {
4335            (
4336                k,
4337                RowLocator::Cold {
4338                    segment_id,
4339                    page_offset: 0,
4340                },
4341            )
4342        });
4343        let t_mut = self.get_mut(table_name).expect("still present");
4344        t_mut.register_cold_locators(index_name, new_cold)?;
4345
4346        Ok(FreezeReport {
4347            segment_id,
4348            frozen_rows,
4349            bytes_freed,
4350            segment_bytes: seg_bytes,
4351        })
4352    }
4353
4354    /// v6.7.3 — compact every cold segment on `(table, index)` whose
4355    /// `OwnedSegment::bytes().len()` is below `target_segment_bytes`
4356    /// into a single larger merged segment. Rows present in source
4357    /// segment payloads but no longer referenced by any
4358    /// `RowLocator::Cold` on the index (DELETE'd + frozen rows
4359    /// retired via [`Catalog::shadow_cold_row`]) are GC'd in the
4360    /// merge.
4361    ///
4362    /// **Semantics**:
4363    /// 1. Walk the BTree index to collect every Cold locator that
4364    ///    targets a small (< threshold) segment. Each such
4365    ///    `(key, segment_id)` becomes a row in the merged segment;
4366    ///    payload is looked up from the source segment in-place.
4367    /// 2. Encode the collected rows into one new segment via
4368    ///    [`encode_segment`]; register it via
4369    ///    [`Catalog::load_segment_bytes`] (allocating a fresh
4370    ///    `merged_segment_id` at the end of `cold_segments`).
4371    /// 3. Rewrite the BTree index in one pass: every
4372    ///    `RowLocator::Cold { segment_id ∈ sources }` becomes
4373    ///    `RowLocator::Cold { segment_id = merged_id, page_offset = 0 }`.
4374    ///    Hot locators are untouched.
4375    /// 4. Tombstone every source slot via
4376    ///    [`Catalog::tombstone_segment`]. Source segment payloads
4377    ///    are no longer reachable through the catalog; the on-disk
4378    ///    files are the caller's concern.
4379    ///
4380    /// On fewer than 2 candidate segments the catalog is **not**
4381    /// mutated and a no-op report (`merged_segment_id: None`,
4382    /// `sources: []`) is returned. This is the routine case — a
4383    /// freshly-frozen table has at most 1 small segment, no merge
4384    /// possible.
4385    ///
4386    /// Atomicity: every mutating step runs after the read-only
4387    /// gather phase, so a panic before the merge encode leaves the
4388    /// catalog unchanged. The mutation block itself (load + rewrite +
4389    /// tombstone) takes only `&mut self` — callers serialise the
4390    /// engine write lock outside this function.
4391    ///
4392    /// Errors when the table / index doesn't exist, the index isn't
4393    /// `BTree`, the index column type isn't u64-coercible (cold-tier
4394    /// pre-condition), or a source segment fails its in-place
4395    /// row-body lookup (would indicate prior catalog corruption).
4396    pub fn compact_cold_segments(
4397        &mut self,
4398        table_name: &str,
4399        index_name: &str,
4400        target_segment_bytes: u64,
4401    ) -> Result<CompactReport, StorageError> {
4402        // --- validation phase ----------------------------------
4403        let t = self.get(table_name).ok_or_else(|| {
4404            StorageError::Corrupt(format!(
4405                "compact_cold_segments: table {table_name:?} not found"
4406            ))
4407        })?;
4408        let idx = t
4409            .indices
4410            .iter()
4411            .find(|i| i.name == index_name)
4412            .ok_or_else(|| {
4413                StorageError::Corrupt(format!(
4414                    "compact_cold_segments: index {index_name:?} not found on {table_name:?}"
4415                ))
4416            })?;
4417        let map = match &idx.kind {
4418            IndexKind::BTree(m) => m,
4419            IndexKind::Nsw(_)
4420            | IndexKind::Brin { .. }
4421            | IndexKind::Gin(_)
4422            | IndexKind::GinTrgm(_) => {
4423                return Err(StorageError::Corrupt(format!(
4424                    "compact_cold_segments: index {index_name:?} is not BTree; \
4425                     compaction applies only to BTree cold-tier indices"
4426                )));
4427            }
4428        };
4429
4430        // --- gather phase --------------------------------------
4431        // Step A: every segment_id this BTree index Cold-references.
4432        let mut referenced_ids: BTreeSet<u32> = BTreeSet::new();
4433        for (_key, locators) in map.iter() {
4434            for loc in locators {
4435                if let RowLocator::Cold { segment_id, .. } = loc {
4436                    referenced_ids.insert(*segment_id);
4437                }
4438            }
4439        }
4440        // Step B: keep only the small + still-active ones.
4441        let candidate_set: BTreeSet<u32> = referenced_ids
4442            .into_iter()
4443            .filter(|id| {
4444                self.cold_segments
4445                    .get(*id as usize)
4446                    .and_then(|s| s.as_deref())
4447                    .is_some_and(|s| (s.bytes().len() as u64) < target_segment_bytes)
4448            })
4449            .collect();
4450        if candidate_set.len() < 2 {
4451            return Ok(CompactReport {
4452                sources: Vec::new(),
4453                merged_segment_id: None,
4454                merged_segment_bytes: Vec::new(),
4455                merged_rows: 0,
4456                deleted_rows_pruned: 0,
4457                bytes_reclaimed_estimate: 0,
4458            });
4459        }
4460        // Step C: pre-count source rows for the deleted-pruned metric.
4461        let mut source_row_count: usize = 0;
4462        let mut source_byte_total: u64 = 0;
4463        for &id in &candidate_set {
4464            let seg = self.cold_segments[id as usize]
4465                .as_ref()
4466                .expect("candidate selected only when slot is Some");
4467            source_row_count = source_row_count.saturating_add(seg.meta().num_rows as usize);
4468            source_byte_total = source_byte_total.saturating_add(seg.bytes().len() as u64);
4469        }
4470        // Step D: collect (key, body) pairs from every live Cold
4471        // locator pointing at a candidate. dedupe by key — one
4472        // BTree key resolves to at most one cold payload (the
4473        // freezer + promote/shadow flow keeps Cold locators
4474        // unique per key).
4475        let mut collected: BTreeMap<u64, (Vec<u8>, IndexKey)> = BTreeMap::new();
4476        for (key, locators) in map.iter() {
4477            for loc in locators {
4478                let RowLocator::Cold { segment_id, .. } = loc else {
4479                    continue;
4480                };
4481                if !candidate_set.contains(segment_id) {
4482                    continue;
4483                }
4484                let u64_key = index_key_as_u64(key).ok_or_else(|| {
4485                    StorageError::Corrupt(format!(
4486                        "compact_cold_segments: index {index_name:?} has non-integer Cold key; \
4487                         cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
4488                    ))
4489                })?;
4490                let seg = self.cold_segments[*segment_id as usize]
4491                    .as_ref()
4492                    .expect("candidate slot guaranteed Some above");
4493                let payload = seg.lookup(u64_key).ok_or_else(|| {
4494                    StorageError::Corrupt(format!(
4495                        "compact_cold_segments: BTree {index_name:?} points key={u64_key} \
4496                         at segment {segment_id} but the segment lookup missed"
4497                    ))
4498                })?;
4499                collected.insert(u64_key, (payload, key.clone()));
4500                break;
4501            }
4502        }
4503        let merged_rows = collected.len();
4504        let deleted_rows_pruned = source_row_count.saturating_sub(merged_rows);
4505
4506        // Step E: encode the merged segment. `BTreeMap<u64, _>`
4507        // iteration is ascending by key, which is what
4508        // `encode_segment` requires.
4509        let seg_rows: Vec<(u64, Vec<u8>)> = collected
4510            .iter()
4511            .map(|(k, (body, _))| (*k, body.clone()))
4512            .collect();
4513        let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
4514            .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: encode: {e}")))?;
4515        let merged_bytes_len = seg_bytes.len() as u64;
4516
4517        // --- atomic mutation phase ------------------------------
4518        let merged_segment_id = self
4519            .load_segment_bytes(seg_bytes.clone())
4520            .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: load: {e}")))?;
4521
4522        // Rewrite the BTree index: every Cold locator pointing at
4523        // a candidate source becomes a Cold locator pointing at
4524        // the merged segment. Use a flat collect-then-replace
4525        // pattern so we never hold a `&self` borrow across the
4526        // `&mut self` write.
4527        let entries: Vec<(IndexKey, Vec<RowLocator>)> = {
4528            let t = self
4529                .get(table_name)
4530                .expect("table existed at the start of this fn");
4531            let idx = t
4532                .indices
4533                .iter()
4534                .find(|i| i.name == index_name)
4535                .expect("index existed at the start of this fn");
4536            let IndexKind::BTree(map) = &idx.kind else {
4537                unreachable!("validated above");
4538            };
4539            map.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
4540        };
4541        let t_mut = self
4542            .get_mut(table_name)
4543            .expect("table existed at the start of this fn");
4544        let idx_mut = t_mut
4545            .indices
4546            .iter_mut()
4547            .find(|i| i.name == index_name)
4548            .expect("index existed at the start of this fn");
4549        let IndexKind::BTree(map_mut) = &mut idx_mut.kind else {
4550            unreachable!("validated above");
4551        };
4552        for (key, locators) in entries {
4553            let mut new_locs: Vec<RowLocator> = Vec::with_capacity(locators.len());
4554            let mut changed = false;
4555            for loc in &locators {
4556                match *loc {
4557                    RowLocator::Cold {
4558                        segment_id,
4559                        page_offset: _,
4560                    } if candidate_set.contains(&segment_id) => {
4561                        let replacement = RowLocator::Cold {
4562                            segment_id: merged_segment_id,
4563                            page_offset: 0,
4564                        };
4565                        if !new_locs.contains(&replacement) {
4566                            new_locs.push(replacement);
4567                        }
4568                        changed = true;
4569                    }
4570                    other => new_locs.push(other),
4571                }
4572            }
4573            if changed {
4574                map_mut.insert_mut(key, new_locs);
4575            }
4576        }
4577
4578        // Tombstone every source slot. Last step — failures here
4579        // would leave the segment double-referenced in both
4580        // memory + manifest, but `tombstone_segment` only errors
4581        // on out-of-bounds, which we've already validated.
4582        for &id in &candidate_set {
4583            self.tombstone_segment(id)?;
4584        }
4585
4586        let bytes_reclaimed_estimate = source_byte_total.saturating_sub(merged_bytes_len);
4587        Ok(CompactReport {
4588            sources: candidate_set.into_iter().collect(),
4589            merged_segment_id: Some(merged_segment_id),
4590            merged_segment_bytes: seg_bytes,
4591            merged_rows,
4592            deleted_rows_pruned,
4593            bytes_reclaimed_estimate,
4594        })
4595    }
4596
4597    /// Internal helper: scan `(table, index)` for a `Cold` locator
4598    /// keyed by `key`. Returns `Ok(Some((segment_id, page_offset)))`
4599    /// when found, `Ok(None)` when the key has only hot entries
4600    /// or no entries at all, `Err` on the same input-validation
4601    /// errors as the public `promote_cold_row` / `shadow_cold_row`.
4602    fn find_cold_locator(
4603        &self,
4604        table_name: &str,
4605        index_name: &str,
4606        key: &IndexKey,
4607    ) -> Result<Option<(u32, u32)>, StorageError> {
4608        let t = self.get(table_name).ok_or_else(|| {
4609            StorageError::Corrupt(format!("find_cold_locator: table {table_name:?} not found"))
4610        })?;
4611        let idx = t
4612            .indices
4613            .iter()
4614            .find(|i| i.name == index_name)
4615            .ok_or_else(|| {
4616                StorageError::Corrupt(format!(
4617                    "find_cold_locator: index {index_name:?} not found on {table_name:?}"
4618                ))
4619            })?;
4620        if !matches!(idx.kind, IndexKind::BTree(_)) {
4621            return Err(StorageError::Corrupt(format!(
4622                "find_cold_locator: index {index_name:?} is NSW; promote-on-write only applies to BTree indices"
4623            )));
4624        }
4625        for loc in idx.lookup_eq(key) {
4626            if let RowLocator::Cold {
4627                segment_id,
4628                page_offset,
4629            } = *loc
4630            {
4631                return Ok(Some((segment_id, page_offset)));
4632            }
4633        }
4634        Ok(None)
4635    }
4636}
4637
4638/// Coerce an [`IndexKey`] to the `u64` that v5.1 cold-tier
4639/// segments use as their on-disk PK. Returns `None` for keys that
4640/// aren't representable as `u64` — Text PKs need a hash mapping
4641/// the segment writer baked in (deferred to v5.2+), Bool PKs are
4642/// almost never wide enough to be sharded into a cold tier.
4643fn index_key_as_u64(key: &IndexKey) -> Option<u64> {
4644    match key {
4645        // Reinterpret the i64 bit pattern as u64. Cold-tier segments
4646        // are sorted by this u64 view, so the chosen interpretation
4647        // only has to match between insert (bake_segment / freezer)
4648        // and lookup — using cast_unsigned keeps both sides honest
4649        // and silences clippy::cast_sign_loss.
4650        IndexKey::Int(n) => Some(n.cast_unsigned()),
4651        IndexKey::Text(_) | IndexKey::Bool(_) => None,
4652    }
4653}
4654
4655#[derive(Debug, Clone, PartialEq, Eq)]
4656#[non_exhaustive]
4657pub enum StorageError {
4658    DuplicateTable {
4659        name: String,
4660    },
4661    TableNotFound {
4662        name: String,
4663    },
4664    ArityMismatch {
4665        expected: usize,
4666        actual: usize,
4667    },
4668    TypeMismatch {
4669        column: String,
4670        expected: DataType,
4671        actual: DataType,
4672        position: usize,
4673    },
4674    NullInNotNull {
4675        column: String,
4676    },
4677    /// Index with this name already exists on the table.
4678    DuplicateIndex {
4679        name: String,
4680    },
4681    /// Column referenced by an index doesn't exist on the table.
4682    ColumnNotFound {
4683        column: String,
4684    },
4685    /// On-disk format failed to parse — corrupted file, wrong magic, truncated
4686    /// payload, or unknown tag bytes.
4687    Corrupt(String),
4688    /// v6.0.4 — ALTER INDEX targeted an index name that doesn't
4689    /// exist on any table in this catalog.
4690    IndexNotFound {
4691        name: String,
4692    },
4693    /// v6.0.4 — operation requested isn't supported on this index
4694    /// kind / column type (e.g. ALTER INDEX REBUILD on a `BTree`
4695    /// index, or REBUILD WITH (encoding=…) on a non-vector column).
4696    Unsupported(String),
4697}
4698
4699impl fmt::Display for StorageError {
4700    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
4701        match self {
4702            Self::DuplicateTable { name } => write!(f, "table already exists: {name}"),
4703            Self::TableNotFound { name } => write!(f, "table not found: {name}"),
4704            Self::ArityMismatch { expected, actual } => write!(
4705                f,
4706                "row arity mismatch: expected {expected} columns, got {actual}"
4707            ),
4708            Self::TypeMismatch {
4709                column,
4710                expected,
4711                actual,
4712                position,
4713            } => write!(
4714                f,
4715                "type mismatch in column {column:?} (position {position}): expected {expected}, got {actual}"
4716            ),
4717            Self::NullInNotNull { column } => {
4718                write!(f, "NULL value in NOT NULL column {column:?}")
4719            }
4720            Self::DuplicateIndex { name } => write!(f, "index already exists: {name}"),
4721            Self::ColumnNotFound { column } => write!(f, "column not found: {column}"),
4722            Self::Corrupt(detail) => write!(f, "corrupt on-disk format: {detail}"),
4723            Self::IndexNotFound { name } => write!(f, "index not found: {name}"),
4724            Self::Unsupported(detail) => write!(f, "unsupported: {detail}"),
4725        }
4726    }
4727}
4728
4729impl ColumnSchema {
4730    pub fn new(name: impl Into<String>, ty: DataType, nullable: bool) -> Self {
4731        Self {
4732            name: name.into(),
4733            ty,
4734            nullable,
4735            default: None,
4736            runtime_default: None,
4737            auto_increment: false,
4738        }
4739    }
4740
4741    /// Builder-style helper to attach a default value to an otherwise
4742    /// plain column schema. Used by the engine when CREATE TABLE
4743    /// specifies `column TYPE DEFAULT <expr>`.
4744    #[must_use]
4745    pub fn with_default(mut self, default: Value) -> Self {
4746        self.default = Some(default);
4747        self
4748    }
4749
4750    /// v7.9.21 — builder for runtime-evaluated defaults
4751    /// (`DEFAULT now()`, `DEFAULT CURRENT_TIMESTAMP`, …).
4752    /// `expr` is the Expr's `Display` form, re-parsed by the
4753    /// engine at each INSERT.
4754    #[must_use]
4755    pub fn with_runtime_default(mut self, expr: impl Into<String>) -> Self {
4756        self.runtime_default = Some(expr.into());
4757        self
4758    }
4759
4760    /// Builder-style helper to mark a column as `AUTO_INCREMENT`.
4761    #[must_use]
4762    pub const fn with_auto_increment(mut self) -> Self {
4763        self.auto_increment = true;
4764        self
4765    }
4766}
4767
4768impl TableSchema {
4769    pub fn new(name: impl Into<String>, columns: Vec<ColumnSchema>) -> Self {
4770        Self {
4771            name: name.into(),
4772            columns,
4773            hot_tier_bytes: None,
4774            foreign_keys: Vec::new(),
4775            uniqueness_constraints: Vec::new(),
4776            checks: Vec::new(),
4777        }
4778    }
4779}
4780
4781// =========================================================================
4782// Persistent binary format for the catalog.
4783//
4784// Layout (little-endian throughout):
4785//
4786//   [magic "SPGDB001" 8 bytes][version u8]
4787//   [table_count u32]
4788//   for each table:
4789//       [name_len u16][name bytes]
4790//       [col_count u16]
4791//       for each col:
4792//           [name_len u16][name bytes]
4793//           [type_tag u8 + optional payload]
4794//               1=Int 2=BigInt 3=Float 4=Text 5=Bool
4795//               6=Vector(u32 dim)
4796//               7=SmallInt
4797//               8=Varchar(u32 max)
4798//               9=Char(u32 size)
4799//               10=Numeric(u8 precision, u8 scale)
4800//               11=Date
4801//               12=Timestamp
4802//           [nullable u8]   0/1
//           [default_tag u8] 0=none 1=value (followed by [value_tag u8] + bytes)
//           [auto_increment u8] 0/1 (per-column flag, added in version 4)
4804//       [row_count u32]
4805//       for each row, for each col, one [value_tag u8] + value bytes:
4806//           tag 0 (Null)     → no body
4807//           tag 1 (Int)      → i32 LE
4808//           tag 2 (BigInt)   → i64 LE
4809//           tag 3 (Float)    → f64 LE
4810//           tag 4 (Text)     → u16 LE len + UTF-8 bytes
4811//           tag 5 (Bool)     → u8 0/1
4812//           tag 6 (Vector)   → u32 LE dim + dim×f32 LE
4813//           tag 7 (SmallInt) → i16 LE
4814//           tag 8 (Numeric)  → i128 LE (16 bytes) + u8 scale
4815//           tag 9 (Date)     → i32 LE (days since Unix epoch)
4816//           tag 10 (Timestamp) → i64 LE (microseconds since Unix epoch)
4817//
4818// Bumped to version 3 when NUMERIC was added; to version 4 when
4819// AUTO_INCREMENT (per-column flag) + NSW index `kind` byte landed;
4820// to version 5 when DATE / TIMESTAMP were added; to version 6 when
4821// NSW graph topology started travelling on disk (v2.7); to version 7
4822// when the NSW topology became multi-layer HNSW (v2.13); to version 8
4823// when row encoding switched to schema-driven dense layout (v3.0.2 —
4824// per-row NULL bitmap + per-column fixed-width body, no per-cell type
4825// tag).
4826// =========================================================================
4827
/// Eight-byte magic written at the very start of every catalog
/// snapshot produced by [`Catalog::serialize`].
const FILE_MAGIC: &[u8; 8] = b"SPGDB001";
/// Current catalog snapshot format version emitted by [`Catalog::serialize`].
///
/// Version notes recorded here (not every bump is itemised):
///
/// * **v9** (v5.2) — extends v8 by serialising `BTree` index entries
///   directly: every `(IndexKey, Vec<RowLocator>)` pair travels on disk
///   with the v5.1 `RowLocator::write_le` tag-prefixed codec. v8 `BTree`
///   indices stored no entries at all (the map was rebuilt from
///   `Table::rows` on load); v9 preserves on-disk Cold locators so
///   freezer-produced cold-tier index entries survive a catalog snapshot
///   round-trip. v8 readers are accepted by version dispatch in
///   [`Catalog::deserialize`] — every entry decodes as
///   `RowLocator::Hot(_)` via `add_index` rebuild, identical to v5.1
///   behaviour.
/// * **v11** (v6.7.2) — bumped from 10 to 11 to append per-table
///   `hot_tier_bytes: Option<u64>` after the per-table indices section.
///   v10 catalogs (v6.7.1) load with `hot_tier_bytes = None` for every
///   table (the deserialiser short-circuits when version < 11). v11
///   snapshots written by a pre-v6.7.2 binary fail loudly at the version
///   check, matching the v6.1.2 / v6.1.4 / v6.2.0 / v6.7.1 envelope-bump
///   upgrade fences.
/// * **v12** (v6.8.0) — bumped from 11 to 12: per-index
///   `included_columns: Vec<u16>` appended at the tail of each index
///   payload. v11 (= v6.7.2) catalogs load with
///   `included_columns = Vec::new()` for every index — same "older
///   readers, append-only extension" pattern as the v6.7.2
///   hot_tier_bytes byte.
/// * **v23** (v7.13.0) — bumped from 22 to 23. mailrs round-5 G3 / G10.
///   Per-table appendix gains two new sections:
///     * `checks: Vec<String>` — CHECK predicate sources (Display form
///       of the AST Expr); re-parsed on INSERT/UPDATE to enforce against
///       candidate rows. Same persistence pattern as
///       `Index::partial_predicate`.
///     * Per `UniquenessConstraint`: trailing `nulls_not_distinct: u8`
///       flag for PG 15+ `UNIQUE NULLS NOT DISTINCT (cols)` semantics.
///
///   v22 catalogs deserialise with empty `checks` and every UC at
///   `nulls_not_distinct = false`.
/// * **v24** — index kind tag 4 = trigram-GIN (`gin_trgm_ops`-flavoured
///   `USING gin` over a TEXT/VARCHAR column). Payload shape is identical
///   to tag-3 GIN (String → Vec<RowLocator>); the keys are PG-compatible
///   3-byte trigram shingles instead of tsvector lexemes. v23 catalogs
///   deserialise unchanged — no v23 writer ever emitted tag 4.
/// * **v25** — per `TriggerDef`: trailing `enabled: u8` flag (mailrs
///   round-9 A.2.b — `ALTER TABLE … { ENABLE | DISABLE } TRIGGER …`).
///   v24 catalogs deserialise with every trigger `enabled = true`,
///   matching pre-v7.16.1 behaviour.
const FILE_VERSION: u8 = 25;
/// Oldest format version [`Catalog::deserialize`] still accepts. v8 is the
/// v3.0.2 dense-row layout; pre-v8 catalogs require an offline migration.
const MIN_SUPPORTED_FILE_VERSION: u8 = 8;

// IndexKey wire format (v9) — one tag byte, then the payload:
//   tag 0 = Int  → [i64 LE]
//   tag 1 = Text → [u16 LE len + UTF-8 bytes] (via write_str / read_str)
//   tag 2 = Bool → [u8 0/1]
const INDEX_KEY_TAG_INT: u8 = 0;
const INDEX_KEY_TAG_TEXT: u8 = 1;
const INDEX_KEY_TAG_BOOL: u8 = 2;
4889
4890impl Catalog {
4891    /// Serialize the whole catalog (schema + every row) into a self-contained
4892    /// byte buffer. Format is documented above the impl block.
4893    pub fn serialize(&self) -> Vec<u8> {
4894        let mut out = Vec::with_capacity(64);
4895        out.extend_from_slice(FILE_MAGIC);
4896        out.push(FILE_VERSION);
4897        write_u32(
4898            &mut out,
4899            u32::try_from(self.tables.len()).expect("≤ 4G tables"),
4900        );
4901        for t in &self.tables {
4902            write_str(&mut out, &t.schema.name);
4903            write_u16(
4904                &mut out,
4905                u16::try_from(t.schema.columns.len()).expect("≤ 65k columns/table"),
4906            );
4907            for c in &t.schema.columns {
4908                write_str(&mut out, &c.name);
4909                write_data_type(&mut out, c.ty);
4910                out.push(u8::from(c.nullable));
4911                match &c.default {
4912                    None => out.push(0),
4913                    Some(v) => {
4914                        out.push(1);
4915                        write_value(&mut out, v);
4916                    }
4917                }
4918                out.push(u8::from(c.auto_increment));
4919            }
4920            write_u32(
4921                &mut out,
4922                u32::try_from(t.rows.len()).expect("≤ 4G rows/table"),
4923            );
4924            // v3.0.2 dense row encoding (FILE_VERSION 8): per-row NULL
4925            // bitmap, then tightly-packed bodies. Identical wire format
4926            // as before — extracted into `encode_row_body_dense` so cold-
4927            // tier segments (v5.1+) can share the encoding.
4928            for row in &t.rows {
4929                out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
4930            }
4931            // Index definitions. Per-index payload:
4932            //   [name][col_pos u16][kind u8]
4933            //     kind 0 = B-tree           (no params — rebuilt on load)
4934            //     kind 1 = NSW graph        (u16 M + serialized graph)
4935            // For NSW the graph topology travels on disk so startup
4936            // doesn't re-run the O(n²M) rebuild — see v2.7 notes.
4937            write_u16(
4938                &mut out,
4939                u16::try_from(t.indices.len()).expect("≤ 65k indices/table"),
4940            );
4941            for idx in &t.indices {
4942                write_str(&mut out, &idx.name);
4943                write_u16(
4944                    &mut out,
4945                    u16::try_from(idx.column_position).expect("≤ 65k columns/table"),
4946                );
4947                match &idx.kind {
4948                    IndexKind::BTree(map) => {
4949                        out.push(0);
4950                        // v9: serialise the full PB map. Each entry's
4951                        // RowLocator list travels with the tag-prefixed
4952                        // codec from `row_locator::write_le`, so freezer-
4953                        // produced Cold locators survive a snapshot
4954                        // round-trip. v8 BTree wrote nothing here and
4955                        // rebuilt from rows — v9 readers tolerate v8 by
4956                        // version dispatch in `Catalog::deserialize`.
4957                        write_u32(
4958                            &mut out,
4959                            u32::try_from(map.len()).expect("≤ 4G index entries/index"),
4960                        );
4961                        for (key, locators) in map {
4962                            write_index_key(&mut out, key);
4963                            write_u32(
4964                                &mut out,
4965                                u32::try_from(locators.len()).expect("≤ 4G locators/key"),
4966                            );
4967                            for loc in locators {
4968                                loc.write_le(&mut out);
4969                            }
4970                        }
4971                    }
4972                    IndexKind::Nsw(g) => {
4973                        out.push(1);
4974                        write_u16(&mut out, u16::try_from(g.m).expect("≤ 65k NSW neighbours"));
4975                        write_nsw_graph(&mut out, g);
4976                    }
4977                    IndexKind::Brin { column_type } => {
4978                        // v6.7.1 — tag byte 2 = BRIN. Payload is the
4979                        // column type code (1 byte mapping to the
4980                        // shared DataType numeric encoding); no
4981                        // further data — BRIN summaries live in
4982                        // cold segments, not the catalog.
4983                        out.push(2);
4984                        write_data_type(&mut out, *column_type);
4985                    }
4986                    IndexKind::Gin(map) => {
4987                        // v7.12.3 — tag byte 3 = GIN. Payload mirrors
4988                        // the BTree encoding but with String (lexeme
4989                        // word) keys instead of IndexKey. Tag-prefixed
4990                        // RowLocator codec so freezer-produced Cold
4991                        // locators survive snapshot round-trip.
4992                        // FILE_VERSION 21+; v20 catalogs never wrote a
4993                        // GIN index (the AM degraded to BTree fallback
4994                        // pre-v7.12.3), so no migration shim is needed.
4995                        out.push(3);
4996                        write_u32(
4997                            &mut out,
4998                            u32::try_from(map.len()).expect("≤ 4G GIN posting lists"),
4999                        );
5000                        for (word, locators) in map {
5001                            write_str(&mut out, word);
5002                            write_u32(
5003                                &mut out,
5004                                u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
5005                            );
5006                            for loc in locators {
5007                                loc.write_le(&mut out);
5008                            }
5009                        }
5010                    }
5011                    IndexKind::GinTrgm(map) => {
5012                        // v7.15.0 — tag byte 4 = GinTrgm
5013                        // (`gin_trgm_ops` GIN over a TEXT column).
5014                        // Payload shape is identical to tag-3 GIN —
5015                        // `String → Vec<RowLocator>` posting lists.
5016                        // The String keys are 3-byte trigrams instead
5017                        // of tsvector lexemes; the deserializer
5018                        // dispatches on the tag, not the key shape.
5019                        // FILE_VERSION 24+; v23 catalogs never wrote
5020                        // a trigram-GIN.
5021                        out.push(4);
5022                        write_u32(
5023                            &mut out,
5024                            u32::try_from(map.len()).expect("≤ 4G trigram-GIN posting lists"),
5025                        );
5026                        for (tri, locators) in map {
5027                            write_str(&mut out, tri);
5028                            write_u32(
5029                                &mut out,
5030                                u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
5031                            );
5032                            for loc in locators {
5033                                loc.write_le(&mut out);
5034                            }
5035                        }
5036                    }
5037                }
5038                // v6.8.0 — included_columns appendix per index.
5039                // Layout: [u16 num_included][num × u16 column_position].
5040                // v11 readers stop before this u16 (deserialise loop
5041                // gated on version >= 12); v12+ readers always
5042                // consume it. Empty Vec serialises as a bare 0u16.
5043                write_u16(
5044                    &mut out,
5045                    u16::try_from(idx.included_columns.len()).expect("≤ 65k INCLUDE columns/index"),
5046                );
5047                for col_pos in &idx.included_columns {
5048                    write_u16(
5049                        &mut out,
5050                        u16::try_from(*col_pos).expect("≤ 65k columns/table"),
5051                    );
5052                }
5053                // v6.8.1 — partial_predicate appendix per index.
5054                // Layout: [u8 has_pred][u16 LE len][bytes (if has_pred)].
5055                // Same v12 gate as included_columns.
5056                match &idx.partial_predicate {
5057                    None => out.push(0),
5058                    Some(pred) => {
5059                        out.push(1);
5060                        write_str(&mut out, pred);
5061                    }
5062                }
5063                // v6.8.2 — expression appendix. Same shape as
5064                // partial_predicate.
5065                match &idx.expression {
5066                    None => out.push(0),
5067                    Some(expr) => {
5068                        out.push(1);
5069                        write_str(&mut out, expr);
5070                    }
5071                }
5072                // v7.9.29 — is_unique appendix (FILE_VERSION 16+).
5073                // Single byte 0/1. v15-and-below readers stop before
5074                // this byte; v16 readers always consume it. mailrs K1.
5075                out.push(u8::from(idx.is_unique));
5076                // v7.9.29 — extra_column_positions appendix.
5077                // Layout: [u16 count][count × u16 column_position].
5078                write_u16(
5079                    &mut out,
5080                    u16::try_from(idx.extra_column_positions.len())
5081                        .expect("≤ 65k extra cols / index"),
5082                );
5083                for cp in &idx.extra_column_positions {
5084                    write_u16(&mut out, u16::try_from(*cp).expect("≤ 65k columns/table"));
5085                }
5086            }
5087            // v6.7.2 — per-table hot_tier_bytes Option<u64>.
5088            // Layout: [u8 has_value][u64 LE value (if has_value)].
5089            // v10 readers stop before this byte (deserialise loop
5090            // gated on version >= 11); v11+ readers always
5091            // consume it.
5092            match t.schema.hot_tier_bytes {
5093                None => out.push(0),
5094                Some(n) => {
5095                    out.push(1);
5096                    out.extend_from_slice(&n.to_le_bytes());
5097                }
5098            }
5099            // v7.6.1 — FOREIGN KEY appendix (catalog FILE_VERSION 13+).
5100            // Layout: [u16 LE fk_count]
5101            //   per fk:
5102            //     [u8 has_name] [str name (if has_name)]
5103            //     [u16 LE local_arity] [u16 LE local_pos]*arity
5104            //     [str parent_table]
5105            //     [u16 LE parent_arity] [u16 LE parent_pos]*arity
5106            //     [u8 on_delete_tag] [u8 on_update_tag]
5107            // Older catalogs (v12 and below) skip this block entirely;
5108            // their reader stops before this byte.
5109            write_u16(
5110                &mut out,
5111                u16::try_from(t.schema.foreign_keys.len()).expect("≤ 65k FKs/table"),
5112            );
5113            for fk in &t.schema.foreign_keys {
5114                match &fk.name {
5115                    None => out.push(0),
5116                    Some(n) => {
5117                        out.push(1);
5118                        write_str(&mut out, n);
5119                    }
5120                }
5121                write_u16(
5122                    &mut out,
5123                    u16::try_from(fk.local_columns.len()).expect("≤ 65k FK columns"),
5124                );
5125                for &p in &fk.local_columns {
5126                    write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
5127                }
5128                write_str(&mut out, &fk.parent_table);
5129                write_u16(
5130                    &mut out,
5131                    u16::try_from(fk.parent_columns.len()).expect("≤ 65k FK parent columns"),
5132                );
5133                for &p in &fk.parent_columns {
5134                    write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
5135                }
5136                out.push(fk.on_delete.tag());
5137                out.push(fk.on_update.tag());
5138            }
5139            // v7.9.19 — UniquenessConstraint appendix (catalog
5140            // FILE_VERSION 15+). Layout per table after the FK
5141            // block:
5142            //   [u16 count]
5143            //     per constraint:
5144            //       [u8 is_primary_key]
5145            //       [u16 arity][u16 col_pos]*arity
5146            // Older catalogs (v14 and below) skip this block.
5147            write_u16(
5148                &mut out,
5149                u16::try_from(t.schema.uniqueness_constraints.len())
5150                    .expect("≤ 65k uniqueness constraints/table"),
5151            );
5152            for uc in &t.schema.uniqueness_constraints {
5153                out.push(u8::from(uc.is_primary_key));
5154                write_u16(
5155                    &mut out,
5156                    u16::try_from(uc.columns.len()).expect("≤ 65k cols in uniqueness constraint"),
5157                );
5158                for &p in &uc.columns {
5159                    write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
5160                }
5161                // v7.13.0 — `nulls_not_distinct` flag
5162                // (FILE_VERSION 23+). Always written by writers at
5163                // version 23+; deserialise gates on `version >= 23`
5164                // so v22-and-below catalogs round-trip cleanly.
5165                out.push(u8::from(uc.nulls_not_distinct));
5166            }
5167            // v7.9.21 — runtime_default appendix per table.
5168            // Layout: [u16 count] then for each:
5169            //   [u16 col_pos][str expr]
5170            // Only columns whose runtime_default is Some land here;
5171            // catalog stays compact for the common literal-default
5172            // case.
5173            let mut rt_defaults: Vec<(usize, &str)> = Vec::new();
5174            for (i, c) in t.schema.columns.iter().enumerate() {
5175                if let Some(e) = &c.runtime_default {
5176                    rt_defaults.push((i, e.as_str()));
5177                }
5178            }
5179            write_u16(
5180                &mut out,
5181                u16::try_from(rt_defaults.len()).expect("≤ 65k runtime defaults/table"),
5182            );
5183            for (pos, expr) in rt_defaults {
5184                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
5185                write_str(&mut out, expr);
5186            }
5187            // v7.13.0 — CHECK constraint appendix per table.
5188            // Layout: [u16 count] then `count` Display-form
5189            // expression strings. Re-parsed on every INSERT/UPDATE
5190            // by the engine. FILE_VERSION 23+ only; v22 readers
5191            // never reach this block because the writer also moves
5192            // to v23 in lock-step.
5193            write_u16(
5194                &mut out,
5195                u16::try_from(t.schema.checks.len()).expect("≤ 65k CHECK constraints/table"),
5196            );
5197            for c in &t.schema.checks {
5198                write_str(&mut out, c.as_str());
5199            }
5200        }
5201        // v7.12.4 — catalog-wide appendix: user-defined functions
5202        // then triggers. FILE_VERSION 22+ only. v21 and earlier
5203        // readers stop after the last table; v22 readers always
5204        // consume two `u32` counts (possibly zero).
5205        //
5206        // Function entry layout:
5207        //   [str name] [str args_repr] [str returns]
5208        //   [str language] [str body]
5209        // Trigger entry layout:
5210        //   [str name] [str table] [str timing]
5211        //   [u16 event_count] (event_count × str)
5212        //   [str for_each] [str function]
5213        write_u32(
5214            &mut out,
5215            u32::try_from(self.functions.len()).expect("≤ 4G functions"),
5216        );
5217        for fd in self.functions.values() {
5218            write_str(&mut out, &fd.name);
5219            write_str(&mut out, &fd.args_repr);
5220            write_str(&mut out, &fd.returns);
5221            write_str(&mut out, &fd.language);
5222            write_str_long(&mut out, &fd.body);
5223        }
5224        write_u32(
5225            &mut out,
5226            u32::try_from(self.triggers.len()).expect("≤ 4G triggers"),
5227        );
5228        for td in &self.triggers {
5229            write_str(&mut out, &td.name);
5230            write_str(&mut out, &td.table);
5231            write_str(&mut out, &td.timing);
5232            write_u16(
5233                &mut out,
5234                u16::try_from(td.events.len()).expect("≤ 65k events / trigger"),
5235            );
5236            for ev in &td.events {
5237                write_str(&mut out, ev);
5238            }
5239            write_str(&mut out, &td.for_each);
5240            write_str(&mut out, &td.function);
5241            // v7.13.0 — `UPDATE OF cols` filter
5242            // (FILE_VERSION 23+). v22 readers omit; v23 writers
5243            // always emit (possibly zero).
5244            write_u16(
5245                &mut out,
5246                u16::try_from(td.update_columns.len()).expect("≤ 65k cols / trigger"),
5247            );
5248            for c in &td.update_columns {
5249                write_str(&mut out, c);
5250            }
5251            // v7.16.1 — TriggerDef.enabled (FILE_VERSION 25+).
5252            out.push(u8::from(td.enabled));
5253        }
5254        out
5255    }
5256
5257    /// Deserialize a previously-serialized catalog. Rejects bad magic, version
5258    /// mismatch, unknown tags, truncation, and trailing bytes.
5259    pub fn deserialize(buf: &[u8]) -> Result<Self, StorageError> {
5260        let mut cur = Cursor::new(buf);
5261        let magic = cur.take(8)?;
5262        if magic != FILE_MAGIC {
5263            return Err(StorageError::Corrupt(format!(
5264                "bad magic: expected SPGDB001, got {magic:?}"
5265            )));
5266        }
5267        let version = cur.read_u8()?;
5268        if !(MIN_SUPPORTED_FILE_VERSION..=FILE_VERSION).contains(&version) {
5269            return Err(StorageError::Corrupt(format!(
5270                "unsupported file version: {version} (supported: {MIN_SUPPORTED_FILE_VERSION}..={FILE_VERSION})"
5271            )));
5272        }
5273        let table_count = cur.read_u32()? as usize;
5274        let mut cat = Self::new();
5275        for _ in 0..table_count {
5276            deserialize_table(&mut cur, &mut cat, version)?;
5277        }
5278        // v7.12.4 — catalog-wide function + trigger appendix.
5279        // FILE_VERSION 22+ only; v21 and earlier catalogs stop
5280        // after the last table.
5281        if version >= 22 {
5282            let fn_count = cur.read_u32()? as usize;
5283            for _ in 0..fn_count {
5284                let name = cur.read_str()?;
5285                let args_repr = cur.read_str()?;
5286                let returns = cur.read_str()?;
5287                let language = cur.read_str()?;
5288                let body = cur.read_str_long()?;
5289                cat.functions.insert(
5290                    name.clone(),
5291                    FunctionDef {
5292                        name,
5293                        args_repr,
5294                        returns,
5295                        language,
5296                        body,
5297                    },
5298                );
5299            }
5300            let trg_count = cur.read_u32()? as usize;
5301            for _ in 0..trg_count {
5302                let name = cur.read_str()?;
5303                let table = cur.read_str()?;
5304                let timing = cur.read_str()?;
5305                let ev_count = cur.read_u16()? as usize;
5306                let mut events = Vec::with_capacity(ev_count);
5307                for _ in 0..ev_count {
5308                    events.push(cur.read_str()?);
5309                }
5310                let for_each = cur.read_str()?;
5311                let function = cur.read_str()?;
5312                // v7.13.0 — trailing `UPDATE OF cols` filter
5313                // (FILE_VERSION 23+ only; v22 catalogs omit and
5314                // deserialise with an empty vec).
5315                let update_columns = if version >= 23 {
5316                    let n = cur.read_u16()? as usize;
5317                    let mut cols = Vec::with_capacity(n);
5318                    for _ in 0..n {
5319                        cols.push(cur.read_str()?);
5320                    }
5321                    cols
5322                } else {
5323                    Vec::new()
5324                };
5325                // v7.16.1 — TriggerDef.enabled (FILE_VERSION 25+).
5326                // v24-and-below catalogs deserialise with `true`
5327                // — pre-v7.16.1 every trigger always fired.
5328                let enabled = if version >= 25 {
5329                    cur.read_u8()? != 0
5330                } else {
5331                    true
5332                };
5333                cat.triggers.push(TriggerDef {
5334                    name,
5335                    table,
5336                    timing,
5337                    events,
5338                    for_each,
5339                    function,
5340                    update_columns,
5341                    enabled,
5342                });
5343            }
5344        }
5345        if cur.pos < buf.len() {
5346            return Err(StorageError::Corrupt(format!(
5347                "trailing bytes: {} unread",
5348                buf.len() - cur.pos
5349            )));
5350        }
5351        Ok(cat)
5352    }
5353}
5354
5355/// Per-table deserialize body — schema, rows, indices. Pulled out of
5356/// `Catalog::deserialize` to keep the latter under the line-budget lint
5357/// and to give the row hot loop its own scope (so the borrow on `t`
5358/// stays scoped here rather than across the whole catalog loop).
5359fn deserialize_table(
5360    cur: &mut Cursor<'_>,
5361    cat: &mut Catalog,
5362    version: u8,
5363) -> Result<(), StorageError> {
5364    let table_name = cur.read_str()?;
5365    let name = table_name.clone();
5366    let col_count = cur.read_u16()? as usize;
5367    let mut cols = Vec::with_capacity(col_count);
5368    for _ in 0..col_count {
5369        let c_name = cur.read_str()?;
5370        let ty = cur.read_data_type()?;
5371        let nullable = cur.read_u8()? != 0;
5372        let default = match cur.read_u8()? {
5373            0 => None,
5374            1 => Some(cur.read_value()?),
5375            other => {
5376                return Err(StorageError::Corrupt(format!(
5377                    "unknown default tag: {other}"
5378                )));
5379            }
5380        };
5381        let auto_increment = cur.read_u8()? != 0;
5382        // Note: deserialiser sets runtime_default = None for
5383        // older catalogs (≤ v14). v15+ reads it from the
5384        // per-column appendix below.
5385        cols.push(ColumnSchema {
5386            name: c_name,
5387            ty,
5388            nullable,
5389            default,
5390            runtime_default: None,
5391            auto_increment,
5392        });
5393    }
5394    let n_cols = cols.len();
5395    cat.create_table(TableSchema::new(name, cols))?;
5396    // Vec<Table> with insertion-order semantics — the just-pushed
5397    // table is at the end. Sidecar `by_name` is already wired up but
5398    // we skip the map lookup here since we know the position.
5399    let t = cat.tables.last_mut().expect("create_table just pushed");
5400    deserialize_rows(cur, t, n_cols)?;
5401    deserialize_indices(cur, t, version)?;
5402    // v6.7.2 — per-table hot_tier_bytes appendix. v11+ writes
5403    // `[u8 has_value][u64 LE value (if has_value)]`. v10 / v9 / v8
5404    // catalogs skip this entirely (the deserialiser reads no extra
5405    // bytes; the table's hot_tier_bytes stays None from
5406    // TableSchema::new).
5407    if version >= 11 {
5408        let has = cur.read_u8()?;
5409        let hot_tier_bytes = match has {
5410            0 => None,
5411            1 => Some(cur.read_u64()?),
5412            other => {
5413                return Err(StorageError::Corrupt(format!(
5414                    "hot_tier_bytes appendix: unknown has-value byte {other}"
5415                )));
5416            }
5417        };
5418        t.schema_mut().hot_tier_bytes = hot_tier_bytes;
5419    }
5420    // v7.6.1 — FOREIGN KEY appendix (FILE_VERSION 13+). v12 / v11 / …
5421    // catalogs skip this entirely.
5422    if version >= 13 {
5423        let fk_count = cur.read_u16()? as usize;
5424        let mut fks = Vec::with_capacity(fk_count);
5425        for _ in 0..fk_count {
5426            let name = match cur.read_u8()? {
5427                0 => None,
5428                1 => Some(cur.read_str()?),
5429                other => {
5430                    return Err(StorageError::Corrupt(format!(
5431                        "FK appendix: unknown has-name byte {other}"
5432                    )));
5433                }
5434            };
5435            let local_arity = cur.read_u16()? as usize;
5436            let mut local_columns = Vec::with_capacity(local_arity);
5437            for _ in 0..local_arity {
5438                local_columns.push(cur.read_u16()? as usize);
5439            }
5440            let parent_table = cur.read_str()?;
5441            let parent_arity = cur.read_u16()? as usize;
5442            if parent_arity != local_arity {
5443                return Err(StorageError::Corrupt(format!(
5444                    "FK arity mismatch in catalog: local {local_arity} vs parent {parent_arity}"
5445                )));
5446            }
5447            let mut parent_columns = Vec::with_capacity(parent_arity);
5448            for _ in 0..parent_arity {
5449                parent_columns.push(cur.read_u16()? as usize);
5450            }
5451            let on_delete = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
5452                StorageError::Corrupt("FK appendix: unknown on_delete tag".into())
5453            })?;
5454            let on_update = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
5455                StorageError::Corrupt("FK appendix: unknown on_update tag".into())
5456            })?;
5457            fks.push(ForeignKeyConstraint {
5458                name,
5459                local_columns,
5460                parent_table,
5461                parent_columns,
5462                on_delete,
5463                on_update,
5464            });
5465        }
5466        t.schema_mut().foreign_keys = fks;
5467    }
5468    // v7.9.19 — UniquenessConstraint appendix (FILE_VERSION 15+).
5469    // v14 and below skip this entirely.
5470    if version >= 15 {
5471        let uc_count = cur.read_u16()? as usize;
5472        let mut ucs = Vec::with_capacity(uc_count);
5473        for _ in 0..uc_count {
5474            let is_pk = cur.read_u8()? != 0;
5475            let arity = cur.read_u16()? as usize;
5476            let mut cols = Vec::with_capacity(arity);
5477            for _ in 0..arity {
5478                cols.push(cur.read_u16()? as usize);
5479            }
5480            // v7.13.0 — trailing `nulls_not_distinct` flag
5481            // (FILE_VERSION 23+). v22 and below skip — flag
5482            // defaults to false (= NULLS DISTINCT).
5483            let nulls_not_distinct = if version >= 23 {
5484                cur.read_u8()? != 0
5485            } else {
5486                false
5487            };
5488            ucs.push(UniquenessConstraint {
5489                is_primary_key: is_pk,
5490                columns: cols,
5491                nulls_not_distinct,
5492            });
5493        }
5494        t.schema_mut().uniqueness_constraints = ucs;
5495        // v7.9.21 — runtime_default appendix (FILE_VERSION 15+).
5496        let rt_count = cur.read_u16()? as usize;
5497        for _ in 0..rt_count {
5498            let pos = cur.read_u16()? as usize;
5499            let expr = cur.read_str()?;
5500            if let Some(col) = t.schema_mut().columns.get_mut(pos) {
5501                col.runtime_default = Some(expr);
5502            }
5503        }
5504    }
5505    // v7.13.0 — CHECK constraints appendix (FILE_VERSION 23+).
5506    // v22 and below leave the vec empty.
5507    if version >= 23 {
5508        let check_count = cur.read_u16()? as usize;
5509        let mut checks = Vec::with_capacity(check_count);
5510        for _ in 0..check_count {
5511            checks.push(cur.read_str()?);
5512        }
5513        t.schema_mut().checks = checks;
5514    }
5515    let _ = table_name;
5516    Ok(())
5517}
5518
5519fn deserialize_rows(
5520    cur: &mut Cursor<'_>,
5521    t: &mut Table,
5522    _n_cols: usize,
5523) -> Result<(), StorageError> {
5524    let row_count = cur.read_u32()? as usize;
5525    // v4.39: PV has no `reserve` (the BVT doesn't preallocate a
5526    // contiguous buffer); we just push directly and let the trie
5527    // grow. v5.1: row decode reuses `decode_row_body_dense` so the
5528    // catalog and cold-tier segments share one row codec.
5529    let mut hot_bytes: u64 = 0;
5530    for _ in 0..row_count {
5531        let tail = &cur.buf[cur.pos..];
5532        let (row, consumed) = decode_row_body_dense(tail, &t.schema)?;
5533        cur.pos += consumed;
5534        // v5.2.1: account for hot bytes as we go; the snapshot's row
5535        // block bytes are exactly what `encode_row_body_dense` would
5536        // produce, so `consumed` would do too — but going via the
5537        // helper keeps the counter's definition coupled to the
5538        // encoder rather than the snapshot's row prefix layout.
5539        hot_bytes = hot_bytes.saturating_add(row_body_encoded_len(&row, &t.schema) as u64);
5540        t.rows.push_mut(row);
5541    }
5542    t.hot_bytes = hot_bytes;
5543    Ok(())
5544}
5545
5546fn deserialize_indices(
5547    cur: &mut Cursor<'_>,
5548    t: &mut Table,
5549    version: u8,
5550) -> Result<(), StorageError> {
5551    let index_count = cur.read_u16()? as usize;
5552    for _ in 0..index_count {
5553        let idx_name = cur.read_str()?;
5554        let col_pos = cur.read_u16()? as usize;
5555        let column_name = t
5556            .schema
5557            .columns
5558            .get(col_pos)
5559            .ok_or_else(|| {
5560                StorageError::Corrupt(format!(
5561                    "index {idx_name:?} points at non-existent column position {col_pos}"
5562                ))
5563            })?
5564            .name
5565            .clone();
5566        let kind_tag = cur.read_u8()?;
5567        match kind_tag {
5568            0 => {
5569                if version >= 9 {
5570                    // v9+: BTree entries serialised inline (tag-prefixed
5571                    // locator codec). Restore the map directly so any
5572                    // freezer-produced Cold locators come back exactly
5573                    // as they went out.
5574                    let map = read_btree_map(cur)?;
5575                    t.restore_btree_index(idx_name, &column_name, map)?;
5576                } else {
5577                    // v8: no entries on disk; rebuild from rows. Every
5578                    // entry is materialised as `RowLocator::Hot(i)` —
5579                    // semantically identical to the v5.1 in-memory state
5580                    // since v8 catalogs never produced Cold locators.
5581                    t.add_index(idx_name, &column_name)?;
5582                }
5583            }
5584            1 => {
5585                let m = cur.read_u16()? as usize;
5586                let graph = cur.read_nsw_graph(m)?;
5587                t.restore_nsw_index(idx_name, &column_name, graph)?;
5588            }
5589            2 => {
5590                // v6.7.1 — BRIN tag. Payload is the column type
5591                // tag. No further data — summaries live in cold
5592                // segments.
5593                let column_type = cur.read_data_type()?;
5594                t.restore_brin_index(idx_name, &column_name, column_type)?;
5595            }
5596            3 => {
5597                // v7.12.3 — GIN tag. Payload mirrors the BTree
5598                // encoding but with String (lexeme word) keys.
5599                // Only emitted by FILE_VERSION 21+ writers — v20
5600                // and earlier degraded `USING gin` to BTree.
5601                let map = read_gin_map(cur)?;
5602                t.restore_gin_index(idx_name, &column_name, map)?;
5603            }
5604            4 => {
5605                // v7.15.0 — trigram-GIN tag (`gin_trgm_ops`).
5606                // Same payload shape as tag 3 (String → posting
5607                // list); only emitted by FILE_VERSION 24+ writers.
5608                if version < 24 {
5609                    return Err(StorageError::Corrupt(format!(
5610                        "trigram-GIN index tag 4 found in catalog FILE_VERSION {version}; \
5611                         FILE_VERSION 24+ required (v7.15.0 introduced this tag)"
5612                    )));
5613                }
5614                let map = read_gin_map(cur)?;
5615                t.restore_gin_trgm_index(idx_name, &column_name, map)?;
5616            }
5617            other => {
5618                return Err(StorageError::Corrupt(format!(
5619                    "unknown index kind tag: {other}"
5620                )));
5621            }
5622        }
5623        // v6.8.0 — included_columns appendix per index. v11- snapshots
5624        // stop before this u16; v12+ always carries it (possibly 0).
5625        if version >= 12 {
5626            let num_included = cur.read_u16()? as usize;
5627            if num_included > 0 {
5628                let mut included: Vec<usize> = Vec::with_capacity(num_included);
5629                for _ in 0..num_included {
5630                    let cp = cur.read_u16()? as usize;
5631                    if cp >= t.schema.columns.len() {
5632                        return Err(StorageError::Corrupt(format!(
5633                            "INCLUDE column position {cp} out of range \
5634                             ({} schema columns)",
5635                            t.schema.columns.len()
5636                        )));
5637                    }
5638                    included.push(cp);
5639                }
5640                if let Some(last) = t.indices.last_mut() {
5641                    last.included_columns = included;
5642                }
5643            }
5644            // v6.8.1 — partial_predicate appendix.
5645            match cur.read_u8()? {
5646                0 => {}
5647                1 => {
5648                    let pred = cur.read_str()?;
5649                    if let Some(last) = t.indices.last_mut() {
5650                        last.partial_predicate = Some(pred);
5651                    }
5652                }
5653                other => {
5654                    return Err(StorageError::Corrupt(format!(
5655                        "partial_predicate tag: unknown byte {other}"
5656                    )));
5657                }
5658            }
5659            // v6.8.2 — expression appendix.
5660            match cur.read_u8()? {
5661                0 => {}
5662                1 => {
5663                    let expr = cur.read_str()?;
5664                    if let Some(last) = t.indices.last_mut() {
5665                        last.expression = Some(expr);
5666                    }
5667                }
5668                other => {
5669                    return Err(StorageError::Corrupt(format!(
5670                        "expression tag: unknown byte {other}"
5671                    )));
5672                }
5673            }
5674            // v7.9.29 — is_unique appendix (FILE_VERSION 16+).
5675            // v15-and-below catalogs stop before this byte. mailrs K1.
5676            if version >= 16 {
5677                match cur.read_u8()? {
5678                    0 => {}
5679                    1 => {
5680                        if let Some(last) = t.indices.last_mut() {
5681                            last.is_unique = true;
5682                        }
5683                    }
5684                    other => {
5685                        return Err(StorageError::Corrupt(format!(
5686                            "is_unique tag: unknown byte {other}"
5687                        )));
5688                    }
5689                }
5690                // v7.9.29 — extra_column_positions appendix.
5691                let n = cur.read_u16()? as usize;
5692                if n > 0 {
5693                    let mut extras: Vec<usize> = Vec::with_capacity(n);
5694                    for _ in 0..n {
5695                        let cp = cur.read_u16()? as usize;
5696                        if cp >= t.schema.columns.len() {
5697                            return Err(StorageError::Corrupt(format!(
5698                                "extra column position {cp} out of range \
5699                                 ({} schema columns)",
5700                                t.schema.columns.len()
5701                            )));
5702                        }
5703                        extras.push(cp);
5704                    }
5705                    if let Some(last) = t.indices.last_mut() {
5706                        last.extra_column_positions = extras;
5707                    }
5708                }
5709            }
5710        }
5711    }
5712    Ok(())
5713}
5714
5715/// Parse a v9 `BTree` index payload — `[u32 entry_count]` followed by
5716/// `entry_count` `(IndexKey, Vec<RowLocator>)` pairs. The locator list
5717/// uses the v5.1 tag-prefixed wire format (`RowLocator::read_le`).
5718fn read_btree_map(
5719    cur: &mut Cursor<'_>,
5720) -> Result<PersistentBTreeMap<IndexKey, Vec<RowLocator>>, StorageError> {
5721    let entry_count = cur.read_u32()? as usize;
5722    let mut map = PersistentBTreeMap::new();
5723    for _ in 0..entry_count {
5724        let key = cur.read_index_key()?;
5725        let locator_count = cur.read_u32()? as usize;
5726        let mut locators = Vec::with_capacity(locator_count);
5727        for _ in 0..locator_count {
5728            let tail = &cur.buf[cur.pos..];
5729            let (loc, consumed) = RowLocator::read_le(tail).map_err(|e| {
5730                StorageError::Corrupt(format!("row_locator decode at offset {}: {e}", cur.pos))
5731            })?;
5732            cur.pos += consumed;
5733            locators.push(loc);
5734        }
5735        map.insert_mut(key, locators);
5736    }
5737    Ok(map)
5738}
5739
5740/// v7.12.3 — parse a `Gin` index payload. Mirrors [`read_btree_map`]
5741/// but with `String` (lexeme word) keys instead of `IndexKey`.
5742/// FILE_VERSION 21+ only.
5743fn read_gin_map(
5744    cur: &mut Cursor<'_>,
5745) -> Result<PersistentBTreeMap<String, Vec<RowLocator>>, StorageError> {
5746    let entry_count = cur.read_u32()? as usize;
5747    let mut map = PersistentBTreeMap::new();
5748    for _ in 0..entry_count {
5749        let word = cur.read_str()?;
5750        let locator_count = cur.read_u32()? as usize;
5751        let mut locators = Vec::with_capacity(locator_count);
5752        for _ in 0..locator_count {
5753            let tail = &cur.buf[cur.pos..];
5754            let (loc, consumed) = RowLocator::read_le(tail).map_err(|e| {
5755                StorageError::Corrupt(format!("row_locator decode at offset {}: {e}", cur.pos))
5756            })?;
5757            cur.pos += consumed;
5758            locators.push(loc);
5759        }
5760        map.insert_mut(word, locators);
5761    }
5762    Ok(map)
5763}
5764
5765// --- low-level binary helpers ---------------------------------------------
5766
5767/// Write a `DataType` as a tag byte + optional payload (Vector carries its
5768/// `u32` dimension). Inverse: [`read_data_type`].
5769/// Serialize an HNSW graph after the `[kind=1][u16 M]` header (v7).
5770/// Layout:
5771/// - `[u16 m_max_0]`
5772/// - `[entry u32]` — `u32::MAX` means `None`, else the entry node index
5773/// - `[u8 entry_level]`
5774/// - `[node_count u32]`
5775/// - for each node: `[u8 level]`  (top layer for this node)
5776/// - `[layer_count u8]`
5777/// - for each layer `0..layer_count`:
5778///     - `[u32 layer_node_count]` (== `node_count`; per-layer slot)
5779///     - for each node: `[u16 neighbor_count] [u32 neighbor]*`
5780fn write_nsw_graph(out: &mut Vec<u8>, g: &NswGraph) {
5781    let entry = g.entry.map_or(u32::MAX, |e| {
5782        u32::try_from(e).expect("NSW entry fits in u32")
5783    });
5784    write_u16(
5785        out,
5786        u16::try_from(g.m_max_0).expect("HNSW m_max_0 fits in u16"),
5787    );
5788    out.extend_from_slice(&entry.to_le_bytes());
5789    out.push(g.entry_level);
5790    let node_count = g.levels.len();
5791    write_u32(
5792        out,
5793        u32::try_from(node_count).expect("HNSW node count fits in u32"),
5794    );
5795    for &lvl in &g.levels {
5796        out.push(lvl);
5797    }
5798    let layer_count = u8::try_from(g.layers.len()).expect("HNSW layer count ≤ 255");
5799    out.push(layer_count);
5800    for layer in &g.layers {
5801        write_u32(
5802            out,
5803            u32::try_from(layer.len()).expect("HNSW per-layer node count fits in u32"),
5804        );
5805        for neighbors in layer {
5806            write_u16(
5807                out,
5808                u16::try_from(neighbors.len()).expect("HNSW neighbour list fits in u16"),
5809            );
5810            // v6.1.x: neighbour slot is already u32 in memory; just
5811            // emit the raw bytes. (v6.0 stored usize and converted
5812            // here.)
5813            for &peer in neighbors {
5814                write_u32(out, peer);
5815            }
5816        }
5817    }
5818}
5819
5820fn write_data_type(out: &mut Vec<u8>, t: DataType) {
5821    match t {
5822        DataType::Int => out.push(1),
5823        DataType::BigInt => out.push(2),
5824        DataType::Float => out.push(3),
5825        DataType::Text => out.push(4),
5826        DataType::Bool => out.push(5),
5827        DataType::Vector { dim, encoding } => match encoding {
5828            // Tag 6: pre-v6 F32 vector. Layout unchanged; pre-v6
5829            // binaries continue to deserialise this exactly as
5830            // before.
5831            VecEncoding::F32 => {
5832                out.push(6);
5833                out.extend_from_slice(&dim.to_le_bytes());
5834            }
5835            // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
5836            // forward-compat fence story as SQ8 below.
5837            VecEncoding::F16 => {
5838                out.push(15);
5839                out.extend_from_slice(&dim.to_le_bytes());
5840            }
5841            // v6.0.1: new tag 14 for `VECTOR(N) USING SQ8` column
5842            // type. Pre-v6 readers fall through `read_data_type`'s
5843            // catch-all and surface `Corrupt("unknown data type tag")`
5844            // — the explicit forward-compat fence called out in
5845            // V6_DESIGN deliberation #5.
5846            VecEncoding::Sq8 => {
5847                out.push(14);
5848                out.extend_from_slice(&dim.to_le_bytes());
5849            }
5850        },
5851        DataType::SmallInt => out.push(7),
5852        DataType::Varchar(max) => {
5853            out.push(8);
5854            out.extend_from_slice(&max.to_le_bytes());
5855        }
5856        DataType::Char(size) => {
5857            out.push(9);
5858            out.extend_from_slice(&size.to_le_bytes());
5859        }
5860        DataType::Numeric { precision, scale } => {
5861            out.push(10);
5862            out.push(precision);
5863            out.push(scale);
5864        }
5865        DataType::Date => out.push(11),
5866        DataType::Timestamp => out.push(12),
5867        // v7.9.2 — tag 17 for TIMESTAMPTZ. Body = i64 microseconds
5868        // UTC, identical to tag 12. Only the schema-side type tag
5869        // differs (for wire OID advertisement).
5870        DataType::Timestamptz => out.push(17),
5871        // INTERVAL is runtime-only — CREATE TABLE never produces a
5872        // column with this type, so write_data_type must not be called
5873        // on it. (Disk-format codepoint reserved for a future v3 where
5874        // INTERVAL becomes storable.)
5875        DataType::Interval => {
5876            unreachable!("DataType::Interval has no on-disk encoding in v2.11")
5877        }
5878        DataType::Json => out.push(13),
5879        // v7.9.0: tag 16 for `JSONB`. Same on-disk layout as
5880        // tag 13 — only the wire OID differs.
5881        DataType::Jsonb => out.push(16),
5882        // v7.10.4: tag 18 for `BYTEA`. Body = [u16 len][bytes].
5883        DataType::Bytes => out.push(18),
5884        // v7.10.9: tag 19 for `TEXT[]`. Body = [u16 count][per
5885        // element: u8 null + (if non-null) u16 len + utf-8].
5886        DataType::TextArray => out.push(19),
5887        // v7.11.12: tag 20 for `INT[]`. Body = [u16 count][per
5888        // element: u8 null + (if non-null) i32 LE].
5889        DataType::IntArray => out.push(20),
5890        // v7.11.12: tag 21 for `BIGINT[]`. Body = [u16 count][per
5891        // element: u8 null + (if non-null) i64 LE].
5892        DataType::BigIntArray => out.push(21),
5893        // v7.12.0: tag 22 for `tsvector`. No body — type identity
5894        // alone. Catalog FILE_VERSION 20+.
5895        DataType::TsVector => out.push(22),
5896        // v7.12.0: tag 23 for `tsquery`. No body. Catalog
5897        // FILE_VERSION 20+.
5898        DataType::TsQuery => out.push(23),
5899    }
5900}
5901
5902impl Cursor<'_> {
5903    fn read_data_type(&mut self) -> Result<DataType, StorageError> {
5904        let tag = self.read_u8()?;
5905        match tag {
5906            1 => Ok(DataType::Int),
5907            2 => Ok(DataType::BigInt),
5908            3 => Ok(DataType::Float),
5909            4 => Ok(DataType::Text),
5910            5 => Ok(DataType::Bool),
5911            6 => Ok(DataType::Vector {
5912                dim: self.read_u32()?,
5913                encoding: VecEncoding::F32,
5914            }),
5915            7 => Ok(DataType::SmallInt),
5916            8 => Ok(DataType::Varchar(self.read_u32()?)),
5917            9 => Ok(DataType::Char(self.read_u32()?)),
5918            10 => {
5919                let precision = self.read_u8()?;
5920                let scale = self.read_u8()?;
5921                Ok(DataType::Numeric { precision, scale })
5922            }
5923            11 => Ok(DataType::Date),
5924            12 => Ok(DataType::Timestamp),
5925            13 => Ok(DataType::Json),
5926            14 => Ok(DataType::Vector {
5927                dim: self.read_u32()?,
5928                encoding: VecEncoding::Sq8,
5929            }),
5930            // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
5931            // [u32 dim] type-tag payload as F32 / SQ8; the encoding
5932            // lives in the tag byte itself.
5933            15 => Ok(DataType::Vector {
5934                dim: self.read_u32()?,
5935                encoding: VecEncoding::F16,
5936            }),
5937            // v7.9.0: tag 16 for `JSONB`. Storage shape == Json;
5938            // we only carry the type tag so the wire layer can
5939            // emit PG OID 3802 instead of 114.
5940            16 => Ok(DataType::Jsonb),
5941            // v7.9.2: tag 17 for `TIMESTAMPTZ`. Storage shape ==
5942            // Timestamp (i64 microseconds UTC); only the wire OID
5943            // (1184) differs.
5944            17 => Ok(DataType::Timestamptz),
5945            // v7.10.4: tag 18 for `BYTEA`. Catalog FILE_VERSION 17+.
5946            18 => Ok(DataType::Bytes),
5947            // v7.10.9: tag 19 for `TEXT[]`. Catalog FILE_VERSION 18+.
5948            19 => Ok(DataType::TextArray),
5949            // v7.11.12: tags 20/21 for INT[]/BIGINT[]. FILE_VERSION 19+.
5950            20 => Ok(DataType::IntArray),
5951            21 => Ok(DataType::BigIntArray),
5952            // v7.12.0: tags 22/23 for tsvector / tsquery. Catalog
5953            // FILE_VERSION 20+.
5954            22 => Ok(DataType::TsVector),
5955            23 => Ok(DataType::TsQuery),
5956            other => Err(StorageError::Corrupt(format!(
5957                "unknown data type tag: {other}"
5958            ))),
5959        }
5960    }
5961}
5962
5963/// Fast computation of the byte length [`encode_row_body_dense`]
5964/// would produce, without allocating the output buffer. Mirrors the
5965/// encoder's per-column body sizing so the v5.2.1 `Table::hot_bytes`
5966/// incremental counter doesn't pay an alloc-per-insert tax. Returns
5967/// the exact same `usize` as `encode_row_body_dense(row, schema).len()`.
5968pub fn row_body_encoded_len(row: &Row, schema: &TableSchema) -> usize {
5969    debug_assert_eq!(
5970        row.values.len(),
5971        schema.columns.len(),
5972        "row_body_encoded_len: row arity must match schema"
5973    );
5974    let bitmap_bytes = schema.columns.len().div_ceil(8);
5975    let mut n = bitmap_bytes;
5976    for (col_idx, v) in row.values.iter().enumerate() {
5977        if matches!(v, Value::Null) {
5978            continue;
5979        }
5980        n += value_body_encoded_len(v, schema.columns[col_idx].ty);
5981    }
5982    n
5983}
5984
5985/// Byte length a single cell consumes when written by
5986/// `write_value_body`. Used by [`row_body_encoded_len`]; kept in
5987/// lock-step with the encoder. The `_ty` slot is reserved for future
5988/// type-dependent encodings — every variant currently writes a fixed
5989/// body shape regardless of the declared column type.
5990fn value_body_encoded_len(v: &Value, _ty: DataType) -> usize {
5991    match v {
5992        Value::SmallInt(_) => 2,
5993        // 4-byte body: i32 / Date.
5994        Value::Int(_) | Value::Date(_) => 4,
5995        // 8-byte body: i64 / f64 / Timestamp.
5996        Value::BigInt(_) | Value::Float(_) | Value::Timestamp(_) => 8,
5997        Value::Bool(_) => 1,
5998        // Text/Varchar/Char/Json share the [u16 len][utf-8] layout.
5999        Value::Text(s) | Value::Json(s) => 2 + s.len(),
6000        // [u32 dim][f32 * dim]
6001        Value::Vector(vec) => 4 + 4 * vec.len(),
6002        // v6.0.1: SQ8 cell on-disk shape — [u32 dim][f32 min]
6003        // [f32 max][u8 * dim] = 12 + dim bytes. `hot_bytes`
6004        // tracking on `Table::insert` calls this every row, so
6005        // returning the real size now (even though the actual
6006        // `write_value_body` writer lands in step 6) keeps the
6007        // sizing arithmetic honest for in-memory benches.
6008        Value::Sq8Vector(q) => 4 + 4 + 4 + q.bytes.len(),
6009        // v6.0.3: halfvec on-disk shape — [u32 dim][u16 LE * dim]
6010        // = 4 + 2 * dim bytes.
6011        Value::HalfVector(h) => 4 + h.bytes.len(),
6012        // [i128 scaled][u8 scale]
6013        Value::Numeric { .. } => 16 + 1,
6014        // v7.10.4: BYTEA on-disk shape mirrors Text — [u16 len][bytes].
6015        // The 16-bit length cap is the same TEXT/JSON limit (~65 KB);
6016        // larger blobs need toast-style chunking which is a v7.11
6017        // carve-out (kept aligned with TEXT for now so the catalog
6018        // snapshot stays simple).
6019        Value::Bytes(b) => 2 + b.len(),
6020        // v7.10.9: TEXT[] on-disk shape — [u16 count][per element:
6021        // u8 null flag + (when non-null) u16 len + utf-8 bytes].
6022        Value::TextArray(items) => {
6023            let mut n = 2; // count prefix
6024            for item in items {
6025                n += 1; // null flag
6026                if let Some(s) = item {
6027                    n += 2 + s.len();
6028                }
6029            }
6030            n
6031        }
6032        // v7.11.12: INT[] / BIGINT[] — [u16 count][per element:
6033        // u8 null + (when non-null) fixed-width LE].
6034        Value::IntArray(items) => {
6035            2 + items
6036                .iter()
6037                .map(|x| if x.is_some() { 5 } else { 1 })
6038                .sum::<usize>()
6039        }
6040        Value::BigIntArray(items) => {
6041            2 + items
6042                .iter()
6043                .map(|x| if x.is_some() { 9 } else { 1 })
6044                .sum::<usize>()
6045        }
6046        // v7.12.0: tsvector dense body — [u16 lexeme_count][per
6047        // lex: u16 word_len + utf-8 word + u16 pos_count + (u16
6048        // LE * pos_count) + u8 weight].
6049        Value::TsVector(lexs) => {
6050            let mut n = 2;
6051            for l in lexs {
6052                n += 2 + l.word.len() + 2 + 2 * l.positions.len() + 1;
6053            }
6054            n
6055        }
6056        // v7.12.0: tsquery dense body — prefix-coded tree.
6057        // Sizing must match `write_tsquery_body` walker.
6058        Value::TsQuery(ast) => tsquery_encoded_len(ast),
6059        // NULL is encoded only in the bitmap, never in the body.
6060        Value::Null => 0,
6061        // INTERVAL has no on-disk encoding (see write_value_body).
6062        Value::Interval { .. } => {
6063            unreachable!("Value::Interval has no on-disk encoding")
6064        }
6065    }
6066}
6067
6068/// Encode one row's body in the v3.0.2 dense format (`FILE_VERSION`
6069/// 8): per-row NULL bitmap (1 bit/col, ceil(cols/8) bytes), then
6070/// each non-NULL cell as `write_value_body`. Same wire shape the
6071/// catalog snapshot writes per row inside its rows-block. Exposed
6072/// pub so v5.1+ cold-tier segment writers can produce row payloads
6073/// that the catalog [`decode_row_body_dense`] decodes 1:1.
6074///
6075/// `row.values.len()` must equal `schema.columns.len()` — the row
6076/// is expected to have been validated by `Table::insert` (the
6077/// engine's INSERT path) before reaching this function.
6078pub fn encode_row_body_dense(row: &Row, schema: &TableSchema) -> Vec<u8> {
6079    debug_assert_eq!(
6080        row.values.len(),
6081        schema.columns.len(),
6082        "dense encode: row arity must match schema"
6083    );
6084    let bitmap_bytes = schema.columns.len().div_ceil(8);
6085    // 8 B per fixed-width cell is a reasonable average; the buffer
6086    // grows past this for variable-width Text/Vector cells.
6087    let mut out = Vec::with_capacity(bitmap_bytes + schema.columns.len() * 8);
6088    let bitmap_offset = out.len();
6089    out.resize(bitmap_offset + bitmap_bytes, 0);
6090    for (i, v) in row.values.iter().enumerate() {
6091        if matches!(v, Value::Null) {
6092            out[bitmap_offset + i / 8] |= 1 << (i % 8);
6093        }
6094    }
6095    for (col_idx, v) in row.values.iter().enumerate() {
6096        if matches!(v, Value::Null) {
6097            continue;
6098        }
6099        write_value_body(&mut out, v, schema.columns[col_idx].ty);
6100    }
6101    out
6102}
6103
6104/// Inverse of [`encode_row_body_dense`]. Reads one row's body from
6105/// `bytes` and returns it plus the number of bytes consumed (so a
6106/// caller decoding a back-to-back stream of rows can advance its
6107/// cursor). Returns `StorageError::Corrupt` on truncation, bad
6108/// UTF-8, or unknown cell tags.
6109pub fn decode_row_body_dense(
6110    bytes: &[u8],
6111    schema: &TableSchema,
6112) -> Result<(Row, usize), StorageError> {
6113    let mut cur = Cursor::new(bytes);
6114    let bitmap_bytes = schema.columns.len().div_ceil(8);
6115    let mut bitmap_buf = [0u8; 32];
6116    if bitmap_bytes > bitmap_buf.len() {
6117        return Err(StorageError::Corrupt(format!(
6118            "row NULL bitmap {bitmap_bytes} B exceeds 32 B cap"
6119        )));
6120    }
6121    let slice = cur.take(bitmap_bytes)?;
6122    bitmap_buf[..bitmap_bytes].copy_from_slice(slice);
6123    let mut values = Vec::with_capacity(schema.columns.len());
6124    for (col_idx, col) in schema.columns.iter().enumerate() {
6125        if (bitmap_buf[col_idx / 8] >> (col_idx % 8)) & 1 == 1 {
6126            values.push(Value::Null);
6127        } else {
6128            values.push(cur.read_value_body(col.ty)?);
6129        }
6130    }
6131    Ok((Row { values }, cur.pos))
6132}
6133
6134/// Schema-driven dense value encoding (`FILE_VERSION` 8). Caller already
6135/// knows the column type and has decided this cell is non-NULL, so we
6136/// skip the per-cell type tag the v7 `write_value` was writing. NULL
6137/// is encoded via the per-row bitmap before this function runs, never
6138/// reaches here. Used only inside the row-encoding hot loop; the
6139/// schema-default path still goes through the legacy `write_value` so
6140/// DEFAULT values keep their self-describing tag and remain decodable
6141/// without consulting a column type.
6142fn write_value_body(out: &mut Vec<u8>, v: &Value, ty: DataType) {
6143    match (v, ty) {
6144        (Value::SmallInt(n), DataType::SmallInt) => out.extend_from_slice(&n.to_le_bytes()),
6145        (Value::Int(n), DataType::Int) => out.extend_from_slice(&n.to_le_bytes()),
6146        (Value::BigInt(n), DataType::BigInt) => out.extend_from_slice(&n.to_le_bytes()),
6147        (Value::Float(x), DataType::Float) => out.extend_from_slice(&x.to_le_bytes()),
6148        (Value::Bool(b), DataType::Bool) => out.push(u8::from(*b)),
6149        (Value::Text(s), DataType::Text | DataType::Varchar(_) | DataType::Char(_)) => {
6150            write_str(out, s);
6151        }
6152        (
6153            Value::Vector(v),
6154            DataType::Vector {
6155                encoding: VecEncoding::F32,
6156                ..
6157            },
6158        ) => {
6159            let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
6160            out.extend_from_slice(&dim.to_le_bytes());
6161            for x in v {
6162                out.extend_from_slice(&x.to_le_bytes());
6163            }
6164        }
6165        // v6.0.1: SQ8 dense body — [u32 dim][f32 min][f32 max]
6166        // [u8 * dim]. Self-describes its length so v6 readers
6167        // walking rows of a v6 catalog stay aligned even if the
6168        // declared column dim drifts (defensive, not normally
6169        // possible since CREATE TABLE pins the dim).
6170        (
6171            Value::Sq8Vector(q),
6172            DataType::Vector {
6173                encoding: VecEncoding::Sq8,
6174                ..
6175            },
6176        ) => {
6177            let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
6178            out.extend_from_slice(&dim.to_le_bytes());
6179            out.extend_from_slice(&q.min.to_le_bytes());
6180            out.extend_from_slice(&q.max.to_le_bytes());
6181            out.extend_from_slice(&q.bytes);
6182        }
6183        // v6.0.3: halfvec dense body — [u32 dim][u16 LE * dim].
6184        // The raw u16 bytes already live in `h.bytes` little-
6185        // endian, so we just splat them.
6186        (
6187            Value::HalfVector(h),
6188            DataType::Vector {
6189                encoding: VecEncoding::F16,
6190                ..
6191            },
6192        ) => {
6193            let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
6194            out.extend_from_slice(&dim.to_le_bytes());
6195            out.extend_from_slice(&h.bytes);
6196        }
6197        (Value::Numeric { scaled, .. }, DataType::Numeric { scale, .. }) => {
6198            out.extend_from_slice(&scaled.to_le_bytes());
6199            out.push(scale);
6200        }
6201        (Value::Date(d), DataType::Date) => out.extend_from_slice(&d.to_le_bytes()),
6202        (Value::Timestamp(t), DataType::Timestamp | DataType::Timestamptz) => {
6203            out.extend_from_slice(&t.to_le_bytes())
6204        }
6205        // v4.9: JSON stores as length-prefixed text; same shape as
6206        // Text — the type tag lives in the column schema, not the
6207        // per-cell body.
6208        (Value::Json(s), DataType::Json | DataType::Jsonb) => write_str(out, s),
6209        // v7.10.4: BYTEA shares the [u16 len][bytes] shape with
6210        // Text but writes raw bytes (no UTF-8 invariant).
6211        (Value::Bytes(b), DataType::Bytes) => {
6212            let len = u16::try_from(b.len()).expect("BYTEA cell ≤ 64 KiB");
6213            out.extend_from_slice(&len.to_le_bytes());
6214            out.extend_from_slice(b);
6215        }
6216        // v7.10.9: TEXT[] dense body — [u16 count][per element:
6217        // u8 null flag + (when non-null) u16 len + utf-8 bytes].
6218        (Value::TextArray(items), DataType::TextArray) => {
6219            let count = u16::try_from(items.len()).expect("TEXT[] ≤ 65k elements");
6220            out.extend_from_slice(&count.to_le_bytes());
6221            for item in items {
6222                match item {
6223                    None => out.push(1),
6224                    Some(s) => {
6225                        out.push(0);
6226                        let len = u16::try_from(s.len()).expect("TEXT[] element ≤ 64 KiB");
6227                        out.extend_from_slice(&len.to_le_bytes());
6228                        out.extend_from_slice(s.as_bytes());
6229                    }
6230                }
6231            }
6232        }
6233        // v7.11.12: INT[] dense body — [u16 count][per element:
6234        // u8 null + (when non-null) i32 LE].
6235        (Value::IntArray(items), DataType::IntArray) => {
6236            let count = u16::try_from(items.len()).expect("INT[] ≤ 65k elements");
6237            out.extend_from_slice(&count.to_le_bytes());
6238            for item in items {
6239                match item {
6240                    None => out.push(1),
6241                    Some(n) => {
6242                        out.push(0);
6243                        out.extend_from_slice(&n.to_le_bytes());
6244                    }
6245                }
6246            }
6247        }
6248        // v7.11.12: BIGINT[] dense body — [u16 count][per element:
6249        // u8 null + (when non-null) i64 LE].
6250        (Value::BigIntArray(items), DataType::BigIntArray) => {
6251            let count = u16::try_from(items.len()).expect("BIGINT[] ≤ 65k elements");
6252            out.extend_from_slice(&count.to_le_bytes());
6253            for item in items {
6254                match item {
6255                    None => out.push(1),
6256                    Some(n) => {
6257                        out.push(0);
6258                        out.extend_from_slice(&n.to_le_bytes());
6259                    }
6260                }
6261            }
6262        }
6263        // v7.12.0: tsvector dense body — see `value_body_encoded_len`
6264        // for layout. Lexemes are written in their already-sorted order.
6265        (Value::TsVector(lexs), DataType::TsVector) => write_tsvector_body(out, lexs),
6266        // v7.12.0: tsquery dense body — prefix-coded tree.
6267        (Value::TsQuery(ast), DataType::TsQuery) => write_tsquery_body(out, ast),
6268        // Type mismatch shouldn't happen — `Table::insert` validates
6269        // value type against column type before pushing. Treat as a
6270        // bug, not a runtime error.
6271        (other, ty) => unreachable!(
6272            "schema-driven encode received mismatched value/type pair: \
6273             value tag={:?}, column type={:?}",
6274            other.data_type(),
6275            ty
6276        ),
6277    }
6278}
6279
6280fn write_value(out: &mut Vec<u8>, v: &Value) {
6281    match v {
6282        Value::Null => out.push(0),
6283        Value::SmallInt(n) => {
6284            out.push(7);
6285            out.extend_from_slice(&n.to_le_bytes());
6286        }
6287        Value::Int(n) => {
6288            out.push(1);
6289            out.extend_from_slice(&n.to_le_bytes());
6290        }
6291        Value::BigInt(n) => {
6292            out.push(2);
6293            out.extend_from_slice(&n.to_le_bytes());
6294        }
6295        Value::Float(x) => {
6296            out.push(3);
6297            out.extend_from_slice(&x.to_le_bytes());
6298        }
6299        // v4.9: JSON shares the tag-4 (Text) on-disk encoding —
6300        // schema decides which variant comes back on read. The
6301        // bodies are byte-identical so collapsing the match keeps
6302        // clippy::match_same_arms quiet.
6303        Value::Text(s) | Value::Json(s) => {
6304            out.push(4);
6305            write_str(out, s);
6306        }
6307        Value::Bool(b) => {
6308            out.push(5);
6309            out.push(u8::from(*b));
6310        }
6311        Value::Vector(v) => {
6312            out.push(6);
6313            let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
6314            out.extend_from_slice(&dim.to_le_bytes());
6315            for x in v {
6316                out.extend_from_slice(&x.to_le_bytes());
6317            }
6318        }
6319        // v6.0.1: new tag 11 for an SQ8 cell carried with its full
6320        // header. Layout matches the dense row body shape so a
6321        // round-trip through write_value → read_value bit-equals
6322        // the original `Value::Sq8Vector`.
6323        Value::Sq8Vector(q) => {
6324            out.push(11);
6325            let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
6326            out.extend_from_slice(&dim.to_le_bytes());
6327            out.extend_from_slice(&q.min.to_le_bytes());
6328            out.extend_from_slice(&q.max.to_le_bytes());
6329            out.extend_from_slice(&q.bytes);
6330        }
6331        // v6.0.3: tag 12 for a HalfVector cell.
6332        // Layout: `[u32 dim][u16 LE × dim]` — bit-identical to the
6333        // dense row body so `write_value` / `read_value` bit-equal
6334        // the original `Value::HalfVector`.
6335        Value::HalfVector(h) => {
6336            out.push(12);
6337            let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
6338            out.extend_from_slice(&dim.to_le_bytes());
6339            out.extend_from_slice(&h.bytes);
6340        }
6341        Value::Numeric { scaled, scale } => {
6342            out.push(8);
6343            out.extend_from_slice(&scaled.to_le_bytes());
6344            out.push(*scale);
6345        }
6346        Value::Date(d) => {
6347            out.push(9);
6348            out.extend_from_slice(&d.to_le_bytes());
6349        }
6350        Value::Timestamp(t) => {
6351            out.push(10);
6352            out.extend_from_slice(&t.to_le_bytes());
6353        }
6354        // Interval is a runtime-only value (no on-disk representation in
6355        // v2.11). CREATE TABLE rejects `DataType::Interval` columns, so a
6356        // Value::Interval here would mean the engine bypassed that gate.
6357        Value::Interval { .. } => {
6358            unreachable!(
6359                "Value::Interval has no on-disk encoding; engine must reject it before write"
6360            )
6361        }
6362        // v7.10.4: BYTEA — [u8 tag=13_b][u16 len][bytes]. Tag
6363        // distinct from Text (4) so the schema-agnostic
6364        // read_value path can disambiguate. (Tag 11 is taken by
6365        // the WAL `auto_commit_sql` shape elsewhere, hence 14.)
6366        Value::Bytes(b) => {
6367            out.push(14);
6368            let len = u16::try_from(b.len()).expect("BYTEA value ≤ 64 KiB");
6369            out.extend_from_slice(&len.to_le_bytes());
6370            out.extend_from_slice(b);
6371        }
6372        // v7.10.9: TEXT[] — [u8 tag=15][u16 count][per elem: u8
6373        // null + (if non-null) u16 len + utf-8 bytes].
6374        Value::TextArray(items) => {
6375            out.push(15);
6376            let count = u16::try_from(items.len()).expect("TEXT[] ≤ 65k elements");
6377            out.extend_from_slice(&count.to_le_bytes());
6378            for item in items {
6379                match item {
6380                    None => out.push(1),
6381                    Some(s) => {
6382                        out.push(0);
6383                        let len = u16::try_from(s.len()).expect("TEXT[] element ≤ 64 KiB");
6384                        out.extend_from_slice(&len.to_le_bytes());
6385                        out.extend_from_slice(s.as_bytes());
6386                    }
6387                }
6388            }
6389        }
6390        // v7.11.12: INT[] — tag 16. [u16 count][per elem: u8 null +
6391        // (if non-null) i32 LE].
6392        Value::IntArray(items) => {
6393            out.push(16);
6394            let count = u16::try_from(items.len()).expect("INT[] ≤ 65k elements");
6395            out.extend_from_slice(&count.to_le_bytes());
6396            for item in items {
6397                match item {
6398                    None => out.push(1),
6399                    Some(n) => {
6400                        out.push(0);
6401                        out.extend_from_slice(&n.to_le_bytes());
6402                    }
6403                }
6404            }
6405        }
6406        // v7.11.12: BIGINT[] — tag 17. [u16 count][per elem: u8 null +
6407        // (if non-null) i64 LE].
6408        Value::BigIntArray(items) => {
6409            out.push(17);
6410            let count = u16::try_from(items.len()).expect("BIGINT[] ≤ 65k elements");
6411            out.extend_from_slice(&count.to_le_bytes());
6412            for item in items {
6413                match item {
6414                    None => out.push(1),
6415                    Some(n) => {
6416                        out.push(0);
6417                        out.extend_from_slice(&n.to_le_bytes());
6418                    }
6419                }
6420            }
6421        }
6422        // v7.12.0: tsvector — tag 18. Body shape matches
6423        // `write_tsvector_body`.
6424        Value::TsVector(lexs) => {
6425            out.push(18);
6426            write_tsvector_body(out, lexs);
6427        }
6428        // v7.12.0: tsquery — tag 19. Body shape matches
6429        // `write_tsquery_body`.
6430        Value::TsQuery(ast) => {
6431            out.push(19);
6432            write_tsquery_body(out, ast);
6433        }
6434    }
6435}
6436
6437/// v7.12.0: shared tsvector body writer (used by both dense and
6438/// schema-agnostic codecs).
6439fn write_tsvector_body(out: &mut Vec<u8>, lexs: &[TsLexeme]) {
6440    let count = u16::try_from(lexs.len()).expect("tsvector ≤ 65k lexemes");
6441    out.extend_from_slice(&count.to_le_bytes());
6442    for l in lexs {
6443        let wlen = u16::try_from(l.word.len()).expect("tsvector word ≤ 64 KiB");
6444        out.extend_from_slice(&wlen.to_le_bytes());
6445        out.extend_from_slice(l.word.as_bytes());
6446        let plen = u16::try_from(l.positions.len()).expect("tsvector pos count ≤ 65k");
6447        out.extend_from_slice(&plen.to_le_bytes());
6448        for p in &l.positions {
6449            out.extend_from_slice(&p.to_le_bytes());
6450        }
6451        out.push(l.weight);
6452    }
6453}
6454
6455/// v7.12.0: shared tsquery body writer. Prefix-coded tree: each
6456/// node starts with `[u8 tag]` then a tag-specific payload. Tags:
6457/// 0=Term, 1=And, 2=Or, 3=Not, 4=Phrase.
6458fn write_tsquery_body(out: &mut Vec<u8>, ast: &TsQueryAst) {
6459    match ast {
6460        TsQueryAst::Term { word, weight_mask } => {
6461            out.push(0);
6462            let len = u16::try_from(word.len()).expect("tsquery term ≤ 64 KiB");
6463            out.extend_from_slice(&len.to_le_bytes());
6464            out.extend_from_slice(word.as_bytes());
6465            out.push(*weight_mask);
6466        }
6467        TsQueryAst::And(a, b) => {
6468            out.push(1);
6469            write_tsquery_body(out, a);
6470            write_tsquery_body(out, b);
6471        }
6472        TsQueryAst::Or(a, b) => {
6473            out.push(2);
6474            write_tsquery_body(out, a);
6475            write_tsquery_body(out, b);
6476        }
6477        TsQueryAst::Not(x) => {
6478            out.push(3);
6479            write_tsquery_body(out, x);
6480        }
6481        TsQueryAst::Phrase {
6482            left,
6483            right,
6484            distance,
6485        } => {
6486            out.push(4);
6487            out.extend_from_slice(&distance.to_le_bytes());
6488            write_tsquery_body(out, left);
6489            write_tsquery_body(out, right);
6490        }
6491    }
6492}
6493
6494/// v7.12.0: byte length that `write_tsquery_body` would emit.
6495fn tsquery_encoded_len(ast: &TsQueryAst) -> usize {
6496    match ast {
6497        TsQueryAst::Term { word, .. } => 1 + 2 + word.len() + 1,
6498        TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
6499            1 + tsquery_encoded_len(a) + tsquery_encoded_len(b)
6500        }
6501        TsQueryAst::Not(x) => 1 + tsquery_encoded_len(x),
6502        TsQueryAst::Phrase { left, right, .. } => {
6503            1 + 2 + tsquery_encoded_len(left) + tsquery_encoded_len(right)
6504        }
6505    }
6506}
6507
/// Append `n` as two little-endian bytes.
fn write_u16(out: &mut Vec<u8>, n: u16) {
    out.extend(n.to_le_bytes());
}
/// Append `n` as four little-endian bytes.
fn write_u32(out: &mut Vec<u8>, n: u32) {
    out.extend(n.to_le_bytes());
}
/// Append `s` as `[u16 LE byte length][utf-8 bytes]`.
///
/// # Panics
/// Panics if `s` is longer than `u16::MAX` bytes.
fn write_str(out: &mut Vec<u8>, s: &str) {
    let payload = s.as_bytes();
    let prefix = u16::try_from(payload.len()).expect("identifier / text fits in u16");
    write_u16(out, prefix);
    out.extend_from_slice(payload);
}

/// v7.12.4 — long-string variant: `[u32 LE byte length][utf-8 bytes]`.
/// Meant for payloads that can plausibly exceed 64 KiB (notably PL/pgSQL
/// function bodies); identifiers and short text stay on the u16
/// [`write_str`] codec.
///
/// # Panics
/// Panics if `s` is longer than `u32::MAX` bytes.
fn write_str_long(out: &mut Vec<u8>, s: &str) {
    let payload = s.as_bytes();
    let prefix = u32::try_from(payload.len()).expect("function body fits in u32");
    write_u32(out, prefix);
    out.extend_from_slice(payload);
}
6529
6530/// Serialise an [`IndexKey`] using the v9 tagged codec. `read_index_key`
6531/// is the inverse. v8 catalogs never wrote index keys (`BTree` entries were
6532/// rebuilt from `Table::rows`), so this codec is v9+ only.
6533fn write_index_key(out: &mut Vec<u8>, key: &IndexKey) {
6534    match key {
6535        IndexKey::Int(n) => {
6536            out.push(INDEX_KEY_TAG_INT);
6537            out.extend_from_slice(&n.to_le_bytes());
6538        }
6539        IndexKey::Text(s) => {
6540            out.push(INDEX_KEY_TAG_TEXT);
6541            write_str(out, s);
6542        }
6543        IndexKey::Bool(b) => {
6544            out.push(INDEX_KEY_TAG_BOOL);
6545            out.push(u8::from(*b));
6546        }
6547    }
6548}
6549
/// Read cursor over a borrowed byte buffer, used by the decode helpers in
/// its `impl` block. Slices handed out by `take` borrow from the buffer
/// for `'a`, not from the cursor itself.
struct Cursor<'a> {
    /// The bytes being decoded.
    buf: &'a [u8],
    /// Offset into `buf` of the next unread byte; `take` moves it forward
    /// by the number of bytes consumed.
    pos: usize,
}
6554
6555impl<'a> Cursor<'a> {
6556    const fn new(buf: &'a [u8]) -> Self {
6557        Self { buf, pos: 0 }
6558    }
6559
6560    fn take(&mut self, n: usize) -> Result<&'a [u8], StorageError> {
6561        let end = self
6562            .pos
6563            .checked_add(n)
6564            .ok_or_else(|| StorageError::Corrupt(format!("length overflow taking {n} bytes")))?;
6565        if end > self.buf.len() {
6566            return Err(StorageError::Corrupt(format!(
6567                "unexpected EOF at offset {} (wanted {n} more bytes)",
6568                self.pos
6569            )));
6570        }
6571        let s = &self.buf[self.pos..end];
6572        self.pos = end;
6573        Ok(s)
6574    }
6575
6576    fn read_u8(&mut self) -> Result<u8, StorageError> {
6577        Ok(self.take(1)?[0])
6578    }
6579    fn read_u16(&mut self) -> Result<u16, StorageError> {
6580        let s = self.take(2)?;
6581        Ok(u16::from_le_bytes([s[0], s[1]]))
6582    }
6583    fn read_u32(&mut self) -> Result<u32, StorageError> {
6584        let s = self.take(4)?;
6585        Ok(u32::from_le_bytes([s[0], s[1], s[2], s[3]]))
6586    }
6587    fn read_i32(&mut self) -> Result<i32, StorageError> {
6588        let s = self.take(4)?;
6589        Ok(i32::from_le_bytes([s[0], s[1], s[2], s[3]]))
6590    }
6591    /// v6.7.2 — u64 LE read for the per-table `hot_tier_bytes`
6592    /// catalog appendix.
6593    fn read_u64(&mut self) -> Result<u64, StorageError> {
6594        let s = self.take(8)?;
6595        Ok(u64::from_le_bytes([
6596            s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
6597        ]))
6598    }
6599    fn read_i64(&mut self) -> Result<i64, StorageError> {
6600        let s = self.take(8)?;
6601        let arr: [u8; 8] = s.try_into().expect("checked");
6602        Ok(i64::from_le_bytes(arr))
6603    }
6604    fn read_f64(&mut self) -> Result<f64, StorageError> {
6605        let s = self.take(8)?;
6606        let arr: [u8; 8] = s.try_into().expect("checked");
6607        Ok(f64::from_le_bytes(arr))
6608    }
6609    fn read_f32(&mut self) -> Result<f32, StorageError> {
6610        let s = self.take(4)?;
6611        Ok(f32::from_le_bytes([s[0], s[1], s[2], s[3]]))
6612    }
6613    fn read_str(&mut self) -> Result<String, StorageError> {
6614        let len = self.read_u16()? as usize;
6615        let bytes = self.take(len)?;
6616        core::str::from_utf8(bytes)
6617            .map(String::from)
6618            .map_err(|_| StorageError::Corrupt("invalid UTF-8 in identifier or text".into()))
6619    }
6620
6621    /// v7.12.4 — long-string variant for payloads written via
6622    /// [`write_str_long`] (u32-length prefix). Used for PL/pgSQL
6623    /// function bodies which can plausibly exceed 64 KiB.
6624    fn read_str_long(&mut self) -> Result<String, StorageError> {
6625        let len = self.read_u32()? as usize;
6626        let bytes = self.take(len)?;
6627        core::str::from_utf8(bytes)
6628            .map(String::from)
6629            .map_err(|_| StorageError::Corrupt("invalid UTF-8 in long-string payload".into()))
6630    }
6631
6632    /// Parse an [`IndexKey`] emitted by `write_index_key` (v9 tagged
6633    /// codec). Returns `StorageError::Corrupt` on unknown tag or
6634    /// truncated payload.
6635    fn read_index_key(&mut self) -> Result<IndexKey, StorageError> {
6636        let tag = self.read_u8()?;
6637        match tag {
6638            INDEX_KEY_TAG_INT => Ok(IndexKey::Int(self.read_i64()?)),
6639            INDEX_KEY_TAG_TEXT => Ok(IndexKey::Text(self.read_str()?)),
6640            INDEX_KEY_TAG_BOOL => Ok(IndexKey::Bool(self.read_u8()? != 0)),
6641            other => Err(StorageError::Corrupt(format!(
6642                "unknown index key tag: {other}"
6643            ))),
6644        }
6645    }
6646    /// Schema-driven dense value decode (`FILE_VERSION` 8). Caller has
6647    /// already cleared the NULL bit from the row bitmap; we read the
6648    /// fixed-width body for the given column type. Used inside the row
6649    /// hot loop; column defaults still go through `read_value` (which
6650    /// reads its own type tag) so DEFAULT round-trips without a schema.
6651    fn read_value_body(&mut self, ty: DataType) -> Result<Value, StorageError> {
6652        match ty {
6653            DataType::SmallInt => {
6654                let s = self.take(2)?;
6655                Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
6656            }
6657            DataType::Int => Ok(Value::Int(self.read_i32()?)),
6658            DataType::BigInt => Ok(Value::BigInt(self.read_i64()?)),
6659            DataType::Float => Ok(Value::Float(self.read_f64()?)),
6660            DataType::Bool => Ok(Value::Bool(self.read_u8()? != 0)),
6661            DataType::Text | DataType::Varchar(_) | DataType::Char(_) => {
6662                Ok(Value::Text(self.read_str()?))
6663            }
6664            DataType::Vector {
6665                encoding: VecEncoding::F32,
6666                ..
6667            } => {
6668                let dim = self.read_u32()? as usize;
6669                let mut v = Vec::with_capacity(dim);
6670                for _ in 0..dim {
6671                    let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
6672                    v.push(f32::from_le_bytes(bytes));
6673                }
6674                Ok(Value::Vector(v))
6675            }
6676            DataType::Vector {
6677                encoding: VecEncoding::Sq8,
6678                ..
6679            } => {
6680                let dim = self.read_u32()? as usize;
6681                let min = self.read_f32()?;
6682                let max = self.read_f32()?;
6683                let bytes = self.take(dim)?.to_vec();
6684                Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
6685            }
6686            DataType::Vector {
6687                encoding: VecEncoding::F16,
6688                ..
6689            } => {
6690                let dim = self.read_u32()? as usize;
6691                let bytes = self.take(dim * 2)?.to_vec();
6692                Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
6693            }
6694            DataType::Numeric { .. } => {
6695                let s = self.take(16)?;
6696                let arr: [u8; 16] = s.try_into().expect("checked");
6697                let scaled = i128::from_le_bytes(arr);
6698                let scale = self.read_u8()?;
6699                Ok(Value::Numeric { scaled, scale })
6700            }
6701            DataType::Date => Ok(Value::Date(self.read_i32()?)),
6702            DataType::Timestamp => Ok(Value::Timestamp(self.read_i64()?)),
6703            DataType::Timestamptz => Ok(Value::Timestamp(self.read_i64()?)),
6704            DataType::Jsonb => Ok(Value::Json(self.read_str()?)),
6705            DataType::Interval => {
6706                // Defensive — schema gate (CREATE TABLE rejects Interval
6707                // columns) means this branch can't be hit through normal
6708                // flow; reject corrupt files explicitly rather than
6709                // panic.
6710                Err(StorageError::Corrupt(
6711                    "INTERVAL column found on disk — runtime-only type, v3.0.2 rejects it".into(),
6712                ))
6713            }
6714            DataType::Json => Ok(Value::Json(self.read_str()?)),
6715            // v7.10.4: BYTEA on-disk is [u16 len][bytes]. Same wire
6716            // shape as Text, but read as raw Vec<u8>.
6717            DataType::Bytes => {
6718                let len = self.read_u16()? as usize;
6719                let bytes = self.take(len)?.to_vec();
6720                Ok(Value::Bytes(bytes))
6721            }
6722            // v7.10.9: TEXT[] dense body.
6723            DataType::TextArray => {
6724                let count = self.read_u16()? as usize;
6725                let mut items: Vec<Option<String>> = Vec::with_capacity(count);
6726                for _ in 0..count {
6727                    match self.read_u8()? {
6728                        0 => items.push(Some(self.read_str()?)),
6729                        1 => items.push(None),
6730                        other => {
6731                            return Err(StorageError::Corrupt(format!(
6732                                "TEXT[] null flag: unknown byte {other}"
6733                            )));
6734                        }
6735                    }
6736                }
6737                Ok(Value::TextArray(items))
6738            }
6739            // v7.11.12: INT[] dense body.
6740            DataType::IntArray => {
6741                let count = self.read_u16()? as usize;
6742                let mut items: Vec<Option<i32>> = Vec::with_capacity(count);
6743                for _ in 0..count {
6744                    match self.read_u8()? {
6745                        0 => items.push(Some(self.read_i32()?)),
6746                        1 => items.push(None),
6747                        other => {
6748                            return Err(StorageError::Corrupt(format!(
6749                                "INT[] null flag: unknown byte {other}"
6750                            )));
6751                        }
6752                    }
6753                }
6754                Ok(Value::IntArray(items))
6755            }
6756            // v7.11.12: BIGINT[] dense body.
6757            DataType::BigIntArray => {
6758                let count = self.read_u16()? as usize;
6759                let mut items: Vec<Option<i64>> = Vec::with_capacity(count);
6760                for _ in 0..count {
6761                    match self.read_u8()? {
6762                        0 => items.push(Some(self.read_i64()?)),
6763                        1 => items.push(None),
6764                        other => {
6765                            return Err(StorageError::Corrupt(format!(
6766                                "BIGINT[] null flag: unknown byte {other}"
6767                            )));
6768                        }
6769                    }
6770                }
6771                Ok(Value::BigIntArray(items))
6772            }
6773            // v7.12.0: tsvector dense body — [u16 lex_count]
6774            // [per lex: u16 word_len + utf-8 word + u16 pos_count
6775            // + (u16 LE * pos_count) + u8 weight].
6776            DataType::TsVector => Ok(Value::TsVector(self.read_tsvector_body()?)),
6777            DataType::TsQuery => Ok(Value::TsQuery(self.read_tsquery_body()?)),
6778        }
6779    }
6780
6781    /// v7.12.0 — read a tsvector body emitted by `write_tsvector_body`.
6782    fn read_tsvector_body(&mut self) -> Result<Vec<TsLexeme>, StorageError> {
6783        let count = self.read_u16()? as usize;
6784        let mut out = Vec::with_capacity(count);
6785        for _ in 0..count {
6786            let word = self.read_str()?;
6787            let pos_count = self.read_u16()? as usize;
6788            let mut positions = Vec::with_capacity(pos_count);
6789            for _ in 0..pos_count {
6790                positions.push(self.read_u16()?);
6791            }
6792            let weight = self.read_u8()?;
6793            out.push(TsLexeme {
6794                word,
6795                positions,
6796                weight,
6797            });
6798        }
6799        Ok(out)
6800    }
6801
6802    /// v7.12.0 — read a tsquery body emitted by `write_tsquery_body`.
6803    fn read_tsquery_body(&mut self) -> Result<TsQueryAst, StorageError> {
6804        let tag = self.read_u8()?;
6805        match tag {
6806            0 => {
6807                let word = self.read_str()?;
6808                let weight_mask = self.read_u8()?;
6809                Ok(TsQueryAst::Term { word, weight_mask })
6810            }
6811            1 => {
6812                let a = self.read_tsquery_body()?;
6813                let b = self.read_tsquery_body()?;
6814                Ok(TsQueryAst::And(Box::new(a), Box::new(b)))
6815            }
6816            2 => {
6817                let a = self.read_tsquery_body()?;
6818                let b = self.read_tsquery_body()?;
6819                Ok(TsQueryAst::Or(Box::new(a), Box::new(b)))
6820            }
6821            3 => {
6822                let x = self.read_tsquery_body()?;
6823                Ok(TsQueryAst::Not(Box::new(x)))
6824            }
6825            4 => {
6826                let distance = self.read_u16()?;
6827                let left = self.read_tsquery_body()?;
6828                let right = self.read_tsquery_body()?;
6829                Ok(TsQueryAst::Phrase {
6830                    left: Box::new(left),
6831                    right: Box::new(right),
6832                    distance,
6833                })
6834            }
6835            other => Err(StorageError::Corrupt(format!(
6836                "tsquery: unknown node tag {other}"
6837            ))),
6838        }
6839    }
6840
6841    fn read_value(&mut self) -> Result<Value, StorageError> {
6842        let tag = self.read_u8()?;
6843        match tag {
6844            0 => Ok(Value::Null),
6845            1 => Ok(Value::Int(self.read_i32()?)),
6846            2 => Ok(Value::BigInt(self.read_i64()?)),
6847            3 => Ok(Value::Float(self.read_f64()?)),
6848            4 => Ok(Value::Text(self.read_str()?)),
6849            5 => Ok(Value::Bool(self.read_u8()? != 0)),
6850            6 => {
6851                let dim = self.read_u32()? as usize;
6852                let mut v = Vec::with_capacity(dim);
6853                for _ in 0..dim {
6854                    let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
6855                    v.push(f32::from_le_bytes(bytes));
6856                }
6857                Ok(Value::Vector(v))
6858            }
6859            7 => {
6860                let s = self.take(2)?;
6861                Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
6862            }
6863            8 => {
6864                let s = self.take(16)?;
6865                let arr: [u8; 16] = s.try_into().expect("checked");
6866                let scaled = i128::from_le_bytes(arr);
6867                let scale = self.read_u8()?;
6868                Ok(Value::Numeric { scaled, scale })
6869            }
6870            9 => Ok(Value::Date(self.read_i32()?)),
6871            10 => Ok(Value::Timestamp(self.read_i64()?)),
6872            // v6.0.1: tag 11 — Sq8Vector. Pre-v6 readers fall
6873            // through to the catch-all and surface
6874            // `Corrupt("unknown value tag")`, matching the
6875            // forward-compat fence on the column-type side.
6876            11 => {
6877                let dim = self.read_u32()? as usize;
6878                let min = self.read_f32()?;
6879                let max = self.read_f32()?;
6880                let bytes = self.take(dim)?.to_vec();
6881                Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
6882            }
6883            // v6.0.3: tag 12 — HalfVector. Same forward-compat
6884            // fence story as tag 11.
6885            12 => {
6886                let dim = self.read_u32()? as usize;
6887                let bytes = self.take(dim * 2)?.to_vec();
6888                Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
6889            }
6890            // v7.10.4: tag 14 — BYTEA. [u16 len][bytes].
6891            14 => {
6892                let len = self.read_u16()? as usize;
6893                let bytes = self.take(len)?.to_vec();
6894                Ok(Value::Bytes(bytes))
6895            }
6896            // v7.10.9: tag 15 — TEXT[]. [u16 count][per elem: u8
6897            // null + (when non-null) u16 len + utf-8 bytes].
6898            15 => {
6899                let count = self.read_u16()? as usize;
6900                let mut items: Vec<Option<String>> = Vec::with_capacity(count);
6901                for _ in 0..count {
6902                    match self.read_u8()? {
6903                        0 => items.push(Some(self.read_str()?)),
6904                        1 => items.push(None),
6905                        other => {
6906                            return Err(StorageError::Corrupt(format!(
6907                                "TEXT[] null flag in value tag: unknown byte {other}"
6908                            )));
6909                        }
6910                    }
6911                }
6912                Ok(Value::TextArray(items))
6913            }
6914            // v7.11.12: tags 16/17 — INT[] / BIGINT[].
6915            16 => {
6916                let count = self.read_u16()? as usize;
6917                let mut items: Vec<Option<i32>> = Vec::with_capacity(count);
6918                for _ in 0..count {
6919                    match self.read_u8()? {
6920                        0 => items.push(Some(self.read_i32()?)),
6921                        1 => items.push(None),
6922                        other => {
6923                            return Err(StorageError::Corrupt(format!(
6924                                "INT[] null flag in value tag: unknown byte {other}"
6925                            )));
6926                        }
6927                    }
6928                }
6929                Ok(Value::IntArray(items))
6930            }
6931            17 => {
6932                let count = self.read_u16()? as usize;
6933                let mut items: Vec<Option<i64>> = Vec::with_capacity(count);
6934                for _ in 0..count {
6935                    match self.read_u8()? {
6936                        0 => items.push(Some(self.read_i64()?)),
6937                        1 => items.push(None),
6938                        other => {
6939                            return Err(StorageError::Corrupt(format!(
6940                                "BIGINT[] null flag in value tag: unknown byte {other}"
6941                            )));
6942                        }
6943                    }
6944                }
6945                Ok(Value::BigIntArray(items))
6946            }
6947            // v7.12.0: tag 18 — tsvector. Body matches the dense
6948            // form (`read_tsvector_body`).
6949            18 => Ok(Value::TsVector(self.read_tsvector_body()?)),
6950            // v7.12.0: tag 19 — tsquery.
6951            19 => Ok(Value::TsQuery(self.read_tsquery_body()?)),
6952            other => Err(StorageError::Corrupt(format!("unknown value tag: {other}"))),
6953        }
6954    }
6955
6956    /// Read an NSW graph that was emitted via `write_nsw_graph`. `m`
6957    /// is passed in because it was already consumed from the per-
6958    /// index header. Returns the reconstituted `NswGraph`.
6959    fn read_nsw_graph(&mut self, m: usize) -> Result<NswGraph, StorageError> {
6960        let m_max_0 = self.read_u16()? as usize;
6961        let entry_raw = self.read_u32()?;
6962        let entry = if entry_raw == u32::MAX {
6963            None
6964        } else {
6965            Some(entry_raw as usize)
6966        };
6967        let entry_level = self.read_u8()?;
6968        let node_count = self.read_u32()? as usize;
6969        // v5.5.0: levels/per-layer are PV-backed in memory, but the wire
6970        // format is unchanged — decode element-by-element into a PV via
6971        // push_mut (transient in-place, no per-element path-copy here since
6972        // the freshly-built PV is uniquely owned).
6973        let mut levels: PersistentVec<u8> = PersistentVec::new();
6974        for _ in 0..node_count {
6975            levels.push_mut(self.read_u8()?);
6976        }
6977        let layer_count = self.read_u8()? as usize;
6978        let mut layers: Vec<PersistentVec<Vec<u32>>> = Vec::with_capacity(layer_count);
6979        for _ in 0..layer_count {
6980            let n = self.read_u32()? as usize;
6981            let mut per_layer: PersistentVec<Vec<u32>> = PersistentVec::new();
6982            for _ in 0..n {
6983                let cnt = self.read_u16()? as usize;
6984                let mut row: Vec<u32> = Vec::with_capacity(cnt);
6985                for _ in 0..cnt {
6986                    row.push(self.read_u32()?);
6987                }
6988                per_layer.push_mut(row);
6989            }
6990            layers.push(per_layer);
6991        }
6992        Ok(NswGraph {
6993            m,
6994            m_max_0,
6995            entry,
6996            entry_level,
6997            levels,
6998            layers,
6999        })
7000    }
7001}
7002
7003#[cfg(test)]
7004mod tests {
7005    use super::*;
7006    use alloc::string::ToString;
7007    use alloc::vec;
7008
#[cfg(target_arch = "aarch64")]
#[test]
fn neon_l2_matches_scalar() {
    /// Step the LCG once and map 24 bits of the new state (bits 32..56)
    /// onto `[-1, 1)`.
    #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
    fn next_unit(state: &mut u64) -> f32 {
        *state = state
            .wrapping_mul(6_364_136_223_846_793_005)
            .wrapping_add(1);
        (((*state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0
    }

    // Every dim here is a multiple of 4. The NEON result must agree with
    // the scalar reference within a tight relative tolerance (FMA
    // rounding differs from a separate multiply and add).
    let dims = [4usize, 8, 12, 16, 64, 128, 256, 384, 512, 768, 1024, 1536];
    for d in dims.iter().copied() {
        // Per-dim seed so each dimension gets its own deterministic data.
        let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
        let mut a = Vec::with_capacity(d);
        let mut b = Vec::with_capacity(d);
        for _ in 0..d {
            // Samples alternate a, b, a, b, ... off the same stream.
            a.push(next_unit(&mut state));
            b.push(next_unit(&mut state));
        }

        let scalar = l2_distance_sq_scalar(&a, &b);
        // NOTE(review): presumably sound because this test only builds on
        // aarch64 and both slices have length `d` — confirm against the
        // safety contract of `l2_distance_sq_neon`.
        let neon = unsafe { l2_distance_sq_neon(&a, &b) };

        let diff = (scalar - neon).abs();
        let tol = (scalar.abs().max(1e-6)) * 1e-4;
        assert!(
            diff <= tol,
            "dim={d}: scalar={scalar} neon={neon} diff={}",
            diff
        );
    }
}
7045
#[cfg(target_arch = "aarch64")]
#[test]
fn neon_inner_product_matches_scalar() {
    /// Step the LCG once and map 24 bits of the new state (bits 32..56)
    /// onto `[-1, 1)`.
    #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
    fn next_unit(state: &mut u64) -> f32 {
        *state = state
            .wrapping_mul(6_364_136_223_846_793_005)
            .wrapping_add(1);
        (((*state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0
    }

    // v6.0.2 step 1: the NEON inner product must agree with the scalar
    // one for each listed dim. FMA rounding differs from a separate
    // multiply and add, so the tolerance has a relative part plus an
    // absolute part that grows with the dimension.
    let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
    for d in dims.iter().copied() {
        let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
        let mut a = Vec::with_capacity(d);
        let mut b = Vec::with_capacity(d);
        for _ in 0..d {
            // Samples alternate a, b, a, b, ... off the same stream.
            a.push(next_unit(&mut state));
            b.push(next_unit(&mut state));
        }

        let scalar = inner_product_scalar(&a, &b);
        // NOTE(review): presumably sound because this test only builds on
        // aarch64 and both slices have length `d` — confirm against the
        // safety contract of `inner_product_neon`.
        let neon = unsafe { inner_product_neon(&a, &b) };

        let diff = (scalar - neon).abs();
        #[allow(clippy::cast_precision_loss)]
        let tol = (scalar.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
        assert!(
            diff <= tol,
            "IP dim={d}: scalar={scalar} neon={neon} diff={}",
            diff
        );
    }
}
7082
#[cfg(target_arch = "aarch64")]
#[allow(clippy::similar_names)]
#[test]
fn neon_cosine_dot_norms_matches_scalar() {
    // The NEON (dot, norm-a, norm-b) triple must agree with the scalar
    // triple for each listed dim. Each component is checked against a
    // tolerance derived from ITS OWN scalar value (relative part) plus an
    // absolute part that grows with the dimension.
    let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
    for &d in &dims {
        let mut state: u64 = (d as u64).wrapping_mul(0xBF58_476D_1CE4_E5B9);
        let mut a = Vec::with_capacity(d);
        let mut b = Vec::with_capacity(d);
        for _ in 0..d {
            // LCG step, then map 24 bits of the state onto [-1, 1).
            state = state
                .wrapping_mul(6_364_136_223_846_793_005)
                .wrapping_add(1);
            #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
            let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
            state = state
                .wrapping_mul(6_364_136_223_846_793_005)
                .wrapping_add(1);
            #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
            let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
            a.push(x);
            b.push(y);
        }
        let (dot_s, na_s, nb_s) = cosine_dot_norms_scalar(&a, &b);
        // NOTE(review): presumably sound because this test only builds on
        // aarch64 and both slices have length `d` — confirm against the
        // safety contract of `cosine_dot_norms_neon`.
        let (dot_n, na_n, nb_n) = unsafe { cosine_dot_norms_neon(&a, &b) };
        #[allow(clippy::cast_precision_loss)]
        let abs_floor = (d as f32) * 1e-6;
        let tol_d = (dot_s.abs().max(1e-6)) * 1e-4 + abs_floor;
        let tol_na = (na_s.abs().max(1e-6)) * 1e-4 + abs_floor;
        // Previously the `nb` check reused the tolerance computed from
        // `na_s`; it now scales with `nb_s`, the value actually compared.
        let tol_nb = (nb_s.abs().max(1e-6)) * 1e-4 + abs_floor;
        assert!(
            (dot_s - dot_n).abs() <= tol_d,
            "cosine dot dim={d}: scalar={dot_s} neon={dot_n}"
        );
        assert!(
            (na_s - na_n).abs() <= tol_na,
            "cosine na dim={d}: scalar={na_s} neon={na_n}"
        );
        assert!(
            (nb_s - nb_n).abs() <= tol_nb,
            "cosine nb dim={d}: scalar={nb_s} neon={nb_n}"
        );
    }
}
7126
7127    fn make_users_schema() -> TableSchema {
7128        TableSchema::new(
7129            "users",
7130            vec![
7131                ColumnSchema::new("id", DataType::Int, false),
7132                ColumnSchema::new("name", DataType::Text, false),
7133                ColumnSchema::new("score", DataType::Float, true),
7134            ],
7135        )
7136    }
7137
#[test]
fn value_type_tag_matches_variant() {
    // Each non-null scalar variant reports the matching `DataType`.
    let cases = vec![
        (Value::Int(1), DataType::Int),
        (Value::BigInt(1), DataType::BigInt),
        (Value::Float(1.0), DataType::Float),
        (Value::Text("x".into()), DataType::Text),
        (Value::Bool(true), DataType::Bool),
    ];
    for (value, expected) in cases {
        assert_eq!(value.data_type(), Some(expected));
    }

    // NULL carries no type tag and is the only value that `is_null`.
    assert_eq!(Value::Null.data_type(), None);
    assert!(Value::Null.is_null());
    assert!(!Value::Int(0).is_null());
}
7149
#[test]
fn sq8_value_reports_sq8_data_type() {
    // v6.0.1: a `Value::Sq8Vector` cell built from five floats must
    // report `Vector { dim: 5, encoding: Sq8 }` through `data_type()`,
    // i.e. both the shape and the encoding are visible to callers.
    let expected = DataType::Vector {
        dim: 5,
        encoding: VecEncoding::Sq8,
    };
    let cell = Value::Sq8Vector(crate::quantize::quantize(&[0.0, 0.25, 0.5, 0.75, 1.0]));
    assert_eq!(cell.data_type(), Some(expected),);
}
7166
#[test]
fn datatype_display_matches_pg_keyword() {
    // `Display` for each scalar type renders the upper-case SQL keyword.
    let cases = vec![
        (DataType::Int, "INT"),
        (DataType::BigInt, "BIGINT"),
        (DataType::Float, "FLOAT"),
        (DataType::Text, "TEXT"),
        (DataType::Bool, "BOOL"),
    ];
    for (ty, keyword) in cases {
        assert_eq!(ty.to_string(), keyword);
    }
}
7175
#[test]
fn row_len_and_emptiness() {
    // A two-value row has length 2 and is not empty.
    let populated = Row::new(vec![Value::Int(1), Value::Null]);
    assert_eq!(populated.len(), 2);
    assert!(!populated.is_empty());

    // A row built from an empty vector reports itself as empty.
    let blank = Row::new(Vec::new());
    assert!(blank.is_empty());
}
7183
#[test]
fn table_schema_column_position() {
    // Known columns resolve to their zero-based index; unknown names
    // resolve to `None`.
    let schema = make_users_schema();
    let cases = vec![("id", Some(0)), ("score", Some(2)), ("missing", None)];
    for (name, expected) in cases {
        assert_eq!(schema.column_position(name), expected);
    }
}
7191
#[test]
fn catalog_create_table_then_lookup() {
    // After one successful CREATE the catalog holds exactly that table.
    let mut catalog = Catalog::new();
    let schema = make_users_schema();
    catalog.create_table(schema).unwrap();

    assert_eq!(catalog.table_count(), 1);
    let found = catalog.get("users");
    assert!(found.is_some());
    let absent = catalog.get("nope");
    assert!(absent.is_none());
}
7200
#[test]
fn catalog_duplicate_table_is_rejected() {
    // Creating `users` twice must fail with `DuplicateTable` naming it.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();

    let second_attempt = catalog.create_table(make_users_schema());
    let err = second_attempt.unwrap_err();
    assert!(matches!(err, StorageError::DuplicateTable { ref name } if name == "users"));
}
7208
#[test]
fn table_insert_happy_path_appends_row() {
    // A row matching the schema is appended and readable afterwards.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();

    let alice = Row::new(vec![
        Value::Int(1),
        Value::Text("alice".into()),
        Value::Float(99.5),
    ]);
    users.insert(alice).unwrap();

    assert_eq!(users.row_count(), 1);
    let stored_name = &users.rows()[0].values[1];
    assert_eq!(*stored_name, Value::Text("alice".into()));
}
7223
#[test]
fn table_insert_arity_mismatch() {
    // One value against a three-column schema is rejected with the
    // expected/actual counts, and nothing is stored.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();

    let too_short = Row::new(vec![Value::Int(1)]);
    let err = users.insert(too_short).unwrap_err();
    assert!(matches!(
        err,
        StorageError::ArityMismatch {
            expected: 3,
            actual: 1
        }
    ));
    assert_eq!(users.row_count(), 0);
}
7239
#[test]
fn table_insert_type_mismatch_reports_column() {
    // An INT in the TEXT `name` slot is rejected; the error identifies
    // the column, both types, and the position, and nothing is stored.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();

    let bad_row = Row::new(vec![
        Value::Int(1),
        Value::Int(42), // name expects Text
        Value::Float(0.0),
    ]);
    let err = users.insert(bad_row).unwrap_err();

    // Pull the payload out first, then assert field by field.
    let (column, expected, actual, position) = match err {
        StorageError::TypeMismatch {
            column,
            expected,
            actual,
            position,
        } => (column, expected, actual, position),
        other => panic!("unexpected: {other:?}"),
    };
    assert_eq!(&column, "name");
    assert_eq!(expected, DataType::Text);
    assert_eq!(actual, DataType::Int);
    assert_eq!(position, 1);

    assert_eq!(users.row_count(), 0);
}
7268
#[test]
fn table_insert_null_into_not_null_rejected() {
    // NULL in the non-nullable `name` column yields `NullInNotNull`
    // naming that column.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();

    let row_with_null_name = Row::new(vec![
        Value::Int(1),
        Value::Null, // name is NOT NULL
        Value::Float(1.0),
    ]);
    let err = users.insert(row_with_null_name).unwrap_err();
    assert!(matches!(err, StorageError::NullInNotNull { ref column } if column == "name"));
}
7283
#[test]
fn table_insert_null_into_nullable_ok() {
    // NULL in the nullable `score` column is accepted.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();

    let bob = Row::new(vec![
        Value::Int(1),
        Value::Text("bob".into()),
        Value::Null,
    ]);
    users.insert(bob).unwrap();
    assert_eq!(users.row_count(), 1);
}
7297
#[test]
fn catalog_get_mut_independent_per_table() {
    // Two tables with identical single-column schemas; inserting into
    // `a` must leave `b` untouched.
    let mut catalog = Catalog::new();

    let schema_a = TableSchema::new("a", vec![ColumnSchema::new("v", DataType::Int, false)]);
    catalog.create_table(schema_a).unwrap();
    let schema_b = TableSchema::new("b", vec![ColumnSchema::new("v", DataType::Int, false)]);
    catalog.create_table(schema_b).unwrap();

    let table_a = catalog.get_mut("a").unwrap();
    table_a.insert(Row::new(vec![Value::Int(1)])).unwrap();

    assert_eq!(catalog.get("a").unwrap().row_count(), 1);
    assert_eq!(catalog.get("b").unwrap().row_count(), 0);
}
7318
7319    // --- v0.6 persistence round-trips --------------------------------------
7320
7321    fn assert_round_trip(cat: &Catalog) {
7322        let bytes = cat.serialize();
7323        let restored = Catalog::deserialize(&bytes).expect("deserialize");
7324        // Compare semantic state: same tables in same order, same schema +
7325        // rows in each.
7326        assert_eq!(restored.table_count(), cat.table_count());
7327        for (a, b) in cat.tables.iter().zip(restored.tables.iter()) {
7328            assert_eq!(a.schema, b.schema);
7329            assert_eq!(a.rows, b.rows);
7330        }
7331    }
7332
#[test]
fn serialize_empty_catalog_round_trips() {
    // A catalog with no tables must survive serialize → deserialize.
    let empty = Catalog::new();
    assert_round_trip(&empty);
}
7337
#[test]
fn serialize_single_empty_table_round_trips() {
    // One table, zero rows: the schema alone must round-trip.
    let mut catalog = Catalog::new();
    let schema = make_users_schema();
    catalog.create_table(schema).unwrap();
    assert_round_trip(&catalog);
}
7344
#[test]
fn nsw_clone_is_o1() {
    // v5.5.0: `NswGraph::clone` is expected to share structure instead of
    // copying elements. The test builds a 1500-row vector table with an
    // NSW index, clones the graph, and checks via `shares_storage_with`
    // that `levels` and every `layers[l]` of the clone share storage with
    // the original. Only the outer `layers` Vec itself is copied.
    let schema = TableSchema::new(
        "docs",
        vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim: 3,
                    encoding: VecEncoding::F32,
                },
                true,
            ),
        ],
    );
    let mut cat = Catalog::new();
    cat.create_table(schema).unwrap();

    let docs = cat.get_mut("docs").unwrap();
    for i in 0..1500_i32 {
        #[allow(clippy::cast_precision_loss)] // 0..1500 — no precision lost
        let base = (i as f32) * 0.01;
        let embedding = vec![base, base + 0.05, base + 0.1];
        let row = Row::new(vec![Value::Int(i), Value::Vector(embedding)]);
        docs.insert(row).unwrap();
    }
    docs.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
        .unwrap();

    // The only index on the table must be the NSW one just added. The
    // other variants are listed explicitly rather than with a wildcard.
    let first_index = &cat.get("docs").unwrap().indices()[0];
    let g = match &first_index.kind {
        IndexKind::Nsw(g) => g,
        IndexKind::BTree(_)
        | IndexKind::Brin { .. }
        | IndexKind::Gin(_)
        | IndexKind::GinTrgm(_) => {
            panic!("expected NSW")
        }
    };

    // Sanity-check that the graph is non-trivial before testing sharing.
    assert_eq!(g.levels.len(), 1500, "one level slot per inserted row");
    assert!(
        g.layers.len() >= 2,
        "1500 nodes should populate at least two HNSW layers, got {}",
        g.layers.len()
    );

    let cloned = g.clone();

    assert!(
        g.levels.shares_storage_with(&cloned.levels),
        "levels PV not shared after clone — clone copied elements (O(N))"
    );
    assert_eq!(g.layers.len(), cloned.layers.len());
    // Lengths were just asserted equal, so indexing the clone is in range.
    for (l, orig) in g.layers.iter().enumerate() {
        let cl = &cloned.layers[l];
        assert!(
            orig.shares_storage_with(cl),
            "layer {l} PV not shared after clone — clone copied elements (O(N))"
        );
    }
}
7415
#[test]
fn sq8_catalog_serialise_roundtrip_preserves_cells_and_index() {
    // v6.0.1 step 6 verify: a catalog holding a `VECTOR(8) USING SQ8`
    // column plus an NSW index goes through serialise → deserialise.
    // Afterwards the column type, a sample cell, and the top-5 kNN hits
    // for a fixed query must all equal their pre-serialise values.
    let schema = TableSchema::new(
        "vecs",
        vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim: 8,
                    encoding: VecEncoding::Sq8,
                },
                false,
            ),
        ],
    );
    let mut cat = Catalog::new();
    cat.create_table(schema).unwrap();

    let vecs = cat.get_mut("vecs").unwrap();
    for i in 0..32_i32 {
        #[allow(clippy::cast_precision_loss)]
        let base = (i as f32) * 0.03;
        // Row i is a ramp: base, base + 0.01, ..., base + 0.07.
        let mut v: Vec<f32> = Vec::with_capacity(8);
        for j in 0..8_i32 {
            #[allow(clippy::cast_precision_loss)]
            let off = (j as f32) * 0.01;
            v.push(base + off);
        }
        let cell = Value::Sq8Vector(quantize::quantize(&v));
        vecs.insert(Row::new(vec![Value::Int(i), cell])).unwrap();
    }
    vecs.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();

    // Snapshot a reference cell, the column type, and the kNN hits
    // before the catalog is serialised.
    let query = vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
    let source = cat.get("vecs").unwrap();
    let before_cell = source.rows()[5].values[1].clone();
    let before_ty = source.schema().columns[1].ty;
    let before_hits = nsw_query(source, "v_idx", &query, 5, NswMetric::L2);

    let encoded = cat.serialize();
    let restored = Catalog::deserialize(&encoded).expect("deserialize ok");
    let rt = restored.get("vecs").unwrap();

    assert_eq!(rt.schema().columns[1].ty, before_ty);
    assert_eq!(rt.rows()[5].values[1], before_cell);
    let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
    assert_eq!(before_hits, after_hits);
}
7478
#[test]
fn half_catalog_serialise_roundtrip_preserves_cells_and_index() {
    // v6.0.3 step 4 verify: a catalog holding a `VECTOR(8) USING HALF`
    // column plus an NSW index goes through serialise → deserialise.
    // Afterwards the column type, a sample cell, and the top-5 kNN hits
    // for a fixed query must all equal their pre-serialise values.
    use crate::halfvec;

    let schema = TableSchema::new(
        "vecs",
        vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim: 8,
                    encoding: VecEncoding::F16,
                },
                false,
            ),
        ],
    );
    let mut cat = Catalog::new();
    cat.create_table(schema).unwrap();

    let vecs = cat.get_mut("vecs").unwrap();
    for i in 0..32_i32 {
        #[allow(clippy::cast_precision_loss)]
        let base = (i as f32) * 0.03;
        // Row i is a ramp: base, base + 0.01, ..., base + 0.07.
        let mut v: Vec<f32> = Vec::with_capacity(8);
        for j in 0..8_i32 {
            #[allow(clippy::cast_precision_loss)]
            let off = (j as f32) * 0.01;
            v.push(base + off);
        }
        let cell = Value::HalfVector(halfvec::HalfVector::from_f32_slice(&v));
        vecs.insert(Row::new(vec![Value::Int(i), cell])).unwrap();
    }
    vecs.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();

    // Snapshot a reference cell, the column type, and the kNN hits
    // before the catalog is serialised.
    let query = vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
    let source = cat.get("vecs").unwrap();
    let before_cell = source.rows()[5].values[1].clone();
    let before_ty = source.schema().columns[1].ty;
    let before_hits = nsw_query(source, "v_idx", &query, 5, NswMetric::L2);

    let encoded = cat.serialize();
    let restored = Catalog::deserialize(&encoded).expect("deserialize ok");
    let rt = restored.get("vecs").unwrap();

    assert_eq!(rt.schema().columns[1].ty, before_ty);
    assert_eq!(rt.rows()[5].values[1], before_cell);
    let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
    assert_eq!(before_hits, after_hits);
}
7539
#[test]
#[allow(clippy::similar_names)]
fn hnsw_half_recall_at_10_matches_f32_groundtruth() {
    // v6.0.3 step 3 verify: NSW search over a HALF column must reach
    // ≥ 95% top-10 overlap with a brute-force scan over the original f32
    // vectors. The corpus (512 × 32) and the 32 queries come from one
    // deterministic pseudo-random stream.
    use crate::halfvec;

    /// Advance the generator state and map its high 32 bits to [-1, 1].
    fn next_unit(state: &mut u64) -> f32 {
        let stepped = state
            .wrapping_add(0x9E37_79B9_7F4A_7C15)
            .wrapping_mul(0xBF58_476D_1CE4_E5B9);
        *state = stepped;
        #[allow(clippy::cast_precision_loss)]
        let u = ((stepped >> 32) as u32 as f32) / (u32::MAX as f32);
        2.0 * u - 1.0
    }

    /// Draw `count` vectors of `width` samples each, in row-major order.
    fn draw_vectors(count: usize, width: usize, state: &mut u64) -> Vec<Vec<f32>> {
        let mut out = Vec::with_capacity(count);
        for _ in 0..count {
            let mut v = Vec::with_capacity(width);
            for _ in 0..width {
                v.push(next_unit(state));
            }
            out.push(v);
        }
        out
    }

    let dim: u32 = 32;
    let n: usize = 512;
    let dim_us = dim as usize;
    let mut seed: u64 = 0xF16_F16_F16_F16_u64;
    // Corpus first, then queries, off the same stream.
    let corpus = draw_vectors(n, dim_us, &mut seed);
    let queries = draw_vectors(32, dim_us, &mut seed);

    // Ground truth: ten nearest corpus indices per query by squared L2,
    // using a stable sort so ties keep corpus order.
    let mut exact_top10: Vec<Vec<usize>> = Vec::with_capacity(queries.len());
    for q in &queries {
        let mut scored: Vec<(f32, usize)> = Vec::with_capacity(corpus.len());
        for (i, v) in corpus.iter().enumerate() {
            scored.push((l2_distance_sq(v, q), i));
        }
        scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
        scored.truncate(10);
        exact_top10.push(scored.into_iter().map(|(_, i)| i).collect());
    }

    // HALF catalog: every corpus vector is stored as a `HalfVector` cell.
    let schema = TableSchema::new(
        "vecs",
        vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim,
                    encoding: VecEncoding::F16,
                },
                false,
            ),
        ],
    );
    let mut cat = Catalog::new();
    cat.create_table(schema).unwrap();
    let vecs = cat.get_mut("vecs").unwrap();
    for (i, v) in corpus.iter().enumerate() {
        let id = Value::Int(i32::try_from(i).unwrap());
        let cell = Value::HalfVector(halfvec::HalfVector::from_f32_slice(v));
        vecs.insert(Row::new(vec![id, cell])).unwrap();
    }
    vecs.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();

    // Count index hits that also appear in the exact top-10.
    let table = cat.get("vecs").unwrap();
    let mut total_overlap = 0_usize;
    for (q, exact) in queries.iter().zip(exact_top10.iter()) {
        let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
        for h in &hits {
            total_overlap += usize::from(exact.contains(h));
        }
    }
    #[allow(clippy::cast_precision_loss)]
    let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
    assert!(
        recall >= 0.95,
        "HALF HNSW recall@10 = {recall:.3}, below floor 0.95 — \
         check halfvec dispatch in `cell_to_query_metric_distance`"
    );
}
7623
#[test]
fn hnsw_sq8_recall_at_10_above_0_95_vs_f32_groundtruth() {
    // v6.0.1 step 5 verify: NSW search over an SQ8 column must reach
    // ≥ 95% top-10 overlap with a brute-force scan over the original f32
    // vectors. The corpus (512 × 32) and the 32 queries come from one
    // deterministic pseudo-random stream; only the SQ8 catalog is built
    // here, the f32 side is the in-memory corpus itself.
    use crate::quantize;

    /// Advance the generator state and map its high 32 bits to [-1, 1].
    fn next_unit(state: &mut u64) -> f32 {
        let stepped = state
            .wrapping_add(0x9E37_79B9_7F4A_7C15)
            .wrapping_mul(0xBF58_476D_1CE4_E5B9);
        *state = stepped;
        #[allow(clippy::cast_precision_loss)]
        let u = ((stepped >> 32) as u32 as f32) / (u32::MAX as f32);
        2.0 * u - 1.0
    }

    /// Draw `count` vectors of `width` samples each, in row-major order.
    fn draw_vectors(count: usize, width: usize, state: &mut u64) -> Vec<Vec<f32>> {
        let mut out = Vec::with_capacity(count);
        for _ in 0..count {
            let mut v = Vec::with_capacity(width);
            for _ in 0..width {
                v.push(next_unit(state));
            }
            out.push(v);
        }
        out
    }

    let dim: u32 = 32;
    let n: usize = 512;
    let dim_us = dim as usize;
    let mut seed: u64 = 0xCAFE_BABE_DEAD_BEEFu64;
    // Corpus first, then queries, off the same stream.
    let corpus = draw_vectors(n, dim_us, &mut seed);
    let queries = draw_vectors(32, dim_us, &mut seed);

    // F32 ground truth: ten nearest corpus indices per query by squared
    // L2, using a stable sort so ties keep corpus order.
    let mut exact_top10: Vec<Vec<usize>> = Vec::with_capacity(queries.len());
    for q in &queries {
        let mut scored: Vec<(f32, usize)> = Vec::with_capacity(corpus.len());
        for (i, v) in corpus.iter().enumerate() {
            scored.push((l2_distance_sq(v, q), i));
        }
        scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
        scored.truncate(10);
        exact_top10.push(scored.into_iter().map(|(_, i)| i).collect());
    }

    // SQ8 catalog: every corpus vector is quantized into an `Sq8Vector`
    // cell before insertion.
    let schema = TableSchema::new(
        "vecs",
        vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim,
                    encoding: VecEncoding::Sq8,
                },
                false,
            ),
        ],
    );
    let mut cat = Catalog::new();
    cat.create_table(schema).unwrap();
    let vecs = cat.get_mut("vecs").unwrap();
    for (i, v) in corpus.iter().enumerate() {
        let id = Value::Int(i32::try_from(i).unwrap());
        let cell = Value::Sq8Vector(quantize::quantize(v));
        vecs.insert(Row::new(vec![id, cell])).unwrap();
    }
    vecs.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();

    // Count index hits that also appear in the exact top-10.
    let table = cat.get("vecs").unwrap();
    let mut total_overlap = 0_usize;
    for (q, exact) in queries.iter().zip(exact_top10.iter()) {
        let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
        for h in &hits {
            total_overlap += usize::from(exact.contains(h));
        }
    }
    #[allow(clippy::cast_precision_loss)]
    let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
    assert!(
        recall >= 0.95,
        "SQ8 HNSW recall@10 = {recall:.3}, below floor 0.95 — \
         check `sq8_rerank` is wired in `nsw_search` for SQ8 columns"
    );
}
7712
#[test]
fn nsw_index_topology_persists_through_round_trip() {
    // Build a six-row vector table with an NSW index, clone the graph,
    // run the catalog through serialize → deserialize, and compare every
    // graph field (m, m_max_0, entry, entry_level, levels, layers) with
    // the restored one. The point of v2.7 is that startup skips the
    // rebuild, so the topology has to survive the disk hop.

    // Clone the NSW graph behind the first index of `docs`. The other
    // index kinds are listed explicitly rather than with a wildcard.
    let graph_of = |catalog: &Catalog| match &catalog.get("docs").unwrap().indices()[0].kind {
        IndexKind::Nsw(g) => g.clone(),
        IndexKind::BTree(_)
        | IndexKind::Brin { .. }
        | IndexKind::Gin(_)
        | IndexKind::GinTrgm(_) => {
            panic!("expected NSW")
        }
    };

    let schema = TableSchema::new(
        "docs",
        vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim: 3,
                    encoding: VecEncoding::F32,
                },
                true,
            ),
        ],
    );
    let mut cat = Catalog::new();
    cat.create_table(schema).unwrap();

    let docs = cat.get_mut("docs").unwrap();
    for i in 0..6_i32 {
        #[allow(clippy::cast_precision_loss)] // 0..6 — no precision lost
        let base = (i as f32) * 0.1;
        let embedding = vec![base, base + 0.05, base + 0.1];
        docs.insert(Row::new(vec![Value::Int(i), Value::Vector(embedding)]))
            .unwrap();
    }
    docs.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
        .unwrap();

    let original = graph_of(&cat);
    let encoded = cat.serialize();
    let restored = Catalog::deserialize(&encoded).expect("deserialize");
    let restored_graph = graph_of(&restored);

    assert_eq!(restored_graph.m, original.m);
    assert_eq!(restored_graph.m_max_0, original.m_max_0);
    assert_eq!(restored_graph.entry, original.entry);
    assert_eq!(restored_graph.entry_level, original.entry_level);
    assert_eq!(restored_graph.levels, original.levels);
    assert_eq!(restored_graph.layers, original.layers);
}
7775
7776    #[test]
7777    fn hnsw_level_assignment_is_deterministic() {
7778        // Same row index always produces the same level — the topology
7779        // must be reproducible (matters for serialize round-trip).
7780        for i in 0..32usize {
7781            assert_eq!(nsw_assign_level(i), nsw_assign_level(i));
7782        }
7783    }
7784
7785    #[test]
7786    fn hnsw_layer_0_dominates_population() {
7787        // Sanity: out of N inserts, the vast majority should land on
7788        // layer 0. The 4-bit-clear promotion rule gives roughly 1/16
7789        // promotion to layer ≥ 1, so under 50 nodes we expect ~3 on
7790        // layer ≥ 1 and the rest on layer 0.
7791        let on_zero = (0..200usize).filter(|&i| nsw_assign_level(i) == 0).count();
7792        assert!(on_zero > 150, "level-0 nodes too few: {on_zero}");
7793    }
7794
7795    #[test]
7796    fn hnsw_search_matches_brute_force_for_l2_top1() {
7797        // Build a small dataset, query it, and confirm the top result
7798        // matches the brute-force nearest by L2. Topology variability
7799        // shouldn't break recall at k=1 for well-separated vectors.
7800        let mut cat = Catalog::new();
7801        cat.create_table(TableSchema::new(
7802            "vecs",
7803            alloc::vec![
7804                ColumnSchema::new("id", DataType::Int, false),
7805                ColumnSchema::new(
7806                    "v",
7807                    DataType::Vector {
7808                        dim: 3,
7809                        encoding: VecEncoding::F32
7810                    },
7811                    true
7812                ),
7813            ],
7814        ))
7815        .unwrap();
7816        let t = cat.get_mut("vecs").unwrap();
7817        let dataset: alloc::vec::Vec<(i32, [f32; 3])> = alloc::vec![
7818            (1, [0.0, 0.0, 0.0]),
7819            (2, [1.0, 0.0, 0.0]),
7820            (3, [0.0, 1.0, 0.0]),
7821            (4, [0.0, 0.0, 1.0]),
7822            (5, [1.0, 1.0, 0.0]),
7823            (6, [1.0, 0.0, 1.0]),
7824            (7, [0.0, 1.0, 1.0]),
7825            (8, [1.0, 1.0, 1.0]),
7826            (9, [0.5, 0.5, 0.5]),
7827            (10, [0.2, 0.8, 0.5]),
7828        ];
7829        for &(id, v) in &dataset {
7830            t.insert(Row::new(alloc::vec![
7831                Value::Int(id),
7832                Value::Vector(alloc::vec![v[0], v[1], v[2]]),
7833            ]))
7834            .unwrap();
7835        }
7836        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
7837        let idx_pos = cat
7838            .get("vecs")
7839            .unwrap()
7840            .indices()
7841            .iter()
7842            .position(|i| i.name == "v_idx")
7843            .unwrap();
7844        for query in [[0.4, 0.4, 0.4], [0.9, 0.1, 0.0], [0.0, 0.9, 0.9]] {
7845            let table = cat.get("vecs").unwrap();
7846            let hnsw_top = nsw_search(table, idx_pos, &query, 1, 16, NswMetric::L2);
7847            let mut brute: alloc::vec::Vec<(f32, usize)> = (0..table.rows.len())
7848                .map(|i| {
7849                    let Value::Vector(v) = &table.rows[i].values[1] else {
7850                        return (f32::INFINITY, i);
7851                    };
7852                    (l2_distance_sq(v, &query), i)
7853                })
7854                .collect();
7855            brute.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
7856            assert!(!hnsw_top.is_empty(), "HNSW returned no results");
7857            assert_eq!(
7858                hnsw_top[0].1, brute[0].1,
7859                "HNSW top-1 != brute-force top-1 for {query:?}"
7860            );
7861        }
7862    }
7863
7864    #[test]
7865    fn serialize_table_with_rows_round_trips() {
7866        let mut cat = Catalog::new();
7867        cat.create_table(make_users_schema()).unwrap();
7868        let t = cat.get_mut("users").unwrap();
7869        t.insert(Row::new(vec![
7870            Value::Int(1),
7871            Value::Text("alice".into()),
7872            Value::Float(95.5),
7873        ]))
7874        .unwrap();
7875        t.insert(Row::new(vec![
7876            Value::Int(2),
7877            Value::Text("bob".into()),
7878            Value::Null,
7879        ]))
7880        .unwrap();
7881        assert_round_trip(&cat);
7882    }
7883
7884    #[test]
7885    fn serialize_multiple_tables_round_trips() {
7886        let mut cat = Catalog::new();
7887        cat.create_table(make_users_schema()).unwrap();
7888        cat.create_table(TableSchema::new(
7889            "flags",
7890            vec![
7891                ColumnSchema::new("id", DataType::BigInt, false),
7892                ColumnSchema::new("active", DataType::Bool, false),
7893            ],
7894        ))
7895        .unwrap();
7896        cat.get_mut("flags")
7897            .unwrap()
7898            .insert(Row::new(vec![Value::BigInt(7), Value::Bool(true)]))
7899            .unwrap();
7900        assert_round_trip(&cat);
7901    }
7902
7903    #[test]
7904    fn deserialize_rejects_bad_magic() {
7905        let mut buf = b"BADMAGIC".to_vec();
7906        buf.push(FILE_VERSION);
7907        buf.extend_from_slice(&0u32.to_le_bytes());
7908        let err = Catalog::deserialize(&buf).unwrap_err();
7909        assert!(matches!(err, StorageError::Corrupt(_)));
7910    }
7911
7912    #[test]
7913    fn deserialize_rejects_unsupported_version() {
7914        let mut buf = FILE_MAGIC.to_vec();
7915        buf.push(99); // future version
7916        buf.extend_from_slice(&0u32.to_le_bytes());
7917        let err = Catalog::deserialize(&buf).unwrap_err();
7918        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("version")));
7919    }
7920
7921    #[test]
7922    fn deserialize_rejects_truncated_file() {
7923        let mut cat = Catalog::new();
7924        cat.create_table(make_users_schema()).unwrap();
7925        let bytes = cat.serialize();
7926        // Drop the last byte to simulate truncation.
7927        let truncated = &bytes[..bytes.len() - 1];
7928        assert!(matches!(
7929            Catalog::deserialize(truncated),
7930            Err(StorageError::Corrupt(_))
7931        ));
7932    }
7933
7934    #[test]
7935    fn deserialize_rejects_trailing_garbage() {
7936        let cat = Catalog::new();
7937        let mut bytes = cat.serialize();
7938        bytes.push(0xFF);
7939        assert!(matches!(
7940            Catalog::deserialize(&bytes),
7941            Err(StorageError::Corrupt(ref s)) if s.contains("trailing")
7942        ));
7943    }
7944
7945    // --- v0.8 indices ------------------------------------------------------
7946
7947    fn populated_users() -> Catalog {
7948        let mut cat = Catalog::new();
7949        cat.create_table(make_users_schema()).unwrap();
7950        let t = cat.get_mut("users").unwrap();
7951        for (id, name, score) in [
7952            (1, "alice", Some(90.0)),
7953            (2, "bob", None),
7954            (3, "alice", Some(70.0)), // duplicate name → maps to two row idxs
7955        ] {
7956            t.insert(Row::new(vec![
7957                Value::Int(id),
7958                Value::Text(name.into()),
7959                score.map_or(Value::Null, Value::Float),
7960            ]))
7961            .unwrap();
7962        }
7963        cat
7964    }
7965
7966    #[test]
7967    fn add_index_builds_from_existing_rows() {
7968        let mut cat = populated_users();
7969        cat.get_mut("users")
7970            .unwrap()
7971            .add_index("by_id".into(), "id")
7972            .unwrap();
7973        let t = cat.get("users").unwrap();
7974        let idx = t.index_on(0).expect("index_on(0)");
7975        assert_eq!(idx.lookup_eq(&IndexKey::Int(2)), &[RowLocator::Hot(1)]);
7976        assert_eq!(idx.lookup_eq(&IndexKey::Int(99)), &[] as &[RowLocator]);
7977    }
7978
7979    #[test]
7980    fn add_index_dup_name_rejected() {
7981        let mut cat = populated_users();
7982        let t = cat.get_mut("users").unwrap();
7983        t.add_index("ix".into(), "id").unwrap();
7984        let err = t.add_index("ix".into(), "name").unwrap_err();
7985        assert!(matches!(err, StorageError::DuplicateIndex { ref name } if name == "ix"));
7986    }
7987
7988    #[test]
7989    fn add_index_unknown_column_rejected() {
7990        let mut cat = populated_users();
7991        let err = cat
7992            .get_mut("users")
7993            .unwrap()
7994            .add_index("ix".into(), "ghost")
7995            .unwrap_err();
7996        assert!(matches!(err, StorageError::ColumnNotFound { ref column } if column == "ghost"));
7997    }
7998
7999    #[test]
8000    fn insert_after_create_index_updates_it() {
8001        let mut cat = populated_users();
8002        let t = cat.get_mut("users").unwrap();
8003        t.add_index("by_name".into(), "name").unwrap();
8004        t.insert(Row::new(vec![
8005            Value::Int(4),
8006            Value::Text("dave".into()),
8007            Value::Null,
8008        ]))
8009        .unwrap();
8010        let idx = t.index_on(1).unwrap();
8011        assert_eq!(
8012            idx.lookup_eq(&IndexKey::Text("dave".into())),
8013            &[RowLocator::Hot(3)]
8014        );
8015        // Pre-existing duplicates remain mapped to the two original row idxs.
8016        assert_eq!(
8017            idx.lookup_eq(&IndexKey::Text("alice".into())),
8018            &[RowLocator::Hot(0), RowLocator::Hot(2)]
8019        );
8020    }
8021
8022    #[test]
8023    fn null_or_float_values_are_not_indexed() {
8024        let mut cat = populated_users();
8025        let t = cat.get_mut("users").unwrap();
8026        t.add_index("by_score".into(), "score").unwrap();
8027        let idx = t.index_on(2).unwrap();
8028        // bob's score is NULL → no entry for bob.
8029        // Score is Float → the spec says we don't index NaN-prone columns,
8030        // so even the present scores are absent. Lookups via IndexKey::Int(90)
8031        // mis-match the column type and trivially find nothing.
8032        assert_eq!(idx.lookup_eq(&IndexKey::Int(90)), &[] as &[RowLocator]);
8033    }
8034
8035    // --- v0.11 vector type -------------------------------------------------
8036
8037    #[test]
8038    fn vector_value_data_type_carries_dim() {
8039        let v = Value::Vector(vec![1.0, 2.0, 3.0]);
8040        assert_eq!(
8041            v.data_type(),
8042            Some(DataType::Vector {
8043                dim: 3,
8044                encoding: VecEncoding::F32
8045            })
8046        );
8047    }
8048
8049    #[test]
8050    fn vector_column_insert_matching_dim_ok() {
8051        let mut cat = Catalog::new();
8052        cat.create_table(TableSchema::new(
8053            "emb",
8054            vec![ColumnSchema::new(
8055                "v",
8056                DataType::Vector {
8057                    dim: 3,
8058                    encoding: VecEncoding::F32,
8059                },
8060                false,
8061            )],
8062        ))
8063        .unwrap();
8064        cat.get_mut("emb")
8065            .unwrap()
8066            .insert(Row::new(vec![Value::Vector(vec![1.0, 2.0, 3.0])]))
8067            .unwrap();
8068    }
8069
8070    #[test]
8071    fn vector_column_insert_dim_mismatch_rejected() {
8072        let mut cat = Catalog::new();
8073        cat.create_table(TableSchema::new(
8074            "emb",
8075            vec![ColumnSchema::new(
8076                "v",
8077                DataType::Vector {
8078                    dim: 3,
8079                    encoding: VecEncoding::F32,
8080                },
8081                false,
8082            )],
8083        ))
8084        .unwrap();
8085        let err = cat
8086            .get_mut("emb")
8087            .unwrap()
8088            .insert(Row::new(vec![Value::Vector(vec![1.0, 2.0])]))
8089            .unwrap_err();
8090        assert!(matches!(err, StorageError::TypeMismatch { .. }));
8091    }
8092
8093    #[test]
8094    fn vector_value_survives_catalog_round_trip() {
8095        let mut cat = Catalog::new();
8096        cat.create_table(TableSchema::new(
8097            "emb",
8098            vec![
8099                ColumnSchema::new("id", DataType::Int, false),
8100                ColumnSchema::new(
8101                    "v",
8102                    DataType::Vector {
8103                        dim: 4,
8104                        encoding: VecEncoding::F32,
8105                    },
8106                    false,
8107                ),
8108            ],
8109        ))
8110        .unwrap();
8111        cat.get_mut("emb")
8112            .unwrap()
8113            .insert(Row::new(vec![
8114                Value::Int(1),
8115                Value::Vector(vec![0.5, -1.25, 3.0, 7.0]),
8116            ]))
8117            .unwrap();
8118        let restored = Catalog::deserialize(&cat.serialize()).expect("round-trip");
8119        let table = restored.get("emb").unwrap();
8120        assert_eq!(
8121            table.schema().columns[1].ty,
8122            DataType::Vector {
8123                dim: 4,
8124                encoding: VecEncoding::F32
8125            }
8126        );
8127        assert_eq!(
8128            table.rows()[0].values[1],
8129            Value::Vector(vec![0.5, -1.25, 3.0, 7.0])
8130        );
8131    }
8132
8133    #[test]
8134    fn index_survives_serialize_deserialize_round_trip() {
8135        let mut cat = populated_users();
8136        cat.get_mut("users")
8137            .unwrap()
8138            .add_index("by_name".into(), "name")
8139            .unwrap();
8140        let restored = Catalog::deserialize(&cat.serialize()).unwrap();
8141        let idx = restored
8142            .get("users")
8143            .unwrap()
8144            .index_on(1)
8145            .expect("index_on(1) after restore");
8146        assert_eq!(idx.name, "by_name");
8147        // Data was rebuilt from rows, not deserialized directly.
8148        assert_eq!(
8149            idx.lookup_eq(&IndexKey::Text("alice".into())),
8150            &[RowLocator::Hot(0), RowLocator::Hot(2)]
8151        );
8152    }
8153
8154    // --- v5.1 cold-tier integration tests ----------------------
8155
8156    /// Schema with a BIGINT PK column matching what the v5.1 cold-
8157    /// tier path supports (`IndexKey::Int` → `u64` cast).
8158    fn bigint_pk_users_schema() -> TableSchema {
8159        TableSchema::new(
8160            "users",
8161            vec![
8162                ColumnSchema::new("id", DataType::BigInt, false),
8163                ColumnSchema::new("name", DataType::Text, false),
8164            ],
8165        )
8166    }
8167
8168    fn make_user_row(id: i64, name: &str) -> Row {
8169        Row::new(vec![Value::BigInt(id), Value::Text(name.into())])
8170    }
8171
8172    #[test]
8173    fn lookup_by_pk_finds_row_via_hot_index() {
8174        let mut cat = Catalog::new();
8175        cat.create_table(bigint_pk_users_schema()).unwrap();
8176        let t = cat.get_mut("users").unwrap();
8177        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
8178            t.insert(make_user_row(id, name)).unwrap();
8179        }
8180        t.add_index("by_id".into(), "id").unwrap();
8181        // All locators are Hot; cold_segments is empty.
8182        let got = cat
8183            .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
8184            .unwrap();
8185        assert_eq!(got, make_user_row(2, "bob"));
8186        assert_eq!(cat.cold_segment_count(), 0);
8187    }
8188
8189    #[test]
8190    fn lookup_by_pk_returns_none_when_key_missing() {
8191        let mut cat = Catalog::new();
8192        cat.create_table(bigint_pk_users_schema()).unwrap();
8193        let t = cat.get_mut("users").unwrap();
8194        t.insert(make_user_row(1, "alice")).unwrap();
8195        t.add_index("by_id".into(), "id").unwrap();
8196        assert!(
8197            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(999))
8198                .is_none()
8199        );
8200        // Also: unknown table / unknown index name.
8201        assert!(
8202            cat.lookup_by_pk("other_table", "by_id", &IndexKey::Int(1))
8203                .is_none()
8204        );
8205        assert!(
8206            cat.lookup_by_pk("users", "no_such_index", &IndexKey::Int(1))
8207                .is_none()
8208        );
8209    }
8210
8211    #[test]
8212    fn lookup_by_pk_resolves_cold_locator_via_loaded_segment() {
8213        // Build a cold-tier segment whose payloads are dense-encoded
8214        // BIGINT rows. Wire each PK into the BTree index as a Cold
8215        // locator. The hot tier carries no rows for those PKs.
8216        let mut cat = Catalog::new();
8217        cat.create_table(bigint_pk_users_schema()).unwrap();
8218        let t = cat.get_mut("users").unwrap();
8219        t.add_index("by_id".into(), "id").unwrap();
8220        let schema = t.schema.clone();
8221
8222        let cold_rows: Vec<(i64, &str)> =
8223            vec![(100, "ivy"), (200, "joe"), (300, "kim"), (400, "lin")];
8224        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
8225            .iter()
8226            .map(|(id, name)| {
8227                let row = make_user_row(*id, name);
8228                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
8229            })
8230            .collect();
8231        let (seg_bytes, _meta) =
8232            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
8233        let seg_id = cat.load_segment_bytes(seg_bytes).unwrap();
8234        assert_eq!(seg_id, 0);
8235        assert_eq!(cat.cold_segment_count(), 1);
8236
8237        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
8238            .iter()
8239            .map(|(id, _)| {
8240                (
8241                    IndexKey::Int(*id),
8242                    RowLocator::Cold {
8243                        segment_id: seg_id,
8244                        page_offset: 0,
8245                    },
8246                )
8247            })
8248            .collect();
8249        let registered = cat
8250            .get_mut("users")
8251            .unwrap()
8252            .register_cold_locators("by_id", pairs)
8253            .unwrap();
8254        assert_eq!(registered, 4);
8255
8256        for (id, name) in &cold_rows {
8257            let got = cat
8258                .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
8259                .unwrap_or_else(|| panic!("cold key {id} not found"));
8260            assert_eq!(got, make_user_row(*id, name));
8261        }
8262        // Cold key that isn't in the segment must return None.
8263        assert!(
8264            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(999))
8265                .is_none()
8266        );
8267    }
8268
8269    #[test]
8270    fn lookup_by_pk_mixes_hot_and_cold_tiers() {
8271        // Half the rows live in the hot tier (Table::rows + add_index
8272        // produces Hot locators); half live in a cold segment and have
8273        // Cold locators wired manually. Each lookup hits the right tier.
8274        let mut cat = Catalog::new();
8275        cat.create_table(bigint_pk_users_schema()).unwrap();
8276        let t = cat.get_mut("users").unwrap();
8277        for (id, name) in [(1i64, "alice"), (2, "bob")] {
8278            t.insert(make_user_row(id, name)).unwrap();
8279        }
8280        t.add_index("by_id".into(), "id").unwrap();
8281        let schema = t.schema.clone();
8282
8283        let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe")];
8284        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
8285            .iter()
8286            .map(|(id, name)| {
8287                let row = make_user_row(*id, name);
8288                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
8289            })
8290            .collect();
8291        let (seg_bytes, _) =
8292            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
8293        let seg_id = cat.load_segment_bytes(seg_bytes).unwrap();
8294        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
8295            .iter()
8296            .map(|(id, _)| {
8297                (
8298                    IndexKey::Int(*id),
8299                    RowLocator::Cold {
8300                        segment_id: seg_id,
8301                        page_offset: 0,
8302                    },
8303                )
8304            })
8305            .collect();
8306        cat.get_mut("users")
8307            .unwrap()
8308            .register_cold_locators("by_id", pairs)
8309            .unwrap();
8310
8311        // Hot tier hits.
8312        assert_eq!(
8313            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
8314                .unwrap(),
8315            make_user_row(1, "alice")
8316        );
8317        assert_eq!(
8318            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
8319                .unwrap(),
8320            make_user_row(2, "bob")
8321        );
8322        // Cold tier hits.
8323        assert_eq!(
8324            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(100))
8325                .unwrap(),
8326            make_user_row(100, "ivy")
8327        );
8328        assert_eq!(
8329            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(200))
8330                .unwrap(),
8331            make_user_row(200, "joe")
8332        );
8333        // Miss in both tiers.
8334        assert!(
8335            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(50))
8336                .is_none()
8337        );
8338    }
8339
8340    #[test]
8341    fn register_cold_locators_rejects_nsw_index() {
8342        let mut cat = Catalog::new();
8343        cat.create_table(TableSchema::new(
8344            "vecs",
8345            vec![
8346                ColumnSchema::new("id", DataType::Int, false),
8347                ColumnSchema::new(
8348                    "v",
8349                    DataType::Vector {
8350                        dim: 4,
8351                        encoding: VecEncoding::F32,
8352                    },
8353                    false,
8354                ),
8355            ],
8356        ))
8357        .unwrap();
8358        let t = cat.get_mut("vecs").unwrap();
8359        t.insert(Row::new(vec![
8360            Value::Int(1),
8361            Value::Vector(vec![1.0, 0.0, 0.0, 0.0]),
8362        ]))
8363        .unwrap();
8364        t.add_nsw_index("by_v".into(), "v", NSW_DEFAULT_M).unwrap();
8365        let err = t
8366            .register_cold_locators(
8367                "by_v",
8368                vec![(
8369                    IndexKey::Int(1),
8370                    RowLocator::Cold {
8371                        segment_id: 0,
8372                        page_offset: 0,
8373                    },
8374                )],
8375            )
8376            .unwrap_err();
8377        // v6.7.1: message switched from "is NSW" to "is not BTree"
8378        // when the Brin variant was added.
8379        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("not BTree")));
8380    }
8381
8382    #[test]
8383    fn load_segment_bytes_rejects_garbage() {
8384        let mut cat = Catalog::new();
8385        let err = cat.load_segment_bytes(vec![0u8; 10]).unwrap_err();
8386        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("segment")));
8387        // Loader doesn't mutate state on error.
8388        assert_eq!(cat.cold_segment_count(), 0);
8389    }
8390
8391    #[test]
8392    fn load_segment_bytes_returns_sequential_ids() {
8393        let mut cat = Catalog::new();
8394        cat.create_table(bigint_pk_users_schema()).unwrap();
8395        let schema = cat.get("users").unwrap().schema.clone();
8396        for batch in 0u32..3 {
8397            let rows: Vec<(u64, Vec<u8>)> = (0u64..4)
8398                .map(|i| {
8399                    let id = u64::from(batch) * 100 + i;
8400                    let row = make_user_row(id.cast_signed(), "x");
8401                    (id, encode_row_body_dense(&row, &schema))
8402                })
8403                .collect();
8404            let (bytes, _) = encode_segment(rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
8405            assert_eq!(cat.load_segment_bytes(bytes).unwrap(), batch);
8406        }
8407        assert_eq!(cat.cold_segment_count(), 3);
8408    }
8409
8410    // --- v5.2 catalog format v9 ----------------------------------
8411
8412    /// Hand-craft a v8 catalog byte stream and confirm the v9 reader
8413    /// accepts it and surfaces every `BTree` entry as a Hot locator.
8414    /// Guards the backward-compat read path: existing v3.0.2 / v4.x
8415    /// snapshots on disk must keep loading after the v5.2 bump.
8416    #[test]
8417    fn v8_catalog_decodes_as_all_hot_under_v9_reader() {
8418        // Build a populated catalog in memory, snapshot it with the
8419        // v9 serializer, then patch the version byte back to 8 and
8420        // strip the v9 BTree payload bytes so the layout matches what
8421        // a real v8 snapshot would have produced on disk. The v9
8422        // reader's version dispatch path then rebuilds the index
8423        // from rows (every locator becomes Hot).
8424        let mut cat = populated_users();
8425        cat.get_mut("users")
8426            .unwrap()
8427            .add_index("by_name".into(), "name")
8428            .unwrap();
8429
8430        // To produce a faithful v8 byte stream we re-encode the same
8431        // catalog with the v8 layout: identical bytes up to (and
8432        // including) the per-index kind tag, but no inline BTree
8433        // entries.
8434        let v8_bytes = encode_as_v8(&cat);
8435        assert_eq!(v8_bytes[FILE_MAGIC.len()], 8, "version byte must be 8");
8436
8437        let restored = Catalog::deserialize(&v8_bytes).expect("v9 reader accepts v8 stream");
8438        let idx = restored
8439            .get("users")
8440            .unwrap()
8441            .index_on(1)
8442            .expect("index_on(1) after restore");
8443        // v8 path always materialises Hot locators (no cold tier
8444        // existed pre-v5.2).
8445        assert_eq!(
8446            idx.lookup_eq(&IndexKey::Text("alice".into())),
8447            &[RowLocator::Hot(0), RowLocator::Hot(2)]
8448        );
8449        // No accidental Cold leak.
8450        for entry in idx.lookup_eq(&IndexKey::Text("alice".into())) {
8451            assert!(entry.is_hot(), "v8 → v9 read must yield Hot only");
8452        }
8453    }
8454
8455    /// Encode `cat` using the v8 layout (no inline `BTree` entries,
8456    /// version byte = 8). Pure test helper — duplicates just enough
8457    /// of `Catalog::serialize` to produce a faithful v8 stream that
8458    /// real v3.0.2 / v4.x deployments wrote.
8459    fn encode_as_v8(cat: &Catalog) -> Vec<u8> {
8460        let mut out = Vec::with_capacity(64);
8461        out.extend_from_slice(FILE_MAGIC);
8462        out.push(8u8);
8463        write_u32(&mut out, u32::try_from(cat.tables.len()).unwrap());
8464        for t in &cat.tables {
8465            write_str(&mut out, &t.schema.name);
8466            write_u16(&mut out, u16::try_from(t.schema.columns.len()).unwrap());
8467            for c in &t.schema.columns {
8468                write_str(&mut out, &c.name);
8469                write_data_type(&mut out, c.ty);
8470                out.push(u8::from(c.nullable));
8471                match &c.default {
8472                    None => out.push(0),
8473                    Some(v) => {
8474                        out.push(1);
8475                        write_value(&mut out, v);
8476                    }
8477                }
8478                out.push(u8::from(c.auto_increment));
8479            }
8480            write_u32(&mut out, u32::try_from(t.rows.len()).unwrap());
8481            for row in &t.rows {
8482                out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
8483            }
8484            write_u16(&mut out, u16::try_from(t.indices.len()).unwrap());
8485            for idx in &t.indices {
8486                write_str(&mut out, &idx.name);
8487                write_u16(&mut out, u16::try_from(idx.column_position).unwrap());
8488                match &idx.kind {
8489                    // v8 BTree wrote only the kind tag; entries
8490                    // rebuild from rows on read.
8491                    IndexKind::BTree(_) => out.push(0),
8492                    IndexKind::Nsw(g) => {
8493                        out.push(1);
8494                        write_u16(&mut out, u16::try_from(g.m).unwrap());
8495                        write_nsw_graph(&mut out, g);
8496                    }
8497                    // v8 had no BRIN / GIN; this test-only writer
8498                    // can't serialise either into the legacy format.
8499                    IndexKind::Brin { .. } => panic!(
8500                        "v8 catalog writer cannot serialise BRIN — \
8501                         tests with BRIN indices must use the current writer"
8502                    ),
8503                    IndexKind::Gin(_) => panic!(
8504                        "v8 catalog writer cannot serialise GIN — \
8505                         tests with GIN indices must use the current writer"
8506                    ),
8507                    IndexKind::GinTrgm(_) => panic!(
8508                        "v8 catalog writer cannot serialise trigram-GIN — \
8509                         tests with trgm indices must use the current writer"
8510                    ),
8511                }
8512            }
8513        }
8514        out
8515    }
8516
8517    /// Build a catalog that carries both hot and cold locators on a
8518    /// `BTree` index, snapshot it through `serialize`, then deserialise
8519    /// and confirm every Cold locator round-trips byte-identical and
8520    /// `lookup_by_pk` resolves through the rebuilt cold-segment
8521    /// registry.
8522    #[test]
8523    fn v9_catalog_round_trip_preserves_cold_locators() {
8524        let mut cat = Catalog::new();
8525        cat.create_table(bigint_pk_users_schema()).unwrap();
8526        let t = cat.get_mut("users").unwrap();
8527        // Hot rows: 1, 2
8528        for (id, name) in [(1i64, "alice"), (2, "bob")] {
8529            t.insert(make_user_row(id, name)).unwrap();
8530        }
8531        t.add_index("by_id".into(), "id").unwrap();
8532        let schema = t.schema.clone();
8533
8534        // Cold rows: 100, 200, 300 — sit in a single segment.
8535        let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe"), (300, "kim")];
8536        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
8537            .iter()
8538            .map(|(id, name)| {
8539                let row = make_user_row(*id, name);
8540                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
8541            })
8542            .collect();
8543        let (seg_bytes, _) =
8544            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
8545        let seg_id = cat.load_segment_bytes(seg_bytes.clone()).unwrap();
8546        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
8547            .iter()
8548            .map(|(id, _)| {
8549                (
8550                    IndexKey::Int(*id),
8551                    RowLocator::Cold {
8552                        segment_id: seg_id,
8553                        page_offset: 0,
8554                    },
8555                )
8556            })
8557            .collect();
8558        cat.get_mut("users")
8559            .unwrap()
8560            .register_cold_locators("by_id", pairs)
8561            .unwrap();
8562
8563        // Snapshot + restore via the v9 codec.
8564        let bytes = cat.serialize();
8565        assert_eq!(bytes[FILE_MAGIC.len()], FILE_VERSION);
8566        let mut restored = Catalog::deserialize(&bytes).expect("v9 round-trip parses");
8567
8568        // Catalog::serialize does not yet emit cold segment file
8569        // bytes (v5.3 manifest is the future home for that). For
8570        // this v9 test the caller side-loads the segment again so
8571        // lookup_by_pk can resolve the Cold locator. The point of
8572        // this assertion is that the locator metadata survived the
8573        // catalog round-trip.
8574        let restored_seg_id = restored.load_segment_bytes(seg_bytes).unwrap();
8575        assert_eq!(restored_seg_id, seg_id);
8576
8577        let idx = restored.get("users").unwrap().index_on(0).unwrap();
8578        // Hot locators round-trip.
8579        assert_eq!(idx.lookup_eq(&IndexKey::Int(1)), &[RowLocator::Hot(0)]);
8580        assert_eq!(idx.lookup_eq(&IndexKey::Int(2)), &[RowLocator::Hot(1)]);
8581        // Cold locators round-trip byte-identical.
8582        for (id, _) in &cold_rows {
8583            assert_eq!(
8584                idx.lookup_eq(&IndexKey::Int(*id)),
8585                &[RowLocator::Cold {
8586                    segment_id: seg_id,
8587                    page_offset: 0,
8588                }]
8589            );
8590        }
8591        // End-to-end: lookup_by_pk resolves both tiers.
8592        assert_eq!(
8593            restored
8594                .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
8595                .unwrap(),
8596            make_user_row(2, "bob")
8597        );
8598        for (id, name) in &cold_rows {
8599            assert_eq!(
8600                restored
8601                    .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
8602                    .unwrap(),
8603                make_user_row(*id, name)
8604            );
8605        }
8606    }
8607
8608    // --- v5.2.1 hot tier byte tracking ---------------------------
8609
8610    /// `row_body_encoded_len` is the perf-critical fast path; pin it
8611    /// against `encode_row_body_dense(...).len()` for every
8612    /// representative cell type so an encoder change can't silently
8613    /// desync the counter.
8614    #[test]
8615    fn row_body_encoded_len_matches_actual_encode_for_all_types() {
8616        let schema = TableSchema::new(
8617            "wide",
8618            vec![
8619                ColumnSchema::new("a", DataType::SmallInt, true),
8620                ColumnSchema::new("b", DataType::Int, false),
8621                ColumnSchema::new("c", DataType::BigInt, false),
8622                ColumnSchema::new("d", DataType::Float, false),
8623                ColumnSchema::new("e", DataType::Bool, false),
8624                ColumnSchema::new("f", DataType::Text, false),
8625                ColumnSchema::new(
8626                    "g",
8627                    DataType::Vector {
8628                        dim: 3,
8629                        encoding: VecEncoding::F32,
8630                    },
8631                    false,
8632                ),
8633                ColumnSchema::new(
8634                    "h",
8635                    DataType::Numeric {
8636                        precision: 18,
8637                        scale: 2,
8638                    },
8639                    false,
8640                ),
8641                ColumnSchema::new("i", DataType::Date, false),
8642                ColumnSchema::new("j", DataType::Timestamp, false),
8643            ],
8644        );
8645        let cases: &[Row] = &[
8646            Row::new(vec![
8647                Value::SmallInt(7),
8648                Value::Int(42),
8649                Value::BigInt(1_000_000),
8650                Value::Float(1.5),
8651                Value::Bool(true),
8652                Value::Text("hello".into()),
8653                Value::Vector(vec![1.0, 2.0, 3.0]),
8654                Value::Numeric {
8655                    scaled: 12345,
8656                    scale: 2,
8657                },
8658                Value::Date(20_000),
8659                Value::Timestamp(1_700_000_000_000_000),
8660            ]),
8661            // NULL in the bitmap, varied text length.
8662            Row::new(vec![
8663                Value::Null,
8664                Value::Int(0),
8665                Value::BigInt(0),
8666                Value::Float(0.0),
8667                Value::Bool(false),
8668                Value::Text(String::new()),
8669                Value::Vector(vec![]),
8670                Value::Numeric {
8671                    scaled: 0,
8672                    scale: 2,
8673                },
8674                Value::Date(0),
8675                Value::Timestamp(0),
8676            ]),
8677            Row::new(vec![
8678                Value::SmallInt(-1),
8679                Value::Int(-1),
8680                Value::BigInt(-1),
8681                Value::Float(-0.5),
8682                Value::Bool(true),
8683                Value::Text("a much longer payload here".into()),
8684                Value::Vector(vec![0.1, 0.2, 0.3]),
8685                Value::Numeric {
8686                    scaled: -999_999_999,
8687                    scale: 2,
8688                },
8689                Value::Date(-1),
8690                Value::Timestamp(-1),
8691            ]),
8692        ];
8693        for row in cases {
8694            let actual = encode_row_body_dense(row, &schema).len();
8695            let fast = row_body_encoded_len(row, &schema);
8696            assert_eq!(actual, fast, "row {row:?}");
8697        }
8698    }
8699
8700    #[test]
8701    fn hot_bytes_grows_on_insert_and_matches_encoded_sum() {
8702        let mut cat = Catalog::new();
8703        cat.create_table(bigint_pk_users_schema()).unwrap();
8704        let t = cat.get_mut("users").unwrap();
8705        assert_eq!(t.hot_bytes(), 0);
8706        let mut expected: u64 = 0;
8707        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
8708            let row = make_user_row(id, name);
8709            expected += encode_row_body_dense(&row, &t.schema).len() as u64;
8710            t.insert(row).unwrap();
8711        }
8712        assert_eq!(t.hot_bytes(), expected);
8713        assert_eq!(cat.hot_tier_bytes(), expected);
8714    }
8715
8716    #[test]
8717    fn hot_bytes_shrinks_on_delete() {
8718        let mut cat = Catalog::new();
8719        cat.create_table(bigint_pk_users_schema()).unwrap();
8720        let t = cat.get_mut("users").unwrap();
8721        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
8722            t.insert(make_user_row(id, name)).unwrap();
8723        }
8724        let before = t.hot_bytes();
8725        // Delete row at position 1 (bob).
8726        let bob_row = make_user_row(2, "bob");
8727        let bob_bytes = encode_row_body_dense(&bob_row, &t.schema).len() as u64;
8728        let removed = t.delete_rows(&[1]);
8729        assert_eq!(removed, 1);
8730        assert_eq!(t.hot_bytes(), before - bob_bytes);
8731    }
8732
8733    #[test]
8734    fn hot_bytes_diffs_on_update_for_variable_width_columns() {
8735        let mut cat = Catalog::new();
8736        cat.create_table(bigint_pk_users_schema()).unwrap();
8737        let t = cat.get_mut("users").unwrap();
8738        t.insert(make_user_row(1, "alice")).unwrap();
8739        let after_insert = t.hot_bytes();
8740        // Update with a longer text payload — bytes must grow exactly
8741        // by the text-length delta.
8742        let new_row = make_user_row(1, "alice-the-longer-name");
8743        let old_len = encode_row_body_dense(&make_user_row(1, "alice"), &t.schema).len() as u64;
8744        let new_len = encode_row_body_dense(&new_row, &t.schema).len() as u64;
8745        t.update_row(0, new_row.values).unwrap();
8746        assert_eq!(t.hot_bytes(), after_insert - old_len + new_len);
8747        assert!(t.hot_bytes() > after_insert, "longer text grew the counter");
8748    }
8749
8750    #[test]
8751    fn hot_bytes_round_trips_through_serialize_deserialize() {
8752        let mut cat = Catalog::new();
8753        cat.create_table(bigint_pk_users_schema()).unwrap();
8754        let t = cat.get_mut("users").unwrap();
8755        for i in 0..10 {
8756            t.insert(make_user_row(i, &alloc::format!("name-{i}")))
8757                .unwrap();
8758        }
8759        let pre = cat.hot_tier_bytes();
8760        let restored = Catalog::deserialize(&cat.serialize()).unwrap();
8761        assert_eq!(restored.hot_tier_bytes(), pre);
8762        assert_eq!(restored.get("users").unwrap().hot_bytes(), pre);
8763    }
8764
8765    // --- v5.2.2 freezer atomic swap -------------------------------
8766
8767    /// Happy path: freeze the first half of a populated hot tier,
8768    /// confirm row counts shift, `hot_bytes` shrinks, and every frozen
8769    /// PK still resolves via `lookup_by_pk` (now through the cold
8770    /// segment registered by the freeze).
8771    #[test]
8772    fn freeze_oldest_to_cold_moves_rows_and_keeps_lookups_working() {
8773        let mut cat = Catalog::new();
8774        cat.create_table(bigint_pk_users_schema()).unwrap();
8775        let t = cat.get_mut("users").unwrap();
8776        for id in 0..10i64 {
8777            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
8778                .unwrap();
8779        }
8780        t.add_index("by_id".into(), "id").unwrap();
8781        let total_bytes_before = t.hot_bytes();
8782
8783        let report = cat
8784            .freeze_oldest_to_cold("users", "by_id", 6)
8785            .expect("freeze succeeds");
8786        assert_eq!(report.frozen_rows, 6);
8787        assert_eq!(report.segment_id, 0);
8788        assert!(report.bytes_freed > 0);
8789        assert!(!report.segment_bytes.is_empty());
8790
8791        let t = cat.get("users").unwrap();
8792        assert_eq!(t.row_count(), 4, "4 hot rows remain (10 - 6 frozen)");
8793        assert_eq!(cat.cold_segment_count(), 1);
8794        // Hot bytes shrank by exactly the freed amount.
8795        assert_eq!(
8796            t.hot_bytes(),
8797            total_bytes_before - report.bytes_freed,
8798            "hot_bytes accounting matches FreezeReport"
8799        );
8800
8801        // Every original PK still resolves — frozen ones via the
8802        // cold segment, kept ones via the (renumbered) hot tier.
8803        for id in 0..10i64 {
8804            let got = cat
8805                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
8806                .unwrap_or_else(|| panic!("PK {id} disappeared after freeze"));
8807            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
8808        }
8809    }
8810
8811    /// Two successive freezes on the same index must preserve the
8812    /// first batch's cold locators when the second freeze runs.
8813    /// Catches the `rebuild_indices` wipe-Cold-on-delete bug that
8814    /// `collect_cold_locators` / re-register guards against.
8815    #[test]
8816    fn freeze_twice_preserves_prior_cold_locators() {
8817        let mut cat = Catalog::new();
8818        cat.create_table(bigint_pk_users_schema()).unwrap();
8819        let t = cat.get_mut("users").unwrap();
8820        for id in 0..12i64 {
8821            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
8822                .unwrap();
8823        }
8824        t.add_index("by_id".into(), "id").unwrap();
8825
8826        cat.freeze_oldest_to_cold("users", "by_id", 4)
8827            .expect("first freeze ok");
8828        cat.freeze_oldest_to_cold("users", "by_id", 4)
8829            .expect("second freeze ok");
8830
8831        assert_eq!(cat.get("users").unwrap().row_count(), 4);
8832        assert_eq!(cat.cold_segment_count(), 2);
8833        // All 12 PKs still resolve — first 4 via segment 0,
8834        // next 4 via segment 1, last 4 still hot.
8835        for id in 0..12i64 {
8836            let got = cat
8837                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
8838                .unwrap_or_else(|| panic!("PK {id} not resolvable after two freezes"));
8839            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
8840        }
8841    }
8842
8843    /// Validation guard tests. Each must return `Err` and **not
8844    /// mutate the catalog** — the API is all-or-nothing.
8845    #[test]
8846    fn freeze_oldest_to_cold_rejects_invalid_input() {
8847        let mut cat = Catalog::new();
8848        cat.create_table(bigint_pk_users_schema()).unwrap();
8849        let t = cat.get_mut("users").unwrap();
8850        for id in 0..3i64 {
8851            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
8852                .unwrap();
8853        }
8854        t.add_index("by_id".into(), "id").unwrap();
8855
8856        // max_rows == 0
8857        assert!(matches!(
8858            cat.freeze_oldest_to_cold("users", "by_id", 0),
8859            Err(StorageError::Corrupt(_))
8860        ));
8861        // table missing
8862        assert!(matches!(
8863            cat.freeze_oldest_to_cold("missing", "by_id", 1),
8864            Err(StorageError::Corrupt(_))
8865        ));
8866        // index missing
8867        assert!(matches!(
8868            cat.freeze_oldest_to_cold("users", "no_such_index", 1),
8869            Err(StorageError::Corrupt(_))
8870        ));
8871        // max_rows > row_count
8872        assert!(matches!(
8873            cat.freeze_oldest_to_cold("users", "by_id", 999),
8874            Err(StorageError::Corrupt(_))
8875        ));
8876        // Catalog still untouched.
8877        assert_eq!(cat.get("users").unwrap().row_count(), 3);
8878        assert_eq!(cat.cold_segment_count(), 0);
8879    }
8880
8881    /// Freeze with a non-integer PK column must surface a clear
8882    /// error (Text PKs land in v5.5+).
8883    #[test]
8884    fn freeze_oldest_to_cold_rejects_non_integer_pk() {
8885        let mut cat = Catalog::new();
8886        cat.create_table(TableSchema::new(
8887            "by_name",
8888            vec![
8889                ColumnSchema::new("name", DataType::Text, false),
8890                ColumnSchema::new("payload", DataType::BigInt, false),
8891            ],
8892        ))
8893        .unwrap();
8894        let t = cat.get_mut("by_name").unwrap();
8895        t.insert(Row::new(vec![Value::Text("a".into()), Value::BigInt(1)]))
8896            .unwrap();
8897        t.add_index("by_n".into(), "name").unwrap();
8898        let err = cat
8899            .freeze_oldest_to_cold("by_name", "by_n", 1)
8900            .expect_err("non-integer PK rejected");
8901        match err {
8902            StorageError::Corrupt(s) => assert!(
8903                s.contains("non-integer"),
8904                "error message names the constraint: {s}"
8905            ),
8906            other => panic!("expected Corrupt, got {other:?}"),
8907        }
8908        // Catalog untouched.
8909        assert_eq!(cat.get("by_name").unwrap().row_count(), 1);
8910        assert_eq!(cat.cold_segment_count(), 0);
8911    }
8912
8913    /// Hot-tier rows after the freeze must keep their secondary-
8914    /// index lookups working — `delete_rows` shifts positions, and
8915    /// `rebuild_indices` must regenerate Hot locators at the new
8916    /// indices.
8917    #[test]
8918    fn freeze_keeps_remaining_hot_rows_addressable_via_secondary_index() {
8919        let mut cat = Catalog::new();
8920        cat.create_table(bigint_pk_users_schema()).unwrap();
8921        let t = cat.get_mut("users").unwrap();
8922        for id in 0..6i64 {
8923            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
8924                .unwrap();
8925        }
8926        t.add_index("by_id".into(), "id").unwrap();
8927        t.add_index("by_name".into(), "name").unwrap();
8928
8929        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
8930
8931        // Remaining hot rows: id 3, 4, 5. They moved to positions
8932        // 0, 1, 2 inside `self.rows`; the `by_name` index must now
8933        // resolve them via fresh Hot locators.
8934        let idx = cat.get("users").unwrap().index_on(1).unwrap();
8935        let got = idx.lookup_eq(&IndexKey::Text("u-4".into()));
8936        assert_eq!(got.len(), 1);
8937        assert!(got[0].is_hot(), "kept-hot rows still surface as Hot");
8938        match got[0] {
8939            RowLocator::Hot(i) => {
8940                // The 4th-inserted row was at position 4; after
8941                // dropping positions 0..3 it sits at position 1.
8942                assert_eq!(i, 1);
8943            }
8944            RowLocator::Cold { .. } => unreachable!(),
8945        }
8946    }
8947
8948    // --- v5.2.3 promote-on-write primitives ----------------------
8949
8950    /// Build a populated catalog with the first N rows frozen, then
8951    /// run `promote_cold_row` and verify the row crossed tiers
8952    /// correctly: the cold locator is retired, a fresh Hot locator
8953    /// appears, `lookup_by_pk` returns the row from the hot tier, and
8954    /// `hot_bytes` grew by the row's encoded byte length.
8955    #[test]
8956    fn promote_cold_row_pulls_frozen_row_back_to_hot_tier() {
8957        let mut cat = Catalog::new();
8958        cat.create_table(bigint_pk_users_schema()).unwrap();
8959        let t = cat.get_mut("users").unwrap();
8960        for id in 0..6i64 {
8961            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
8962                .unwrap();
8963        }
8964        t.add_index("by_id".into(), "id").unwrap();
8965        // Freeze first 4 rows (ids 0..3). After: hot rows = 4, 5 at
8966        // positions 0, 1; cold locators for keys 0..3.
8967        cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
8968        let hot_bytes_before = cat.get("users").unwrap().hot_bytes();
8969
8970        // Promote PK=2 — it lives in segment 0 as a cold row.
8971        let new_idx = cat
8972            .promote_cold_row("users", "by_id", &IndexKey::Int(2))
8973            .expect("promote ok")
8974            .expect("PK 2 was cold");
8975        assert_eq!(
8976            new_idx, 2,
8977            "promoted row appended after the 2 surviving hot rows"
8978        );
8979
8980        let t = cat.get("users").unwrap();
8981        assert_eq!(t.row_count(), 3, "hot tier grew from 2 to 3");
8982        // Hot-bytes climbed by exactly one row's encoded length.
8983        let row = make_user_row(2, "u-2");
8984        let row_len = encode_row_body_dense(&row, &t.schema).len() as u64;
8985        assert_eq!(t.hot_bytes(), hot_bytes_before + row_len);
8986
8987        // The index now reports a Hot locator (the freshly inserted
8988        // row) — no Cold locator left for PK 2.
8989        let entries = t.index_on(0).unwrap().lookup_eq(&IndexKey::Int(2));
8990        assert_eq!(entries.len(), 1, "exactly one locator per key");
8991        assert!(entries[0].is_hot(), "promote retired the Cold locator");
8992        // End-to-end: lookup_by_pk still returns the row body.
8993        assert_eq!(
8994            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
8995                .unwrap(),
8996            row
8997        );
8998        // Other cold rows untouched — still resolvable through the
8999        // segment.
9000        assert_eq!(
9001            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(0))
9002                .unwrap(),
9003            make_user_row(0, "u-0")
9004        );
9005    }
9006
9007    /// `promote_cold_row` on a key that's already hot (or absent)
9008    /// returns `Ok(None)` — not an error. The caller falls back to
9009    /// the hot-only update/delete path.
9010    #[test]
9011    fn promote_cold_row_returns_none_when_key_is_not_cold() {
9012        let mut cat = Catalog::new();
9013        cat.create_table(bigint_pk_users_schema()).unwrap();
9014        let t = cat.get_mut("users").unwrap();
9015        t.insert(make_user_row(7, "alice")).unwrap();
9016        t.add_index("by_id".into(), "id").unwrap();
9017
9018        // Hot-only key.
9019        assert!(
9020            cat.promote_cold_row("users", "by_id", &IndexKey::Int(7))
9021                .unwrap()
9022                .is_none()
9023        );
9024        // Absent key.
9025        assert!(
9026            cat.promote_cold_row("users", "by_id", &IndexKey::Int(99))
9027                .unwrap()
9028                .is_none()
9029        );
9030        // Catalog untouched on both no-op paths.
9031        assert_eq!(cat.get("users").unwrap().row_count(), 1);
9032        assert_eq!(cat.cold_segment_count(), 0);
9033    }
9034
9035    /// `shadow_cold_row` removes every Cold locator for a key on a
9036    /// `BTree` index. After the shadow, `lookup_by_pk` for that key
9037    /// returns None (the row data still sits in the segment file,
9038    /// but it's now garbage; compaction will reclaim it later).
9039    #[test]
9040    fn shadow_cold_row_removes_cold_locators_and_drops_lookup() {
9041        let mut cat = Catalog::new();
9042        cat.create_table(bigint_pk_users_schema()).unwrap();
9043        let t = cat.get_mut("users").unwrap();
9044        for id in 0..5i64 {
9045            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9046                .unwrap();
9047        }
9048        t.add_index("by_id".into(), "id").unwrap();
9049        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
9050
9051        // Shadow PK=1 — pre-shadow lookup hits the cold tier.
9052        assert!(
9053            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
9054                .is_some(),
9055            "frozen PK resolves before shadow"
9056        );
9057        let removed = cat
9058            .shadow_cold_row("users", "by_id", &IndexKey::Int(1))
9059            .unwrap();
9060        assert_eq!(removed, 1, "exactly one cold locator retired");
9061
9062        // Post-shadow: lookup misses, even though the row still
9063        // exists in segment 0.
9064        assert!(
9065            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
9066                .is_none(),
9067            "shadowed key no longer resolves"
9068        );
9069        // Other cold keys still resolve.
9070        assert_eq!(
9071            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(0))
9072                .unwrap(),
9073            make_user_row(0, "u-0")
9074        );
9075        assert_eq!(
9076            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
9077                .unwrap(),
9078            make_user_row(2, "u-2")
9079        );
9080    }
9081
9082    /// `shadow_cold_row` returns 0 (not Err) for keys with only Hot
9083    /// entries or no entries — the engine's DELETE path uses this
9084    /// signal to decide whether the cold-tier shadow path consumed
9085    /// the work.
9086    #[test]
9087    fn shadow_cold_row_returns_zero_when_key_is_not_cold() {
9088        let mut cat = Catalog::new();
9089        cat.create_table(bigint_pk_users_schema()).unwrap();
9090        let t = cat.get_mut("users").unwrap();
9091        t.insert(make_user_row(1, "alice")).unwrap();
9092        t.add_index("by_id".into(), "id").unwrap();
9093        assert_eq!(
9094            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(1))
9095                .unwrap(),
9096            0,
9097            "hot-only key drops no cold locators"
9098        );
9099        assert_eq!(
9100            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(999))
9101                .unwrap(),
9102            0,
9103            "absent key drops no cold locators"
9104        );
9105        assert_eq!(cat.get("users").unwrap().row_count(), 1);
9106    }
9107
9108    /// Validation guards on both promote / shadow primitives.
9109    #[test]
9110    fn promote_and_shadow_reject_invalid_inputs() {
9111        let mut cat = Catalog::new();
9112        cat.create_table(bigint_pk_users_schema()).unwrap();
9113        let t = cat.get_mut("users").unwrap();
9114        t.insert(make_user_row(1, "alice")).unwrap();
9115        t.add_index("by_id".into(), "id").unwrap();
9116
9117        // Missing table.
9118        assert!(matches!(
9119            cat.promote_cold_row("missing", "by_id", &IndexKey::Int(1)),
9120            Err(StorageError::Corrupt(_))
9121        ));
9122        assert!(matches!(
9123            cat.shadow_cold_row("missing", "by_id", &IndexKey::Int(1)),
9124            Err(StorageError::Corrupt(_))
9125        ));
9126        // Missing index.
9127        assert!(matches!(
9128            cat.promote_cold_row("users", "no_such_index", &IndexKey::Int(1)),
9129            Err(StorageError::Corrupt(_))
9130        ));
9131        assert!(matches!(
9132            cat.shadow_cold_row("users", "no_such_index", &IndexKey::Int(1)),
9133            Err(StorageError::Corrupt(_))
9134        ));
9135    }
9136
9137    // --- v6.7.4 parallel-freezer slice/commit API -----------------
9138
9139    /// One slice covering the entire freeze produces the same
9140    /// catalog state as the single-threaded `freeze_oldest_to_cold`
9141    /// — segment id, frozen row count, hot byte delta, and every
9142    /// post-freeze PK lookup match exactly.
9143    #[test]
9144    fn commit_freeze_slices_single_slice_matches_freeze_oldest() {
9145        let mut a = Catalog::new();
9146        let mut b = Catalog::new();
9147        for cat in [&mut a, &mut b] {
9148            cat.create_table(bigint_pk_users_schema()).unwrap();
9149            let t = cat.get_mut("users").unwrap();
9150            for id in 0..10i64 {
9151                t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9152                    .unwrap();
9153            }
9154            t.add_index("by_id".into(), "id").unwrap();
9155        }
9156        let single = a.freeze_oldest_to_cold("users", "by_id", 6).unwrap();
9157        let slice = b
9158            .prepare_freeze_slice("users", "by_id", 0..6)
9159            .expect("prepare");
9160        let parallel = b
9161            .commit_freeze_slices("users", "by_id", alloc::vec![slice])
9162            .expect("commit");
9163        assert_eq!(single.segment_id, parallel.segment_id);
9164        assert_eq!(single.frozen_rows, parallel.frozen_rows);
9165        assert_eq!(single.bytes_freed, parallel.bytes_freed);
9166        assert_eq!(single.segment_bytes, parallel.segment_bytes);
9167        // Same post-freeze lookup behaviour on both catalogs.
9168        for id in 0..10i64 {
9169            assert_eq!(
9170                a.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
9171                b.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
9172                "PK {id} differs after single vs slice freeze"
9173            );
9174        }
9175    }
9176
9177    /// Two slices covering disjoint halves of the freeze produce
9178    /// the same merged segment as one slice covering the full
9179    /// range. The k-way merge preserves PK ordering even when
9180    /// slice halves alternate.
9181    #[test]
9182    fn commit_freeze_slices_two_slices_match_single_slice() {
9183        let mut a = Catalog::new();
9184        let mut b = Catalog::new();
9185        for cat in [&mut a, &mut b] {
9186            cat.create_table(bigint_pk_users_schema()).unwrap();
9187            let t = cat.get_mut("users").unwrap();
9188            // Random-ish PKs so the per-slice sort actually has
9189            // work to do (and slice halves carry interleaved keys).
9190            for id in [3, 7, 1, 9, 5, 0, 8, 4, 2, 6].iter().copied() {
9191                t.insert(make_user_row(id as i64, &alloc::format!("u-{id}")))
9192                    .unwrap();
9193            }
9194            t.add_index("by_id".into(), "id").unwrap();
9195        }
9196        let single = a
9197            .prepare_freeze_slice("users", "by_id", 0..8)
9198            .expect("prepare");
9199        let one = a
9200            .commit_freeze_slices("users", "by_id", alloc::vec![single])
9201            .expect("commit one");
9202        let s1 = b
9203            .prepare_freeze_slice("users", "by_id", 0..4)
9204            .expect("prepare s1");
9205        let s2 = b
9206            .prepare_freeze_slice("users", "by_id", 4..8)
9207            .expect("prepare s2");
9208        let two = b
9209            .commit_freeze_slices("users", "by_id", alloc::vec![s1, s2])
9210            .expect("commit two");
9211        assert_eq!(one.segment_bytes, two.segment_bytes);
9212        assert_eq!(one.frozen_rows, two.frozen_rows);
9213        // Every PK that survived freeze (hot or cold) resolves on
9214        // both catalogs.
9215        for id in 0..10i64 {
9216            assert_eq!(
9217                a.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
9218                b.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
9219                "PK {id} differs after one-slice vs two-slice freeze"
9220            );
9221        }
9222    }
9223
9224    /// Gap between slices → error before any mutation lands.
9225    #[test]
9226    fn commit_freeze_slices_rejects_gap() {
9227        let mut cat = Catalog::new();
9228        cat.create_table(bigint_pk_users_schema()).unwrap();
9229        let t = cat.get_mut("users").unwrap();
9230        for id in 0..6i64 {
9231            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9232                .unwrap();
9233        }
9234        t.add_index("by_id".into(), "id").unwrap();
9235        let s1 = cat.prepare_freeze_slice("users", "by_id", 0..2).unwrap();
9236        let s2 = cat.prepare_freeze_slice("users", "by_id", 3..5).unwrap();
9237        assert!(matches!(
9238            cat.commit_freeze_slices("users", "by_id", alloc::vec![s1, s2]),
9239            Err(StorageError::Corrupt(_))
9240        ));
9241        // Catalog untouched.
9242        assert_eq!(cat.cold_segment_count(), 0);
9243        assert_eq!(cat.get("users").unwrap().row_count(), 6);
9244    }
9245
9246    /// Empty slice list → no-op success, catalog untouched.
9247    #[test]
9248    fn commit_freeze_slices_empty_is_noop() {
9249        let mut cat = Catalog::new();
9250        cat.create_table(bigint_pk_users_schema()).unwrap();
9251        let t = cat.get_mut("users").unwrap();
9252        for id in 0..3i64 {
9253            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9254                .unwrap();
9255        }
9256        t.add_index("by_id".into(), "id").unwrap();
9257        let report = cat
9258            .commit_freeze_slices("users", "by_id", Vec::new())
9259            .unwrap();
9260        assert_eq!(report.frozen_rows, 0);
9261        assert_eq!(cat.cold_segment_count(), 0);
9262        assert_eq!(cat.get("users").unwrap().row_count(), 3);
9263    }
9264
9265    // --- v6.7.3 cold-segment compaction ---------------------------
9266
9267    /// Two small cold segments merge into a single larger one. The
9268    /// merged segment carries every cold-resident row; the source
9269    /// slots are tombstoned; every PK still resolves through the
9270    /// new merged segment via `lookup_by_pk`.
9271    #[test]
9272    fn compact_merges_small_segments_storage_unit() {
9273        let mut cat = Catalog::new();
9274        cat.create_table(bigint_pk_users_schema()).unwrap();
9275        let t = cat.get_mut("users").unwrap();
9276        for id in 0..8i64 {
9277            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9278                .unwrap();
9279        }
9280        t.add_index("by_id".into(), "id").unwrap();
9281        // Two freezes of 3 rows each → two small cold segments.
9282        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
9283        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
9284        assert_eq!(cat.cold_segment_count(), 2);
9285        assert_eq!(cat.cold_segment_slot_count(), 2);
9286
9287        // Pick a threshold larger than either segment's size so
9288        // both qualify.
9289        let max_seg_bytes = cat
9290            .cold_segment_ids_global()
9291            .iter()
9292            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
9293            .max()
9294            .unwrap();
9295        let target = max_seg_bytes + 1;
9296
9297        let report = cat
9298            .compact_cold_segments("users", "by_id", target)
9299            .expect("compact succeeds");
9300        assert_eq!(report.sources.len(), 2);
9301        let merged_id = report.merged_segment_id.expect("merge happened");
9302        assert_eq!(report.merged_rows, 6);
9303        assert_eq!(report.deleted_rows_pruned, 0);
9304        assert!(!report.merged_segment_bytes.is_empty());
9305
9306        // Active count drops back to 1; slot count grew to 3
9307        // (2 sources tombstoned + 1 merged appended).
9308        assert_eq!(cat.cold_segment_count(), 1);
9309        assert_eq!(cat.cold_segment_slot_count(), 3);
9310        assert_eq!(cat.cold_segment_ids_global(), alloc::vec![merged_id]);
9311
9312        // Every PK that was frozen still resolves (via the merged
9313        // segment); the 2 hot rows still resolve too.
9314        for id in 0..8i64 {
9315            let got = cat
9316                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
9317                .unwrap_or_else(|| panic!("PK {id} lost after compaction"));
9318            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
9319        }
9320    }
9321
9322    /// DELETE'd-but-frozen rows are dropped during the merge. Set
9323    /// up two small segments, then shadow one row in each; the
9324    /// merged segment must NOT carry the shadowed rows.
9325    #[test]
9326    fn compact_drops_shadowed_cold_rows() {
9327        let mut cat = Catalog::new();
9328        cat.create_table(bigint_pk_users_schema()).unwrap();
9329        let t = cat.get_mut("users").unwrap();
9330        for id in 0..6i64 {
9331            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9332                .unwrap();
9333        }
9334        t.add_index("by_id".into(), "id").unwrap();
9335        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
9336        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
9337        // Shadow PK 1 (in seg 0) + PK 4 (in seg 1).
9338        assert_eq!(
9339            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(1))
9340                .unwrap(),
9341            1
9342        );
9343        assert_eq!(
9344            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(4))
9345                .unwrap(),
9346            1
9347        );
9348
9349        let max_seg_bytes = cat
9350            .cold_segment_ids_global()
9351            .iter()
9352            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
9353            .max()
9354            .unwrap();
9355        let report = cat
9356            .compact_cold_segments("users", "by_id", max_seg_bytes + 1)
9357            .expect("compact succeeds");
9358        assert_eq!(report.sources.len(), 2);
9359        assert_eq!(report.merged_rows, 4, "6 frozen − 2 shadowed = 4 live");
9360        assert_eq!(report.deleted_rows_pruned, 2);
9361
9362        // PK 1 and 4 stay invisible after compact.
9363        for shadowed in [1i64, 4i64] {
9364            assert!(
9365                cat.lookup_by_pk("users", "by_id", &IndexKey::Int(shadowed))
9366                    .is_none(),
9367                "shadowed PK {shadowed} must remain invisible after compact"
9368            );
9369        }
9370        // The other 4 frozen rows resolve.
9371        for live in [0i64, 2, 3, 5] {
9372            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(live))
9373                .unwrap_or_else(|| panic!("live PK {live} lost after compact"));
9374        }
9375    }
9376
9377    /// No-op cases: 0 or 1 candidate segment under the threshold
9378    /// leaves the catalog untouched.
9379    #[test]
9380    fn compact_is_noop_below_two_candidates() {
9381        let mut cat = Catalog::new();
9382        cat.create_table(bigint_pk_users_schema()).unwrap();
9383        let t = cat.get_mut("users").unwrap();
9384        for id in 0..6i64 {
9385            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9386                .unwrap();
9387        }
9388        t.add_index("by_id".into(), "id").unwrap();
9389        // 0 cold segments.
9390        let report = cat
9391            .compact_cold_segments("users", "by_id", 1 << 30)
9392            .expect("noop ok");
9393        assert!(report.merged_segment_id.is_none());
9394        assert!(report.sources.is_empty());
9395
9396        // 1 cold segment — still a no-op (need ≥2 to merge).
9397        cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
9398        let report = cat
9399            .compact_cold_segments("users", "by_id", 1 << 30)
9400            .expect("noop ok");
9401        assert!(report.merged_segment_id.is_none());
9402        assert_eq!(cat.cold_segment_count(), 1);
9403
9404        // Threshold too small to cover the single segment → still
9405        // no-op.
9406        let report = cat
9407            .compact_cold_segments("users", "by_id", 1)
9408            .expect("noop ok");
9409        assert!(report.merged_segment_id.is_none());
9410        assert_eq!(cat.cold_segment_count(), 1);
9411    }
9412
9413    /// Manifest-style atomicity: a Catalog snapshot taken AFTER
9414    /// `compact_cold_segments` returns must round-trip with the
9415    /// post-compact BTree state, while the cold-tier registry is
9416    /// re-derived from the source-of-truth manifest (=
9417    /// `load_segment_bytes_at` with the merged id + the still-on-
9418    /// disk merged bytes). This mirrors the boot path: catalog
9419    /// snapshot + cold-segment files = full state.
9420    #[test]
9421    fn compact_swap_survives_catalog_roundtrip_via_load_at() {
9422        let mut cat = Catalog::new();
9423        cat.create_table(bigint_pk_users_schema()).unwrap();
9424        let t = cat.get_mut("users").unwrap();
9425        for id in 0..6i64 {
9426            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9427                .unwrap();
9428        }
9429        t.add_index("by_id".into(), "id").unwrap();
9430        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
9431        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
9432        let max_seg_bytes = cat
9433            .cold_segment_ids_global()
9434            .iter()
9435            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
9436            .max()
9437            .unwrap();
9438        let report = cat
9439            .compact_cold_segments("users", "by_id", max_seg_bytes + 1)
9440            .expect("compact ok");
9441        let merged_id = report.merged_segment_id.unwrap();
9442
9443        // Serialise the catalog (BTree index points at merged_id
9444        // now) and the merged segment bytes; pretend to crash; on
9445        // restart, re-hydrate the catalog and reload only the
9446        // merged segment at its baked-in id.
9447        let cat_bytes = cat.serialize();
9448        let merged_bytes = report.merged_segment_bytes.clone();
9449
9450        let mut restored = Catalog::deserialize(&cat_bytes).expect("deserialize ok");
9451        restored
9452            .load_segment_bytes_at(merged_id, merged_bytes)
9453            .expect("reload merged ok");
9454
9455        // All 6 PKs still resolve through the restored merged segment.
9456        for id in 0..6i64 {
9457            let got = restored
9458                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
9459                .unwrap_or_else(|| panic!("PK {id} lost across roundtrip"));
9460            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
9461        }
9462        // No source slot ever rehydrates — confirmed by
9463        // `cold_segment_count` matching only the merged segment.
9464        assert_eq!(restored.cold_segment_count(), 1);
9465    }
9466
9467    /// `load_segment_bytes_at` refuses to stomp an occupied slot
9468    /// and pads with `None` when the target id is past the end.
9469    #[test]
9470    fn load_segment_bytes_at_pads_and_rejects_collision() {
9471        let mut cat = Catalog::new();
9472        cat.create_table(bigint_pk_users_schema()).unwrap();
9473        let t = cat.get_mut("users").unwrap();
9474        for id in 0..4i64 {
9475            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9476                .unwrap();
9477        }
9478        t.add_index("by_id".into(), "id").unwrap();
9479        let report = cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
9480        let bytes_seg0 = report.segment_bytes.clone();
9481
9482        // Pad to id=5 (slots 1..5 are None, slot 5 holds the
9483        // segment loaded back). The slot count jumps, the active
9484        // count is now 2 (seg 0 + seg 5).
9485        cat.load_segment_bytes_at(5, bytes_seg0.clone())
9486            .expect("pad + load ok");
9487        assert_eq!(cat.cold_segment_slot_count(), 6);
9488        assert_eq!(cat.cold_segment_count(), 2);
9489
9490        // Re-loading at the same id collides.
9491        assert!(matches!(
9492            cat.load_segment_bytes_at(5, bytes_seg0.clone()),
9493            Err(StorageError::Corrupt(_))
9494        ));
9495        // Re-loading at id 0 (already occupied) also collides.
9496        assert!(matches!(
9497            cat.load_segment_bytes_at(0, bytes_seg0),
9498            Err(StorageError::Corrupt(_))
9499        ));
9500    }
9501
9502    /// Round trip: freeze → promote → re-freeze. The same PK can
9503    /// migrate hot ↔ cold multiple times. After two cycles only the
9504    /// final Hot locator should be live.
9505    #[test]
9506    fn promote_then_refreeze_does_not_leave_orphan_locators() {
9507        let mut cat = Catalog::new();
9508        cat.create_table(bigint_pk_users_schema()).unwrap();
9509        let t = cat.get_mut("users").unwrap();
9510        for id in 0..4i64 {
9511            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
9512                .unwrap();
9513        }
9514        t.add_index("by_id".into(), "id").unwrap();
9515
9516        // Cycle 1: freeze first 2 rows, then promote PK 0.
9517        cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
9518        let promoted = cat
9519            .promote_cold_row("users", "by_id", &IndexKey::Int(0))
9520            .unwrap();
9521        assert!(promoted.is_some());
9522        let entries_after_promote = cat
9523            .get("users")
9524            .unwrap()
9525            .index_on(0)
9526            .unwrap()
9527            .lookup_eq(&IndexKey::Int(0))
9528            .to_vec();
9529        assert_eq!(entries_after_promote.len(), 1);
9530        assert!(entries_after_promote[0].is_hot());
9531
9532        // Cycle 2: freeze the front rows again. PK 0 is now at
9533        // position 2 (after the survivors); it could still go cold
9534        // again on a future freeze depending on policy, but the
9535        // current "first N positions" policy leaves it alone here.
9536        // What matters: prior cold locators for PKs 0..1 are gone,
9537        // PKs 2..3 still resolve through their original segments.
9538        for id in [2i64, 3] {
9539            assert_eq!(
9540                cat.lookup_by_pk("users", "by_id", &IndexKey::Int(id))
9541                    .unwrap(),
9542                make_user_row(id, &alloc::format!("u-{id}"))
9543            );
9544        }
9545    }
9546}