Skip to main content

spg_storage/
lib.rs

1//! In-memory storage primitives.
2//!
3//! v0.3 is intentionally simple: a flat catalog of tables, each holding rows
4//! as `Vec<Value>` (positional, matching the table's `TableSchema`). No MVCC,
5//! no on-disk format — those land in later milestones.
6#![no_std]
7// v3.3.2 NEON path for l2_distance_sq (aarch64 only). Scoped allow:
8// `unsafe_code = "deny"` at workspace level stays in force for every
9// other crate.
10#![cfg_attr(target_arch = "aarch64", allow(unsafe_code))]
11
12extern crate alloc;
13
14pub mod bloom;
15pub mod fts_simple;
16pub mod halfvec;
17pub mod persistent;
18pub mod persistent_btree;
19pub mod quantize;
20pub mod row_locator;
21pub mod segment;
22pub mod trgm;
23
24pub use self::bloom::{BloomError, BloomFilter};
25pub use self::row_locator::{RowLocator, RowLocatorError};
26pub use self::segment::{
27    BRIN_SIDECAR_MAGIC, BrinSummary, OwnedSegment, SEGMENT_COMPRESS_ALGO_LZSS,
28    SEGMENT_COMPRESS_ALGO_NONE, SEGMENT_MAGIC, SEGMENT_MAGIC_V2, SEGMENT_PAGE_BYTES, SegmentError,
29    SegmentMeta, SegmentReader, derive_brin_summaries, encode_segment, wrap_v2_envelope,
30    wrap_v2_envelope_with_brin,
31};
32
33use alloc::boxed::Box;
34use alloc::collections::{BTreeMap, BTreeSet};
35use alloc::format;
36use alloc::string::{String, ToString};
37use alloc::sync::Arc;
38use alloc::vec::Vec;
39use core::fmt;
40
41use self::persistent::PersistentVec;
42use self::persistent_btree::PersistentBTreeMap;
43
/// Physical per-cell layout of a `DataType::Vector` column. Mirrors
/// `spg_sql::ast::VecEncoding`; it is duplicated here so storage keeps
/// no dependency on `spg-sql`, and the engine converts between the two
/// when it executes DDL.
///
/// * `F32` — the pre-v6 default: every cell is a raw `Vec<f32>`.
/// * `Sq8` (v6.0.1) — every cell is an
///   `Sq8Vector { min, max, bytes: Vec<u8> }`; 4× compression vs `F32`
///   with recall@10 ≥ 0.95 on natural embeddings (Gaussian /
///   unit-sphere corpora).
/// * `F16` (v6.0.3, DDL keyword `HALF`) — every element is an IEEE-754
///   binary16; 2× compression and bit-exact dequantise.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum VecEncoding {
    #[default]
    F32,
    Sq8,
    F16,
}

/// Renders the encoding keyword: `F32`, `SQ8` or `HALF`.
impl fmt::Display for VecEncoding {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the keyword first so there is exactly one write call.
        let keyword = match *self {
            Self::F32 => "F32",
            Self::Sq8 => "SQ8",
            Self::F16 => "HALF",
        };
        f.write_str(keyword)
    }
}
72
/// Runtime type tags. `Vector { dim, encoding }` / `Varchar(max)` /
/// `Char(size)` / `Numeric { precision, scale }` / `Range(kind)` are
/// parameterised; the parameter travels with both the column schema
/// and the on-wire serialised representation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataType {
    /// 16-bit signed. Backed by `Value::SmallInt(i16)`; arithmetic that
    /// would overflow surfaces as a type error at INSERT time.
    SmallInt,
    /// 32-bit signed. Backed by `Value::Int(i32)`.
    Int,
    /// 64-bit signed. Backed by `Value::BigInt(i64)`.
    BigInt,
    /// `f64` (PG double precision). Backed by `Value::Float(f64)`.
    Float,
    /// Unbounded text. Backed by `Value::Text(String)`.
    Text,
    /// `VARCHAR(n)` — same byte representation as `Text`, but INSERT
    /// rejects values longer than `n` Unicode characters.
    Varchar(u32),
    /// `CHAR(n)` — same representation as `Text`, but INSERT right-pads
    /// with U+0020 to exactly `n` Unicode characters (or rejects when
    /// the input is already longer).
    Char(u32),
    /// Boolean. Backed by `Value::Bool(bool)`.
    Bool,
    /// pgvector-style fixed-dimension vector. `encoding` selects
    /// the in-cell representation (`F32` = pre-v6 raw f32 buffer;
    /// `Sq8` = v6.0.1 8-bit scalar-quantised; `F16` = v6.0.3
    /// IEEE-754 binary16). The DDL grammar surfaces encoding via
    /// the optional `USING <encoding>` clause:
    /// `VECTOR(128) USING SQ8` / `VECTOR(128) USING HALF`.
    Vector {
        dim: u32,
        encoding: VecEncoding,
    },
    /// `NUMERIC(precision, scale)` — exact fixed-point decimal stored as
    /// a scaled `i128`. `precision` caps total decimal digits, `scale`
    /// fixes digits after the decimal point. v1.12 supports up to
    /// precision 38 (the i128-safe ceiling). `NUMERIC` and `NUMERIC(p)`
    /// surface as `Numeric { precision: p, scale: 0 }`.
    Numeric {
        precision: u8,
        scale: u8,
    },
    /// `DATE` — calendar date with day precision, stored as `i32` days
    /// since the Unix epoch (1970-01-01).
    Date,
    /// `TIMESTAMP` (a.k.a. `MySQL` `DATETIME`) — instant with microsecond
    /// precision, stored as `i64` microseconds since the Unix epoch.
    Timestamp,
    /// v7.9.2 `TIMESTAMPTZ` — bit-identical to `Timestamp` on disk
    /// (i64 microseconds, UTC by convention). Carried as a distinct
    /// type tag so the PG-wire layer can advertise OID 1184 (PG's
    /// `timestamp with time zone`) and `sqlx`/`pgx`/JDBC clients
    /// decode into their TZ-aware datetime types. The internal
    /// semantics are unchanged: SPG never stored per-row offsets,
    /// and neither did PG — `TIMESTAMPTZ` in PG is also UTC i64.
    Timestamptz,
    /// `INTERVAL` — calendar-aware span (months + microseconds). v2.11
    /// supports INTERVAL only as a runtime intermediate (literals,
    /// arithmetic results); on-disk encoding is rejected so this branch
    /// can't appear in a `ColumnSchema`.
    Interval,
    /// v4.9: `JSON` — text-backed JSON document. We don't parse
    /// the content (no path operators or jsonb functions yet) —
    /// the column accepts any TEXT-compatible value and round-trips
    /// it verbatim. PG OID 114 on the wire.
    Json,
    /// v7.9.0: `JSONB` — semantically identical to `Json` on
    /// the storage side (same `Value::Json` cells, same
    /// row codec), but advertised as PG OID 3802 on the wire
    /// so `sqlx`-style clients that bind `jsonb` columns
    /// decode correctly. mailrs migration blocker #3.
    Jsonb,
    /// v7.10.4: `BYTES` / `BYTEA` — variable-length raw binary.
    /// Backed by `Value::Bytes(Vec<u8>)`. PG wire OID 17. Literal
    /// forms accepted by parser/engine: PG hex form `'\xDEADBEEF'`
    /// (case-insensitive hex pairs) and escape form
    /// `'foo\\000bar'` (the latter decoded at coercion time when
    /// the target column is BYTEA — TEXT columns leave the
    /// backslash sequence verbatim).
    Bytes,
    /// v7.10.9: `TEXT[]` — single-dimension TEXT array. Elements
    /// may be NULL (PG semantics). PG wire OID 1009. Literal
    /// forms: `ARRAY['a', 'b', NULL]` and the PG external form
    /// `'{a,b,NULL}'::TEXT[]`. Engine implements `= ANY(arr)`,
    /// `<> ALL(arr)`, and 1-based indexing `arr[i]`. Catalog
    /// `FILE_VERSION` 18+; older snapshots reject this DataType
    /// (forward-only by design — TEXT[] columns aren't readable
    /// on a pre-v7.10 binary).
    TextArray,
    /// v7.11.12: `INT[]` — single-dimension i32 array. PG wire
    /// OID 1007 (`_int4`). Same `ARRAY[...]` / `'{1,2,3}'::INT[]`
    /// literal surface as TEXT[]. Catalog `FILE_VERSION` 19+.
    IntArray,
    /// v7.11.12: `BIGINT[]` — single-dimension i64 array. PG
    /// wire OID 1016 (`_int8`). Catalog `FILE_VERSION` 19+.
    BigIntArray,
    /// v7.12.0: PG `tsvector` — ordered, deduplicated set of
    /// `(lexeme, positions, weight)` tuples. PG wire OID 3614.
    /// Catalog `FILE_VERSION` 20+. Storage shape is row-codec
    /// tag 22; the schema-agnostic `write_value` path emits tag
    /// 18. Literal: `'foo:1 bar:2,3'::tsvector` (PG external
    /// form). G-CRIT-3 entry — v7.12.0 only ships the type +
    /// codec; matching `@@` lands in v7.12.2.
    TsVector,
    /// v7.12.0: PG `tsquery` — parse tree of lexemes joined by
    /// `&` `|` `!` and phrase operators. PG wire OID 3615.
    /// Catalog `FILE_VERSION` 20+.
    TsQuery,
    /// v7.17.0: PG `uuid` — 128-bit identifier stored as
    /// `Value::Uuid([u8; 16])`. PG wire OID 2950. Canonical
    /// text form is lowercase 8-4-4-4-12 hyphenated; input
    /// also accepts uppercase, unhyphenated, and brace-wrapped
    /// forms (`{xxxx…}`). Catalog `FILE_VERSION` 36+; tag 24 on
    /// the dense type-tag side, tag 20 on the schema-agnostic
    /// value side. The drop-in PG/MySQL surface for Django /
    /// Rails / Hibernate "id UUID PRIMARY KEY DEFAULT
    /// gen_random_uuid()" default-PK pattern.
    Uuid,
    /// v7.17.0 Phase 3.P0-32: PG `time` (without time zone) — i64
    /// microseconds since 00:00:00. PG wire OID 1083. Display:
    /// canonical zero-padded `HH:MM:SS` when fractional is zero,
    /// `HH:MM:SS.ffffff` otherwise. Catalog `FILE_VERSION` 37+;
    /// tag 25 on the dense type-tag side, tag 21 on the schema-
    /// agnostic value side. The wall-clock-of-day half of PG's
    /// date/time triplet (date / time / timestamp).
    Time,
    /// v7.17.0 Phase 3.P0-33: MySQL `YEAR` — u16 in range
    /// 1901..=2155 plus the special zero-year sentinel 0. No
    /// dedicated PG OID (advertised as INT4 / OID 23 on the wire
    /// — psql renders integers, MySQL CLI renders 4-digit
    /// zero-padded text). Display always 4 digits: `0000` for the
    /// zero-year, `1985` / `2007` / etc otherwise. Catalog
    /// `FILE_VERSION` 38+; tag 26 on the dense type-tag side, tag
    /// 22 on the schema-agnostic value side.
    Year,
    /// v7.17.0 Phase 3.P0-34: PG `time with time zone` (TIMETZ) —
    /// i64 microseconds since 00:00:00 in the local wall clock
    /// PLUS i32 offset-from-UTC in seconds. PG wire OID 1266.
    /// Display: `HH:MM:SS[.ffffff]±HH[:MM]` (PG `timetz_out`).
    /// Range: offset in ±50400 seconds (±14 hours). Catalog
    /// `FILE_VERSION` 39+; tag 27 on the dense type-tag side, tag
    /// 23 on the schema-agnostic value side.
    TimeTz,
    /// v7.17.0 Phase 3.P0-35: PG `money` — i64 cents (locale-
    /// independent storage). PG wire OID 790. Display: en_US
    /// locale (`$N,NNN.CC`, negative → `-$1.23`). Input accepts
    /// `$N.NN`, `$N,NNN.NN`, bare integer (treated as major
    /// units), optional leading `-`. Range: full i64. Catalog
    /// `FILE_VERSION` 40+; tag 28 on the dense type-tag side, tag
    /// 24 on the schema-agnostic value side.
    Money,
    /// v7.17.0 Phase 3.P0-38: PG range type. The same DataType
    /// variant covers all six builtin ranges (int4range,
    /// int8range, numrange, tsrange, tstzrange, daterange) —
    /// `RangeKind` pins the element type so encode / decode /
    /// display can route off one switch. Catalog `FILE_VERSION`
    /// 43+; tag 29 + a 1-byte RangeKind on the dense type-tag
    /// side, tag 25 on the schema-agnostic value side.
    Range(RangeKind),
    /// v7.17.0 Phase 3.P0-39: PG `hstore` extension type — flat
    /// `text => text` map with NULL value support. Catalog
    /// `FILE_VERSION` 44+; tag 30 on the dense type-tag side, tag
    /// 26 on the schema-agnostic value side. The contrib OID is
    /// installation-dependent in real PG; SPG advertises it via
    /// dynamic lookup, falling back to TEXT (OID 25) on the wire
    /// when the installed `hstore` extension hasn't claimed an
    /// OID yet.
    Hstore,
    /// v7.17.0 Phase 3.P0-40: PG `int[][]` — 2-dimensional INT
    /// matrix. Storage: row-major `Vec<Vec<Option<i32>>>`. All
    /// rows must share the same column count. Wire OID 1007
    /// (same as INT[]; the dimension count travels in the data
    /// header, not the OID). Catalog `FILE_VERSION` 45+; tag 31
    /// on the dense type-tag side, tag 27 on the schema-agnostic
    /// value side.
    IntArray2D,
    /// v7.17.0 Phase 3.P0-40: PG `bigint[][]` — 2-dimensional
    /// BIGINT matrix. Storage / OID / tags mirror IntArray2D.
    /// Tag 32 dense, tag 28 schema-agnostic.
    BigIntArray2D,
    /// v7.17.0 Phase 3.P0-40: PG `text[][]` — 2-dimensional TEXT
    /// matrix. Storage: row-major `Vec<Vec<Option<String>>>`.
    /// Tag 33 dense, tag 29 schema-agnostic.
    TextArray2D,
}
254
/// v7.17.0 Phase 3.P0-38 — element type of a range value or range
/// column. Wire OIDs: Int4=3904, Int8=3926, Num=3906, Ts=3908,
/// TsTz=3910, Date=3912.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum RangeKind {
    Int4,
    Int8,
    Num,
    Ts,
    TsTz,
    Date,
}

impl RangeKind {
    /// One-byte tag for this kind (`0..=5`); the inverse of
    /// [`RangeKind::from_tag`].
    pub const fn tag(self) -> u8 {
        match self {
            Self::Int4 => 0,
            Self::Int8 => 1,
            Self::Num => 2,
            Self::Ts => 3,
            Self::TsTz => 4,
            Self::Date => 5,
        }
    }

    /// Decodes a tag produced by [`RangeKind::tag`]. Any value outside
    /// `0..=5` yields `None`.
    pub const fn from_tag(t: u8) -> Option<Self> {
        match t {
            0 => Some(Self::Int4),
            1 => Some(Self::Int8),
            2 => Some(Self::Num),
            3 => Some(Self::Ts),
            4 => Some(Self::TsTz),
            5 => Some(Self::Date),
            _ => None,
        }
    }

    /// Upper-case SQL type keyword for this kind, e.g. `INT4RANGE`.
    pub const fn keyword(self) -> &'static str {
        match self {
            Self::Int4 => "INT4RANGE",
            Self::Int8 => "INT8RANGE",
            Self::Num => "NUMRANGE",
            Self::Ts => "TSRANGE",
            Self::TsTz => "TSTZRANGE",
            Self::Date => "DATERANGE",
        }
    }
}
301
302impl fmt::Display for DataType {
303    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
304        match self {
305            Self::SmallInt => f.write_str("SMALLINT"),
306            Self::Int => f.write_str("INT"),
307            Self::BigInt => f.write_str("BIGINT"),
308            Self::Float => f.write_str("FLOAT"),
309            Self::Text => f.write_str("TEXT"),
310            Self::Varchar(n) => write!(f, "VARCHAR({n})"),
311            Self::Char(n) => write!(f, "CHAR({n})"),
312            Self::Bool => f.write_str("BOOL"),
313            Self::Vector { dim, encoding } => match encoding {
314                VecEncoding::F32 => write!(f, "VECTOR({dim})"),
315                VecEncoding::Sq8 => write!(f, "VECTOR({dim}) USING SQ8"),
316                VecEncoding::F16 => write!(f, "VECTOR({dim}) USING HALF"),
317            },
318            Self::Numeric { precision, scale } => {
319                if *scale == 0 {
320                    write!(f, "NUMERIC({precision})")
321                } else {
322                    write!(f, "NUMERIC({precision}, {scale})")
323                }
324            }
325            Self::Date => f.write_str("DATE"),
326            Self::Timestamp => f.write_str("TIMESTAMP"),
327            Self::Timestamptz => f.write_str("TIMESTAMPTZ"),
328            Self::Interval => f.write_str("INTERVAL"),
329            Self::Json => f.write_str("JSON"),
330            Self::Jsonb => f.write_str("JSONB"),
331            Self::Bytes => f.write_str("BYTEA"),
332            Self::TextArray => f.write_str("TEXT[]"),
333            Self::IntArray => f.write_str("INT[]"),
334            Self::BigIntArray => f.write_str("BIGINT[]"),
335            Self::TsVector => f.write_str("TSVECTOR"),
336            Self::TsQuery => f.write_str("TSQUERY"),
337            Self::Uuid => f.write_str("UUID"),
338            Self::Time => f.write_str("TIME"),
339            Self::Year => f.write_str("YEAR"),
340            Self::TimeTz => f.write_str("TIMETZ"),
341            Self::Money => f.write_str("MONEY"),
342            Self::Range(k) => f.write_str(k.keyword()),
343            Self::Hstore => f.write_str("HSTORE"),
344            Self::IntArray2D => f.write_str("INT[][]"),
345            Self::BigIntArray2D => f.write_str("BIGINT[][]"),
346            Self::TextArray2D => f.write_str("TEXT[][]"),
347        }
348    }
349}
350
/// v7.12.0 — one entry in a `Value::TsVector`. The lexeme is the
/// (already-tokenised + stemmed in v7.12.1+) word; `positions` is
/// a strictly-ascending list of 1-based positions; `weight` is the
/// PG weight letter (A=3, B=2, C=1, D=0) — v7.12.0 defaults every
/// lexeme to D, the v7.12.2 ranking path consumes the weight.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TsLexeme {
    /// The lexeme text.
    pub word: String,
    /// 1-based positions of the lexeme, strictly ascending.
    pub positions: Vec<u16>,
    /// PG weight letter encoded as a number: A=3, B=2, C=1, D=0.
    pub weight: u8,
}
362
/// v7.12.0 — parse tree for a PG `tsquery`. v7.12.0 ships the
/// type + codec only; the `to_tsquery` / `plainto_tsquery` lexer
/// lands in v7.12.1 and the `@@` evaluator in v7.12.2.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TsQueryAst {
    /// Single lexeme term. The `weight_mask` is the PG-style
    /// bitmask of accepted weights (`A=1<<3`, `B=1<<2`, `C=1<<1`,
    /// `D=1<<0`); `0` = any weight. v7.12.0 always sets it to 0.
    Term {
        word: String,
        weight_mask: u8,
    },
    /// Binary `&` node over two sub-queries.
    And(Box<TsQueryAst>, Box<TsQueryAst>),
    /// Binary `|` node over two sub-queries.
    Or(Box<TsQueryAst>, Box<TsQueryAst>),
    /// Unary `!` node over one sub-query.
    Not(Box<TsQueryAst>),
    /// `phrase <distance> phrase`. v7.12.0 only persists this; the
    /// match semantics arrive in v7.12.2 alongside `@@`.
    Phrase {
        /// Left-hand operand of the phrase operator.
        left: Box<TsQueryAst>,
        /// Right-hand operand of the phrase operator.
        right: Box<TsQueryAst>,
        /// The `<distance>` of the phrase operator.
        distance: u16,
    },
}
386
/// A row-cell value, including SQL `NULL`. `Float` uses `f64`; NaN compares
/// non-equal to itself (PG behaviour) — `PartialEq` is derived so callers
/// must opt into NaN-aware comparison if they need stronger guarantees.
///
/// The enum is `#[non_exhaustive]`: downstream crates must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum Value {
    /// 16-bit signed integer (`DataType::SmallInt`).
    SmallInt(i16),
    /// 32-bit signed integer (`DataType::Int`).
    Int(i32),
    /// 64-bit signed integer (`DataType::BigInt`).
    BigInt(i64),
    /// `f64` floating point (`DataType::Float`).
    Float(f64),
    /// Text cell. Shared by unbounded TEXT and bounded VARCHAR/CHAR —
    /// the length constraint lives on the column schema, not the value.
    Text(String),
    /// Boolean (`DataType::Bool`).
    Bool(bool),
    /// Raw `f32` vector cell (`VecEncoding::F32`).
    Vector(Vec<f32>),
    /// v6.0.1: 8-bit scalar-quantised vector cell. Lives in
    /// columns declared `VECTOR(N) USING SQ8`. Layout per cell:
    /// `Sq8Vector { min: f32, max: f32, bytes: Vec<u8> }` —
    /// 4× compression vs `Vector(Vec<f32>)`. The wire layer
    /// dequantises to `f32` on SELECT; INSERT path quantises
    /// incoming `Vector(Vec<f32>)` cells into this variant.
    Sq8Vector(crate::quantize::Sq8Vector),
    /// v6.0.3: IEEE-754 binary16 vector cell. Lives in columns
    /// declared `VECTOR(N) USING HALF`. Stores raw u16 LE bits
    /// (2× compression vs `Vector(Vec<f32>)`). Wire / display
    /// paths dequantise to f32 bit-exactly; INSERT path converts
    /// incoming f32 vectors at the engine boundary.
    HalfVector(crate::halfvec::HalfVector),
    /// Exact fixed-point decimal. `scaled` holds the value as
    /// `actual * 10^scale` so the storage type is always integral —
    /// arithmetic never falls back to floating-point.
    Numeric {
        scaled: i128,
        scale: u8,
    },
    /// Days since the Unix epoch (1970-01-01). Negative for earlier dates.
    Date(i32),
    /// Microseconds since the Unix epoch (1970-01-01T00:00:00Z).
    /// NOTE(review): there is no separate TIMESTAMPTZ variant, so
    /// `DataType::Timestamptz` columns presumably store this variant
    /// too — confirm against the engine.
    Timestamp(i64),
    /// Calendar span: `months` (variable-length) + `micros` (fixed-length).
    /// Runtime-only — cannot appear in a stored row in v2.11.
    Interval {
        months: i32,
        micros: i64,
    },
    /// v4.9 `JSON` — raw JSON text. No structural validation
    /// happens at the storage layer; whatever the parser hands us
    /// round-trips verbatim. Equality is byte-wise.
    Json(String),
    /// v7.10.4 `BYTEA` — raw binary blob. Equality is byte-wise.
    /// Layout matches `Text`'s length-prefixed shape (`[u32 LE
    /// len][bytes]`) under tag 18; the engine accepts PG hex
    /// literals (`'\xDEADBEEF'`) and escape literals at the
    /// coercion boundary.
    Bytes(Vec<u8>),
    /// v7.10.9 `TEXT[]` — single-dimension TEXT array with
    /// optional NULL elements. Equality is element-wise. PG's
    /// NULL-element comparison semantics: NULL ≠ NULL inside
    /// arrays under `=`, so `[NULL] != [NULL]` (the engine
    /// honours this). The derived `PartialEq` on this enum treats
    /// two `None` elements as equal, so the SQL rule has to be
    /// applied by the caller rather than by `==`.
    TextArray(Vec<Option<String>>),
    /// v7.11.12 `INT[]` — single-dimension i32 array with optional
    /// NULL elements. Codec mirrors TextArray with i32 LE per
    /// element instead of length-prefixed UTF-8.
    IntArray(Vec<Option<i32>>),
    /// v7.11.12 `BIGINT[]` — single-dimension i64 array with optional
    /// NULL elements.
    BigIntArray(Vec<Option<i64>>),
    /// v7.12.0 `tsvector` — sorted-by-word, deduped lexeme set with
    /// positions + weights. The engine enforces sort/dedup on
    /// construction; consumers can rely on `lexemes.windows(2)`
    /// being strictly ascending by `word`.
    TsVector(Vec<TsLexeme>),
    /// v7.12.0 `tsquery` — boolean / phrase parse tree over
    /// lexemes. Engine builds via `to_tsquery` family.
    TsQuery(TsQueryAst),
    /// v7.17.0 `uuid` — 128-bit identifier. Stored as 16 bytes
    /// (big-endian / network-byte order, same as RFC 4122).
    /// Display normalises to canonical lowercase 8-4-4-4-12
    /// hyphenated form. Equality is byte-wise.
    Uuid([u8; 16]),
    /// v7.17.0 Phase 3.P0-32 — PG `time` (without time zone) —
    /// i64 microseconds since 00:00:00. Range 0..86_400_000_000.
    /// Display: `HH:MM:SS` zero-padded, with optional `.ffffff`
    /// suffix when fractional is non-zero.
    Time(i64),
    /// v7.17.0 Phase 3.P0-33 — MySQL `YEAR` — u16 in range
    /// 1901..=2155 plus the special zero-year sentinel 0.
    /// Display always 4 digits zero-padded (`0000` for the
    /// sentinel; `1985`/`2007` otherwise).
    Year(u16),
    /// v7.17.0 Phase 3.P0-34 — PG `time with time zone` — i64
    /// microseconds since 00:00:00 in the LOCAL wall clock PLUS
    /// an i32 offset-from-UTC in seconds. PG preserves the
    /// offset on output, so the wall-clock value is NOT shifted
    /// to UTC at storage time. Offset range: ±50400 seconds
    /// (±14 hours).
    TimeTz {
        us: i64,
        offset_secs: i32,
    },
    /// v7.17.0 Phase 3.P0-35 — PG `money` — i64 cents
    /// (locale-independent storage; the en_US locale renders on
    /// display via `$N,NNN.CC`).
    Money(i64),
    /// v7.17.0 Phase 3.P0-39 — PG `hstore` value: flat
    /// `text => text` map with NULL value support. Insertion
    /// order preserved on input; duplicate keys take last-write-
    /// wins at parse time.
    Hstore(Vec<(String, Option<String>)>),
    /// v7.17.0 Phase 3.P0-40 — 2D INT matrix (row-major).
    IntArray2D(Vec<Vec<Option<i32>>>),
    /// v7.17.0 Phase 3.P0-40 — 2D BIGINT matrix (row-major).
    BigIntArray2D(Vec<Vec<Option<i64>>>),
    /// v7.17.0 Phase 3.P0-40 — 2D TEXT matrix (row-major).
    TextArray2D(Vec<Vec<Option<String>>>),
    /// v7.17.0 Phase 3.P0-38 — PG range value. One shape covers
    /// all six builtin range types; `kind` pins the element type
    /// (must match the column's `DataType::Range(kind)`).
    /// `lower` / `upper` are `None` for the unbounded sides;
    /// `lower_inc` / `upper_inc` mirror the canonical PG
    /// `[` / `(` / `]` / `)` bracket inclusivity. `empty=true`
    /// supersedes all other fields (the empty range has no
    /// bounds).
    Range {
        kind: RangeKind,
        lower: Option<alloc::boxed::Box<Value>>,
        upper: Option<alloc::boxed::Box<Value>>,
        lower_inc: bool,
        upper_inc: bool,
        empty: bool,
    },
    /// SQL `NULL`. Carries no type of its own.
    Null,
}
519
520impl Value {
521    /// Type tag, or `None` for `NULL` (unknown at value level).
522    pub fn data_type(&self) -> Option<DataType> {
523        match self {
524            Self::SmallInt(_) => Some(DataType::SmallInt),
525            Self::Int(_) => Some(DataType::Int),
526            Self::BigInt(_) => Some(DataType::BigInt),
527            Self::Float(_) => Some(DataType::Float),
528            // `Text` covers both unbounded TEXT and bounded VARCHAR/CHAR
529            // — the constraint lives on the column schema, not the value.
530            Self::Text(_) => Some(DataType::Text),
531            Self::Bool(_) => Some(DataType::Bool),
532            Self::Vector(v) => Some(DataType::Vector {
533                dim: u32::try_from(v.len()).expect("vector dim ≤ u32"),
534                encoding: VecEncoding::F32,
535            }),
536            Self::Sq8Vector(q) => Some(DataType::Vector {
537                dim: u32::try_from(q.bytes.len()).expect("vector dim ≤ u32"),
538                encoding: VecEncoding::Sq8,
539            }),
540            Self::HalfVector(h) => Some(DataType::Vector {
541                dim: u32::try_from(h.dim()).expect("vector dim ≤ u32"),
542                encoding: VecEncoding::F16,
543            }),
544            // `Value::Numeric` doesn't carry its precision (the column
545            // schema does); we surface precision=0 as "unknown" and let
546            // the engine reconcile against the column type at coercion
547            // time.
548            Self::Numeric { scale, .. } => Some(DataType::Numeric {
549                precision: 0,
550                scale: *scale,
551            }),
552            Self::Date(_) => Some(DataType::Date),
553            Self::Timestamp(_) => Some(DataType::Timestamp),
554            Self::Interval { .. } => Some(DataType::Interval),
555            Self::Json(_) => Some(DataType::Json),
556            Self::Bytes(_) => Some(DataType::Bytes),
557            Self::TextArray(_) => Some(DataType::TextArray),
558            Self::IntArray(_) => Some(DataType::IntArray),
559            Self::BigIntArray(_) => Some(DataType::BigIntArray),
560            Self::TsVector(_) => Some(DataType::TsVector),
561            Self::TsQuery(_) => Some(DataType::TsQuery),
562            Self::Uuid(_) => Some(DataType::Uuid),
563            Self::Time(_) => Some(DataType::Time),
564            Self::Year(_) => Some(DataType::Year),
565            Self::TimeTz { .. } => Some(DataType::TimeTz),
566            Self::Money(_) => Some(DataType::Money),
567            Self::Range { kind, .. } => Some(DataType::Range(*kind)),
568            Self::Hstore(_) => Some(DataType::Hstore),
569            Self::IntArray2D(_) => Some(DataType::IntArray2D),
570            Self::BigIntArray2D(_) => Some(DataType::BigIntArray2D),
571            Self::TextArray2D(_) => Some(DataType::TextArray2D),
572            Self::Null => None,
573        }
574    }
575
576    pub const fn is_null(&self) -> bool {
577        matches!(self, Self::Null)
578    }
579}
580
/// One table row — values are positional and must match
/// `TableSchema.columns` in length and (modulo NULL) in `DataType`.
#[derive(Debug, Clone, PartialEq)]
pub struct Row {
    /// Cell values, one per column, in column order.
    pub values: Vec<Value>,
}
587
588impl Row {
589    pub const fn new(values: Vec<Value>) -> Self {
590        Self { values }
591    }
592
593    pub fn len(&self) -> usize {
594        self.values.len()
595    }
596
597    pub fn is_empty(&self) -> bool {
598        self.values.is_empty()
599    }
600}
601
/// Schema of a single table column: its name, type tag, nullability
/// and the optional per-column attributes documented on each field.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnSchema {
    /// Column name.
    pub name: String,
    /// Declared type tag of the column.
    pub ty: DataType,
    /// Nullability flag. NOTE(review): presumably `false` for columns
    /// declared NOT NULL — enforcement is not visible here; confirm
    /// against the engine.
    pub nullable: bool,
    /// Optional `DEFAULT` value, frozen at CREATE TABLE time. `None`
    /// means "no default" (so omitted columns become NULL, or error
    /// out when the column is NOT NULL). Literal defaults take this
    /// path.
    pub default: Option<Value>,
    /// v7.9.21 — for DEFAULT expressions that need INSERT-time
    /// evaluation (e.g. `DEFAULT now()`, `DEFAULT CURRENT_TIMESTAMP`),
    /// the Display form of the expression. The engine re-parses
    /// it on each INSERT default-fill, evaluates against an empty
    /// row context, and coerces to the column type. mailrs G4.
    /// Persisted in catalog `FILE_VERSION` 15+; older catalogs
    /// deserialise with None.
    pub runtime_default: Option<String>,
    /// MySQL-style `AUTO_INCREMENT`. When set, an INSERT that leaves
    /// this column unbound (or sets it to NULL) gets the next integer
    /// computed from the column's current max + 1.
    pub auto_increment: bool,
    /// v7.17.0 Phase 1.4 — when the column is bound to a user-
    /// defined ENUM type (the parser saw an unknown type ident
    /// and the engine resolved it against `catalog.enum_types`),
    /// this carries the enum name so INSERT/UPDATE can validate
    /// the cell value against the enum's labels. `ty` is
    /// `DataType::Text` in that case. Persisted in catalog
    /// `FILE_VERSION` 29+; older catalogs deserialise with None.
    pub user_enum_type: Option<String>,
    /// v7.17.0 Phase 1.5 — when the column is bound to a user-
    /// defined DOMAIN (the parser saw an unknown type ident and
    /// the engine resolved it against `catalog.domain_types`),
    /// this carries the domain name. `ty` is the domain's base
    /// type; INSERT/UPDATE re-evaluates the domain's CHECK list
    /// and NOT NULL against the cell value. Persisted in catalog
    /// `FILE_VERSION` 30+; older catalogs deserialise with None.
    pub user_domain_type: Option<String>,
    /// v7.17.0 Phase 2.1 — MySQL `ON UPDATE CURRENT_TIMESTAMP`
    /// column attribute. When `Some(expr_src)`, an UPDATE that
    /// does NOT bind this column overrides the new value with
    /// the engine-evaluated expression (always `now()` in
    /// v7.17.0). Stored as Display-form source so storage
    /// stays free of spg-sql; the engine re-parses at UPDATE
    /// time. Persisted in catalog `FILE_VERSION` 32+; older
    /// catalogs deserialise with None — preserves the existing
    /// "silent ignore" behaviour for snapshots written before
    /// the upgrade.
    pub on_update_runtime: Option<String>,
    /// v7.17.0 Phase 2.5 — text collation. Pre-2.5 SPG accepted
    /// `COLLATE <name>` clauses but discarded the name, so a
    /// column declared `COLLATE "case_insensitive"` (or any
    /// MySQL `_ci` collation) still compared byte-wise — a
    /// Tier-S silent failure where `WHERE name = 'foo'` never
    /// matched stored `'Foo'`. This carries the parser-derived
    /// classification so the engine's WHERE evaluator can route
    /// text equality through a case-aware compare. `Binary` (the
    /// default) preserves the prior byte-wise behaviour. Only
    /// CaseInsensitive lands in the catalog appendix — Binary
    /// columns stay implicit, keeping snapshots compact.
    /// Persisted in catalog `FILE_VERSION` 34+; older catalogs
    /// deserialise every column as `Binary`.
    pub collation: Collation,
    /// v7.17.0 Phase 4.4 — MySQL `UNSIGNED` modifier flag. Drives
    /// engine-side INSERT / UPDATE range enforcement (rejects
    /// negative values on UNSIGNED int columns). Pre-4.4 the
    /// parser consumed and discarded the keyword silently, so
    /// every UNSIGNED column quietly accepted negatives — a
    /// Tier-A correctness drift. Sparse: only UNSIGNED columns
    /// land in the catalog appendix; the default `false` keeps
    /// snapshots compact for the common signed-int path.
    /// Persisted in catalog `FILE_VERSION` 35+; older catalogs
    /// deserialise every column as `is_unsigned = false`.
    pub is_unsigned: bool,
    /// v7.17.0 Phase 3.P0-36 — MySQL inline `ENUM('a','b','c')`
    /// value list. Distinct from `user_enum_type` (which points
    /// to a separately CREATE TYPE'd PG enum); this carries the
    /// column-local list MySQL DDL declares inline. When `Some`,
    /// `ty` is `DataType::Text` and INSERT/UPDATE validates the
    /// cell value against this list. Variant ORDER is preserved
    /// (MySQL uses it for `ORDER BY col`). Sparse: only ENUM
    /// columns land in the catalog appendix.
    /// Persisted in catalog `FILE_VERSION` 41+; older catalogs
    /// deserialise with None — preserves silent-drop behaviour
    /// for snapshots written before P0-36.
    pub inline_enum_variants: Option<Vec<String>>,
    /// v7.17.0 Phase 3.P0-37 — MySQL inline `SET('a','b','c')`
    /// variant list. Storage is TEXT (canonical comma-joined in
    /// definition order, de-duplicated). INSERT/UPDATE validates
    /// every comma-separated token against this list. Sparse:
    /// only SET columns land in the catalog appendix.
    /// Persisted in catalog `FILE_VERSION` 42+; older catalogs
    /// deserialise with None.
    pub inline_set_variants: Option<Vec<String>>,
}
697
/// Text collation attached to a column (v7.17.0 Phase 2.5).
///
/// The engine consults this when routing `Value::Text` equality in
/// WHERE / GROUP BY. v7.17 models exactly two behaviours:
///
/// * [`Collation::Binary`] — byte-wise comparison. This is the SPG
///   default and corresponds to PG `COLLATE "C"` /
///   `pg_catalog.default` and to MySQL `*_bin`.
/// * [`Collation::CaseInsensitive`] — comparison after ASCII case
///   folding, corresponding to PG `COLLATE "case_insensitive"` and to
///   MySQL `*_ci` collations. Non-ASCII bytes are still compared
///   byte-wise; full ICU folding is out of scope for v7.17.
///
/// New variants must be appended at the end: a catalog written before
/// a variant existed reads the missing column attribute back as
/// `Binary`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Collation {
    /// Byte-wise comparison (the default).
    Binary,
    /// ASCII case-folded comparison.
    CaseInsensitive,
}
716
717#[allow(clippy::derivable_impls)]
718impl Default for Collation {
719    fn default() -> Self {
720        Self::Binary
721    }
722}
723
724impl Collation {
725    /// Wire tag persisted in the FILE_VERSION 34+ catalog appendix.
726    /// Stable: future variants append above the recognised range
727    /// and unknown tags read back as `Binary` for forward-compat
728    /// on rollback.
729    pub const TAG_BINARY: u8 = 0;
730    pub const TAG_CASE_INSENSITIVE: u8 = 1;
731}
732
733#[derive(Debug, Clone, PartialEq)]
734pub struct TableSchema {
735    pub name: String,
736    pub columns: Vec<ColumnSchema>,
737    /// v6.7.2 — per-table hot-tier byte budget override. `None`
738    /// falls through to the global `SPG_HOT_TIER_BYTES` setting;
739    /// `Some(n)` overrides it for this specific table. Set via
740    /// `ALTER TABLE t SET hot_tier_bytes = X`. Persisted in
741    /// catalog FILE_VERSION 11+.
742    pub hot_tier_bytes: Option<u64>,
743    /// v7.6.1 — FOREIGN KEY constraints declared on this table.
744    /// Engine maintains this in lock-step with `spg-sql`'s parser
745    /// AST; the storage layer carries the on-disk shape so a
746    /// catalog snapshot round-trips without external mapping.
747    /// Persisted in catalog FILE_VERSION 13+. Older catalogs
748    /// deserialise with an empty vec.
749    pub foreign_keys: Vec<ForeignKeyConstraint>,
750    /// v7.9.19 — composite UNIQUE / PRIMARY KEY constraints
751    /// declared at the table level. Each entry's leading column
752    /// has a BTree index (created via the constraint), and INSERT
753    /// path enforces the full-tuple uniqueness via a scan keyed
754    /// by the leading column. Persisted in catalog FILE_VERSION
755    /// 15+. Older catalogs (≤ 14) deserialise with an empty vec.
756    pub uniqueness_constraints: Vec<UniquenessConstraint>,
757    /// v7.13.0 — `CHECK (<expr>)` predicates declared on this
758    /// table. Both column-level inline `CHECK (…)` and
759    /// table-level `CHECK (…)` fold into this list. Each entry
760    /// is the AST Expr's `Display` form, re-parsed on every
761    /// INSERT/UPDATE and evaluated against the candidate row.
762    /// A false / NULL result rejects the mutation (PG semantics).
763    /// Persisted in catalog FILE_VERSION 23+. Older catalogs
764    /// deserialise with an empty vec.
765    pub checks: Vec<String>,
766}
767
/// A composite UNIQUE / PRIMARY KEY constraint as persisted on the
/// table schema (v7.9.19).
///
/// The constraint's leading column always has a BTree index, created
/// at CREATE TABLE time. INSERT enforcement scans that index looking
/// for a collision on the full column tuple.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UniquenessConstraint {
    /// `true` for a `PRIMARY KEY` declaration, `false` for `UNIQUE`.
    /// A primary key implies NOT NULL on every referenced column; the
    /// engine enforces that at CREATE TABLE time.
    pub is_primary_key: bool,
    /// Positions of the constrained columns on the parent table; at
    /// least one element. A single-column UNIQUE has exactly one
    /// position and is enforced by the BTree index alone.
    pub columns: Vec<usize>,
    /// The `UNIQUE NULLS NOT DISTINCT` modifier (v7.13.0; mailrs
    /// round-5 G10; PG 15+ surface).
    ///
    /// When `true`, two rows whose constrained columns are all NULL
    /// collide on the constraint. The default, `false`, is the
    /// SQL-standard `NULLS DISTINCT` behaviour where any NULL passes.
    /// Persisted in catalog FILE_VERSION 23+.
    pub nulls_not_distinct: bool,
}
791
792/// v7.6.1 — Storage-layer mirror of `spg_sql::ast::ForeignKeyConstraint`.
793/// The engine's CREATE TABLE path translates between the two; keeping
794/// them separate preserves the no-deps boundary between
795/// `spg-storage` and `spg-sql`.
796#[derive(Debug, Clone, PartialEq, Eq)]
797pub struct ForeignKeyConstraint {
798    /// Optional user-supplied constraint name (`CONSTRAINT <name>`
799    /// prefix). Used by `ALTER TABLE DROP CONSTRAINT <name>` in
800    /// v7.6.8; ignored by enforcement.
801    pub name: Option<String>,
802    /// Positions of local columns in this table's column list.
803    /// Same arity as `parent_columns`.
804    pub local_columns: Vec<usize>,
805    /// Referenced parent table name.
806    pub parent_table: String,
807    /// Positions of parent columns in the parent's column list.
808    /// Engine resolves these at CREATE TABLE time (after the parent
809    /// schema is known) so enforcement paths can skip the name
810    /// lookup on every row.
811    pub parent_columns: Vec<usize>,
812    /// Referential action when a parent row is deleted.
813    pub on_delete: FkAction,
814    /// Referential action when a parent row's referenced columns
815    /// are updated.
816    pub on_update: FkAction,
817}
818
/// Referential-action tag for a foreign key (v7.6.1). Mirrors
/// `spg_sql::ast::FkAction`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FkAction {
    Restrict,
    Cascade,
    SetNull,
    SetDefault,
    NoAction,
}
828
829impl FkAction {
830    /// On-disk tag byte (v13 catalog appendix).
831    pub const fn tag(self) -> u8 {
832        match self {
833            Self::Restrict => 0,
834            Self::Cascade => 1,
835            Self::SetNull => 2,
836            Self::SetDefault => 3,
837            Self::NoAction => 4,
838        }
839    }
840    pub const fn from_tag(b: u8) -> Option<Self> {
841        Some(match b {
842            0 => Self::Restrict,
843            1 => Self::Cascade,
844            2 => Self::SetNull,
845            3 => Self::SetDefault,
846            4 => Self::NoAction,
847            _ => return None,
848        })
849    }
850}
851
852impl TableSchema {
853    pub fn column_position(&self, name: &str) -> Option<usize> {
854        self.columns.iter().position(|c| c.name == name)
855    }
856}
857
/// Key type accepted by secondary (B-tree) indices.
///
/// Float, NULL and Vector values cannot take part in a B-tree index:
/// `f64` is only `PartialOrd`, NULL has SQL three-valued semantics,
/// and Vector belongs to the (future) HNSW path. Index lookups on such
/// columns fall back to a full scan.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub enum IndexKey {
    /// 64-bit integer key.
    Int(i64),
    /// Text key.
    Text(String),
    /// Boolean key.
    Bool(bool),
    /// Key for `Value::Uuid` (v7.17.0). Compared byte-wise (RFC 4122
    /// byte order), so PRIMARY KEY UUID lookups land on the same
    /// fast path as `Int` / `Text`.
    Uuid([u8; 16]),
}
872
873impl IndexKey {
874    pub fn from_value(v: &Value) -> Option<Self> {
875        match v {
876            Value::SmallInt(n) => Some(Self::Int(i64::from(*n))),
877            Value::Int(n) => Some(Self::Int(i64::from(*n))),
878            Value::BigInt(n) => Some(Self::Int(*n)),
879            Value::Text(s) => Some(Self::Text(s.clone())),
880            Value::Bool(b) => Some(Self::Bool(*b)),
881            // Date/Timestamp use their integer storage repr as the
882            // index key — same order semantics, same comparison.
883            Value::Date(d) => Some(Self::Int(i64::from(*d))),
884            Value::Timestamp(t) => Some(Self::Int(*t)),
885            // v7.17.0: UUID indexable via byte-wise ordering. Lookup
886            // on `id = '...'::uuid` resolves through the secondary
887            // index rather than full-scan.
888            Value::Uuid(b) => Some(Self::Uuid(*b)),
889            // v7.17.0 Phase 3.P0-32: TIME indexable via i64 — same
890            // order semantics as Date/Timestamp.
891            Value::Time(us) => Some(Self::Int(*us)),
892            // v7.17.0 Phase 3.P0-33: YEAR indexable as i64 — u16
893            // widens losslessly and gives the natural calendar
894            // ordering.
895            Value::Year(y) => Some(Self::Int(i64::from(*y))),
896            // v7.17.0 Phase 3.P0-34: TIMETZ indexable by its
897            // UTC-equivalent microseconds (local wall - offset).
898            // Without normalising, two values for the same
899            // physical instant in different zones would sort
900            // wrong. Matches PG's TIMETZ index behaviour.
901            Value::TimeTz { us, offset_secs } => {
902                Some(Self::Int(us - i64::from(*offset_secs) * 1_000_000))
903            }
904            // v7.17.0 Phase 3.P0-35: MONEY indexable as i64 cents
905            // (no scaling needed — natural numeric ordering).
906            Value::Money(c) => Some(Self::Int(*c)),
907            // v7.17.0 Phase 3.P0-38: ranges are NOT indexable in
908            // v7.17.0 — they'd need a custom comparator (PG uses
909            // SP-GiST for this). Skip.
910            Value::Range { .. } => None,
911            // v7.17.0 Phase 3.P0-39: hstore is NOT indexable in
912            // v7.17.0 — map columns need GIN with bespoke ops.
913            Value::Hstore(_) => None,
914            // v7.17.0 Phase 3.P0-40: 2D arrays aren't indexable.
915            Value::IntArray2D(_) | Value::BigIntArray2D(_) | Value::TextArray2D(_) => None,
916            // Numeric isn't (yet) indexable — exact-decimal index keys
917            // would need a stable scale-normalised representation.
918            // Interval isn't index-eligible either (and can't reach this
919            // path through column storage anyway).
920            Value::Null
921            | Value::Float(_)
922            | Value::Vector(_)
923            | Value::Sq8Vector(_)
924            | Value::HalfVector(_)
925            | Value::Numeric { .. }
926            | Value::Interval { .. }
927            | Value::Json(_)
928            | Value::Bytes(_)
929            | Value::TextArray(_)
930            | Value::IntArray(_)
931            | Value::BigIntArray(_)
932            | Value::TsVector(_)
933            | Value::TsQuery(_) => None,
934        }
935    }
936}
937
938/// A single-column secondary index. v2.0 carries either a B-tree map
939/// (the default — used for equality / range lookups on scalar columns)
940/// or a navigable-small-world graph (used for kNN over vector
941/// columns).
942#[derive(Debug, Clone)]
943pub struct Index {
944    pub name: String,
945    pub column_position: usize,
946    pub kind: IndexKind,
947    /// v6.8.0 — column positions of `INCLUDE (col1, col2, …)`
948    /// non-key columns. Carries the planner's "this query is
949    /// covered by the index" signal; lookup paths still resolve
950    /// via the `RowLocator` to fetch the row body, but EXPLAIN
951    /// surfaces the covered-scan annotation so operators can
952    /// confirm the planner sees the coverage.
953    ///
954    /// Empty `Vec` = no `INCLUDE` clause (the legacy shape). v12
955    /// catalog snapshots deserialise with an empty vec.
956    pub included_columns: Vec<usize>,
957    /// v6.8.1 — partial-index predicate stored as its canonical
958    /// Display form (the engine re-parses it on the maintenance
959    /// path). `None` = unconditional index (the legacy shape).
960    /// Persisted as `[u8 has_pred][u16 LE len][bytes]` on the
961    /// catalog snapshot (FILE_VERSION 12, appended after
962    /// `included_columns`).
963    pub partial_predicate: Option<String>,
964    /// v6.8.2 — expression-index key, stored as the expression's
965    /// canonical Display form. `None` = bare column-reference
966    /// index (the legacy shape). Persisted alongside
967    /// `partial_predicate` on the v12 catalog snapshot.
968    pub expression: Option<String>,
969    /// v7.9.29 — `CREATE UNIQUE INDEX …`. When true the engine
970    /// rejects INSERTs whose key already appears in this index
971    /// (combined with `partial_predicate` when present — only
972    /// rows matching the predicate enter the uniqueness check).
973    /// Catalog FILE_VERSION 16+; older snapshots deserialise
974    /// with `false`. mailrs K1.
975    pub is_unique: bool,
976    /// v7.9.29 — extra (non-leading) column positions for
977    /// multi-column indexes (`CREATE INDEX … (a, b, c)`). The
978    /// planner today still only uses the leading
979    /// `column_position` for index seeks, but UNIQUE INDEX
980    /// enforcement walks the full tuple so partial-unique
981    /// invariants like CalDAV `(calendar_id, uid,
982    /// recurrence_id)` are enforced correctly. Catalog
983    /// FILE_VERSION 16+; older snapshots deserialise empty.
984    pub extra_column_positions: Vec<usize>,
985}
986
/// Default neighbour degree (the `M` parameter) of the NSW graph. The
/// value is chosen when an index is constructed and is persisted with
/// that index.
pub const NSW_DEFAULT_M: usize = 16;
990
/// Result of a successful [`Catalog::freeze_oldest_to_cold`] call
/// (v5.2.2).
///
/// By the time this is returned the catalog has already been mutated:
/// the hot rows are dropped, the segment is registered and the
/// locators are flipped to Cold. The only thing left to the caller is
/// `segment_bytes` — write them to
/// `<db>.spg/segments/seg_<id>.spg` so that a later restart can reload
/// the segment through the v5.1 `SPG_PRELOAD_COLD_SEGMENT` path.
/// (v5.3's manifest will subsume this manual step.)
#[derive(Clone, Debug)]
pub struct FreezeReport {
    /// Id that [`Catalog::load_segment_bytes`] allocated for the new
    /// cold-tier segment. Stable across the call's success path.
    pub segment_id: u32,
    /// How many rows moved hot → cold. Equals the `max_rows` the
    /// caller asked for (the API is strict on the count).
    pub frozen_rows: usize,
    /// Hot-tier bytes reclaimed by the freeze, i.e. the
    /// [`Table::hot_bytes`] delta before vs after. Useful as input to
    /// the freezer's budget check on its next tick.
    pub bytes_freed: u64,
    /// The encoded segment, byte-identical to what
    /// [`encode_segment`] produced. The catalog already owns a copy
    /// inside `cold_segments`; handing the bytes over lets the caller
    /// persist them without re-encoding.
    pub segment_bytes: Vec<u8>,
}
1016
1017/// v6.7.4 — read-only output of [`Catalog::prepare_freeze_slice`].
1018/// Carries every row body + key in a contiguous hot-row range,
1019/// already encoded and sorted by PK so the coordinator's merge
1020/// step is a k-way merge over already-sorted streams.
1021///
1022/// `Vec<FreezeSlice>` from N independent workers feeds
1023/// [`Catalog::commit_freeze_slices`], which concats + encodes the
1024/// merged segment + atomically swaps the catalog state.
1025#[derive(Debug, Clone)]
1026pub struct FreezeSlice {
1027    /// Hot-row index range this slice covered (half-open, in the
1028    /// table's `rows: PersistentVec` ordering at call time). The
1029    /// commit step uses this to compute the union range that
1030    /// gets passed to [`Table::delete_rows`].
1031    pub row_range: core::ops::Range<usize>,
1032    /// `(pk_u64, encoded_row_body, IndexKey)` triples, sorted
1033    /// ascending by `pk_u64`. Per-slice sort happens inside
1034    /// `prepare_freeze_slice`; the coordinator does only a
1035    /// k-way merge to reach the global PK ordering
1036    /// [`encode_segment`] requires.
1037    pub rows: Vec<(u64, Vec<u8>, IndexKey)>,
1038}
1039
/// Result of a [`Catalog::compact_cold_segments`] call (v6.7.3).
///
/// When this is returned the catalog has already been mutated: the
/// merged segment is loaded into `cold_segments`, the source segment
/// slots are tombstoned (`None`), and every BTree-index
/// `RowLocator::Cold` that used to point at a source now points at
/// the merged segment.
///
/// What remains for the caller is to persist `merged_segment_bytes`
/// under `<db>.spg/segments/seg_<merged_segment_id>.spg` and to update
/// the in-memory `segment_id → path` map (remove the source ids, add
/// the merged id), so that the next CHECKPOINT writes a manifest that
/// no longer lists the retired sources.
///
/// On a no-op (fewer than 2 candidate segments under the threshold)
/// `merged_segment_id` is `None`, `sources` is empty, and the catalog
/// was not mutated.
#[derive(Clone, Debug)]
pub struct CompactReport {
    /// Ids of the source segments that were merged and tombstoned.
    pub sources: Vec<u32>,
    /// Id allocated for the merged segment; `None` on a no-op.
    pub merged_segment_id: Option<u32>,
    /// Encoded bytes of the merged segment; empty on a no-op.
    pub merged_segment_bytes: Vec<u8>,
    /// Number of rows that landed in the merged segment.
    pub merged_rows: usize,
    /// `Σ source.num_rows − merged_rows`: rows present in the source
    /// segment payloads but unreferenced by any live BTree `Cold`
    /// locator — DELETE'd-but-still-frozen rows that compaction GC'd
    /// during the merge.
    pub deleted_rows_pruned: usize,
    /// `Σ source.bytes() − merged.bytes()`: an estimate of the on-disk
    /// space the merge reclaims once the source segment files are
    /// GC'd. Computed with a saturating subtract, so never negative.
    pub bytes_reclaimed_estimate: u64,
}
1075
1076#[derive(Debug, Clone)]
1077pub enum IndexKind {
1078    /// v4.40: structural-sharing B-tree over `IndexKey`. Replaces the v0.8
1079    /// `BTreeMap<IndexKey, Vec<usize>>` — `Index::clone` is now an `Arc`
1080    /// bump regardless of index size, so `Catalog::clone` inside the
1081    /// v4.34 auto-commit wrap stays O(1) even for tables with secondary
1082    /// indices (the case that bottlenecked v4.39 at 1M rows in the
1083    /// sweep).
1084    ///
1085    /// v5.1: value type widened from `Vec<usize>` to `Vec<RowLocator>` so
1086    /// a single key can point to a mix of hot-tier rows (`RowLocator::Hot`,
1087    /// equivalent to the pre-v5 `usize` row index) and cold-tier rows
1088    /// (`RowLocator::Cold { segment_id, page_offset }`) once the v5.2
1089    /// freezer starts producing them. Pre-v5.2 only `Hot` entries appear
1090    /// — the on-disk encoding stays at `FILE_VERSION` 8 (raw u64 row index)
1091    /// because every locator round-trips through `RowLocator::from_legacy_v8_u64`
1092    /// without information loss. `FILE_VERSION` 9 with tagged encoding lands
1093    /// alongside the first freezer commit (v5.1 step 2b / v5.2).
1094    BTree(PersistentBTreeMap<IndexKey, Vec<RowLocator>>),
1095    /// Navigable-small-world graph for vector kNN search.
1096    Nsw(NswGraph),
1097    /// v6.7.1 — BRIN (Block Range INdex). Pure metadata: BRIN
1098    /// indexes carry NO in-memory key→locator map. The (min,
1099    /// max) summaries live in each cold-tier segment's v2
1100    /// envelope sidecar; the BRIN entry in `Table.indices` only
1101    /// records THAT a BRIN index exists on this column so the
1102    /// segment encoder + planner can opt into the summary path.
1103    Brin {
1104        /// The cell type at `column_position` at CREATE INDEX time.
1105        /// Used by the planner to type-check WHERE-clause range
1106        /// predicates against the BRIN-indexed column.
1107        column_type: DataType,
1108    },
1109    /// v7.12.3 — GIN inverted index over a `tsvector` column.
1110    ///
1111    /// Storage shape: `lexeme word → Vec<RowLocator>`. The posting
1112    /// list per word is appended in row-order, so range scans are
1113    /// O(matching rows) once the per-word lookup is done. Multi-
1114    /// term queries intersect / union posting lists.
1115    ///
1116    /// `IndexKey::from_value(TsVector)` returns `None` — GIN doesn't
1117    /// participate in `try_index_seek` (which is BTree-equality-keyed).
1118    /// The engine consults this index through `try_gin_lookup` on
1119    /// `WHERE col @@ tsquery` predicates instead.
1120    ///
1121    /// Backed by a `PersistentBTreeMap` so `Catalog::clone` (the
1122    /// per-write snapshot) stays O(1) — same structural-sharing
1123    /// invariant as BTree.
1124    Gin(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1125    /// v7.15.0 — `USING gin (col gin_trgm_ops)` over a `TEXT`
1126    /// column. Posting lists map `trigram` (PG-compatible 3-byte
1127    /// shingle on the lower-cased + space-padded input) to row
1128    /// locators. The planner uses this index to accelerate
1129    /// `WHERE col LIKE '…'` / `ILIKE '…'` / `similarity(col, q) >
1130    /// t` — every literal run of length ≥ 1 in the pattern
1131    /// produces a trigram set, the engine intersects the posting
1132    /// lists, and the LIKE / similarity predicate is re-evaluated
1133    /// per candidate row to filter the over-approximation.
1134    /// Persisted via tag-4 index payload in `FILE_VERSION` 24+.
1135    GinTrgm(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1136    /// v7.17.0 Phase 2.2 — MySQL `FULLTEXT KEY (col)` over a
1137    /// `TEXT` / `VARCHAR` column. Posting lists map
1138    /// `tsvector('simple') lexeme` to row locators. At insert /
1139    /// build time the engine derives the lexemes from the cell
1140    /// via the same lower-case tokenisation rule as
1141    /// `to_tsvector('simple', ...)` — the column itself stays a
1142    /// plain text type on disk (mysqldump round-trips would be
1143    /// broken otherwise). The planner uses this index to
1144    /// accelerate MySQL-shape `MATCH(col) AGAINST('term')`
1145    /// queries by mapping them onto the existing tsquery `@@`
1146    /// walker. Persisted via tag-5 index payload in
1147    /// `FILE_VERSION` 33+.
1148    GinFulltext(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1149}
1150
1151/// Multi-layer HNSW graph (v2.13). Each node is assigned a `top_level`;
1152/// it appears in layers `0..=top_level`. Higher layers are sparser, so
1153/// search starts from the entry at the top layer, greedy-descends to
1154/// layer 0, and beam-searches there. Layer 0 keeps a larger neighbour
1155/// budget (`m_max_0 = 2 * m` per the HNSW paper); upper layers cap at
1156/// `m`. The struct name stays `NswGraph` so external users / on-disk
1157/// callers don't have to track a rename — the algorithm changed, the
1158/// data slot didn't.
1159#[derive(Debug, Clone)]
1160pub struct NswGraph {
1161    /// Max neighbours per node on layers ≥ 1.
1162    pub m: usize,
1163    /// Max neighbours on layer 0 (the dense bottom layer). HNSW
1164    /// convention: `m_max_0 = 2 * m`.
1165    pub m_max_0: usize,
1166    /// Entry point — the node that sits on the topmost layer. Search
1167    /// always starts here.
1168    pub entry: Option<usize>,
1169    /// Top layer of the entry node (== `layers.len() - 1` when populated).
1170    pub entry_level: u8,
1171    /// `levels[i]` = top layer of node `i`. Nodes whose vector cell is
1172    /// NULL / non-Vector have `levels[i] = 0` and no neighbour entries.
1173    ///
1174    /// v5.5.0: backed by `PersistentVec` so `NswGraph::clone` (and the
1175    /// `Catalog::clone` on every group-commit write that contains it) is O(1)
1176    /// structural-sharing instead of an O(N) element copy.
1177    pub levels: PersistentVec<u8>,
1178    /// `layers[l][i]` = neighbours of node `i` at layer `l`. Inner vec
1179    /// is empty when node `i` doesn't reach layer `l`.
1180    ///
1181    /// v5.5.0: the per-node middle dimension (the O(N) one) is a
1182    /// `PersistentVec`; the outer layer dimension stays a plain `Vec`
1183    /// (layer count ≤ 8, so its clone is O(1) in practice) and the inner
1184    /// neighbour list stays a `Vec` (bounded by `m_max_0`).
1185    ///
1186    /// v6.1.x: neighbour slot widened from `usize` (8 B on 64-bit) to
1187    /// `u32` (4 B). Row indices are catalog-bounded by `u32::MAX` (4G
1188    /// rows per table); the cast at the NSW boundary asserts this. At
1189    /// 1M dim-128 SQ8, layer 0 adjacency alone shrinks by ~128 MiB
1190    /// — the largest single contribution to the v6.0.5-measured
1191    /// 624 MiB ambition gap. On-disk format already used u32 LE, so
1192    /// this is a pure in-memory layout change; no `FILE_VERSION` bump.
1193    pub layers: Vec<PersistentVec<Vec<u32>>>,
1194}
1195
1196impl NswGraph {
1197    fn new(m: usize) -> Self {
1198        Self {
1199            m,
1200            m_max_0: m.saturating_mul(2),
1201            entry: None,
1202            entry_level: 0,
1203            levels: PersistentVec::new(),
1204            layers: alloc::vec![PersistentVec::new()],
1205        }
1206    }
1207
1208    /// Max-neighbour budget for layer `l`.
1209    pub const fn cap_for_layer(&self, layer: u8) -> usize {
1210        if layer == 0 { self.m_max_0 } else { self.m }
1211    }
1212}
1213
/// Assigns a node its top HNSW layer, deterministically from the row
/// index, so the same insert order always reproduces the same
/// topology.
///
/// The row index is scrambled with a SplitMix-style mixer and the
/// level is the number of consecutive all-zero 4-bit chunks at the low
/// end of the hash, capped at 7. Each chunk is zero with probability
/// 1/16, so P(level ≥ L) ≈ (1/16)^L — roughly the HNSW distribution
/// with `mL ≈ 1/ln(M) ≈ 0.36` for M=16.
///
/// Row index 0 hashes to 0 (every mixer step maps 0 to 0), so it
/// always receives the maximum level.
#[allow(clippy::verbose_bit_mask)] // clippy suggests trailing_zeros(); we need an explicit MAX cap and a stable distribution shape.
pub fn nsw_assign_level(row_idx: usize) -> u8 {
    // 7 ⇒ ~16^7 ≈ 2.7e8 expected nodes between promotions; ample.
    const MAX_LEVEL: u8 = 7;
    const NIBBLE_BITS: u32 = 4;
    const NIBBLE_MASK: u64 = 0xF;

    // SplitMix-style mixer — cheap and seedable.
    let seeded = (row_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
    let round_a = (seeded ^ (seeded >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    let round_b = (round_a ^ (round_a >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    let hash = round_b ^ (round_b >> 31);

    // Walk the nibbles from the low end and stop at the first non-zero
    // one. Only the first MAX_LEVEL nibbles are inspected, which is
    // what caps the result (and handles hash == 0).
    (0..u32::from(MAX_LEVEL))
        .take_while(|&nibble| (hash >> (nibble * NIBBLE_BITS)) & NIBBLE_MASK == 0)
        .fold(0u8, |level, _| level + 1)
}
1240
1241impl Index {
1242    fn new_btree(name: String, column_position: usize) -> Self {
1243        Self {
1244            name,
1245            column_position,
1246            kind: IndexKind::BTree(PersistentBTreeMap::new()),
1247            included_columns: Vec::new(),
1248            partial_predicate: None,
1249            expression: None,
1250            is_unique: false,
1251            extra_column_positions: Vec::new(),
1252        }
1253    }
1254
1255    fn new_nsw(name: String, column_position: usize, m: usize) -> Self {
1256        Self {
1257            name,
1258            column_position,
1259            kind: IndexKind::Nsw(NswGraph::new(m)),
1260            included_columns: Vec::new(),
1261            partial_predicate: None,
1262            expression: None,
1263            is_unique: false,
1264            extra_column_positions: Vec::new(),
1265        }
1266    }
1267
1268    /// v6.7.1 — BRIN index constructor. BRIN carries no in-memory
1269    /// data; the `column_type` snapshot is used by the segment
1270    /// encoder + planner for type-checking range predicates.
1271    fn new_brin(name: String, column_position: usize, column_type: DataType) -> Self {
1272        Self {
1273            name,
1274            column_position,
1275            kind: IndexKind::Brin { column_type },
1276            included_columns: Vec::new(),
1277            partial_predicate: None,
1278            expression: None,
1279            is_unique: false,
1280            extra_column_positions: Vec::new(),
1281        }
1282    }
1283
1284    /// v7.12.3 — GIN inverted-index constructor. Empty posting-list
1285    /// map; caller (typically [`Table::add_gin_index`] or
1286    /// [`Table::restore_gin_index`]) populates it from existing rows
1287    /// or from a deserialised snapshot.
1288    fn new_gin(name: String, column_position: usize) -> Self {
1289        Self {
1290            name,
1291            column_position,
1292            kind: IndexKind::Gin(PersistentBTreeMap::new()),
1293            included_columns: Vec::new(),
1294            partial_predicate: None,
1295            expression: None,
1296            is_unique: false,
1297            extra_column_positions: Vec::new(),
1298        }
1299    }
1300
1301    /// v7.15.0 — `gin_trgm_ops`-flavoured GIN constructor. Same
1302    /// shape as `new_gin` but the posting-list keys are 3-byte
1303    /// trigram shingles (`pg_trgm`-compatible) and the column
1304    /// type is `TEXT` / `VARCHAR` (not `TSVECTOR`).
1305    fn new_gin_trgm(name: String, column_position: usize) -> Self {
1306        Self {
1307            name,
1308            column_position,
1309            kind: IndexKind::GinTrgm(PersistentBTreeMap::new()),
1310            included_columns: Vec::new(),
1311            partial_predicate: None,
1312            expression: None,
1313            is_unique: false,
1314            extra_column_positions: Vec::new(),
1315        }
1316    }
1317
1318    /// v7.17.0 Phase 2.2 — MySQL `FULLTEXT KEY` GIN constructor.
1319    /// Same shape as `new_gin_trgm` but the posting-list keys
1320    /// are lower-cased word lexemes (`to_tsvector('simple', col)`
1321    /// equivalent) instead of trigrams, and the column type is
1322    /// `TEXT` / `VARCHAR` (not `TSVECTOR`).
1323    fn new_gin_fulltext(name: String, column_position: usize) -> Self {
1324        Self {
1325            name,
1326            column_position,
1327            kind: IndexKind::GinFulltext(PersistentBTreeMap::new()),
1328            included_columns: Vec::new(),
1329            partial_predicate: None,
1330            expression: None,
1331            is_unique: false,
1332            extra_column_positions: Vec::new(),
1333        }
1334    }
1335
1336    /// Look up the locators stored under `key` (B-tree only). Returns
1337    /// an empty slice when the key is absent or the index isn't a
1338    /// BTree — callers can treat both cases uniformly.
1339    ///
1340    /// v5.1: return type widened from `&[usize]` to `&[RowLocator]`.
1341    /// Pre-v5.2 callers can read the slice and `.as_hot().unwrap()`
1342    /// each entry (no `Cold` variants exist until the freezer lands);
1343    /// post-v5.2 callers dispatch hot vs. cold per locator.
1344    pub fn lookup_eq(&self, key: &IndexKey) -> &[RowLocator] {
1345        match &self.kind {
1346            IndexKind::BTree(m) => m.get(key).map_or(&[][..], Vec::as_slice),
1347            // BRIN / NSW / GIN / trigram-GIN / fulltext-GIN have
1348            // no IndexKey-keyed map; lookup is a no-op. GIN uses
1349            // [`Index::gin_lookup_word`] instead.
1350            IndexKind::Nsw(_)
1351            | IndexKind::Brin { .. }
1352            | IndexKind::Gin(_)
1353            | IndexKind::GinTrgm(_)
1354            | IndexKind::GinFulltext(_) => &[][..],
1355        }
1356    }
1357
1358    /// v7.12.3 — GIN posting-list lookup. Returns the row locators
1359    /// whose `tsvector` cell contains `word`. Empty when the word is
1360    /// absent from the index or this isn't a GIN index.
1361    pub fn gin_lookup_word(&self, word: &str) -> &[RowLocator] {
1362        match &self.kind {
1363            // v7.17.0 Phase 2.2 — fulltext-GIN shares the same
1364            // lexeme-keyed posting list shape as the
1365            // tsvector-typed GIN, so the same lookup applies.
1366            IndexKind::Gin(m) | IndexKind::GinFulltext(m) => {
1367                m.get(&String::from(word)).map_or(&[][..], Vec::as_slice)
1368            }
1369            IndexKind::BTree(_)
1370            | IndexKind::Nsw(_)
1371            | IndexKind::Brin { .. }
1372            | IndexKind::GinTrgm(_) => &[][..],
1373        }
1374    }
1375
1376    /// v7.15.0 — trigram-GIN posting-list lookup. Returns the row
1377    /// locators whose indexed `TEXT` cell contains the trigram
1378    /// `tri`. Empty when the trigram is absent or this isn't a
1379    /// trigram-GIN index.
1380    pub fn gin_trgm_lookup(&self, tri: &str) -> &[RowLocator] {
1381        match &self.kind {
1382            IndexKind::GinTrgm(m) => m.get(&String::from(tri)).map_or(&[][..], Vec::as_slice),
1383            IndexKind::BTree(_)
1384            | IndexKind::Nsw(_)
1385            | IndexKind::Brin { .. }
1386            | IndexKind::Gin(_)
1387            | IndexKind::GinFulltext(_) => &[][..],
1388        }
1389    }
1390
1391    /// Borrow the NSW graph (if this is an NSW index). Callers that need
1392    /// the graph for a kNN search go through here.
1393    pub const fn nsw(&self) -> Option<&NswGraph> {
1394        match &self.kind {
1395            IndexKind::Nsw(g) => Some(g),
1396            IndexKind::BTree(_)
1397            | IndexKind::Brin { .. }
1398            | IndexKind::Gin(_)
1399            | IndexKind::GinTrgm(_)
1400            | IndexKind::GinFulltext(_) => None,
1401        }
1402    }
1403
1404    /// v6.7.1 — true when this index is a BRIN (block range) index.
1405    /// Used by the segment encoder to opt into BRIN sidecar emission
1406    /// at freeze time, and by the planner to opt into page-skipping
1407    /// on range predicates.
1408    pub const fn is_brin(&self) -> bool {
1409        matches!(self.kind, IndexKind::Brin { .. })
1410    }
1411
1412    /// v7.15.0 — true when this index is a trigram GIN
1413    /// (`gin_trgm_ops`-flavoured). Used by the LIKE planner to
1414    /// opt into trigram acceleration.
1415    pub const fn is_gin_trgm(&self) -> bool {
1416        matches!(self.kind, IndexKind::GinTrgm(_))
1417    }
1418
1419    /// v7.12.3 — true when this index is a GIN inverted index.
1420    /// Used by the planner to opt into posting-list acceleration on
1421    /// `WHERE col @@ tsquery` predicates.
1422    pub const fn is_gin(&self) -> bool {
1423        matches!(self.kind, IndexKind::Gin(_))
1424    }
1425
1426    /// v7.17.0 Phase 2.2 — true when this index is a fulltext
1427    /// GIN over a TEXT / VARCHAR column (MySQL `FULLTEXT KEY`
1428    /// surface). Used by the planner to opt the FULLTEXT-indexed
1429    /// column into MATCH AGAINST acceleration.
1430    pub const fn is_gin_fulltext(&self) -> bool {
1431        matches!(self.kind, IndexKind::GinFulltext(_))
1432    }
1433}
1434
/// In-memory table: schema + a persistent row vector + secondary indices.
///
/// v4.39: `rows` is a [`PersistentVec`] (Bitmapped Vector Trie, 32-way) so
/// `Table::clone()` is `O(1)` — the whole reason for v4.39's existence is
/// to make `Catalog::clone()` cheap inside the v4.34 auto-commit wrap.
///
/// v5.2.1: `hot_bytes` tracks the encoded byte size of every row currently
/// in [`Self::rows`], summed over rows. Updated incrementally by `insert`
/// (+= encoded row size), `delete_rows` (-= removed rows' encoded sizes),
/// and `update_row` (-= old size, += new size). The value is what the
/// v5.2 freezer reads to decide when to demote cold rows — when the
/// catalog-wide sum crosses `SPG_HOT_TIER_BYTES` (default 4 GiB) the
/// freezer thread wakes. v5.2.1 ships measurement only; the freezer
/// itself lands in v5.2.2. Stored as `u64` so a single field clone in
/// `Catalog::clone` stays at the O(1) invariant v4.39 built.
#[derive(Debug, Clone)]
pub struct Table {
    /// Column layout that every inserted row is validated against
    /// (arity, nullability, type compatibility).
    schema: TableSchema,
    /// Hot-tier rows, stored positionally. `RowLocator::Hot(i)`
    /// entries in the indices refer to position `i` in this vector.
    rows: PersistentVec<Row>,
    /// Secondary indices defined on this table. Index names are
    /// unique: the `add_*` / `restore_*` methods reject duplicates.
    indices: Vec<Index>,
    /// Running total of the encoded size of the rows in `rows`; see
    /// the struct-level docs for the maintenance contract.
    hot_bytes: u64,
    /// v6.7.0 — cached count of rows currently materialised in the
    /// cold tier via `RowLocator::Cold` entries across THIS table's
    /// indices. Populated by `ANALYZE` (walks every BTree index and
    /// counts Cold locators); the count survives until the next
    /// ANALYZE recomputes it. Surfaced via `spg_statistic.cold_row_count`
    /// and `spg_stat_segment.table_name`.
    ///
    /// Honest scope: this is a CACHED count, not a live one.
    /// Freezer / promote / DELETE don't currently update the cache
    /// incrementally — they invalidate it by setting the
    /// `cold_row_count_stale` flag, and the next ANALYZE re-walks.
    /// Incremental maintenance is a v6.7.x candidate if observation
    /// shows the ANALYZE walk cost dominates.
    cold_row_count: u64,
    /// v6.7.0 — set when the cached `cold_row_count` may be wrong
    /// because rows moved into / out of the cold tier since the last
    /// ANALYZE. The virtual-table surface reports the cached value
    /// regardless (operators run ANALYZE to refresh).
    cold_row_count_stale: bool,
}
1476
1477impl Table {
1478    pub fn new(schema: TableSchema) -> Self {
1479        Self {
1480            schema,
1481            rows: PersistentVec::new(),
1482            indices: Vec::new(),
1483            hot_bytes: 0,
1484            cold_row_count: 0,
1485            cold_row_count_stale: false,
1486        }
1487    }
1488
1489    /// Total encoded byte size of every row currently in the hot tier
1490    /// (`self.rows`). See struct docs for the maintenance contract.
1491    /// Returns 0 for an empty table.
1492    #[must_use]
1493    pub const fn hot_bytes(&self) -> u64 {
1494        self.hot_bytes
1495    }
1496
1497    /// v6.7.0 — cached count of cold-tier rows. See struct field
1498    /// docs for the staleness contract.
1499    #[must_use]
1500    pub const fn cold_row_count(&self) -> u64 {
1501        self.cold_row_count
1502    }
1503
1504    /// v6.7.0 — overwrite the cached count. Called by the engine's
1505    /// `analyze_one_table` after walking the indices.
1506    pub fn set_cold_row_count(&mut self, n: u64) {
1507        self.cold_row_count = n;
1508        self.cold_row_count_stale = false;
1509    }
1510
1511    /// v6.7.0 — mark the cached count as potentially out of date.
1512    /// Called by freezer / promote / DELETE paths so a subsequent
1513    /// `spg_statistic` read knows the number may not reflect the
1514    /// current state.
1515    pub fn mark_cold_row_count_stale(&mut self) {
1516        self.cold_row_count_stale = true;
1517    }
1518
1519    /// v6.7.0 — report whether the cached count is known to be out
1520    /// of date. Exposed for completeness; the virtual table surface
1521    /// returns the cached value regardless.
1522    #[must_use]
1523    pub const fn cold_row_count_stale(&self) -> bool {
1524        self.cold_row_count_stale
1525    }
1526
1527    /// v6.7.0 — walk every BTree index and count `RowLocator::Cold`
1528    /// entries; return the MAX across indices. The freeze path
1529    /// (`freeze_oldest_to_cold`) writes cold locators to ONE
1530    /// designated index — that index ends up with the full per-row
1531    /// count. MAX-across-indices yields the precise count when a
1532    /// PK-style index exists; for multi-index tables without a
1533    /// covering index it's a lower bound (rare in practice).
1534    /// Caller responsibility: only invoke under `engine.write()`
1535    /// or after taking ownership; the walk is O(N) over every
1536    /// (key, locator) pair.
1537    #[must_use]
1538    pub fn count_cold_locators(&self) -> u64 {
1539        let mut best: u64 = 0;
1540        for idx in &self.indices {
1541            if let IndexKind::BTree(map) = &idx.kind {
1542                let n: u64 = map
1543                    .iter()
1544                    .map(|(_, locs)| locs.iter().filter(|l| l.is_cold()).count() as u64)
1545                    .sum();
1546                if n > best {
1547                    best = n;
1548                }
1549            }
1550        }
1551        best
1552    }
1553
1554    pub const fn schema(&self) -> &TableSchema {
1555        &self.schema
1556    }
1557
1558    /// v6.7.2 — mutable schema accessor for ALTER TABLE paths.
1559    /// Used by `Engine::exec_alter_table` to flip per-table
1560    /// settings like `hot_tier_bytes`.
1561    pub const fn schema_mut(&mut self) -> &mut TableSchema {
1562        &mut self.schema
1563    }
1564
1565    /// v4.39: returns the persistent row vector by reference. Callers that
1566    /// used to take `&[Row]` should switch to `.iter()` (via
1567    /// `IntoIterator for &PersistentVec`) or `.get(i)` for indexing.
1568    pub const fn rows(&self) -> &PersistentVec<Row> {
1569        &self.rows
1570    }
1571
1572    pub const fn row_count(&self) -> usize {
1573        self.rows.len()
1574    }
1575
1576    /// v6.8.0 — exposed for the engine layer to patch
1577    /// `Index::included_columns` post-creation. Could fold into
1578    /// `add_index` once the engine's IF-NOT-EXISTS guard moves up,
1579    /// but the patch shape is the minimal change for v6.8.0.
1580    pub fn indices_mut(&mut self) -> &mut [Index] {
1581        &mut self.indices
1582    }
1583
1584    pub fn indices(&self) -> &[Index] {
1585        &self.indices
1586    }
1587
1588    /// Compute the next `AUTO_INCREMENT` value for the column at
1589    /// `col_pos`. Defined as `max(existing) + 1`, falling back to `1`
1590    /// when the column currently holds no integer values. NULL / non-
1591    /// integer cells are skipped. Returns `None` when the column isn't
1592    /// an integer type.
1593    pub fn next_auto_value(&self, col_pos: usize) -> Option<i64> {
1594        let ty = self.schema.columns.get(col_pos)?.ty;
1595        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
1596            return None;
1597        }
1598        let mut max: Option<i64> = None;
1599        for row in &self.rows {
1600            match row.values.get(col_pos) {
1601                Some(Value::SmallInt(n)) => {
1602                    let v = i64::from(*n);
1603                    max = Some(max.map_or(v, |m| m.max(v)));
1604                }
1605                Some(Value::Int(n)) => {
1606                    let v = i64::from(*n);
1607                    max = Some(max.map_or(v, |m| m.max(v)));
1608                }
1609                Some(Value::BigInt(n)) => {
1610                    max = Some(max.map_or(*n, |m| m.max(*n)));
1611                }
1612                _ => {}
1613            }
1614        }
1615        Some(max.map_or(1, |m| m + 1))
1616    }
1617
1618    /// Return the first index defined over `column_position`, if any.
1619    /// (`v0.8` supports at most one index per column logically; the search
1620    /// just picks the first match.)
1621    pub fn index_on(&self, column_position: usize) -> Option<&Index> {
1622        // v6.7.1 — prefer BTree (has the key→locator map needed
1623        // for `lookup_eq`) over BRIN (metadata-only). When only a
1624        // BRIN exists on the column, return None so the executor
1625        // falls back to the hot-tier row scan instead of trying
1626        // to use BRIN for an equality lookup (which would always
1627        // return an empty slice and look like "no rows matched").
1628        self.indices
1629            .iter()
1630            .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::BTree(_)))
1631            .or_else(|| {
1632                self.indices.iter().find(|i| {
1633                    i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_))
1634                })
1635            })
1636    }
1637
    /// Insert one row after validating it matches the schema (length + type).
    /// Returns `StorageError` on mismatch — the table is left unchanged.
    /// Updates every defined index with the new row's key.
    ///
    /// Order of operations (each step depends on the previous one):
    /// 1. Validate arity, NOT NULL, and per-column type compatibility.
    ///    All early returns happen here, before any state is touched.
    /// 2. Extend the BTree / GIN / trigram-GIN / fulltext-GIN indices
    ///    with a `RowLocator::Hot` pointing at the slot the row is
    ///    about to occupy (`self.rows.len()`).
    /// 3. Add the row's encoded size to `hot_bytes`.
    /// 4. Push the row.
    /// 5. Connect the row into every NSW graph (needs the row to be
    ///    present in `self.rows`).
    ///
    /// BRIN indices are not touched: they carry no per-row state.
    ///
    /// # Errors
    ///
    /// - `StorageError::ArityMismatch` when the row's length differs
    ///   from the schema's column count.
    /// - `StorageError::NullInNotNull` when a NULL lands in a
    ///   non-nullable column.
    /// - `StorageError::TypeMismatch` when a non-NULL value's type is
    ///   not compatible with its column type.
    pub fn insert(&mut self, row: Row) -> Result<(), StorageError> {
        if row.len() != self.schema.columns.len() {
            return Err(StorageError::ArityMismatch {
                expected: self.schema.columns.len(),
                actual: row.len(),
            });
        }
        for (i, (val, col)) in row.values.iter().zip(&self.schema.columns).enumerate() {
            if val.is_null() {
                if !col.nullable {
                    return Err(StorageError::NullInNotNull {
                        column: col.name.clone(),
                    });
                }
                // A NULL in a nullable column needs no type check.
                continue;
            }
            // NULL was handled above, so the value is expected to
            // report a concrete type here.
            let actual = val.data_type().expect("non-null");
            // Vector columns require both that the value's variant be Vector
            // *and* its dimension match. `actual == col.ty` already encodes
            // both because DataType::Vector carries the dim.
            //
            // VARCHAR(n) / CHAR(n) are storage-equivalent to TEXT — the
            // length / padding contract is enforced upstream by
            // `coerce_value`. Accept a `Text` value into either.
            //
            // NUMERIC's `Value::Numeric` carries its actual scale but the
            // column declares the *expected* scale (a scale-rescaled
            // Value::Numeric is produced upstream by `coerce_value`); the
            // structural check here only verifies "value is Numeric and
            // its scale equals the column scale".
            //
            // The second `matches!` below also accepts, in both
            // directions, Text ↔ Json/Jsonb, Json ↔ Jsonb, and
            // Timestamp ↔ Timestamptz.
            let compatible = actual == col.ty
                || matches!(
                    (actual, col.ty),
                    (
                        DataType::Text,
                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json | DataType::Jsonb
                    ) | (DataType::Json | DataType::Jsonb, DataType::Text)
                        | (DataType::Json, DataType::Jsonb)
                        | (DataType::Jsonb, DataType::Json)
                        | (DataType::Timestamp, DataType::Timestamptz)
                        | (DataType::Timestamptz, DataType::Timestamp)
                )
                || matches!(
                    (actual, col.ty),
                    (
                        DataType::Numeric { scale: a, .. },
                        DataType::Numeric { scale: b, .. },
                    ) if a == b
                );
            if !compatible {
                return Err(StorageError::TypeMismatch {
                    column: col.name.clone(),
                    expected: col.ty,
                    actual,
                    position: i,
                });
            }
        }
        // Slot the row will occupy once pushed; used for the Hot
        // locators written into the indices below.
        let new_row_idx = self.rows.len();
        // Pre-validate before mutating: ensure indices receive an IndexKey.
        // For NSW we defer the graph update to *after* the row is pushed
        // so the kNN search can see it in `self.rows`.
        for idx in &mut self.indices {
            match &mut idx.kind {
                IndexKind::BTree(map) => {
                    // `from_value` yielding `None` means the cell has
                    // no index key, so the row is left out of this index.
                    if let Some(key) = IndexKey::from_value(&row.values[idx.column_position]) {
                        // v4.40: PersistentBTreeMap has no in-place entry-or-default.
                        // Clone-then-insert keeps the same semantics — for typical
                        // unique-key schemas the Vec is 1-element so the clone is
                        // O(1). For dup-heavy columns it's O(M) per insert, traded
                        // for the structural-sharing win at clone time.
                        let mut entries = map.get(&key).cloned().unwrap_or_default();
                        entries.push(RowLocator::Hot(new_row_idx));
                        map.insert_mut(key, entries);
                    }
                }
                IndexKind::Gin(map) => {
                    // v7.12.3 — extend posting list per lexeme word.
                    // NULL or non-TsVector cell → no-op (cell carries
                    // no lexemes to index).
                    if let Value::TsVector(lexemes) = &row.values[idx.column_position] {
                        for lex in lexemes {
                            let mut entries = map.get(&lex.word).cloned().unwrap_or_default();
                            entries.push(RowLocator::Hot(new_row_idx));
                            map.insert_mut(lex.word.clone(), entries);
                        }
                    }
                }
                IndexKind::GinTrgm(map) => {
                    // v7.15.0 — trigram GIN. Shingle the TEXT cell
                    // into PG-compatible 3-byte trigrams and extend
                    // each trigram's posting list.
                    if let Value::Text(s) = &row.values[idx.column_position] {
                        for tri in trgm::extract_trigrams(s) {
                            let mut entries = map.get(&tri).cloned().unwrap_or_default();
                            entries.push(RowLocator::Hot(new_row_idx));
                            map.insert_mut(tri, entries);
                        }
                    }
                }
                IndexKind::GinFulltext(map) => {
                    // v7.17.0 Phase 2.2 — MySQL FULLTEXT-shape
                    // GIN over a TEXT / VARCHAR cell. Tokenise
                    // via the storage-local `simple_lex` (same
                    // rule as `to_tsvector('simple', text)`) and
                    // extend each lexeme's posting list.
                    let text_cell = match &row.values[idx.column_position] {
                        Value::Text(s) => Some(s.as_str()),
                        // mysqldump-style mediumtext / longtext
                        // land as Value::Text on insert; varchar
                        // cells likewise. Anything else (NULL,
                        // integer, …) contributes no lexemes.
                        _ => None,
                    };
                    if let Some(s) = text_cell {
                        for lex in fts_simple::simple_lex(s) {
                            let mut entries = map.get(&lex).cloned().unwrap_or_default();
                            entries.push(RowLocator::Hot(new_row_idx));
                            map.insert_mut(lex, entries);
                        }
                    }
                }
                // NSW handled below after the row push (so the new row
                // is visible to the kNN-graph connect step). BRIN
                // carries no per-row state.
                IndexKind::Nsw(_) | IndexKind::Brin { .. } => {}
            }
        }
        // v5.2.1: maintain incremental hot-tier byte counter. Computed
        // before the move so we don't need to borrow `row` after push.
        self.hot_bytes = self
            .hot_bytes
            .saturating_add(row_body_encoded_len(&row, &self.schema) as u64);
        // v4.39.1: push_mut keeps streaming inserts at Vec::push speed when
        // the table is uniquely owned (the spg-embedded path); inside a TX
        // wrap where a Catalog snapshot exists, push_mut path-copies the
        // tail just like push() and the snapshot stays valid.
        self.rows.push_mut(row);
        // NSW updates after the push so the new row is visible to the
        // greedy search used during connect.
        let new_row_idx = self.rows.len() - 1;
        // Collect the NSW slots first: `nsw_insert_at` takes the whole
        // table mutably, so `self.indices` cannot stay borrowed while
        // it runs.
        let nsw_targets: Vec<usize> = self
            .indices
            .iter()
            .enumerate()
            .filter_map(|(i, idx)| {
                if matches!(idx.kind, IndexKind::Nsw(_)) {
                    Some(i)
                } else {
                    None
                }
            })
            .collect();
        for idx_pos in nsw_targets {
            nsw_insert_at(self, idx_pos, new_row_idx);
        }
        Ok(())
    }
1799
1800    /// Build a new B-tree index over the named column. Rebuilds from
1801    /// existing rows. Errors if `column_name` doesn't exist or the index
1802    /// name is taken.
1803    pub fn add_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1804        if self.indices.iter().any(|i| i.name == name) {
1805            return Err(StorageError::DuplicateIndex { name });
1806        }
1807        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1808            StorageError::ColumnNotFound {
1809                column: column_name.into(),
1810            }
1811        })?;
1812        let mut idx = Index::new_btree(name, column_position);
1813        if let IndexKind::BTree(map) = &mut idx.kind {
1814            for (i, row) in self.rows.iter().enumerate() {
1815                if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
1816                    let mut entries = map.get(&key).cloned().unwrap_or_default();
1817                    entries.push(RowLocator::Hot(i));
1818                    map.insert_mut(key, entries);
1819                }
1820            }
1821        }
1822        self.indices.push(idx);
1823        Ok(())
1824    }
1825
1826    /// Build a new NSW (HNSW-flavoured) index over the named column.
1827    /// Required for `ORDER BY col <-> literal LIMIT k` to plan as a
1828    /// graph traversal instead of a full scan. Column must be a Vector
1829    /// type. `m` is the maximum number of neighbours per node.
1830    pub fn add_nsw_index(
1831        &mut self,
1832        name: String,
1833        column_name: &str,
1834        m: usize,
1835    ) -> Result<(), StorageError> {
1836        self.add_nsw_index_inner(name, column_name, m, None)
1837    }
1838
1839    /// v6.0.4 — synchronous rebuild of the named NSW index. If
1840    /// `new_encoding` is `Some(target)` and differs from the column's
1841    /// current encoding, every stored cell at the indexed column is
1842    /// re-coded into the target encoding before the new graph
1843    /// builds. Returns `IndexNotFound` if no index by that name exists
1844    /// and `Unsupported` for non-NSW indexes (`BTree` REBUILD is a no-op
1845    /// the engine layer rejects, not a storage-level concept).
1846    ///
1847    /// Holds the caller's `&mut self` for the duration — no
1848    /// concurrency / staging / WAL-replay machinery in v6.0.4. The
1849    /// "live" optimisation lands as v6.0.4.1.
1850    pub fn rebuild_nsw_index(
1851        &mut self,
1852        name: &str,
1853        new_encoding: Option<VecEncoding>,
1854    ) -> Result<(), StorageError> {
1855        let idx_pos = self
1856            .indices
1857            .iter()
1858            .position(|i| i.name == name)
1859            .ok_or_else(|| StorageError::IndexNotFound {
1860                name: String::from(name),
1861            })?;
1862        let col_pos = self.indices[idx_pos].column_position;
1863        let m = match &self.indices[idx_pos].kind {
1864            IndexKind::Nsw(g) => g.m,
1865            IndexKind::BTree(_)
1866            | IndexKind::Brin { .. }
1867            | IndexKind::Gin(_)
1868            | IndexKind::GinTrgm(_)
1869            | IndexKind::GinFulltext(_) => {
1870                return Err(StorageError::Unsupported(format!(
1871                    "ALTER INDEX REBUILD on non-NSW index {name:?} — only NSW indexes can rebuild"
1872                )));
1873            }
1874        };
1875        let col_name = self.schema.columns[col_pos].name.clone();
1876        // 1. Optional re-encoding pass. Done first so the cells
1877        //    match the schema before the graph rebuild walks them.
1878        if let Some(target) = new_encoding {
1879            let current = match self.schema.columns[col_pos].ty {
1880                DataType::Vector { encoding, .. } => encoding,
1881                ref other => {
1882                    return Err(StorageError::Unsupported(format!(
1883                        "ALTER INDEX REBUILD WITH (encoding=…) on non-vector column type {other:?}"
1884                    )));
1885                }
1886            };
1887            if target != current {
1888                let DataType::Vector { dim, .. } = self.schema.columns[col_pos].ty else {
1889                    unreachable!("checked above")
1890                };
1891                let n = self.rows.len();
1892                for i in 0..n {
1893                    let row = self
1894                        .rows
1895                        .get_mut(i)
1896                        .expect("row index in bounds (we iterated up to len())");
1897                    let cell = core::mem::replace(&mut row.values[col_pos], Value::Null);
1898                    let recoded = recode_vector_cell(cell, target)?;
1899                    row.values[col_pos] = recoded;
1900                }
1901                self.schema.columns[col_pos].ty = DataType::Vector {
1902                    dim,
1903                    encoding: target,
1904                };
1905            }
1906        }
1907        // 2. Drop the existing index slot + rebuild from row payload.
1908        self.indices.remove(idx_pos);
1909        self.add_nsw_index_inner(String::from(name), &col_name, m, None)?;
1910        Ok(())
1911    }
1912
1913    /// Restore an NSW index from a pre-built graph (used on
1914    /// deserialize). Skips the bulk-build pass since the topology is
1915    /// already known. Returns `DuplicateIndex` or `ColumnNotFound` on
1916    /// schema mismatch as usual.
1917    pub fn restore_nsw_index(
1918        &mut self,
1919        name: String,
1920        column_name: &str,
1921        graph: NswGraph,
1922    ) -> Result<(), StorageError> {
1923        self.add_nsw_index_inner(name, column_name, graph.m, Some(graph))
1924    }
1925
1926    /// Restore a `BTree` index from a pre-built `(IndexKey, Vec<RowLocator>)`
1927    /// map. Used by [`Catalog::deserialize`] when reading a v9 (or later)
1928    /// catalog snapshot — the map travels on disk so cold-tier locators
1929    /// survive a round-trip, instead of being rebuilt from `self.rows`
1930    /// (which would lose every Cold entry). Same error contract as
1931    /// [`Table::add_index`].
1932    pub fn restore_btree_index(
1933        &mut self,
1934        name: String,
1935        column_name: &str,
1936        map: PersistentBTreeMap<IndexKey, Vec<RowLocator>>,
1937    ) -> Result<(), StorageError> {
1938        if self.indices.iter().any(|i| i.name == name) {
1939            return Err(StorageError::DuplicateIndex { name });
1940        }
1941        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1942            StorageError::ColumnNotFound {
1943                column: column_name.into(),
1944            }
1945        })?;
1946        self.indices.push(Index {
1947            name,
1948            column_position,
1949            kind: IndexKind::BTree(map),
1950            included_columns: Vec::new(),
1951            partial_predicate: None,
1952            expression: None,
1953            is_unique: false,
1954            extra_column_positions: Vec::new(),
1955        });
1956        Ok(())
1957    }
1958
1959    /// v6.7.1 — public restore counterpart for BRIN indices. Used
1960    /// by `Catalog::deserialize` when a v10 snapshot carries a
1961    /// BRIN index entry. BRIN carries no in-memory data — only the
1962    /// `column_type` snapshot is restored.
1963    pub fn restore_brin_index(
1964        &mut self,
1965        name: String,
1966        column_name: &str,
1967        column_type: DataType,
1968    ) -> Result<(), StorageError> {
1969        if self.indices.iter().any(|i| i.name == name) {
1970            return Err(StorageError::DuplicateIndex { name });
1971        }
1972        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1973            StorageError::ColumnNotFound {
1974                column: column_name.into(),
1975            }
1976        })?;
1977        self.indices
1978            .push(Index::new_brin(name, column_position, column_type));
1979        Ok(())
1980    }
1981
1982    /// v6.7.1 — public CREATE INDEX counterpart for BRIN. Creates
1983    /// the index entry with a snapshot of the indexed column's
1984    /// current `DataType`.
1985    pub fn add_brin_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1986        if self.indices.iter().any(|i| i.name == name) {
1987            return Err(StorageError::DuplicateIndex { name });
1988        }
1989        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1990            StorageError::ColumnNotFound {
1991                column: column_name.into(),
1992            }
1993        })?;
1994        let column_type = self.schema.columns[column_position].ty;
1995        self.indices
1996            .push(Index::new_brin(name, column_position, column_type));
1997        Ok(())
1998    }
1999
2000    /// v7.12.3 — Build a new GIN inverted index over a `tsvector`
2001    /// column. Populates posting lists from existing rows. Errors
2002    /// if the column doesn't exist, isn't `TsVector`, or the index
2003    /// name is taken.
2004    pub fn add_gin_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
2005        if self.indices.iter().any(|i| i.name == name) {
2006            return Err(StorageError::DuplicateIndex { name });
2007        }
2008        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2009            StorageError::ColumnNotFound {
2010                column: column_name.into(),
2011            }
2012        })?;
2013        if self.schema.columns[column_position].ty != DataType::TsVector {
2014            return Err(StorageError::Corrupt(format!(
2015                "GIN index {name:?} requires a tsvector column; \
2016                 {column_name:?} is {:?}",
2017                self.schema.columns[column_position].ty
2018            )));
2019        }
2020        let mut idx = Index::new_gin(name, column_position);
2021        if let IndexKind::Gin(map) = &mut idx.kind {
2022            for (i, row) in self.rows.iter().enumerate() {
2023                if let Value::TsVector(lexemes) = &row.values[column_position] {
2024                    for lex in lexemes {
2025                        let mut entries = map.get(&lex.word).cloned().unwrap_or_default();
2026                        entries.push(RowLocator::Hot(i));
2027                        map.insert_mut(lex.word.clone(), entries);
2028                    }
2029                }
2030            }
2031        }
2032        self.indices.push(idx);
2033        Ok(())
2034    }
2035
2036    /// v7.12.3 — Restore a GIN index from a deserialised snapshot.
2037    /// Mirrors [`Self::restore_btree_index`] but takes the GIN's
2038    /// `word → Vec<RowLocator>` posting-list map (already populated
2039    /// from the catalog stream) instead of an `IndexKey` map.
2040    pub fn restore_gin_index(
2041        &mut self,
2042        name: String,
2043        column_name: &str,
2044        map: PersistentBTreeMap<String, Vec<RowLocator>>,
2045    ) -> Result<(), StorageError> {
2046        if self.indices.iter().any(|i| i.name == name) {
2047            return Err(StorageError::DuplicateIndex { name });
2048        }
2049        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2050            StorageError::ColumnNotFound {
2051                column: column_name.into(),
2052            }
2053        })?;
2054        let mut idx = Index::new_gin(name, column_position);
2055        idx.kind = IndexKind::Gin(map);
2056        self.indices.push(idx);
2057        Ok(())
2058    }
2059
2060    /// v7.15.0 — `gin_trgm_ops` GIN over a TEXT column. Walks
2061    /// every row, shingles the cell into PG-compatible trigrams,
2062    /// and builds the posting-list map. NULL / non-TEXT cells
2063    /// contribute nothing (no trigrams).
2064    pub fn add_gin_trgm_index(
2065        &mut self,
2066        name: String,
2067        column_name: &str,
2068    ) -> Result<(), StorageError> {
2069        if self.indices.iter().any(|i| i.name == name) {
2070            return Err(StorageError::DuplicateIndex { name });
2071        }
2072        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2073            StorageError::ColumnNotFound {
2074                column: column_name.into(),
2075            }
2076        })?;
2077        if !matches!(
2078            self.schema.columns[column_position].ty,
2079            DataType::Text | DataType::Varchar(_)
2080        ) {
2081            return Err(StorageError::Corrupt(format!(
2082                "trigram-GIN index {name:?} requires a TEXT/VARCHAR column; \
2083                 {column_name:?} is {:?}",
2084                self.schema.columns[column_position].ty
2085            )));
2086        }
2087        let mut idx = Index::new_gin_trgm(name, column_position);
2088        if let IndexKind::GinTrgm(map) = &mut idx.kind {
2089            for (i, row) in self.rows.iter().enumerate() {
2090                if let Value::Text(s) = &row.values[column_position] {
2091                    for tri in trgm::extract_trigrams(s) {
2092                        let mut entries = map.get(&tri).cloned().unwrap_or_default();
2093                        entries.push(RowLocator::Hot(i));
2094                        map.insert_mut(tri, entries);
2095                    }
2096                }
2097            }
2098        }
2099        self.indices.push(idx);
2100        Ok(())
2101    }
2102
2103    /// v7.15.0 — restore a trigram-GIN from its catalog snapshot
2104    /// payload. Mirrors [`Self::restore_gin_index`].
2105    pub fn restore_gin_trgm_index(
2106        &mut self,
2107        name: String,
2108        column_name: &str,
2109        map: PersistentBTreeMap<String, Vec<RowLocator>>,
2110    ) -> Result<(), StorageError> {
2111        if self.indices.iter().any(|i| i.name == name) {
2112            return Err(StorageError::DuplicateIndex { name });
2113        }
2114        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2115            StorageError::ColumnNotFound {
2116                column: column_name.into(),
2117            }
2118        })?;
2119        let mut idx = Index::new_gin_trgm(name, column_position);
2120        idx.kind = IndexKind::GinTrgm(map);
2121        self.indices.push(idx);
2122        Ok(())
2123    }
2124
2125    /// v7.17.0 Phase 2.2 — MySQL `FULLTEXT KEY` GIN over a TEXT
2126    /// column. Walks every row, tokenises the cell into lower-
2127    /// cased word lexemes (`fts_simple::simple_lex` — same rule
2128    /// as `to_tsvector('simple', text)`), and builds the
2129    /// posting-list map. NULL / non-TEXT cells contribute
2130    /// nothing (no lexemes).
2131    pub fn add_gin_fulltext_index(
2132        &mut self,
2133        name: String,
2134        column_name: &str,
2135    ) -> Result<(), StorageError> {
2136        if self.indices.iter().any(|i| i.name == name) {
2137            return Err(StorageError::DuplicateIndex { name });
2138        }
2139        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2140            StorageError::ColumnNotFound {
2141                column: column_name.into(),
2142            }
2143        })?;
2144        if !matches!(
2145            self.schema.columns[column_position].ty,
2146            DataType::Text | DataType::Varchar(_)
2147        ) {
2148            return Err(StorageError::Corrupt(format!(
2149                "fulltext-GIN index {name:?} requires a TEXT/VARCHAR column; \
2150                 {column_name:?} is {:?}",
2151                self.schema.columns[column_position].ty
2152            )));
2153        }
2154        let mut idx = Index::new_gin_fulltext(name, column_position);
2155        if let IndexKind::GinFulltext(map) = &mut idx.kind {
2156            for (i, row) in self.rows.iter().enumerate() {
2157                if let Value::Text(s) = &row.values[column_position] {
2158                    for lex in fts_simple::simple_lex(s) {
2159                        let mut entries = map.get(&lex).cloned().unwrap_or_default();
2160                        entries.push(RowLocator::Hot(i));
2161                        map.insert_mut(lex, entries);
2162                    }
2163                }
2164            }
2165        }
2166        self.indices.push(idx);
2167        Ok(())
2168    }
2169
2170    /// v7.17.0 Phase 2.2 — restore a fulltext-GIN from its
2171    /// catalog snapshot payload. Mirrors
2172    /// [`Self::restore_gin_trgm_index`].
2173    pub fn restore_gin_fulltext_index(
2174        &mut self,
2175        name: String,
2176        column_name: &str,
2177        map: PersistentBTreeMap<String, Vec<RowLocator>>,
2178    ) -> Result<(), StorageError> {
2179        if self.indices.iter().any(|i| i.name == name) {
2180            return Err(StorageError::DuplicateIndex { name });
2181        }
2182        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2183            StorageError::ColumnNotFound {
2184                column: column_name.into(),
2185            }
2186        })?;
2187        let mut idx = Index::new_gin_fulltext(name, column_position);
2188        idx.kind = IndexKind::GinFulltext(map);
2189        self.indices.push(idx);
2190        Ok(())
2191    }
2192
2193    /// v5.1: register cold-tier locators on a `BTree` index. Used
2194    /// after [`Catalog::load_segment_bytes`] to wire every cold-
2195    /// tier row's PK back to its segment so
2196    /// [`Catalog::lookup_by_pk`] can resolve it. Each call
2197    /// appends to the index — keys that already have hot or cold
2198    /// locators keep them. Returns the number of locators
2199    /// registered.
2200    ///
2201    /// Pre-v5.2 (freezer) this is the only path that adds Cold
2202    /// variants to a PB; post-freezer the background freezer
2203    /// thread produces these as a batch under the engine write
2204    /// lock and this API becomes its in-memory primitive.
2205    ///
2206    /// Errors if `index_name` doesn't exist or names an NSW graph
2207    /// (NSW indices don't carry per-key row locators — they're
2208    /// vector-search structures).
2209    pub fn register_cold_locators<I>(
2210        &mut self,
2211        index_name: &str,
2212        locators: I,
2213    ) -> Result<usize, StorageError>
2214    where
2215        I: IntoIterator<Item = (IndexKey, RowLocator)>,
2216    {
2217        let idx = self
2218            .indices
2219            .iter_mut()
2220            .find(|i| i.name == index_name)
2221            .ok_or_else(|| StorageError::Corrupt(format!("index {index_name:?} not found")))?;
2222        let map = match &mut idx.kind {
2223            IndexKind::BTree(map) => map,
2224            IndexKind::Nsw(_)
2225            | IndexKind::Brin { .. }
2226            | IndexKind::Gin(_)
2227            | IndexKind::GinTrgm(_)
2228            | IndexKind::GinFulltext(_) => {
2229                return Err(StorageError::Corrupt(format!(
2230                    "index {index_name:?} is not BTree; cold locators apply only to BTree indices"
2231                )));
2232            }
2233        };
2234        let mut count = 0usize;
2235        for (key, locator) in locators {
2236            let mut entries = map.get(&key).cloned().unwrap_or_default();
2237            entries.push(locator);
2238            map.insert_mut(key, entries);
2239            count += 1;
2240        }
2241        Ok(count)
2242    }
2243
2244    /// v7.12.3 — GIN-side parallel to [`Self::register_cold_locators`].
2245    /// Re-attaches `word → cold RowLocator` posting-list entries after
2246    /// the from-rows rebuild loop. Errors when the index doesn't
2247    /// exist or isn't a GIN. Both tsvector-GIN and trigram-GIN
2248    /// variants share posting-list shape (`String → Vec<RowLocator>`),
2249    /// so this helper accepts either.
2250    pub fn register_gin_cold_locators<I>(
2251        &mut self,
2252        index_name: &str,
2253        locators: I,
2254    ) -> Result<usize, StorageError>
2255    where
2256        I: IntoIterator<Item = (String, RowLocator)>,
2257    {
2258        let idx = self
2259            .indices
2260            .iter_mut()
2261            .find(|i| i.name == index_name)
2262            .ok_or_else(|| StorageError::Corrupt(format!("index {index_name:?} not found")))?;
2263        let map = match &mut idx.kind {
2264            // v7.17.0 Phase 2.2 — fulltext-GIN posting lists are
2265            // shape-compatible with tsvector / trigram GINs, so
2266            // cold-locator re-attach handles all three.
2267            IndexKind::Gin(map) | IndexKind::GinTrgm(map) | IndexKind::GinFulltext(map) => map,
2268            IndexKind::BTree(_) | IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
2269                return Err(StorageError::Corrupt(format!(
2270                    "register_gin_cold_locators: index {index_name:?} is not GIN"
2271                )));
2272            }
2273        };
2274        let mut count = 0usize;
2275        for (word, locator) in locators {
2276            let mut entries = map.get(&word).cloned().unwrap_or_default();
2277            entries.push(locator);
2278            map.insert_mut(word, entries);
2279            count += 1;
2280        }
2281        Ok(count)
2282    }
2283
2284    /// v5.2.3: remove every `Cold` locator currently registered on
2285    /// `index_name` under the given `key`. `Hot` locators for the
2286    /// same key are left in place — useful when a row has just been
2287    /// promoted hot-side and the caller wants the old Cold pointer
2288    /// retired without losing the new hot entry.
2289    ///
2290    /// Returns the number of cold locators removed (0 when the key
2291    /// has only hot entries or the key isn't present at all).
2292    /// Errors when the index doesn't exist or isn't a `BTree`.
2293    pub fn remove_cold_locators_for_key(
2294        &mut self,
2295        index_name: &str,
2296        key: &IndexKey,
2297    ) -> Result<usize, StorageError> {
2298        let idx = self
2299            .indices
2300            .iter_mut()
2301            .find(|i| i.name == index_name)
2302            .ok_or_else(|| {
2303                StorageError::Corrupt(format!(
2304                    "remove_cold_locators_for_key: index {index_name:?} not found"
2305                ))
2306            })?;
2307        let map = match &mut idx.kind {
2308            IndexKind::BTree(map) => map,
2309            IndexKind::Nsw(_)
2310            | IndexKind::Brin { .. }
2311            | IndexKind::Gin(_)
2312            | IndexKind::GinTrgm(_)
2313            | IndexKind::GinFulltext(_) => {
2314                return Err(StorageError::Corrupt(format!(
2315                    "remove_cold_locators_for_key: index {index_name:?} is not BTree; \
2316                     cold locators apply only to BTree indices"
2317                )));
2318            }
2319        };
2320        let Some(entries) = map.get(key) else {
2321            return Ok(0);
2322        };
2323        let mut kept: Vec<RowLocator> =
2324            entries.iter().copied().filter(RowLocator::is_hot).collect();
2325        let removed = entries.len() - kept.len();
2326        if removed == 0 {
2327            return Ok(0);
2328        }
2329        kept.shrink_to_fit();
2330        // PersistentBTreeMap has no remove API in v5.2; when every
2331        // locator for `key` was Cold, the key keeps an empty Vec
2332        // entry. `Index::lookup_eq` already treats `Some(&[])` and
2333        // `None` as the same empty slice (via `Vec::as_slice`), so
2334        // callers can't distinguish the two. The space cost is one
2335        // empty Vec per shadowed-then-promoted key — bounded and
2336        // recoverable when the future compaction job lands.
2337        map.insert_mut(key.clone(), kept);
2338        Ok(removed)
2339    }
2340
2341    /// v7.13.0 — append a new column to the schema and back-fill
2342    /// every existing row with `fill_value`. Used by the engine's
2343    /// `ALTER TABLE t ADD COLUMN …` handler (mailrs round-5 G1).
2344    /// Indices on existing columns keep working — column positions
2345    /// don't shift since the new column lands at the end — so no
2346    /// index rebuild is needed.
2347    pub fn add_column(&mut self, col: ColumnSchema, fill_value: Value) {
2348        self.schema.columns.push(col);
2349        let mut new_rows: PersistentVec<Row> = PersistentVec::new();
2350        for row in self.rows.iter() {
2351            let mut values = row.values.clone();
2352            values.push(fill_value.clone());
2353            new_rows.push_mut(Row::new(values));
2354        }
2355        self.rows = new_rows;
2356    }
2357
2358    /// v7.15.0 — replace the partial-index predicate source on
2359    /// the index at slot `idx`. Used by `ALTER TABLE … RENAME
2360    /// COLUMN` after the engine rewrites column-identifier
2361    /// references in the predicate source text. Pure metadata
2362    /// edit; index rows are unaffected (they're keyed by
2363    /// column position, not predicate text).
2364    pub fn set_partial_predicate(&mut self, idx: usize, pred: Option<String>) {
2365        debug_assert!(idx < self.indices.len());
2366        self.indices[idx].partial_predicate = pred;
2367    }
2368
2369    /// v7.15.0 — rename the column at `col_pos` to `new_name`.
2370    /// The on-disk row encoding is positional, so no row rewrite
2371    /// is needed; only the schema's column name changes. Indices,
2372    /// UCs, FKs all key off column positions and are unaffected.
2373    /// Source-text references that hold the column name (CHECK
2374    /// predicates, partial-index predicates, runtime DEFAULT
2375    /// expressions, trigger `UPDATE OF` lists) are rewritten by
2376    /// the engine before this helper is called — the storage
2377    /// layer doesn't depend on `spg-sql` and so can't re-parse the
2378    /// predicate sources itself.
2379    pub fn rename_column(&mut self, col_pos: usize, new_name: &str) {
2380        debug_assert!(col_pos < self.schema.columns.len());
2381        self.schema.columns[col_pos].name = new_name.to_string();
2382    }
2383
2384    /// v7.13.3 — drop the column at `col_pos`. Removes the entry
2385    /// from the schema, the value from every row, any index that
2386    /// references the column (pure drop, not shift), and shifts
2387    /// every remaining index/UC/FK column position that pointed
2388    /// past `col_pos` down by one. Used by `ALTER TABLE t DROP
2389    /// COLUMN <c>` (mailrs round-7 S8). FK dependents on this
2390    /// column must already have been removed by the caller (CASCADE
2391    /// path); the helper assumes only same-column index removal is
2392    /// needed.
2393    pub fn drop_column(&mut self, col_pos: usize) {
2394        debug_assert!(col_pos < self.schema.columns.len());
2395        // Strip the column from the schema.
2396        self.schema.columns.remove(col_pos);
2397        // Rewrite every row to omit the cell at col_pos.
2398        let mut new_rows: PersistentVec<Row> = PersistentVec::new();
2399        for row in self.rows.iter() {
2400            let mut values = row.values.clone();
2401            if col_pos < values.len() {
2402                values.remove(col_pos);
2403            }
2404            new_rows.push_mut(Row::new(values));
2405        }
2406        self.rows = new_rows;
2407        // Drop indices on the column outright; shift the rest.
2408        self.indices.retain(|idx| idx.column_position != col_pos);
2409        for idx in &mut self.indices {
2410            if idx.column_position > col_pos {
2411                idx.column_position -= 1;
2412            }
2413            // Same shift for any included-columns reference.
2414            for inc in &mut idx.included_columns {
2415                if *inc > col_pos {
2416                    *inc -= 1;
2417                }
2418            }
2419        }
2420        // Shift uniqueness-constraint column positions (and drop
2421        // entries that lose all columns, though that shouldn't
2422        // happen in practice — caller has already CASCADE-removed
2423        // FKs and there's no general CASCADE for UCs).
2424        let mut surviving_ucs: Vec<UniquenessConstraint> = Vec::new();
2425        for mut uc in core::mem::take(&mut self.schema.uniqueness_constraints) {
2426            uc.columns.retain(|&c| c != col_pos);
2427            if uc.columns.is_empty() {
2428                continue;
2429            }
2430            for c in &mut uc.columns {
2431                if *c > col_pos {
2432                    *c -= 1;
2433                }
2434            }
2435            surviving_ucs.push(uc);
2436        }
2437        self.schema.uniqueness_constraints = surviving_ucs;
2438        // Shift FK local_columns (parent-pointing column positions
2439        // are off-table and untouched).
2440        for fk in &mut self.schema.foreign_keys {
2441            for c in &mut fk.local_columns {
2442                if *c > col_pos {
2443                    *c -= 1;
2444                }
2445            }
2446        }
2447        // Rebuild remaining indices' payload — the column-position
2448        // shift means existing IndexKey entries are still keyed by
2449        // the same column data but the position numbers changed;
2450        // existing key→locator maps stay valid because they're
2451        // keyed by Value not position. The rebuild is conservative
2452        // — same pattern delete_rows uses post-mutation.
2453        self.rebuild_indices();
2454    }
2455
2456    /// v4.4: delete the rows at the given positions in one pass.
2457    /// `positions` must be unique; ordering doesn't matter. Indices
2458    /// are rebuilt from scratch (cheaper than tracking incremental
2459    /// shifts across both B-tree and NSW). Returns the number of
2460    /// rows removed.
2461    /// v7.17.0 Phase 1.3 — wipe every row. Used by REFRESH
2462    /// MATERIALIZED VIEW; same effect as `delete_rows((0..N).into())`
2463    /// but skips the per-position bookkeeping for the all-removed
2464    /// fast path. Indices are rebuilt (empty).
2465    pub fn truncate(&mut self) {
2466        self.rows = PersistentVec::new();
2467        self.hot_bytes = 0;
2468        self.rebuild_indices();
2469    }
2470
2471    pub fn delete_rows(&mut self, positions: &[usize]) -> usize {
2472        if positions.is_empty() {
2473            return 0;
2474        }
2475        // Mark positions; v4.39: PV has no in-place retain, so we rebuild
2476        // a fresh PV by pushing the survivors. Still O(n log₃₂ n); the
2477        // structural-sharing win shows up at `Catalog::clone()`, not here.
2478        let mut to_remove = alloc::vec![false; self.rows.len()];
2479        let mut removed = 0;
2480        for &p in positions {
2481            if p < to_remove.len() && !to_remove[p] {
2482                to_remove[p] = true;
2483                removed += 1;
2484            }
2485        }
2486        let mut new_rows: PersistentVec<Row> = PersistentVec::new();
2487        let mut removed_bytes: u64 = 0;
2488        for (i, row) in self.rows.iter().enumerate() {
2489            if to_remove[i] {
2490                removed_bytes =
2491                    removed_bytes.saturating_add(row_body_encoded_len(row, &self.schema) as u64);
2492            } else {
2493                new_rows.push_mut(row.clone());
2494            }
2495        }
2496        self.rows = new_rows;
2497        self.hot_bytes = self.hot_bytes.saturating_sub(removed_bytes);
2498        self.rebuild_indices();
2499        removed
2500    }
2501
2502    /// v4.4: replace the row at `position` with `new_values` (must
2503    /// match the schema arity + types). v7.20: index maintenance is
2504    /// incremental — only indices whose key value changed are
2505    /// touched (B-tree entry move in place; NSW / BRIN / GIN fall
2506    /// back to a full rebuild when their column changed).
2507    pub fn update_row(
2508        &mut self,
2509        position: usize,
2510        new_values: Vec<Value>,
2511    ) -> Result<(), StorageError> {
2512        if position >= self.rows.len() {
2513            return Err(StorageError::Corrupt(alloc::format!(
2514                "update_row: position {position} out of bounds (rows={})",
2515                self.rows.len()
2516            )));
2517        }
2518        if new_values.len() != self.schema.columns.len() {
2519            return Err(StorageError::ArityMismatch {
2520                expected: self.schema.columns.len(),
2521                actual: new_values.len(),
2522            });
2523        }
2524        // Reuse the per-cell type-compat validation that `insert`
2525        // applies. The body below mirrors that check intentionally —
2526        // factoring it would be more code than the duplication.
2527        for (i, (val, col)) in new_values.iter().zip(&self.schema.columns).enumerate() {
2528            if val.is_null() {
2529                if !col.nullable {
2530                    return Err(StorageError::NullInNotNull {
2531                        column: col.name.clone(),
2532                    });
2533                }
2534                continue;
2535            }
2536            let actual = val.data_type().expect("non-null");
2537            let compatible = actual == col.ty
2538                || matches!(
2539                    (actual, col.ty),
2540                    (
2541                        DataType::Text,
2542                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json | DataType::Jsonb
2543                    ) | (DataType::Json | DataType::Jsonb, DataType::Text)
2544                        | (DataType::Json, DataType::Jsonb)
2545                        | (DataType::Jsonb, DataType::Json)
2546                        | (DataType::Timestamp, DataType::Timestamptz)
2547                        | (DataType::Timestamptz, DataType::Timestamp)
2548                )
2549                || matches!(
2550                    (actual, col.ty),
2551                    (
2552                        DataType::Numeric { scale: a, .. },
2553                        DataType::Numeric { scale: b, .. },
2554                    ) if a == b
2555                );
2556            if !compatible {
2557                return Err(StorageError::TypeMismatch {
2558                    column: col.name.clone(),
2559                    expected: col.ty,
2560                    actual,
2561                    position: i,
2562                });
2563            }
2564        }
2565        let old_row = self
2566            .rows
2567            .get(position)
2568            .expect("position bounds-checked above");
2569        let old_bytes = row_body_encoded_len(old_row, &self.schema) as u64;
2570        let new_row = Row::new(new_values);
2571        let new_bytes = row_body_encoded_len(&new_row, &self.schema) as u64;
2572        // v7.20 P4 — incremental index maintenance. `rows.set`
2573        // replaces the row in place, so every OTHER row's Hot
2574        // locator stays valid; only indices whose key value
2575        // actually changed at `position` need touching. The
2576        // common OLTP shape (`UPDATE … SET non_indexed_col = …
2577        // WHERE pk = $1`) touches no index at all — pre-v7.20
2578        // this path paid a full rebuild_indices() (O(rows ×
2579        // indices)) per UPDATE, which dominated the profiled
2580        // write cost on a 5k-row table (~1 ms/stmt).
2581        //
2582        // BTree gets an in-place entry move (drop Hot(position)
2583        // from the old key's locator list, append to the new
2584        // key's). NSW graphs / BRIN summaries / GIN posting
2585        // lists have no cheap single-key move — a changed column
2586        // under one of those falls back to the full rebuild.
2587        enum IdxFix {
2588            BTreeMove {
2589                idx_pos: usize,
2590                old_key: Option<IndexKey>,
2591                new_key: Option<IndexKey>,
2592            },
2593            FullRebuild,
2594        }
2595        let mut fixes: Vec<IdxFix> = Vec::new();
2596        for (idx_pos, idx) in self.indices.iter().enumerate() {
2597            let col = idx.column_position;
2598            let old_v = &old_row.values[col];
2599            let new_v = &new_row.values[col];
2600            if old_v == new_v {
2601                continue;
2602            }
2603            match &idx.kind {
2604                IndexKind::BTree(_) => fixes.push(IdxFix::BTreeMove {
2605                    idx_pos,
2606                    old_key: IndexKey::from_value(old_v),
2607                    new_key: IndexKey::from_value(new_v),
2608                }),
2609                IndexKind::Nsw(_)
2610                | IndexKind::Brin { .. }
2611                | IndexKind::Gin(_)
2612                | IndexKind::GinTrgm(_)
2613                | IndexKind::GinFulltext(_) => {
2614                    fixes.clear();
2615                    fixes.push(IdxFix::FullRebuild);
2616                    break;
2617                }
2618            }
2619        }
2620        self.rows = self
2621            .rows
2622            .set(position, new_row)
2623            .expect("position bounds-checked above");
2624        self.hot_bytes = self
2625            .hot_bytes
2626            .saturating_sub(old_bytes)
2627            .saturating_add(new_bytes);
2628        for fix in fixes {
2629            match fix {
2630                IdxFix::FullRebuild => {
2631                    self.rebuild_indices();
2632                    break;
2633                }
2634                IdxFix::BTreeMove {
2635                    idx_pos,
2636                    old_key,
2637                    new_key,
2638                } => {
2639                    let IndexKind::BTree(map) = &mut self.indices[idx_pos].kind else {
2640                        unreachable!("IdxFix::BTreeMove built from a BTree index");
2641                    };
2642                    // NULL keys never enter the B-tree (from_value
2643                    // returns None), so a None on either side means
2644                    // "no entry on that side".
2645                    if let Some(k) = old_key
2646                        && let Some(locs) = map.get(&k)
2647                    {
2648                        let mut locs = locs.clone();
2649                        locs.retain(|l| *l != RowLocator::Hot(position));
2650                        // No remove_mut on the persistent map: an
2651                        // empty locator list is the tombstone —
2652                        // lookup_eq returns an empty slice, and the
2653                        // next rebuild_indices() drops the key.
2654                        map.insert_mut(k, locs);
2655                    }
2656                    if let Some(k) = new_key {
2657                        let mut entries = map.get(&k).cloned().unwrap_or_default();
2658                        entries.push(RowLocator::Hot(position));
2659                        map.insert_mut(k, entries);
2660                    }
2661                }
2662            }
2663        }
2664        Ok(())
2665    }
2666
2667    /// v4.4 helper used by `delete_rows` / `update_row`: discard all
2668    /// index payloads and rebuild from `self.rows`. Cheap enough
2669    /// for typical SPG scale (catalogs in the docker-compose
2670    /// deployment shape are small); the alternative — incremental
2671    /// shift bookkeeping across B-tree + NSW — would be far more
2672    /// invasive than the savings justify.
2673    fn rebuild_indices(&mut self) {
2674        // v5.2.3: capture every `Cold` locator on every BTree index
2675        // before the rebuild, so the from-rows re-emission below
2676        // (which only produces `Hot` locators) doesn't drop cold-
2677        // tier entries on keys unrelated to the row that changed.
2678        // Pre-v5.2.3 this was a `freeze_oldest_to_cold` worry only
2679        // and the freezer did its own capture-then-reregister; v5.2.3
2680        // promotes that pattern into the base helper because UPDATE
2681        // / DELETE now run rebuild_indices on tables with cold rows.
2682        let preserved_cold: Vec<(String, Vec<(IndexKey, RowLocator)>)> = self
2683            .indices
2684            .iter()
2685            .filter_map(|idx| match &idx.kind {
2686                IndexKind::BTree(map) => {
2687                    let cold: Vec<(IndexKey, RowLocator)> = map
2688                        .iter()
2689                        .flat_map(|(k, locs)| {
2690                            locs.iter()
2691                                .filter(|l| l.is_cold())
2692                                .copied()
2693                                .map(move |l| (k.clone(), l))
2694                        })
2695                        .collect();
2696                    if cold.is_empty() {
2697                        None
2698                    } else {
2699                        Some((idx.name.clone(), cold))
2700                    }
2701                }
2702                // BRIN / NSW carry no key→locator map. GIN handles
2703                // its own cold preservation below in `preserved_gin_cold`.
2704                IndexKind::Nsw(_)
2705                | IndexKind::Brin { .. }
2706                | IndexKind::Gin(_)
2707                | IndexKind::GinTrgm(_)
2708                | IndexKind::GinFulltext(_) => None,
2709            })
2710            .collect();
2711
2712        // v7.12.3 — same cold-preservation pattern for GIN's
2713        // `word → Vec<RowLocator>` posting lists. Parallel to the
2714        // BTree pass above (different key type so a separate vec is
2715        // cleaner than a generic merge). v7.15.0: trigram-GIN
2716        // (`gin_trgm_ops`) shares the same posting-list shape, so
2717        // one pass handles both — the `RebuildKind` carries the
2718        // kind tag to drive resurrection.
2719        let preserved_gin_cold: Vec<(String, Vec<(String, RowLocator)>)> = self
2720            .indices
2721            .iter()
2722            .filter_map(|idx| match &idx.kind {
2723                // v7.17.0 Phase 2.2 — fulltext-GIN posting lists
2724                // share the `String → Vec<RowLocator>` shape, so
2725                // cold preservation handles all three GIN flavours
2726                // in one pass.
2727                IndexKind::Gin(map) | IndexKind::GinTrgm(map) | IndexKind::GinFulltext(map) => {
2728                    let cold: Vec<(String, RowLocator)> = map
2729                        .iter()
2730                        .flat_map(|(w, locs)| {
2731                            locs.iter()
2732                                .filter(|l| l.is_cold())
2733                                .copied()
2734                                .map(move |l| (w.clone(), l))
2735                        })
2736                        .collect();
2737                    if cold.is_empty() {
2738                        None
2739                    } else {
2740                        Some((idx.name.clone(), cold))
2741                    }
2742                }
2743                IndexKind::BTree(_) | IndexKind::Nsw(_) | IndexKind::Brin { .. } => None,
2744            })
2745            .collect();
2746
2747        // v6.7.1 — descriptor needs to capture index kind so the
2748        // rebuild loop can resurrect BTree / NSW / BRIN / GIN exactly
2749        // as they were. (NSW carries m; BRIN carries the column type
2750        // snapshot; BTree / GIN need no extra payload.)
2751        #[derive(Clone)]
2752        enum RebuildKind {
2753            BTree,
2754            Nsw(usize),
2755            Brin(DataType),
2756            Gin,
2757            GinTrgm,
2758            GinFulltext,
2759        }
2760        let descriptors: Vec<(String, usize, RebuildKind)> = self
2761            .indices
2762            .iter()
2763            .map(|idx| {
2764                let kind = match &idx.kind {
2765                    IndexKind::Nsw(g) => RebuildKind::Nsw(g.m),
2766                    IndexKind::Brin { column_type } => RebuildKind::Brin(*column_type),
2767                    IndexKind::BTree(_) => RebuildKind::BTree,
2768                    IndexKind::Gin(_) => RebuildKind::Gin,
2769                    IndexKind::GinTrgm(_) => RebuildKind::GinTrgm,
2770                    IndexKind::GinFulltext(_) => RebuildKind::GinFulltext,
2771                };
2772                (idx.name.clone(), idx.column_position, kind)
2773            })
2774            .collect();
2775        self.indices.clear();
2776        for (name, column_position, rebuild_kind) in descriptors {
2777            match rebuild_kind {
2778                RebuildKind::Nsw(m) => {
2779                    let idx = Index::new_nsw(name, column_position, m);
2780                    self.indices.push(idx);
2781                    let idx_pos = self.indices.len() - 1;
2782                    let row_indices: Vec<usize> = (0..self.rows.len()).collect();
2783                    for row_idx in row_indices {
2784                        nsw_insert_at(self, idx_pos, row_idx);
2785                    }
2786                }
2787                RebuildKind::Brin(column_type) => {
2788                    // BRIN has no in-memory rebuild — the summaries
2789                    // live in cold segments which freeze emits.
2790                    self.indices
2791                        .push(Index::new_brin(name, column_position, column_type));
2792                }
2793                RebuildKind::BTree => {
2794                    let mut idx = Index::new_btree(name, column_position);
2795                    if let IndexKind::BTree(map) = &mut idx.kind {
2796                        for (i, row) in self.rows.iter().enumerate() {
2797                            if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
2798                                let mut entries = map.get(&key).cloned().unwrap_or_default();
2799                                entries.push(RowLocator::Hot(i));
2800                                map.insert_mut(key, entries);
2801                            }
2802                        }
2803                    }
2804                    self.indices.push(idx);
2805                }
2806                RebuildKind::Gin => {
2807                    let mut idx = Index::new_gin(name, column_position);
2808                    if let IndexKind::Gin(map) = &mut idx.kind {
2809                        for (i, row) in self.rows.iter().enumerate() {
2810                            if let Value::TsVector(lexemes) = &row.values[column_position] {
2811                                for lex in lexemes {
2812                                    let mut entries =
2813                                        map.get(&lex.word).cloned().unwrap_or_default();
2814                                    entries.push(RowLocator::Hot(i));
2815                                    map.insert_mut(lex.word.clone(), entries);
2816                                }
2817                            }
2818                        }
2819                    }
2820                    self.indices.push(idx);
2821                }
2822                RebuildKind::GinTrgm => {
2823                    let mut idx = Index::new_gin_trgm(name, column_position);
2824                    if let IndexKind::GinTrgm(map) = &mut idx.kind {
2825                        for (i, row) in self.rows.iter().enumerate() {
2826                            if let Value::Text(s) = &row.values[column_position] {
2827                                for tri in trgm::extract_trigrams(s) {
2828                                    let mut entries = map.get(&tri).cloned().unwrap_or_default();
2829                                    entries.push(RowLocator::Hot(i));
2830                                    map.insert_mut(tri, entries);
2831                                }
2832                            }
2833                        }
2834                    }
2835                    self.indices.push(idx);
2836                }
2837                RebuildKind::GinFulltext => {
2838                    // v7.17.0 Phase 2.2 — re-derive the lexeme
2839                    // posting list from each TEXT/VARCHAR cell.
2840                    // Mirrors the GinTrgm rebuild shape but
2841                    // tokenises via `fts_simple::simple_lex`
2842                    // (same rule as `to_tsvector('simple')`).
2843                    let mut idx = Index::new_gin_fulltext(name, column_position);
2844                    if let IndexKind::GinFulltext(map) = &mut idx.kind {
2845                        for (i, row) in self.rows.iter().enumerate() {
2846                            if let Value::Text(s) = &row.values[column_position] {
2847                                for lex in fts_simple::simple_lex(s) {
2848                                    let mut entries = map.get(&lex).cloned().unwrap_or_default();
2849                                    entries.push(RowLocator::Hot(i));
2850                                    map.insert_mut(lex, entries);
2851                                }
2852                            }
2853                        }
2854                    }
2855                    self.indices.push(idx);
2856                }
2857            }
2858        }
2859
2860        // Re-attach preserved cold locators after the from-rows
2861        // rebuild. `register_cold_locators` handles the per-key
2862        // entries-vec append; no key collisions arise because the
2863        // rebuild loop above produced only Hot locators.
2864        for (idx_name, locators) in preserved_cold {
2865            // Errors here would only fire if the index disappeared
2866            // between snapshot and rebuild, which can't happen
2867            // because the rebuild restores the same descriptor set.
2868            let _ = self.register_cold_locators(&idx_name, locators);
2869        }
2870        // v7.12.3 — same for GIN posting-list cold locators.
2871        for (idx_name, locators) in preserved_gin_cold {
2872            let _ = self.register_gin_cold_locators(&idx_name, locators);
2873        }
2874    }
2875
2876    fn add_nsw_index_inner(
2877        &mut self,
2878        name: String,
2879        column_name: &str,
2880        m: usize,
2881        restore: Option<NswGraph>,
2882    ) -> Result<(), StorageError> {
2883        if self.indices.iter().any(|i| i.name == name) {
2884            return Err(StorageError::DuplicateIndex { name });
2885        }
2886        let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2887            StorageError::ColumnNotFound {
2888                column: column_name.into(),
2889            }
2890        })?;
2891        if !matches!(
2892            self.schema.columns[column_position].ty,
2893            DataType::Vector { .. }
2894        ) {
2895            return Err(StorageError::TypeMismatch {
2896                column: column_name.into(),
2897                expected: DataType::Vector {
2898                    dim: 0,
2899                    encoding: VecEncoding::F32,
2900                },
2901                actual: self.schema.columns[column_position].ty,
2902                position: column_position,
2903            });
2904        }
2905        if let Some(graph) = restore {
2906            self.indices.push(Index {
2907                name,
2908                column_position,
2909                kind: IndexKind::Nsw(graph),
2910                included_columns: Vec::new(),
2911                partial_predicate: None,
2912                expression: None,
2913                is_unique: false,
2914                extra_column_positions: Vec::new(),
2915            });
2916            return Ok(());
2917        }
2918        let idx = Index::new_nsw(name, column_position, m);
2919        self.indices.push(idx);
2920        let idx_pos = self.indices.len() - 1;
2921        // Bulk-build by walking the existing rows in order — each insert
2922        // sees the partial graph and links into it.
2923        let row_indices: Vec<usize> = (0..self.rows.len()).collect();
2924        for row_idx in row_indices {
2925            nsw_insert_at(self, idx_pos, row_idx);
2926        }
2927        Ok(())
2928    }
2929}
2930
2931/// v6.0.4 — re-encode a single cell to the target `VecEncoding`.
2932/// Used by `Table::rebuild_nsw_index` when ALTER INDEX REBUILD
2933/// includes the optional `WITH (encoding = …)` clause. Round-trip
2934/// goes through f32: `current → Vec<f32> → target`, leaving NULL
2935/// cells untouched. Returns `Unsupported` on a non-vector cell —
2936/// the caller should have rejected the schema before reaching this.
2937fn recode_vector_cell(cell: Value, target: VecEncoding) -> Result<Value, StorageError> {
2938    if matches!(cell, Value::Null) {
2939        return Ok(cell);
2940    }
2941    // Step 1 — extract the f32 representation of the source cell.
2942    let as_f32: Vec<f32> = match &cell {
2943        Value::Vector(v) => v.clone(),
2944        Value::Sq8Vector(q) => quantize::dequantize(q),
2945        Value::HalfVector(h) => h.to_f32_vec(),
2946        other => {
2947            return Err(StorageError::Unsupported(format!(
2948                "ALTER INDEX REBUILD: cannot recode non-vector cell {:?}",
2949                other.data_type()
2950            )));
2951        }
2952    };
2953    // Step 2 — encode into the target shape. `F32` is the identity
2954    // path (saves one alloc round-trip when the source is already
2955    // F32 — but `Value::Vector(as_f32)` is the right answer
2956    // regardless).
2957    Ok(match target {
2958        VecEncoding::F32 => Value::Vector(as_f32),
2959        VecEncoding::Sq8 => Value::Sq8Vector(quantize::quantize(&as_f32)),
2960        VecEncoding::F16 => Value::HalfVector(halfvec::HalfVector::from_f32_slice(&as_f32)),
2961    })
2962}
2963
2964/// Insert one row into the HNSW graph held by index slot `idx_pos`.
2965/// No-op when the row's value at the indexed column isn't a vector.
2966/// v6.0.1: handles `Value::Sq8Vector` by dequantising into an f32
2967/// "query" surface — the existing greedy + beam-search machinery
2968/// then uses `cell_to_query_metric_distance` to route every
2969/// distance call through the cell's actual encoding.
2970fn nsw_insert_at(table: &mut Table, idx_pos: usize, new_row_idx: usize) {
2971    let col_pos = table.indices[idx_pos].column_position;
2972    let cell_dim: Option<usize> = match &table.rows[new_row_idx].values[col_pos] {
2973        Value::Vector(v) => Some(v.len()),
2974        Value::Sq8Vector(q) => Some(q.bytes.len()),
2975        Value::HalfVector(h) => Some(h.dim()),
2976        _ => None,
2977    };
2978    let Some(dim) = cell_dim else {
2979        // Even non-vector rows occupy a level slot so per-node Vec
2980        // lengths stay aligned with `table.rows.len()`.
2981        ensure_node_slot(table, idx_pos, new_row_idx, 0);
2982        return;
2983    };
2984    if dim == 0 {
2985        ensure_node_slot(table, idx_pos, new_row_idx, 0);
2986        return;
2987    }
2988    let level = nsw_assign_level(new_row_idx);
2989    ensure_node_slot(table, idx_pos, new_row_idx, level);
2990    let (entry, entry_level, m) = match &table.indices[idx_pos].kind {
2991        IndexKind::Nsw(g) => (g.entry, g.entry_level, g.m),
2992        IndexKind::BTree(_)
2993        | IndexKind::Brin { .. }
2994        | IndexKind::Gin(_)
2995        | IndexKind::GinTrgm(_)
2996        | IndexKind::GinFulltext(_) => {
2997            unreachable!("nsw_insert_at on a non-NSW index")
2998        }
2999    };
3000    // First node ever — declare it the entry (it gets its own level).
3001    if entry.is_none() {
3002        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
3003            g.entry = Some(new_row_idx);
3004            g.entry_level = level;
3005            *g.levels
3006                .get_mut(new_row_idx)
3007                .expect("levels slot padded by ensure_node_slot") = level;
3008        }
3009        return;
3010    }
3011    // Set the node's recorded level.
3012    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
3013        *g.levels
3014            .get_mut(new_row_idx)
3015            .expect("levels slot padded by ensure_node_slot") = level;
3016    }
3017    let query = match &table.rows[new_row_idx].values[col_pos] {
3018        Value::Vector(v) => v.clone(),
3019        // v6.0.1: dequantise the inserted SQ8 cell into an f32 query
3020        // surface so the existing greedy / beam machinery can route
3021        // distances through `cell_to_query_metric_distance`. The
3022        // small dequantisation error is what the recall@10 ≥ 0.95
3023        // envelope already accounts for (V6_DESIGN deliberation #3).
3024        Value::Sq8Vector(q) => quantize::dequantize(q),
3025        // v6.0.3: halfvec dequant is bit-exact at the storage layer,
3026        // so the inserted query is a faithful representation.
3027        Value::HalfVector(h) => h.to_f32_vec(),
3028        _ => return,
3029    };
3030    // Phase 1: greedy descend from `entry` down to `level + 1`, keeping
3031    // exactly one current best so the next layer starts from it.
3032    let mut current = entry.expect("entry was Some above");
3033    let mut current_d = vec_l2_sq(table, col_pos, current, &query);
3034    if entry_level > level {
3035        for layer in (level + 1..=entry_level).rev() {
3036            (current, current_d) =
3037                greedy_layer_walk(table, idx_pos, layer, current, current_d, &query);
3038        }
3039    }
3040    // Phase 2: from `min(level, entry_level)` down to 0, beam-search
3041    // `ef_construction` candidates, run the HNSW §4 heuristic neighbour
3042    // selection over them, and connect bidirectionally.
3043    let top = level.min(entry_level);
3044    let ef = (m * 2).max(8);
3045    for layer in (0..=top).rev() {
3046        let cap = if layer == 0 { m * 2 } else { m };
3047        let mut candidates = layer_beam_search(
3048            table,
3049            idx_pos,
3050            layer,
3051            current,
3052            current_d,
3053            &query,
3054            ef,
3055            NswMetric::L2,
3056        );
3057        candidates.retain(|&(_, n)| n != new_row_idx);
3058        // Take the closest as the entry for the next layer down — done
3059        // before heuristic narrowing because the heuristic can reorder.
3060        if let Some(&(d, n)) = candidates.first() {
3061            current = n;
3062            current_d = d;
3063        }
3064        let peers = select_neighbours_heuristic(&candidates, cap, table, col_pos);
3065        connect_at_layer(table, idx_pos, layer, new_row_idx, &peers);
3066    }
3067    // Phase 3: if the new node climbed above the current entry, take
3068    // over as entry so future inserts/searches start from the new top.
3069    if level > entry_level
3070        && let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
3071    {
3072        g.entry = Some(new_row_idx);
3073        g.entry_level = level;
3074    }
3075}
3076
3077/// Make sure `layers[*][new_row_idx]` and `levels[new_row_idx]` exist,
3078/// padding with empty/zero entries as needed. Also grows `layers` to
3079/// accommodate the node's top `level`.
3080fn ensure_node_slot(table: &mut Table, idx_pos: usize, new_row_idx: usize, level: u8) {
3081    let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind else {
3082        unreachable!("ensure_node_slot on a BTree index");
3083    };
3084    while g.layers.len() <= level as usize {
3085        g.layers.push(PersistentVec::new());
3086    }
3087    while g.levels.len() <= new_row_idx {
3088        g.levels.push_mut(0);
3089    }
3090    for layer_vec in &mut g.layers {
3091        while layer_vec.len() <= new_row_idx {
3092            layer_vec.push_mut(Vec::new());
3093        }
3094    }
3095}
3096
3097/// Single-step greedy walk on one layer: from `current` (with cached
3098/// distance `current_d`), inspect that node's neighbours at `layer` and
3099/// hop to the closest if it beats `current_d`. Repeat until no move
3100/// improves the distance. Cheap variant of beam-search used for the
3101/// "descend" phase that only needs one survivor per layer.
3102fn greedy_layer_walk(
3103    table: &Table,
3104    idx_pos: usize,
3105    layer: u8,
3106    mut current: usize,
3107    mut current_d: f32,
3108    query: &[f32],
3109) -> (usize, f32) {
3110    let g = match &table.indices[idx_pos].kind {
3111        IndexKind::Nsw(g) => g,
3112        IndexKind::BTree(_)
3113        | IndexKind::Brin { .. }
3114        | IndexKind::Gin(_)
3115        | IndexKind::GinTrgm(_)
3116        | IndexKind::GinFulltext(_) => {
3117            return (current, current_d);
3118        }
3119    };
3120    let col_pos = table.indices[idx_pos].column_position;
3121    loop {
3122        let neighbours: &[u32] = g
3123            .layers
3124            .get(layer as usize)
3125            .and_then(|layer_v| layer_v.get(current))
3126            .map_or(&[][..], Vec::as_slice);
3127        let mut best = current;
3128        let mut best_d = current_d;
3129        for &n in neighbours {
3130            let n = n as usize;
3131            let d = vec_l2_sq(table, col_pos, n, query);
3132            if d < best_d {
3133                best = n;
3134                best_d = d;
3135            }
3136        }
3137        if best == current {
3138            return (current, current_d);
3139        }
3140        current = best;
3141        current_d = best_d;
3142    }
3143}
3144
3145/// Beam search on one layer starting from `entry_node` with cached
3146/// `entry_d`. Returns the top `ef` candidates in ascending-distance
3147/// order. Caller picks the closest as the next layer's entry and / or
3148/// trims to M for connection.
3149///
3150/// v3.0.1: uses two `BinaryHeap`s (min-heap for the open frontier,
3151/// max-heap for the working top-`ef` results) and a `Vec<bool>` visited
3152/// bitmap, replacing the v2.x `Vec` + `partition_point` + `BTreeSet`
3153/// implementation. Same algorithm shape (HNSW search algorithm 2 from
3154/// the paper); the data-structure swap cuts per-visit cost from
3155/// `O(ef + log row_count)` to amortised `O(log ef)`.
3156#[allow(clippy::too_many_arguments)] // Beam search threads layer, entry, query, ef, metric — each is intrinsic. Bundling them into a config struct hides the call sites.
3157fn layer_beam_search(
3158    table: &Table,
3159    idx_pos: usize,
3160    layer: u8,
3161    entry_node: usize,
3162    entry_d: f32,
3163    query: &[f32],
3164    ef: usize,
3165    metric: NswMetric,
3166) -> Vec<(f32, usize)> {
3167    let g = match &table.indices[idx_pos].kind {
3168        IndexKind::Nsw(g) => g,
3169        IndexKind::BTree(_)
3170        | IndexKind::Brin { .. }
3171        | IndexKind::Gin(_)
3172        | IndexKind::GinTrgm(_)
3173        | IndexKind::GinFulltext(_) => return Vec::new(),
3174    };
3175    let col_pos = table.indices[idx_pos].column_position;
3176    let d0 = if matches!(metric, NswMetric::L2) {
3177        entry_d
3178    } else {
3179        cell_to_query_metric_distance(table, col_pos, entry_node, query, metric)
3180    };
3181    let row_count = table.rows.len();
3182    let mut visited: Vec<bool> = alloc::vec![false; row_count];
3183    if entry_node < row_count {
3184        visited[entry_node] = true;
3185    }
3186    // candidates: min-heap by distance (Closest wrapper) — frontier
3187    // results:    max-heap by distance (Furthest wrapper) — top-ef working set
3188    let mut candidates: alloc::collections::BinaryHeap<NodeClosest> =
3189        alloc::collections::BinaryHeap::with_capacity(ef);
3190    let mut results: alloc::collections::BinaryHeap<NodeFurthest> =
3191        alloc::collections::BinaryHeap::with_capacity(ef);
3192    candidates.push(NodeClosest {
3193        dist: d0,
3194        node: entry_node,
3195    });
3196    results.push(NodeFurthest {
3197        dist: d0,
3198        node: entry_node,
3199    });
3200    while let Some(cur) = candidates.pop() {
3201        let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
3202        if cur.dist > worst && results.len() >= ef {
3203            break;
3204        }
3205        let neighbours: &[u32] = g
3206            .layers
3207            .get(layer as usize)
3208            .and_then(|layer_v| layer_v.get(cur.node))
3209            .map_or(&[][..], Vec::as_slice);
3210        for &n in neighbours {
3211            let n = n as usize;
3212            if n >= row_count || visited[n] {
3213                continue;
3214            }
3215            visited[n] = true;
3216            // v6.0.1: cell-aware distance — F32 cells take the
3217            // existing scalar metric, SQ8 cells route through
3218            // the asymmetric ADC variant for the same metric.
3219            let dn = cell_to_query_metric_distance(table, col_pos, n, query, metric);
3220            if !dn.is_finite() {
3221                continue;
3222            }
3223            let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
3224            if results.len() < ef || dn < worst {
3225                results.push(NodeFurthest { dist: dn, node: n });
3226                if results.len() > ef {
3227                    results.pop();
3228                }
3229                candidates.push(NodeClosest { dist: dn, node: n });
3230            }
3231        }
3232    }
3233    // Drain results (max-heap order) and re-sort ascending so callers
3234    // can take `closest = result[0]` without flipping.
3235    let mut out: Vec<(f32, usize)> = results.into_iter().map(|c| (c.dist, c.node)).collect();
3236    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
3237    out
3238}
3239
/// Min-heap wrapper: smaller `dist` → higher priority in a `BinaryHeap`
/// (which is a max-heap), so the distance comparison is flipped.
///
/// The ordering is a genuine total order:
/// * a NaN distance sorts last (lowest priority), behind every real
///   distance including `+∞`;
/// * equal distances are tie-broken by `node`, lower row index first,
///   so `Ord` agrees with `PartialEq` (two entries are equal only when
///   both distance and node match).
#[derive(Debug, Clone, Copy)]
struct NodeClosest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeClosest {
    fn eq(&self, other: &Self) -> bool {
        // Defined through `cmp` so `==` and `Ordering::Equal` can never
        // disagree (and NaN entries stay reflexive).
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeClosest {}
impl PartialOrd for NodeClosest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeClosest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        let by_dist = match (self.dist.is_nan(), other.dist.is_nan()) {
            (true, true) => core::cmp::Ordering::Equal,
            // NaN is the lowest priority on the frontier.
            (true, false) => core::cmp::Ordering::Less,
            (false, true) => core::cmp::Ordering::Greater,
            // Reversed: smaller dist = greater priority. Both sides are
            // non-NaN here, so `partial_cmp` always yields `Some`.
            (false, false) => other
                .dist
                .partial_cmp(&self.dist)
                .unwrap_or(core::cmp::Ordering::Equal),
        };
        // Reversed as well: the lower row index pops first on a tie.
        by_dist.then_with(|| other.node.cmp(&self.node))
    }
}
3268
/// Max-heap wrapper: larger `dist` sits at the top so the worst result
/// can be evicted in O(log n) when a better candidate arrives.
///
/// The ordering is a genuine total order:
/// * a NaN distance counts as the furthest of all, above `+∞`;
/// * equal distances are tie-broken by `node`, higher row index on
///   top, so `Ord` agrees with `PartialEq` (two entries are equal only
///   when both distance and node match).
#[derive(Debug, Clone, Copy)]
struct NodeFurthest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeFurthest {
    fn eq(&self, other: &Self) -> bool {
        // Defined through `cmp` so `==` and `Ordering::Equal` can never
        // disagree (and NaN entries stay reflexive).
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeFurthest {}
impl PartialOrd for NodeFurthest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeFurthest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        let by_dist = match (self.dist.is_nan(), other.dist.is_nan()) {
            (true, true) => core::cmp::Ordering::Equal,
            // NaN is treated as the worst possible result.
            (true, false) => core::cmp::Ordering::Greater,
            (false, true) => core::cmp::Ordering::Less,
            // Both sides are non-NaN here, so `partial_cmp` always
            // yields `Some`.
            (false, false) => self
                .dist
                .partial_cmp(&other.dist)
                .unwrap_or(core::cmp::Ordering::Equal),
        };
        by_dist.then_with(|| self.node.cmp(&other.node))
    }
}
3294
3295/// HNSW paper §4 algorithm 4: pick `m` neighbours from `candidates` so
3296/// that each chosen point isn't already covered by a closer chosen
3297/// point. Improves graph diversity → fewer hops needed at search time.
3298///
3299/// `candidates` arrives sorted ascending by distance-to-query. We walk
3300/// it in order, keeping a candidate only when no already-chosen point
3301/// is closer to it than the query is. Result is a vector of row
3302/// indices (length ≤ `m`).
3303fn select_neighbours_heuristic(
3304    candidates: &[(f32, usize)],
3305    m: usize,
3306    table: &Table,
3307    col_pos: usize,
3308) -> Vec<usize> {
3309    let mut chosen: Vec<usize> = Vec::with_capacity(m);
3310    for &(d_q, e) in candidates {
3311        if chosen.len() >= m {
3312            break;
3313        }
3314        // v6.0.1: works on either `Value::Vector` (F32) or
3315        // `Value::Sq8Vector` (Sq8) cells — `cell_l2_sq` dispatches
3316        // on encoding. A non-vector cell yields `f32::INFINITY`
3317        // which the `< d_q` test will never accept.
3318        if !matches!(
3319            table.rows.get(e).and_then(|r| r.values.get(col_pos)),
3320            Some(Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_))
3321        ) {
3322            continue;
3323        }
3324        let mut covered = false;
3325        for &r in &chosen {
3326            // dist(e, r) measured in the same metric the topology was
3327            // built with (L2). If a chosen `r` is closer to `e` than
3328            // the query is, `r` already "covers" `e` for navigation.
3329            if cell_l2_sq(table, col_pos, e, r) < d_q {
3330                covered = true;
3331                break;
3332            }
3333        }
3334        if !covered {
3335            chosen.push(e);
3336        }
3337    }
3338    chosen
3339}
3340
3341/// Bidirectionally connect `new_row_idx` to each of `peers` at `layer`,
3342/// trimming each endpoint's adjacency to that layer's degree cap by
3343/// keeping only the closest neighbours.
3344fn connect_at_layer(
3345    table: &mut Table,
3346    idx_pos: usize,
3347    layer: u8,
3348    new_row_idx: usize,
3349    peers: &[usize],
3350) {
3351    let col_pos = table.indices[idx_pos].column_position;
3352    let cap = match &table.indices[idx_pos].kind {
3353        IndexKind::Nsw(g) => g.cap_for_layer(layer),
3354        IndexKind::BTree(_)
3355        | IndexKind::Brin { .. }
3356        | IndexKind::Gin(_)
3357        | IndexKind::GinTrgm(_)
3358        | IndexKind::GinFulltext(_) => return,
3359    };
3360    // v6.1.x: NSW adjacency stores neighbour row indices as u32 (4 B
3361    // each) rather than usize (8 B on 64-bit). Boundary casts here
3362    // assert the row count fits in u32 — the catalog already enforces
3363    // ≤ 4G rows per table, so the conversion can't lose data.
3364    let new_row_u32 = u32::try_from(new_row_idx).expect("row index fits in u32");
3365    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
3366        let layer_v = &mut g.layers[layer as usize];
3367        if let Some(slot) = layer_v.get_mut(new_row_idx) {
3368            *slot = peers
3369                .iter()
3370                .map(|&p| u32::try_from(p).expect("row index fits in u32"))
3371                .collect();
3372        }
3373    }
3374    for &peer in peers {
3375        // Skip peers whose indexed cell isn't a vector — same fence
3376        // as the F32 path; SQ8 cells flow through `cell_l2_sq`
3377        // below without dequantising.
3378        if !matches!(
3379            &table.rows[peer].values[col_pos],
3380            Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_)
3381        ) {
3382            continue;
3383        }
3384        // 1. add the new node to peer's adjacency
3385        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
3386            let layer_v = &mut g.layers[layer as usize];
3387            if let Some(slot) = layer_v.get_mut(peer)
3388                && !slot.contains(&new_row_u32)
3389            {
3390                slot.push(new_row_u32);
3391            }
3392        }
3393        // 2. if peer is over budget, rebuild its adjacency with the
3394        //    HNSW §4 heuristic — same diversity criterion as the
3395        //    insert path so connectivity stays consistent.
3396        let needs_trim = match &table.indices[idx_pos].kind {
3397            IndexKind::Nsw(g) => g.layers[layer as usize][peer].len() > cap,
3398            IndexKind::BTree(_)
3399            | IndexKind::Brin { .. }
3400            | IndexKind::Gin(_)
3401            | IndexKind::GinTrgm(_)
3402            | IndexKind::GinFulltext(_) => false,
3403        };
3404        if needs_trim {
3405            let current_peers: Vec<usize> = match &table.indices[idx_pos].kind {
3406                IndexKind::Nsw(g) => g.layers[layer as usize][peer]
3407                    .iter()
3408                    .map(|&n| n as usize)
3409                    .collect(),
3410                IndexKind::BTree(_)
3411                | IndexKind::Brin { .. }
3412                | IndexKind::Gin(_)
3413                | IndexKind::GinTrgm(_)
3414                | IndexKind::GinFulltext(_) => continue,
3415            };
3416            // Sort by distance from `peer`'s cell ascending so the
3417            // heuristic receives candidates closest-first. `cell_l2_sq`
3418            // dispatches on encoding so SQ8 columns trim using
3419            // symmetric ADC.
3420            let mut tagged: Vec<(f32, usize)> = current_peers
3421                .iter()
3422                .map(|&p| (cell_l2_sq(table, col_pos, peer, p), p))
3423                .collect();
3424            tagged.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
3425            let kept = select_neighbours_heuristic(&tagged, cap, table, col_pos);
3426            if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
3427                && let Some(slot) = g.layers[layer as usize].get_mut(peer)
3428            {
3429                *slot = kept
3430                    .into_iter()
3431                    .map(|p| u32::try_from(p).expect("row index fits in u32"))
3432                    .collect();
3433            }
3434        }
3435    }
3436}
3437
3438/// Squared L2 distance from `query` (raw f32) to the cell at
3439/// `(row, col_pos)`. Dispatches on cell encoding: `Value::Vector`
3440/// (F32) uses `l2_distance_sq`; `Value::Sq8Vector` uses
3441/// `sq8_l2_distance_sq_asymmetric` (the v6.0.1 quantised path).
3442/// Returns `f32::INFINITY` for any non-vector cell so callers can
3443/// compare uniformly.
3444fn vec_l2_sq(table: &Table, col_pos: usize, row: usize, query: &[f32]) -> f32 {
3445    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
3446        Some(Value::Vector(v)) if v.len() == query.len() => l2_distance_sq(v, query),
3447        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => {
3448            quantize::sq8_l2_distance_sq_asymmetric(q, query)
3449        }
3450        // v6.0.6: halfvec → fused NEON SIMD kernel; no Vec<f32>
3451        // allocation. v6.0.3 used `to_f32_vec()` + f32 NEON which
3452        // was correct but allocated per call (5× slower than F32).
3453        Some(Value::HalfVector(h)) if h.dim() == query.len() => {
3454            halfvec::half_l2_distance_sq_asymmetric(h, query)
3455        }
3456        _ => f32::INFINITY,
3457    }
3458}
3459
3460/// Squared L2 distance between two stored cells (no f32 query in
3461/// sight). Used during HNSW graph build — both endpoints are
3462/// rows already in the table, so symmetric ADC applies for SQ8
3463/// columns. Mixed-encoding cells within one column are a
3464/// schema-level impossibility (INSERT-time coercion enforces
3465/// uniform encoding), so the catch-all is an abort.
3466fn cell_l2_sq(table: &Table, col_pos: usize, row_a: usize, row_b: usize) -> f32 {
3467    let Some(cell_a) = table.rows.get(row_a).and_then(|r| r.values.get(col_pos)) else {
3468        return f32::INFINITY;
3469    };
3470    let Some(cell_b) = table.rows.get(row_b).and_then(|r| r.values.get(col_pos)) else {
3471        return f32::INFINITY;
3472    };
3473    match (cell_a, cell_b) {
3474        (Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => l2_distance_sq(a, b),
3475        (Value::Sq8Vector(a), Value::Sq8Vector(b)) if a.bytes.len() == b.bytes.len() => {
3476            quantize::sq8_l2_distance_sq(a, b)
3477        }
3478        // v6.0.6: halfvec symmetric NEON — fused SIMD kernel that
3479        // loads both cells' raw u16 bits, expands to f32 lanes
3480        // inline, FMA-accumulates the squared diff. No Vec<f32>
3481        // allocation per call.
3482        (Value::HalfVector(a), Value::HalfVector(b)) if a.dim() == b.dim() => {
3483            halfvec::half_l2_distance_sq(a, b)
3484        }
3485        _ => f32::INFINITY,
3486    }
3487}
3488
3489/// kNN-search-time distance: stored cell → f32 query under the
3490/// caller's metric. Dispatches on cell encoding so SQ8 columns
3491/// take the ADC path with the right asymmetric variant. NaN /
3492/// dim-mismatch / non-vector → `f32::INFINITY`.
3493fn cell_to_query_metric_distance(
3494    table: &Table,
3495    col_pos: usize,
3496    row: usize,
3497    query: &[f32],
3498    metric: NswMetric,
3499) -> f32 {
3500    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
3501        Some(Value::Vector(v)) if v.len() == query.len() => metric_distance(metric, v, query),
3502        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => match metric {
3503            NswMetric::L2 => quantize::sq8_l2_distance_sq_asymmetric(q, query),
3504            NswMetric::InnerProduct => quantize::sq8_inner_product_asymmetric(q, query),
3505            NswMetric::Cosine => quantize::sq8_cosine_distance_asymmetric(q, query),
3506        },
3507        // v6.0.6: halfvec dispatches by metric to fused NEON
3508        // kernels — no Vec<f32> allocation per call.
3509        Some(Value::HalfVector(h)) if h.dim() == query.len() => match metric {
3510            NswMetric::L2 => halfvec::half_l2_distance_sq_asymmetric(h, query),
3511            NswMetric::InnerProduct => halfvec::half_inner_product_asymmetric(h, query),
3512            NswMetric::Cosine => halfvec::half_cosine_distance_asymmetric(h, query),
3513        },
3514        _ => f32::INFINITY,
3515    }
3516}
3517
/// Distance metric used at NSW search time. The graph topology is
/// always built with `L2`; querying with `InnerProduct` / `Cosine`
/// reuses the same edges but ranks candidates by the chosen metric.
/// For the corpus-sized graphs this loses negligible recall vs
/// building separate per-metric graphs.
///
/// Every variant is expressed as a distance, i.e. "smaller = closer",
/// so search code can order candidates without knowing which metric is
/// in play.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NswMetric {
    /// Squared Euclidean — ranks "smaller = closer" (the sqrt is
    /// monotonic so we skip it for ordering).
    L2,
    /// Negated dot product, matching pgvector `<#>` convention so
    /// "smaller = more similar" holds across all three metrics.
    InnerProduct,
    /// Cosine distance `1 - cos(a, b)`. Zero-norm operand yields
    /// `f32::INFINITY` so it sorts last.
    Cosine,
}
3535
3536/// Multi-layer HNSW kNN search: greedy-descend from the entry to layer 0,
3537/// then beam-search there with the requested `ef` to return the top `k`
3538/// results under the caller-chosen metric. Topology was built with L2 —
3539/// upper-layer descent uses L2 as a coarse heuristic; final beam search
3540/// runs in the requested metric so rankings are correct for `<#>` / `<=>`.
3541fn nsw_search(
3542    table: &Table,
3543    idx_pos: usize,
3544    query: &[f32],
3545    k: usize,
3546    ef: usize,
3547    metric: NswMetric,
3548) -> Vec<(f32, usize)> {
3549    let (entry, entry_level) = match &table.indices[idx_pos].kind {
3550        IndexKind::Nsw(g) => (g.entry, g.entry_level),
3551        IndexKind::BTree(_)
3552        | IndexKind::Brin { .. }
3553        | IndexKind::Gin(_)
3554        | IndexKind::GinTrgm(_)
3555        | IndexKind::GinFulltext(_) => return Vec::new(),
3556    };
3557    let Some(entry) = entry else {
3558        return Vec::new();
3559    };
3560    let col_pos = table.indices[idx_pos].column_position;
3561    // v6.0.1 step 5: SQ8 columns over-fetch by `SQ8_RERANK_OVER_FETCH`
3562    // so the rerank pass below sees enough candidates to recover
3563    // recall after the ADC re-ordering. F32 + F16 columns skip the
3564    // over-fetch — F32 distances are exact, F16 dequant is
3565    // bit-exact at the storage layer so the beam search already
3566    // ranks under the column's full precision.
3567    let sq8 = matches!(
3568        table.schema.columns.get(col_pos).map(|c| c.ty),
3569        Some(DataType::Vector {
3570            encoding: VecEncoding::Sq8,
3571            ..
3572        })
3573    );
3574    let ef = if sq8 {
3575        ef.max(k).max(k * SQ8_RERANK_OVER_FETCH)
3576    } else {
3577        ef.max(k)
3578    };
3579    // Descend by L2 (the topology metric) so layers prune consistently.
3580    let entry_d = vec_l2_sq(table, col_pos, entry, query);
3581    let mut current = entry;
3582    let mut current_d = entry_d;
3583    for layer in (1..=entry_level).rev() {
3584        (current, current_d) = greedy_layer_walk(table, idx_pos, layer, current, current_d, query);
3585    }
3586    // Final beam search on layer 0 under the caller's metric.
3587    let mut results = layer_beam_search(table, idx_pos, 0, current, current_d, query, ef, metric);
3588    if sq8 {
3589        results = sq8_rerank(table, col_pos, &results, query, metric);
3590    }
3591    results.truncate(k);
3592    results
3593}
3594
3595/// v6.0.1 step 5: re-score ADC top-`K*3` candidates with the
3596/// dequantised cell vs the f32 query, then re-sort. Recovers the
3597/// recall the SQ8 ADC sacrifices for 4× compression — the design's
3598/// "f32 rerank step is on by default" path (deliberation #3).
3599/// `metric` is the same metric the beam search used; the rerank
3600/// arithmetic re-derives the exact distance under that metric.
3601fn sq8_rerank(
3602    table: &Table,
3603    col_pos: usize,
3604    candidates: &[(f32, usize)],
3605    query: &[f32],
3606    metric: NswMetric,
3607) -> Vec<(f32, usize)> {
3608    let mut out: Vec<(f32, usize)> = candidates
3609        .iter()
3610        .filter_map(|&(adc_d, row)| {
3611            let cell = table.rows.get(row).and_then(|r| r.values.get(col_pos))?;
3612            let Value::Sq8Vector(q) = cell else {
3613                // F32 cells shouldn't reach this path (sq8 fence
3614                // above), but stay defensive: pass through with
3615                // the ADC distance unchanged.
3616                return Some((adc_d, row));
3617            };
3618            let deq = quantize::dequantize(q);
3619            if deq.len() != query.len() {
3620                return None;
3621            }
3622            Some((metric_distance(metric, &deq, query), row))
3623        })
3624        .collect();
3625    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
3626    out
3627}
3628
/// Multiplier applied to `k` so the SQ8 rerank pass sees a wider
/// candidate set. 3× is the design-stage value; v6.0.5 sweep work
/// can re-tune once full corpus profiling is in.
///
/// Consumed by `nsw_search`, which widens its beam to at least
/// `k * SQ8_RERANK_OVER_FETCH` for SQ8-encoded columns.
const SQ8_RERANK_OVER_FETCH: usize = 3;
3633
3634fn metric_distance(metric: NswMetric, a: &[f32], b: &[f32]) -> f32 {
3635    match metric {
3636        NswMetric::L2 => l2_distance_sq(a, b),
3637        NswMetric::InnerProduct => -inner_product_f32(a, b),
3638        NswMetric::Cosine => {
3639            let (dot, na, nb) = cosine_dot_norms_f32(a, b);
3640            if na == 0.0 || nb == 0.0 {
3641                return f32::INFINITY;
3642            }
3643            // `f32::sqrt` lives in std, so hand-roll Newton-Raphson on
3644            // f64 — same trick the L2 binary op already uses.
3645            let denom = sqrt_newton_f32(na) * sqrt_newton_f32(nb);
3646            1.0 - dot / denom
3647        }
3648    }
3649}
3650
3651/// v6.0.2: dispatch wrapper for the f32 dot product (used by `<#>` +
3652/// the cosine numerator). NEON path when `len % 4 == 0 && len >= 4`,
3653/// scalar fallback otherwise. Returns the positive dot — callers
3654/// negate for the pgvector `<#>` "smaller = closer" convention.
3655///
3656/// Public so perf gates + downstream benches can microbenchmark the
3657/// dispatch directly; not part of the STABILITY contract — internal
3658/// SIMD layout can evolve in any release.
3659#[doc(hidden)]
3660#[inline]
3661pub fn inner_product_f32(a: &[f32], b: &[f32]) -> f32 {
3662    #[cfg(target_arch = "aarch64")]
3663    {
3664        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3665            // SAFETY: NEON is a baseline aarch64 feature; preconditions
3666            // (matching lengths, ≥ 1 full lane group) are checked above.
3667            return unsafe { inner_product_neon(a, b) };
3668        }
3669    }
3670    inner_product_scalar(a, b)
3671}
3672
/// Portable dot product: sums `a[i] * b[i]` left to right in f32. If the
/// slices differ in length, only the common prefix contributes; empty
/// input yields `0.0`.
fn inner_product_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b.iter())
        .fold(0.0_f32, |acc, (lhs, rhs)| acc + lhs * rhs)
}
3680
/// NEON kernel behind `inner_product_f32`: dot product of `a` and `b`,
/// accumulated four f32 lanes at a time.
///
/// # Safety
///
/// Both slices are read through raw pointers for indices below
/// `a.len()`, so the caller must guarantee `b.len() >= a.len()`. Only
/// whole groups of four elements are processed — a trailing
/// `a.len() % 4` remainder would be silently ignored — so callers are
/// expected to pass lengths that are a multiple of 4 (the dispatcher
/// checks both conditions before calling).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn inner_product_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32,
    };
    unsafe {
        // Two parallel accumulators (same trick as L2 NEON) so the
        // FMA dependency chain doesn't serialise.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc0 = zero;
        let mut acc1 = zero;
        let n = a.len();
        let mut i = 0usize;
        // Main loop: 8 floats per iteration, one group of 4 per
        // accumulator.
        while i + 8 <= n {
            let av0 = vld1q_f32(a.as_ptr().add(i));
            let bv0 = vld1q_f32(b.as_ptr().add(i));
            acc0 = vfmaq_f32(acc0, av0, bv0);
            let av1 = vld1q_f32(a.as_ptr().add(i + 4));
            let bv1 = vld1q_f32(b.as_ptr().add(i + 4));
            acc1 = vfmaq_f32(acc1, av1, bv1);
            i += 8;
        }
        // At most one leftover group of 4 when `n` is not a multiple
        // of 8.
        while i + 4 <= n {
            let av = vld1q_f32(a.as_ptr().add(i));
            let bv = vld1q_f32(b.as_ptr().add(i));
            acc0 = vfmaq_f32(acc0, av, bv);
            i += 4;
        }
        // Combine the two accumulators, then sum across lanes.
        vaddvq_f32(vaddq_f32(acc0, acc1))
    }
}
3714
3715/// v6.0.2: dispatch wrapper for the three accumulators (`dot`, `||a||²`,
3716/// `||b||²`) cosine needs. Same NEON pre-condition as the L2 / IP
3717/// paths; same scalar fallback shape.
3718///
3719/// Public for benchmarking only (see `inner_product_f32`); not in the
3720/// STABILITY contract.
3721#[doc(hidden)]
3722#[inline]
3723pub fn cosine_dot_norms_f32(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
3724    #[cfg(target_arch = "aarch64")]
3725    {
3726        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3727            // SAFETY: see `inner_product_neon`.
3728            return unsafe { cosine_dot_norms_neon(a, b) };
3729        }
3730    }
3731    cosine_dot_norms_scalar(a, b)
3732}
3733
/// Portable single-pass computation of `(dot, ||a||², ||b||²)` in f32.
/// If the slices differ in length only the common prefix is visited;
/// empty input yields `(0.0, 0.0, 0.0)`.
fn cosine_dot_norms_scalar(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    a.iter().zip(b.iter()).fold(
        (0.0_f32, 0.0_f32, 0.0_f32),
        |(dot, norm_a_sq, norm_b_sq), (lhs, rhs)| {
            (
                dot + lhs * rhs,
                norm_a_sq + lhs * lhs,
                norm_b_sq + rhs * rhs,
            )
        },
    )
}
3745
/// NEON kernel behind `cosine_dot_norms_f32`: one pass over `a` and `b`
/// producing `(dot, ||a||², ||b||²)`, four f32 lanes at a time.
///
/// # Safety
///
/// Both slices are read through raw pointers for indices below
/// `a.len()`, so the caller must guarantee `b.len() >= a.len()`. Only
/// whole groups of four elements are processed — a trailing
/// `a.len() % 4` remainder would be silently ignored — so callers are
/// expected to pass lengths that are a multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names, clippy::similar_names)]
unsafe fn cosine_dot_norms_neon(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    use core::arch::aarch64::{float32x4_t, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32};
    unsafe {
        // One accumulator register per output sum.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc_dot = zero;
        let mut acc_na = zero;
        let mut acc_nb = zero;
        let n = a.len();
        let mut i = 0usize;
        while i + 4 <= n {
            // Each group of 4 is loaded once and feeds all three sums.
            let av = vld1q_f32(a.as_ptr().add(i));
            let bv = vld1q_f32(b.as_ptr().add(i));
            acc_dot = vfmaq_f32(acc_dot, av, bv);
            acc_na = vfmaq_f32(acc_na, av, av);
            acc_nb = vfmaq_f32(acc_nb, bv, bv);
            i += 4;
        }
        // Horizontal add of each accumulator's four lanes.
        (vaddvq_f32(acc_dot), vaddvq_f32(acc_na), vaddvq_f32(acc_nb))
    }
}
3769
/// Square root for `no_std` builds (`f32::sqrt` is a std-only method),
/// computed with Newton-Raphson: `g ← (g + x / g) / 2`.
///
/// * `x <= 0.0` (including `-0.0` and `-∞`) returns `0.0`.
/// * `NaN` and `+∞` are returned unchanged.
/// * Every other input converges to within rounding of `√x`.
///
/// The iteration is seeded by halving the float's exponent through its
/// bit pattern, which lands within a few percent of the root for normal
/// inputs. Seeding with `x` itself and running a fixed ten steps only
/// halves the estimate per step while it is far from the root, so it
/// never reached the answer for inputs far from 1 (e.g. `√1e6` came out
/// as ≈1296).
fn sqrt_newton_f32(x: f32) -> f32 {
    if x <= 0.0 {
        return 0.0;
    }
    if !x.is_finite() {
        // NaN stays NaN; +∞ stays +∞ (iterating would give ∞/∞ = NaN).
        return x;
    }
    // Shifting the bit pattern right halves the biased exponent; adding
    // back half the bias (127 << 22) re-centres it, giving ≈ √x.
    let seed = f32::from_bits((x.to_bits() >> 1) + 0x1fc0_0000);
    // After one step the estimate sits at or above the true root
    // (AM-GM), so from here on the sequence only decreases until it
    // hits the limit of f32 precision.
    let mut g = 0.5 * (seed + x / seed);
    // Normal inputs settle in 2-3 steps; the cap only matters for
    // subnormals, where the seed can be far off.
    for _ in 0..32 {
        let next = 0.5 * (g + x / g);
        if next >= g {
            break;
        }
        g = next;
    }
    g
}
3780
3781/// Squared Euclidean distance — used for ordering inside NSW (the sqrt
3782/// preserves the order). Caller takes sqrt before reporting back to SQL.
3783///
3784/// v3.3.2: aarch64 NEON path for `len % 4 == 0` (which covers every
3785/// HNSW-indexed VECTOR(N) where N is a multiple of 4 — i.e. all
3786/// production-shaped embeddings: 64, 128, 256, 384, 512, 768, 1024,
3787/// 1536, ...). Other shapes fall back to the scalar loop.
3788#[inline]
3789fn l2_distance_sq(a: &[f32], b: &[f32]) -> f32 {
3790    #[cfg(target_arch = "aarch64")]
3791    {
3792        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3793            // SAFETY: NEON is a baseline aarch64 feature (ARMv8);
3794            // the precondition is checked above (matching lengths,
3795            // multiple of 4, at least one 128-bit lane group).
3796            return unsafe { l2_distance_sq_neon(a, b) };
3797        }
3798    }
3799    l2_distance_sq_scalar(a, b)
3800}
3801
/// Portable squared Euclidean distance: sums `(a[i] - b[i])²` left to
/// right in f32. If the slices differ in length only the common prefix
/// contributes; empty input yields `0.0`.
fn l2_distance_sq_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b.iter()).fold(0.0_f32, |acc, (lhs, rhs)| {
        let diff = *lhs - *rhs;
        acc + diff * diff
    })
}
3810
/// NEON kernel behind `l2_distance_sq`: squared Euclidean distance
/// between `a` and `b`, four f32 lanes at a time.
///
/// # Safety
///
/// Both slices are read through raw pointers for indices below
/// `a.len()`, so the caller must guarantee `b.len() >= a.len()`. Only
/// whole groups of four elements are processed — a trailing
/// `a.len() % 4` remainder would be silently ignored — so callers are
/// expected to pass lengths that are a multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn l2_distance_sq_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32, vsubq_f32,
    };
    unsafe {
        // Two independent accumulator registers so the FMA dependency
        // chain doesn't serialise (each FMA depends on prior FMA).
        // Pre-conditions checked by caller: `a.len() == b.len()`,
        // `a.len() % 4 == 0`, `a.len() >= 4`.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc0 = zero;
        let mut acc1 = zero;
        let n = a.len();
        let mut i = 0usize;
        // Process 8 floats per iter when available (two parallel
        // accumulators). Tail of 4 falls into the second loop.
        while i + 8 <= n {
            let d0 = vsubq_f32(vld1q_f32(a.as_ptr().add(i)), vld1q_f32(b.as_ptr().add(i)));
            acc0 = vfmaq_f32(acc0, d0, d0);
            let d1 = vsubq_f32(
                vld1q_f32(a.as_ptr().add(i + 4)),
                vld1q_f32(b.as_ptr().add(i + 4)),
            );
            acc1 = vfmaq_f32(acc1, d1, d1);
            i += 8;
        }
        while i + 4 <= n {
            let d = vsubq_f32(vld1q_f32(a.as_ptr().add(i)), vld1q_f32(b.as_ptr().add(i)));
            acc0 = vfmaq_f32(acc0, d, d);
            i += 4;
        }
        // Combine the two accumulators, then sum across lanes.
        vaddvq_f32(vaddq_f32(acc0, acc1))
    }
}
3848
3849/// Public wrapper: run an NSW kNN search and return the top-k row
3850/// indices ordered by ascending distance under the given metric.
3851pub fn nsw_query(
3852    table: &Table,
3853    idx_name: &str,
3854    query: &[f32],
3855    k: usize,
3856    metric: NswMetric,
3857) -> Vec<usize> {
3858    let Some(idx_pos) = table.indices.iter().position(|i| i.name == idx_name) else {
3859        return Vec::new();
3860    };
3861    let ef = (k * 2).max(NSW_DEFAULT_M);
3862    let mut hits = nsw_search(table, idx_pos, query, k, ef, metric);
3863    hits.truncate(k);
3864    hits.into_iter().map(|(_, idx)| idx).collect()
3865}
3866
3867/// Find any NSW index on a column. Used by the planner to decide
3868/// whether an `ORDER BY col <-> literal LIMIT k` query can skip the
3869/// brute-force scan.
3870pub fn nsw_index_on(table: &Table, column_position: usize) -> Option<&Index> {
3871    table
3872        .indices
3873        .iter()
3874        .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_)))
3875}
3876
3877/// Catalog: insertion-ordered `Vec<Table>` for stable iter / serialize,
3878/// plus a `BTreeMap<String, usize>` sidecar index so `get` / `get_mut`
3879/// run in O(log n) instead of the old linear scan with per-element
3880/// string compares.
3881///
3882/// A pure `BTreeMap<String, Table>` was tried in an interim version
3883/// of v3.1.2 and regressed the single-table catalog benches by ~10%
3884/// (the per-element `BTreeMap` overhead outweighs the lookup win
3885/// when n is small). The sidecar shape preserves the insertion-order
3886/// iteration the on-disk encoding relies on and keeps `last_mut`
3887/// (used by the deserialize hot path) cheap.
3888#[derive(Debug, Clone, Default)]
3889pub struct Catalog {
3890    tables: Vec<Table>,
3891    /// `name → tables[index]`. Kept in lock-step with `tables`.
3892    /// `create_table` is the only write path.
3893    by_name: BTreeMap<String, usize>,
3894    /// v5.1: in-memory cold-tier segments. Side-loaded via
3895    /// [`Catalog::load_segment_bytes`] — they live outside the
3896    /// catalog snapshot (caller persists them as separate files
3897    /// and re-loads on boot, until v5.3's `CatalogManifest` makes
3898    /// that wiring automatic). `RowLocator::Cold { segment_id, .. }`
3899    /// indexes this `Vec`. Cleared on `Catalog::new` / fresh
3900    /// `deserialize`.
3901    ///
3902    /// `Arc` wrap keeps `Catalog::clone` at O(N segments) bumps
3903    /// (rather than O(total segment bytes) memcpy) so the v4.42
3904    /// group-commit pre-image rollback invariant — clone is
3905    /// effectively free — survives the cold-tier addition.
3906    ///
3907    /// v6.7.3 — slots became `Option<…>` so cold-segment compaction
3908    /// can tombstone merged sources without breaking the
3909    /// `segment_id = index_into_vec` contract that on-disk
3910    /// `RowLocator::Cold { segment_id }` already serialized.
3911    /// `None` slot = the segment was retired by compaction; the
3912    /// physical file may still be on disk (next CHECKPOINT writes
3913    /// a manifest that no longer lists it, and the file becomes
3914    /// an orphan eligible for offline cleanup).
3915    cold_segments: Vec<Option<Arc<OwnedSegment>>>,
3916    /// v7.12.4 — user-defined functions (PL/pgSQL + SQL).
3917    /// Keyed by function name (PG overloading is out of scope).
3918    /// Bodies are stored as the raw source text the parser saw
3919    /// between `$$ ... $$`; the engine re-parses on each
3920    /// invocation. This keeps `spg-storage` free of `spg-sql`
3921    /// dependency — same pattern as partial-index predicates.
3922    functions: BTreeMap<String, FunctionDef>,
3923    /// v7.12.4 — triggers in insertion order. Multiple triggers
3924    /// per table / event fire in this order (matching PG's
3925    /// alphabetical-by-default with insertion-stable tie-break
3926    /// behaviour — we just keep insertion order for now).
3927    triggers: Vec<TriggerDef>,
3928    /// v7.17.0 — catalogued SEQUENCE objects (Phase 1.1). Each
3929    /// `nextval(name)` reaches in here, atomically increments
3930    /// `last_value` / flips `is_called`, returns the new value.
3931    /// Persisted in catalog FILE_VERSION 26+; older catalogs
3932    /// deserialise with an empty map.
3933    sequences: BTreeMap<String, SequenceDef>,
3934    /// v7.17.0 — catalogued VIEW objects (Phase 1.2). Each
3935    /// `SELECT FROM v` at engine exec-time looks up `v` here and
3936    /// prepends the view body as a synthetic CTE. Persisted in
3937    /// catalog FILE_VERSION 27+; older catalogs deserialise with
3938    /// an empty map.
3939    views: BTreeMap<String, ViewDef>,
3940    /// v7.17.0 — catalogued MATERIALIZED VIEW source registry
3941    /// (Phase 1.3). Maps name → SELECT source. The materialised
3942    /// rows themselves live as a regular `Table` with the same
3943    /// name; REFRESH re-parses + re-executes the source against
3944    /// the table. Persisted in catalog FILE_VERSION 28+;
3945    /// older catalogs deserialise with an empty map.
3946    materialized_views: BTreeMap<String, String>,
3947    /// v7.17.0 — catalogued user-defined ENUM types (Phase 1.4).
3948    /// Maps name → label list. Columns reference these by name
3949    /// via `ColumnSchema.user_enum_type`. Persisted in catalog
3950    /// FILE_VERSION 29+; older catalogs deserialise with an empty
3951    /// map.
3952    enum_types: BTreeMap<String, EnumDef>,
3953    /// v7.17.0 — catalogued user-defined DOMAIN types (Phase 1.5).
3954    /// Maps name → base + CHECK constraints. Columns reference
3955    /// these by name via `ColumnSchema.user_domain_type`.
3956    /// Persisted in catalog FILE_VERSION 30+; older catalogs
3957    /// deserialise with an empty map.
3958    domain_types: BTreeMap<String, DomainDef>,
3959    /// v7.17.0 — schema-namespace registry (Phase 1.6). Tracks
3960    /// which schemas exist. `public`, `pg_catalog`, and
3961    /// `information_schema` are built-in and always present.
3962    /// Schema-qualified table references still strip the prefix
3963    /// at lookup time per v7.16-and-earlier — full
3964    /// schema-as-isolation is v7.18+ scope. Persisted in catalog
3965    /// FILE_VERSION 31+; older catalogs deserialise with just
3966    /// the built-ins.
3967    schemas: alloc::collections::BTreeSet<String>,
3968}
3969
/// v7.12.4 — catalogued user-defined function. `body` is the raw
/// source text between `$$ ... $$`; the engine re-parses it on
/// invocation. This keeps the storage codec stable when the
/// PL/pgSQL surface grows (no breaking-change risk on the disk
/// format).
///
/// Every field is plain text, so this type needs nothing from the SQL
/// parser crate.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FunctionDef {
    /// Function name. The catalog keys its function map by name; PG
    /// overloading is out of scope.
    pub name: String,
    /// Display form of the argument list, e.g.
    /// `"(name TEXT, ts TIMESTAMP)"`. Empty `"()"` for the trigger
    /// function shape. Parser-side canonicalised before storage.
    pub args_repr: String,
    /// Display form of the return type, e.g. `"TRIGGER"` /
    /// `"INT"` / `"SETOF text"`. The engine special-cases
    /// `"TRIGGER"` (case-insensitive) to gate trigger-only
    /// semantics (NEW/OLD).
    pub returns: String,
    /// `LANGUAGE` clause, lowercased. `"plpgsql"` / `"sql"`.
    pub language: String,
    /// Source body of the function. PL/pgSQL: includes the
    /// surrounding `BEGIN ... END;`. SQL: includes the
    /// statement(s). The engine re-parses on invocation; bad
    /// bodies surface as a parse error at CALL time, not CREATE.
    pub body: String,
}
3995
/// v7.12.4 — catalogued trigger. References its function by
/// name; the function must exist at TRIGGER creation time
/// (forward references are deferred to v7.12.5+).
///
/// Timing, events and granularity are stored as keyword strings rather
/// than enums.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TriggerDef {
    /// Trigger name.
    pub name: String,
    /// Watched table. Trigger is dropped when the table drops.
    pub table: String,
    /// `"BEFORE"` / `"AFTER"` / `"INSTEAD OF"`. Stored as the
    /// uppercased keyword so deserialised catalogs round-trip
    /// without canonicalisation surprises.
    pub timing: String,
    /// Each entry is one of `"INSERT"` / `"UPDATE"` / `"DELETE"`
    /// / `"TRUNCATE"`. `INSERT OR UPDATE` parses to two entries.
    pub events: Vec<String>,
    /// `"ROW"` / `"STATEMENT"`. v7.12.4 ships `"ROW"` only;
    /// `"STATEMENT"` parses and persists but the executor
    /// refuses it at trigger fire time.
    pub for_each: String,
    /// Name of the PL/pgSQL function to invoke.
    pub function: String,
    /// v7.13.0 — `UPDATE OF col, col, …` column-list filter
    /// (mailrs round-5 G7). Non-empty means the trigger fires
    /// only when at least one of these columns appears in the
    /// UPDATE's SET list. Empty = no column filter. Stored in
    /// catalog FILE_VERSION 23+; older catalogs deserialise with
    /// an empty vec.
    pub update_columns: Vec<String>,
    /// v7.16.1 — whether the trigger fires when its watched
    /// event occurs. Toggled by `ALTER TABLE … { ENABLE |
    /// DISABLE } TRIGGER …`; pg_dump --disable-triggers wraps
    /// every data block with a DISABLE/ENABLE pair so the
    /// rows already-computed in prod don't get re-rewritten.
    /// Defaults to `true` at CREATE TRIGGER time. Stored in
    /// catalog FILE_VERSION 25+; older catalogs deserialise
    /// with `enabled = true`.
    pub enabled: bool,
}
4034
/// v7.17.0 — catalogued SEQUENCE. PG semantics: a counter object
/// returning monotonically increasing values via `nextval(name)`.
/// `last_value` is the most recent value handed out; `is_called`
/// is false until the first `nextval`/`setval`. Stored separately
/// from tables in the catalog.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SequenceDef {
    /// Sequence name.
    pub name: String,
    /// Data type — narrows the i64 range. PG default BIGINT.
    pub data_type: SequenceDataType,
    /// Value the first `nextval` returns while `is_called` is still
    /// false (see `last_value`).
    pub start: i64,
    /// Step between successive values — presumably PG's `INCREMENT BY`;
    /// confirm against the engine's `nextval` implementation.
    pub increment: i64,
    /// Lower bound — presumably PG's `MINVALUE`; enforcement is not
    /// visible in this type.
    pub min_value: i64,
    /// Upper bound — presumably PG's `MAXVALUE`; enforcement is not
    /// visible in this type.
    pub max_value: i64,
    /// Presumably PG's `CACHE` setting; how (or whether) it is honoured
    /// is not visible here.
    pub cache: i64,
    /// Presumably PG's `CYCLE` flag (wrap around at the bounds instead
    /// of erroring); confirm against the engine.
    pub cycle: bool,
    /// `OWNED BY` target — `(table, column)` or NONE.
    pub owned_by: Option<(String, String)>,
    /// Most recently handed-out value. Meaningless when
    /// `is_called == false`; in that case the NEXT `nextval`
    /// will return `start`.
    pub last_value: i64,
    /// False until the first `nextval` / `setval`; while false,
    /// `last_value` carries no meaning.
    pub is_called: bool,
}
4059
/// v7.17.0 — sequence integer width. Stored on
/// `SequenceDef::data_type`, where it narrows the i64 range the
/// sequence may produce.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SequenceDataType {
    /// `SMALLINT`-width sequence.
    SmallInt,
    /// `INT`-width sequence.
    Int,
    /// `BIGINT`-width sequence (the PG default).
    BigInt,
}
4067
/// v7.17.0 Phase 1.6 — reports whether `name` is one of the schemas
/// every Catalog understands without an explicit CREATE SCHEMA:
/// `public`, `pg_catalog` or `information_schema`. The comparison is
/// ASCII case-insensitive. Used by [`Catalog::schema_exists`] and the
/// engine's schema-qualified lookup path.
#[must_use]
pub fn is_builtin_schema(name: &str) -> bool {
    const BUILTIN_SCHEMAS: [&str; 3] = ["public", "pg_catalog", "information_schema"];
    BUILTIN_SCHEMAS
        .iter()
        .any(|builtin| name.eq_ignore_ascii_case(builtin))
}
4078
/// v7.17.0 — parse a PG-canonical UUID text representation into the
/// 16-byte network-order layout used by `Value::Uuid`. Accepted input
/// shapes (all case-insensitive, surrounding whitespace ignored):
///   * Canonical hyphenated 8-4-4-4-12 (`550e8400-e29b-41d4-a716-446655440000`)
///   * Unhyphenated 32-char hex (`550e8400e29b41d4a716446655440000`)
///   * Either form wrapped in `{ ... }`
///
/// Returns `None` for any malformed input (wrong length, non-hex
/// characters, misplaced hyphens). The caller surfaces a SQL error
/// at coercion time — silent acceptance of garbage would mask
/// application bugs and is exactly the divergence from PG that
/// breaks the 0-change cutover promise.
///
/// Decoding works directly on the input bytes, so no intermediate
/// string is allocated.
#[must_use]
pub fn parse_uuid_str(input: &str) -> Option<[u8; 16]> {
    let trimmed = input.trim();
    // Braces are only stripped as a matched pair.
    let body = trimmed
        .strip_prefix('{')
        .and_then(|rest| rest.strip_suffix('}'))
        .unwrap_or(trimmed);
    let raw = body.as_bytes();
    // Two valid shapes once braces are gone: 32 hex digits, or the
    // canonical 36-byte hyphenated form.
    let hyphenated = match raw.len() {
        32 => false,
        36 => true,
        _ => return None,
    };
    let mut out = [0u8; 16];
    let mut digits = 0usize;
    for (pos, &byte) in raw.iter().enumerate() {
        // In the hyphenated form the separators sit at byte offsets
        // 8, 13, 18 and 23 — and nowhere else.
        if hyphenated && matches!(pos, 8 | 13 | 18 | 23) {
            if byte != b'-' {
                return None;
            }
            continue;
        }
        let nibble = hex_nibble(byte)?;
        // Digits arrive high nibble first: shifting the slot left and
        // OR-ing builds each byte from its two consecutive digits.
        // Exactly 32 digits are consumed, so `digits / 2` stays < 16.
        let slot = &mut out[digits / 2];
        *slot = (*slot << 4) | nibble;
        digits += 1;
    }
    Some(out)
}

/// Value (0-15) of one ASCII hex digit, upper or lower case; `None` for
/// any other byte.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(10 + b - b'a'),
        b'A'..=b'F' => Some(10 + b - b'A'),
        _ => None,
    }
}
4139
/// v7.17.0 — render a `Value::Uuid` payload as the canonical lowercase
/// 8-4-4-4-12 hyphenated text (36 characters), the form PG's `text`
/// cast surfaces.
#[must_use]
pub fn format_uuid(b: &[u8; 16]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    // Byte widths of the five groups; each byte prints as two digits.
    const GROUP_WIDTHS: [usize; 5] = [4, 2, 2, 2, 6];

    let mut text = String::with_capacity(36);
    let mut remaining: &[u8] = b;
    for (group_no, &width) in GROUP_WIDTHS.iter().enumerate() {
        if group_no > 0 {
            text.push('-');
        }
        let (group, tail) = remaining.split_at(width);
        for &byte in group {
            text.push(char::from(DIGITS[usize::from(byte >> 4)]));
            text.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
        }
        remaining = tail;
    }
    text
}
4155
/// v7.17.0 Phase 1.5 — catalogued user-defined DOMAIN. A domain
/// is a named CHECK-constrained alias over a built-in type;
/// columns bound to it inherit the base type plus the CHECK
/// predicates + NOT NULL + DEFAULT at INSERT/UPDATE time.
/// `default` / `checks` are stored as Display-form source so
/// `spg-storage` stays free of `spg-sql` dependency — same
/// pattern as FunctionDef / ViewDef.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DomainDef {
    /// Domain name. `Catalog::create_domain_type` uses it as the
    /// catalog key and rejects duplicates.
    pub name: String,
    /// Built-in type the domain aliases.
    pub base_type: DataType,
    /// NOTE(review): presumably `false` when the domain was declared
    /// `NOT NULL` — confirm against the engine's DDL path.
    pub nullable: bool,
    /// Display-form source of the DEFAULT expression; presumably
    /// `None` when the domain declares no default (TODO confirm).
    pub default: Option<String>,
    /// Display-form source of each CHECK predicate.
    pub checks: Vec<String>,
}
4171
/// v7.17.0 Phase 1.4 — catalogued user-defined ENUM type. The
/// label vector is order-preserving (PG enum ordering follows the
/// declared order). At INSERT/UPDATE on a column bound to this
/// enum, the engine looks up the value against `labels` and
/// rejects non-members.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EnumDef {
    /// Type name. `Catalog::create_enum_type` uses it as the catalog
    /// key and rejects duplicates.
    pub name: String,
    /// Enum labels, kept in declaration order.
    pub labels: Vec<String>,
}
4182
/// v7.17.0 Phase 1.2 — catalogued VIEW. The body is stored as the
/// raw source text the parser saw between `AS` and the statement
/// terminator; the engine re-parses on each invocation. Same
/// pattern as `FunctionDef` — keeps `spg-storage` free of
/// `spg-sql` dependency.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ViewDef {
    /// View name. `Catalog::create_view` uses it as the catalog key
    /// and checks it against existing views, tables and sequences.
    pub name: String,
    /// Optional `(col, col, …)` rename list. Empty when the body's
    /// projected names are used directly.
    pub columns: Vec<String>,
    /// Raw SELECT source. Display-rendered at storage time so the
    /// catalog round-trips a deterministic form regardless of
    /// whitespace / comments in the original input. Re-parsed at
    /// SELECT-from-view time to materialise as a synthetic CTE.
    pub body: String,
}
4200
4201impl SequenceDataType {
4202    /// PG default min/max per AS clause.
4203    pub fn default_bounds(self, increment_positive: bool) -> (i64, i64) {
4204        match self {
4205            Self::SmallInt => {
4206                if increment_positive {
4207                    (1, i64::from(i16::MAX))
4208                } else {
4209                    (i64::from(i16::MIN), -1)
4210                }
4211            }
4212            Self::Int => {
4213                if increment_positive {
4214                    (1, i64::from(i32::MAX))
4215                } else {
4216                    (i64::from(i32::MIN), -1)
4217                }
4218            }
4219            Self::BigInt => {
4220                if increment_positive {
4221                    (1, i64::MAX)
4222                } else {
4223                    (i64::MIN, -1)
4224                }
4225            }
4226        }
4227    }
4228}
4229
4230impl Catalog {
    /// Build an empty catalog: no tables, cold segments, functions,
    /// triggers, sequences, views, materialised-view sources, enum
    /// types, domain types or user-created schemas. Every field is
    /// initialised with an empty collection.
    pub const fn new() -> Self {
        Self {
            tables: Vec::new(),
            by_name: BTreeMap::new(),
            cold_segments: Vec::new(),
            functions: BTreeMap::new(),
            triggers: Vec::new(),
            sequences: BTreeMap::new(),
            views: BTreeMap::new(),
            materialized_views: BTreeMap::new(),
            enum_types: BTreeMap::new(),
            domain_types: BTreeMap::new(),
            schemas: alloc::collections::BTreeSet::new(),
        }
    }
4246
    /// v7.12.4 — read-only view of catalogued user-defined
    /// functions, keyed by function name. Engine callers go through
    /// here to look up the function body before re-parsing it for
    /// invocation.
    pub const fn functions(&self) -> &BTreeMap<String, FunctionDef> {
        &self.functions
    }
4253
4254    /// v7.12.4 — register a new user-defined function. With
4255    /// `or_replace = false`, errors if the name is taken. The
4256    /// engine validates the body before passing it here.
4257    pub fn create_function(
4258        &mut self,
4259        def: FunctionDef,
4260        or_replace: bool,
4261    ) -> Result<(), StorageError> {
4262        if !or_replace && self.functions.contains_key(&def.name) {
4263            return Err(StorageError::Corrupt(format!(
4264                "function {:?} already exists (drop or use CREATE OR REPLACE)",
4265                def.name
4266            )));
4267        }
4268        self.functions.insert(def.name.clone(), def);
4269        Ok(())
4270    }
4271
    /// v7.12.4 — remove a user-defined function by name. Returns
    /// `true` if a function was removed, `false` if none matched.
    /// A missing name is not an error at this layer; the caller
    /// decides whether to surface `if_exists` semantics.
    pub fn drop_function(&mut self, name: &str) -> bool {
        self.functions.remove(name).is_some()
    }
4278
    /// v7.17.0 — read-only handle to catalogued sequences, keyed by
    /// sequence name.
    pub const fn sequences(&self) -> &BTreeMap<String, SequenceDef> {
        &self.sequences
    }
4283
4284    /// v7.17.0 — register a new SEQUENCE. Errors if `name`
4285    /// collides with an existing sequence and `if_not_exists`
4286    /// is false.
4287    pub fn create_sequence(
4288        &mut self,
4289        def: SequenceDef,
4290        if_not_exists: bool,
4291    ) -> Result<(), StorageError> {
4292        if self.sequences.contains_key(&def.name) {
4293            if if_not_exists {
4294                return Ok(());
4295            }
4296            return Err(StorageError::Corrupt(format!(
4297                "sequence {:?} already exists",
4298                def.name
4299            )));
4300        }
4301        self.sequences.insert(def.name.clone(), def);
4302        Ok(())
4303    }
4304
    /// v7.17.0 — remove a SEQUENCE by name. Returns `true` if a
    /// sequence was removed, `false` if none matched. A missing name
    /// is not an error here; the caller surfaces IF EXISTS semantics.
    pub fn drop_sequence(&mut self, name: &str) -> bool {
        self.sequences.remove(name).is_some()
    }
4311
4312    /// v7.17.0 — atomic nextval. Increments `last_value` per
4313    /// `increment`, returns the new value, sets `is_called`.
4314    /// Returns an error on CYCLE-less overflow.
4315    pub fn sequence_next_value(&mut self, name: &str) -> Result<i64, StorageError> {
4316        let Some(seq) = self.sequences.get_mut(name) else {
4317            return Err(StorageError::Corrupt(format!(
4318                "sequence {name:?} does not exist"
4319            )));
4320        };
4321        // PG semantics: when !is_called (fresh sequence or
4322        // setval(_, false)), the next nextval returns the stored
4323        // `last_value`. When is_called, it advances by `increment`
4324        // and CYCLE-wraps on overflow.
4325        let candidate = if seq.is_called {
4326            let next = seq.last_value.checked_add(seq.increment).ok_or_else(|| {
4327                StorageError::Corrupt(format!("sequence {name:?} arithmetic overflow"))
4328            })?;
4329            if seq.increment > 0 {
4330                if next > seq.max_value {
4331                    if seq.cycle {
4332                        seq.min_value
4333                    } else {
4334                        return Err(StorageError::Corrupt(format!(
4335                            "sequence {name:?} reached MAXVALUE ({})",
4336                            seq.max_value
4337                        )));
4338                    }
4339                } else {
4340                    next
4341                }
4342            } else if next < seq.min_value {
4343                if seq.cycle {
4344                    seq.max_value
4345                } else {
4346                    return Err(StorageError::Corrupt(format!(
4347                        "sequence {name:?} reached MINVALUE ({})",
4348                        seq.min_value
4349                    )));
4350                }
4351            } else {
4352                next
4353            }
4354        } else {
4355            seq.last_value
4356        };
4357        seq.last_value = candidate;
4358        seq.is_called = true;
4359        Ok(candidate)
4360    }
4361
4362    /// v7.17.0 — currval. Errors if the session has never called
4363    /// nextval on this sequence (PG semantics). At the catalog
4364    /// level we approximate "session" with "is_called persisted";
4365    /// the engine session-tracking layer can wrap this for the
4366    /// strict per-session semantics later.
4367    pub fn sequence_current_value(&self, name: &str) -> Result<i64, StorageError> {
4368        let Some(seq) = self.sequences.get(name) else {
4369            return Err(StorageError::Corrupt(format!(
4370                "sequence {name:?} does not exist"
4371            )));
4372        };
4373        if !seq.is_called {
4374            return Err(StorageError::Corrupt(format!(
4375                "currval of sequence {name:?} is not yet defined in this session"
4376            )));
4377        }
4378        Ok(seq.last_value)
4379    }
4380
4381    /// v7.17.0 — setval(name, value [, is_called]). PG returns
4382    /// `value` regardless. `is_called=true` means the NEXT
4383    /// nextval will return `value + increment`; `is_called=false`
4384    /// means the next nextval will return `value`.
4385    pub fn sequence_set_value(
4386        &mut self,
4387        name: &str,
4388        value: i64,
4389        is_called: bool,
4390    ) -> Result<i64, StorageError> {
4391        let Some(seq) = self.sequences.get_mut(name) else {
4392            return Err(StorageError::Corrupt(format!(
4393                "sequence {name:?} does not exist"
4394            )));
4395        };
4396        seq.last_value = value;
4397        seq.is_called = is_called;
4398        Ok(value)
4399    }
4400
    /// v7.17.0 Phase 1.2 — read-only handle to catalogued views,
    /// keyed by view name.
    pub const fn views(&self) -> &BTreeMap<String, ViewDef> {
        &self.views
    }
4405
4406    /// v7.17.0 Phase 1.2 — install a VIEW. `or_replace=true`
4407    /// overwrites an existing entry; `if_not_exists=true` is a
4408    /// silent no-op when the name is taken. Errors if both flags
4409    /// are off and the name collides.
4410    pub fn create_view(
4411        &mut self,
4412        def: ViewDef,
4413        or_replace: bool,
4414        if_not_exists: bool,
4415    ) -> Result<(), StorageError> {
4416        if self.views.contains_key(&def.name) {
4417            if or_replace {
4418                self.views.insert(def.name.clone(), def);
4419                return Ok(());
4420            }
4421            if if_not_exists {
4422                return Ok(());
4423            }
4424            return Err(StorageError::Corrupt(format!(
4425                "view {:?} already exists",
4426                def.name
4427            )));
4428        }
4429        // Reject name collision with tables / sequences — same
4430        // namespace per PG.
4431        if self.by_name.contains_key(&def.name) {
4432            return Err(StorageError::Corrupt(format!(
4433                "view {:?} would shadow an existing table",
4434                def.name
4435            )));
4436        }
4437        if self.sequences.contains_key(&def.name) {
4438            return Err(StorageError::Corrupt(format!(
4439                "view {:?} would shadow an existing sequence",
4440                def.name
4441            )));
4442        }
4443        self.views.insert(def.name.clone(), def);
4444        Ok(())
4445    }
4446
    /// v7.17.0 Phase 1.2 — remove a view by name. Returns `true` if
    /// a view was removed, `false` if no view had that name.
    pub fn drop_view(&mut self, name: &str) -> bool {
        self.views.remove(name).is_some()
    }
4452
    /// v7.17.0 Phase 1.3 — read-only handle to the materialised-
    /// view source registry (view name → source text). Each entry
    /// pairs with a regular table of the same name that holds the
    /// cached rows.
    pub const fn materialized_views(&self) -> &BTreeMap<String, String> {
        &self.materialized_views
    }
4459
    /// v7.17.0 Phase 1.3 — register a source for a materialised
    /// view. Caller has already created the backing table. An
    /// existing entry under the same `name` is overwritten; no
    /// collision check is performed here.
    pub fn register_materialized_view(&mut self, name: String, body: String) {
        self.materialized_views.insert(name, body);
    }
4465
    /// v7.17.0 Phase 1.3 — drop the source registry entry. Returns
    /// `true` if a source was unregistered, `false` if none was
    /// registered under `name`. Only the registry is touched; the
    /// caller separately drops the backing table.
    pub fn drop_materialized_view_source(&mut self, name: &str) -> bool {
        self.materialized_views.remove(name).is_some()
    }
4472
    /// v7.17.0 Phase 1.4 — read-only handle to user-defined ENUM
    /// catalog, keyed by type name.
    pub const fn enum_types(&self) -> &BTreeMap<String, EnumDef> {
        &self.enum_types
    }
4478
4479    /// v7.17.0 Phase 1.4 — install a new ENUM type. Errors if
4480    /// `name` collides with an existing enum (no IF NOT EXISTS
4481    /// per PG semantics for CREATE TYPE).
4482    pub fn create_enum_type(&mut self, def: EnumDef) -> Result<(), StorageError> {
4483        if self.enum_types.contains_key(&def.name) {
4484            return Err(StorageError::Corrupt(format!(
4485                "type {:?} already exists",
4486                def.name
4487            )));
4488        }
4489        self.enum_types.insert(def.name.clone(), def);
4490        Ok(())
4491    }
4492
    /// v7.17.0 Phase 1.4 — drop an ENUM type by name. Returns
    /// `true` if a type was removed, `false` if none matched.
    pub fn drop_enum_type(&mut self, name: &str) -> bool {
        self.enum_types.remove(name).is_some()
    }
4498
    /// v7.17.0 Phase 1.5 — read-only handle to DOMAIN catalog,
    /// keyed by domain name.
    pub const fn domain_types(&self) -> &BTreeMap<String, DomainDef> {
        &self.domain_types
    }
4503
4504    /// v7.17.0 Phase 1.5 — install a DOMAIN. Errors on collision
4505    /// with an existing domain.
4506    pub fn create_domain_type(&mut self, def: DomainDef) -> Result<(), StorageError> {
4507        if self.domain_types.contains_key(&def.name) {
4508            return Err(StorageError::Corrupt(format!(
4509                "domain {:?} already exists",
4510                def.name
4511            )));
4512        }
4513        self.domain_types.insert(def.name.clone(), def);
4514        Ok(())
4515    }
4516
    /// v7.17.0 Phase 1.5 — drop a DOMAIN by name. Returns `true` if
    /// a domain was removed, `false` if none matched.
    pub fn drop_domain_type(&mut self, name: &str) -> bool {
        self.domain_types.remove(name).is_some()
    }
4521
    /// v7.17.0 Phase 1.6 — read-only handle to the user-created
    /// schema registry (a set of schema names). Built-in schemas
    /// (`public`, `pg_catalog`, `information_schema`) are NOT
    /// included here; use [`schema_exists`](Self::schema_exists)
    /// for the full check.
    pub const fn user_schemas(&self) -> &alloc::collections::BTreeSet<String> {
        &self.schemas
    }
4530
4531    /// v7.17.0 Phase 1.6 — schema-name resolver. Returns true
4532    /// for built-in schemas + every user-CREATEd one. Used by
4533    /// CREATE SCHEMA collision checks and (future) by
4534    /// information_schema.schemata.
4535    pub fn schema_exists(&self, name: &str) -> bool {
4536        is_builtin_schema(name) || self.schemas.contains(name)
4537    }
4538
4539    /// v7.17.0 Phase 1.6 — register a new schema. Errors if the
4540    /// name already exists and `if_not_exists=false`. Built-in
4541    /// names cannot be redeclared.
4542    pub fn create_schema(&mut self, name: String, if_not_exists: bool) -> Result<(), StorageError> {
4543        if is_builtin_schema(&name) {
4544            if if_not_exists {
4545                return Ok(());
4546            }
4547            return Err(StorageError::Corrupt(format!(
4548                "schema {name:?} is built-in and cannot be redeclared"
4549            )));
4550        }
4551        if self.schemas.contains(&name) {
4552            if if_not_exists {
4553                return Ok(());
4554            }
4555            return Err(StorageError::Corrupt(format!(
4556                "schema {name:?} already exists"
4557            )));
4558        }
4559        self.schemas.insert(name);
4560        Ok(())
4561    }
4562
4563    /// v7.17.0 Phase 1.6 — drop a user-created schema. Returns
4564    /// true if a schema was removed. Built-in names always
4565    /// return false (cannot be dropped). Tables that previously
4566    /// used the schema as a prefix keep their bare name and stay
4567    /// queryable — this is the "prefix routing, not isolation"
4568    /// posture documented in v7.17 Phase 1.6.
4569    pub fn drop_schema(&mut self, name: &str) -> Result<bool, StorageError> {
4570        if is_builtin_schema(name) {
4571            return Err(StorageError::Corrupt(format!(
4572                "schema {name:?} is built-in and cannot be dropped"
4573            )));
4574        }
4575        Ok(self.schemas.remove(name))
4576    }
4577
4578    /// v7.17.0 — ALTER SEQUENCE option merge. Caller-provided
4579    /// updates overwrite the matching fields; unset fields keep
4580    /// their stored values. RESTART variants update last_value
4581    /// directly per PG: `RESTART` resets to current `start`;
4582    /// `RESTART WITH n` resets to `n`.
4583    #[allow(clippy::too_many_arguments)]
4584    pub fn alter_sequence(
4585        &mut self,
4586        name: &str,
4587        increment: Option<i64>,
4588        min_value: Option<i64>,
4589        max_value: Option<i64>,
4590        start: Option<i64>,
4591        restart: Option<Option<i64>>,
4592        cache: Option<i64>,
4593        cycle: Option<bool>,
4594        owned_by: Option<Option<(String, String)>>,
4595    ) -> Result<(), StorageError> {
4596        let Some(seq) = self.sequences.get_mut(name) else {
4597            return Err(StorageError::Corrupt(format!(
4598                "sequence {name:?} does not exist"
4599            )));
4600        };
4601        if let Some(v) = increment {
4602            seq.increment = v;
4603        }
4604        if let Some(v) = min_value {
4605            seq.min_value = v;
4606        }
4607        if let Some(v) = max_value {
4608            seq.max_value = v;
4609        }
4610        if let Some(v) = start {
4611            seq.start = v;
4612        }
4613        if let Some(restart_value) = restart {
4614            seq.last_value = restart_value.unwrap_or(seq.start);
4615            seq.is_called = false;
4616        }
4617        if let Some(v) = cache {
4618            seq.cache = v;
4619        }
4620        if let Some(v) = cycle {
4621            seq.cycle = v;
4622        }
4623        if let Some(v) = owned_by {
4624            seq.owned_by = v;
4625        }
4626        Ok(())
4627    }
4628
    /// v7.12.4 — read-only slice of all catalogued triggers, in
    /// registration order (replaced triggers keep their slot).
    /// Engine row-write paths filter this by (table, event,
    /// timing) and fire matches in slice order.
    pub fn triggers(&self) -> &[TriggerDef] {
        &self.triggers
    }
4635
    /// v7.15.0 — mutable handle to the trigger list for
    /// `ALTER TABLE … RENAME COLUMN`, which rewrites every
    /// `update_columns` entry that referenced the renamed
    /// column. The handle bypasses the checks done by
    /// `create_trigger`, so callers must keep entries consistent
    /// themselves.
    pub fn triggers_mut(&mut self) -> &mut Vec<TriggerDef> {
        &mut self.triggers
    }
4643
4644    /// v7.12.4 — register a new trigger. With `or_replace = false`,
4645    /// errors when a trigger with the same name already exists on
4646    /// the same table (PG scoping rule — trigger names are
4647    /// per-table, not global). Trigger function must already
4648    /// exist in the catalog at registration time.
4649    pub fn create_trigger(
4650        &mut self,
4651        def: TriggerDef,
4652        or_replace: bool,
4653    ) -> Result<(), StorageError> {
4654        if !self.by_name.contains_key(&def.table) {
4655            return Err(StorageError::TableNotFound {
4656                name: def.table.clone(),
4657            });
4658        }
4659        if !self.functions.contains_key(&def.function) {
4660            return Err(StorageError::Corrupt(format!(
4661                "trigger {:?} references unknown function {:?}",
4662                def.name, def.function
4663            )));
4664        }
4665        let dup = self
4666            .triggers
4667            .iter()
4668            .position(|t| t.name == def.name && t.table == def.table);
4669        match (dup, or_replace) {
4670            (Some(_), false) => Err(StorageError::Corrupt(format!(
4671                "trigger {:?} already exists on table {:?}",
4672                def.name, def.table
4673            ))),
4674            (Some(i), true) => {
4675                self.triggers[i] = def;
4676                Ok(())
4677            }
4678            (None, _) => {
4679                self.triggers.push(def);
4680                Ok(())
4681            }
4682        }
4683    }
4684
4685    /// v7.12.4 — remove a trigger by `(name, table)`. Returns
4686    /// `true` if one was removed.
4687    pub fn drop_trigger(&mut self, name: &str, table: &str) -> bool {
4688        let before = self.triggers.len();
4689        self.triggers
4690            .retain(|t| !(t.name == name && t.table == table));
4691        before != self.triggers.len()
4692    }
4693
4694    pub fn create_table(&mut self, schema: TableSchema) -> Result<(), StorageError> {
4695        if self.by_name.contains_key(&schema.name) {
4696            return Err(StorageError::DuplicateTable {
4697                name: schema.name.clone(),
4698            });
4699        }
4700        let idx = self.tables.len();
4701        let name = schema.name.clone();
4702        self.tables.push(Table::new(schema));
4703        self.by_name.insert(name, idx);
4704        Ok(())
4705    }
4706
4707    pub fn get(&self, name: &str) -> Option<&Table> {
4708        let idx = *self.by_name.get(name)?;
4709        self.tables.get(idx)
4710    }
4711
4712    pub fn get_mut(&mut self, name: &str) -> Option<&mut Table> {
4713        let idx = *self.by_name.get(name)?;
4714        self.tables.get_mut(idx)
4715    }
4716
    /// Number of tables currently held by the catalog.
    pub fn table_count(&self) -> usize {
        self.tables.len()
    }
4720
4721    /// v7.14.0 — remove a table by name. Returns `true` when the
4722    /// table existed (and is now gone), `false` when it didn't.
4723    /// Used by `DROP TABLE` from pg_dump / mysqldump preambles
4724    /// where the dump re-creates schema and starts with
4725    /// `DROP TABLE IF EXISTS`.
4726    pub fn drop_table(&mut self, name: &str) -> bool {
4727        let Some(idx) = self.by_name.remove(name) else {
4728            return false;
4729        };
4730        // swap_remove invalidates the trailing index → rebuild
4731        // by_name for affected entries.
4732        self.tables.swap_remove(idx);
4733        // Re-stamp moved table's index slot in by_name.
4734        if idx < self.tables.len() {
4735            let moved_name = self.tables[idx].schema.name.clone();
4736            self.by_name.insert(moved_name, idx);
4737        }
4738        true
4739    }
4740
4741    /// v7.16.2 — rename a table (mailrs round-10 A.5). Updates
4742    /// the schema name, the catalog name → index map, and
4743    /// rewrites every reference dangling at the table name:
4744    ///   * every FK on every OTHER table whose `parent_table`
4745    ///     pointed at the old name now points at the new
4746    ///     name, so FK enforcement keeps working
4747    ///   * every trigger watching the table updates its `table`
4748    ///     field
4749    /// Returns `Ok` on success; `Err(StorageError::TableNotFound)`
4750    /// when the old name isn't in the catalog and
4751    /// `Err(StorageError::DuplicateTable)` when the new name is
4752    /// already taken.
4753    pub fn rename_table(&mut self, old: &str, new: &str) -> Result<(), StorageError> {
4754        if old == new {
4755            return Ok(());
4756        }
4757        if self.by_name.contains_key(new) {
4758            return Err(StorageError::Corrupt(format!(
4759                "rename_table: target name {new:?} already exists"
4760            )));
4761        }
4762        let idx = self
4763            .by_name
4764            .remove(old)
4765            .ok_or_else(|| StorageError::TableNotFound { name: old.into() })?;
4766        self.tables[idx].schema.name = new.to_string();
4767        self.by_name.insert(new.to_string(), idx);
4768        for t in &mut self.tables {
4769            for fk in &mut t.schema.foreign_keys {
4770                if fk.parent_table == old {
4771                    fk.parent_table = new.to_string();
4772                }
4773            }
4774        }
4775        for trig in &mut self.triggers {
4776            if trig.table == old {
4777                trig.table = new.to_string();
4778            }
4779        }
4780        Ok(())
4781    }
4782
4783    /// v7.16.2 — rename an index by name. Walks every table
4784    /// since the index lives on its owning table; updates the
4785    /// name in place. Errors with `IndexNotFound` when no
4786    /// index matches. mailrs round-10 A.5.
4787    pub fn rename_index(&mut self, old: &str, new: &str) -> Result<(), StorageError> {
4788        if old == new {
4789            return Ok(());
4790        }
4791        // Reject the new name if it already exists anywhere.
4792        for t in &self.tables {
4793            if t.indices.iter().any(|i| i.name == new) {
4794                return Err(StorageError::Corrupt(format!(
4795                    "rename_index: target name {new:?} already exists"
4796                )));
4797            }
4798        }
4799        for t in &mut self.tables {
4800            for i in &mut t.indices {
4801                if i.name == old {
4802                    i.name = new.to_string();
4803                    return Ok(());
4804                }
4805            }
4806        }
4807        Err(StorageError::IndexNotFound { name: old.into() })
4808    }
4809
4810    /// v7.14.0 — remove a named index across the catalog.
4811    /// Returns `true` when found + dropped.
4812    pub fn drop_named_index(&mut self, name: &str) -> bool {
4813        for t in &mut self.tables {
4814            let before = t.indices.len();
4815            t.indices.retain(|i| i.name != name);
4816            if t.indices.len() != before {
4817                return true;
4818            }
4819        }
4820        false
4821    }
4822
4823    /// Borrow-free copy of every table's name in catalog order
4824    /// (= insertion order, matching the on-disk encoding).
4825    pub fn table_names(&self) -> Vec<String> {
4826        self.tables.iter().map(|t| t.schema.name.clone()).collect()
4827    }
4828
4829    /// v5.1: register a cold-tier segment that already lives in
4830    /// memory (caller did the file read). Returns the
4831    /// `segment_id` that `RowLocator::Cold { segment_id, .. }`
4832    /// will reference — currently this is just the index into
4833    /// `cold_segments`, but treat it as an opaque token.
4834    ///
4835    /// Storage is `no_std`, so file I/O is the caller's
4836    /// responsibility — `spg-server` reads the file and forwards
4837    /// the bytes here. The bytes stay resident in the catalog
4838    /// for the life of the `Catalog`, parsed only once.
4839    pub fn load_segment_bytes(&mut self, bytes: Vec<u8>) -> Result<u32, StorageError> {
4840        let id = u32::try_from(self.cold_segments.len()).map_err(|_| {
4841            StorageError::Corrupt("cold segment count would exceed u32::MAX".into())
4842        })?;
4843        let seg = OwnedSegment::from_bytes(bytes)
4844            .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
4845        self.cold_segments.push(Some(Arc::new(seg)));
4846        Ok(id)
4847    }
4848
4849    /// v6.7.3 — register a cold-tier segment at a specific id. Used
4850    /// by the spg-server manifest-boot path so segments whose
4851    /// neighbouring ids were retired by compaction still get back
4852    /// the same `segment_id` they had pre-restart (the
4853    /// `RowLocator::Cold { segment_id }` baked into the BTree-index
4854    /// snapshot persists across restart and must continue to
4855    /// resolve).
4856    ///
4857    /// Pads the Vec with `None` slots up to `target_id` if needed.
4858    /// Errors when the target slot is already occupied (would
4859    /// stomp another segment), the parse fails, or `target_id`
4860    /// exceeds `u32::MAX`.
4861    pub fn load_segment_bytes_at(
4862        &mut self,
4863        target_id: u32,
4864        bytes: Vec<u8>,
4865    ) -> Result<(), StorageError> {
4866        let seg = OwnedSegment::from_bytes(bytes)
4867            .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
4868        let idx = target_id as usize;
4869        while self.cold_segments.len() <= idx {
4870            self.cold_segments.push(None);
4871        }
4872        if self.cold_segments[idx].is_some() {
4873            return Err(StorageError::Corrupt(format!(
4874                "load_segment_bytes_at: segment_id {target_id} already occupied"
4875            )));
4876        }
4877        self.cold_segments[idx] = Some(Arc::new(seg));
4878        Ok(())
4879    }
4880
4881    /// v6.7.3 — retire a cold-tier segment slot (compaction-driven).
4882    /// The physical file is the caller's concern (typically kept
4883    /// on disk until the next CHECKPOINT writes a manifest that
4884    /// no longer lists it); this just flips the in-memory slot
4885    /// to `None` so later cold lookups for `segment_id` resolve
4886    /// as "unknown" instead of returning a stale row.
4887    ///
4888    /// No-op when the slot is already `None`. Errors only when
4889    /// `segment_id` is out of bounds.
4890    pub fn tombstone_segment(&mut self, segment_id: u32) -> Result<(), StorageError> {
4891        let idx = segment_id as usize;
4892        if idx >= self.cold_segments.len() {
4893            return Err(StorageError::Corrupt(format!(
4894                "tombstone_segment: segment_id {segment_id} out of bounds (len={})",
4895                self.cold_segments.len()
4896            )));
4897        }
4898        self.cold_segments[idx] = None;
4899        Ok(())
4900    }
4901
4902    /// Number of *active* (non-tombstoned) cold segments.
4903    #[must_use]
4904    pub fn cold_segment_count(&self) -> usize {
4905        self.cold_segments.iter().filter(|s| s.is_some()).count()
4906    }
4907
    /// Slot count including tombstones (= the next id the
    /// no-arg `load_segment_bytes` would allocate). Compare with
    /// `cold_segment_count`, which counts occupied slots only.
    #[must_use]
    pub fn cold_segment_slot_count(&self) -> usize {
        self.cold_segments.len()
    }
4914
4915    /// v6.2.7 — list every *active* cold-tier segment id known to
4916    /// this catalog (skips compaction tombstones since v6.7.3).
4917    /// Used by EXPLAIN ANALYZE to annotate scan nodes with the
4918    /// segments they could have walked.
4919    #[must_use]
4920    pub fn cold_segment_ids_global(&self) -> Vec<u32> {
4921        self.cold_segments
4922            .iter()
4923            .enumerate()
4924            .filter_map(|(i, s)| s.as_ref().map(|_| i as u32))
4925            .collect()
4926    }
4927
4928    /// v5.2.1: sum of `Table::hot_bytes` across every table. The v5.2
4929    /// freezer compares this against `SPG_HOT_TIER_BYTES` (parsed at
4930    /// server startup; default 4 GiB) and wakes when the budget is
4931    /// crossed. Pre-freezer (v5.2.1) this is measurement-only — the
4932    /// counter exposes whether the budget is being approached without
4933    /// triggering any demotion.
4934    #[must_use]
4935    pub fn hot_tier_bytes(&self) -> u64 {
4936        self.tables
4937            .iter()
4938            .map(Table::hot_bytes)
4939            .fold(0u64, u64::saturating_add)
4940    }
4941
4942    /// v5.2.2: freeze the **first** `max_rows` rows of `table_name`'s
4943    /// hot tier into a brand-new cold-tier segment. The named `BTree`
4944    /// index supplies the per-row PK (its column must be an integer
4945    /// type — v5.2.2 only supports `IndexKey::Int` PKs, matching the
4946    /// `index_key_as_u64` constraint used by the cold-tier lookup
4947    /// path). On success returns a [`FreezeReport`] with the
4948    /// freshly-allocated segment id, the count of rows that moved,
4949    /// the encoded segment bytes (so the caller can persist them to
4950    /// disk for later reload via `SPG_PRELOAD_COLD_SEGMENT`), and the
4951    /// hot-tier byte delta that was reclaimed.
4952    ///
4953    /// **Semantics**:
4954    /// 1. The first `max_rows` rows (by hot-tier position — same as
4955    ///    insertion order under v4.39 `PersistentVec`) are read.
4956    /// 2. Rows are sorted ascending by PK and serialised into a new
4957    ///    segment via [`encode_segment`].
4958    /// 3. The hot rows are dropped via [`Table::delete_rows`]; the
4959    ///    `rebuild_indices` it triggers regenerates `Hot` locators
4960    ///    for every remaining row (their positions shift down by
4961    ///    `max_rows`). Existing `Cold` locators in this index — from
4962    ///    a previous freeze — are also rebuilt **but with empty
4963    ///    payload** since rebuild reads only `self.rows`; this
4964    ///    routine re-registers them at the end of the call so the
4965    ///    user-visible state preserves all prior cold locators.
4966    /// 4. The new segment is loaded into `self.cold_segments` via
4967    ///    [`Catalog::load_segment_bytes`] (allocating a fresh
4968    ///    `segment_id`). New `Cold` locators are registered on the
4969    ///    named index — one per frozen row.
4970    ///
4971    /// **v5.2.2 limits** (relaxed in later sub-versions):
4972    /// - INSERT-only flow: subsequent UPDATE/DELETE on a frozen row
4973    ///   returns a stale-locator error (no promote-on-write until
4974    ///   v5.2.3).
4975    /// - Single-table scope: callers iterate tables themselves.
4976    /// - All-or-nothing: returns `Err` and leaves catalog unchanged
4977    ///   if any step fails before the atomic swap point.
4978    ///
4979    /// Errors:
4980    /// - [`StorageError::Corrupt`] for missing table/index, non-`BTree`
4981    ///   index, non-integer PK column, `max_rows == 0`, or
4982    ///   `max_rows > row_count`.
4983    /// - The encoder's [`SegmentError`] surfaces as `Corrupt` (the
4984    ///   only realistic source is "a single row is larger than the
4985    ///   page size"; SPG schemas don't hit it in practice).
4986    pub fn freeze_oldest_to_cold(
4987        &mut self,
4988        table_name: &str,
4989        index_name: &str,
4990        max_rows: usize,
4991    ) -> Result<FreezeReport, StorageError> {
4992        // --- validation phase: never mutates ---------------------
4993        if max_rows == 0 {
4994            return Err(StorageError::Corrupt(
4995                "freeze_oldest_to_cold: max_rows must be > 0".into(),
4996            ));
4997        }
4998        let table = self.get(table_name).ok_or_else(|| {
4999            StorageError::Corrupt(format!(
5000                "freeze_oldest_to_cold: table {table_name:?} not found"
5001            ))
5002        })?;
5003        if max_rows > table.rows.len() {
5004            return Err(StorageError::Corrupt(format!(
5005                "freeze_oldest_to_cold: max_rows {max_rows} > row_count {}",
5006                table.rows.len()
5007            )));
5008        }
5009        let idx = table
5010            .indices
5011            .iter()
5012            .find(|i| i.name == index_name)
5013            .ok_or_else(|| {
5014                StorageError::Corrupt(format!(
5015                    "freeze_oldest_to_cold: index {index_name:?} not found on {table_name:?}"
5016                ))
5017            })?;
5018        if !matches!(idx.kind, IndexKind::BTree(_)) {
5019            return Err(StorageError::Corrupt(format!(
5020                "freeze_oldest_to_cold: index {index_name:?} is NSW; only BTree indices may freeze"
5021            )));
5022        }
5023        let column_position = idx.column_position;
5024
5025        // --- segment build phase: reads only --------------------
5026        let schema = table.schema.clone();
5027        let mut to_freeze: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(max_rows);
5028        for row_idx in 0..max_rows {
5029            let row = table.rows.get(row_idx).expect("bounds-checked above");
5030            let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
5031                StorageError::Corrupt(format!(
5032                    "freeze_oldest_to_cold: row {row_idx} has NULL / non-key value in index column"
5033                ))
5034            })?;
5035            let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
5036                StorageError::Corrupt(format!(
5037                    "freeze_oldest_to_cold: index {index_name:?} column type is non-integer; \
5038                     v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
5039                ))
5040            })?;
5041            to_freeze.push((pk_u64, encode_row_body_dense(row, &schema), key));
5042        }
5043        // encode_segment requires ascending u64 keys. Sort by PK
5044        // before encoding; the caller's row-position order is not
5045        // necessarily PK order (e.g. workloads that insert random
5046        // PKs).
5047        to_freeze.sort_by_key(|(k, _, _)| *k);
5048        // Reject duplicate PKs — encode_segment also rejects them
5049        // (`SegmentError::UnsortedKey`), but the resulting error
5050        // message there is misleading. Surface a clearer one.
5051        for w in to_freeze.windows(2) {
5052            if w[0].0 == w[1].0 {
5053                return Err(StorageError::Corrupt(format!(
5054                    "freeze_oldest_to_cold: duplicate PK {} in freeze batch",
5055                    w[0].0
5056                )));
5057            }
5058        }
5059        // Snapshot the (key, locator) pairs that will be registered
5060        // post-swap. Cloning the IndexKey out before the move makes
5061        // the registration loop borrow-free.
5062        let post_swap_keys: Vec<IndexKey> = to_freeze.iter().map(|(_, _, k)| k.clone()).collect();
5063        // Segment encode is now infallible w.r.t. ordering. Map the
5064        // `SegmentError` into a `StorageError::Corrupt` so the
5065        // public surface stays one error type.
5066        let seg_rows: Vec<(u64, Vec<u8>)> = to_freeze
5067            .into_iter()
5068            .map(|(k, body, _)| (k, body))
5069            .collect();
5070        let frozen_rows = seg_rows.len();
5071        let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
5072            .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: encode: {e}")))?;
5073
5074        // --- atomic swap phase: mutations only past this point ---
5075        // v5.2.3 made `Table::rebuild_indices` preserve every Cold
5076        // locator across the per-table rebuild, so `delete_rows`
5077        // below no longer wipes prior-freeze cold entries. The pre-
5078        // v5.2.3 capture-then-re-register that used to live here
5079        // was removed in v5.3.1 — keeping it would double-count
5080        // every prior-frozen key's Cold locator on each subsequent
5081        // freeze.
5082        let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
5083        let positions: Vec<usize> = (0..max_rows).collect();
5084        let t_mut = self
5085            .get_mut(table_name)
5086            .expect("just validated; still present");
5087        let removed = t_mut.delete_rows(&positions);
5088        debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
5089        let bytes_after = t_mut.hot_bytes();
5090        let bytes_freed = bytes_before.saturating_sub(bytes_after);
5091
5092        let segment_id = self
5093            .load_segment_bytes(seg_bytes.clone())
5094            .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: load: {e}")))?;
5095        let new_cold = post_swap_keys.into_iter().map(|k| {
5096            (
5097                k,
5098                RowLocator::Cold {
5099                    segment_id,
5100                    page_offset: 0,
5101                },
5102            )
5103        });
5104        let t_mut = self.get_mut(table_name).expect("still present");
5105        t_mut.register_cold_locators(index_name, new_cold)?;
5106
5107        Ok(FreezeReport {
5108            segment_id,
5109            frozen_rows,
5110            bytes_freed,
5111            segment_bytes: seg_bytes,
5112        })
5113    }
5114
5115    /// v5.1: borrow the cold segment at `segment_id`. Used by the
5116    /// spg-server preload path to enumerate (key, locator) pairs
5117    /// after loading a segment, so it can call
5118    /// [`Table::register_cold_locators`] without re-parsing the
5119    /// bytes.
5120    #[must_use]
5121    pub fn cold_segment(&self, segment_id: u32) -> Option<&OwnedSegment> {
5122        self.cold_segments
5123            .get(segment_id as usize)
5124            .and_then(|s| s.as_deref())
5125    }
5126
5127    /// v5.1: resolve a single `RowLocator::Cold` to its underlying
5128    /// `Row`. Decoupled from [`Catalog::lookup_by_pk`] so callers
5129    /// iterating a multi-locator slice (e.g. the engine's index
5130    /// seek path) can dispatch per locator instead of getting back
5131    /// only the first row for a key. Returns `None` when the
5132    /// segment isn't registered, the key isn't `u64`-coercible, or
5133    /// the segment doesn't actually carry the key (bloom or page-
5134    /// index reject).
5135    pub fn resolve_cold_locator(
5136        &self,
5137        table_name: &str,
5138        segment_id: u32,
5139        key: &IndexKey,
5140    ) -> Option<Row> {
5141        let t = self.get(table_name)?;
5142        let u64_key = index_key_as_u64(key)?;
5143        let seg = self.cold_segments.get(segment_id as usize)?.as_ref()?;
5144        let payload = seg.lookup(u64_key)?;
5145        let (row, _) = decode_row_body_dense(&payload, &t.schema, seg.codec_version()).ok()?;
5146        Some(row)
5147    }
5148
5149    /// v5.1: indexed PK lookup that dispatches per locator,
5150    /// returning the first matching row from either the hot tier
5151    /// (`Table::rows`) or a registered cold segment.
5152    ///
5153    /// The cold path requires the index column to be coercible to
5154    /// a `u64` (the segment's PK type) and the segment payload to
5155    /// be a [`encode_row_body_dense`]-encoded row body for the
5156    /// same schema. v5.1 ships this for BIGINT / INT / SMALLINT
5157    /// PKs; other types fall through to hot-only behavior.
5158    ///
5159    /// Returns `None` if (a) the table or index doesn't exist,
5160    /// (b) the key isn't in the index at all, or (c) the key was
5161    /// resolved to a stale locator (Hot index out of range, Cold
5162    /// segment id unknown, segment lookup miss). Does not surface
5163    /// segment-decode errors — those would indicate corrupted
5164    /// cold-tier files and should be caught at
5165    /// [`Catalog::load_segment_bytes`] time.
5166    pub fn lookup_by_pk(&self, table: &str, index_name: &str, key: &IndexKey) -> Option<Row> {
5167        let t = self.get(table)?;
5168        let idx = t.indices.iter().find(|i| i.name == index_name)?;
5169        let locators = idx.lookup_eq(key);
5170        let cold_u64_key = index_key_as_u64(key);
5171        for loc in locators {
5172            match *loc {
5173                RowLocator::Hot(i) => {
5174                    if let Some(row) = t.rows.get(i) {
5175                        return Some(row.clone());
5176                    }
5177                }
5178                RowLocator::Cold {
5179                    segment_id,
5180                    page_offset: _,
5181                } => {
5182                    let Some(u64_key) = cold_u64_key else {
5183                        // Key type not coercible to u64 — cold tier
5184                        // only handles BIGINT/INT/SMALLINT in v5.1.
5185                        continue;
5186                    };
5187                    let Some(seg) = self
5188                        .cold_segments
5189                        .get(segment_id as usize)
5190                        .and_then(|s| s.as_deref())
5191                    else {
5192                        // v6.7.3 — `None` slot = compaction
5193                        // retired this segment; the live locator
5194                        // on a freshly-compacted index points to
5195                        // the merged segment_id, so a Cold hit
5196                        // here against a tombstone means the BTree
5197                        // entry hasn't been swapped yet (mid-
5198                        // compaction reader race) or the caller is
5199                        // looking up a stale snapshot. Skip — the
5200                        // next locator in the list, if any, is
5201                        // typically the merged segment.
5202                        continue;
5203                    };
5204                    let Some(payload) = seg.lookup(u64_key) else {
5205                        continue;
5206                    };
5207                    let (row, _) =
5208                        decode_row_body_dense(&payload, &t.schema, seg.codec_version()).ok()?;
5209                    return Some(row);
5210                }
5211            }
5212        }
5213        None
5214    }
5215
5216    /// v5.2.3: promote a frozen row back to the hot tier so an
5217    /// UPDATE / DELETE can mutate it. Reads the cold-tier row body
5218    /// (decoded from its registered segment), pushes it into
5219    /// `table.rows` via [`Table::insert`] (which also adds a fresh
5220    /// `Hot(new_idx)` locator on `index_name`), then retires the
5221    /// shadowed `Cold` locator via
5222    /// [`Table::remove_cold_locators_for_key`]. The cold-tier row
5223    /// in the segment file becomes garbage — recoverable when a
5224    /// future cold-segment compaction job lands.
5225    ///
5226    /// Returns:
5227    /// - `Ok(Some(new_hot_idx))` when the key resolved through a
5228    ///   cold locator and the promote completed. `new_hot_idx` is
5229    ///   the position the row now occupies in `table.rows`.
5230    /// - `Ok(None)` when the key has no Cold locator on the index
5231    ///   (already hot, or wasn't present at all). Callers treat this
5232    ///   as "nothing to do here, fall back to the hot-only path".
5233    ///
5234    /// Errors when the table / index doesn't exist, the index isn't
5235    /// `BTree`, the cold segment is missing / can't decode the row,
5236    /// or the inferred row body fails `Table::insert` validation.
5237    pub fn promote_cold_row(
5238        &mut self,
5239        table_name: &str,
5240        index_name: &str,
5241        key: &IndexKey,
5242    ) -> Result<Option<usize>, StorageError> {
5243        let cold_loc = self.find_cold_locator(table_name, index_name, key)?;
5244        let Some((segment_id, _page_offset)) = cold_loc else {
5245            return Ok(None);
5246        };
5247        let u64_key = index_key_as_u64(key).ok_or_else(|| {
5248            StorageError::Corrupt(
5249                "promote_cold_row: key type not coercible to u64 (cold tier requires integer PK)"
5250                    .into(),
5251            )
5252        })?;
5253        // Read the row body from the segment. Borrow the segment +
5254        // schema short-term so we can then take `&mut self` for the
5255        // hot-side insert.
5256        let schema = self
5257            .get(table_name)
5258            .ok_or_else(|| {
5259                StorageError::Corrupt(format!("promote_cold_row: table {table_name:?} not found"))
5260            })?
5261            .schema
5262            .clone();
5263        let seg = self
5264            .cold_segments
5265            .get(segment_id as usize)
5266            .and_then(|s| s.as_ref())
5267            .ok_or_else(|| {
5268                StorageError::Corrupt(format!(
5269                    "promote_cold_row: segment {segment_id} not registered on catalog"
5270                ))
5271            })?;
5272        let payload = seg.lookup(u64_key).ok_or_else(|| {
5273            StorageError::Corrupt(format!(
5274                "promote_cold_row: key {u64_key} resolves to segment {segment_id} \
5275                 but the segment's bloom/page lookup didn't return a row"
5276            ))
5277        })?;
5278        let (row, _consumed) = decode_row_body_dense(&payload, &schema, seg.codec_version())?;
5279        // Insert the promoted row into the hot tier. `Table::insert`
5280        // appends to `self.rows`, adds a `Hot(new_idx)` locator to
5281        // every BTree index covering the row's keyed columns, and
5282        // increments `hot_bytes`.
5283        let t = self
5284            .get_mut(table_name)
5285            .expect("table existed at lookup time");
5286        t.insert(row)?;
5287        let new_hot_idx =
5288            t.rows.len().checked_sub(1).ok_or_else(|| {
5289                StorageError::Corrupt("promote_cold_row: empty after insert".into())
5290            })?;
5291        // The hot insert added Hot(new_idx) alongside the still-
5292        // present Cold locator. Drop the Cold entry so future
5293        // lookups return only the fresh hot row.
5294        t.remove_cold_locators_for_key(index_name, key)?;
5295        Ok(Some(new_hot_idx))
5296    }
5297
5298    /// v5.2.3: shadow a frozen row's index entry. Used by DELETE
5299    /// when the row to remove lives in a cold-tier segment — the
5300    /// row body stays in the segment file (becoming garbage) but
5301    /// every `Cold` locator for `key` on `index_name` is removed
5302    /// so PK lookups stop returning it.
5303    ///
5304    /// Returns the number of cold locators retired (0 when the key
5305    /// has no cold entries — the DELETE fell on a hot row or a
5306    /// key that was already absent). Errors when the table /
5307    /// index doesn't exist or the index isn't `BTree`.
5308    ///
5309    /// Cold-segment compaction (which merges shadowed-heavy
5310    /// segments and reclaims their disk footprint) lands in a
5311    /// later v5.x sub-version; until then, repeated UPDATE/DELETE
5312    /// of cold rows can amplify cold-segment disk usage by up to
5313    /// 1-2× — still well under typical LSM-tree shadowing because
5314    /// SPG segments are bulk-baked, not write-merged.
5315    pub fn shadow_cold_row(
5316        &mut self,
5317        table_name: &str,
5318        index_name: &str,
5319        key: &IndexKey,
5320    ) -> Result<usize, StorageError> {
5321        let t = self.get_mut(table_name).ok_or_else(|| {
5322            StorageError::Corrupt(format!("shadow_cold_row: table {table_name:?} not found"))
5323        })?;
5324        t.remove_cold_locators_for_key(index_name, key)
5325    }
5326
5327    /// v6.7.4 — read-only slice preparation for the parallel
5328    /// freezer. Walks rows in `row_range`, builds the
5329    /// `(pk_u64, encoded_body, IndexKey)` triples that the
5330    /// coordinator's k-way merge consumes, sorts the slice by
5331    /// `pk_u64`, and returns a [`FreezeSlice`].
5332    ///
5333    /// Caller invariants:
5334    /// - `row_range.end <= table.rows.len()` (caller's job to
5335    ///   compute the partition).
5336    /// - All slices passed to `commit_freeze_slices` must cover a
5337    ///   contiguous half-open range `[0, total_max_rows)` with no
5338    ///   gaps and no overlaps. The coordinator validates this
5339    ///   invariant before committing.
5340    ///
5341    /// `&self`-only — multiple workers can run this concurrently
5342    /// against the same `Catalog` reference under the engine's
5343    /// write lock (workers don't mutate; the coordinator does).
5344    pub fn prepare_freeze_slice(
5345        &self,
5346        table_name: &str,
5347        index_name: &str,
5348        row_range: core::ops::Range<usize>,
5349    ) -> Result<FreezeSlice, StorageError> {
5350        let table = self.get(table_name).ok_or_else(|| {
5351            StorageError::Corrupt(format!(
5352                "prepare_freeze_slice: table {table_name:?} not found"
5353            ))
5354        })?;
5355        let idx = table
5356            .indices
5357            .iter()
5358            .find(|i| i.name == index_name)
5359            .ok_or_else(|| {
5360                StorageError::Corrupt(format!(
5361                    "prepare_freeze_slice: index {index_name:?} not found on {table_name:?}"
5362                ))
5363            })?;
5364        if !matches!(idx.kind, IndexKind::BTree(_)) {
5365            return Err(StorageError::Corrupt(format!(
5366                "prepare_freeze_slice: index {index_name:?} is NSW; only BTree indices may freeze"
5367            )));
5368        }
5369        if row_range.end > table.rows.len() {
5370            return Err(StorageError::Corrupt(format!(
5371                "prepare_freeze_slice: row_range end {} > row_count {}",
5372                row_range.end,
5373                table.rows.len()
5374            )));
5375        }
5376        let column_position = idx.column_position;
5377        let schema = table.schema.clone();
5378        let mut rows: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(row_range.len());
5379        for row_idx in row_range.clone() {
5380            let row = table.rows.get(row_idx).expect("bounds-checked above");
5381            let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
5382                StorageError::Corrupt(format!(
5383                    "prepare_freeze_slice: row {row_idx} has NULL / non-key value in index column"
5384                ))
5385            })?;
5386            let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
5387                StorageError::Corrupt(format!(
5388                    "prepare_freeze_slice: index {index_name:?} column type is non-integer; \
5389                     v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
5390                ))
5391            })?;
5392            rows.push((pk_u64, encode_row_body_dense(row, &schema), key));
5393        }
5394        rows.sort_by_key(|(k, _, _)| *k);
5395        Ok(FreezeSlice { row_range, rows })
5396    }
5397
5398    /// v6.7.4 — coordinator commit step. Merges N
5399    /// [`FreezeSlice`]s into one segment via the standard
5400    /// [`encode_segment`] path, atomically swaps the catalog
5401    /// state (delete the union row range + register Cold
5402    /// locators + load the segment).
5403    ///
5404    /// Validates that the slices cover a contiguous, gap-free,
5405    /// overlap-free half-open range starting at index 0 (the
5406    /// freezer always freezes "oldest first" — same semantics as
5407    /// the single-threaded [`Catalog::freeze_oldest_to_cold`]).
5408    ///
5409    /// Empty `slices` → no-op success (returns a zero-row report
5410    /// without mutating). Total row count = `Σ slice.rows.len()`.
5411    pub fn commit_freeze_slices(
5412        &mut self,
5413        table_name: &str,
5414        index_name: &str,
5415        slices: Vec<FreezeSlice>,
5416    ) -> Result<FreezeReport, StorageError> {
5417        // --- validation phase: never mutates ---------------------
5418        let table = self.get(table_name).ok_or_else(|| {
5419            StorageError::Corrupt(format!(
5420                "commit_freeze_slices: table {table_name:?} not found"
5421            ))
5422        })?;
5423        let idx = table
5424            .indices
5425            .iter()
5426            .find(|i| i.name == index_name)
5427            .ok_or_else(|| {
5428                StorageError::Corrupt(format!(
5429                    "commit_freeze_slices: index {index_name:?} not found on {table_name:?}"
5430                ))
5431            })?;
5432        if !matches!(idx.kind, IndexKind::BTree(_)) {
5433            return Err(StorageError::Corrupt(format!(
5434                "commit_freeze_slices: index {index_name:?} is NSW; only BTree indices may freeze"
5435            )));
5436        }
5437        // Validate slice coverage: contiguous from 0, no gaps, no
5438        // overlaps. Allow the caller to pass slices in any order —
5439        // sort by row_range.start first.
5440        let mut ordered = slices;
5441        ordered.sort_by_key(|s| s.row_range.start);
5442        // Drop fully-empty slices that fell out of an uneven
5443        // partition; they carry no data but contribute to the
5444        // contiguity check, so keep them in line.
5445        let mut expected_start = 0usize;
5446        for s in &ordered {
5447            if s.row_range.start != expected_start {
5448                return Err(StorageError::Corrupt(format!(
5449                    "commit_freeze_slices: gap/overlap at row {}; expected start {}",
5450                    s.row_range.start, expected_start
5451                )));
5452            }
5453            expected_start = s.row_range.end;
5454        }
5455        let max_rows = expected_start;
5456        if max_rows > table.rows.len() {
5457            return Err(StorageError::Corrupt(format!(
5458                "commit_freeze_slices: total row range {} exceeds row_count {}",
5459                max_rows,
5460                table.rows.len()
5461            )));
5462        }
5463        if max_rows == 0 {
5464            return Ok(FreezeReport {
5465                segment_id: u32::MAX,
5466                frozen_rows: 0,
5467                bytes_freed: 0,
5468                segment_bytes: Vec::new(),
5469            });
5470        }
5471
5472        // --- segment build phase: reads only --------------------
5473        // K-way merge of already-sorted slices. Each slice's rows
5474        // are ascending by pk_u64; we keep a per-slice cursor and
5475        // pull the next-smallest head until every cursor drains.
5476        let total_rows: usize = ordered.iter().map(|s| s.rows.len()).sum();
5477        if total_rows != max_rows {
5478            return Err(StorageError::Corrupt(format!(
5479                "commit_freeze_slices: total slice rows {total_rows} ≠ row_range coverage {max_rows}"
5480            )));
5481        }
5482        let mut cursors: Vec<usize> = alloc::vec![0; ordered.len()];
5483        let mut merged: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(total_rows);
5484        loop {
5485            // Pick the slice whose head row has the smallest key
5486            // and isn't yet exhausted.
5487            let mut pick: Option<usize> = None;
5488            for (i, c) in cursors.iter().enumerate() {
5489                let slice = &ordered[i];
5490                if *c >= slice.rows.len() {
5491                    continue;
5492                }
5493                match pick {
5494                    None => pick = Some(i),
5495                    Some(j) => {
5496                        if slice.rows[*c].0 < ordered[j].rows[cursors[j]].0 {
5497                            pick = Some(i);
5498                        }
5499                    }
5500                }
5501            }
5502            let Some(i) = pick else { break };
5503            let row = ordered[i].rows[cursors[i]].clone();
5504            cursors[i] += 1;
5505            merged.push(row);
5506        }
5507        // Reject duplicate PKs — same error as the single-threaded
5508        // path so callers get a uniform surface.
5509        for w in merged.windows(2) {
5510            if w[0].0 == w[1].0 {
5511                return Err(StorageError::Corrupt(format!(
5512                    "commit_freeze_slices: duplicate PK {} across slices",
5513                    w[0].0
5514                )));
5515            }
5516        }
5517        let post_swap_keys: Vec<IndexKey> = merged.iter().map(|(_, _, k)| k.clone()).collect();
5518        let seg_rows: Vec<(u64, Vec<u8>)> =
5519            merged.into_iter().map(|(k, body, _)| (k, body)).collect();
5520        let frozen_rows = seg_rows.len();
5521        let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
5522            .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: encode: {e}")))?;
5523
5524        // --- atomic swap phase: mutations only past this point ---
5525        let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
5526        let positions: Vec<usize> = (0..max_rows).collect();
5527        let t_mut = self
5528            .get_mut(table_name)
5529            .expect("just validated; still present");
5530        let removed = t_mut.delete_rows(&positions);
5531        debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
5532        let bytes_after = t_mut.hot_bytes();
5533        let bytes_freed = bytes_before.saturating_sub(bytes_after);
5534
5535        let segment_id = self
5536            .load_segment_bytes(seg_bytes.clone())
5537            .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: load: {e}")))?;
5538        let new_cold = post_swap_keys.into_iter().map(|k| {
5539            (
5540                k,
5541                RowLocator::Cold {
5542                    segment_id,
5543                    page_offset: 0,
5544                },
5545            )
5546        });
5547        let t_mut = self.get_mut(table_name).expect("still present");
5548        t_mut.register_cold_locators(index_name, new_cold)?;
5549
5550        Ok(FreezeReport {
5551            segment_id,
5552            frozen_rows,
5553            bytes_freed,
5554            segment_bytes: seg_bytes,
5555        })
5556    }
5557
5558    /// v6.7.3 — compact every cold segment on `(table, index)` whose
5559    /// `OwnedSegment::bytes().len()` is below `target_segment_bytes`
5560    /// into a single larger merged segment. Rows present in source
5561    /// segment payloads but no longer referenced by any
5562    /// `RowLocator::Cold` on the index (DELETE'd + frozen rows
5563    /// retired via [`Catalog::shadow_cold_row`]) are GC'd in the
5564    /// merge.
5565    ///
5566    /// **Semantics**:
5567    /// 1. Walk the BTree index to collect every Cold locator that
5568    ///    targets a small (< threshold) segment. Each such
5569    ///    `(key, segment_id)` becomes a row in the merged segment;
5570    ///    payload is looked up from the source segment in-place.
5571    /// 2. Encode the collected rows into one new segment via
5572    ///    [`encode_segment`]; register it via
5573    ///    [`Catalog::load_segment_bytes`] (allocating a fresh
5574    ///    `merged_segment_id` at the end of `cold_segments`).
5575    /// 3. Rewrite the BTree index in one pass: every
5576    ///    `RowLocator::Cold { segment_id ∈ sources }` becomes
5577    ///    `RowLocator::Cold { segment_id = merged_id, page_offset = 0 }`.
5578    ///    Hot locators are untouched.
5579    /// 4. Tombstone every source slot via
5580    ///    [`Catalog::tombstone_segment`]. Source segment payloads
5581    ///    are no longer reachable through the catalog; the on-disk
5582    ///    files are the caller's concern.
5583    ///
5584    /// On fewer than 2 candidate segments the catalog is **not**
5585    /// mutated and a no-op report (`merged_segment_id: None`,
5586    /// `sources: []`) is returned. This is the routine case — a
5587    /// freshly-frozen table has at most 1 small segment, no merge
5588    /// possible.
5589    ///
5590    /// Atomicity: every mutating step runs after the read-only
5591    /// gather phase, so a panic before the merge encode leaves the
5592    /// catalog unchanged. The mutation block itself (load + rewrite +
5593    /// tombstone) takes only `&mut self` — callers serialise the
5594    /// engine write lock outside this function.
5595    ///
5596    /// Errors when the table / index doesn't exist, the index isn't
5597    /// `BTree`, the index column type isn't u64-coercible (cold-tier
5598    /// pre-condition), or a source segment fails its in-place
5599    /// row-body lookup (would indicate prior catalog corruption).
5600    pub fn compact_cold_segments(
5601        &mut self,
5602        table_name: &str,
5603        index_name: &str,
5604        target_segment_bytes: u64,
5605    ) -> Result<CompactReport, StorageError> {
5606        // --- validation phase ----------------------------------
5607        let t = self.get(table_name).ok_or_else(|| {
5608            StorageError::Corrupt(format!(
5609                "compact_cold_segments: table {table_name:?} not found"
5610            ))
5611        })?;
5612        let idx = t
5613            .indices
5614            .iter()
5615            .find(|i| i.name == index_name)
5616            .ok_or_else(|| {
5617                StorageError::Corrupt(format!(
5618                    "compact_cold_segments: index {index_name:?} not found on {table_name:?}"
5619                ))
5620            })?;
5621        let map = match &idx.kind {
5622            IndexKind::BTree(m) => m,
5623            IndexKind::Nsw(_)
5624            | IndexKind::Brin { .. }
5625            | IndexKind::Gin(_)
5626            | IndexKind::GinTrgm(_)
5627            | IndexKind::GinFulltext(_) => {
5628                return Err(StorageError::Corrupt(format!(
5629                    "compact_cold_segments: index {index_name:?} is not BTree; \
5630                     compaction applies only to BTree cold-tier indices"
5631                )));
5632            }
5633        };
5634
5635        // --- gather phase --------------------------------------
5636        // Step A: every segment_id this BTree index Cold-references.
5637        let mut referenced_ids: BTreeSet<u32> = BTreeSet::new();
5638        for (_key, locators) in map.iter() {
5639            for loc in locators {
5640                if let RowLocator::Cold { segment_id, .. } = loc {
5641                    referenced_ids.insert(*segment_id);
5642                }
5643            }
5644        }
5645        // Step B: keep only the small + still-active ones.
5646        let candidate_set: BTreeSet<u32> = referenced_ids
5647            .into_iter()
5648            .filter(|id| {
5649                self.cold_segments
5650                    .get(*id as usize)
5651                    .and_then(|s| s.as_deref())
5652                    .is_some_and(|s| (s.bytes().len() as u64) < target_segment_bytes)
5653            })
5654            .collect();
5655        if candidate_set.len() < 2 {
5656            return Ok(CompactReport {
5657                sources: Vec::new(),
5658                merged_segment_id: None,
5659                merged_segment_bytes: Vec::new(),
5660                merged_rows: 0,
5661                deleted_rows_pruned: 0,
5662                bytes_reclaimed_estimate: 0,
5663            });
5664        }
5665        // Step C: pre-count source rows for the deleted-pruned metric.
5666        let mut source_row_count: usize = 0;
5667        let mut source_byte_total: u64 = 0;
5668        for &id in &candidate_set {
5669            let seg = self.cold_segments[id as usize]
5670                .as_ref()
5671                .expect("candidate selected only when slot is Some");
5672            source_row_count = source_row_count.saturating_add(seg.meta().num_rows as usize);
5673            source_byte_total = source_byte_total.saturating_add(seg.bytes().len() as u64);
5674        }
5675        // Step D: collect (key, body) pairs from every live Cold
5676        // locator pointing at a candidate. dedupe by key — one
5677        // BTree key resolves to at most one cold payload (the
5678        // freezer + promote/shadow flow keeps Cold locators
5679        // unique per key).
5680        let mut collected: BTreeMap<u64, (Vec<u8>, IndexKey)> = BTreeMap::new();
5681        for (key, locators) in map.iter() {
5682            for loc in locators {
5683                let RowLocator::Cold { segment_id, .. } = loc else {
5684                    continue;
5685                };
5686                if !candidate_set.contains(segment_id) {
5687                    continue;
5688                }
5689                let u64_key = index_key_as_u64(key).ok_or_else(|| {
5690                    StorageError::Corrupt(format!(
5691                        "compact_cold_segments: index {index_name:?} has non-integer Cold key; \
5692                         cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
5693                    ))
5694                })?;
5695                let seg = self.cold_segments[*segment_id as usize]
5696                    .as_ref()
5697                    .expect("candidate slot guaranteed Some above");
5698                let payload = seg.lookup(u64_key).ok_or_else(|| {
5699                    StorageError::Corrupt(format!(
5700                        "compact_cold_segments: BTree {index_name:?} points key={u64_key} \
5701                         at segment {segment_id} but the segment lookup missed"
5702                    ))
5703                })?;
5704                collected.insert(u64_key, (payload, key.clone()));
5705                break;
5706            }
5707        }
5708        let merged_rows = collected.len();
5709        let deleted_rows_pruned = source_row_count.saturating_sub(merged_rows);
5710
5711        // Step E: encode the merged segment. `BTreeMap<u64, _>`
5712        // iteration is ascending by key, which is what
5713        // `encode_segment` requires.
5714        let seg_rows: Vec<(u64, Vec<u8>)> = collected
5715            .iter()
5716            .map(|(k, (body, _))| (*k, body.clone()))
5717            .collect();
5718        let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
5719            .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: encode: {e}")))?;
5720        let merged_bytes_len = seg_bytes.len() as u64;
5721
5722        // --- atomic mutation phase ------------------------------
5723        let merged_segment_id = self
5724            .load_segment_bytes(seg_bytes.clone())
5725            .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: load: {e}")))?;
5726
5727        // Rewrite the BTree index: every Cold locator pointing at
5728        // a candidate source becomes a Cold locator pointing at
5729        // the merged segment. Use a flat collect-then-replace
5730        // pattern so we never hold a `&self` borrow across the
5731        // `&mut self` write.
5732        let entries: Vec<(IndexKey, Vec<RowLocator>)> = {
5733            let t = self
5734                .get(table_name)
5735                .expect("table existed at the start of this fn");
5736            let idx = t
5737                .indices
5738                .iter()
5739                .find(|i| i.name == index_name)
5740                .expect("index existed at the start of this fn");
5741            let IndexKind::BTree(map) = &idx.kind else {
5742                unreachable!("validated above");
5743            };
5744            map.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
5745        };
5746        let t_mut = self
5747            .get_mut(table_name)
5748            .expect("table existed at the start of this fn");
5749        let idx_mut = t_mut
5750            .indices
5751            .iter_mut()
5752            .find(|i| i.name == index_name)
5753            .expect("index existed at the start of this fn");
5754        let IndexKind::BTree(map_mut) = &mut idx_mut.kind else {
5755            unreachable!("validated above");
5756        };
5757        for (key, locators) in entries {
5758            let mut new_locs: Vec<RowLocator> = Vec::with_capacity(locators.len());
5759            let mut changed = false;
5760            for loc in &locators {
5761                match *loc {
5762                    RowLocator::Cold {
5763                        segment_id,
5764                        page_offset: _,
5765                    } if candidate_set.contains(&segment_id) => {
5766                        let replacement = RowLocator::Cold {
5767                            segment_id: merged_segment_id,
5768                            page_offset: 0,
5769                        };
5770                        if !new_locs.contains(&replacement) {
5771                            new_locs.push(replacement);
5772                        }
5773                        changed = true;
5774                    }
5775                    other => new_locs.push(other),
5776                }
5777            }
5778            if changed {
5779                map_mut.insert_mut(key, new_locs);
5780            }
5781        }
5782
5783        // Tombstone every source slot. Last step — failures here
5784        // would leave the segment double-referenced in both
5785        // memory + manifest, but `tombstone_segment` only errors
5786        // on out-of-bounds, which we've already validated.
5787        for &id in &candidate_set {
5788            self.tombstone_segment(id)?;
5789        }
5790
5791        let bytes_reclaimed_estimate = source_byte_total.saturating_sub(merged_bytes_len);
5792        Ok(CompactReport {
5793            sources: candidate_set.into_iter().collect(),
5794            merged_segment_id: Some(merged_segment_id),
5795            merged_segment_bytes: seg_bytes,
5796            merged_rows,
5797            deleted_rows_pruned,
5798            bytes_reclaimed_estimate,
5799        })
5800    }
5801
5802    /// Internal helper: scan `(table, index)` for a `Cold` locator
5803    /// keyed by `key`. Returns `Ok(Some((segment_id, page_offset)))`
5804    /// when found, `Ok(None)` when the key has only hot entries
5805    /// or no entries at all, `Err` on the same input-validation
5806    /// errors as the public `promote_cold_row` / `shadow_cold_row`.
5807    fn find_cold_locator(
5808        &self,
5809        table_name: &str,
5810        index_name: &str,
5811        key: &IndexKey,
5812    ) -> Result<Option<(u32, u32)>, StorageError> {
5813        let t = self.get(table_name).ok_or_else(|| {
5814            StorageError::Corrupt(format!("find_cold_locator: table {table_name:?} not found"))
5815        })?;
5816        let idx = t
5817            .indices
5818            .iter()
5819            .find(|i| i.name == index_name)
5820            .ok_or_else(|| {
5821                StorageError::Corrupt(format!(
5822                    "find_cold_locator: index {index_name:?} not found on {table_name:?}"
5823                ))
5824            })?;
5825        if !matches!(idx.kind, IndexKind::BTree(_)) {
5826            return Err(StorageError::Corrupt(format!(
5827                "find_cold_locator: index {index_name:?} is NSW; promote-on-write only applies to BTree indices"
5828            )));
5829        }
5830        for loc in idx.lookup_eq(key) {
5831            if let RowLocator::Cold {
5832                segment_id,
5833                page_offset,
5834            } = *loc
5835            {
5836                return Ok(Some((segment_id, page_offset)));
5837            }
5838        }
5839        Ok(None)
5840    }
5841}
5842
5843/// Coerce an [`IndexKey`] to the `u64` that v5.1 cold-tier
5844/// segments use as their on-disk PK. Returns `None` for keys that
5845/// aren't representable as `u64` — Text PKs need a hash mapping
5846/// the segment writer baked in (deferred to v5.2+), Bool PKs are
5847/// almost never wide enough to be sharded into a cold tier.
5848fn index_key_as_u64(key: &IndexKey) -> Option<u64> {
5849    match key {
5850        // Reinterpret the i64 bit pattern as u64. Cold-tier segments
5851        // are sorted by this u64 view, so the chosen interpretation
5852        // only has to match between insert (bake_segment / freezer)
5853        // and lookup — using cast_unsigned keeps both sides honest
5854        // and silences clippy::cast_sign_loss.
5855        IndexKey::Int(n) => Some(n.cast_unsigned()),
5856        // Text / Bool / Uuid PKs aren't representable as u64 and so
5857        // can't participate in the u64-sorted cold-tier segment
5858        // PK layout. Same deferral story as Text — lookup falls
5859        // through the in-memory btree.
5860        IndexKey::Text(_) | IndexKey::Bool(_) | IndexKey::Uuid(_) => None,
5861    }
5862}
5863
5864#[derive(Debug, Clone, PartialEq, Eq)]
5865#[non_exhaustive]
5866pub enum StorageError {
5867    DuplicateTable {
5868        name: String,
5869    },
5870    TableNotFound {
5871        name: String,
5872    },
5873    ArityMismatch {
5874        expected: usize,
5875        actual: usize,
5876    },
5877    TypeMismatch {
5878        column: String,
5879        expected: DataType,
5880        actual: DataType,
5881        position: usize,
5882    },
5883    NullInNotNull {
5884        column: String,
5885    },
5886    /// Index with this name already exists on the table.
5887    DuplicateIndex {
5888        name: String,
5889    },
5890    /// Column referenced by an index doesn't exist on the table.
5891    ColumnNotFound {
5892        column: String,
5893    },
5894    /// On-disk format failed to parse — corrupted file, wrong magic, truncated
5895    /// payload, or unknown tag bytes.
5896    Corrupt(String),
5897    /// v6.0.4 — ALTER INDEX targeted an index name that doesn't
5898    /// exist on any table in this catalog.
5899    IndexNotFound {
5900        name: String,
5901    },
5902    /// v6.0.4 — operation requested isn't supported on this index
5903    /// kind / column type (e.g. ALTER INDEX REBUILD on a `BTree`
5904    /// index, or REBUILD WITH (encoding=…) on a non-vector column).
5905    Unsupported(String),
5906}
5907
5908impl fmt::Display for StorageError {
5909    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
5910        match self {
5911            Self::DuplicateTable { name } => write!(f, "table already exists: {name}"),
5912            Self::TableNotFound { name } => write!(f, "table not found: {name}"),
5913            Self::ArityMismatch { expected, actual } => write!(
5914                f,
5915                "row arity mismatch: expected {expected} columns, got {actual}"
5916            ),
5917            Self::TypeMismatch {
5918                column,
5919                expected,
5920                actual,
5921                position,
5922            } => write!(
5923                f,
5924                "type mismatch in column {column:?} (position {position}): expected {expected}, got {actual}"
5925            ),
5926            Self::NullInNotNull { column } => {
5927                write!(f, "NULL value in NOT NULL column {column:?}")
5928            }
5929            Self::DuplicateIndex { name } => write!(f, "index already exists: {name}"),
5930            Self::ColumnNotFound { column } => write!(f, "column not found: {column}"),
5931            Self::Corrupt(detail) => write!(f, "corrupt on-disk format: {detail}"),
5932            Self::IndexNotFound { name } => write!(f, "index not found: {name}"),
5933            Self::Unsupported(detail) => write!(f, "unsupported: {detail}"),
5934        }
5935    }
5936}
5937
5938impl ColumnSchema {
5939    pub fn new(name: impl Into<String>, ty: DataType, nullable: bool) -> Self {
5940        Self {
5941            name: name.into(),
5942            ty,
5943            nullable,
5944            default: None,
5945            runtime_default: None,
5946            auto_increment: false,
5947            user_enum_type: None,
5948            user_domain_type: None,
5949            on_update_runtime: None,
5950            collation: Collation::Binary,
5951            is_unsigned: false,
5952            inline_enum_variants: None,
5953            inline_set_variants: None,
5954        }
5955    }
5956
5957    /// Builder-style helper to attach a default value to an otherwise
5958    /// plain column schema. Used by the engine when CREATE TABLE
5959    /// specifies `column TYPE DEFAULT <expr>`.
5960    #[must_use]
5961    pub fn with_default(mut self, default: Value) -> Self {
5962        self.default = Some(default);
5963        self
5964    }
5965
5966    /// v7.9.21 — builder for runtime-evaluated defaults
5967    /// (`DEFAULT now()`, `DEFAULT CURRENT_TIMESTAMP`, …).
5968    /// `expr` is the Expr's `Display` form, re-parsed by the
5969    /// engine at each INSERT.
5970    #[must_use]
5971    pub fn with_runtime_default(mut self, expr: impl Into<String>) -> Self {
5972        self.runtime_default = Some(expr.into());
5973        self
5974    }
5975
5976    /// Builder-style helper to mark a column as `AUTO_INCREMENT`.
5977    #[must_use]
5978    pub const fn with_auto_increment(mut self) -> Self {
5979        self.auto_increment = true;
5980        self
5981    }
5982}
5983
5984impl TableSchema {
5985    pub fn new(name: impl Into<String>, columns: Vec<ColumnSchema>) -> Self {
5986        Self {
5987            name: name.into(),
5988            columns,
5989            hot_tier_bytes: None,
5990            foreign_keys: Vec::new(),
5991            uniqueness_constraints: Vec::new(),
5992            checks: Vec::new(),
5993        }
5994    }
5995}
5996
5997// =========================================================================
5998// Persistent binary format for the catalog.
5999//
6000// Layout (little-endian throughout):
6001//
6002//   [magic "SPGDB001" 8 bytes][version u8]
6003//   [table_count u32]
6004//   for each table:
6005//       [name_len u16][name bytes]
6006//       [col_count u16]
6007//       for each col:
6008//           [name_len u16][name bytes]
6009//           [type_tag u8 + optional payload]
6010//               1=Int 2=BigInt 3=Float 4=Text 5=Bool
6011//               6=Vector(u32 dim)
6012//               7=SmallInt
6013//               8=Varchar(u32 max)
6014//               9=Char(u32 size)
6015//               10=Numeric(u8 precision, u8 scale)
6016//               11=Date
6017//               12=Timestamp
6018//           [nullable u8]   0/1
//           [default_tag u8] 0=none 1=value (followed by [value_tag u8] + bytes)
//           [auto_increment u8] 0/1   (format version 4+)
6020//       [row_count u32]
6021//       for each row, for each col, one [value_tag u8] + value bytes:
6022//           tag 0 (Null)     → no body
6023//           tag 1 (Int)      → i32 LE
6024//           tag 2 (BigInt)   → i64 LE
6025//           tag 3 (Float)    → f64 LE
6026//           tag 4 (Text)     → u16 LE len + UTF-8 bytes
6027//           tag 5 (Bool)     → u8 0/1
6028//           tag 6 (Vector)   → u32 LE dim + dim×f32 LE
6029//           tag 7 (SmallInt) → i16 LE
6030//           tag 8 (Numeric)  → i128 LE (16 bytes) + u8 scale
6031//           tag 9 (Date)     → i32 LE (days since Unix epoch)
6032//           tag 10 (Timestamp) → i64 LE (microseconds since Unix epoch)
6033//
6034// Bumped to version 3 when NUMERIC was added; to version 4 when
6035// AUTO_INCREMENT (per-column flag) + NSW index `kind` byte landed;
6036// to version 5 when DATE / TIMESTAMP were added; to version 6 when
6037// NSW graph topology started travelling on disk (v2.7); to version 7
6038// when the NSW topology became multi-layer HNSW (v2.13); to version 8
6039// when row encoding switched to schema-driven dense layout (v3.0.2 —
6040// per-row NULL bitmap + per-column fixed-width body, no per-cell type
6041// tag).
6042// =========================================================================
6043
/// Eight-byte magic that opens every serialized catalog; written
/// first by [`Catalog::serialize`], immediately before the version
/// byte.
const FILE_MAGIC: &[u8; 8] = b"SPGDB001";
6045/// Current catalog snapshot format version emitted by [`Catalog::serialize`].
6046///
6047/// v9 (v5.2) extends v8 by serialising `BTree` index entries directly — every
6048/// `(IndexKey, Vec<RowLocator>)` pair travels on disk with the v5.1
6049/// `RowLocator::write_le` tag-prefixed codec. v8 `BTree` indices stored no
6050/// entries at all (the map was rebuilt from `Table::rows` on load); v9
6051/// preserves on-disk Cold locators so freezer-produced cold-tier index
/// entries survive a catalog snapshot round-trip. v8 snapshots are accepted
/// by version dispatch in [`Catalog::deserialize`] — every entry decodes
/// as `RowLocator::Hot(_)` via `add_index` rebuild, identical to v5.1
/// behaviour.
///
6056/// v6.7.2 — bumped from 10 to 11 to append per-table
6057/// `hot_tier_bytes: Option<u64>` after the per-table indices
6058/// section. v10 catalogs (v6.7.1) load with `hot_tier_bytes =
6059/// None` for every table (the deserialiser short-circuits when
/// version < 11). v11 snapshots opened by a pre-v6.7.2 binary
/// fail loudly at the version check, matching the v6.1.2 /
/// v6.1.4 / v6.2.0 / v6.7.1 envelope-bump upgrade fences.
6063///
6064/// v6.8.0 — bumped from 11 to 12: per-index
6065/// `included_columns: Vec<u16>` appended at the tail of each
6066/// index payload. v11 (= v6.7.2) catalogs load with
6067/// `included_columns = Vec::new()` for every index — same
6068/// "older readers, append-only extension" pattern as the v6.7.2
6069/// hot_tier_bytes byte.
6070/// v7.13.0 — bumped from 22 to 23. mailrs round-5 G3 / G10.
6071/// Per-table appendix gains two new sections:
6072///   * `checks: Vec<String>` — CHECK predicate sources (Display
6073///     form of the AST Expr); re-parsed on INSERT/UPDATE to
6074///     enforce against candidate rows. Same persistence pattern
6075///     as `Index::partial_predicate`.
6076///   * Per `UniquenessConstraint`: trailing `nulls_not_distinct:
6077///     u8` flag for PG 15+ `UNIQUE NULLS NOT DISTINCT (cols)`
6078///     semantics.
6079/// v22 catalogs deserialise with empty `checks` and every UC
6080/// at `nulls_not_distinct = false`.
6081/// v24 introduces:
6082///   * Index kind tag 4 = trigram-GIN (`gin_trgm_ops`-flavoured
6083///     `USING gin` over a TEXT/VARCHAR column). Payload shape is
6084///     identical to tag-3 GIN (String → Vec<RowLocator>); the
6085///     keys are PG-compatible 3-byte trigram shingles instead of
6086///     tsvector lexemes. v23 catalogs deserialise unchanged — no
6087///     v23 writer ever emitted tag 4.
6088/// v25 introduces:
6089///   * Per `TriggerDef`: trailing `enabled: u8` flag (mailrs
6090///     round-9 A.2.b — `ALTER TABLE … { ENABLE | DISABLE }
6091///     TRIGGER …`). v24 catalogs deserialise with every trigger
6092///     `enabled = true`, matching pre-v7.16.1 behaviour.
6093/// v26 introduces (v7.17.0 Phase 1.1):
6094///   * Trailing SEQUENCE catalog block after triggers. Encoded
6095///     as `u32 count` followed by per-sequence:
6096///     `name`, `data_type: u8` (0=SmallInt,1=Int,2=BigInt),
6097///     `start i64`, `increment i64`, `min_value i64`,
6098///     `max_value i64`, `cache i64`, `cycle u8`,
6099///     `owned_by_tag u8` (0=NONE, 1=Column → `table`,`column`),
6100///     `last_value i64`, `is_called u8`. v25-and-below catalogs
6101///     deserialise with an empty sequences map.
6102/// v27 introduces (v7.17.0 Phase 1.2):
6103///   * Trailing VIEW catalog block after sequences. Encoded as
6104///     `u32 count` followed by per-view:
6105///     `name`, `column_count u16`, then column names, then
6106///     `body` long-string. v26-and-below catalogs deserialise
6107///     with an empty views map.
6108/// v28 introduces (v7.17.0 Phase 1.3):
6109///   * Trailing MATERIALIZED VIEW source registry block after
6110///     views. Encoded as `u32 count` followed by per-entry:
6111///     `name`, `body` long-string. The materialised rows live
6112///     as a regular Table of the same name (already covered by
6113///     the pre-existing tables block). v27-and-below catalogs
6114///     deserialise with an empty map.
6115/// v29 introduces (v7.17.0 Phase 1.4):
6116///   * Per-table user_enum_type appendix (after the CHECK
6117///     appendix). Layout: `u16 count` followed by per-binding
6118///     `[u16 col_pos][str enum_name]`. Only columns whose
6119///     `user_enum_type` is Some land here; the catalog stays
6120///     compact for the common no-enum case.
6121///   * Trailing ENUM types catalog block after materialized
6122///     views. Encoded as `u32 count` followed by per-entry:
6123///     `name`, `u16 label_count`, then `label_count` short
6124///     strings. v28-and-below catalogs deserialise with an
6125///     empty enum_types map and every column's
6126///     `user_enum_type = None`.
6127/// v30 introduces (v7.17.0 Phase 1.5):
6128///   * Per-table user_domain_type appendix (after the
6129///     user_enum_type appendix). Same shape as the enum one.
6130///   * Trailing DOMAIN types catalog block after the enum
6131///     block. Encoded as `u32 count` followed by per-entry:
6132///     `name`, `data_type` byte, `nullable u8`,
6133///     `default_present u8` + optional default string,
6134///     `u16 check_count` then `check_count` Display-form
6135///     CHECK strings. v29-and-below catalogs deserialise with
6136///     an empty domain_types map and `user_domain_type = None`.
6137/// v31 introduces (v7.17.0 Phase 1.6):
6138///   * Trailing user-schemas block after the DOMAIN block.
6139///     Encoded as `u32 count` followed by `count` schema-name
6140///     short strings. Built-in schemas (`public`, `pg_catalog`,
6141///     `information_schema`) are NOT serialised — they're
6142///     hardcoded in `is_builtin_schema`. v30-and-below catalogs
6143///     deserialise with an empty user-schemas set.
6144/// v32 introduces (v7.17.0 Phase 2.1):
6145///   * Per-table on_update_runtime appendix (after the
6146///     user_domain_type appendix). Layout: `u16 count` followed
6147///     by per-binding `[u16 col_pos][str expr_src]`. Only
6148///     columns whose `on_update_runtime` is Some land here;
6149///     the catalog stays compact when no MySQL-shaped table
6150///     uses the attribute. v31-and-below catalogs deserialise
6151///     with every column's `on_update_runtime = None`.
6152/// v33 introduces (v7.17.0 Phase 2.2):
6153///   * Index kind tag 5 = fulltext-GIN (MySQL `FULLTEXT KEY`
6154///     surface over a TEXT / VARCHAR column). Payload shape is
6155///     identical to tag-3 / tag-4 GIN (`String → Vec<RowLocator>`);
6156///     the keys are lower-cased word lexemes (same rule as
6157///     `to_tsvector('simple', text)`). v32 catalogs deserialise
6158///     unchanged — no v32 writer ever emitted tag 5, and FULLTEXT
6159///     KEY was silently dropped pre-v7.17 so no rebuild shim is
6160///     needed for round-tripped catalogs.
6161/// v34 introduces (v7.17.0 Phase 2.5):
6162///   * Per-table collation appendix (after the on_update_runtime
6163///     appendix). Sparse layout: only columns whose `collation`
6164///     is non-Binary land here. `u16 count` then per-binding
6165///     `[u16 col_pos][u8 collation_tag]` where the tag matches
///     `Collation::TAG_*`. Snapshots written by v33-and-below
///     writers deserialise every column with `collation =
///     Binary`, preserving the prior byte-wise compare
///     semantics. Unknown tags read back as Binary too — keeps
6170///     a forward-compat path if a future v35 adds variants
6171///     and someone rolls back to a v34 reader.
6172/// v35 introduces (v7.17.0 Phase 4.4):
6173///   * Per-table is_unsigned appendix (after the collation
6174///     appendix). Sparse layout: only `is_unsigned = true`
6175///     columns land. `u16 count` then per-binding `[u16 col_pos]`.
6176///     v34-and-below catalogs deserialise every column as
6177///     `is_unsigned = false`, preserving the prior silent-
6178///     accept behaviour for negative inserts on UNSIGNED columns.
6179/// v46 introduces (v7.23, mailrs round-14):
6180///   * Escaped short-string codec — `write_str` lengths >= 0xFFFF
6181///     emit `[u16 0xFFFF][u32 real_len]` so TEXT cells (mail bodies,
6182///     document text) above 64 KiB encode instead of panicking.
6183///     One-way upgrade: v45-and-below readers reject v46 catalogs
6184///     loudly via the version gate; v46 readers decode v45 catalogs
6185///     with the plain-u16 rules (0xFFFF is a legitimate length
6186///     there).
6187/// v47 introduces (v7.27, mailrs round-21):
6188///   * Escaped lengths for the REMAINING u16-length cell payloads —
6189///     BYTEA cells, TEXT[] elements, tsvector lexemes and tsquery
6190///     terms — the same `[u16 0xFFFF][u32 real_len]` escape v46
6191///     gave short strings. Round-14 fixed TEXT and missed these;
6192///     round-21 fired the BYTEA twin during a production migration.
6193///     One-way upgrade, same posture as v46.
6194const FILE_VERSION: u8 = 47;
6195/// Oldest format version [`Catalog::deserialize`] still accepts. v8 is the
6196/// v3.0.2 dense-row layout; pre-v8 catalogs require an offline migration.
6197const MIN_SUPPORTED_FILE_VERSION: u8 = 8;
6198
// IndexKey wire format (introduced with v9; one tag byte, then the body):
//   tag 0 = Int  → [i64 LE]
//   tag 1 = Text → [u16 LE len + UTF-8 bytes] (via write_str / read_str)
//   tag 2 = Bool → [u8 0/1]
//   tag 3 = Uuid → [16 raw bytes, RFC 4122 byte order] (FILE_VERSION 36+)

/// Wire tag for `IndexKey::Int`.
const INDEX_KEY_TAG_INT: u8 = 0;
/// Wire tag for `IndexKey::Text`.
const INDEX_KEY_TAG_TEXT: u8 = 1;
/// Wire tag for `IndexKey::Bool`.
const INDEX_KEY_TAG_BOOL: u8 = 2;
/// v7.17.0 — wire tag for `IndexKey::Uuid([u8; 16])`. The body is
/// the raw 16 bytes in RFC 4122 byte order. Persisted only in
/// FILE_VERSION 36+ catalogs.
const INDEX_KEY_TAG_UUID: u8 = 3;
6210
6211impl Catalog {
6212    /// Serialize the whole catalog (schema + every row) into a self-contained
6213    /// byte buffer. Format is documented above the impl block.
        ///
        /// Output order, as written below: `FILE_MAGIC`, one `FILE_VERSION`
        /// byte, a `u32` table count, then one record per table (schema
        /// columns, rows, index definitions, and the per-table appendices),
        /// followed by the catalog-wide blocks: functions, triggers,
        /// sequences, views, materialized views, enum types, domain types,
        /// and user schemas. Each catalog-wide block starts with a `u32`
        /// element count.
        ///
        /// The writer always emits the current `FILE_VERSION` layout; the
        /// "vN readers" notes in the body describe how `deserialize` gates
        /// the matching reads for older files.
        ///
        /// # Panics
        ///
        /// Panics if any count does not fit its on-disk width: every
        /// `u32::try_from` / `u16::try_from` below is unwrapped with
        /// `expect` (for example, more than `u16::MAX` columns in one table).
6214    pub fn serialize(&self) -> Vec<u8> {
6215        let mut out = Vec::with_capacity(64);
            // Header: magic bytes, one version byte, then the u32 table count.
6216        out.extend_from_slice(FILE_MAGIC);
6217        out.push(FILE_VERSION);
6218        write_u32(
6219            &mut out,
6220            u32::try_from(self.tables.len()).expect("≤ 4G tables"),
6221        );
6222        for t in &self.tables {
                // Table record starts with [str name][u16 column_count].
6223            write_str(&mut out, &t.schema.name);
6224            write_u16(
6225                &mut out,
6226                u16::try_from(t.schema.columns.len()).expect("≤ 65k columns/table"),
6227            );
                // Per column: [str name][data type][u8 nullable]
                //   [u8 has_default][value (if has_default)][u8 auto_increment].
6228            for c in &t.schema.columns {
6229                write_str(&mut out, &c.name);
6230                write_data_type(&mut out, c.ty);
6231                out.push(u8::from(c.nullable));
6232                match &c.default {
6233                    None => out.push(0),
6234                    Some(v) => {
6235                        out.push(1);
6236                        write_value(&mut out, v);
6237                    }
6238                }
6239                out.push(u8::from(c.auto_increment));
6240            }
                // Row block: [u32 row_count], then one dense body per row.
6241            write_u32(
6242                &mut out,
6243                u32::try_from(t.rows.len()).expect("≤ 4G rows/table"),
6244            );
6245            // v3.0.2 dense row encoding (FILE_VERSION 8): per-row NULL
6246            // bitmap, then tightly-packed bodies. Identical wire format
6247            // as before — extracted into `encode_row_body_dense` so cold-
6248            // tier segments (v5.1+) can share the encoding.
6249            for row in &t.rows {
6250                out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
6251            }
6252            // Index definitions: [u16 count], then per-index payload:
6253            //   [name][col_pos u16][kind u8][kind-specific body]
6254            //     kind 0 = B-tree           (u32 entry count + per-key locator lists)
6255            //     kind 1 = NSW graph        (u16 M + serialized graph)
                //     kind 2 = BRIN             (column data type only)
                //     kind 3 = GIN              (u32 count + per-word locator lists)
                //     kind 4 = trigram GIN      (same shape as kind 3)
                //     kind 5 = fulltext GIN     (same shape as kind 3)
                //   followed by the per-index appendices written after the
                //   `match` below (INCLUDE columns, partial predicate,
                //   expression, is_unique, extra column positions).
6256            // For NSW the graph topology travels on disk so startup
6257            // doesn't re-run the O(n²M) rebuild — see v2.7 notes.
6258            write_u16(
6259                &mut out,
6260                u16::try_from(t.indices.len()).expect("≤ 65k indices/table"),
6261            );
6262            for idx in &t.indices {
6263                write_str(&mut out, &idx.name);
6264                write_u16(
6265                    &mut out,
6266                    u16::try_from(idx.column_position).expect("≤ 65k columns/table"),
6267                );
6268                match &idx.kind {
6269                    IndexKind::BTree(map) => {
6270                        out.push(0);
6271                        // v9: serialise the full PB map. Each entry's
6272                        // RowLocator list travels with the tag-prefixed
6273                        // codec from `row_locator::write_le`, so freezer-
6274                        // produced Cold locators survive a snapshot
6275                        // round-trip. v8 BTree wrote nothing here and
6276                        // rebuilt from rows — v9 readers tolerate v8 by
6277                        // version dispatch in `Catalog::deserialize`.
6278                        write_u32(
6279                            &mut out,
6280                            u32::try_from(map.len()).expect("≤ 4G index entries/index"),
6281                        );
6282                        for (key, locators) in map {
6283                            write_index_key(&mut out, key);
6284                            write_u32(
6285                                &mut out,
6286                                u32::try_from(locators.len()).expect("≤ 4G locators/key"),
6287                            );
6288                            for loc in locators {
6289                                loc.write_le(&mut out);
6290                            }
6291                        }
6292                    }
6293                    IndexKind::Nsw(g) => {
6294                        out.push(1);
6295                        write_u16(&mut out, u16::try_from(g.m).expect("≤ 65k NSW neighbours"));
6296                        write_nsw_graph(&mut out, g);
6297                    }
6298                    IndexKind::Brin { column_type } => {
6299                        // v6.7.1 — tag byte 2 = BRIN. Payload is the
6300                        // column type code (1 byte mapping to the
6301                        // shared DataType numeric encoding); no
6302                        // further data — BRIN summaries live in
6303                        // cold segments, not the catalog.
6304                        out.push(2);
6305                        write_data_type(&mut out, *column_type);
6306                    }
6307                    IndexKind::Gin(map) => {
6308                        // v7.12.3 — tag byte 3 = GIN. Payload mirrors
6309                        // the BTree encoding but with String (lexeme
6310                        // word) keys instead of IndexKey. Tag-prefixed
6311                        // RowLocator codec so freezer-produced Cold
6312                        // locators survive snapshot round-trip.
6313                        // FILE_VERSION 21+; v20 catalogs never wrote a
6314                        // GIN index (the AM degraded to BTree fallback
6315                        // pre-v7.12.3), so no migration shim is needed.
6316                        out.push(3);
6317                        write_u32(
6318                            &mut out,
6319                            u32::try_from(map.len()).expect("≤ 4G GIN posting lists"),
6320                        );
6321                        for (word, locators) in map {
6322                            write_str(&mut out, word);
6323                            write_u32(
6324                                &mut out,
6325                                u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
6326                            );
6327                            for loc in locators {
6328                                loc.write_le(&mut out);
6329                            }
6330                        }
6331                    }
6332                    IndexKind::GinTrgm(map) => {
6333                        // v7.15.0 — tag byte 4 = GinTrgm
6334                        // (`gin_trgm_ops` GIN over a TEXT column).
6335                        // Payload shape is identical to tag-3 GIN —
6336                        // `String → Vec<RowLocator>` posting lists.
6337                        // The String keys are 3-byte trigrams instead
6338                        // of tsvector lexemes; the deserializer
6339                        // dispatches on the tag, not the key shape.
6340                        // FILE_VERSION 24+; v23 catalogs never wrote
6341                        // a trigram-GIN.
6342                        out.push(4);
6343                        write_u32(
6344                            &mut out,
6345                            u32::try_from(map.len()).expect("≤ 4G trigram-GIN posting lists"),
6346                        );
6347                        for (tri, locators) in map {
6348                            write_str(&mut out, tri);
6349                            write_u32(
6350                                &mut out,
6351                                u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
6352                            );
6353                            for loc in locators {
6354                                loc.write_le(&mut out);
6355                            }
6356                        }
6357                    }
6358                    IndexKind::GinFulltext(map) => {
6359                        // v7.17.0 Phase 2.2 — tag byte 5 =
6360                        // GinFulltext (MySQL `FULLTEXT KEY` GIN
6361                        // over a TEXT/VARCHAR column). Payload
6362                        // shape mirrors tag-3 / tag-4 GIN —
6363                        // `String → Vec<RowLocator>` posting
6364                        // lists keyed by lower-cased word
6365                        // lexemes. FILE_VERSION 33+; v32 catalogs
6366                        // never wrote a fulltext-GIN (FULLTEXT
6367                        // KEY was silently dropped pre-v7.17).
6368                        out.push(5);
6369                        write_u32(
6370                            &mut out,
6371                            u32::try_from(map.len()).expect("≤ 4G fulltext-GIN posting lists"),
6372                        );
6373                        for (lex, locators) in map {
6374                            write_str(&mut out, lex);
6375                            write_u32(
6376                                &mut out,
6377                                u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
6378                            );
6379                            for loc in locators {
6380                                loc.write_le(&mut out);
6381                            }
6382                        }
6383                    }
6384                }
6385                // v6.8.0 — included_columns appendix per index.
6386                // Layout: [u16 num_included][num × u16 column_position].
6387                // v11 readers stop before this u16 (deserialise loop
6388                // gated on version >= 12); v12+ readers always
6389                // consume it. Empty Vec serialises as a bare 0u16.
6390                write_u16(
6391                    &mut out,
6392                    u16::try_from(idx.included_columns.len()).expect("≤ 65k INCLUDE columns/index"),
6393                );
6394                for col_pos in &idx.included_columns {
6395                    write_u16(
6396                        &mut out,
6397                        u16::try_from(*col_pos).expect("≤ 65k columns/table"),
6398                    );
6399                }
6400                // v6.8.1 — partial_predicate appendix per index.
6401                // Layout: [u8 has_pred][u16 LE len][bytes (if has_pred)].
6402                // Same v12 gate as included_columns.
6403                match &idx.partial_predicate {
6404                    None => out.push(0),
6405                    Some(pred) => {
6406                        out.push(1);
6407                        write_str(&mut out, pred);
6408                    }
6409                }
6410                // v6.8.2 — expression appendix. Same shape as
6411                // partial_predicate.
6412                match &idx.expression {
6413                    None => out.push(0),
6414                    Some(expr) => {
6415                        out.push(1);
6416                        write_str(&mut out, expr);
6417                    }
6418                }
6419                // v7.9.29 — is_unique appendix (FILE_VERSION 16+).
6420                // Single byte 0/1. v15-and-below readers stop before
6421                // this byte; v16 readers always consume it. mailrs K1.
6422                out.push(u8::from(idx.is_unique));
6423                // v7.9.29 — extra_column_positions appendix.
6424                // Layout: [u16 count][count × u16 column_position].
6425                write_u16(
6426                    &mut out,
6427                    u16::try_from(idx.extra_column_positions.len())
6428                        .expect("≤ 65k extra cols / index"),
6429                );
6430                for cp in &idx.extra_column_positions {
6431                    write_u16(&mut out, u16::try_from(*cp).expect("≤ 65k columns/table"));
6432                }
6433            }
6434            // v6.7.2 — per-table hot_tier_bytes Option<u64>.
6435            // Layout: [u8 has_value][u64 LE value (if has_value)].
6436            // v10 readers stop before this byte (deserialise loop
6437            // gated on version >= 11); v11+ readers always
6438            // consume it.
6439            match t.schema.hot_tier_bytes {
6440                None => out.push(0),
6441                Some(n) => {
6442                    out.push(1);
6443                    out.extend_from_slice(&n.to_le_bytes());
6444                }
6445            }
6446            // v7.6.1 — FOREIGN KEY appendix (catalog FILE_VERSION 13+).
6447            // Layout: [u16 LE fk_count]
6448            //   per fk:
6449            //     [u8 has_name] [str name (if has_name)]
6450            //     [u16 LE local_arity] [u16 LE local_pos]*arity
6451            //     [str parent_table]
6452            //     [u16 LE parent_arity] [u16 LE parent_pos]*arity
6453            //     [u8 on_delete_tag] [u8 on_update_tag]
6454            // Older catalogs (v12 and below) skip this block entirely;
6455            // their reader stops before this byte.
6456            write_u16(
6457                &mut out,
6458                u16::try_from(t.schema.foreign_keys.len()).expect("≤ 65k FKs/table"),
6459            );
6460            for fk in &t.schema.foreign_keys {
6461                match &fk.name {
6462                    None => out.push(0),
6463                    Some(n) => {
6464                        out.push(1);
6465                        write_str(&mut out, n);
6466                    }
6467                }
6468                write_u16(
6469                    &mut out,
6470                    u16::try_from(fk.local_columns.len()).expect("≤ 65k FK columns"),
6471                );
6472                for &p in &fk.local_columns {
6473                    write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
6474                }
6475                write_str(&mut out, &fk.parent_table);
6476                write_u16(
6477                    &mut out,
6478                    u16::try_from(fk.parent_columns.len()).expect("≤ 65k FK parent columns"),
6479                );
6480                for &p in &fk.parent_columns {
6481                    write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
6482                }
6483                out.push(fk.on_delete.tag());
6484                out.push(fk.on_update.tag());
6485            }
6486            // v7.9.19 — UniquenessConstraint appendix (catalog
6487            // FILE_VERSION 15+). Layout per table after the FK
6488            // block:
6489            //   [u16 count]
6490            //     per constraint:
6491            //       [u8 is_primary_key]
6492            //       [u16 arity][u16 col_pos]*arity
                //       [u8 nulls_not_distinct]   (FILE_VERSION 23+, see below)
6493            // Older catalogs (v14 and below) skip this block.
6494            write_u16(
6495                &mut out,
6496                u16::try_from(t.schema.uniqueness_constraints.len())
6497                    .expect("≤ 65k uniqueness constraints/table"),
6498            );
6499            for uc in &t.schema.uniqueness_constraints {
6500                out.push(u8::from(uc.is_primary_key));
6501                write_u16(
6502                    &mut out,
6503                    u16::try_from(uc.columns.len()).expect("≤ 65k cols in uniqueness constraint"),
6504                );
6505                for &p in &uc.columns {
6506                    write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
6507                }
6508                // v7.13.0 — `nulls_not_distinct` flag
6509                // (FILE_VERSION 23+). Always written by writers at
6510                // version 23+; deserialise gates on `version >= 23`
6511                // so v22-and-below catalogs round-trip cleanly.
6512                out.push(u8::from(uc.nulls_not_distinct));
6513            }
6514            // v7.9.21 — runtime_default appendix per table.
6515            // Layout: [u16 count] then for each:
6516            //   [u16 col_pos][str expr]
6517            // Only columns whose runtime_default is Some land here;
6518            // catalog stays compact for the common literal-default
6519            // case.
                // The bindings are collected first because the count has to
                // be written before the entries.
6520            let mut rt_defaults: Vec<(usize, &str)> = Vec::new();
6521            for (i, c) in t.schema.columns.iter().enumerate() {
6522                if let Some(e) = &c.runtime_default {
6523                    rt_defaults.push((i, e.as_str()));
6524                }
6525            }
6526            write_u16(
6527                &mut out,
6528                u16::try_from(rt_defaults.len()).expect("≤ 65k runtime defaults/table"),
6529            );
6530            for (pos, expr) in rt_defaults {
6531                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6532                write_str(&mut out, expr);
6533            }
6534            // v7.13.0 — CHECK constraint appendix per table.
6535            // Layout: [u16 count] then `count` Display-form
6536            // expression strings. Re-parsed on every INSERT/UPDATE
6537            // by the engine. FILE_VERSION 23+ only; v22 readers
6538            // never reach this block because the writer also moves
6539            // to v23 in lock-step.
6540            write_u16(
6541                &mut out,
6542                u16::try_from(t.schema.checks.len()).expect("≤ 65k CHECK constraints/table"),
6543            );
6544            for c in &t.schema.checks {
6545                write_str(&mut out, c.as_str());
6546            }
6547            // v7.17.0 Phase 1.4 — per-table user_enum_type
6548            // appendix. Layout: [u16 count] then
6549            // [u16 col_pos][str enum_name] per binding. Only
6550            // columns whose user_enum_type is Some land here.
6551            let mut enum_bindings: Vec<(usize, &str)> = Vec::new();
6552            for (i, c) in t.schema.columns.iter().enumerate() {
6553                if let Some(e) = &c.user_enum_type {
6554                    enum_bindings.push((i, e.as_str()));
6555                }
6556            }
6557            write_u16(
6558                &mut out,
6559                u16::try_from(enum_bindings.len()).expect("≤ 65k enum-typed columns/table"),
6560            );
6561            for (pos, ename) in enum_bindings {
6562                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6563                write_str(&mut out, ename);
6564            }
6565            // v7.17.0 Phase 1.5 — per-table user_domain_type
6566            // appendix. Same layout as the enum one. v29-and-
6567            // below readers stop after the enum appendix.
6568            let mut domain_bindings: Vec<(usize, &str)> = Vec::new();
6569            for (i, c) in t.schema.columns.iter().enumerate() {
6570                if let Some(d) = &c.user_domain_type {
6571                    domain_bindings.push((i, d.as_str()));
6572                }
6573            }
6574            write_u16(
6575                &mut out,
6576                u16::try_from(domain_bindings.len()).expect("≤ 65k domain-typed columns/table"),
6577            );
6578            for (pos, dname) in domain_bindings {
6579                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6580                write_str(&mut out, dname);
6581            }
6582            // v7.17.0 Phase 2.1 — per-table on_update_runtime
6583            // appendix. Sparse: only ON UPDATE-bound columns.
                // Layout as written: [u16 count] then [u16 col_pos][str expr]
                // per binding.
6584            let mut on_update_bindings: Vec<(usize, &str)> = Vec::new();
6585            for (i, c) in t.schema.columns.iter().enumerate() {
6586                if let Some(e) = &c.on_update_runtime {
6587                    on_update_bindings.push((i, e.as_str()));
6588                }
6589            }
6590            write_u16(
6591                &mut out,
6592                u16::try_from(on_update_bindings.len()).expect("≤ 65k ON UPDATE columns/table"),
6593            );
6594            for (pos, expr_src) in on_update_bindings {
6595                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6596                write_str(&mut out, expr_src);
6597            }
6598            // v7.17.0 Phase 2.5 — per-table collation appendix.
6599            // Sparse: only non-Binary columns land. Layout:
6600            // `[u16 count][u16 col_pos][u8 tag] × count`.
6601            let mut coll_bindings: Vec<(usize, u8)> = Vec::new();
6602            for (i, c) in t.schema.columns.iter().enumerate() {
6603                let tag = match c.collation {
                        // Binary columns are skipped, so they never get an entry.
6604                    Collation::Binary => continue,
6605                    Collation::CaseInsensitive => Collation::TAG_CASE_INSENSITIVE,
6606                };
6607                coll_bindings.push((i, tag));
6608            }
6609            write_u16(
6610                &mut out,
6611                u16::try_from(coll_bindings.len()).expect("≤ 65k collation bindings/table"),
6612            );
6613            for (pos, tag) in coll_bindings {
6614                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6615                out.push(tag);
6616            }
6617            // v7.17.0 Phase 4.4 — per-table is_unsigned appendix.
6618            // Sparse: only UNSIGNED columns land. Layout:
6619            // `[u16 count][u16 col_pos] × count`.
6620            let mut unsigned_bindings: Vec<usize> = Vec::new();
6621            for (i, c) in t.schema.columns.iter().enumerate() {
6622                if c.is_unsigned {
6623                    unsigned_bindings.push(i);
6624                }
6625            }
6626            write_u16(
6627                &mut out,
6628                u16::try_from(unsigned_bindings.len()).expect("≤ 65k UNSIGNED columns/table"),
6629            );
6630            for pos in unsigned_bindings {
6631                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6632            }
6633            // v7.17.0 Phase 3.P0-36 — per-table inline_enum_variants
6634            // appendix. Sparse: only ENUM columns land. Layout:
6635            // `[u16 count] then per binding [u16 col_pos]
6636            // [u16 variant_count] then variant strings`.
6637            // FILE_VERSION 41+; v40 readers never reach this block.
6638            let mut enum_inline_bindings: Vec<(usize, &[String])> = Vec::new();
6639            for (i, c) in t.schema.columns.iter().enumerate() {
6640                if let Some(vs) = &c.inline_enum_variants {
6641                    enum_inline_bindings.push((i, vs.as_slice()));
6642                }
6643            }
6644            write_u16(
6645                &mut out,
6646                u16::try_from(enum_inline_bindings.len()).expect("≤ 65k inline-ENUM columns/table"),
6647            );
6648            for (pos, variants) in enum_inline_bindings {
6649                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6650                write_u16(
6651                    &mut out,
6652                    u16::try_from(variants.len()).expect("≤ 65k variants/ENUM"),
6653                );
6654                for v in variants {
6655                    write_str(&mut out, v.as_str());
6656                }
6657            }
6658            // v7.17.0 Phase 3.P0-37 — per-table inline_set_variants
6659            // appendix. Same layout as the inline ENUM block.
6660            // FILE_VERSION 42+; v41 readers never reach this block.
6661            let mut set_inline_bindings: Vec<(usize, &[String])> = Vec::new();
6662            for (i, c) in t.schema.columns.iter().enumerate() {
6663                if let Some(vs) = &c.inline_set_variants {
6664                    set_inline_bindings.push((i, vs.as_slice()));
6665                }
6666            }
6667            write_u16(
6668                &mut out,
6669                u16::try_from(set_inline_bindings.len()).expect("≤ 65k inline-SET columns/table"),
6670            );
6671            for (pos, variants) in set_inline_bindings {
6672                write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6673                write_u16(
6674                    &mut out,
6675                    u16::try_from(variants.len()).expect("≤ 65k variants/SET"),
6676                );
6677                for v in variants {
6678                    write_str(&mut out, v.as_str());
6679                }
6680            }
6681        }
6682        // v7.12.4 — catalog-wide appendix: user-defined functions
6683        // then triggers. FILE_VERSION 22+ only. v21 and earlier
6684        // readers stop after the last table; v22 readers always
6685        // consume two `u32` counts (possibly zero).
6686        //
6687        // Function entry layout:
6688        //   [str name] [str args_repr] [str returns]
6689        //   [str language] [body, written via `write_str_long`]
6690        // Trigger entry layout:
6691        //   [str name] [str table] [str timing]
6692        //   [u16 event_count] (event_count × str)
6693        //   [str for_each] [str function]
            //   [u16 update_col_count] (update_col_count × str)   (FILE_VERSION 23+)
            //   [u8 enabled]                                      (FILE_VERSION 25+)
6694        write_u32(
6695            &mut out,
6696            u32::try_from(self.functions.len()).expect("≤ 4G functions"),
6697        );
6698        for fd in self.functions.values() {
6699            write_str(&mut out, &fd.name);
6700            write_str(&mut out, &fd.args_repr);
6701            write_str(&mut out, &fd.returns);
6702            write_str(&mut out, &fd.language);
6703            write_str_long(&mut out, &fd.body);
6704        }
6705        write_u32(
6706            &mut out,
6707            u32::try_from(self.triggers.len()).expect("≤ 4G triggers"),
6708        );
6709        for td in &self.triggers {
6710            write_str(&mut out, &td.name);
6711            write_str(&mut out, &td.table);
6712            write_str(&mut out, &td.timing);
6713            write_u16(
6714                &mut out,
6715                u16::try_from(td.events.len()).expect("≤ 65k events / trigger"),
6716            );
6717            for ev in &td.events {
6718                write_str(&mut out, ev);
6719            }
6720            write_str(&mut out, &td.for_each);
6721            write_str(&mut out, &td.function);
6722            // v7.13.0 — `UPDATE OF cols` filter
6723            // (FILE_VERSION 23+). v22 readers omit; v23 writers
6724            // always emit (possibly zero).
6725            write_u16(
6726                &mut out,
6727                u16::try_from(td.update_columns.len()).expect("≤ 65k cols / trigger"),
6728            );
6729            for c in &td.update_columns {
6730                write_str(&mut out, c);
6731            }
6732            // v7.16.1 — TriggerDef.enabled (FILE_VERSION 25+).
6733            out.push(u8::from(td.enabled));
6734        }
6735        // v7.17.0 Phase 1.1 — SEQUENCE catalog block (FILE_VERSION 26+).
            // Layout: [u32 count], then per sequence:
            //   [str name] [u8 data_type: 0 = SmallInt, 1 = Int, 2 = BigInt]
            //   start, increment, min_value, max_value, cache (little-endian bytes)
            //   [u8 cycle]
            //   [u8 has_owner] [str table] [str column] (if has_owner)
            //   last_value (little-endian bytes) [u8 is_called]
6736        write_u32(
6737            &mut out,
6738            u32::try_from(self.sequences.len()).expect("≤ 4G sequences"),
6739        );
6740        for seq in self.sequences.values() {
6741            write_str(&mut out, &seq.name);
6742            out.push(match seq.data_type {
6743                SequenceDataType::SmallInt => 0,
6744                SequenceDataType::Int => 1,
6745                SequenceDataType::BigInt => 2,
6746            });
6747            out.extend_from_slice(&seq.start.to_le_bytes());
6748            out.extend_from_slice(&seq.increment.to_le_bytes());
6749            out.extend_from_slice(&seq.min_value.to_le_bytes());
6750            out.extend_from_slice(&seq.max_value.to_le_bytes());
6751            out.extend_from_slice(&seq.cache.to_le_bytes());
6752            out.push(u8::from(seq.cycle));
6753            match &seq.owned_by {
6754                None => out.push(0),
6755                Some((table, column)) => {
6756                    out.push(1);
6757                    write_str(&mut out, table);
6758                    write_str(&mut out, column);
6759                }
6760            }
6761            out.extend_from_slice(&seq.last_value.to_le_bytes());
6762            out.push(u8::from(seq.is_called));
6763        }
6764        // v7.17.0 Phase 1.2 — VIEW catalog block (FILE_VERSION 27+).
            // Layout: [u32 count], then per view:
            //   [str name] [u16 col_count] (col_count × str)
            //   [body, written via `write_str_long`]
6765        write_u32(
6766            &mut out,
6767            u32::try_from(self.views.len()).expect("≤ 4G views"),
6768        );
6769        for view in self.views.values() {
6770            write_str(&mut out, &view.name);
6771            write_u16(
6772                &mut out,
6773                u16::try_from(view.columns.len()).expect("≤ 65k cols / view"),
6774            );
6775            for c in &view.columns {
6776                write_str(&mut out, c);
6777            }
6778            write_str_long(&mut out, &view.body);
6779        }
6780        // v7.17.0 Phase 1.3 — MATERIALIZED VIEW source registry
6781        // (FILE_VERSION 28+). The backing rows live as a regular
6782        // table of the same name already in the tables block.
            // Layout: [u32 count], then per entry [str name] and the body
            // written via `write_str_long`.
6783        write_u32(
6784            &mut out,
6785            u32::try_from(self.materialized_views.len()).expect("≤ 4G materialized views"),
6786        );
6787        for (name, body) in &self.materialized_views {
6788            write_str(&mut out, name);
6789            write_str_long(&mut out, body);
6790        }
6791        // v7.17.0 Phase 1.4 — ENUM types catalog block
6792        // (FILE_VERSION 29+).
            // Layout: [u32 count], then per type
            //   [str name] [u16 label_count] (label_count × str).
6793        write_u32(
6794            &mut out,
6795            u32::try_from(self.enum_types.len()).expect("≤ 4G enum types"),
6796        );
6797        for e in self.enum_types.values() {
6798            write_str(&mut out, &e.name);
6799            write_u16(
6800                &mut out,
6801                u16::try_from(e.labels.len()).expect("≤ 65k labels / enum"),
6802            );
6803            for l in &e.labels {
6804                write_str(&mut out, l);
6805            }
6806        }
6807        // v7.17.0 Phase 1.5 — DOMAIN types catalog block
6808        // (FILE_VERSION 30+).
            // Layout: [u32 count], then per domain
            //   [str name] [base data type] [u8 nullable]
            //   [u8 has_default] [str default (if has_default)]
            //   [u16 check_count] (check_count × str).
6809        write_u32(
6810            &mut out,
6811            u32::try_from(self.domain_types.len()).expect("≤ 4G domain types"),
6812        );
6813        for d in self.domain_types.values() {
6814            write_str(&mut out, &d.name);
6815            write_data_type(&mut out, d.base_type);
6816            out.push(u8::from(d.nullable));
6817            match &d.default {
6818                None => out.push(0),
6819                Some(s) => {
6820                    out.push(1);
6821                    write_str(&mut out, s);
6822                }
6823            }
6824            write_u16(
6825                &mut out,
6826                u16::try_from(d.checks.len()).expect("≤ 65k CHECKs / domain"),
6827            );
6828            for c in &d.checks {
6829                write_str(&mut out, c);
6830            }
6831        }
6832        // v7.17.0 Phase 1.6 — user-schemas registry
6833        // (FILE_VERSION 31+). Built-ins are hardcoded in
6834        // `is_builtin_schema` and not persisted.
            // Layout: [u32 count] (count × str name). This is the last
            // block written.
6835        write_u32(
6836            &mut out,
6837            u32::try_from(self.schemas.len()).expect("≤ 4G schemas"),
6838        );
6839        for name in &self.schemas {
6840            write_str(&mut out, name);
6841        }
6842        out
6843    }
6844
6845    /// Deserialize a previously-serialized catalog. Rejects bad magic, version
6846    /// mismatch, unknown tags, truncation, and trailing bytes.
6847    pub fn deserialize(buf: &[u8]) -> Result<Self, StorageError> {
6848        let mut cur = Cursor::new(buf);
6849        let magic = cur.take(8)?;
6850        if magic != FILE_MAGIC {
6851            return Err(StorageError::Corrupt(format!(
6852                "bad magic: expected SPGDB001, got {magic:?}"
6853            )));
6854        }
6855        let version = cur.read_u8()?;
6856        if !(MIN_SUPPORTED_FILE_VERSION..=FILE_VERSION).contains(&version) {
6857            return Err(StorageError::Corrupt(format!(
6858                "unsupported file version: {version} (supported: {MIN_SUPPORTED_FILE_VERSION}..={FILE_VERSION})"
6859            )));
6860        }
6861        // v7.23/v7.27 — escape decoding is version-gated (see
6862        // STR_LEN_ESCAPE / Cursor::codec_version).
6863        cur.codec_version = version;
6864        let table_count = cur.read_u32()? as usize;
6865        let mut cat = Self::new();
6866        for _ in 0..table_count {
6867            deserialize_table(&mut cur, &mut cat, version)?;
6868        }
6869        // v7.12.4 — catalog-wide function + trigger appendix.
6870        // FILE_VERSION 22+ only; v21 and earlier catalogs stop
6871        // after the last table.
6872        if version >= 22 {
6873            let fn_count = cur.read_u32()? as usize;
6874            for _ in 0..fn_count {
6875                let name = cur.read_str()?;
6876                let args_repr = cur.read_str()?;
6877                let returns = cur.read_str()?;
6878                let language = cur.read_str()?;
6879                let body = cur.read_str_long()?;
6880                cat.functions.insert(
6881                    name.clone(),
6882                    FunctionDef {
6883                        name,
6884                        args_repr,
6885                        returns,
6886                        language,
6887                        body,
6888                    },
6889                );
6890            }
6891            let trg_count = cur.read_u32()? as usize;
6892            for _ in 0..trg_count {
6893                let name = cur.read_str()?;
6894                let table = cur.read_str()?;
6895                let timing = cur.read_str()?;
6896                let ev_count = cur.read_u16()? as usize;
6897                let mut events = Vec::with_capacity(ev_count);
6898                for _ in 0..ev_count {
6899                    events.push(cur.read_str()?);
6900                }
6901                let for_each = cur.read_str()?;
6902                let function = cur.read_str()?;
6903                // v7.13.0 — trailing `UPDATE OF cols` filter
6904                // (FILE_VERSION 23+ only; v22 catalogs omit and
6905                // deserialise with an empty vec).
6906                let update_columns = if version >= 23 {
6907                    let n = cur.read_u16()? as usize;
6908                    let mut cols = Vec::with_capacity(n);
6909                    for _ in 0..n {
6910                        cols.push(cur.read_str()?);
6911                    }
6912                    cols
6913                } else {
6914                    Vec::new()
6915                };
6916                // v7.16.1 — TriggerDef.enabled (FILE_VERSION 25+).
6917                // v24-and-below catalogs deserialise with `true`
6918                // — pre-v7.16.1 every trigger always fired.
6919                let enabled = if version >= 25 {
6920                    cur.read_u8()? != 0
6921                } else {
6922                    true
6923                };
6924                cat.triggers.push(TriggerDef {
6925                    name,
6926                    table,
6927                    timing,
6928                    events,
6929                    for_each,
6930                    function,
6931                    update_columns,
6932                    enabled,
6933                });
6934            }
6935        }
6936        // v7.17.0 Phase 1.1 — SEQUENCE block (FILE_VERSION 26+).
6937        // v25-and-below catalogs omit; we leave the map empty.
6938        if version >= 26 {
6939            let seq_count = cur.read_u32()? as usize;
6940            for _ in 0..seq_count {
6941                let name = cur.read_str()?;
6942                let data_type = match cur.read_u8()? {
6943                    0 => SequenceDataType::SmallInt,
6944                    1 => SequenceDataType::Int,
6945                    2 => SequenceDataType::BigInt,
6946                    other => {
6947                        return Err(StorageError::Corrupt(format!(
6948                            "unknown SEQUENCE data-type tag {other}"
6949                        )));
6950                    }
6951                };
6952                let start = cur.read_i64()?;
6953                let increment = cur.read_i64()?;
6954                let min_value = cur.read_i64()?;
6955                let max_value = cur.read_i64()?;
6956                let cache = cur.read_i64()?;
6957                let cycle = cur.read_u8()? != 0;
6958                let owned_by = match cur.read_u8()? {
6959                    0 => None,
6960                    1 => {
6961                        let t = cur.read_str()?;
6962                        let c = cur.read_str()?;
6963                        Some((t, c))
6964                    }
6965                    other => {
6966                        return Err(StorageError::Corrupt(format!(
6967                            "unknown SEQUENCE owned-by tag {other}"
6968                        )));
6969                    }
6970                };
6971                let last_value = cur.read_i64()?;
6972                let is_called = cur.read_u8()? != 0;
6973                cat.sequences.insert(
6974                    name.clone(),
6975                    SequenceDef {
6976                        name,
6977                        data_type,
6978                        start,
6979                        increment,
6980                        min_value,
6981                        max_value,
6982                        cache,
6983                        cycle,
6984                        owned_by,
6985                        last_value,
6986                        is_called,
6987                    },
6988                );
6989            }
6990        }
6991        // v7.17.0 Phase 1.2 — VIEW block (FILE_VERSION 27+).
6992        // v26-and-below catalogs omit; we leave the map empty.
6993        if version >= 27 {
6994            let view_count = cur.read_u32()? as usize;
6995            for _ in 0..view_count {
6996                let name = cur.read_str()?;
6997                let col_count = cur.read_u16()? as usize;
6998                let mut columns = Vec::with_capacity(col_count);
6999                for _ in 0..col_count {
7000                    columns.push(cur.read_str()?);
7001                }
7002                let body = cur.read_str_long()?;
7003                cat.views.insert(
7004                    name.clone(),
7005                    ViewDef {
7006                        name,
7007                        columns,
7008                        body,
7009                    },
7010                );
7011            }
7012        }
7013        // v7.17.0 Phase 1.3 — MATERIALIZED VIEW source registry
7014        // (FILE_VERSION 28+). v27-and-below catalogs omit.
7015        if version >= 28 {
7016            let mv_count = cur.read_u32()? as usize;
7017            for _ in 0..mv_count {
7018                let name = cur.read_str()?;
7019                let body = cur.read_str_long()?;
7020                cat.materialized_views.insert(name, body);
7021            }
7022        }
7023        // v7.17.0 Phase 1.4 — ENUM types catalog block
7024        // (FILE_VERSION 29+).
7025        if version >= 29 {
7026            let etype_count = cur.read_u32()? as usize;
7027            for _ in 0..etype_count {
7028                let name = cur.read_str()?;
7029                let label_count = cur.read_u16()? as usize;
7030                let mut labels = Vec::with_capacity(label_count);
7031                for _ in 0..label_count {
7032                    labels.push(cur.read_str()?);
7033                }
7034                cat.enum_types
7035                    .insert(name.clone(), EnumDef { name, labels });
7036            }
7037        }
7038        // v7.17.0 Phase 1.5 — DOMAIN types catalog block
7039        // (FILE_VERSION 30+).
7040        if version >= 30 {
7041            let dtype_count = cur.read_u32()? as usize;
7042            for _ in 0..dtype_count {
7043                let name = cur.read_str()?;
7044                let base_type = cur.read_data_type()?;
7045                let nullable = cur.read_u8()? != 0;
7046                let default = match cur.read_u8()? {
7047                    0 => None,
7048                    1 => Some(cur.read_str()?),
7049                    other => {
7050                        return Err(StorageError::Corrupt(format!(
7051                            "unknown DOMAIN default tag {other}"
7052                        )));
7053                    }
7054                };
7055                let check_count = cur.read_u16()? as usize;
7056                let mut checks = Vec::with_capacity(check_count);
7057                for _ in 0..check_count {
7058                    checks.push(cur.read_str()?);
7059                }
7060                cat.domain_types.insert(
7061                    name.clone(),
7062                    DomainDef {
7063                        name,
7064                        base_type,
7065                        nullable,
7066                        default,
7067                        checks,
7068                    },
7069                );
7070            }
7071        }
7072        // v7.17.0 Phase 1.6 — user-schemas registry
7073        // (FILE_VERSION 31+).
7074        if version >= 31 {
7075            let sch_count = cur.read_u32()? as usize;
7076            for _ in 0..sch_count {
7077                let name = cur.read_str()?;
7078                cat.schemas.insert(name);
7079            }
7080        }
7081        if cur.pos < buf.len() {
7082            return Err(StorageError::Corrupt(format!(
7083                "trailing bytes: {} unread",
7084                buf.len() - cur.pos
7085            )));
7086        }
7087        Ok(cat)
7088    }
7089}
7090
7091/// Per-table deserialize body — schema, rows, indices. Pulled out of
7092/// `Catalog::deserialize` to keep the latter under the line-budget lint
7093/// and to give the row hot loop its own scope (so the borrow on `t`
7094/// stays scoped here rather than across the whole catalog loop).
7095fn deserialize_table(
7096    cur: &mut Cursor<'_>,
7097    cat: &mut Catalog,
7098    version: u8,
7099) -> Result<(), StorageError> {
7100    let table_name = cur.read_str()?;
7101    let name = table_name.clone();
7102    let col_count = cur.read_u16()? as usize;
7103    let mut cols = Vec::with_capacity(col_count);
7104    for _ in 0..col_count {
7105        let c_name = cur.read_str()?;
7106        let ty = cur.read_data_type()?;
7107        let nullable = cur.read_u8()? != 0;
7108        let default = match cur.read_u8()? {
7109            0 => None,
7110            1 => Some(cur.read_value()?),
7111            other => {
7112                return Err(StorageError::Corrupt(format!(
7113                    "unknown default tag: {other}"
7114                )));
7115            }
7116        };
7117        let auto_increment = cur.read_u8()? != 0;
7118        // Note: deserialiser sets runtime_default = None for
7119        // older catalogs (≤ v14). v15+ reads it from the
7120        // per-column appendix below.
7121        cols.push(ColumnSchema {
7122            name: c_name,
7123            ty,
7124            nullable,
7125            default,
7126            runtime_default: None,
7127            auto_increment,
7128            user_enum_type: None,
7129            user_domain_type: None,
7130            on_update_runtime: None,
7131            collation: Collation::Binary,
7132            is_unsigned: false,
7133            inline_enum_variants: None,
7134            inline_set_variants: None,
7135        });
7136    }
7137    let n_cols = cols.len();
7138    cat.create_table(TableSchema::new(name, cols))?;
7139    // Vec<Table> with insertion-order semantics — the just-pushed
7140    // table is at the end. Sidecar `by_name` is already wired up but
7141    // we skip the map lookup here since we know the position.
7142    let t = cat.tables.last_mut().expect("create_table just pushed");
7143    deserialize_rows(cur, t, n_cols)?;
7144    deserialize_indices(cur, t, version)?;
7145    // v6.7.2 — per-table hot_tier_bytes appendix. v11+ writes
7146    // `[u8 has_value][u64 LE value (if has_value)]`. v10 / v9 / v8
7147    // catalogs skip this entirely (the deserialiser reads no extra
7148    // bytes; the table's hot_tier_bytes stays None from
7149    // TableSchema::new).
7150    if version >= 11 {
7151        let has = cur.read_u8()?;
7152        let hot_tier_bytes = match has {
7153            0 => None,
7154            1 => Some(cur.read_u64()?),
7155            other => {
7156                return Err(StorageError::Corrupt(format!(
7157                    "hot_tier_bytes appendix: unknown has-value byte {other}"
7158                )));
7159            }
7160        };
7161        t.schema_mut().hot_tier_bytes = hot_tier_bytes;
7162    }
7163    // v7.6.1 — FOREIGN KEY appendix (FILE_VERSION 13+). v12 / v11 / …
7164    // catalogs skip this entirely.
7165    if version >= 13 {
7166        let fk_count = cur.read_u16()? as usize;
7167        let mut fks = Vec::with_capacity(fk_count);
7168        for _ in 0..fk_count {
7169            let name = match cur.read_u8()? {
7170                0 => None,
7171                1 => Some(cur.read_str()?),
7172                other => {
7173                    return Err(StorageError::Corrupt(format!(
7174                        "FK appendix: unknown has-name byte {other}"
7175                    )));
7176                }
7177            };
7178            let local_arity = cur.read_u16()? as usize;
7179            let mut local_columns = Vec::with_capacity(local_arity);
7180            for _ in 0..local_arity {
7181                local_columns.push(cur.read_u16()? as usize);
7182            }
7183            let parent_table = cur.read_str()?;
7184            let parent_arity = cur.read_u16()? as usize;
7185            if parent_arity != local_arity {
7186                return Err(StorageError::Corrupt(format!(
7187                    "FK arity mismatch in catalog: local {local_arity} vs parent {parent_arity}"
7188                )));
7189            }
7190            let mut parent_columns = Vec::with_capacity(parent_arity);
7191            for _ in 0..parent_arity {
7192                parent_columns.push(cur.read_u16()? as usize);
7193            }
7194            let on_delete = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
7195                StorageError::Corrupt("FK appendix: unknown on_delete tag".into())
7196            })?;
7197            let on_update = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
7198                StorageError::Corrupt("FK appendix: unknown on_update tag".into())
7199            })?;
7200            fks.push(ForeignKeyConstraint {
7201                name,
7202                local_columns,
7203                parent_table,
7204                parent_columns,
7205                on_delete,
7206                on_update,
7207            });
7208        }
7209        t.schema_mut().foreign_keys = fks;
7210    }
7211    // v7.9.19 — UniquenessConstraint appendix (FILE_VERSION 15+).
7212    // v14 and below skip this entirely.
7213    if version >= 15 {
7214        let uc_count = cur.read_u16()? as usize;
7215        let mut ucs = Vec::with_capacity(uc_count);
7216        for _ in 0..uc_count {
7217            let is_pk = cur.read_u8()? != 0;
7218            let arity = cur.read_u16()? as usize;
7219            let mut cols = Vec::with_capacity(arity);
7220            for _ in 0..arity {
7221                cols.push(cur.read_u16()? as usize);
7222            }
7223            // v7.13.0 — trailing `nulls_not_distinct` flag
7224            // (FILE_VERSION 23+). v22 and below skip — flag
7225            // defaults to false (= NULLS DISTINCT).
7226            let nulls_not_distinct = if version >= 23 {
7227                cur.read_u8()? != 0
7228            } else {
7229                false
7230            };
7231            ucs.push(UniquenessConstraint {
7232                is_primary_key: is_pk,
7233                columns: cols,
7234                nulls_not_distinct,
7235            });
7236        }
7237        t.schema_mut().uniqueness_constraints = ucs;
7238        // v7.9.21 — runtime_default appendix (FILE_VERSION 15+).
7239        let rt_count = cur.read_u16()? as usize;
7240        for _ in 0..rt_count {
7241            let pos = cur.read_u16()? as usize;
7242            let expr = cur.read_str()?;
7243            if let Some(col) = t.schema_mut().columns.get_mut(pos) {
7244                col.runtime_default = Some(expr);
7245            }
7246        }
7247    }
7248    // v7.13.0 — CHECK constraints appendix (FILE_VERSION 23+).
7249    // v22 and below leave the vec empty.
7250    if version >= 23 {
7251        let check_count = cur.read_u16()? as usize;
7252        let mut checks = Vec::with_capacity(check_count);
7253        for _ in 0..check_count {
7254            checks.push(cur.read_str()?);
7255        }
7256        t.schema_mut().checks = checks;
7257    }
7258    // v7.17.0 Phase 1.4 — per-table user_enum_type appendix
7259    // (FILE_VERSION 29+). Layout: [u16 count] then
7260    // [u16 col_pos][str enum_name] per binding.
7261    if version >= 29 {
7262        let binding_count = cur.read_u16()? as usize;
7263        for _ in 0..binding_count {
7264            let col_pos = cur.read_u16()? as usize;
7265            let ename = cur.read_str()?;
7266            if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7267                col.user_enum_type = Some(ename);
7268            }
7269        }
7270    }
7271    // v7.17.0 Phase 1.5 — per-table user_domain_type appendix
7272    // (FILE_VERSION 30+). Same shape as the enum one.
7273    if version >= 30 {
7274        let binding_count = cur.read_u16()? as usize;
7275        for _ in 0..binding_count {
7276            let col_pos = cur.read_u16()? as usize;
7277            let dname = cur.read_str()?;
7278            if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7279                col.user_domain_type = Some(dname);
7280            }
7281        }
7282    }
7283    // v7.17.0 Phase 2.1 — per-table on_update_runtime appendix
7284    // (FILE_VERSION 32+). Sparse layout matches the enum/
7285    // domain bindings.
7286    if version >= 32 {
7287        let binding_count = cur.read_u16()? as usize;
7288        for _ in 0..binding_count {
7289            let col_pos = cur.read_u16()? as usize;
7290            let expr_src = cur.read_str()?;
7291            if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7292                col.on_update_runtime = Some(expr_src);
7293            }
7294        }
7295    }
7296    // v7.17.0 Phase 2.5 — per-table collation appendix
7297    // (FILE_VERSION 34+). Sparse: only non-Binary columns
7298    // land. v33-and-below readers leave every column at its
7299    // ColumnSchema::new default (Binary). Unknown tags from a
7300    // forward-incompat snapshot read back as Binary.
7301    if version >= 34 {
7302        let binding_count = cur.read_u16()? as usize;
7303        for _ in 0..binding_count {
7304            let col_pos = cur.read_u16()? as usize;
7305            let tag = cur.read_u8()?;
7306            let collation = match tag {
7307                Collation::TAG_CASE_INSENSITIVE => Collation::CaseInsensitive,
7308                _ => Collation::Binary,
7309            };
7310            if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7311                col.collation = collation;
7312            }
7313        }
7314    }
7315    // v7.17.0 Phase 4.4 — per-table is_unsigned appendix
7316    // (FILE_VERSION 35+). Sparse: only UNSIGNED columns land.
7317    // v34-and-below readers leave every column at
7318    // `is_unsigned = false`.
7319    if version >= 35 {
7320        let binding_count = cur.read_u16()? as usize;
7321        for _ in 0..binding_count {
7322            let col_pos = cur.read_u16()? as usize;
7323            if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7324                col.is_unsigned = true;
7325            }
7326        }
7327    }
7328    // v7.17.0 Phase 3.P0-36 — per-table inline_enum_variants
7329    // appendix (FILE_VERSION 41+). Sparse: only ENUM columns land.
7330    // v40-and-below readers leave every column at
7331    // `inline_enum_variants = None`.
7332    if version >= 41 {
7333        let binding_count = cur.read_u16()? as usize;
7334        for _ in 0..binding_count {
7335            let col_pos = cur.read_u16()? as usize;
7336            let variant_count = cur.read_u16()? as usize;
7337            let mut variants = Vec::with_capacity(variant_count);
7338            for _ in 0..variant_count {
7339                variants.push(cur.read_str()?);
7340            }
7341            if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7342                col.inline_enum_variants = Some(variants);
7343            }
7344        }
7345    }
7346    // v7.17.0 Phase 3.P0-37 — per-table inline_set_variants
7347    // appendix (FILE_VERSION 42+). Sparse: only SET columns land.
7348    if version >= 42 {
7349        let binding_count = cur.read_u16()? as usize;
7350        for _ in 0..binding_count {
7351            let col_pos = cur.read_u16()? as usize;
7352            let variant_count = cur.read_u16()? as usize;
7353            let mut variants = Vec::with_capacity(variant_count);
7354            for _ in 0..variant_count {
7355                variants.push(cur.read_str()?);
7356            }
7357            if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7358                col.inline_set_variants = Some(variants);
7359            }
7360        }
7361    }
7362    let _ = table_name;
7363    Ok(())
7364}
7365
7366fn deserialize_rows(
7367    cur: &mut Cursor<'_>,
7368    t: &mut Table,
7369    _n_cols: usize,
7370) -> Result<(), StorageError> {
7371    let row_count = cur.read_u32()? as usize;
7372    // v4.39: PV has no `reserve` (the BVT doesn't preallocate a
7373    // contiguous buffer); we just push directly and let the trie
7374    // grow. v5.1: row decode reuses `decode_row_body_dense` so the
7375    // catalog and cold-tier segments share one row codec.
7376    let mut hot_bytes: u64 = 0;
7377    for _ in 0..row_count {
7378        let tail = &cur.buf[cur.pos..];
7379        let (row, consumed) = decode_row_body_dense(tail, &t.schema, cur.codec_version)?;
7380        cur.pos += consumed;
7381        // v5.2.1: account for hot bytes as we go; the snapshot's row
7382        // block bytes are exactly what `encode_row_body_dense` would
7383        // produce, so `consumed` would do too — but going via the
7384        // helper keeps the counter's definition coupled to the
7385        // encoder rather than the snapshot's row prefix layout.
7386        hot_bytes = hot_bytes.saturating_add(row_body_encoded_len(&row, &t.schema) as u64);
7387        t.rows.push_mut(row);
7388    }
7389    t.hot_bytes = hot_bytes;
7390    Ok(())
7391}
7392
7393fn deserialize_indices(
7394    cur: &mut Cursor<'_>,
7395    t: &mut Table,
7396    version: u8,
7397) -> Result<(), StorageError> {
7398    let index_count = cur.read_u16()? as usize;
7399    for _ in 0..index_count {
7400        let idx_name = cur.read_str()?;
7401        let col_pos = cur.read_u16()? as usize;
7402        let column_name = t
7403            .schema
7404            .columns
7405            .get(col_pos)
7406            .ok_or_else(|| {
7407                StorageError::Corrupt(format!(
7408                    "index {idx_name:?} points at non-existent column position {col_pos}"
7409                ))
7410            })?
7411            .name
7412            .clone();
7413        let kind_tag = cur.read_u8()?;
7414        match kind_tag {
7415            0 => {
7416                if version >= 9 {
7417                    // v9+: BTree entries serialised inline (tag-prefixed
7418                    // locator codec). Restore the map directly so any
7419                    // freezer-produced Cold locators come back exactly
7420                    // as they went out.
7421                    let map = read_btree_map(cur)?;
7422                    t.restore_btree_index(idx_name, &column_name, map)?;
7423                } else {
7424                    // v8: no entries on disk; rebuild from rows. Every
7425                    // entry is materialised as `RowLocator::Hot(i)` —
7426                    // semantically identical to the v5.1 in-memory state
7427                    // since v8 catalogs never produced Cold locators.
7428                    t.add_index(idx_name, &column_name)?;
7429                }
7430            }
7431            1 => {
7432                let m = cur.read_u16()? as usize;
7433                let graph = cur.read_nsw_graph(m)?;
7434                t.restore_nsw_index(idx_name, &column_name, graph)?;
7435            }
7436            2 => {
7437                // v6.7.1 — BRIN tag. Payload is the column type
7438                // tag. No further data — summaries live in cold
7439                // segments.
7440                let column_type = cur.read_data_type()?;
7441                t.restore_brin_index(idx_name, &column_name, column_type)?;
7442            }
7443            3 => {
7444                // v7.12.3 — GIN tag. Payload mirrors the BTree
7445                // encoding but with String (lexeme word) keys.
7446                // Only emitted by FILE_VERSION 21+ writers — v20
7447                // and earlier degraded `USING gin` to BTree.
7448                let map = read_gin_map(cur)?;
7449                t.restore_gin_index(idx_name, &column_name, map)?;
7450            }
7451            4 => {
7452                // v7.15.0 — trigram-GIN tag (`gin_trgm_ops`).
7453                // Same payload shape as tag 3 (String → posting
7454                // list); only emitted by FILE_VERSION 24+ writers.
7455                if version < 24 {
7456                    return Err(StorageError::Corrupt(format!(
7457                        "trigram-GIN index tag 4 found in catalog FILE_VERSION {version}; \
7458                         FILE_VERSION 24+ required (v7.15.0 introduced this tag)"
7459                    )));
7460                }
7461                let map = read_gin_map(cur)?;
7462                t.restore_gin_trgm_index(idx_name, &column_name, map)?;
7463            }
7464            5 => {
7465                // v7.17.0 Phase 2.2 — fulltext-GIN tag (MySQL
7466                // `FULLTEXT KEY` surface). Same payload shape as
7467                // tag 3 / tag 4 (String → posting list); only
7468                // emitted by FILE_VERSION 33+ writers.
7469                if version < 33 {
7470                    return Err(StorageError::Corrupt(format!(
7471                        "fulltext-GIN index tag 5 found in catalog FILE_VERSION {version}; \
7472                         FILE_VERSION 33+ required (v7.17.0 Phase 2.2 introduced this tag)"
7473                    )));
7474                }
7475                let map = read_gin_map(cur)?;
7476                t.restore_gin_fulltext_index(idx_name, &column_name, map)?;
7477            }
7478            other => {
7479                return Err(StorageError::Corrupt(format!(
7480                    "unknown index kind tag: {other}"
7481                )));
7482            }
7483        }
7484        // v6.8.0 — included_columns appendix per index. v11- snapshots
7485        // stop before this u16; v12+ always carries it (possibly 0).
7486        if version >= 12 {
7487            let num_included = cur.read_u16()? as usize;
7488            if num_included > 0 {
7489                let mut included: Vec<usize> = Vec::with_capacity(num_included);
7490                for _ in 0..num_included {
7491                    let cp = cur.read_u16()? as usize;
7492                    if cp >= t.schema.columns.len() {
7493                        return Err(StorageError::Corrupt(format!(
7494                            "INCLUDE column position {cp} out of range \
7495                             ({} schema columns)",
7496                            t.schema.columns.len()
7497                        )));
7498                    }
7499                    included.push(cp);
7500                }
7501                if let Some(last) = t.indices.last_mut() {
7502                    last.included_columns = included;
7503                }
7504            }
7505            // v6.8.1 — partial_predicate appendix.
7506            match cur.read_u8()? {
7507                0 => {}
7508                1 => {
7509                    let pred = cur.read_str()?;
7510                    if let Some(last) = t.indices.last_mut() {
7511                        last.partial_predicate = Some(pred);
7512                    }
7513                }
7514                other => {
7515                    return Err(StorageError::Corrupt(format!(
7516                        "partial_predicate tag: unknown byte {other}"
7517                    )));
7518                }
7519            }
7520            // v6.8.2 — expression appendix.
7521            match cur.read_u8()? {
7522                0 => {}
7523                1 => {
7524                    let expr = cur.read_str()?;
7525                    if let Some(last) = t.indices.last_mut() {
7526                        last.expression = Some(expr);
7527                    }
7528                }
7529                other => {
7530                    return Err(StorageError::Corrupt(format!(
7531                        "expression tag: unknown byte {other}"
7532                    )));
7533                }
7534            }
7535            // v7.9.29 — is_unique appendix (FILE_VERSION 16+).
7536            // v15-and-below catalogs stop before this byte. mailrs K1.
7537            if version >= 16 {
7538                match cur.read_u8()? {
7539                    0 => {}
7540                    1 => {
7541                        if let Some(last) = t.indices.last_mut() {
7542                            last.is_unique = true;
7543                        }
7544                    }
7545                    other => {
7546                        return Err(StorageError::Corrupt(format!(
7547                            "is_unique tag: unknown byte {other}"
7548                        )));
7549                    }
7550                }
7551                // v7.9.29 — extra_column_positions appendix.
7552                let n = cur.read_u16()? as usize;
7553                if n > 0 {
7554                    let mut extras: Vec<usize> = Vec::with_capacity(n);
7555                    for _ in 0..n {
7556                        let cp = cur.read_u16()? as usize;
7557                        if cp >= t.schema.columns.len() {
7558                            return Err(StorageError::Corrupt(format!(
7559                                "extra column position {cp} out of range \
7560                                 ({} schema columns)",
7561                                t.schema.columns.len()
7562                            )));
7563                        }
7564                        extras.push(cp);
7565                    }
7566                    if let Some(last) = t.indices.last_mut() {
7567                        last.extra_column_positions = extras;
7568                    }
7569                }
7570            }
7571        }
7572    }
7573    Ok(())
7574}
7575
7576/// Parse a v9 `BTree` index payload — `[u32 entry_count]` followed by
7577/// `entry_count` `(IndexKey, Vec<RowLocator>)` pairs. The locator list
7578/// uses the v5.1 tag-prefixed wire format (`RowLocator::read_le`).
7579fn read_btree_map(
7580    cur: &mut Cursor<'_>,
7581) -> Result<PersistentBTreeMap<IndexKey, Vec<RowLocator>>, StorageError> {
7582    let entry_count = cur.read_u32()? as usize;
7583    let mut map = PersistentBTreeMap::new();
7584    for _ in 0..entry_count {
7585        let key = cur.read_index_key()?;
7586        let locator_count = cur.read_u32()? as usize;
7587        let mut locators = Vec::with_capacity(locator_count);
7588        for _ in 0..locator_count {
7589            let tail = &cur.buf[cur.pos..];
7590            let (loc, consumed) = RowLocator::read_le(tail).map_err(|e| {
7591                StorageError::Corrupt(format!("row_locator decode at offset {}: {e}", cur.pos))
7592            })?;
7593            cur.pos += consumed;
7594            locators.push(loc);
7595        }
7596        map.insert_mut(key, locators);
7597    }
7598    Ok(map)
7599}
7600
7601/// v7.12.3 — parse a `Gin` index payload. Mirrors [`read_btree_map`]
7602/// but with `String` (lexeme word) keys instead of `IndexKey`.
7603/// FILE_VERSION 21+ only.
7604fn read_gin_map(
7605    cur: &mut Cursor<'_>,
7606) -> Result<PersistentBTreeMap<String, Vec<RowLocator>>, StorageError> {
7607    let entry_count = cur.read_u32()? as usize;
7608    let mut map = PersistentBTreeMap::new();
7609    for _ in 0..entry_count {
7610        let word = cur.read_str()?;
7611        let locator_count = cur.read_u32()? as usize;
7612        let mut locators = Vec::with_capacity(locator_count);
7613        for _ in 0..locator_count {
7614            let tail = &cur.buf[cur.pos..];
7615            let (loc, consumed) = RowLocator::read_le(tail).map_err(|e| {
7616                StorageError::Corrupt(format!("row_locator decode at offset {}: {e}", cur.pos))
7617            })?;
7618            cur.pos += consumed;
7619            locators.push(loc);
7620        }
7621        map.insert_mut(word, locators);
7622    }
7623    Ok(map)
7624}
7625
7626// --- low-level binary helpers ---------------------------------------------
7627
7628/// Write a `DataType` as a tag byte + optional payload (Vector carries its
7629/// `u32` dimension). Inverse: [`read_data_type`].
7630/// Serialize an HNSW graph after the `[kind=1][u16 M]` header (v7).
7631/// Layout:
7632/// - `[u16 m_max_0]`
7633/// - `[entry u32]` — `u32::MAX` means `None`, else the entry node index
7634/// - `[u8 entry_level]`
7635/// - `[node_count u32]`
7636/// - for each node: `[u8 level]`  (top layer for this node)
7637/// - `[layer_count u8]`
7638/// - for each layer `0..layer_count`:
7639///     - `[u32 layer_node_count]` (== `node_count`; per-layer slot)
7640///     - for each node: `[u16 neighbor_count] [u32 neighbor]*`
7641fn write_nsw_graph(out: &mut Vec<u8>, g: &NswGraph) {
7642    let entry = g.entry.map_or(u32::MAX, |e| {
7643        u32::try_from(e).expect("NSW entry fits in u32")
7644    });
7645    write_u16(
7646        out,
7647        u16::try_from(g.m_max_0).expect("HNSW m_max_0 fits in u16"),
7648    );
7649    out.extend_from_slice(&entry.to_le_bytes());
7650    out.push(g.entry_level);
7651    let node_count = g.levels.len();
7652    write_u32(
7653        out,
7654        u32::try_from(node_count).expect("HNSW node count fits in u32"),
7655    );
7656    for &lvl in &g.levels {
7657        out.push(lvl);
7658    }
7659    let layer_count = u8::try_from(g.layers.len()).expect("HNSW layer count ≤ 255");
7660    out.push(layer_count);
7661    for layer in &g.layers {
7662        write_u32(
7663            out,
7664            u32::try_from(layer.len()).expect("HNSW per-layer node count fits in u32"),
7665        );
7666        for neighbors in layer {
7667            write_u16(
7668                out,
7669                u16::try_from(neighbors.len()).expect("HNSW neighbour list fits in u16"),
7670            );
7671            // v6.1.x: neighbour slot is already u32 in memory; just
7672            // emit the raw bytes. (v6.0 stored usize and converted
7673            // here.)
7674            for &peer in neighbors {
7675                write_u32(out, peer);
7676            }
7677        }
7678    }
7679}
7680
7681fn write_data_type(out: &mut Vec<u8>, t: DataType) {
7682    match t {
7683        DataType::Int => out.push(1),
7684        DataType::BigInt => out.push(2),
7685        DataType::Float => out.push(3),
7686        DataType::Text => out.push(4),
7687        DataType::Bool => out.push(5),
7688        DataType::Vector { dim, encoding } => match encoding {
7689            // Tag 6: pre-v6 F32 vector. Layout unchanged; pre-v6
7690            // binaries continue to deserialise this exactly as
7691            // before.
7692            VecEncoding::F32 => {
7693                out.push(6);
7694                out.extend_from_slice(&dim.to_le_bytes());
7695            }
7696            // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
7697            // forward-compat fence story as SQ8 below.
7698            VecEncoding::F16 => {
7699                out.push(15);
7700                out.extend_from_slice(&dim.to_le_bytes());
7701            }
7702            // v6.0.1: new tag 14 for `VECTOR(N) USING SQ8` column
7703            // type. Pre-v6 readers fall through `read_data_type`'s
7704            // catch-all and surface `Corrupt("unknown data type tag")`
7705            // — the explicit forward-compat fence called out in
7706            // V6_DESIGN deliberation #5.
7707            VecEncoding::Sq8 => {
7708                out.push(14);
7709                out.extend_from_slice(&dim.to_le_bytes());
7710            }
7711        },
7712        DataType::SmallInt => out.push(7),
7713        DataType::Varchar(max) => {
7714            out.push(8);
7715            out.extend_from_slice(&max.to_le_bytes());
7716        }
7717        DataType::Char(size) => {
7718            out.push(9);
7719            out.extend_from_slice(&size.to_le_bytes());
7720        }
7721        DataType::Numeric { precision, scale } => {
7722            out.push(10);
7723            out.push(precision);
7724            out.push(scale);
7725        }
7726        DataType::Date => out.push(11),
7727        DataType::Timestamp => out.push(12),
7728        // v7.9.2 — tag 17 for TIMESTAMPTZ. Body = i64 microseconds
7729        // UTC, identical to tag 12. Only the schema-side type tag
7730        // differs (for wire OID advertisement).
7731        DataType::Timestamptz => out.push(17),
7732        // INTERVAL is runtime-only — CREATE TABLE never produces a
7733        // column with this type, so write_data_type must not be called
7734        // on it. (Disk-format codepoint reserved for a future v3 where
7735        // INTERVAL becomes storable.)
7736        DataType::Interval => {
7737            unreachable!("DataType::Interval has no on-disk encoding in v2.11")
7738        }
7739        DataType::Json => out.push(13),
7740        // v7.9.0: tag 16 for `JSONB`. Same on-disk layout as
7741        // tag 13 — only the wire OID differs.
7742        DataType::Jsonb => out.push(16),
7743        // v7.10.4: tag 18 for `BYTEA`. Body = [u16 len][bytes].
7744        DataType::Bytes => out.push(18),
7745        // v7.10.9: tag 19 for `TEXT[]`. Body = [u16 count][per
7746        // element: u8 null + (if non-null) u16 len + utf-8].
7747        DataType::TextArray => out.push(19),
7748        // v7.11.12: tag 20 for `INT[]`. Body = [u16 count][per
7749        // element: u8 null + (if non-null) i32 LE].
7750        DataType::IntArray => out.push(20),
7751        // v7.11.12: tag 21 for `BIGINT[]`. Body = [u16 count][per
7752        // element: u8 null + (if non-null) i64 LE].
7753        DataType::BigIntArray => out.push(21),
7754        // v7.12.0: tag 22 for `tsvector`. No body — type identity
7755        // alone. Catalog FILE_VERSION 20+.
7756        DataType::TsVector => out.push(22),
7757        // v7.12.0: tag 23 for `tsquery`. No body. Catalog
7758        // FILE_VERSION 20+.
7759        DataType::TsQuery => out.push(23),
7760        // v7.17.0: tag 24 for `UUID`. No body — type identity
7761        // alone. Catalog FILE_VERSION 36+.
7762        DataType::Uuid => out.push(24),
7763        // v7.17.0 Phase 3.P0-32: tag 25 for `TIME`. No body — type
7764        // identity alone. Catalog FILE_VERSION 37+.
7765        DataType::Time => out.push(25),
7766        // v7.17.0 Phase 3.P0-33: tag 26 for `YEAR`. No body — type
7767        // identity alone. Catalog FILE_VERSION 38+.
7768        DataType::Year => out.push(26),
7769        // v7.17.0 Phase 3.P0-34: tag 27 for `TIMETZ`. No body —
7770        // type identity alone. Catalog FILE_VERSION 39+.
7771        DataType::TimeTz => out.push(27),
7772        // v7.17.0 Phase 3.P0-35: tag 28 for `MONEY`. No body —
7773        // type identity alone. Catalog FILE_VERSION 40+.
7774        DataType::Money => out.push(28),
7775        // v7.17.0 Phase 3.P0-38: tag 29 for range types. Body
7776        // = `[u8 RangeKind tag]`. Catalog FILE_VERSION 43+.
7777        DataType::Range(k) => {
7778            out.push(29);
7779            out.push(k.tag());
7780        }
7781        // v7.17.0 Phase 3.P0-39: tag 30 for hstore. No body —
7782        // type identity alone. Catalog FILE_VERSION 44+.
7783        DataType::Hstore => out.push(30),
7784        // v7.17.0 Phase 3.P0-40: tag 31/32/33 for 2D arrays.
7785        // No body — type identity alone. Catalog FILE_VERSION 45+.
7786        DataType::IntArray2D => out.push(31),
7787        DataType::BigIntArray2D => out.push(32),
7788        DataType::TextArray2D => out.push(33),
7789    }
7790}
7791
7792impl Cursor<'_> {
7793    fn read_data_type(&mut self) -> Result<DataType, StorageError> {
7794        let tag = self.read_u8()?;
7795        match tag {
7796            1 => Ok(DataType::Int),
7797            2 => Ok(DataType::BigInt),
7798            3 => Ok(DataType::Float),
7799            4 => Ok(DataType::Text),
7800            5 => Ok(DataType::Bool),
7801            6 => Ok(DataType::Vector {
7802                dim: self.read_u32()?,
7803                encoding: VecEncoding::F32,
7804            }),
7805            7 => Ok(DataType::SmallInt),
7806            8 => Ok(DataType::Varchar(self.read_u32()?)),
7807            9 => Ok(DataType::Char(self.read_u32()?)),
7808            10 => {
7809                let precision = self.read_u8()?;
7810                let scale = self.read_u8()?;
7811                Ok(DataType::Numeric { precision, scale })
7812            }
7813            11 => Ok(DataType::Date),
7814            12 => Ok(DataType::Timestamp),
7815            13 => Ok(DataType::Json),
7816            14 => Ok(DataType::Vector {
7817                dim: self.read_u32()?,
7818                encoding: VecEncoding::Sq8,
7819            }),
7820            // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
7821            // [u32 dim] type-tag payload as F32 / SQ8; the encoding
7822            // lives in the tag byte itself.
7823            15 => Ok(DataType::Vector {
7824                dim: self.read_u32()?,
7825                encoding: VecEncoding::F16,
7826            }),
7827            // v7.9.0: tag 16 for `JSONB`. Storage shape == Json;
7828            // we only carry the type tag so the wire layer can
7829            // emit PG OID 3802 instead of 114.
7830            16 => Ok(DataType::Jsonb),
7831            // v7.9.2: tag 17 for `TIMESTAMPTZ`. Storage shape ==
7832            // Timestamp (i64 microseconds UTC); only the wire OID
7833            // (1184) differs.
7834            17 => Ok(DataType::Timestamptz),
7835            // v7.10.4: tag 18 for `BYTEA`. Catalog FILE_VERSION 17+.
7836            18 => Ok(DataType::Bytes),
7837            // v7.10.9: tag 19 for `TEXT[]`. Catalog FILE_VERSION 18+.
7838            19 => Ok(DataType::TextArray),
7839            // v7.11.12: tags 20/21 for INT[]/BIGINT[]. FILE_VERSION 19+.
7840            20 => Ok(DataType::IntArray),
7841            21 => Ok(DataType::BigIntArray),
7842            // v7.12.0: tags 22/23 for tsvector / tsquery. Catalog
7843            // FILE_VERSION 20+.
7844            22 => Ok(DataType::TsVector),
7845            23 => Ok(DataType::TsQuery),
7846            // v7.17.0: tag 24 — UUID. Catalog FILE_VERSION 36+.
7847            24 => Ok(DataType::Uuid),
7848            // v7.17.0 Phase 3.P0-32: tag 25 — TIME. Catalog
7849            // FILE_VERSION 37+.
7850            25 => Ok(DataType::Time),
7851            // v7.17.0 Phase 3.P0-33: tag 26 — YEAR. Catalog
7852            // FILE_VERSION 38+.
7853            26 => Ok(DataType::Year),
7854            // v7.17.0 Phase 3.P0-34: tag 27 — TIMETZ. Catalog
7855            // FILE_VERSION 39+.
7856            27 => Ok(DataType::TimeTz),
7857            // v7.17.0 Phase 3.P0-35: tag 28 — MONEY. Catalog
7858            // FILE_VERSION 40+.
7859            28 => Ok(DataType::Money),
7860            // v7.17.0 Phase 3.P0-38: tag 29 + RangeKind tag.
7861            29 => {
7862                let kt = self.read_u8()?;
7863                let k = RangeKind::from_tag(kt)
7864                    .ok_or_else(|| StorageError::Corrupt(format!("unknown RangeKind tag: {kt}")))?;
7865                Ok(DataType::Range(k))
7866            }
7867            // v7.17.0 Phase 3.P0-39: tag 30 — HSTORE.
7868            30 => Ok(DataType::Hstore),
7869            // v7.17.0 Phase 3.P0-40: tag 31/32/33 — 2D arrays.
7870            31 => Ok(DataType::IntArray2D),
7871            32 => Ok(DataType::BigIntArray2D),
7872            33 => Ok(DataType::TextArray2D),
7873            other => Err(StorageError::Corrupt(format!(
7874                "unknown data type tag: {other}"
7875            ))),
7876        }
7877    }
7878}
7879
7880/// Fast computation of the byte length [`encode_row_body_dense`]
7881/// would produce, without allocating the output buffer. Mirrors the
7882/// encoder's per-column body sizing so the v5.2.1 `Table::hot_bytes`
7883/// incremental counter doesn't pay an alloc-per-insert tax. Returns
7884/// the exact same `usize` as `encode_row_body_dense(row, schema).len()`.
7885pub fn row_body_encoded_len(row: &Row, schema: &TableSchema) -> usize {
7886    debug_assert_eq!(
7887        row.values.len(),
7888        schema.columns.len(),
7889        "row_body_encoded_len: row arity must match schema"
7890    );
7891    let bitmap_bytes = schema.columns.len().div_ceil(8);
7892    let mut n = bitmap_bytes;
7893    for (col_idx, v) in row.values.iter().enumerate() {
7894        if matches!(v, Value::Null) {
7895            continue;
7896        }
7897        n += value_body_encoded_len(v, schema.columns[col_idx].ty);
7898    }
7899    n
7900}
7901
7902/// Byte length a single cell consumes when written by
7903/// `write_value_body`. Used by [`row_body_encoded_len`]; kept in
7904/// lock-step with the encoder. The `_ty` slot is reserved for future
7905/// type-dependent encodings — every variant currently writes a fixed
7906/// body shape regardless of the declared column type.
7907fn value_body_encoded_len(v: &Value, _ty: DataType) -> usize {
7908    match v {
7909        Value::SmallInt(_) => 2,
7910        // 4-byte body: i32 / Date.
7911        Value::Int(_) | Value::Date(_) => 4,
7912        // 8-byte body: i64 / f64 / Timestamp.
7913        Value::BigInt(_) | Value::Float(_) | Value::Timestamp(_) => 8,
7914        Value::Bool(_) => 1,
7915        // Text/Varchar/Char/Json share the [u16 len][utf-8] layout;
7916        // v7.23 — texts >= 64 KiB take the 6-byte escape header
7917        // (these sizes feed the freezer's hot-bytes budget, so the
7918        // estimate must not undercount).
7919        Value::Text(s) | Value::Json(s) => {
7920            if s.len() >= STR_LEN_ESCAPE as usize {
7921                6 + s.len()
7922            } else {
7923                2 + s.len()
7924            }
7925        }
7926        // [u32 dim][f32 * dim]
7927        Value::Vector(vec) => 4 + 4 * vec.len(),
7928        // v6.0.1: SQ8 cell on-disk shape — [u32 dim][f32 min]
7929        // [f32 max][u8 * dim] = 12 + dim bytes. `hot_bytes`
7930        // tracking on `Table::insert` calls this every row, so
7931        // returning the real size now (even though the actual
7932        // `write_value_body` writer lands in step 6) keeps the
7933        // sizing arithmetic honest for in-memory benches.
7934        Value::Sq8Vector(q) => 4 + 4 + 4 + q.bytes.len(),
7935        // v6.0.3: halfvec on-disk shape — [u32 dim][u16 LE * dim]
7936        // = 4 + 2 * dim bytes.
7937        Value::HalfVector(h) => 4 + h.bytes.len(),
7938        // [i128 scaled][u8 scale]
7939        Value::Numeric { .. } => 16 + 1,
7940        // v7.10.4: BYTEA on-disk shape mirrors Text — [u16 len][bytes].
7941        // The 16-bit length cap is the same TEXT/JSON limit (~65 KB);
7942        // larger blobs need toast-style chunking which is a v7.11
7943        // carve-out (kept aligned with TEXT for now so the catalog
7944        // snapshot stays simple).
7945        Value::Bytes(b) => 2 + b.len(),
7946        // v7.10.9: TEXT[] on-disk shape — [u16 count][per element:
7947        // u8 null flag + (when non-null) u16 len + utf-8 bytes].
7948        Value::TextArray(items) => {
7949            let mut n = 2; // count prefix
7950            for item in items {
7951                n += 1; // null flag
7952                if let Some(s) = item {
7953                    n += 2 + s.len();
7954                }
7955            }
7956            n
7957        }
7958        // v7.11.12: INT[] / BIGINT[] — [u16 count][per element:
7959        // u8 null + (when non-null) fixed-width LE].
7960        Value::IntArray(items) => {
7961            2 + items
7962                .iter()
7963                .map(|x| if x.is_some() { 5 } else { 1 })
7964                .sum::<usize>()
7965        }
7966        Value::BigIntArray(items) => {
7967            2 + items
7968                .iter()
7969                .map(|x| if x.is_some() { 9 } else { 1 })
7970                .sum::<usize>()
7971        }
7972        // v7.12.0: tsvector dense body — [u16 lexeme_count][per
7973        // lex: u16 word_len + utf-8 word + u16 pos_count + (u16
7974        // LE * pos_count) + u8 weight].
7975        Value::TsVector(lexs) => {
7976            let mut n = 2;
7977            for l in lexs {
7978                n += 2 + l.word.len() + 2 + 2 * l.positions.len() + 1;
7979            }
7980            n
7981        }
7982        // v7.12.0: tsquery dense body — prefix-coded tree.
7983        // Sizing must match `write_tsquery_body` walker.
7984        Value::TsQuery(ast) => tsquery_encoded_len(ast),
7985        // v7.17.0: UUID dense body — fixed 16 bytes, no prefix.
7986        Value::Uuid(_) => 16,
7987        // v7.17.0 Phase 3.P0-32: TIME dense body — fixed i64 LE.
7988        Value::Time(_) => 8,
7989        // v7.17.0 Phase 3.P0-33: YEAR dense body — fixed u16 LE.
7990        Value::Year(_) => 2,
7991        // v7.17.0 Phase 3.P0-34: TIMETZ dense body — i64 LE + i32 LE.
7992        Value::TimeTz { .. } => 12,
7993        // v7.17.0 Phase 3.P0-35: MONEY dense body — i64 LE cents.
7994        Value::Money(_) => 8,
7995        // v7.17.0 Phase 3.P0-38: range dense body — `[u8 flags]
7996        // [if lower: write_value(lower)] [if upper: write_value(upper)]`.
7997        // Element uses the schema-agnostic write_value codec
7998        // (which carries its own tag byte). The flags byte
7999        // captures empty/lower_some/upper_some/lower_inc/upper_inc.
8000        Value::Range { lower, upper, .. } => {
8001            1 + lower
8002                .as_ref()
8003                .map(|v| write_value_encoded_len(v))
8004                .unwrap_or(0)
8005                + upper
8006                    .as_ref()
8007                    .map(|v| write_value_encoded_len(v))
8008                    .unwrap_or(0)
8009        }
8010        // v7.17.0 Phase 3.P0-39: hstore dense body — `[u32 count]
8011        // then per pair [u32 klen][k bytes][u8 has_val][if has_val:
8012        // u32 vlen][v bytes]`.
8013        Value::Hstore(pairs) => {
8014            let mut n = 4;
8015            for (k, v) in pairs {
8016                n += 4 + k.len() + 1;
8017                if let Some(val) = v {
8018                    n += 4 + val.len();
8019                }
8020            }
8021            n
8022        }
8023        // v7.17.0 Phase 3.P0-40: 2D arrays dense body — `[u32 rows]
8024        // [u32 cols] then row-major elements with per-element
8025        // `[u8 null_flag][if non-null: element body]`.
8026        Value::IntArray2D(rows) => {
8027            let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8028            8 + rows.len() * cols * (1 + 4)
8029        }
8030        Value::BigIntArray2D(rows) => {
8031            let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8032            8 + rows.len() * cols * (1 + 8)
8033        }
8034        Value::TextArray2D(rows) => {
8035            let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8036            let mut n = 8 + rows.len() * cols;
8037            for row in rows {
8038                for s in row.iter().flatten() {
8039                    n += 4 + s.len();
8040                }
8041            }
8042            n
8043        }
8044        // NULL is encoded only in the bitmap, never in the body.
8045        Value::Null => 0,
8046        // INTERVAL has no on-disk encoding (see write_value_body).
8047        Value::Interval { .. } => {
8048            unreachable!("Value::Interval has no on-disk encoding")
8049        }
8050    }
8051}
8052
8053/// Encode one row's body in the v3.0.2 dense format (`FILE_VERSION`
8054/// 8): per-row NULL bitmap (1 bit/col, ceil(cols/8) bytes), then
8055/// each non-NULL cell as `write_value_body`. Same wire shape the
8056/// catalog snapshot writes per row inside its rows-block. Exposed
8057/// pub so v5.1+ cold-tier segment writers can produce row payloads
8058/// that the catalog [`decode_row_body_dense`] decodes 1:1.
8059///
8060/// `row.values.len()` must equal `schema.columns.len()` — the row
8061/// is expected to have been validated by `Table::insert` (the
8062/// engine's INSERT path) before reaching this function.
8063pub fn encode_row_body_dense(row: &Row, schema: &TableSchema) -> Vec<u8> {
8064    debug_assert_eq!(
8065        row.values.len(),
8066        schema.columns.len(),
8067        "dense encode: row arity must match schema"
8068    );
8069    let bitmap_bytes = schema.columns.len().div_ceil(8);
8070    // 8 B per fixed-width cell is a reasonable average; the buffer
8071    // grows past this for variable-width Text/Vector cells.
8072    let mut out = Vec::with_capacity(bitmap_bytes + schema.columns.len() * 8);
8073    let bitmap_offset = out.len();
8074    out.resize(bitmap_offset + bitmap_bytes, 0);
8075    for (i, v) in row.values.iter().enumerate() {
8076        if matches!(v, Value::Null) {
8077            out[bitmap_offset + i / 8] |= 1 << (i % 8);
8078        }
8079    }
8080    for (col_idx, v) in row.values.iter().enumerate() {
8081        if matches!(v, Value::Null) {
8082            continue;
8083        }
8084        write_value_body(&mut out, v, schema.columns[col_idx].ty);
8085    }
8086    out
8087}
8088
8089/// Inverse of [`encode_row_body_dense`]. Reads one row's body from
8090/// `bytes` and returns it plus the number of bytes consumed (so a
8091/// caller decoding a back-to-back stream of rows can advance its
8092/// cursor). Returns `StorageError::Corrupt` on truncation, bad
8093/// UTF-8, or unknown cell tags.
8094pub fn decode_row_body_dense(
8095    bytes: &[u8],
8096    schema: &TableSchema,
8097    codec_version: u8,
8098) -> Result<(Row, usize), StorageError> {
8099    let mut cur = Cursor::new(bytes).with_codec_version(codec_version);
8100    let bitmap_bytes = schema.columns.len().div_ceil(8);
8101    let mut bitmap_buf = [0u8; 32];
8102    if bitmap_bytes > bitmap_buf.len() {
8103        return Err(StorageError::Corrupt(format!(
8104            "row NULL bitmap {bitmap_bytes} B exceeds 32 B cap"
8105        )));
8106    }
8107    let slice = cur.take(bitmap_bytes)?;
8108    bitmap_buf[..bitmap_bytes].copy_from_slice(slice);
8109    let mut values = Vec::with_capacity(schema.columns.len());
8110    for (col_idx, col) in schema.columns.iter().enumerate() {
8111        if (bitmap_buf[col_idx / 8] >> (col_idx % 8)) & 1 == 1 {
8112            values.push(Value::Null);
8113        } else {
8114            values.push(cur.read_value_body(col.ty)?);
8115        }
8116    }
8117    Ok((Row { values }, cur.pos))
8118}
8119
8120/// Schema-driven dense value encoding (`FILE_VERSION` 8). Caller already
8121/// knows the column type and has decided this cell is non-NULL, so we
8122/// skip the per-cell type tag the v7 `write_value` was writing. NULL
8123/// is encoded via the per-row bitmap before this function runs, never
8124/// reaches here. Used only inside the row-encoding hot loop; the
8125/// schema-default path still goes through the legacy `write_value` so
8126/// DEFAULT values keep their self-describing tag and remain decodable
8127/// without consulting a column type.
8128fn write_value_body(out: &mut Vec<u8>, v: &Value, ty: DataType) {
8129    match (v, ty) {
8130        (Value::SmallInt(n), DataType::SmallInt) => out.extend_from_slice(&n.to_le_bytes()),
8131        (Value::Int(n), DataType::Int) => out.extend_from_slice(&n.to_le_bytes()),
8132        (Value::BigInt(n), DataType::BigInt) => out.extend_from_slice(&n.to_le_bytes()),
8133        (Value::Float(x), DataType::Float) => out.extend_from_slice(&x.to_le_bytes()),
8134        (Value::Bool(b), DataType::Bool) => out.push(u8::from(*b)),
8135        (Value::Text(s), DataType::Text | DataType::Varchar(_) | DataType::Char(_)) => {
8136            write_str(out, s);
8137        }
8138        (
8139            Value::Vector(v),
8140            DataType::Vector {
8141                encoding: VecEncoding::F32,
8142                ..
8143            },
8144        ) => {
8145            let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
8146            out.extend_from_slice(&dim.to_le_bytes());
8147            for x in v {
8148                out.extend_from_slice(&x.to_le_bytes());
8149            }
8150        }
8151        // v6.0.1: SQ8 dense body — [u32 dim][f32 min][f32 max]
8152        // [u8 * dim]. Self-describes its length so v6 readers
8153        // walking rows of a v6 catalog stay aligned even if the
8154        // declared column dim drifts (defensive, not normally
8155        // possible since CREATE TABLE pins the dim).
8156        (
8157            Value::Sq8Vector(q),
8158            DataType::Vector {
8159                encoding: VecEncoding::Sq8,
8160                ..
8161            },
8162        ) => {
8163            let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
8164            out.extend_from_slice(&dim.to_le_bytes());
8165            out.extend_from_slice(&q.min.to_le_bytes());
8166            out.extend_from_slice(&q.max.to_le_bytes());
8167            out.extend_from_slice(&q.bytes);
8168        }
8169        // v6.0.3: halfvec dense body — [u32 dim][u16 LE * dim].
8170        // The raw u16 bytes already live in `h.bytes` little-
8171        // endian, so we just splat them.
8172        (
8173            Value::HalfVector(h),
8174            DataType::Vector {
8175                encoding: VecEncoding::F16,
8176                ..
8177            },
8178        ) => {
8179            let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
8180            out.extend_from_slice(&dim.to_le_bytes());
8181            out.extend_from_slice(&h.bytes);
8182        }
8183        (Value::Numeric { scaled, .. }, DataType::Numeric { scale, .. }) => {
8184            out.extend_from_slice(&scaled.to_le_bytes());
8185            out.push(scale);
8186        }
8187        (Value::Date(d), DataType::Date) => out.extend_from_slice(&d.to_le_bytes()),
8188        (Value::Timestamp(t), DataType::Timestamp | DataType::Timestamptz) => {
8189            out.extend_from_slice(&t.to_le_bytes())
8190        }
8191        // v4.9: JSON stores as length-prefixed text; same shape as
8192        // Text — the type tag lives in the column schema, not the
8193        // per-cell body.
8194        (Value::Json(s), DataType::Json | DataType::Jsonb) => write_str(out, s),
8195        // v7.10.4: BYTEA shares the [u16 len][bytes] shape with
8196        // Text but writes raw bytes (no UTF-8 invariant).
8197        // v7.27 (round-21) — BYTEA takes the escaped length: round-14
8198        // moved TEXT to the escape codec and missed this arm; the
8199        // twin fired during mailrs's production migration window.
8200        (Value::Bytes(b), DataType::Bytes) => write_bytes_escaped(out, b),
8201        // v7.10.9: TEXT[] dense body — [u16 count][per element:
8202        // u8 null flag + (when non-null) u16 len + utf-8 bytes].
8203        (Value::TextArray(items), DataType::TextArray) => {
8204            let count = u16::try_from(items.len()).expect("TEXT[] ≤ 65k elements");
8205            out.extend_from_slice(&count.to_le_bytes());
8206            for item in items {
8207                match item {
8208                    None => out.push(1),
8209                    Some(s) => {
8210                        out.push(0);
8211                        write_bytes_escaped(out, s.as_bytes());
8212                    }
8213                }
8214            }
8215        }
8216        // v7.11.12: INT[] dense body — [u16 count][per element:
8217        // u8 null + (when non-null) i32 LE].
8218        (Value::IntArray(items), DataType::IntArray) => {
8219            let count = u16::try_from(items.len()).expect("INT[] ≤ 65k elements");
8220            out.extend_from_slice(&count.to_le_bytes());
8221            for item in items {
8222                match item {
8223                    None => out.push(1),
8224                    Some(n) => {
8225                        out.push(0);
8226                        out.extend_from_slice(&n.to_le_bytes());
8227                    }
8228                }
8229            }
8230        }
8231        // v7.11.12: BIGINT[] dense body — [u16 count][per element:
8232        // u8 null + (when non-null) i64 LE].
8233        (Value::BigIntArray(items), DataType::BigIntArray) => {
8234            let count = u16::try_from(items.len()).expect("BIGINT[] ≤ 65k elements");
8235            out.extend_from_slice(&count.to_le_bytes());
8236            for item in items {
8237                match item {
8238                    None => out.push(1),
8239                    Some(n) => {
8240                        out.push(0);
8241                        out.extend_from_slice(&n.to_le_bytes());
8242                    }
8243                }
8244            }
8245        }
8246        // v7.12.0: tsvector dense body — see `value_body_encoded_len`
8247        // for layout. Lexemes are written in their already-sorted order.
8248        (Value::TsVector(lexs), DataType::TsVector) => write_tsvector_body(out, lexs),
8249        // v7.12.0: tsquery dense body — prefix-coded tree.
8250        (Value::TsQuery(ast), DataType::TsQuery) => write_tsquery_body(out, ast),
8251        // v7.17.0: UUID dense body — raw 16 bytes (RFC 4122 byte
8252        // order). No length prefix; the type's fixed width makes
8253        // the codec stateless.
8254        (Value::Uuid(b), DataType::Uuid) => out.extend_from_slice(&b[..]),
8255        // v7.17.0 Phase 3.P0-32: TIME dense body — i64 LE
8256        // microseconds since 00:00:00.
8257        (Value::Time(us), DataType::Time) => out.extend_from_slice(&us.to_le_bytes()),
8258        // v7.17.0 Phase 3.P0-33: YEAR dense body — u16 LE.
8259        (Value::Year(y), DataType::Year) => out.extend_from_slice(&y.to_le_bytes()),
8260        // v7.17.0 Phase 3.P0-34: TIMETZ dense body — i64 LE us +
8261        // i32 LE offset_secs.
8262        (Value::TimeTz { us, offset_secs }, DataType::TimeTz) => {
8263            out.extend_from_slice(&us.to_le_bytes());
8264            out.extend_from_slice(&offset_secs.to_le_bytes());
8265        }
8266        // v7.17.0 Phase 3.P0-35: MONEY dense body — i64 LE cents.
8267        (Value::Money(c), DataType::Money) => out.extend_from_slice(&c.to_le_bytes()),
8268        // v7.17.0 Phase 3.P0-38: range dense body — see
8269        // value_body_encoded_len for layout. `kind` is implicit
8270        // from the column DataType.
8271        (
8272            Value::Range {
8273                lower,
8274                upper,
8275                lower_inc,
8276                upper_inc,
8277                empty,
8278                ..
8279            },
8280            DataType::Range(_),
8281        ) => {
8282            let mut flags: u8 = 0;
8283            if *empty {
8284                flags |= 0b0000_0001;
8285            }
8286            if lower.is_some() {
8287                flags |= 0b0000_0010;
8288            }
8289            if upper.is_some() {
8290                flags |= 0b0000_0100;
8291            }
8292            if *lower_inc {
8293                flags |= 0b0000_1000;
8294            }
8295            if *upper_inc {
8296                flags |= 0b0001_0000;
8297            }
8298            out.push(flags);
8299            if let Some(l) = lower {
8300                write_value(out, l);
8301            }
8302            if let Some(u) = upper {
8303                write_value(out, u);
8304            }
8305        }
8306        // v7.17.0 Phase 3.P0-39: hstore dense body — same shape
8307        // as write_value_body for hstore (no leading tag — that
8308        // lives on the data type).
8309        (Value::Hstore(pairs), DataType::Hstore) => write_hstore_body(out, pairs),
8310        // v7.17.0 Phase 3.P0-40: 2D array dense body.
8311        (Value::IntArray2D(rows), DataType::IntArray2D) => write_int_2d_body(out, rows),
8312        (Value::BigIntArray2D(rows), DataType::BigIntArray2D) => write_bigint_2d_body(out, rows),
8313        (Value::TextArray2D(rows), DataType::TextArray2D) => write_text_2d_body(out, rows),
8314        // Type mismatch shouldn't happen — `Table::insert` validates
8315        // value type against column type before pushing. Treat as a
8316        // bug, not a runtime error.
8317        (other, ty) => unreachable!(
8318            "schema-driven encode received mismatched value/type pair: \
8319             value tag={:?}, column type={:?}",
8320            other.data_type(),
8321            ty
8322        ),
8323    }
8324}
8325
8326/// v7.17.0 Phase 3.P0-38 — length the schema-agnostic
8327/// `write_value` would emit for `v`. Used by the range codec to
8328/// pre-size cells. We mirror the tag-byte + body shape from
8329/// `write_value` rather than serialising to a temp Vec.
8330fn write_value_encoded_len(v: &Value) -> usize {
8331    match v {
8332        Value::Null => 1,
8333        Value::SmallInt(_) => 1 + 2,
8334        Value::Int(_) | Value::Date(_) => 1 + 4,
8335        Value::BigInt(_)
8336        | Value::Float(_)
8337        | Value::Timestamp(_)
8338        | Value::Time(_)
8339        | Value::Money(_) => 1 + 8,
8340        Value::Bool(_) => 1 + 1,
8341        Value::Year(_) => 1 + 2,
8342        Value::Text(s) | Value::Json(s) => 1 + 4 + s.len(),
8343        Value::Bytes(b) => 1 + 4 + b.len(),
8344        Value::Numeric { .. } => 1 + 16 + 1,
8345        Value::Uuid(_) => 1 + 16,
8346        Value::TimeTz { .. } => 1 + 12,
8347        Value::Hstore(pairs) => {
8348            let mut n = 1 + 4;
8349            for (k, v) in pairs {
8350                n += 4 + k.len() + 1;
8351                if let Some(val) = v {
8352                    n += 4 + val.len();
8353                }
8354            }
8355            n
8356        }
8357        Value::IntArray2D(rows) => {
8358            let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8359            1 + 8 + rows.len() * cols * (1 + 4)
8360        }
8361        Value::BigIntArray2D(rows) => {
8362            let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8363            1 + 8 + rows.len() * cols * (1 + 8)
8364        }
8365        Value::TextArray2D(rows) => {
8366            let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8367            let mut n = 1 + 8 + rows.len() * cols;
8368            for row in rows {
8369                for s in row.iter().flatten() {
8370                    n += 4 + s.len();
8371                }
8372            }
8373            n
8374        }
8375        // Range-of-range and other nested cases — not currently
8376        // representable but defensively measured via the dense
8377        // body when the data_type is known.
8378        other => {
8379            let ty = other.data_type().unwrap_or(DataType::Int);
8380            1 + value_body_encoded_len(other, ty)
8381        }
8382    }
8383}
8384
/// Schema-agnostic value encoder: appends a one-byte type tag followed
/// by the variant's body to `out`. Nested range bounds recurse through
/// this same function.
///
/// Tags emitted here: 0 Null, 1 Int, 2 BigInt, 3 Float, 4 Text/Json,
/// 5 Bool, 6 Vector, 7 SmallInt, 8 Numeric, 9 Date, 10 Timestamp,
/// 11 Sq8Vector, 12 HalfVector, 14 Bytes, 15 TextArray, 16 IntArray,
/// 17 BigIntArray, 18 TsVector, 19 TsQuery, 20 Uuid, 21 Time, 22 Year,
/// 23 TimeTz, 24 Money, 25 Range, 26 Hstore, 27/28/29 2D arrays.
/// Tag 13 is not emitted by this function.
///
/// # Panics
///
/// On `Value::Interval` (no on-disk encoding), and when a vector
/// dimension or array element count does not fit its length field
/// (see the `expect` messages below).
fn write_value(out: &mut Vec<u8>, v: &Value) {
    match v {
        Value::Null => out.push(0),
        Value::SmallInt(n) => {
            out.push(7);
            out.extend_from_slice(&n.to_le_bytes());
        }
        Value::Int(n) => {
            out.push(1);
            out.extend_from_slice(&n.to_le_bytes());
        }
        Value::BigInt(n) => {
            out.push(2);
            out.extend_from_slice(&n.to_le_bytes());
        }
        Value::Float(x) => {
            out.push(3);
            out.extend_from_slice(&x.to_le_bytes());
        }
        // v4.9: JSON shares the tag-4 (Text) on-disk encoding —
        // schema decides which variant comes back on read. The
        // bodies are byte-identical so collapsing the match keeps
        // clippy::match_same_arms quiet.
        Value::Text(s) | Value::Json(s) => {
            out.push(4);
            write_str(out, s);
        }
        Value::Bool(b) => {
            out.push(5);
            out.push(u8::from(*b));
        }
        // Tag 6: `[u32 dim][element LE bytes × dim]`.
        Value::Vector(v) => {
            out.push(6);
            let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
            out.extend_from_slice(&dim.to_le_bytes());
            for x in v {
                out.extend_from_slice(&x.to_le_bytes());
            }
        }
        // v6.0.1: new tag 11 for an SQ8 cell carried with its full
        // header: `[u32 dim][min LE][max LE][u8 × dim]`. Layout matches
        // the dense row body shape so a round-trip through
        // write_value → read_value bit-equals the original
        // `Value::Sq8Vector`.
        Value::Sq8Vector(q) => {
            out.push(11);
            let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
            out.extend_from_slice(&dim.to_le_bytes());
            out.extend_from_slice(&q.min.to_le_bytes());
            out.extend_from_slice(&q.max.to_le_bytes());
            out.extend_from_slice(&q.bytes);
        }
        // v6.0.3: tag 12 for a HalfVector cell.
        // Layout: `[u32 dim][u16 LE × dim]` — bit-identical to the
        // dense row body so `write_value` / `read_value` bit-equal
        // the original `Value::HalfVector`.
        Value::HalfVector(h) => {
            out.push(12);
            let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
            out.extend_from_slice(&dim.to_le_bytes());
            out.extend_from_slice(&h.bytes);
        }
        // Tag 8: `[scaled LE bytes][u8 scale]`.
        Value::Numeric { scaled, scale } => {
            out.push(8);
            out.extend_from_slice(&scaled.to_le_bytes());
            out.push(*scale);
        }
        Value::Date(d) => {
            out.push(9);
            out.extend_from_slice(&d.to_le_bytes());
        }
        Value::Timestamp(t) => {
            out.push(10);
            out.extend_from_slice(&t.to_le_bytes());
        }
        // Interval is a runtime-only value (no on-disk representation in
        // v2.11). CREATE TABLE rejects `DataType::Interval` columns, so a
        // Value::Interval here would mean the engine bypassed that gate.
        Value::Interval { .. } => {
            unreachable!(
                "Value::Interval has no on-disk encoding; engine must reject it before write"
            )
        }
        // v7.10.4: BYTEA — [u8 tag=14][escaped len][bytes]. The length
        // header comes from `write_bytes_escaped`: a plain u16, or the
        // 0xFFFF sentinel followed by a u32 for payloads of 65 535
        // bytes or more. Tag distinct from Text (4) so the
        // schema-agnostic read_value path can disambiguate.
        // NOTE(review): the earlier comment said a lower tag was
        // skipped because the WAL `auto_commit_sql` shape uses it
        // elsewhere — confirm which tag that is (11 and 12 are the
        // quantised vector tags in this function).
        Value::Bytes(b) => {
            out.push(14);
            write_bytes_escaped(out, b);
        }
        // v7.10.9: TEXT[] — [u8 tag=15][u16 count][per elem: u8
        // null flag (1 = NULL, 0 = present) + (if present) escaped
        // length + utf-8 bytes, via `write_bytes_escaped`].
        Value::TextArray(items) => {
            out.push(15);
            let count = u16::try_from(items.len()).expect("TEXT[] ≤ 65k elements");
            out.extend_from_slice(&count.to_le_bytes());
            for item in items {
                match item {
                    None => out.push(1),
                    Some(s) => {
                        out.push(0);
                        write_bytes_escaped(out, s.as_bytes());
                    }
                }
            }
        }
        // v7.11.12: INT[] — tag 16. [u16 count][per elem: u8 null +
        // (if non-null) i32 LE].
        Value::IntArray(items) => {
            out.push(16);
            let count = u16::try_from(items.len()).expect("INT[] ≤ 65k elements");
            out.extend_from_slice(&count.to_le_bytes());
            for item in items {
                match item {
                    None => out.push(1),
                    Some(n) => {
                        out.push(0);
                        out.extend_from_slice(&n.to_le_bytes());
                    }
                }
            }
        }
        // v7.11.12: BIGINT[] — tag 17. [u16 count][per elem: u8 null +
        // (if non-null) i64 LE].
        Value::BigIntArray(items) => {
            out.push(17);
            let count = u16::try_from(items.len()).expect("BIGINT[] ≤ 65k elements");
            out.extend_from_slice(&count.to_le_bytes());
            for item in items {
                match item {
                    None => out.push(1),
                    Some(n) => {
                        out.push(0);
                        out.extend_from_slice(&n.to_le_bytes());
                    }
                }
            }
        }
        // v7.12.0: tsvector — tag 18. Body shape matches
        // `write_tsvector_body`.
        Value::TsVector(lexs) => {
            out.push(18);
            write_tsvector_body(out, lexs);
        }
        // v7.12.0: tsquery — tag 19. Body shape matches
        // `write_tsquery_body`.
        Value::TsQuery(ast) => {
            out.push(19);
            write_tsquery_body(out, ast);
        }
        // v7.17.0: UUID — tag 20. Body = raw 16 bytes (RFC 4122
        // byte order).
        Value::Uuid(b) => {
            out.push(20);
            out.extend_from_slice(&b[..]);
        }
        // v7.17.0 Phase 3.P0-32: TIME — tag 21. Body = i64 LE
        // microseconds since 00:00:00.
        Value::Time(us) => {
            out.push(21);
            out.extend_from_slice(&us.to_le_bytes());
        }
        // v7.17.0 Phase 3.P0-33: YEAR — tag 22. Body = u16 LE.
        Value::Year(y) => {
            out.push(22);
            out.extend_from_slice(&y.to_le_bytes());
        }
        // v7.17.0 Phase 3.P0-34: TIMETZ — tag 23. Body = i64 LE
        // us + i32 LE offset_secs.
        Value::TimeTz { us, offset_secs } => {
            out.push(23);
            out.extend_from_slice(&us.to_le_bytes());
            out.extend_from_slice(&offset_secs.to_le_bytes());
        }
        // v7.17.0 Phase 3.P0-35: MONEY — tag 24. Body = i64 LE cents.
        Value::Money(c) => {
            out.push(24);
            out.extend_from_slice(&c.to_le_bytes());
        }
        // v7.17.0 Phase 3.P0-38: range — tag 25. Body =
        // [u8 RangeKind tag][u8 flags][if lower: write_value(lower)]
        // [if upper: write_value(upper)].
        // Flag bits: 0x01 empty, 0x02 lower bound present, 0x04 upper
        // bound present, 0x08 lower inclusive, 0x10 upper inclusive.
        Value::Range {
            kind,
            lower,
            upper,
            lower_inc,
            upper_inc,
            empty,
        } => {
            out.push(25);
            out.push(kind.tag());
            let mut flags: u8 = 0;
            if *empty {
                flags |= 0b0000_0001;
            }
            if lower.is_some() {
                flags |= 0b0000_0010;
            }
            if upper.is_some() {
                flags |= 0b0000_0100;
            }
            if *lower_inc {
                flags |= 0b0000_1000;
            }
            if *upper_inc {
                flags |= 0b0001_0000;
            }
            out.push(flags);
            if let Some(l) = lower {
                write_value(out, l);
            }
            if let Some(u) = upper {
                write_value(out, u);
            }
        }
        // v7.17.0 Phase 3.P0-39: hstore — tag 26. Body =
        // [u32 count] then per pair `[u32 klen][k bytes][u8 has_val]
        // [if has_val: u32 vlen][v bytes]`.
        Value::Hstore(pairs) => {
            out.push(26);
            write_hstore_body(out, pairs);
        }
        // v7.17.0 Phase 3.P0-40: 2D arrays — tag 27/28/29.
        Value::IntArray2D(rows) => {
            out.push(27);
            write_int_2d_body(out, rows);
        }
        Value::BigIntArray2D(rows) => {
            out.push(28);
            write_bigint_2d_body(out, rows);
        }
        Value::TextArray2D(rows) => {
            out.push(29);
            write_text_2d_body(out, rows);
        }
    }
}
8623
/// v7.17.0 Phase 3.P0-40 — shared 2D INT writer.
///
/// Emits `[u32 LE row count][u32 LE column count]` (the column count
/// is taken from the first row, 0 when there are no rows), then every
/// cell in row-major order: a flag byte (`1` = NULL, `0` = present)
/// followed by the i32 LE payload for present cells only.
fn write_int_2d_body(out: &mut Vec<u8>, rows: &[Vec<Option<i32>>]) {
    let row_count = u32::try_from(rows.len()).expect("≤ 4G rows");
    let col_count = u32::try_from(rows.first().map_or(0, Vec::len)).expect("≤ 4G cols");
    out.extend_from_slice(&row_count.to_le_bytes());
    out.extend_from_slice(&col_count.to_le_bytes());
    for cell in rows.iter().flatten() {
        if let Some(n) = cell {
            out.push(0);
            out.extend_from_slice(&n.to_le_bytes());
        } else {
            out.push(1);
        }
    }
}
8642
/// v7.17.0 Phase 3.P0-40 — shared 2D BIGINT writer.
///
/// Emits `[u32 LE row count][u32 LE column count]` (the column count
/// is taken from the first row, 0 when there are no rows), then every
/// cell in row-major order: a flag byte (`1` = NULL, `0` = present)
/// followed by the i64 LE payload for present cells only.
fn write_bigint_2d_body(out: &mut Vec<u8>, rows: &[Vec<Option<i64>>]) {
    let row_count = u32::try_from(rows.len()).expect("≤ 4G rows");
    let col_count = u32::try_from(rows.first().map_or(0, Vec::len)).expect("≤ 4G cols");
    out.extend_from_slice(&row_count.to_le_bytes());
    out.extend_from_slice(&col_count.to_le_bytes());
    for cell in rows.iter().flatten() {
        if let Some(n) = cell {
            out.push(0);
            out.extend_from_slice(&n.to_le_bytes());
        } else {
            out.push(1);
        }
    }
}
8661
/// v7.17.0 Phase 3.P0-40 — shared 2D TEXT writer.
///
/// Emits `[u32 LE row count][u32 LE column count]` (the column count
/// is taken from the first row, 0 when there are no rows), then every
/// cell in row-major order as
/// `[u8 null_flag][if non-null: u32 len][utf-8 bytes]`, where a flag
/// of `1` marks NULL and `0` marks a present cell.
fn write_text_2d_body(out: &mut Vec<u8>, rows: &[Vec<Option<String>>]) {
    let row_count = u32::try_from(rows.len()).expect("≤ 4G rows");
    let col_count = u32::try_from(rows.first().map_or(0, Vec::len)).expect("≤ 4G cols");
    out.extend_from_slice(&row_count.to_le_bytes());
    out.extend_from_slice(&col_count.to_le_bytes());
    for cell in rows.iter().flatten() {
        if let Some(text) = cell {
            out.push(0);
            let byte_len = u32::try_from(text.len()).expect("≤ 4 GiB cell");
            out.extend_from_slice(&byte_len.to_le_bytes());
            out.extend_from_slice(text.as_bytes());
        } else {
            out.push(1);
        }
    }
}
8683
/// v7.17.0 Phase 3.P0-39 — shared hstore body writer.
///
/// Emits `[u32 LE pair count]`, then per pair
/// `[u32 LE key len][key bytes][u8 has_val]` followed by
/// `[u32 LE value len][value bytes]` only when `has_val` is 1.
/// Note the flag polarity: here `0` means "no value".
fn write_hstore_body(out: &mut Vec<u8>, pairs: &[(String, Option<String>)]) {
    let pair_count = u32::try_from(pairs.len()).expect("hstore ≤ u32::MAX pairs");
    out.extend_from_slice(&pair_count.to_le_bytes());
    for (key, value) in pairs {
        let key_len = u32::try_from(key.len()).expect("hstore key ≤ 4 GiB");
        out.extend_from_slice(&key_len.to_le_bytes());
        out.extend_from_slice(key.as_bytes());
        if let Some(text) = value {
            out.push(1);
            let text_len = u32::try_from(text.len()).expect("hstore val ≤ 4 GiB");
            out.extend_from_slice(&text_len.to_le_bytes());
            out.extend_from_slice(text.as_bytes());
        } else {
            out.push(0);
        }
    }
}
8703
8704/// v7.12.0: shared tsvector body writer (used by both dense and
8705/// schema-agnostic codecs).
8706fn write_tsvector_body(out: &mut Vec<u8>, lexs: &[TsLexeme]) {
8707    let count = u16::try_from(lexs.len()).expect("tsvector ≤ 65k lexemes");
8708    out.extend_from_slice(&count.to_le_bytes());
8709    for l in lexs {
8710        // v7.27 — escaped length (codec sweep, round-21).
8711        write_bytes_escaped(out, l.word.as_bytes());
8712        let plen = u16::try_from(l.positions.len()).expect("tsvector pos count ≤ 65k");
8713        out.extend_from_slice(&plen.to_le_bytes());
8714        for p in &l.positions {
8715            out.extend_from_slice(&p.to_le_bytes());
8716        }
8717        out.push(l.weight);
8718    }
8719}
8720
8721/// v7.12.0: shared tsquery body writer. Prefix-coded tree: each
8722/// node starts with `[u8 tag]` then a tag-specific payload. Tags:
8723/// 0=Term, 1=And, 2=Or, 3=Not, 4=Phrase.
8724fn write_tsquery_body(out: &mut Vec<u8>, ast: &TsQueryAst) {
8725    match ast {
8726        TsQueryAst::Term { word, weight_mask } => {
8727            out.push(0);
8728            // v7.27 — escaped length (codec sweep, round-21).
8729            write_bytes_escaped(out, word.as_bytes());
8730            out.push(*weight_mask);
8731        }
8732        TsQueryAst::And(a, b) => {
8733            out.push(1);
8734            write_tsquery_body(out, a);
8735            write_tsquery_body(out, b);
8736        }
8737        TsQueryAst::Or(a, b) => {
8738            out.push(2);
8739            write_tsquery_body(out, a);
8740            write_tsquery_body(out, b);
8741        }
8742        TsQueryAst::Not(x) => {
8743            out.push(3);
8744            write_tsquery_body(out, x);
8745        }
8746        TsQueryAst::Phrase {
8747            left,
8748            right,
8749            distance,
8750        } => {
8751            out.push(4);
8752            out.extend_from_slice(&distance.to_le_bytes());
8753            write_tsquery_body(out, left);
8754            write_tsquery_body(out, right);
8755        }
8756    }
8757}
8758
8759/// v7.12.0: byte length that `write_tsquery_body` would emit.
8760fn tsquery_encoded_len(ast: &TsQueryAst) -> usize {
8761    match ast {
8762        TsQueryAst::Term { word, .. } => 1 + 2 + word.len() + 1,
8763        TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
8764            1 + tsquery_encoded_len(a) + tsquery_encoded_len(b)
8765        }
8766        TsQueryAst::Not(x) => 1 + tsquery_encoded_len(x),
8767        TsQueryAst::Phrase { left, right, .. } => {
8768            1 + 2 + tsquery_encoded_len(left) + tsquery_encoded_len(right)
8769        }
8770    }
8771}
8772
/// Append `n` to `out` as two little-endian bytes.
fn write_u16(out: &mut Vec<u8>, n: u16) {
    out.extend(n.to_le_bytes());
}
/// Append `n` to `out` as four little-endian bytes.
fn write_u32(out: &mut Vec<u8>, n: u32) {
    out.extend(n.to_le_bytes());
}
/// v7.23 (mailrs round-14) — sentinel for the escape form of the
/// short-string codec: a u16 length field of `0xFFFF` means "the REAL
/// length follows as a u32".
///
/// Encoding: payloads of length `>= 0xFFFF` take the escape form
/// (including exactly 65 535, so the sentinel is unambiguous within
/// v46+ payloads); shorter payloads keep the plain 2-byte header —
/// zero overhead for identifiers and typical text. Encoding always
/// emits the v46 form because every new container carries the new
/// version mark.
///
/// Decoding: pre-v46 catalogs (and pre-V3 segments) may legitimately
/// contain a plain length of 0xFFFF, so DECODING is gated on the
/// container version (`Cursor::codec_version`).
const STR_LEN_ESCAPE: u16 = u16::MAX;
8790
8791/// v7.27 (round-21) — escaped length for RAW BYTE payloads (BYTEA
8792/// cells, TEXT[] elements when paired with their own validity
8793/// rules): same sentinel scheme as [`write_str`], decoding gated on
8794/// codec_version >= 47.
8795fn write_bytes_escaped(out: &mut Vec<u8>, b: &[u8]) {
8796    if b.len() >= STR_LEN_ESCAPE as usize {
8797        let len = u32::try_from(b.len()).expect("cell fits in u32 (4 GiB cap)");
8798        write_u16(out, STR_LEN_ESCAPE);
8799        write_u32(out, len);
8800    } else {
8801        write_u16(out, b.len() as u16);
8802    }
8803    out.extend_from_slice(b);
8804}
8805
8806fn write_str(out: &mut Vec<u8>, s: &str) {
8807    if s.len() >= STR_LEN_ESCAPE as usize {
8808        // Real mail bodies / document text routinely exceed 64 KiB
8809        // (mailrs round-14: the old `fits in u16` expect PANICKED —
8810        // after the INSERT was acknowledged — at the next snapshot
8811        // encode).
8812        let len = u32::try_from(s.len()).expect("text fits in u32 (4 GiB cap)");
8813        write_u16(out, STR_LEN_ESCAPE);
8814        write_u32(out, len);
8815    } else {
8816        write_u16(out, s.len() as u16);
8817    }
8818    out.extend_from_slice(s.as_bytes());
8819}
8820
8821/// v7.12.4 — long-string variant: `[u32 LE len][bytes]`. For
8822/// payloads that can plausibly exceed 64 KiB (notably PL/pgSQL
8823/// function bodies). Identifiers + short text continue to use
8824/// the u16 [`write_str`] codec.
8825fn write_str_long(out: &mut Vec<u8>, s: &str) {
8826    let len = u32::try_from(s.len()).expect("function body fits in u32");
8827    write_u32(out, len);
8828    out.extend_from_slice(s.as_bytes());
8829}
8830
8831/// Serialise an [`IndexKey`] using the v9 tagged codec. `read_index_key`
8832/// is the inverse. v8 catalogs never wrote index keys (`BTree` entries were
8833/// rebuilt from `Table::rows`), so this codec is v9+ only.
8834fn write_index_key(out: &mut Vec<u8>, key: &IndexKey) {
8835    match key {
8836        IndexKey::Int(n) => {
8837            out.push(INDEX_KEY_TAG_INT);
8838            out.extend_from_slice(&n.to_le_bytes());
8839        }
8840        IndexKey::Text(s) => {
8841            out.push(INDEX_KEY_TAG_TEXT);
8842            write_str(out, s);
8843        }
8844        IndexKey::Bool(b) => {
8845            out.push(INDEX_KEY_TAG_BOOL);
8846            out.push(u8::from(*b));
8847        }
8848        IndexKey::Uuid(b) => {
8849            out.push(INDEX_KEY_TAG_UUID);
8850            out.extend_from_slice(&b[..]);
8851        }
8852    }
8853}
8854
/// Forward-only reader over an encoded byte buffer. `take` hands out
/// sub-slices of `buf` (bounds-checked) and advances `pos`; the typed
/// `read_*` helpers are built on top of it.
struct Cursor<'a> {
    /// The encoded bytes being decoded.
    buf: &'a [u8],
    /// Offset into `buf` of the next unread byte.
    pos: usize,
    /// v7.23/v7.27 — the container's codec version (catalog
    /// FILE_VERSION, or the segment magic mapped onto it). Gates
    /// length-escape decoding: >= 46 strings escape via
    /// [`STR_LEN_ESCAPE`], >= 47 BYTEA / TEXT[] elements / ts
    /// lexemes escape too. 0 = legacy (plain u16 everywhere —
    /// 0xFFFF is a legitimate length there).
    codec_version: u8,
}
8866
8867impl<'a> Cursor<'a> {
8868    const fn new(buf: &'a [u8]) -> Self {
8869        Self {
8870            buf,
8871            pos: 0,
8872            codec_version: 0,
8873        }
8874    }
8875
8876    /// v7.23/v7.27 — builder for version-gated escape decoding.
8877    const fn with_codec_version(mut self, v: u8) -> Self {
8878        self.codec_version = v;
8879        self
8880    }
8881
8882    fn take(&mut self, n: usize) -> Result<&'a [u8], StorageError> {
8883        let end = self
8884            .pos
8885            .checked_add(n)
8886            .ok_or_else(|| StorageError::Corrupt(format!("length overflow taking {n} bytes")))?;
8887        if end > self.buf.len() {
8888            return Err(StorageError::Corrupt(format!(
8889                "unexpected EOF at offset {} (wanted {n} more bytes)",
8890                self.pos
8891            )));
8892        }
8893        let s = &self.buf[self.pos..end];
8894        self.pos = end;
8895        Ok(s)
8896    }
8897
8898    fn read_u8(&mut self) -> Result<u8, StorageError> {
8899        Ok(self.take(1)?[0])
8900    }
8901    fn read_u16(&mut self) -> Result<u16, StorageError> {
8902        let s = self.take(2)?;
8903        Ok(u16::from_le_bytes([s[0], s[1]]))
8904    }
8905    fn read_u32(&mut self) -> Result<u32, StorageError> {
8906        let s = self.take(4)?;
8907        Ok(u32::from_le_bytes([s[0], s[1], s[2], s[3]]))
8908    }
8909    fn read_i32(&mut self) -> Result<i32, StorageError> {
8910        let s = self.take(4)?;
8911        Ok(i32::from_le_bytes([s[0], s[1], s[2], s[3]]))
8912    }
8913    /// v6.7.2 — u64 LE read for the per-table `hot_tier_bytes`
8914    /// catalog appendix.
8915    fn read_u64(&mut self) -> Result<u64, StorageError> {
8916        let s = self.take(8)?;
8917        Ok(u64::from_le_bytes([
8918            s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
8919        ]))
8920    }
8921    fn read_i64(&mut self) -> Result<i64, StorageError> {
8922        let s = self.take(8)?;
8923        let arr: [u8; 8] = s.try_into().expect("checked");
8924        Ok(i64::from_le_bytes(arr))
8925    }
8926    fn read_f64(&mut self) -> Result<f64, StorageError> {
8927        let s = self.take(8)?;
8928        let arr: [u8; 8] = s.try_into().expect("checked");
8929        Ok(f64::from_le_bytes(arr))
8930    }
8931    fn read_f32(&mut self) -> Result<f32, StorageError> {
8932        let s = self.take(4)?;
8933        Ok(f32::from_le_bytes([s[0], s[1], s[2], s[3]]))
8934    }
8935    /// v7.27 — length field with the >=47 escape (BYTEA cells,
8936    /// TEXT[] elements, ts lexemes/terms).
8937    fn read_len_escaped_v47(&mut self) -> Result<usize, StorageError> {
8938        let short = self.read_u16()?;
8939        if self.codec_version >= 47 && short == STR_LEN_ESCAPE {
8940            Ok(self.read_u32()? as usize)
8941        } else {
8942            Ok(short as usize)
8943        }
8944    }
8945
8946    /// v7.27 — string whose length uses the >=47 escape (TEXT[]
8947    /// elements, ts lexemes/terms — payloads that were plain u16
8948    /// through v46).
8949    fn read_str_escaped_v47(&mut self) -> Result<String, StorageError> {
8950        let len = self.read_len_escaped_v47()?;
8951        let bytes = self.take(len)?;
8952        core::str::from_utf8(bytes)
8953            .map(String::from)
8954            .map_err(|_| StorageError::Corrupt("invalid UTF-8 in cell payload".into()))
8955    }
8956
8957    fn read_str(&mut self) -> Result<String, StorageError> {
8958        let short = self.read_u16()?;
8959        let len = if self.codec_version >= 46 && short == STR_LEN_ESCAPE {
8960            // v7.23 escape form — real length follows as u32.
8961            self.read_u32()? as usize
8962        } else {
8963            short as usize
8964        };
8965        let bytes = self.take(len)?;
8966        core::str::from_utf8(bytes)
8967            .map(String::from)
8968            .map_err(|_| StorageError::Corrupt("invalid UTF-8 in identifier or text".into()))
8969    }
8970
8971    /// v7.12.4 — long-string variant for payloads written via
8972    /// [`write_str_long`] (u32-length prefix). Used for PL/pgSQL
8973    /// function bodies which can plausibly exceed 64 KiB.
8974    fn read_str_long(&mut self) -> Result<String, StorageError> {
8975        let len = self.read_u32()? as usize;
8976        let bytes = self.take(len)?;
8977        core::str::from_utf8(bytes)
8978            .map(String::from)
8979            .map_err(|_| StorageError::Corrupt("invalid UTF-8 in long-string payload".into()))
8980    }
8981
8982    /// Parse an [`IndexKey`] emitted by `write_index_key` (v9 tagged
8983    /// codec). Returns `StorageError::Corrupt` on unknown tag or
8984    /// truncated payload.
8985    fn read_index_key(&mut self) -> Result<IndexKey, StorageError> {
8986        let tag = self.read_u8()?;
8987        match tag {
8988            INDEX_KEY_TAG_INT => Ok(IndexKey::Int(self.read_i64()?)),
8989            INDEX_KEY_TAG_TEXT => Ok(IndexKey::Text(self.read_str()?)),
8990            INDEX_KEY_TAG_BOOL => Ok(IndexKey::Bool(self.read_u8()? != 0)),
8991            INDEX_KEY_TAG_UUID => {
8992                let s = self.take(16)?;
8993                let mut b = [0u8; 16];
8994                b.copy_from_slice(s);
8995                Ok(IndexKey::Uuid(b))
8996            }
8997            other => Err(StorageError::Corrupt(format!(
8998                "unknown index key tag: {other}"
8999            ))),
9000        }
9001    }
9002    /// Schema-driven dense value decode (`FILE_VERSION` 8). Caller has
9003    /// already cleared the NULL bit from the row bitmap; we read the
9004    /// fixed-width body for the given column type. Used inside the row
9005    /// hot loop; column defaults still go through `read_value` (which
9006    /// reads its own type tag) so DEFAULT round-trips without a schema.
9007    fn read_value_body(&mut self, ty: DataType) -> Result<Value, StorageError> {
9008        match ty {
9009            DataType::SmallInt => {
9010                let s = self.take(2)?;
9011                Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
9012            }
9013            DataType::Int => Ok(Value::Int(self.read_i32()?)),
9014            DataType::BigInt => Ok(Value::BigInt(self.read_i64()?)),
9015            DataType::Float => Ok(Value::Float(self.read_f64()?)),
9016            DataType::Bool => Ok(Value::Bool(self.read_u8()? != 0)),
9017            DataType::Text | DataType::Varchar(_) | DataType::Char(_) => {
9018                Ok(Value::Text(self.read_str()?))
9019            }
9020            DataType::Vector {
9021                encoding: VecEncoding::F32,
9022                ..
9023            } => {
9024                let dim = self.read_u32()? as usize;
9025                let mut v = Vec::with_capacity(dim);
9026                for _ in 0..dim {
9027                    let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
9028                    v.push(f32::from_le_bytes(bytes));
9029                }
9030                Ok(Value::Vector(v))
9031            }
9032            DataType::Vector {
9033                encoding: VecEncoding::Sq8,
9034                ..
9035            } => {
9036                let dim = self.read_u32()? as usize;
9037                let min = self.read_f32()?;
9038                let max = self.read_f32()?;
9039                let bytes = self.take(dim)?.to_vec();
9040                Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
9041            }
9042            DataType::Vector {
9043                encoding: VecEncoding::F16,
9044                ..
9045            } => {
9046                let dim = self.read_u32()? as usize;
9047                let bytes = self.take(dim * 2)?.to_vec();
9048                Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
9049            }
9050            DataType::Numeric { .. } => {
9051                let s = self.take(16)?;
9052                let arr: [u8; 16] = s.try_into().expect("checked");
9053                let scaled = i128::from_le_bytes(arr);
9054                let scale = self.read_u8()?;
9055                Ok(Value::Numeric { scaled, scale })
9056            }
9057            DataType::Date => Ok(Value::Date(self.read_i32()?)),
9058            DataType::Timestamp => Ok(Value::Timestamp(self.read_i64()?)),
9059            DataType::Timestamptz => Ok(Value::Timestamp(self.read_i64()?)),
9060            DataType::Jsonb => Ok(Value::Json(self.read_str()?)),
9061            DataType::Interval => {
9062                // Defensive — schema gate (CREATE TABLE rejects Interval
9063                // columns) means this branch can't be hit through normal
9064                // flow; reject corrupt files explicitly rather than
9065                // panic.
9066                Err(StorageError::Corrupt(
9067                    "INTERVAL column found on disk — runtime-only type, v3.0.2 rejects it".into(),
9068                ))
9069            }
9070            DataType::Json => Ok(Value::Json(self.read_str()?)),
9071            // v7.10.4: BYTEA on-disk is [u16 len][bytes]. Same wire
9072            // shape as Text, but read as raw Vec<u8>.
9073            DataType::Bytes => {
9074                // v7.27 (round-21) — escaped length at >= 47.
9075                let len = self.read_len_escaped_v47()?;
9076                let bytes = self.take(len)?.to_vec();
9077                Ok(Value::Bytes(bytes))
9078            }
9079            // v7.10.9: TEXT[] dense body.
9080            DataType::TextArray => {
9081                let count = self.read_u16()? as usize;
9082                let mut items: Vec<Option<String>> = Vec::with_capacity(count);
9083                for _ in 0..count {
9084                    match self.read_u8()? {
9085                        0 => items.push(Some(self.read_str_escaped_v47()?)),
9086                        1 => items.push(None),
9087                        other => {
9088                            return Err(StorageError::Corrupt(format!(
9089                                "TEXT[] null flag: unknown byte {other}"
9090                            )));
9091                        }
9092                    }
9093                }
9094                Ok(Value::TextArray(items))
9095            }
9096            // v7.11.12: INT[] dense body.
9097            DataType::IntArray => {
9098                let count = self.read_u16()? as usize;
9099                let mut items: Vec<Option<i32>> = Vec::with_capacity(count);
9100                for _ in 0..count {
9101                    match self.read_u8()? {
9102                        0 => items.push(Some(self.read_i32()?)),
9103                        1 => items.push(None),
9104                        other => {
9105                            return Err(StorageError::Corrupt(format!(
9106                                "INT[] null flag: unknown byte {other}"
9107                            )));
9108                        }
9109                    }
9110                }
9111                Ok(Value::IntArray(items))
9112            }
9113            // v7.11.12: BIGINT[] dense body.
9114            DataType::BigIntArray => {
9115                let count = self.read_u16()? as usize;
9116                let mut items: Vec<Option<i64>> = Vec::with_capacity(count);
9117                for _ in 0..count {
9118                    match self.read_u8()? {
9119                        0 => items.push(Some(self.read_i64()?)),
9120                        1 => items.push(None),
9121                        other => {
9122                            return Err(StorageError::Corrupt(format!(
9123                                "BIGINT[] null flag: unknown byte {other}"
9124                            )));
9125                        }
9126                    }
9127                }
9128                Ok(Value::BigIntArray(items))
9129            }
9130            // v7.12.0: tsvector dense body — [u16 lex_count]
9131            // [per lex: u16 word_len + utf-8 word + u16 pos_count
9132            // + (u16 LE * pos_count) + u8 weight].
9133            DataType::TsVector => Ok(Value::TsVector(self.read_tsvector_body()?)),
9134            DataType::TsQuery => Ok(Value::TsQuery(self.read_tsquery_body()?)),
9135            // v7.17.0: UUID dense body — raw 16 bytes.
9136            DataType::Uuid => {
9137                let s = self.take(16)?;
9138                let mut b = [0u8; 16];
9139                b.copy_from_slice(s);
9140                Ok(Value::Uuid(b))
9141            }
9142            // v7.17.0 Phase 3.P0-32: TIME dense body — i64 LE.
9143            DataType::Time => Ok(Value::Time(self.read_i64()?)),
9144            // v7.17.0 Phase 3.P0-33: YEAR dense body — u16 LE.
9145            DataType::Year => Ok(Value::Year(self.read_u16()?)),
9146            // v7.17.0 Phase 3.P0-34: TIMETZ dense body —
9147            // i64 LE us + i32 LE offset_secs.
9148            DataType::TimeTz => {
9149                let us = self.read_i64()?;
9150                let offset_secs = self.read_i32()?;
9151                Ok(Value::TimeTz { us, offset_secs })
9152            }
9153            // v7.17.0 Phase 3.P0-35: MONEY dense body — i64 LE cents.
9154            DataType::Money => Ok(Value::Money(self.read_i64()?)),
9155            // v7.17.0 Phase 3.P0-39: hstore dense body. Body
9156            // shape == read_hstore_body.
9157            DataType::Hstore => Ok(Value::Hstore(self.read_hstore_body()?)),
9158            // v7.17.0 Phase 3.P0-40: 2D arrays dense body.
9159            DataType::IntArray2D => Ok(Value::IntArray2D(self.read_int_2d_body()?)),
9160            DataType::BigIntArray2D => Ok(Value::BigIntArray2D(self.read_bigint_2d_body()?)),
9161            DataType::TextArray2D => Ok(Value::TextArray2D(self.read_text_2d_body()?)),
9162            // v7.17.0 Phase 3.P0-38: range dense body. Element
9163            // type is determined by the surrounding RangeKind.
9164            DataType::Range(kind) => {
9165                let flags = self.read_u8()?;
9166                let empty = flags & 0b0000_0001 != 0;
9167                let has_lower = flags & 0b0000_0010 != 0;
9168                let has_upper = flags & 0b0000_0100 != 0;
9169                let lower_inc = flags & 0b0000_1000 != 0;
9170                let upper_inc = flags & 0b0001_0000 != 0;
9171                let lower = if has_lower {
9172                    Some(alloc::boxed::Box::new(self.read_value()?))
9173                } else {
9174                    None
9175                };
9176                let upper = if has_upper {
9177                    Some(alloc::boxed::Box::new(self.read_value()?))
9178                } else {
9179                    None
9180                };
9181                Ok(Value::Range {
9182                    kind,
9183                    lower,
9184                    upper,
9185                    lower_inc,
9186                    upper_inc,
9187                    empty,
9188                })
9189            }
9190        }
9191    }
9192
9193    /// v7.17.0 Phase 3.P0-40 — read a 2D INT array body emitted
9194    /// by `write_int_2d_body`.
9195    fn read_int_2d_body(&mut self) -> Result<Vec<Vec<Option<i32>>>, StorageError> {
9196        let nrows = self.read_u32()? as usize;
9197        let ncols = self.read_u32()? as usize;
9198        let mut rows = Vec::with_capacity(nrows);
9199        for _ in 0..nrows {
9200            let mut row = Vec::with_capacity(ncols);
9201            for _ in 0..ncols {
9202                let null = self.read_u8()?;
9203                row.push(if null == 1 {
9204                    None
9205                } else {
9206                    Some(self.read_i32()?)
9207                });
9208            }
9209            rows.push(row);
9210        }
9211        Ok(rows)
9212    }
9213
9214    /// v7.17.0 Phase 3.P0-40 — read a 2D BIGINT array body.
9215    fn read_bigint_2d_body(&mut self) -> Result<Vec<Vec<Option<i64>>>, StorageError> {
9216        let nrows = self.read_u32()? as usize;
9217        let ncols = self.read_u32()? as usize;
9218        let mut rows = Vec::with_capacity(nrows);
9219        for _ in 0..nrows {
9220            let mut row = Vec::with_capacity(ncols);
9221            for _ in 0..ncols {
9222                let null = self.read_u8()?;
9223                row.push(if null == 1 {
9224                    None
9225                } else {
9226                    Some(self.read_i64()?)
9227                });
9228            }
9229            rows.push(row);
9230        }
9231        Ok(rows)
9232    }
9233
9234    /// v7.17.0 Phase 3.P0-40 — read a 2D TEXT array body. Each
9235    /// cell is `[u8 null_flag][if non-null: u32 len + utf-8 bytes]`.
9236    fn read_text_2d_body(&mut self) -> Result<Vec<Vec<Option<String>>>, StorageError> {
9237        let nrows = self.read_u32()? as usize;
9238        let ncols = self.read_u32()? as usize;
9239        let mut rows = Vec::with_capacity(nrows);
9240        for _ in 0..nrows {
9241            let mut row = Vec::with_capacity(ncols);
9242            for _ in 0..ncols {
9243                let null = self.read_u8()?;
9244                if null == 1 {
9245                    row.push(None);
9246                } else {
9247                    let l = self.read_u32()? as usize;
9248                    let bytes = self.take(l)?.to_vec();
9249                    let s = String::from_utf8(bytes).map_err(|_| {
9250                        StorageError::Corrupt("2D TEXT cell is not valid UTF-8".into())
9251                    })?;
9252                    row.push(Some(s));
9253                }
9254            }
9255            rows.push(row);
9256        }
9257        Ok(rows)
9258    }
9259
9260    /// v7.17.0 Phase 3.P0-39 — read a hstore body emitted by
9261    /// `write_hstore_body`.
9262    fn read_hstore_body(&mut self) -> Result<Vec<(String, Option<String>)>, StorageError> {
9263        let count = self.read_u32()? as usize;
9264        let mut out = Vec::with_capacity(count);
9265        for _ in 0..count {
9266            let klen = self.read_u32()? as usize;
9267            let k_bytes = self.take(klen)?.to_vec();
9268            let k = String::from_utf8(k_bytes)
9269                .map_err(|_| StorageError::Corrupt("hstore key is not valid UTF-8".into()))?;
9270            let has_val = self.read_u8()? != 0;
9271            let v =
9272                if has_val {
9273                    let vlen = self.read_u32()? as usize;
9274                    let v_bytes = self.take(vlen)?.to_vec();
9275                    Some(String::from_utf8(v_bytes).map_err(|_| {
9276                        StorageError::Corrupt("hstore value is not valid UTF-8".into())
9277                    })?)
9278                } else {
9279                    None
9280                };
9281            out.push((k, v));
9282        }
9283        Ok(out)
9284    }
9285
9286    /// v7.12.0 — read a tsvector body emitted by `write_tsvector_body`.
9287    fn read_tsvector_body(&mut self) -> Result<Vec<TsLexeme>, StorageError> {
9288        let count = self.read_u16()? as usize;
9289        let mut out = Vec::with_capacity(count);
9290        for _ in 0..count {
9291            let word = self.read_str_escaped_v47()?;
9292            let pos_count = self.read_u16()? as usize;
9293            let mut positions = Vec::with_capacity(pos_count);
9294            for _ in 0..pos_count {
9295                positions.push(self.read_u16()?);
9296            }
9297            let weight = self.read_u8()?;
9298            out.push(TsLexeme {
9299                word,
9300                positions,
9301                weight,
9302            });
9303        }
9304        Ok(out)
9305    }
9306
9307    /// v7.12.0 — read a tsquery body emitted by `write_tsquery_body`.
9308    fn read_tsquery_body(&mut self) -> Result<TsQueryAst, StorageError> {
9309        let tag = self.read_u8()?;
9310        match tag {
9311            0 => {
9312                let word = self.read_str_escaped_v47()?;
9313                let weight_mask = self.read_u8()?;
9314                Ok(TsQueryAst::Term { word, weight_mask })
9315            }
9316            1 => {
9317                let a = self.read_tsquery_body()?;
9318                let b = self.read_tsquery_body()?;
9319                Ok(TsQueryAst::And(Box::new(a), Box::new(b)))
9320            }
9321            2 => {
9322                let a = self.read_tsquery_body()?;
9323                let b = self.read_tsquery_body()?;
9324                Ok(TsQueryAst::Or(Box::new(a), Box::new(b)))
9325            }
9326            3 => {
9327                let x = self.read_tsquery_body()?;
9328                Ok(TsQueryAst::Not(Box::new(x)))
9329            }
9330            4 => {
9331                let distance = self.read_u16()?;
9332                let left = self.read_tsquery_body()?;
9333                let right = self.read_tsquery_body()?;
9334                Ok(TsQueryAst::Phrase {
9335                    left: Box::new(left),
9336                    right: Box::new(right),
9337                    distance,
9338                })
9339            }
9340            other => Err(StorageError::Corrupt(format!(
9341                "tsquery: unknown node tag {other}"
9342            ))),
9343        }
9344    }
9345
9346    fn read_value(&mut self) -> Result<Value, StorageError> {
9347        let tag = self.read_u8()?;
9348        match tag {
9349            0 => Ok(Value::Null),
9350            1 => Ok(Value::Int(self.read_i32()?)),
9351            2 => Ok(Value::BigInt(self.read_i64()?)),
9352            3 => Ok(Value::Float(self.read_f64()?)),
9353            4 => Ok(Value::Text(self.read_str()?)),
9354            5 => Ok(Value::Bool(self.read_u8()? != 0)),
9355            6 => {
9356                let dim = self.read_u32()? as usize;
9357                let mut v = Vec::with_capacity(dim);
9358                for _ in 0..dim {
9359                    let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
9360                    v.push(f32::from_le_bytes(bytes));
9361                }
9362                Ok(Value::Vector(v))
9363            }
9364            7 => {
9365                let s = self.take(2)?;
9366                Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
9367            }
9368            8 => {
9369                let s = self.take(16)?;
9370                let arr: [u8; 16] = s.try_into().expect("checked");
9371                let scaled = i128::from_le_bytes(arr);
9372                let scale = self.read_u8()?;
9373                Ok(Value::Numeric { scaled, scale })
9374            }
9375            9 => Ok(Value::Date(self.read_i32()?)),
9376            10 => Ok(Value::Timestamp(self.read_i64()?)),
9377            // v6.0.1: tag 11 — Sq8Vector. Pre-v6 readers fall
9378            // through to the catch-all and surface
9379            // `Corrupt("unknown value tag")`, matching the
9380            // forward-compat fence on the column-type side.
9381            11 => {
9382                let dim = self.read_u32()? as usize;
9383                let min = self.read_f32()?;
9384                let max = self.read_f32()?;
9385                let bytes = self.take(dim)?.to_vec();
9386                Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
9387            }
9388            // v6.0.3: tag 12 — HalfVector. Same forward-compat
9389            // fence story as tag 11.
9390            12 => {
9391                let dim = self.read_u32()? as usize;
9392                let bytes = self.take(dim * 2)?.to_vec();
9393                Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
9394            }
9395            // v7.10.4: tag 14 — BYTEA. [u16 len][bytes].
9396            14 => {
9397                // v7.27 (round-21) — escaped length at >= 47.
9398                let len = self.read_len_escaped_v47()?;
9399                let bytes = self.take(len)?.to_vec();
9400                Ok(Value::Bytes(bytes))
9401            }
9402            // v7.10.9: tag 15 — TEXT[]. [u16 count][per elem: u8
9403            // null + (when non-null) u16 len + utf-8 bytes].
9404            15 => {
9405                let count = self.read_u16()? as usize;
9406                let mut items: Vec<Option<String>> = Vec::with_capacity(count);
9407                for _ in 0..count {
9408                    match self.read_u8()? {
9409                        0 => items.push(Some(self.read_str_escaped_v47()?)),
9410                        1 => items.push(None),
9411                        other => {
9412                            return Err(StorageError::Corrupt(format!(
9413                                "TEXT[] null flag in value tag: unknown byte {other}"
9414                            )));
9415                        }
9416                    }
9417                }
9418                Ok(Value::TextArray(items))
9419            }
9420            // v7.11.12: tags 16/17 — INT[] / BIGINT[].
9421            16 => {
9422                let count = self.read_u16()? as usize;
9423                let mut items: Vec<Option<i32>> = Vec::with_capacity(count);
9424                for _ in 0..count {
9425                    match self.read_u8()? {
9426                        0 => items.push(Some(self.read_i32()?)),
9427                        1 => items.push(None),
9428                        other => {
9429                            return Err(StorageError::Corrupt(format!(
9430                                "INT[] null flag in value tag: unknown byte {other}"
9431                            )));
9432                        }
9433                    }
9434                }
9435                Ok(Value::IntArray(items))
9436            }
9437            17 => {
9438                let count = self.read_u16()? as usize;
9439                let mut items: Vec<Option<i64>> = Vec::with_capacity(count);
9440                for _ in 0..count {
9441                    match self.read_u8()? {
9442                        0 => items.push(Some(self.read_i64()?)),
9443                        1 => items.push(None),
9444                        other => {
9445                            return Err(StorageError::Corrupt(format!(
9446                                "BIGINT[] null flag in value tag: unknown byte {other}"
9447                            )));
9448                        }
9449                    }
9450                }
9451                Ok(Value::BigIntArray(items))
9452            }
9453            // v7.12.0: tag 18 — tsvector. Body matches the dense
9454            // form (`read_tsvector_body`).
9455            18 => Ok(Value::TsVector(self.read_tsvector_body()?)),
9456            // v7.12.0: tag 19 — tsquery.
9457            19 => Ok(Value::TsQuery(self.read_tsquery_body()?)),
9458            // v7.17.0: tag 20 — UUID. Raw 16 bytes.
9459            20 => {
9460                let s = self.take(16)?;
9461                let mut b = [0u8; 16];
9462                b.copy_from_slice(s);
9463                Ok(Value::Uuid(b))
9464            }
9465            // v7.17.0 Phase 3.P0-32: tag 21 — TIME. i64 LE.
9466            21 => Ok(Value::Time(self.read_i64()?)),
9467            // v7.17.0 Phase 3.P0-33: tag 22 — YEAR. u16 LE.
9468            22 => Ok(Value::Year(self.read_u16()?)),
9469            // v7.17.0 Phase 3.P0-34: tag 23 — TIMETZ. i64 LE us +
9470            // i32 LE offset_secs.
9471            23 => {
9472                let us = self.read_i64()?;
9473                let offset_secs = self.read_i32()?;
9474                Ok(Value::TimeTz { us, offset_secs })
9475            }
9476            // v7.17.0 Phase 3.P0-35: tag 24 — MONEY. i64 LE cents.
9477            24 => Ok(Value::Money(self.read_i64()?)),
9478            // v7.17.0 Phase 3.P0-39: tag 26 — Hstore. Body shape
9479            // == read_hstore_body.
9480            26 => Ok(Value::Hstore(self.read_hstore_body()?)),
9481            // v7.17.0 Phase 3.P0-40: tag 27/28/29 — 2D arrays.
9482            27 => Ok(Value::IntArray2D(self.read_int_2d_body()?)),
9483            28 => Ok(Value::BigIntArray2D(self.read_bigint_2d_body()?)),
9484            29 => Ok(Value::TextArray2D(self.read_text_2d_body()?)),
9485            // v7.17.0 Phase 3.P0-38: tag 25 — Range.
9486            // [u8 RangeKind tag][u8 flags][opt lower][opt upper].
9487            25 => {
9488                let kt = self.read_u8()?;
9489                let kind = RangeKind::from_tag(kt)
9490                    .ok_or_else(|| StorageError::Corrupt(format!("unknown RangeKind tag: {kt}")))?;
9491                let flags = self.read_u8()?;
9492                let empty = flags & 0b0000_0001 != 0;
9493                let has_lower = flags & 0b0000_0010 != 0;
9494                let has_upper = flags & 0b0000_0100 != 0;
9495                let lower_inc = flags & 0b0000_1000 != 0;
9496                let upper_inc = flags & 0b0001_0000 != 0;
9497                let lower = if has_lower {
9498                    Some(alloc::boxed::Box::new(self.read_value()?))
9499                } else {
9500                    None
9501                };
9502                let upper = if has_upper {
9503                    Some(alloc::boxed::Box::new(self.read_value()?))
9504                } else {
9505                    None
9506                };
9507                Ok(Value::Range {
9508                    kind,
9509                    lower,
9510                    upper,
9511                    lower_inc,
9512                    upper_inc,
9513                    empty,
9514                })
9515            }
9516            other => Err(StorageError::Corrupt(format!("unknown value tag: {other}"))),
9517        }
9518    }
9519
9520    /// Read an NSW graph that was emitted via `write_nsw_graph`. `m`
9521    /// is passed in because it was already consumed from the per-
9522    /// index header. Returns the reconstituted `NswGraph`.
9523    fn read_nsw_graph(&mut self, m: usize) -> Result<NswGraph, StorageError> {
9524        let m_max_0 = self.read_u16()? as usize;
9525        let entry_raw = self.read_u32()?;
9526        let entry = if entry_raw == u32::MAX {
9527            None
9528        } else {
9529            Some(entry_raw as usize)
9530        };
9531        let entry_level = self.read_u8()?;
9532        let node_count = self.read_u32()? as usize;
9533        // v5.5.0: levels/per-layer are PV-backed in memory, but the wire
9534        // format is unchanged — decode element-by-element into a PV via
9535        // push_mut (transient in-place, no per-element path-copy here since
9536        // the freshly-built PV is uniquely owned).
9537        let mut levels: PersistentVec<u8> = PersistentVec::new();
9538        for _ in 0..node_count {
9539            levels.push_mut(self.read_u8()?);
9540        }
9541        let layer_count = self.read_u8()? as usize;
9542        let mut layers: Vec<PersistentVec<Vec<u32>>> = Vec::with_capacity(layer_count);
9543        for _ in 0..layer_count {
9544            let n = self.read_u32()? as usize;
9545            let mut per_layer: PersistentVec<Vec<u32>> = PersistentVec::new();
9546            for _ in 0..n {
9547                let cnt = self.read_u16()? as usize;
9548                let mut row: Vec<u32> = Vec::with_capacity(cnt);
9549                for _ in 0..cnt {
9550                    row.push(self.read_u32()?);
9551                }
9552                per_layer.push_mut(row);
9553            }
9554            layers.push(per_layer);
9555        }
9556        Ok(NswGraph {
9557            m,
9558            m_max_0,
9559            entry,
9560            entry_level,
9561            levels,
9562            layers,
9563        })
9564    }
9565}
9566
9567#[cfg(test)]
9568mod tests {
9569    use super::*;
9570    use alloc::string::ToString;
9571    use alloc::vec;
9572
9573    /// v7.27 (mailrs round-21) — the remaining u16 cells take the
9574    /// escape: a > 64 KiB BYTEA cell and a > 64 KiB TEXT[] element
9575    /// round-trip through snapshot serialise/deserialise (the BYTEA
9576    /// twin of round-14 fired during a production migration).
9577    #[test]
9578    fn snapshot_round_trips_large_bytea_and_text_array_element() {
9579        let mut cat = Catalog::new();
9580        cat.create_table(TableSchema::new(
9581            "q",
9582            vec![
9583                ColumnSchema::new("id", DataType::BigInt, false),
9584                ColumnSchema::new("data", DataType::Bytes, true),
9585                ColumnSchema::new("uris", DataType::TextArray, true),
9586            ],
9587        ))
9588        .unwrap();
9589        let big_blob = alloc::vec![0xAB_u8; 200_000];
9590        let big_elem = "u".repeat(100_000);
9591        cat.get_mut("q")
9592            .unwrap()
9593            .insert(Row::new(alloc::vec![
9594                Value::BigInt(1),
9595                Value::Bytes(big_blob.clone()),
9596                Value::TextArray(alloc::vec![Some(big_elem.clone()), None, Some("s".into())]),
9597            ]))
9598            .unwrap();
9599        let bytes = cat.serialize();
9600        let re = Catalog::deserialize(&bytes).unwrap();
9601        let row = re.get("q").unwrap().rows.get(0).unwrap().clone();
9602        match &row.values[1] {
9603            Value::Bytes(b) => assert_eq!(b.len(), big_blob.len()),
9604            other => panic!("expected Bytes, got {other:?}"),
9605        }
9606        match &row.values[2] {
9607            Value::TextArray(items) => {
9608                assert_eq!(items[0].as_ref().unwrap().len(), big_elem.len());
9609                assert!(items[1].is_none());
9610            }
9611            other => panic!("expected TextArray, got {other:?}"),
9612        }
9613    }
9614
9615    /// Pre-v47 containers carry PLAIN u16 lengths for these cells —
9616    /// 0xFFFF must not be treated as an escape there.
9617    #[test]
9618    fn plain_u16_bytea_len_ffff_decodes_under_v46_rules() {
9619        let payload = alloc::vec![7_u8; 65_535];
9620        let mut buf = Vec::new();
9621        write_u16(&mut buf, 65_535);
9622        buf.extend_from_slice(&payload);
9623        let mut cur = Cursor::new(&buf).with_codec_version(46);
9624        let len = cur.read_len_escaped_v47().unwrap();
9625        assert_eq!(len, 65_535);
9626        assert_eq!(cur.take(len).unwrap().len(), 65_535);
9627    }
9628
9629    /// v7.23 (mailrs round-14) — the escaped short-string codec.
9630    /// Boundary cases: 0xFFFE stays plain-u16, 0xFFFF and above take
9631    /// the escape form, round-trips are exact at 1 MiB.
9632    #[test]
9633    fn escaped_string_codec_round_trips_large_text() {
9634        for len in [0usize, 1, 65_534, 65_535, 65_536, 1_048_576] {
9635            let s: String = "x".repeat(len);
9636            let mut buf = Vec::new();
9637            write_str(&mut buf, &s);
9638            let expected_header = if len >= STR_LEN_ESCAPE as usize { 6 } else { 2 };
9639            assert_eq!(buf.len(), expected_header + len, "header width for {len}");
9640            let mut cur = Cursor::new(&buf).with_codec_version(FILE_VERSION);
9641            assert_eq!(cur.read_str().unwrap().len(), len, "round-trip {len}");
9642        }
9643    }
9644
9645    /// Pre-v46 containers may carry a PLAIN length of exactly 0xFFFF
9646    /// — the decoder must not treat it as an escape there.
9647    #[test]
9648    fn plain_u16_len_ffff_decodes_under_old_rules() {
9649        let s = "y".repeat(65_535);
9650        let mut buf = Vec::new();
9651        // Hand-encode the OLD form: plain u16 length.
9652        write_u16(&mut buf, 65_535);
9653        buf.extend_from_slice(s.as_bytes());
9654        let mut old = Cursor::new(&buf); // codec_version = 0 (legacy rules)
9655        assert_eq!(old.read_str().unwrap(), s);
9656    }
9657
9658    /// End-to-end: a catalog holding a 1 MiB TEXT row snapshots and
9659    /// reloads — the exact shape that panicked at 7.22's graceful
9660    /// close ("identifier / text fits in u16").
9661    #[test]
9662    fn snapshot_round_trips_megabyte_text_row() {
9663        let mut cat = Catalog::new();
9664        cat.create_table(TableSchema::new(
9665            "mail",
9666            vec![
9667                ColumnSchema::new("id", DataType::BigInt, false),
9668                ColumnSchema::new("body", DataType::Text, false),
9669            ],
9670        ))
9671        .unwrap();
9672        let body = "m".repeat(1_048_576);
9673        cat.get_mut("mail")
9674            .unwrap()
9675            .insert(Row::new(vec![Value::BigInt(1), Value::Text(body.clone())]))
9676            .unwrap();
9677        let bytes = cat.serialize();
9678        let re = Catalog::deserialize(&bytes).unwrap();
9679        let t = re.get("mail").unwrap();
9680        match &t.rows.get(0).unwrap().values[1] {
9681            Value::Text(s) => assert_eq!(s.len(), body.len()),
9682            other => panic!("expected Text, got {other:?}"),
9683        }
9684    }
9685
9686    /// Cold tier: a segment holding a > 64 KiB TEXT row encodes (V3
9687    /// magic) and looks up; a hand-built V1 segment with a legal
9688    /// 0xFFFF-length text still decodes under old rules.
9689    #[test]
9690    fn segment_v3_round_trips_large_text_rows() {
9691        let schema = TableSchema::new(
9692            "mail",
9693            vec![
9694                ColumnSchema::new("id", DataType::BigInt, false),
9695                ColumnSchema::new("body", DataType::Text, false),
9696            ],
9697        );
9698        let big = "b".repeat(200_000);
9699        let rows: Vec<(u64, Vec<u8>)> = (0u64..3)
9700            .map(|i| {
9701                let row = Row::new(vec![
9702                    Value::BigInt(i.cast_signed()),
9703                    Value::Text(big.clone()),
9704                ]);
9705                (i, encode_row_body_dense(&row, &schema))
9706            })
9707            .collect();
9708        let (bytes, _) = encode_segment(rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
9709        assert_eq!(&bytes[..8], b"SPGSEG\x04\x00", "new segments are V4");
9710        let seg = OwnedSegment::from_bytes(bytes).unwrap();
9711        assert!(seg.codec_version() >= 47);
9712        let payload = seg.lookup(1).expect("pk 1 present");
9713        let (row, _) = decode_row_body_dense(&payload, &schema, seg.codec_version()).unwrap();
9714        match &row.values[1] {
9715            Value::Text(s) => assert_eq!(s.len(), big.len()),
9716            other => panic!("expected Text, got {other:?}"),
9717        }
9718    }
9719
9720    /// Index keys derive from TEXT columns — a > 64 KiB key must
9721    /// round-trip through the v9 tagged index-key codec too.
9722    #[test]
9723    fn index_key_round_trips_large_text() {
9724        let key = IndexKey::Text("k".repeat(100_000));
9725        let mut buf = Vec::new();
9726        write_index_key(&mut buf, &key);
9727        let mut cur = Cursor::new(&buf).with_codec_version(FILE_VERSION);
9728        let back = cur.read_index_key().unwrap();
9729        assert_eq!(back, key);
9730    }
9731
9732    #[cfg(target_arch = "aarch64")]
9733    #[test]
9734    fn neon_l2_matches_scalar() {
9735        // For every dim that's a multiple of 4 (4, 8, 12, 16, 64,
9736        // 128, 256, 384, 512, 768, 1024, 1536), the NEON impl must
9737        // agree with the scalar reference within tight float
9738        // tolerance (FMA rounding differs from separate * + +).
9739        let dims = [4usize, 8, 12, 16, 64, 128, 256, 384, 512, 768, 1024, 1536];
9740        for &d in &dims {
9741            let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
9742            let mut a = Vec::with_capacity(d);
9743            let mut b = Vec::with_capacity(d);
9744            for _ in 0..d {
9745                state = state
9746                    .wrapping_mul(6_364_136_223_846_793_005)
9747                    .wrapping_add(1);
9748                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9749                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9750                state = state
9751                    .wrapping_mul(6_364_136_223_846_793_005)
9752                    .wrapping_add(1);
9753                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9754                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9755                a.push(x);
9756                b.push(y);
9757            }
9758            let scalar = l2_distance_sq_scalar(&a, &b);
9759            let neon = unsafe { l2_distance_sq_neon(&a, &b) };
9760            let tol = (scalar.abs().max(1e-6)) * 1e-4;
9761            assert!(
9762                (scalar - neon).abs() <= tol,
9763                "dim={d}: scalar={scalar} neon={neon} diff={}",
9764                (scalar - neon).abs()
9765            );
9766        }
9767    }
9768
9769    #[cfg(target_arch = "aarch64")]
9770    #[test]
9771    fn neon_inner_product_matches_scalar() {
9772        // v6.0.2 step 1: NEON IP must agree with scalar across every
9773        // production-shaped dim. FMA rounding differs from
9774        // separate * + +, so the tolerance scales with magnitude.
9775        let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
9776        for &d in &dims {
9777            let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
9778            let mut a = Vec::with_capacity(d);
9779            let mut b = Vec::with_capacity(d);
9780            for _ in 0..d {
9781                state = state
9782                    .wrapping_mul(6_364_136_223_846_793_005)
9783                    .wrapping_add(1);
9784                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9785                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9786                state = state
9787                    .wrapping_mul(6_364_136_223_846_793_005)
9788                    .wrapping_add(1);
9789                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9790                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9791                a.push(x);
9792                b.push(y);
9793            }
9794            let scalar = inner_product_scalar(&a, &b);
9795            let neon = unsafe { inner_product_neon(&a, &b) };
9796            #[allow(clippy::cast_precision_loss)]
9797            let tol = (scalar.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
9798            assert!(
9799                (scalar - neon).abs() <= tol,
9800                "IP dim={d}: scalar={scalar} neon={neon} diff={}",
9801                (scalar - neon).abs()
9802            );
9803        }
9804    }
9805
9806    #[cfg(target_arch = "aarch64")]
9807    #[allow(clippy::similar_names)]
9808    #[test]
9809    fn neon_cosine_dot_norms_matches_scalar() {
9810        let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
9811        for &d in &dims {
9812            let mut state: u64 = (d as u64).wrapping_mul(0xBF58_476D_1CE4_E5B9);
9813            let mut a = Vec::with_capacity(d);
9814            let mut b = Vec::with_capacity(d);
9815            for _ in 0..d {
9816                state = state
9817                    .wrapping_mul(6_364_136_223_846_793_005)
9818                    .wrapping_add(1);
9819                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9820                let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9821                state = state
9822                    .wrapping_mul(6_364_136_223_846_793_005)
9823                    .wrapping_add(1);
9824                #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9825                let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9826                a.push(x);
9827                b.push(y);
9828            }
9829            let (dot_s, na_s, nb_s) = cosine_dot_norms_scalar(&a, &b);
9830            let (dot_n, na_n, nb_n) = unsafe { cosine_dot_norms_neon(&a, &b) };
9831            #[allow(clippy::cast_precision_loss)]
9832            let tol_d = (dot_s.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
9833            #[allow(clippy::cast_precision_loss)]
9834            let tol_n = (na_s.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
9835            assert!(
9836                (dot_s - dot_n).abs() <= tol_d,
9837                "cosine dot dim={d}: scalar={dot_s} neon={dot_n}"
9838            );
9839            assert!(
9840                (na_s - na_n).abs() <= tol_n,
9841                "cosine na dim={d}: scalar={na_s} neon={na_n}"
9842            );
9843            assert!(
9844                (nb_s - nb_n).abs() <= tol_n,
9845                "cosine nb dim={d}: scalar={nb_s} neon={nb_n}"
9846            );
9847        }
9848    }
9849
9850    fn make_users_schema() -> TableSchema {
9851        TableSchema::new(
9852            "users",
9853            vec![
9854                ColumnSchema::new("id", DataType::Int, false),
9855                ColumnSchema::new("name", DataType::Text, false),
9856                ColumnSchema::new("score", DataType::Float, true),
9857            ],
9858        )
9859    }
9860
9861    #[test]
9862    fn value_type_tag_matches_variant() {
9863        assert_eq!(Value::Int(1).data_type(), Some(DataType::Int));
9864        assert_eq!(Value::BigInt(1).data_type(), Some(DataType::BigInt));
9865        assert_eq!(Value::Float(1.0).data_type(), Some(DataType::Float));
9866        assert_eq!(Value::Text("x".into()).data_type(), Some(DataType::Text));
9867        assert_eq!(Value::Bool(true).data_type(), Some(DataType::Bool));
9868        assert_eq!(Value::Null.data_type(), None);
9869        assert!(Value::Null.is_null());
9870        assert!(!Value::Int(0).is_null());
9871    }
9872
9873    #[test]
9874    fn sq8_value_reports_sq8_data_type() {
9875        // v6.0.1: a `Value::Sq8Vector` cell surfaces its dim
9876        // (= bytes.len()) and encoding through `data_type()` so
9877        // INSERT-time column type-checks (step 3) can route on
9878        // both shape and encoding.
9879        let q = crate::quantize::quantize(&[0.0, 0.25, 0.5, 0.75, 1.0]);
9880        let v = Value::Sq8Vector(q);
9881        assert_eq!(
9882            v.data_type(),
9883            Some(DataType::Vector {
9884                dim: 5,
9885                encoding: VecEncoding::Sq8,
9886            }),
9887        );
9888    }
9889
9890    #[test]
9891    fn datatype_display_matches_pg_keyword() {
9892        assert_eq!(DataType::Int.to_string(), "INT");
9893        assert_eq!(DataType::BigInt.to_string(), "BIGINT");
9894        assert_eq!(DataType::Float.to_string(), "FLOAT");
9895        assert_eq!(DataType::Text.to_string(), "TEXT");
9896        assert_eq!(DataType::Bool.to_string(), "BOOL");
9897    }
9898
9899    #[test]
9900    fn row_len_and_emptiness() {
9901        let r = Row::new(vec![Value::Int(1), Value::Null]);
9902        assert_eq!(r.len(), 2);
9903        assert!(!r.is_empty());
9904        assert!(Row::new(Vec::new()).is_empty());
9905    }
9906
9907    #[test]
9908    fn table_schema_column_position() {
9909        let s = make_users_schema();
9910        assert_eq!(s.column_position("id"), Some(0));
9911        assert_eq!(s.column_position("score"), Some(2));
9912        assert_eq!(s.column_position("missing"), None);
9913    }
9914
9915    #[test]
9916    fn catalog_create_table_then_lookup() {
9917        let mut cat = Catalog::new();
9918        cat.create_table(make_users_schema()).unwrap();
9919        assert_eq!(cat.table_count(), 1);
9920        assert!(cat.get("users").is_some());
9921        assert!(cat.get("nope").is_none());
9922    }
9923
9924    #[test]
9925    fn catalog_duplicate_table_is_rejected() {
9926        let mut cat = Catalog::new();
9927        cat.create_table(make_users_schema()).unwrap();
9928        let err = cat.create_table(make_users_schema()).unwrap_err();
9929        assert!(matches!(err, StorageError::DuplicateTable { ref name } if name == "users"));
9930    }
9931
9932    #[test]
9933    fn table_insert_happy_path_appends_row() {
9934        let mut cat = Catalog::new();
9935        cat.create_table(make_users_schema()).unwrap();
9936        let t = cat.get_mut("users").unwrap();
9937        t.insert(Row::new(vec![
9938            Value::Int(1),
9939            Value::Text("alice".into()),
9940            Value::Float(99.5),
9941        ]))
9942        .unwrap();
9943        assert_eq!(t.row_count(), 1);
9944        assert_eq!(t.rows()[0].values[1], Value::Text("alice".into()));
9945    }
9946
9947    #[test]
9948    fn table_insert_arity_mismatch() {
9949        let mut cat = Catalog::new();
9950        cat.create_table(make_users_schema()).unwrap();
9951        let t = cat.get_mut("users").unwrap();
9952        let err = t.insert(Row::new(vec![Value::Int(1)])).unwrap_err();
9953        assert!(matches!(
9954            err,
9955            StorageError::ArityMismatch {
9956                expected: 3,
9957                actual: 1
9958            }
9959        ));
9960        assert_eq!(t.row_count(), 0);
9961    }
9962
9963    #[test]
9964    fn table_insert_type_mismatch_reports_column() {
9965        let mut cat = Catalog::new();
9966        cat.create_table(make_users_schema()).unwrap();
9967        let t = cat.get_mut("users").unwrap();
9968        let err = t
9969            .insert(Row::new(vec![
9970                Value::Int(1),
9971                Value::Int(42), // name expects Text
9972                Value::Float(0.0),
9973            ]))
9974            .unwrap_err();
9975        match err {
9976            StorageError::TypeMismatch {
9977                ref column,
9978                expected,
9979                actual,
9980                position,
9981            } => {
9982                assert_eq!(column, "name");
9983                assert_eq!(expected, DataType::Text);
9984                assert_eq!(actual, DataType::Int);
9985                assert_eq!(position, 1);
9986            }
9987            other => panic!("unexpected: {other:?}"),
9988        }
9989        assert_eq!(t.row_count(), 0);
9990    }
9991
9992    #[test]
9993    fn table_insert_null_into_not_null_rejected() {
9994        let mut cat = Catalog::new();
9995        cat.create_table(make_users_schema()).unwrap();
9996        let t = cat.get_mut("users").unwrap();
9997        let err = t
9998            .insert(Row::new(vec![
9999                Value::Int(1),
10000                Value::Null, // name is NOT NULL
10001                Value::Float(1.0),
10002            ]))
10003            .unwrap_err();
10004        assert!(matches!(err, StorageError::NullInNotNull { ref column } if column == "name"));
10005    }
10006
10007    #[test]
10008    fn table_insert_null_into_nullable_ok() {
10009        let mut cat = Catalog::new();
10010        cat.create_table(make_users_schema()).unwrap();
10011        let t = cat.get_mut("users").unwrap();
10012        t.insert(Row::new(vec![
10013            Value::Int(1),
10014            Value::Text("bob".into()),
10015            Value::Null,
10016        ]))
10017        .unwrap();
10018        assert_eq!(t.row_count(), 1);
10019    }
10020
10021    #[test]
10022    fn catalog_get_mut_independent_per_table() {
10023        let mut cat = Catalog::new();
10024        cat.create_table(TableSchema::new(
10025            "a",
10026            vec![ColumnSchema::new("v", DataType::Int, false)],
10027        ))
10028        .unwrap();
10029        cat.create_table(TableSchema::new(
10030            "b",
10031            vec![ColumnSchema::new("v", DataType::Int, false)],
10032        ))
10033        .unwrap();
10034        cat.get_mut("a")
10035            .unwrap()
10036            .insert(Row::new(vec![Value::Int(1)]))
10037            .unwrap();
10038        assert_eq!(cat.get("a").unwrap().row_count(), 1);
10039        assert_eq!(cat.get("b").unwrap().row_count(), 0);
10040    }
10041
10042    // --- v0.6 persistence round-trips --------------------------------------
10043
10044    fn assert_round_trip(cat: &Catalog) {
10045        let bytes = cat.serialize();
10046        let restored = Catalog::deserialize(&bytes).expect("deserialize");
10047        // Compare semantic state: same tables in same order, same schema +
10048        // rows in each.
10049        assert_eq!(restored.table_count(), cat.table_count());
10050        for (a, b) in cat.tables.iter().zip(restored.tables.iter()) {
10051            assert_eq!(a.schema, b.schema);
10052            assert_eq!(a.rows, b.rows);
10053        }
10054    }
10055
10056    #[test]
10057    fn serialize_empty_catalog_round_trips() {
10058        assert_round_trip(&Catalog::new());
10059    }
10060
10061    #[test]
10062    fn serialize_single_empty_table_round_trips() {
10063        let mut cat = Catalog::new();
10064        cat.create_table(make_users_schema()).unwrap();
10065        assert_round_trip(&cat);
10066    }
10067
10068    #[test]
10069    fn nsw_clone_is_o1() {
10070        // v5.5.0: NswGraph::clone must be O(1) structural sharing, not the
10071        // pre-v5.5 O(N) element copy — it rides on Catalog::clone for every
10072        // group-commit write on a vector table. Build a non-trivial multi-
10073        // layer graph, clone it, and prove the clone shares the very same PV
10074        // storage (root+tail Arc) for `levels` and every `layers[l]`. Sharing
10075        // ⇒ no per-node element copy ⇒ clone cost independent of N (node
10076        // count); only the outer layer Vec (len ≤ 8) is copied, O(1) in
10077        // practice.
10078        let mut cat = Catalog::new();
10079        cat.create_table(TableSchema::new(
10080            "docs",
10081            alloc::vec![
10082                ColumnSchema::new("id", DataType::Int, false),
10083                ColumnSchema::new(
10084                    "v",
10085                    DataType::Vector {
10086                        dim: 3,
10087                        encoding: VecEncoding::F32
10088                    },
10089                    true
10090                ),
10091            ],
10092        ))
10093        .unwrap();
10094        let t = cat.get_mut("docs").unwrap();
10095        for i in 0..1500_i32 {
10096            #[allow(clippy::cast_precision_loss)] // 0..1500 — no precision lost
10097            let base = (i as f32) * 0.01;
10098            t.insert(Row::new(alloc::vec![
10099                Value::Int(i),
10100                Value::Vector(alloc::vec![base, base + 0.05, base + 0.1]),
10101            ]))
10102            .unwrap();
10103        }
10104        t.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
10105            .unwrap();
10106        let g = match &cat.get("docs").unwrap().indices()[0].kind {
10107            IndexKind::Nsw(g) => g,
10108            IndexKind::BTree(_)
10109            | IndexKind::Brin { .. }
10110            | IndexKind::Gin(_)
10111            | IndexKind::GinTrgm(_)
10112            | IndexKind::GinFulltext(_) => {
10113                panic!("expected NSW")
10114            }
10115        };
10116        // Non-trivial graph: one level slot per row, and the geometric level
10117        // distribution puts some nodes above layer 0.
10118        assert_eq!(g.levels.len(), 1500, "one level slot per inserted row");
10119        assert!(
10120            g.layers.len() >= 2,
10121            "1500 nodes should populate at least two HNSW layers, got {}",
10122            g.layers.len()
10123        );
10124
10125        let cloned = g.clone();
10126
10127        assert!(
10128            g.levels.shares_storage_with(&cloned.levels),
10129            "levels PV not shared after clone — clone copied elements (O(N))"
10130        );
10131        assert_eq!(g.layers.len(), cloned.layers.len());
10132        for (l, (orig, cl)) in g.layers.iter().zip(cloned.layers.iter()).enumerate() {
10133            assert!(
10134                orig.shares_storage_with(cl),
10135                "layer {l} PV not shared after clone — clone copied elements (O(N))"
10136            );
10137        }
10138    }
10139
10140    #[test]
10141    fn sq8_catalog_serialise_roundtrip_preserves_cells_and_index() {
10142        // v6.0.1 step 6 verify: a catalog with an `VECTOR(N)
10143        // USING SQ8` column + NSW index survives a full
10144        // serialise → deserialise cycle. Cells re-decode bit-
10145        // identically (per-vector affine triple), the NSW
10146        // topology stays intact, and kNN search still routes
10147        // through the SQ8 ADC dispatcher after the catalog hop.
10148        let mut cat = Catalog::new();
10149        cat.create_table(TableSchema::new(
10150            "vecs",
10151            alloc::vec![
10152                ColumnSchema::new("id", DataType::Int, false),
10153                ColumnSchema::new(
10154                    "v",
10155                    DataType::Vector {
10156                        dim: 8,
10157                        encoding: VecEncoding::Sq8,
10158                    },
10159                    false,
10160                ),
10161            ],
10162        ))
10163        .unwrap();
10164        let t = cat.get_mut("vecs").unwrap();
10165        for i in 0..32_i32 {
10166            #[allow(clippy::cast_precision_loss)]
10167            let base = (i as f32) * 0.03;
10168            let v: Vec<f32> = (0..8_i32)
10169                .map(|j| {
10170                    #[allow(clippy::cast_precision_loss)]
10171                    let off = (j as f32) * 0.01;
10172                    base + off
10173                })
10174                .collect();
10175            t.insert(Row::new(alloc::vec![
10176                Value::Int(i),
10177                Value::Sq8Vector(quantize::quantize(&v)),
10178            ]))
10179            .unwrap();
10180        }
10181        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
10182        // Capture a pre-serialise reference cell + nsw hits to
10183        // compare against the restored catalog.
10184        let query = alloc::vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
10185        let (before_cell, before_ty, before_hits) = {
10186            let t_ref = cat.get("vecs").unwrap();
10187            (
10188                t_ref.rows()[5].values[1].clone(),
10189                t_ref.schema().columns[1].ty,
10190                nsw_query(t_ref, "v_idx", &query, 5, NswMetric::L2),
10191            )
10192        };
10193
10194        let bytes = cat.serialize();
10195        let restored = Catalog::deserialize(&bytes).expect("deserialize ok");
10196        let rt = restored.get("vecs").unwrap();
10197        assert_eq!(rt.schema().columns[1].ty, before_ty);
10198        assert_eq!(rt.rows()[5].values[1], before_cell);
10199        let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
10200        assert_eq!(before_hits, after_hits);
10201    }
10202
10203    #[test]
10204    fn half_catalog_serialise_roundtrip_preserves_cells_and_index() {
10205        // v6.0.3 step 4 verify: a catalog with a `VECTOR(N) USING
10206        // HALF` column + NSW index survives a full serialise →
10207        // deserialise cycle. Cells re-decode bit-identically (raw
10208        // u16 LE bytes), the NSW topology stays intact, and kNN
10209        // search still returns the same hit IDs against the
10210        // restored catalog.
10211        use crate::halfvec;
10212        let mut cat = Catalog::new();
10213        cat.create_table(TableSchema::new(
10214            "vecs",
10215            alloc::vec![
10216                ColumnSchema::new("id", DataType::Int, false),
10217                ColumnSchema::new(
10218                    "v",
10219                    DataType::Vector {
10220                        dim: 8,
10221                        encoding: VecEncoding::F16,
10222                    },
10223                    false,
10224                ),
10225            ],
10226        ))
10227        .unwrap();
10228        let t = cat.get_mut("vecs").unwrap();
10229        for i in 0..32_i32 {
10230            #[allow(clippy::cast_precision_loss)]
10231            let base = (i as f32) * 0.03;
10232            let v: Vec<f32> = (0..8_i32)
10233                .map(|j| {
10234                    #[allow(clippy::cast_precision_loss)]
10235                    let off = (j as f32) * 0.01;
10236                    base + off
10237                })
10238                .collect();
10239            t.insert(Row::new(alloc::vec![
10240                Value::Int(i),
10241                Value::HalfVector(halfvec::HalfVector::from_f32_slice(&v)),
10242            ]))
10243            .unwrap();
10244        }
10245        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
10246        let query = alloc::vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
10247        let (before_cell, before_ty, before_hits) = {
10248            let t_ref = cat.get("vecs").unwrap();
10249            (
10250                t_ref.rows()[5].values[1].clone(),
10251                t_ref.schema().columns[1].ty,
10252                nsw_query(t_ref, "v_idx", &query, 5, NswMetric::L2),
10253            )
10254        };
10255        let bytes = cat.serialize();
10256        let restored = Catalog::deserialize(&bytes).expect("deserialize ok");
10257        let rt = restored.get("vecs").unwrap();
10258        assert_eq!(rt.schema().columns[1].ty, before_ty);
10259        assert_eq!(rt.rows()[5].values[1], before_cell);
10260        let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
10261        assert_eq!(before_hits, after_hits);
10262    }
10263
10264    #[test]
10265    #[allow(clippy::similar_names)]
10266    fn hnsw_half_recall_at_10_matches_f32_groundtruth() {
10267        // v6.0.3 step 3 verify: HALF column NSW retrieves ≥ 95%
10268        // top-10 overlap vs brute-force F32 ground truth.
10269        // Half-precision dequantises bit-exactly at the storage
10270        // layer (no rerank pass), so the recall floor is tighter
10271        // than the SQ8 case — only the rounding noise from f32 →
10272        // f16 quantisation contributes.
10273        use crate::halfvec;
10274        fn next(state: &mut u64) -> f32 {
10275            *state = state
10276                .wrapping_add(0x9E37_79B9_7F4A_7C15)
10277                .wrapping_mul(0xBF58_476D_1CE4_E5B9);
10278            #[allow(clippy::cast_precision_loss)]
10279            let u = ((*state >> 32) as u32 as f32) / (u32::MAX as f32);
10280            2.0 * u - 1.0
10281        }
10282        let dim: u32 = 32;
10283        let n: usize = 512;
10284        let dim_us = dim as usize;
10285        let mut seed: u64 = 0xF16_F16_F16_F16_u64;
10286        let corpus: Vec<Vec<f32>> = (0..n)
10287            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
10288            .collect();
10289        let queries: Vec<Vec<f32>> = (0..32)
10290            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
10291            .collect();
10292        let exact_top10: Vec<Vec<usize>> = queries
10293            .iter()
10294            .map(|q| {
10295                let mut scored: Vec<(f32, usize)> = corpus
10296                    .iter()
10297                    .enumerate()
10298                    .map(|(i, v)| (l2_distance_sq(v, q), i))
10299                    .collect();
10300                scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
10301                scored.into_iter().take(10).map(|(_, i)| i).collect()
10302            })
10303            .collect();
10304        let mut cat = Catalog::new();
10305        cat.create_table(TableSchema::new(
10306            "vecs",
10307            alloc::vec![
10308                ColumnSchema::new("id", DataType::Int, false),
10309                ColumnSchema::new(
10310                    "v",
10311                    DataType::Vector {
10312                        dim,
10313                        encoding: VecEncoding::F16,
10314                    },
10315                    false,
10316                ),
10317            ],
10318        ))
10319        .unwrap();
10320        let t = cat.get_mut("vecs").unwrap();
10321        for (i, v) in corpus.iter().enumerate() {
10322            t.insert(Row::new(alloc::vec![
10323                Value::Int(i32::try_from(i).unwrap()),
10324                Value::HalfVector(halfvec::HalfVector::from_f32_slice(v)),
10325            ]))
10326            .unwrap();
10327        }
10328        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
10329        let table = cat.get("vecs").unwrap();
10330        let mut total_overlap = 0_usize;
10331        for (q, exact) in queries.iter().zip(exact_top10.iter()) {
10332            let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
10333            for h in &hits {
10334                if exact.contains(h) {
10335                    total_overlap += 1;
10336                }
10337            }
10338        }
10339        #[allow(clippy::cast_precision_loss)]
10340        let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
10341        assert!(
10342            recall >= 0.95,
10343            "HALF HNSW recall@10 = {recall:.3}, below floor 0.95 — \
10344             check halfvec dispatch in `cell_to_query_metric_distance`"
10345        );
10346    }
10347
10348    #[test]
10349    fn hnsw_sq8_recall_at_10_above_0_95_vs_f32_groundtruth() {
10350        // v6.0.1 step 5 verify: build TWO catalogs over the same
10351        // corpus — one F32, one SQ8 — and confirm SQ8 NSW + f32
10352        // rerank retrieves ≥ 95% top-10 overlap vs brute-force F32
10353        // ground truth. The rerank pass (sq8_rerank) re-scores ADC
10354        // candidates with dequantised cells, recovering recall the
10355        // raw ADC sacrifices for 4× compression.
10356        use crate::quantize;
10357        // Deterministic Gaussian-ish corpus via splitmix64. Vectors
10358        // get normalised so SQ8's per-vector `(min, max)` lives in
10359        // a sensible range; matches the v6.0.0 fuzz harness.
10360        fn next(state: &mut u64) -> f32 {
10361            *state = state
10362                .wrapping_add(0x9E37_79B9_7F4A_7C15)
10363                .wrapping_mul(0xBF58_476D_1CE4_E5B9);
10364            #[allow(clippy::cast_precision_loss)]
10365            let u = ((*state >> 32) as u32 as f32) / (u32::MAX as f32);
10366            2.0 * u - 1.0
10367        }
10368        let dim: u32 = 32;
10369        let n: usize = 512;
10370        let dim_us = dim as usize;
10371        let mut seed: u64 = 0xCAFE_BABE_DEAD_BEEFu64;
10372        let corpus: Vec<Vec<f32>> = (0..n)
10373            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
10374            .collect();
10375        let queries: Vec<Vec<f32>> = (0..32)
10376            .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
10377            .collect();
10378        // F32 ground truth — pure exact arithmetic, brute force.
10379        let exact_top10: Vec<Vec<usize>> = queries
10380            .iter()
10381            .map(|q| {
10382                let mut scored: Vec<(f32, usize)> = corpus
10383                    .iter()
10384                    .enumerate()
10385                    .map(|(i, v)| (l2_distance_sq(v, q), i))
10386                    .collect();
10387                scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
10388                scored.into_iter().take(10).map(|(_, i)| i).collect()
10389            })
10390            .collect();
10391        // SQ8 catalog — INSERTs land as `Value::Sq8Vector` cells;
10392        // HNSW build uses the ADC path verified in step 4.
10393        let mut cat = Catalog::new();
10394        cat.create_table(TableSchema::new(
10395            "vecs",
10396            alloc::vec![
10397                ColumnSchema::new("id", DataType::Int, false),
10398                ColumnSchema::new(
10399                    "v",
10400                    DataType::Vector {
10401                        dim,
10402                        encoding: VecEncoding::Sq8,
10403                    },
10404                    false,
10405                ),
10406            ],
10407        ))
10408        .unwrap();
10409        let t = cat.get_mut("vecs").unwrap();
10410        for (i, v) in corpus.iter().enumerate() {
10411            t.insert(Row::new(alloc::vec![
10412                Value::Int(i32::try_from(i).unwrap()),
10413                Value::Sq8Vector(quantize::quantize(v)),
10414            ]))
10415            .unwrap();
10416        }
10417        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
10418        let table = cat.get("vecs").unwrap();
10419        let mut total_overlap = 0_usize;
10420        for (q, exact) in queries.iter().zip(exact_top10.iter()) {
10421            let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
10422            for h in &hits {
10423                if exact.contains(h) {
10424                    total_overlap += 1;
10425                }
10426            }
10427        }
10428        #[allow(clippy::cast_precision_loss)]
10429        let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
10430        assert!(
10431            recall >= 0.95,
10432            "SQ8 HNSW recall@10 = {recall:.3}, below floor 0.95 — \
10433             check `sq8_rerank` is wired in `nsw_search` for SQ8 columns"
10434        );
10435    }
10436
10437    #[test]
10438    fn nsw_index_topology_persists_through_round_trip() {
10439        // Build an NSW index, capture its (entry, neighbors) tuple, do
10440        // a full serialize → deserialize, and verify the restored
10441        // graph is byte-for-byte identical. The point of v2.7 is that
10442        // startup skips the rebuild, so the topology has to survive
10443        // the disk hop.
10444        let mut cat = Catalog::new();
10445        cat.create_table(TableSchema::new(
10446            "docs",
10447            alloc::vec![
10448                ColumnSchema::new("id", DataType::Int, false),
10449                ColumnSchema::new(
10450                    "v",
10451                    DataType::Vector {
10452                        dim: 3,
10453                        encoding: VecEncoding::F32
10454                    },
10455                    true
10456                ),
10457            ],
10458        ))
10459        .unwrap();
10460        let t = cat.get_mut("docs").unwrap();
10461        for i in 0..6_i32 {
10462            #[allow(clippy::cast_precision_loss)] // 0..6 — no precision lost
10463            let base = (i as f32) * 0.1;
10464            let row = Row::new(alloc::vec![
10465                Value::Int(i),
10466                Value::Vector(alloc::vec![base, base + 0.05, base + 0.1]),
10467            ]);
10468            t.insert(row).unwrap();
10469        }
10470        t.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
10471            .unwrap();
10472        let original = match &cat.get("docs").unwrap().indices()[0].kind {
10473            IndexKind::Nsw(g) => g.clone(),
10474            IndexKind::BTree(_)
10475            | IndexKind::Brin { .. }
10476            | IndexKind::Gin(_)
10477            | IndexKind::GinTrgm(_)
10478            | IndexKind::GinFulltext(_) => {
10479                panic!("expected NSW")
10480            }
10481        };
10482        let bytes = cat.serialize();
10483        let restored = Catalog::deserialize(&bytes).expect("deserialize");
10484        let restored_graph = match &restored.get("docs").unwrap().indices()[0].kind {
10485            IndexKind::Nsw(g) => g.clone(),
10486            IndexKind::BTree(_)
10487            | IndexKind::Brin { .. }
10488            | IndexKind::Gin(_)
10489            | IndexKind::GinTrgm(_)
10490            | IndexKind::GinFulltext(_) => {
10491                panic!("expected NSW")
10492            }
10493        };
10494        assert_eq!(restored_graph.m, original.m);
10495        assert_eq!(restored_graph.m_max_0, original.m_max_0);
10496        assert_eq!(restored_graph.entry, original.entry);
10497        assert_eq!(restored_graph.entry_level, original.entry_level);
10498        assert_eq!(restored_graph.levels, original.levels);
10499        assert_eq!(restored_graph.layers, original.layers);
10500    }
10501
10502    #[test]
10503    fn hnsw_level_assignment_is_deterministic() {
10504        // Same row index always produces the same level — the topology
10505        // must be reproducible (matters for serialize round-trip).
10506        for i in 0..32usize {
10507            assert_eq!(nsw_assign_level(i), nsw_assign_level(i));
10508        }
10509    }
10510
10511    #[test]
10512    fn hnsw_layer_0_dominates_population() {
10513        // Sanity: out of N inserts, the vast majority should land on
10514        // layer 0. The 4-bit-clear promotion rule gives roughly 1/16
10515        // promotion to layer ≥ 1, so under 50 nodes we expect ~3 on
10516        // layer ≥ 1 and the rest on layer 0.
10517        let on_zero = (0..200usize).filter(|&i| nsw_assign_level(i) == 0).count();
10518        assert!(on_zero > 150, "level-0 nodes too few: {on_zero}");
10519    }
10520
10521    #[test]
10522    fn hnsw_search_matches_brute_force_for_l2_top1() {
10523        // Build a small dataset, query it, and confirm the top result
10524        // matches the brute-force nearest by L2. Topology variability
10525        // shouldn't break recall at k=1 for well-separated vectors.
10526        let mut cat = Catalog::new();
10527        cat.create_table(TableSchema::new(
10528            "vecs",
10529            alloc::vec![
10530                ColumnSchema::new("id", DataType::Int, false),
10531                ColumnSchema::new(
10532                    "v",
10533                    DataType::Vector {
10534                        dim: 3,
10535                        encoding: VecEncoding::F32
10536                    },
10537                    true
10538                ),
10539            ],
10540        ))
10541        .unwrap();
10542        let t = cat.get_mut("vecs").unwrap();
10543        let dataset: alloc::vec::Vec<(i32, [f32; 3])> = alloc::vec![
10544            (1, [0.0, 0.0, 0.0]),
10545            (2, [1.0, 0.0, 0.0]),
10546            (3, [0.0, 1.0, 0.0]),
10547            (4, [0.0, 0.0, 1.0]),
10548            (5, [1.0, 1.0, 0.0]),
10549            (6, [1.0, 0.0, 1.0]),
10550            (7, [0.0, 1.0, 1.0]),
10551            (8, [1.0, 1.0, 1.0]),
10552            (9, [0.5, 0.5, 0.5]),
10553            (10, [0.2, 0.8, 0.5]),
10554        ];
10555        for &(id, v) in &dataset {
10556            t.insert(Row::new(alloc::vec![
10557                Value::Int(id),
10558                Value::Vector(alloc::vec![v[0], v[1], v[2]]),
10559            ]))
10560            .unwrap();
10561        }
10562        t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
10563        let idx_pos = cat
10564            .get("vecs")
10565            .unwrap()
10566            .indices()
10567            .iter()
10568            .position(|i| i.name == "v_idx")
10569            .unwrap();
10570        for query in [[0.4, 0.4, 0.4], [0.9, 0.1, 0.0], [0.0, 0.9, 0.9]] {
10571            let table = cat.get("vecs").unwrap();
10572            let hnsw_top = nsw_search(table, idx_pos, &query, 1, 16, NswMetric::L2);
10573            let mut brute: alloc::vec::Vec<(f32, usize)> = (0..table.rows.len())
10574                .map(|i| {
10575                    let Value::Vector(v) = &table.rows[i].values[1] else {
10576                        return (f32::INFINITY, i);
10577                    };
10578                    (l2_distance_sq(v, &query), i)
10579                })
10580                .collect();
10581            brute.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
10582            assert!(!hnsw_top.is_empty(), "HNSW returned no results");
10583            assert_eq!(
10584                hnsw_top[0].1, brute[0].1,
10585                "HNSW top-1 != brute-force top-1 for {query:?}"
10586            );
10587        }
10588    }
10589
10590    #[test]
10591    fn serialize_table_with_rows_round_trips() {
10592        let mut cat = Catalog::new();
10593        cat.create_table(make_users_schema()).unwrap();
10594        let t = cat.get_mut("users").unwrap();
10595        t.insert(Row::new(vec![
10596            Value::Int(1),
10597            Value::Text("alice".into()),
10598            Value::Float(95.5),
10599        ]))
10600        .unwrap();
10601        t.insert(Row::new(vec![
10602            Value::Int(2),
10603            Value::Text("bob".into()),
10604            Value::Null,
10605        ]))
10606        .unwrap();
10607        assert_round_trip(&cat);
10608    }
10609
10610    #[test]
10611    fn serialize_multiple_tables_round_trips() {
10612        let mut cat = Catalog::new();
10613        cat.create_table(make_users_schema()).unwrap();
10614        cat.create_table(TableSchema::new(
10615            "flags",
10616            vec![
10617                ColumnSchema::new("id", DataType::BigInt, false),
10618                ColumnSchema::new("active", DataType::Bool, false),
10619            ],
10620        ))
10621        .unwrap();
10622        cat.get_mut("flags")
10623            .unwrap()
10624            .insert(Row::new(vec![Value::BigInt(7), Value::Bool(true)]))
10625            .unwrap();
10626        assert_round_trip(&cat);
10627    }
10628
10629    #[test]
10630    fn deserialize_rejects_bad_magic() {
10631        let mut buf = b"BADMAGIC".to_vec();
10632        buf.push(FILE_VERSION);
10633        buf.extend_from_slice(&0u32.to_le_bytes());
10634        let err = Catalog::deserialize(&buf).unwrap_err();
10635        assert!(matches!(err, StorageError::Corrupt(_)));
10636    }
10637
10638    #[test]
10639    fn deserialize_rejects_unsupported_version() {
10640        let mut buf = FILE_MAGIC.to_vec();
10641        buf.push(99); // future version
10642        buf.extend_from_slice(&0u32.to_le_bytes());
10643        let err = Catalog::deserialize(&buf).unwrap_err();
10644        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("version")));
10645    }
10646
10647    #[test]
10648    fn deserialize_rejects_truncated_file() {
10649        let mut cat = Catalog::new();
10650        cat.create_table(make_users_schema()).unwrap();
10651        let bytes = cat.serialize();
10652        // Drop the last byte to simulate truncation.
10653        let truncated = &bytes[..bytes.len() - 1];
10654        assert!(matches!(
10655            Catalog::deserialize(truncated),
10656            Err(StorageError::Corrupt(_))
10657        ));
10658    }
10659
10660    #[test]
10661    fn deserialize_rejects_trailing_garbage() {
10662        let cat = Catalog::new();
10663        let mut bytes = cat.serialize();
10664        bytes.push(0xFF);
10665        assert!(matches!(
10666            Catalog::deserialize(&bytes),
10667            Err(StorageError::Corrupt(ref s)) if s.contains("trailing")
10668        ));
10669    }
10670
10671    // --- v0.8 indices ------------------------------------------------------
10672
10673    fn populated_users() -> Catalog {
10674        let mut cat = Catalog::new();
10675        cat.create_table(make_users_schema()).unwrap();
10676        let t = cat.get_mut("users").unwrap();
10677        for (id, name, score) in [
10678            (1, "alice", Some(90.0)),
10679            (2, "bob", None),
10680            (3, "alice", Some(70.0)), // duplicate name → maps to two row idxs
10681        ] {
10682            t.insert(Row::new(vec![
10683                Value::Int(id),
10684                Value::Text(name.into()),
10685                score.map_or(Value::Null, Value::Float),
10686            ]))
10687            .unwrap();
10688        }
10689        cat
10690    }
10691
10692    #[test]
10693    fn add_index_builds_from_existing_rows() {
10694        let mut cat = populated_users();
10695        cat.get_mut("users")
10696            .unwrap()
10697            .add_index("by_id".into(), "id")
10698            .unwrap();
10699        let t = cat.get("users").unwrap();
10700        let idx = t.index_on(0).expect("index_on(0)");
10701        assert_eq!(idx.lookup_eq(&IndexKey::Int(2)), &[RowLocator::Hot(1)]);
10702        assert_eq!(idx.lookup_eq(&IndexKey::Int(99)), &[] as &[RowLocator]);
10703    }
10704
10705    #[test]
10706    fn add_index_dup_name_rejected() {
10707        let mut cat = populated_users();
10708        let t = cat.get_mut("users").unwrap();
10709        t.add_index("ix".into(), "id").unwrap();
10710        let err = t.add_index("ix".into(), "name").unwrap_err();
10711        assert!(matches!(err, StorageError::DuplicateIndex { ref name } if name == "ix"));
10712    }
10713
10714    #[test]
10715    fn add_index_unknown_column_rejected() {
10716        let mut cat = populated_users();
10717        let err = cat
10718            .get_mut("users")
10719            .unwrap()
10720            .add_index("ix".into(), "ghost")
10721            .unwrap_err();
10722        assert!(matches!(err, StorageError::ColumnNotFound { ref column } if column == "ghost"));
10723    }
10724
10725    #[test]
10726    fn insert_after_create_index_updates_it() {
10727        let mut cat = populated_users();
10728        let t = cat.get_mut("users").unwrap();
10729        t.add_index("by_name".into(), "name").unwrap();
10730        t.insert(Row::new(vec![
10731            Value::Int(4),
10732            Value::Text("dave".into()),
10733            Value::Null,
10734        ]))
10735        .unwrap();
10736        let idx = t.index_on(1).unwrap();
10737        assert_eq!(
10738            idx.lookup_eq(&IndexKey::Text("dave".into())),
10739            &[RowLocator::Hot(3)]
10740        );
10741        // Pre-existing duplicates remain mapped to the two original row idxs.
10742        assert_eq!(
10743            idx.lookup_eq(&IndexKey::Text("alice".into())),
10744            &[RowLocator::Hot(0), RowLocator::Hot(2)]
10745        );
10746    }
10747
10748    #[test]
10749    fn null_or_float_values_are_not_indexed() {
10750        let mut cat = populated_users();
10751        let t = cat.get_mut("users").unwrap();
10752        t.add_index("by_score".into(), "score").unwrap();
10753        let idx = t.index_on(2).unwrap();
10754        // bob's score is NULL → no entry for bob.
10755        // Score is Float → the spec says we don't index NaN-prone columns,
10756        // so even the present scores are absent. Lookups via IndexKey::Int(90)
10757        // mis-match the column type and trivially find nothing.
10758        assert_eq!(idx.lookup_eq(&IndexKey::Int(90)), &[] as &[RowLocator]);
10759    }
10760
10761    // --- v0.11 vector type -------------------------------------------------
10762
10763    #[test]
10764    fn vector_value_data_type_carries_dim() {
10765        let v = Value::Vector(vec![1.0, 2.0, 3.0]);
10766        assert_eq!(
10767            v.data_type(),
10768            Some(DataType::Vector {
10769                dim: 3,
10770                encoding: VecEncoding::F32
10771            })
10772        );
10773    }
10774
10775    #[test]
10776    fn vector_column_insert_matching_dim_ok() {
10777        let mut cat = Catalog::new();
10778        cat.create_table(TableSchema::new(
10779            "emb",
10780            vec![ColumnSchema::new(
10781                "v",
10782                DataType::Vector {
10783                    dim: 3,
10784                    encoding: VecEncoding::F32,
10785                },
10786                false,
10787            )],
10788        ))
10789        .unwrap();
10790        cat.get_mut("emb")
10791            .unwrap()
10792            .insert(Row::new(vec![Value::Vector(vec![1.0, 2.0, 3.0])]))
10793            .unwrap();
10794    }
10795
10796    #[test]
10797    fn vector_column_insert_dim_mismatch_rejected() {
10798        let mut cat = Catalog::new();
10799        cat.create_table(TableSchema::new(
10800            "emb",
10801            vec![ColumnSchema::new(
10802                "v",
10803                DataType::Vector {
10804                    dim: 3,
10805                    encoding: VecEncoding::F32,
10806                },
10807                false,
10808            )],
10809        ))
10810        .unwrap();
10811        let err = cat
10812            .get_mut("emb")
10813            .unwrap()
10814            .insert(Row::new(vec![Value::Vector(vec![1.0, 2.0])]))
10815            .unwrap_err();
10816        assert!(matches!(err, StorageError::TypeMismatch { .. }));
10817    }
10818
10819    #[test]
10820    fn vector_value_survives_catalog_round_trip() {
10821        let mut cat = Catalog::new();
10822        cat.create_table(TableSchema::new(
10823            "emb",
10824            vec![
10825                ColumnSchema::new("id", DataType::Int, false),
10826                ColumnSchema::new(
10827                    "v",
10828                    DataType::Vector {
10829                        dim: 4,
10830                        encoding: VecEncoding::F32,
10831                    },
10832                    false,
10833                ),
10834            ],
10835        ))
10836        .unwrap();
10837        cat.get_mut("emb")
10838            .unwrap()
10839            .insert(Row::new(vec![
10840                Value::Int(1),
10841                Value::Vector(vec![0.5, -1.25, 3.0, 7.0]),
10842            ]))
10843            .unwrap();
10844        let restored = Catalog::deserialize(&cat.serialize()).expect("round-trip");
10845        let table = restored.get("emb").unwrap();
10846        assert_eq!(
10847            table.schema().columns[1].ty,
10848            DataType::Vector {
10849                dim: 4,
10850                encoding: VecEncoding::F32
10851            }
10852        );
10853        assert_eq!(
10854            table.rows()[0].values[1],
10855            Value::Vector(vec![0.5, -1.25, 3.0, 7.0])
10856        );
10857    }
10858
10859    #[test]
10860    fn index_survives_serialize_deserialize_round_trip() {
10861        let mut cat = populated_users();
10862        cat.get_mut("users")
10863            .unwrap()
10864            .add_index("by_name".into(), "name")
10865            .unwrap();
10866        let restored = Catalog::deserialize(&cat.serialize()).unwrap();
10867        let idx = restored
10868            .get("users")
10869            .unwrap()
10870            .index_on(1)
10871            .expect("index_on(1) after restore");
10872        assert_eq!(idx.name, "by_name");
10873        // Data was rebuilt from rows, not deserialized directly.
10874        assert_eq!(
10875            idx.lookup_eq(&IndexKey::Text("alice".into())),
10876            &[RowLocator::Hot(0), RowLocator::Hot(2)]
10877        );
10878    }
10879
10880    // --- v5.1 cold-tier integration tests ----------------------
10881
10882    /// Schema with a BIGINT PK column matching what the v5.1 cold-
10883    /// tier path supports (`IndexKey::Int` → `u64` cast).
10884    fn bigint_pk_users_schema() -> TableSchema {
10885        TableSchema::new(
10886            "users",
10887            vec![
10888                ColumnSchema::new("id", DataType::BigInt, false),
10889                ColumnSchema::new("name", DataType::Text, false),
10890            ],
10891        )
10892    }
10893
10894    fn make_user_row(id: i64, name: &str) -> Row {
10895        Row::new(vec![Value::BigInt(id), Value::Text(name.into())])
10896    }
10897
10898    // v7.20 P4 — update_row incremental index maintenance.
10899
10900    #[test]
10901    fn update_row_non_indexed_column_keeps_index_intact() {
10902        let mut cat = Catalog::new();
10903        cat.create_table(bigint_pk_users_schema()).unwrap();
10904        let t = cat.get_mut("users").unwrap();
10905        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
10906            t.insert(make_user_row(id, name)).unwrap();
10907        }
10908        t.add_index("by_id".into(), "id").unwrap();
10909        // Change only the non-indexed `name` column — the by_id
10910        // entry for key 2 must still resolve position 1.
10911        t.update_row(1, vec![Value::BigInt(2), Value::Text("bobby".into())])
10912            .unwrap();
10913        let idx = t.index_on(0).unwrap();
10914        assert_eq!(
10915            idx.lookup_eq(&IndexKey::Int(2)),
10916            &[RowLocator::Hot(1)],
10917            "old key still resolves the in-place position"
10918        );
10919        assert_eq!(t.rows()[1].values[1], Value::Text("bobby".into()));
10920    }
10921
10922    #[test]
10923    fn update_row_indexed_column_moves_entry() {
10924        let mut cat = Catalog::new();
10925        cat.create_table(bigint_pk_users_schema()).unwrap();
10926        let t = cat.get_mut("users").unwrap();
10927        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
10928            t.insert(make_user_row(id, name)).unwrap();
10929        }
10930        t.add_index("by_id".into(), "id").unwrap();
10931        // Change the indexed key 2 → 20.
10932        t.update_row(1, vec![Value::BigInt(20), Value::Text("bob".into())])
10933            .unwrap();
10934        let idx = t.index_on(0).unwrap();
10935        assert!(
10936            idx.lookup_eq(&IndexKey::Int(2)).is_empty(),
10937            "old key entry removed"
10938        );
10939        assert_eq!(
10940            idx.lookup_eq(&IndexKey::Int(20)),
10941            &[RowLocator::Hot(1)],
10942            "new key entry resolves the position"
10943        );
10944        // Untouched neighbours unaffected.
10945        assert_eq!(idx.lookup_eq(&IndexKey::Int(1)), &[RowLocator::Hot(0)]);
10946        assert_eq!(idx.lookup_eq(&IndexKey::Int(3)), &[RowLocator::Hot(2)]);
10947    }
10948
10949    #[test]
10950    fn update_row_duplicate_key_moves_only_target_position() {
10951        let mut cat = Catalog::new();
10952        cat.create_table(bigint_pk_users_schema()).unwrap();
10953        let t = cat.get_mut("users").unwrap();
10954        // Two rows share key 7.
10955        for (id, name) in [(7i64, "a"), (7, "b"), (9, "c")] {
10956            t.insert(make_user_row(id, name)).unwrap();
10957        }
10958        t.add_index("by_id".into(), "id").unwrap();
10959        // Move position 1's key 7 → 8; position 0 must keep its 7.
10960        t.update_row(1, vec![Value::BigInt(8), Value::Text("b".into())])
10961            .unwrap();
10962        let idx = t.index_on(0).unwrap();
10963        assert_eq!(idx.lookup_eq(&IndexKey::Int(7)), &[RowLocator::Hot(0)]);
10964        assert_eq!(idx.lookup_eq(&IndexKey::Int(8)), &[RowLocator::Hot(1)]);
10965        assert_eq!(idx.lookup_eq(&IndexKey::Int(9)), &[RowLocator::Hot(2)]);
10966    }
10967
10968    #[test]
10969    fn update_row_null_transition_on_indexed_nullable_column() {
10970        let mut cat = Catalog::new();
10971        cat.create_table(TableSchema::new(
10972            "n",
10973            vec![
10974                ColumnSchema::new("id", DataType::BigInt, false),
10975                ColumnSchema::new("tag", DataType::BigInt, true),
10976            ],
10977        ))
10978        .unwrap();
10979        let t = cat.get_mut("n").unwrap();
10980        t.insert(Row::new(vec![Value::BigInt(1), Value::BigInt(5)]))
10981            .unwrap();
10982        t.add_index("by_tag".into(), "tag").unwrap();
10983        // 5 → NULL: entry leaves the index (NULL never enters a B-tree).
10984        t.update_row(0, vec![Value::BigInt(1), Value::Null])
10985            .unwrap();
10986        let idx = t.index_on(1).unwrap();
10987        assert!(idx.lookup_eq(&IndexKey::Int(5)).is_empty());
10988        // NULL → 6: entry re-enters under the new key.
10989        t.update_row(0, vec![Value::BigInt(1), Value::BigInt(6)])
10990            .unwrap();
10991        let idx = t.index_on(1).unwrap();
10992        assert_eq!(idx.lookup_eq(&IndexKey::Int(6)), &[RowLocator::Hot(0)]);
10993    }
10994
10995    #[test]
10996    fn lookup_by_pk_finds_row_via_hot_index() {
10997        let mut cat = Catalog::new();
10998        cat.create_table(bigint_pk_users_schema()).unwrap();
10999        let t = cat.get_mut("users").unwrap();
11000        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
11001            t.insert(make_user_row(id, name)).unwrap();
11002        }
11003        t.add_index("by_id".into(), "id").unwrap();
11004        // All locators are Hot; cold_segments is empty.
11005        let got = cat
11006            .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
11007            .unwrap();
11008        assert_eq!(got, make_user_row(2, "bob"));
11009        assert_eq!(cat.cold_segment_count(), 0);
11010    }
11011
11012    #[test]
11013    fn lookup_by_pk_returns_none_when_key_missing() {
11014        let mut cat = Catalog::new();
11015        cat.create_table(bigint_pk_users_schema()).unwrap();
11016        let t = cat.get_mut("users").unwrap();
11017        t.insert(make_user_row(1, "alice")).unwrap();
11018        t.add_index("by_id".into(), "id").unwrap();
11019        assert!(
11020            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(999))
11021                .is_none()
11022        );
11023        // Also: unknown table / unknown index name.
11024        assert!(
11025            cat.lookup_by_pk("other_table", "by_id", &IndexKey::Int(1))
11026                .is_none()
11027        );
11028        assert!(
11029            cat.lookup_by_pk("users", "no_such_index", &IndexKey::Int(1))
11030                .is_none()
11031        );
11032    }
11033
11034    #[test]
11035    fn lookup_by_pk_resolves_cold_locator_via_loaded_segment() {
11036        // Build a cold-tier segment whose payloads are dense-encoded
11037        // BIGINT rows. Wire each PK into the BTree index as a Cold
11038        // locator. The hot tier carries no rows for those PKs.
11039        let mut cat = Catalog::new();
11040        cat.create_table(bigint_pk_users_schema()).unwrap();
11041        let t = cat.get_mut("users").unwrap();
11042        t.add_index("by_id".into(), "id").unwrap();
11043        let schema = t.schema.clone();
11044
11045        let cold_rows: Vec<(i64, &str)> =
11046            vec![(100, "ivy"), (200, "joe"), (300, "kim"), (400, "lin")];
11047        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
11048            .iter()
11049            .map(|(id, name)| {
11050                let row = make_user_row(*id, name);
11051                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
11052            })
11053            .collect();
11054        let (seg_bytes, _meta) =
11055            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
11056        let seg_id = cat.load_segment_bytes(seg_bytes).unwrap();
11057        assert_eq!(seg_id, 0);
11058        assert_eq!(cat.cold_segment_count(), 1);
11059
11060        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
11061            .iter()
11062            .map(|(id, _)| {
11063                (
11064                    IndexKey::Int(*id),
11065                    RowLocator::Cold {
11066                        segment_id: seg_id,
11067                        page_offset: 0,
11068                    },
11069                )
11070            })
11071            .collect();
11072        let registered = cat
11073            .get_mut("users")
11074            .unwrap()
11075            .register_cold_locators("by_id", pairs)
11076            .unwrap();
11077        assert_eq!(registered, 4);
11078
11079        for (id, name) in &cold_rows {
11080            let got = cat
11081                .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
11082                .unwrap_or_else(|| panic!("cold key {id} not found"));
11083            assert_eq!(got, make_user_row(*id, name));
11084        }
11085        // Cold key that isn't in the segment must return None.
11086        assert!(
11087            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(999))
11088                .is_none()
11089        );
11090    }
11091
11092    #[test]
11093    fn lookup_by_pk_mixes_hot_and_cold_tiers() {
11094        // Half the rows live in the hot tier (Table::rows + add_index
11095        // produces Hot locators); half live in a cold segment and have
11096        // Cold locators wired manually. Each lookup hits the right tier.
11097        let mut cat = Catalog::new();
11098        cat.create_table(bigint_pk_users_schema()).unwrap();
11099        let t = cat.get_mut("users").unwrap();
11100        for (id, name) in [(1i64, "alice"), (2, "bob")] {
11101            t.insert(make_user_row(id, name)).unwrap();
11102        }
11103        t.add_index("by_id".into(), "id").unwrap();
11104        let schema = t.schema.clone();
11105
11106        let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe")];
11107        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
11108            .iter()
11109            .map(|(id, name)| {
11110                let row = make_user_row(*id, name);
11111                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
11112            })
11113            .collect();
11114        let (seg_bytes, _) =
11115            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
11116        let seg_id = cat.load_segment_bytes(seg_bytes).unwrap();
11117        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
11118            .iter()
11119            .map(|(id, _)| {
11120                (
11121                    IndexKey::Int(*id),
11122                    RowLocator::Cold {
11123                        segment_id: seg_id,
11124                        page_offset: 0,
11125                    },
11126                )
11127            })
11128            .collect();
11129        cat.get_mut("users")
11130            .unwrap()
11131            .register_cold_locators("by_id", pairs)
11132            .unwrap();
11133
11134        // Hot tier hits.
11135        assert_eq!(
11136            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
11137                .unwrap(),
11138            make_user_row(1, "alice")
11139        );
11140        assert_eq!(
11141            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
11142                .unwrap(),
11143            make_user_row(2, "bob")
11144        );
11145        // Cold tier hits.
11146        assert_eq!(
11147            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(100))
11148                .unwrap(),
11149            make_user_row(100, "ivy")
11150        );
11151        assert_eq!(
11152            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(200))
11153                .unwrap(),
11154            make_user_row(200, "joe")
11155        );
11156        // Miss in both tiers.
11157        assert!(
11158            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(50))
11159                .is_none()
11160        );
11161    }
11162
11163    #[test]
11164    fn register_cold_locators_rejects_nsw_index() {
11165        let mut cat = Catalog::new();
11166        cat.create_table(TableSchema::new(
11167            "vecs",
11168            vec![
11169                ColumnSchema::new("id", DataType::Int, false),
11170                ColumnSchema::new(
11171                    "v",
11172                    DataType::Vector {
11173                        dim: 4,
11174                        encoding: VecEncoding::F32,
11175                    },
11176                    false,
11177                ),
11178            ],
11179        ))
11180        .unwrap();
11181        let t = cat.get_mut("vecs").unwrap();
11182        t.insert(Row::new(vec![
11183            Value::Int(1),
11184            Value::Vector(vec![1.0, 0.0, 0.0, 0.0]),
11185        ]))
11186        .unwrap();
11187        t.add_nsw_index("by_v".into(), "v", NSW_DEFAULT_M).unwrap();
11188        let err = t
11189            .register_cold_locators(
11190                "by_v",
11191                vec![(
11192                    IndexKey::Int(1),
11193                    RowLocator::Cold {
11194                        segment_id: 0,
11195                        page_offset: 0,
11196                    },
11197                )],
11198            )
11199            .unwrap_err();
11200        // v6.7.1: message switched from "is NSW" to "is not BTree"
11201        // when the Brin variant was added.
11202        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("not BTree")));
11203    }
11204
11205    #[test]
11206    fn load_segment_bytes_rejects_garbage() {
11207        let mut cat = Catalog::new();
11208        let err = cat.load_segment_bytes(vec![0u8; 10]).unwrap_err();
11209        assert!(matches!(err, StorageError::Corrupt(ref s) if s.contains("segment")));
11210        // Loader doesn't mutate state on error.
11211        assert_eq!(cat.cold_segment_count(), 0);
11212    }
11213
11214    #[test]
11215    fn load_segment_bytes_returns_sequential_ids() {
11216        let mut cat = Catalog::new();
11217        cat.create_table(bigint_pk_users_schema()).unwrap();
11218        let schema = cat.get("users").unwrap().schema.clone();
11219        for batch in 0u32..3 {
11220            let rows: Vec<(u64, Vec<u8>)> = (0u64..4)
11221                .map(|i| {
11222                    let id = u64::from(batch) * 100 + i;
11223                    let row = make_user_row(id.cast_signed(), "x");
11224                    (id, encode_row_body_dense(&row, &schema))
11225                })
11226                .collect();
11227            let (bytes, _) = encode_segment(rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
11228            assert_eq!(cat.load_segment_bytes(bytes).unwrap(), batch);
11229        }
11230        assert_eq!(cat.cold_segment_count(), 3);
11231    }
11232
11233    // --- v5.2 catalog format v9 ----------------------------------
11234
11235    /// Hand-craft a v8 catalog byte stream and confirm the v9 reader
11236    /// accepts it and surfaces every `BTree` entry as a Hot locator.
11237    /// Guards the backward-compat read path: existing v3.0.2 / v4.x
11238    /// snapshots on disk must keep loading after the v5.2 bump.
11239    #[test]
11240    fn v8_catalog_decodes_as_all_hot_under_v9_reader() {
11241        // Build a populated catalog in memory, snapshot it with the
11242        // v9 serializer, then patch the version byte back to 8 and
11243        // strip the v9 BTree payload bytes so the layout matches what
11244        // a real v8 snapshot would have produced on disk. The v9
11245        // reader's version dispatch path then rebuilds the index
11246        // from rows (every locator becomes Hot).
11247        let mut cat = populated_users();
11248        cat.get_mut("users")
11249            .unwrap()
11250            .add_index("by_name".into(), "name")
11251            .unwrap();
11252
11253        // To produce a faithful v8 byte stream we re-encode the same
11254        // catalog with the v8 layout: identical bytes up to (and
11255        // including) the per-index kind tag, but no inline BTree
11256        // entries.
11257        let v8_bytes = encode_as_v8(&cat);
11258        assert_eq!(v8_bytes[FILE_MAGIC.len()], 8, "version byte must be 8");
11259
11260        let restored = Catalog::deserialize(&v8_bytes).expect("v9 reader accepts v8 stream");
11261        let idx = restored
11262            .get("users")
11263            .unwrap()
11264            .index_on(1)
11265            .expect("index_on(1) after restore");
11266        // v8 path always materialises Hot locators (no cold tier
11267        // existed pre-v5.2).
11268        assert_eq!(
11269            idx.lookup_eq(&IndexKey::Text("alice".into())),
11270            &[RowLocator::Hot(0), RowLocator::Hot(2)]
11271        );
11272        // No accidental Cold leak.
11273        for entry in idx.lookup_eq(&IndexKey::Text("alice".into())) {
11274            assert!(entry.is_hot(), "v8 → v9 read must yield Hot only");
11275        }
11276    }
11277
11278    /// Encode `cat` using the v8 layout (no inline `BTree` entries,
11279    /// version byte = 8). Pure test helper — duplicates just enough
11280    /// of `Catalog::serialize` to produce a faithful v8 stream that
11281    /// real v3.0.2 / v4.x deployments wrote.
11282    fn encode_as_v8(cat: &Catalog) -> Vec<u8> {
11283        let mut out = Vec::with_capacity(64);
11284        out.extend_from_slice(FILE_MAGIC);
11285        out.push(8u8);
11286        write_u32(&mut out, u32::try_from(cat.tables.len()).unwrap());
11287        for t in &cat.tables {
11288            write_str(&mut out, &t.schema.name);
11289            write_u16(&mut out, u16::try_from(t.schema.columns.len()).unwrap());
11290            for c in &t.schema.columns {
11291                write_str(&mut out, &c.name);
11292                write_data_type(&mut out, c.ty);
11293                out.push(u8::from(c.nullable));
11294                match &c.default {
11295                    None => out.push(0),
11296                    Some(v) => {
11297                        out.push(1);
11298                        write_value(&mut out, v);
11299                    }
11300                }
11301                out.push(u8::from(c.auto_increment));
11302            }
11303            write_u32(&mut out, u32::try_from(t.rows.len()).unwrap());
11304            for row in &t.rows {
11305                out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
11306            }
11307            write_u16(&mut out, u16::try_from(t.indices.len()).unwrap());
11308            for idx in &t.indices {
11309                write_str(&mut out, &idx.name);
11310                write_u16(&mut out, u16::try_from(idx.column_position).unwrap());
11311                match &idx.kind {
11312                    // v8 BTree wrote only the kind tag; entries
11313                    // rebuild from rows on read.
11314                    IndexKind::BTree(_) => out.push(0),
11315                    IndexKind::Nsw(g) => {
11316                        out.push(1);
11317                        write_u16(&mut out, u16::try_from(g.m).unwrap());
11318                        write_nsw_graph(&mut out, g);
11319                    }
11320                    // v8 had no BRIN / GIN; this test-only writer
11321                    // can't serialise either into the legacy format.
11322                    IndexKind::Brin { .. } => panic!(
11323                        "v8 catalog writer cannot serialise BRIN — \
11324                         tests with BRIN indices must use the current writer"
11325                    ),
11326                    IndexKind::Gin(_) => panic!(
11327                        "v8 catalog writer cannot serialise GIN — \
11328                         tests with GIN indices must use the current writer"
11329                    ),
11330                    IndexKind::GinTrgm(_) => panic!(
11331                        "v8 catalog writer cannot serialise trigram-GIN — \
11332                         tests with trgm indices must use the current writer"
11333                    ),
11334                    IndexKind::GinFulltext(_) => panic!(
11335                        "v8 catalog writer cannot serialise fulltext-GIN — \
11336                         tests with FULLTEXT KEY must use the current writer"
11337                    ),
11338                }
11339            }
11340        }
11341        out
11342    }
11343
11344    /// Build a catalog that carries both hot and cold locators on a
11345    /// `BTree` index, snapshot it through `serialize`, then deserialise
11346    /// and confirm every Cold locator round-trips byte-identical and
11347    /// `lookup_by_pk` resolves through the rebuilt cold-segment
11348    /// registry.
11349    #[test]
11350    fn v9_catalog_round_trip_preserves_cold_locators() {
11351        let mut cat = Catalog::new();
11352        cat.create_table(bigint_pk_users_schema()).unwrap();
11353        let t = cat.get_mut("users").unwrap();
11354        // Hot rows: 1, 2
11355        for (id, name) in [(1i64, "alice"), (2, "bob")] {
11356            t.insert(make_user_row(id, name)).unwrap();
11357        }
11358        t.add_index("by_id".into(), "id").unwrap();
11359        let schema = t.schema.clone();
11360
11361        // Cold rows: 100, 200, 300 — sit in a single segment.
11362        let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe"), (300, "kim")];
11363        let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
11364            .iter()
11365            .map(|(id, name)| {
11366                let row = make_user_row(*id, name);
11367                ((*id).cast_unsigned(), encode_row_body_dense(&row, &schema))
11368            })
11369            .collect();
11370        let (seg_bytes, _) =
11371            encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
11372        let seg_id = cat.load_segment_bytes(seg_bytes.clone()).unwrap();
11373        let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
11374            .iter()
11375            .map(|(id, _)| {
11376                (
11377                    IndexKey::Int(*id),
11378                    RowLocator::Cold {
11379                        segment_id: seg_id,
11380                        page_offset: 0,
11381                    },
11382                )
11383            })
11384            .collect();
11385        cat.get_mut("users")
11386            .unwrap()
11387            .register_cold_locators("by_id", pairs)
11388            .unwrap();
11389
11390        // Snapshot + restore via the v9 codec.
11391        let bytes = cat.serialize();
11392        assert_eq!(bytes[FILE_MAGIC.len()], FILE_VERSION);
11393        let mut restored = Catalog::deserialize(&bytes).expect("v9 round-trip parses");
11394
11395        // Catalog::serialize does not yet emit cold segment file
11396        // bytes (v5.3 manifest is the future home for that). For
11397        // this v9 test the caller side-loads the segment again so
11398        // lookup_by_pk can resolve the Cold locator. The point of
11399        // this assertion is that the locator metadata survived the
11400        // catalog round-trip.
11401        let restored_seg_id = restored.load_segment_bytes(seg_bytes).unwrap();
11402        assert_eq!(restored_seg_id, seg_id);
11403
11404        let idx = restored.get("users").unwrap().index_on(0).unwrap();
11405        // Hot locators round-trip.
11406        assert_eq!(idx.lookup_eq(&IndexKey::Int(1)), &[RowLocator::Hot(0)]);
11407        assert_eq!(idx.lookup_eq(&IndexKey::Int(2)), &[RowLocator::Hot(1)]);
11408        // Cold locators round-trip byte-identical.
11409        for (id, _) in &cold_rows {
11410            assert_eq!(
11411                idx.lookup_eq(&IndexKey::Int(*id)),
11412                &[RowLocator::Cold {
11413                    segment_id: seg_id,
11414                    page_offset: 0,
11415                }]
11416            );
11417        }
11418        // End-to-end: lookup_by_pk resolves both tiers.
11419        assert_eq!(
11420            restored
11421                .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
11422                .unwrap(),
11423            make_user_row(2, "bob")
11424        );
11425        for (id, name) in &cold_rows {
11426            assert_eq!(
11427                restored
11428                    .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
11429                    .unwrap(),
11430                make_user_row(*id, name)
11431            );
11432        }
11433    }
11434
11435    // --- v5.2.1 hot tier byte tracking ---------------------------
11436
11437    /// `row_body_encoded_len` is the perf-critical fast path; pin it
11438    /// against `encode_row_body_dense(...).len()` for every
11439    /// representative cell type so an encoder change can't silently
11440    /// desync the counter.
11441    #[test]
11442    fn row_body_encoded_len_matches_actual_encode_for_all_types() {
11443        let schema = TableSchema::new(
11444            "wide",
11445            vec![
11446                ColumnSchema::new("a", DataType::SmallInt, true),
11447                ColumnSchema::new("b", DataType::Int, false),
11448                ColumnSchema::new("c", DataType::BigInt, false),
11449                ColumnSchema::new("d", DataType::Float, false),
11450                ColumnSchema::new("e", DataType::Bool, false),
11451                ColumnSchema::new("f", DataType::Text, false),
11452                ColumnSchema::new(
11453                    "g",
11454                    DataType::Vector {
11455                        dim: 3,
11456                        encoding: VecEncoding::F32,
11457                    },
11458                    false,
11459                ),
11460                ColumnSchema::new(
11461                    "h",
11462                    DataType::Numeric {
11463                        precision: 18,
11464                        scale: 2,
11465                    },
11466                    false,
11467                ),
11468                ColumnSchema::new("i", DataType::Date, false),
11469                ColumnSchema::new("j", DataType::Timestamp, false),
11470            ],
11471        );
11472        let cases: &[Row] = &[
11473            Row::new(vec![
11474                Value::SmallInt(7),
11475                Value::Int(42),
11476                Value::BigInt(1_000_000),
11477                Value::Float(1.5),
11478                Value::Bool(true),
11479                Value::Text("hello".into()),
11480                Value::Vector(vec![1.0, 2.0, 3.0]),
11481                Value::Numeric {
11482                    scaled: 12345,
11483                    scale: 2,
11484                },
11485                Value::Date(20_000),
11486                Value::Timestamp(1_700_000_000_000_000),
11487            ]),
11488            // NULL in the bitmap, varied text length.
11489            Row::new(vec![
11490                Value::Null,
11491                Value::Int(0),
11492                Value::BigInt(0),
11493                Value::Float(0.0),
11494                Value::Bool(false),
11495                Value::Text(String::new()),
11496                Value::Vector(vec![]),
11497                Value::Numeric {
11498                    scaled: 0,
11499                    scale: 2,
11500                },
11501                Value::Date(0),
11502                Value::Timestamp(0),
11503            ]),
11504            Row::new(vec![
11505                Value::SmallInt(-1),
11506                Value::Int(-1),
11507                Value::BigInt(-1),
11508                Value::Float(-0.5),
11509                Value::Bool(true),
11510                Value::Text("a much longer payload here".into()),
11511                Value::Vector(vec![0.1, 0.2, 0.3]),
11512                Value::Numeric {
11513                    scaled: -999_999_999,
11514                    scale: 2,
11515                },
11516                Value::Date(-1),
11517                Value::Timestamp(-1),
11518            ]),
11519        ];
11520        for row in cases {
11521            let actual = encode_row_body_dense(row, &schema).len();
11522            let fast = row_body_encoded_len(row, &schema);
11523            assert_eq!(actual, fast, "row {row:?}");
11524        }
11525    }
11526
11527    #[test]
11528    fn hot_bytes_grows_on_insert_and_matches_encoded_sum() {
11529        let mut cat = Catalog::new();
11530        cat.create_table(bigint_pk_users_schema()).unwrap();
11531        let t = cat.get_mut("users").unwrap();
11532        assert_eq!(t.hot_bytes(), 0);
11533        let mut expected: u64 = 0;
11534        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
11535            let row = make_user_row(id, name);
11536            expected += encode_row_body_dense(&row, &t.schema).len() as u64;
11537            t.insert(row).unwrap();
11538        }
11539        assert_eq!(t.hot_bytes(), expected);
11540        assert_eq!(cat.hot_tier_bytes(), expected);
11541    }
11542
11543    #[test]
11544    fn hot_bytes_shrinks_on_delete() {
11545        let mut cat = Catalog::new();
11546        cat.create_table(bigint_pk_users_schema()).unwrap();
11547        let t = cat.get_mut("users").unwrap();
11548        for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
11549            t.insert(make_user_row(id, name)).unwrap();
11550        }
11551        let before = t.hot_bytes();
11552        // Delete row at position 1 (bob).
11553        let bob_row = make_user_row(2, "bob");
11554        let bob_bytes = encode_row_body_dense(&bob_row, &t.schema).len() as u64;
11555        let removed = t.delete_rows(&[1]);
11556        assert_eq!(removed, 1);
11557        assert_eq!(t.hot_bytes(), before - bob_bytes);
11558    }
11559
11560    #[test]
11561    fn hot_bytes_diffs_on_update_for_variable_width_columns() {
11562        let mut cat = Catalog::new();
11563        cat.create_table(bigint_pk_users_schema()).unwrap();
11564        let t = cat.get_mut("users").unwrap();
11565        t.insert(make_user_row(1, "alice")).unwrap();
11566        let after_insert = t.hot_bytes();
11567        // Update with a longer text payload — bytes must grow exactly
11568        // by the text-length delta.
11569        let new_row = make_user_row(1, "alice-the-longer-name");
11570        let old_len = encode_row_body_dense(&make_user_row(1, "alice"), &t.schema).len() as u64;
11571        let new_len = encode_row_body_dense(&new_row, &t.schema).len() as u64;
11572        t.update_row(0, new_row.values).unwrap();
11573        assert_eq!(t.hot_bytes(), after_insert - old_len + new_len);
11574        assert!(t.hot_bytes() > after_insert, "longer text grew the counter");
11575    }
11576
11577    #[test]
11578    fn hot_bytes_round_trips_through_serialize_deserialize() {
11579        let mut cat = Catalog::new();
11580        cat.create_table(bigint_pk_users_schema()).unwrap();
11581        let t = cat.get_mut("users").unwrap();
11582        for i in 0..10 {
11583            t.insert(make_user_row(i, &alloc::format!("name-{i}")))
11584                .unwrap();
11585        }
11586        let pre = cat.hot_tier_bytes();
11587        let restored = Catalog::deserialize(&cat.serialize()).unwrap();
11588        assert_eq!(restored.hot_tier_bytes(), pre);
11589        assert_eq!(restored.get("users").unwrap().hot_bytes(), pre);
11590    }
11591
11592    // --- v5.2.2 freezer atomic swap -------------------------------
11593
11594    /// Happy path: freeze the first half of a populated hot tier,
11595    /// confirm row counts shift, `hot_bytes` shrinks, and every frozen
11596    /// PK still resolves via `lookup_by_pk` (now through the cold
11597    /// segment registered by the freeze).
11598    #[test]
11599    fn freeze_oldest_to_cold_moves_rows_and_keeps_lookups_working() {
11600        let mut cat = Catalog::new();
11601        cat.create_table(bigint_pk_users_schema()).unwrap();
11602        let t = cat.get_mut("users").unwrap();
11603        for id in 0..10i64 {
11604            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
11605                .unwrap();
11606        }
11607        t.add_index("by_id".into(), "id").unwrap();
11608        let total_bytes_before = t.hot_bytes();
11609
11610        let report = cat
11611            .freeze_oldest_to_cold("users", "by_id", 6)
11612            .expect("freeze succeeds");
11613        assert_eq!(report.frozen_rows, 6);
11614        assert_eq!(report.segment_id, 0);
11615        assert!(report.bytes_freed > 0);
11616        assert!(!report.segment_bytes.is_empty());
11617
11618        let t = cat.get("users").unwrap();
11619        assert_eq!(t.row_count(), 4, "4 hot rows remain (10 - 6 frozen)");
11620        assert_eq!(cat.cold_segment_count(), 1);
11621        // Hot bytes shrank by exactly the freed amount.
11622        assert_eq!(
11623            t.hot_bytes(),
11624            total_bytes_before - report.bytes_freed,
11625            "hot_bytes accounting matches FreezeReport"
11626        );
11627
11628        // Every original PK still resolves — frozen ones via the
11629        // cold segment, kept ones via the (renumbered) hot tier.
11630        for id in 0..10i64 {
11631            let got = cat
11632                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
11633                .unwrap_or_else(|| panic!("PK {id} disappeared after freeze"));
11634            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
11635        }
11636    }
11637
11638    /// Two successive freezes on the same index must preserve the
11639    /// first batch's cold locators when the second freeze runs.
11640    /// Catches the `rebuild_indices` wipe-Cold-on-delete bug that
11641    /// `collect_cold_locators` / re-register guards against.
11642    #[test]
11643    fn freeze_twice_preserves_prior_cold_locators() {
11644        let mut cat = Catalog::new();
11645        cat.create_table(bigint_pk_users_schema()).unwrap();
11646        let t = cat.get_mut("users").unwrap();
11647        for id in 0..12i64 {
11648            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
11649                .unwrap();
11650        }
11651        t.add_index("by_id".into(), "id").unwrap();
11652
11653        cat.freeze_oldest_to_cold("users", "by_id", 4)
11654            .expect("first freeze ok");
11655        cat.freeze_oldest_to_cold("users", "by_id", 4)
11656            .expect("second freeze ok");
11657
11658        assert_eq!(cat.get("users").unwrap().row_count(), 4);
11659        assert_eq!(cat.cold_segment_count(), 2);
11660        // All 12 PKs still resolve — first 4 via segment 0,
11661        // next 4 via segment 1, last 4 still hot.
11662        for id in 0..12i64 {
11663            let got = cat
11664                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
11665                .unwrap_or_else(|| panic!("PK {id} not resolvable after two freezes"));
11666            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
11667        }
11668    }
11669
11670    /// Validation guard tests. Each must return `Err` and **not
11671    /// mutate the catalog** — the API is all-or-nothing.
11672    #[test]
11673    fn freeze_oldest_to_cold_rejects_invalid_input() {
11674        let mut cat = Catalog::new();
11675        cat.create_table(bigint_pk_users_schema()).unwrap();
11676        let t = cat.get_mut("users").unwrap();
11677        for id in 0..3i64 {
11678            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
11679                .unwrap();
11680        }
11681        t.add_index("by_id".into(), "id").unwrap();
11682
11683        // max_rows == 0
11684        assert!(matches!(
11685            cat.freeze_oldest_to_cold("users", "by_id", 0),
11686            Err(StorageError::Corrupt(_))
11687        ));
11688        // table missing
11689        assert!(matches!(
11690            cat.freeze_oldest_to_cold("missing", "by_id", 1),
11691            Err(StorageError::Corrupt(_))
11692        ));
11693        // index missing
11694        assert!(matches!(
11695            cat.freeze_oldest_to_cold("users", "no_such_index", 1),
11696            Err(StorageError::Corrupt(_))
11697        ));
11698        // max_rows > row_count
11699        assert!(matches!(
11700            cat.freeze_oldest_to_cold("users", "by_id", 999),
11701            Err(StorageError::Corrupt(_))
11702        ));
11703        // Catalog still untouched.
11704        assert_eq!(cat.get("users").unwrap().row_count(), 3);
11705        assert_eq!(cat.cold_segment_count(), 0);
11706    }
11707
11708    /// Freeze with a non-integer PK column must surface a clear
11709    /// error (Text PKs land in v5.5+).
11710    #[test]
11711    fn freeze_oldest_to_cold_rejects_non_integer_pk() {
11712        let mut cat = Catalog::new();
11713        cat.create_table(TableSchema::new(
11714            "by_name",
11715            vec![
11716                ColumnSchema::new("name", DataType::Text, false),
11717                ColumnSchema::new("payload", DataType::BigInt, false),
11718            ],
11719        ))
11720        .unwrap();
11721        let t = cat.get_mut("by_name").unwrap();
11722        t.insert(Row::new(vec![Value::Text("a".into()), Value::BigInt(1)]))
11723            .unwrap();
11724        t.add_index("by_n".into(), "name").unwrap();
11725        let err = cat
11726            .freeze_oldest_to_cold("by_name", "by_n", 1)
11727            .expect_err("non-integer PK rejected");
11728        match err {
11729            StorageError::Corrupt(s) => assert!(
11730                s.contains("non-integer"),
11731                "error message names the constraint: {s}"
11732            ),
11733            other => panic!("expected Corrupt, got {other:?}"),
11734        }
11735        // Catalog untouched.
11736        assert_eq!(cat.get("by_name").unwrap().row_count(), 1);
11737        assert_eq!(cat.cold_segment_count(), 0);
11738    }
11739
11740    /// Hot-tier rows after the freeze must keep their secondary-
11741    /// index lookups working — `delete_rows` shifts positions, and
11742    /// `rebuild_indices` must regenerate Hot locators at the new
11743    /// indices.
11744    #[test]
11745    fn freeze_keeps_remaining_hot_rows_addressable_via_secondary_index() {
11746        let mut cat = Catalog::new();
11747        cat.create_table(bigint_pk_users_schema()).unwrap();
11748        let t = cat.get_mut("users").unwrap();
11749        for id in 0..6i64 {
11750            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
11751                .unwrap();
11752        }
11753        t.add_index("by_id".into(), "id").unwrap();
11754        t.add_index("by_name".into(), "name").unwrap();
11755
11756        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
11757
11758        // Remaining hot rows: id 3, 4, 5. They moved to positions
11759        // 0, 1, 2 inside `self.rows`; the `by_name` index must now
11760        // resolve them via fresh Hot locators.
11761        let idx = cat.get("users").unwrap().index_on(1).unwrap();
11762        let got = idx.lookup_eq(&IndexKey::Text("u-4".into()));
11763        assert_eq!(got.len(), 1);
11764        assert!(got[0].is_hot(), "kept-hot rows still surface as Hot");
11765        match got[0] {
11766            RowLocator::Hot(i) => {
11767                // The 4th-inserted row was at position 4; after
11768                // dropping positions 0..3 it sits at position 1.
11769                assert_eq!(i, 1);
11770            }
11771            RowLocator::Cold { .. } => unreachable!(),
11772        }
11773    }
11774
11775    // --- v5.2.3 promote-on-write primitives ----------------------
11776
11777    /// Build a populated catalog with the first N rows frozen, then
11778    /// run `promote_cold_row` and verify the row crossed tiers
11779    /// correctly: the cold locator is retired, a fresh Hot locator
11780    /// appears, `lookup_by_pk` returns the row from the hot tier, and
11781    /// `hot_bytes` grew by the row's encoded byte length.
11782    #[test]
11783    fn promote_cold_row_pulls_frozen_row_back_to_hot_tier() {
11784        let mut cat = Catalog::new();
11785        cat.create_table(bigint_pk_users_schema()).unwrap();
11786        let t = cat.get_mut("users").unwrap();
11787        for id in 0..6i64 {
11788            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
11789                .unwrap();
11790        }
11791        t.add_index("by_id".into(), "id").unwrap();
11792        // Freeze first 4 rows (ids 0..3). After: hot rows = 4, 5 at
11793        // positions 0, 1; cold locators for keys 0..3.
11794        cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
11795        let hot_bytes_before = cat.get("users").unwrap().hot_bytes();
11796
11797        // Promote PK=2 — it lives in segment 0 as a cold row.
11798        let new_idx = cat
11799            .promote_cold_row("users", "by_id", &IndexKey::Int(2))
11800            .expect("promote ok")
11801            .expect("PK 2 was cold");
11802        assert_eq!(
11803            new_idx, 2,
11804            "promoted row appended after the 2 surviving hot rows"
11805        );
11806
11807        let t = cat.get("users").unwrap();
11808        assert_eq!(t.row_count(), 3, "hot tier grew from 2 to 3");
11809        // Hot-bytes climbed by exactly one row's encoded length.
11810        let row = make_user_row(2, "u-2");
11811        let row_len = encode_row_body_dense(&row, &t.schema).len() as u64;
11812        assert_eq!(t.hot_bytes(), hot_bytes_before + row_len);
11813
11814        // The index now reports a Hot locator (the freshly inserted
11815        // row) — no Cold locator left for PK 2.
11816        let entries = t.index_on(0).unwrap().lookup_eq(&IndexKey::Int(2));
11817        assert_eq!(entries.len(), 1, "exactly one locator per key");
11818        assert!(entries[0].is_hot(), "promote retired the Cold locator");
11819        // End-to-end: lookup_by_pk still returns the row body.
11820        assert_eq!(
11821            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
11822                .unwrap(),
11823            row
11824        );
11825        // Other cold rows untouched — still resolvable through the
11826        // segment.
11827        assert_eq!(
11828            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(0))
11829                .unwrap(),
11830            make_user_row(0, "u-0")
11831        );
11832    }
11833
11834    /// `promote_cold_row` on a key that's already hot (or absent)
11835    /// returns `Ok(None)` — not an error. The caller falls back to
11836    /// the hot-only update/delete path.
11837    #[test]
11838    fn promote_cold_row_returns_none_when_key_is_not_cold() {
11839        let mut cat = Catalog::new();
11840        cat.create_table(bigint_pk_users_schema()).unwrap();
11841        let t = cat.get_mut("users").unwrap();
11842        t.insert(make_user_row(7, "alice")).unwrap();
11843        t.add_index("by_id".into(), "id").unwrap();
11844
11845        // Hot-only key.
11846        assert!(
11847            cat.promote_cold_row("users", "by_id", &IndexKey::Int(7))
11848                .unwrap()
11849                .is_none()
11850        );
11851        // Absent key.
11852        assert!(
11853            cat.promote_cold_row("users", "by_id", &IndexKey::Int(99))
11854                .unwrap()
11855                .is_none()
11856        );
11857        // Catalog untouched on both no-op paths.
11858        assert_eq!(cat.get("users").unwrap().row_count(), 1);
11859        assert_eq!(cat.cold_segment_count(), 0);
11860    }
11861
11862    /// `shadow_cold_row` removes every Cold locator for a key on a
11863    /// `BTree` index. After the shadow, `lookup_by_pk` for that key
11864    /// returns None (the row data still sits in the segment file,
11865    /// but it's now garbage; compaction will reclaim it later).
11866    #[test]
11867    fn shadow_cold_row_removes_cold_locators_and_drops_lookup() {
11868        let mut cat = Catalog::new();
11869        cat.create_table(bigint_pk_users_schema()).unwrap();
11870        let t = cat.get_mut("users").unwrap();
11871        for id in 0..5i64 {
11872            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
11873                .unwrap();
11874        }
11875        t.add_index("by_id".into(), "id").unwrap();
11876        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
11877
11878        // Shadow PK=1 — pre-shadow lookup hits the cold tier.
11879        assert!(
11880            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
11881                .is_some(),
11882            "frozen PK resolves before shadow"
11883        );
11884        let removed = cat
11885            .shadow_cold_row("users", "by_id", &IndexKey::Int(1))
11886            .unwrap();
11887        assert_eq!(removed, 1, "exactly one cold locator retired");
11888
11889        // Post-shadow: lookup misses, even though the row still
11890        // exists in segment 0.
11891        assert!(
11892            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(1))
11893                .is_none(),
11894            "shadowed key no longer resolves"
11895        );
11896        // Other cold keys still resolve.
11897        assert_eq!(
11898            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(0))
11899                .unwrap(),
11900            make_user_row(0, "u-0")
11901        );
11902        assert_eq!(
11903            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(2))
11904                .unwrap(),
11905            make_user_row(2, "u-2")
11906        );
11907    }
11908
11909    /// `shadow_cold_row` returns 0 (not Err) for keys with only Hot
11910    /// entries or no entries — the engine's DELETE path uses this
11911    /// signal to decide whether the cold-tier shadow path consumed
11912    /// the work.
11913    #[test]
11914    fn shadow_cold_row_returns_zero_when_key_is_not_cold() {
11915        let mut cat = Catalog::new();
11916        cat.create_table(bigint_pk_users_schema()).unwrap();
11917        let t = cat.get_mut("users").unwrap();
11918        t.insert(make_user_row(1, "alice")).unwrap();
11919        t.add_index("by_id".into(), "id").unwrap();
11920        assert_eq!(
11921            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(1))
11922                .unwrap(),
11923            0,
11924            "hot-only key drops no cold locators"
11925        );
11926        assert_eq!(
11927            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(999))
11928                .unwrap(),
11929            0,
11930            "absent key drops no cold locators"
11931        );
11932        assert_eq!(cat.get("users").unwrap().row_count(), 1);
11933    }
11934
11935    /// Validation guards on both promote / shadow primitives.
11936    #[test]
11937    fn promote_and_shadow_reject_invalid_inputs() {
11938        let mut cat = Catalog::new();
11939        cat.create_table(bigint_pk_users_schema()).unwrap();
11940        let t = cat.get_mut("users").unwrap();
11941        t.insert(make_user_row(1, "alice")).unwrap();
11942        t.add_index("by_id".into(), "id").unwrap();
11943
11944        // Missing table.
11945        assert!(matches!(
11946            cat.promote_cold_row("missing", "by_id", &IndexKey::Int(1)),
11947            Err(StorageError::Corrupt(_))
11948        ));
11949        assert!(matches!(
11950            cat.shadow_cold_row("missing", "by_id", &IndexKey::Int(1)),
11951            Err(StorageError::Corrupt(_))
11952        ));
11953        // Missing index.
11954        assert!(matches!(
11955            cat.promote_cold_row("users", "no_such_index", &IndexKey::Int(1)),
11956            Err(StorageError::Corrupt(_))
11957        ));
11958        assert!(matches!(
11959            cat.shadow_cold_row("users", "no_such_index", &IndexKey::Int(1)),
11960            Err(StorageError::Corrupt(_))
11961        ));
11962    }
11963
11964    // --- v6.7.4 parallel-freezer slice/commit API -----------------
11965
11966    /// One slice covering the entire freeze produces the same
11967    /// catalog state as the single-threaded `freeze_oldest_to_cold`
11968    /// — segment id, frozen row count, hot byte delta, and every
11969    /// post-freeze PK lookup match exactly.
11970    #[test]
11971    fn commit_freeze_slices_single_slice_matches_freeze_oldest() {
11972        let mut a = Catalog::new();
11973        let mut b = Catalog::new();
11974        for cat in [&mut a, &mut b] {
11975            cat.create_table(bigint_pk_users_schema()).unwrap();
11976            let t = cat.get_mut("users").unwrap();
11977            for id in 0..10i64 {
11978                t.insert(make_user_row(id, &alloc::format!("u-{id}")))
11979                    .unwrap();
11980            }
11981            t.add_index("by_id".into(), "id").unwrap();
11982        }
11983        let single = a.freeze_oldest_to_cold("users", "by_id", 6).unwrap();
11984        let slice = b
11985            .prepare_freeze_slice("users", "by_id", 0..6)
11986            .expect("prepare");
11987        let parallel = b
11988            .commit_freeze_slices("users", "by_id", alloc::vec![slice])
11989            .expect("commit");
11990        assert_eq!(single.segment_id, parallel.segment_id);
11991        assert_eq!(single.frozen_rows, parallel.frozen_rows);
11992        assert_eq!(single.bytes_freed, parallel.bytes_freed);
11993        assert_eq!(single.segment_bytes, parallel.segment_bytes);
11994        // Same post-freeze lookup behaviour on both catalogs.
11995        for id in 0..10i64 {
11996            assert_eq!(
11997                a.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
11998                b.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
11999                "PK {id} differs after single vs slice freeze"
12000            );
12001        }
12002    }
12003
12004    /// Two slices covering disjoint halves of the freeze produce
12005    /// the same merged segment as one slice covering the full
12006    /// range. The k-way merge preserves PK ordering even when
12007    /// slice halves alternate.
12008    #[test]
12009    fn commit_freeze_slices_two_slices_match_single_slice() {
12010        let mut a = Catalog::new();
12011        let mut b = Catalog::new();
12012        for cat in [&mut a, &mut b] {
12013            cat.create_table(bigint_pk_users_schema()).unwrap();
12014            let t = cat.get_mut("users").unwrap();
12015            // Random-ish PKs so the per-slice sort actually has
12016            // work to do (and slice halves carry interleaved keys).
12017            for id in [3, 7, 1, 9, 5, 0, 8, 4, 2, 6].iter().copied() {
12018                t.insert(make_user_row(id as i64, &alloc::format!("u-{id}")))
12019                    .unwrap();
12020            }
12021            t.add_index("by_id".into(), "id").unwrap();
12022        }
12023        let single = a
12024            .prepare_freeze_slice("users", "by_id", 0..8)
12025            .expect("prepare");
12026        let one = a
12027            .commit_freeze_slices("users", "by_id", alloc::vec![single])
12028            .expect("commit one");
12029        let s1 = b
12030            .prepare_freeze_slice("users", "by_id", 0..4)
12031            .expect("prepare s1");
12032        let s2 = b
12033            .prepare_freeze_slice("users", "by_id", 4..8)
12034            .expect("prepare s2");
12035        let two = b
12036            .commit_freeze_slices("users", "by_id", alloc::vec![s1, s2])
12037            .expect("commit two");
12038        assert_eq!(one.segment_bytes, two.segment_bytes);
12039        assert_eq!(one.frozen_rows, two.frozen_rows);
12040        // Every PK that survived freeze (hot or cold) resolves on
12041        // both catalogs.
12042        for id in 0..10i64 {
12043            assert_eq!(
12044                a.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
12045                b.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
12046                "PK {id} differs after one-slice vs two-slice freeze"
12047            );
12048        }
12049    }
12050
12051    /// Gap between slices → error before any mutation lands.
12052    #[test]
12053    fn commit_freeze_slices_rejects_gap() {
12054        let mut cat = Catalog::new();
12055        cat.create_table(bigint_pk_users_schema()).unwrap();
12056        let t = cat.get_mut("users").unwrap();
12057        for id in 0..6i64 {
12058            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12059                .unwrap();
12060        }
12061        t.add_index("by_id".into(), "id").unwrap();
12062        let s1 = cat.prepare_freeze_slice("users", "by_id", 0..2).unwrap();
12063        let s2 = cat.prepare_freeze_slice("users", "by_id", 3..5).unwrap();
12064        assert!(matches!(
12065            cat.commit_freeze_slices("users", "by_id", alloc::vec![s1, s2]),
12066            Err(StorageError::Corrupt(_))
12067        ));
12068        // Catalog untouched.
12069        assert_eq!(cat.cold_segment_count(), 0);
12070        assert_eq!(cat.get("users").unwrap().row_count(), 6);
12071    }
12072
12073    /// Empty slice list → no-op success, catalog untouched.
12074    #[test]
12075    fn commit_freeze_slices_empty_is_noop() {
12076        let mut cat = Catalog::new();
12077        cat.create_table(bigint_pk_users_schema()).unwrap();
12078        let t = cat.get_mut("users").unwrap();
12079        for id in 0..3i64 {
12080            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12081                .unwrap();
12082        }
12083        t.add_index("by_id".into(), "id").unwrap();
12084        let report = cat
12085            .commit_freeze_slices("users", "by_id", Vec::new())
12086            .unwrap();
12087        assert_eq!(report.frozen_rows, 0);
12088        assert_eq!(cat.cold_segment_count(), 0);
12089        assert_eq!(cat.get("users").unwrap().row_count(), 3);
12090    }
12091
12092    // --- v6.7.3 cold-segment compaction ---------------------------
12093
12094    /// Two small cold segments merge into a single larger one. The
12095    /// merged segment carries every cold-resident row; the source
12096    /// slots are tombstoned; every PK still resolves through the
12097    /// new merged segment via `lookup_by_pk`.
12098    #[test]
12099    fn compact_merges_small_segments_storage_unit() {
12100        let mut cat = Catalog::new();
12101        cat.create_table(bigint_pk_users_schema()).unwrap();
12102        let t = cat.get_mut("users").unwrap();
12103        for id in 0..8i64 {
12104            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12105                .unwrap();
12106        }
12107        t.add_index("by_id".into(), "id").unwrap();
12108        // Two freezes of 3 rows each → two small cold segments.
12109        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
12110        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
12111        assert_eq!(cat.cold_segment_count(), 2);
12112        assert_eq!(cat.cold_segment_slot_count(), 2);
12113
12114        // Pick a threshold larger than either segment's size so
12115        // both qualify.
12116        let max_seg_bytes = cat
12117            .cold_segment_ids_global()
12118            .iter()
12119            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
12120            .max()
12121            .unwrap();
12122        let target = max_seg_bytes + 1;
12123
12124        let report = cat
12125            .compact_cold_segments("users", "by_id", target)
12126            .expect("compact succeeds");
12127        assert_eq!(report.sources.len(), 2);
12128        let merged_id = report.merged_segment_id.expect("merge happened");
12129        assert_eq!(report.merged_rows, 6);
12130        assert_eq!(report.deleted_rows_pruned, 0);
12131        assert!(!report.merged_segment_bytes.is_empty());
12132
12133        // Active count drops back to 1; slot count grew to 3
12134        // (2 sources tombstoned + 1 merged appended).
12135        assert_eq!(cat.cold_segment_count(), 1);
12136        assert_eq!(cat.cold_segment_slot_count(), 3);
12137        assert_eq!(cat.cold_segment_ids_global(), alloc::vec![merged_id]);
12138
12139        // Every PK that was frozen still resolves (via the merged
12140        // segment); the 2 hot rows still resolve too.
12141        for id in 0..8i64 {
12142            let got = cat
12143                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
12144                .unwrap_or_else(|| panic!("PK {id} lost after compaction"));
12145            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
12146        }
12147    }
12148
12149    /// DELETE'd-but-frozen rows are dropped during the merge. Set
12150    /// up two small segments, then shadow one row in each; the
12151    /// merged segment must NOT carry the shadowed rows.
12152    #[test]
12153    fn compact_drops_shadowed_cold_rows() {
12154        let mut cat = Catalog::new();
12155        cat.create_table(bigint_pk_users_schema()).unwrap();
12156        let t = cat.get_mut("users").unwrap();
12157        for id in 0..6i64 {
12158            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12159                .unwrap();
12160        }
12161        t.add_index("by_id".into(), "id").unwrap();
12162        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
12163        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
12164        // Shadow PK 1 (in seg 0) + PK 4 (in seg 1).
12165        assert_eq!(
12166            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(1))
12167                .unwrap(),
12168            1
12169        );
12170        assert_eq!(
12171            cat.shadow_cold_row("users", "by_id", &IndexKey::Int(4))
12172                .unwrap(),
12173            1
12174        );
12175
12176        let max_seg_bytes = cat
12177            .cold_segment_ids_global()
12178            .iter()
12179            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
12180            .max()
12181            .unwrap();
12182        let report = cat
12183            .compact_cold_segments("users", "by_id", max_seg_bytes + 1)
12184            .expect("compact succeeds");
12185        assert_eq!(report.sources.len(), 2);
12186        assert_eq!(report.merged_rows, 4, "6 frozen − 2 shadowed = 4 live");
12187        assert_eq!(report.deleted_rows_pruned, 2);
12188
12189        // PK 1 and 4 stay invisible after compact.
12190        for shadowed in [1i64, 4i64] {
12191            assert!(
12192                cat.lookup_by_pk("users", "by_id", &IndexKey::Int(shadowed))
12193                    .is_none(),
12194                "shadowed PK {shadowed} must remain invisible after compact"
12195            );
12196        }
12197        // The other 4 frozen rows resolve.
12198        for live in [0i64, 2, 3, 5] {
12199            cat.lookup_by_pk("users", "by_id", &IndexKey::Int(live))
12200                .unwrap_or_else(|| panic!("live PK {live} lost after compact"));
12201        }
12202    }
12203
12204    /// No-op cases: 0 or 1 candidate segment under the threshold
12205    /// leaves the catalog untouched.
12206    #[test]
12207    fn compact_is_noop_below_two_candidates() {
12208        let mut cat = Catalog::new();
12209        cat.create_table(bigint_pk_users_schema()).unwrap();
12210        let t = cat.get_mut("users").unwrap();
12211        for id in 0..6i64 {
12212            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12213                .unwrap();
12214        }
12215        t.add_index("by_id".into(), "id").unwrap();
12216        // 0 cold segments.
12217        let report = cat
12218            .compact_cold_segments("users", "by_id", 1 << 30)
12219            .expect("noop ok");
12220        assert!(report.merged_segment_id.is_none());
12221        assert!(report.sources.is_empty());
12222
12223        // 1 cold segment — still a no-op (need ≥2 to merge).
12224        cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
12225        let report = cat
12226            .compact_cold_segments("users", "by_id", 1 << 30)
12227            .expect("noop ok");
12228        assert!(report.merged_segment_id.is_none());
12229        assert_eq!(cat.cold_segment_count(), 1);
12230
12231        // Threshold too small to cover the single segment → still
12232        // no-op.
12233        let report = cat
12234            .compact_cold_segments("users", "by_id", 1)
12235            .expect("noop ok");
12236        assert!(report.merged_segment_id.is_none());
12237        assert_eq!(cat.cold_segment_count(), 1);
12238    }
12239
12240    /// Manifest-style atomicity: a Catalog snapshot taken AFTER
12241    /// `compact_cold_segments` returns must round-trip with the
12242    /// post-compact BTree state, while the cold-tier registry is
12243    /// re-derived from the source-of-truth manifest (=
12244    /// `load_segment_bytes_at` with the merged id + the still-on-
12245    /// disk merged bytes). This mirrors the boot path: catalog
12246    /// snapshot + cold-segment files = full state.
12247    #[test]
12248    fn compact_swap_survives_catalog_roundtrip_via_load_at() {
12249        let mut cat = Catalog::new();
12250        cat.create_table(bigint_pk_users_schema()).unwrap();
12251        let t = cat.get_mut("users").unwrap();
12252        for id in 0..6i64 {
12253            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12254                .unwrap();
12255        }
12256        t.add_index("by_id".into(), "id").unwrap();
12257        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
12258        cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
12259        let max_seg_bytes = cat
12260            .cold_segment_ids_global()
12261            .iter()
12262            .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
12263            .max()
12264            .unwrap();
12265        let report = cat
12266            .compact_cold_segments("users", "by_id", max_seg_bytes + 1)
12267            .expect("compact ok");
12268        let merged_id = report.merged_segment_id.unwrap();
12269
12270        // Serialise the catalog (BTree index points at merged_id
12271        // now) and the merged segment bytes; pretend to crash; on
12272        // restart, re-hydrate the catalog and reload only the
12273        // merged segment at its baked-in id.
12274        let cat_bytes = cat.serialize();
12275        let merged_bytes = report.merged_segment_bytes.clone();
12276
12277        let mut restored = Catalog::deserialize(&cat_bytes).expect("deserialize ok");
12278        restored
12279            .load_segment_bytes_at(merged_id, merged_bytes)
12280            .expect("reload merged ok");
12281
12282        // All 6 PKs still resolve through the restored merged segment.
12283        for id in 0..6i64 {
12284            let got = restored
12285                .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
12286                .unwrap_or_else(|| panic!("PK {id} lost across roundtrip"));
12287            assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
12288        }
12289        // No source slot ever rehydrates — confirmed by
12290        // `cold_segment_count` matching only the merged segment.
12291        assert_eq!(restored.cold_segment_count(), 1);
12292    }
12293
12294    /// `load_segment_bytes_at` refuses to stomp an occupied slot
12295    /// and pads with `None` when the target id is past the end.
12296    #[test]
12297    fn load_segment_bytes_at_pads_and_rejects_collision() {
12298        let mut cat = Catalog::new();
12299        cat.create_table(bigint_pk_users_schema()).unwrap();
12300        let t = cat.get_mut("users").unwrap();
12301        for id in 0..4i64 {
12302            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12303                .unwrap();
12304        }
12305        t.add_index("by_id".into(), "id").unwrap();
12306        let report = cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
12307        let bytes_seg0 = report.segment_bytes.clone();
12308
12309        // Pad to id=5 (slots 1..5 are None, slot 5 holds the
12310        // segment loaded back). The slot count jumps, the active
12311        // count is now 2 (seg 0 + seg 5).
12312        cat.load_segment_bytes_at(5, bytes_seg0.clone())
12313            .expect("pad + load ok");
12314        assert_eq!(cat.cold_segment_slot_count(), 6);
12315        assert_eq!(cat.cold_segment_count(), 2);
12316
12317        // Re-loading at the same id collides.
12318        assert!(matches!(
12319            cat.load_segment_bytes_at(5, bytes_seg0.clone()),
12320            Err(StorageError::Corrupt(_))
12321        ));
12322        // Re-loading at id 0 (already occupied) also collides.
12323        assert!(matches!(
12324            cat.load_segment_bytes_at(0, bytes_seg0),
12325            Err(StorageError::Corrupt(_))
12326        ));
12327    }
12328
12329    /// Round trip: freeze → promote → re-freeze. The same PK can
12330    /// migrate hot ↔ cold multiple times. After two cycles only the
12331    /// final Hot locator should be live.
12332    #[test]
12333    fn promote_then_refreeze_does_not_leave_orphan_locators() {
12334        let mut cat = Catalog::new();
12335        cat.create_table(bigint_pk_users_schema()).unwrap();
12336        let t = cat.get_mut("users").unwrap();
12337        for id in 0..4i64 {
12338            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12339                .unwrap();
12340        }
12341        t.add_index("by_id".into(), "id").unwrap();
12342
12343        // Cycle 1: freeze first 2 rows, then promote PK 0.
12344        cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
12345        let promoted = cat
12346            .promote_cold_row("users", "by_id", &IndexKey::Int(0))
12347            .unwrap();
12348        assert!(promoted.is_some());
12349        let entries_after_promote = cat
12350            .get("users")
12351            .unwrap()
12352            .index_on(0)
12353            .unwrap()
12354            .lookup_eq(&IndexKey::Int(0))
12355            .to_vec();
12356        assert_eq!(entries_after_promote.len(), 1);
12357        assert!(entries_after_promote[0].is_hot());
12358
12359        // Cycle 2: freeze the front rows again. PK 0 is now at
12360        // position 2 (after the survivors); it could still go cold
12361        // again on a future freeze depending on policy, but the
12362        // current "first N positions" policy leaves it alone here.
12363        // What matters: prior cold locators for PKs 0..1 are gone,
12364        // PKs 2..3 still resolve through their original segments.
12365        for id in [2i64, 3] {
12366            assert_eq!(
12367                cat.lookup_by_pk("users", "by_id", &IndexKey::Int(id))
12368                    .unwrap(),
12369                make_user_row(id, &alloc::format!("u-{id}"))
12370            );
12371        }
12372    }
12373}