spg_storage/lib.rs
1//! In-memory storage primitives.
2//!
3//! v0.3 is intentionally simple: a flat catalog of tables, each holding rows
4//! as `Vec<Value>` (positional, matching the table's `TableSchema`). No MVCC,
5//! no on-disk format — those land in later milestones.
6#![no_std]
7// v3.3.2 NEON path for l2_distance_sq (aarch64 only). Scoped allow:
8// `unsafe_code = "deny"` at workspace level stays in force for every
9// other crate.
10#![cfg_attr(target_arch = "aarch64", allow(unsafe_code))]
11
12extern crate alloc;
13
14pub mod bloom;
15pub mod fts_simple;
16pub mod halfvec;
17pub mod persistent;
18pub mod persistent_btree;
19pub mod quantize;
20pub mod row_locator;
21pub mod segment;
22pub mod trgm;
23
24pub use self::bloom::{BloomError, BloomFilter};
25pub use self::row_locator::{RowLocator, RowLocatorError};
26pub use self::segment::{
27 BRIN_SIDECAR_MAGIC, BrinSummary, OwnedSegment, SEGMENT_COMPRESS_ALGO_LZSS,
28 SEGMENT_COMPRESS_ALGO_NONE, SEGMENT_MAGIC, SEGMENT_MAGIC_V2, SEGMENT_PAGE_BYTES, SegmentError,
29 SegmentMeta, SegmentReader, derive_brin_summaries, encode_segment, wrap_v2_envelope,
30 wrap_v2_envelope_with_brin,
31};
32
33use alloc::boxed::Box;
34use alloc::collections::{BTreeMap, BTreeSet};
35use alloc::format;
36use alloc::string::{String, ToString};
37use alloc::sync::Arc;
38use alloc::vec::Vec;
39use core::fmt;
40
41use self::persistent::PersistentVec;
42use self::persistent_btree::PersistentBTreeMap;
43
/// Physical per-cell representation of a `DataType::Vector` column.
///
/// Mirrors `spg_sql::ast::VecEncoding`; the copy lives here so the
/// storage crate stays dep-free of `spg-sql`. The engine bridges
/// between the two at DDL-execution time.
///
/// * `F32` — the pre-v6 default: each cell holds a raw `Vec<f32>`.
/// * `Sq8` (v6.0.1) — each cell holds
///   `Sq8Vector { min, max, bytes: Vec<u8> }`; 4× compression vs `F32`
///   with recall@10 ≥ 0.95 on natural embeddings (Gaussian /
///   unit-sphere corpora).
/// * `F16` (v6.0.3, DDL keyword `HALF`) — each element is stored as
///   IEEE-754 binary16; 2× compression and bit-exact dequantise.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum VecEncoding {
    #[default]
    F32,
    Sq8,
    F16,
}

/// Renders the encoding label: `F32`, `SQ8`, or `HALF` (for `F16`).
impl fmt::Display for VecEncoding {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the label first so there is a single write site.
        let label = match *self {
            Self::F32 => "F32",
            Self::Sq8 => "SQ8",
            Self::F16 => "HALF",
        };
        f.write_str(label)
    }
}
72
/// Runtime type tags. `Vector { dim, encoding }` / `Varchar(max)` /
/// `Char(size)` are parameterised; the parameter travels with both
/// the column schema and the on-wire serialised representation.
/// `Numeric { precision, scale }` and `Range(kind)` carry parameters
/// as well.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataType {
    /// 16-bit signed. Backed by `Value::SmallInt(i16)`; arithmetic that
    /// would overflow surfaces as a type error at INSERT time.
    SmallInt,
    /// 32-bit signed. Backed by `Value::Int(i32)`.
    Int,
    /// 64-bit signed. Backed by `Value::BigInt(i64)`.
    BigInt,
    /// `f64` (PG double precision). Backed by `Value::Float(f64)`.
    Float,
    /// Unbounded text. Backed by `Value::Text(String)`.
    Text,
    /// `VARCHAR(n)` — same byte representation as `Text`, but INSERT
    /// rejects values longer than `n` Unicode characters.
    Varchar(u32),
    /// `CHAR(n)` — same representation as `Text`, but INSERT right-pads
    /// with U+0020 to exactly `n` Unicode characters (or rejects when
    /// the input is already longer).
    Char(u32),
    /// Boolean. Backed by `Value::Bool(bool)`.
    Bool,
    /// pgvector-style fixed-dimension vector. `encoding` selects
    /// the in-cell representation (`F32` = pre-v6 raw f32 buffer;
    /// `Sq8` = v6.0.1 8-bit scalar-quantised; `F16` = v6.0.3
    /// IEEE-754 binary16, DDL keyword `HALF`). The DDL grammar
    /// surfaces encoding via the optional `USING <encoding>`
    /// clause: `VECTOR(128) USING SQ8`.
    Vector {
        dim: u32,
        encoding: VecEncoding,
    },
    /// `NUMERIC(precision, scale)` — exact fixed-point decimal stored as
    /// a scaled `i128`. `precision` caps total decimal digits, `scale`
    /// fixes digits after the decimal point. v1.12 supports up to
    /// precision 38 (the i128-safe ceiling). `NUMERIC` and `NUMERIC(p)`
    /// surface as `Numeric { precision: p, scale: 0 }`.
    Numeric {
        precision: u8,
        scale: u8,
    },
    /// `DATE` — calendar date with day precision, stored as `i32` days
    /// since the Unix epoch (1970-01-01).
    Date,
    /// `TIMESTAMP` (a.k.a. `MySQL` `DATETIME`) — instant with microsecond
    /// precision, stored as `i64` microseconds since the Unix epoch.
    Timestamp,
    /// v7.9.2 `TIMESTAMPTZ` — bit-identical to `Timestamp` on disk
    /// (i64 microseconds, UTC by convention). Carried as a distinct
    /// type tag so the PG-wire layer can advertise OID 1184 (PG's
    /// `timestamp with time zone`) and `sqlx`/`pgx`/JDBC clients
    /// decode into their TZ-aware datetime types. The internal
    /// semantics are unchanged: SPG never stored per-row offsets,
    /// and neither did PG — `TIMESTAMPTZ` in PG is also UTC i64.
    Timestamptz,
    /// `INTERVAL` — calendar-aware span (months + microseconds). v2.11
    /// supports INTERVAL only as a runtime intermediate (literals,
    /// arithmetic results); on-disk encoding is rejected so this branch
    /// can't appear in a `ColumnSchema`.
    Interval,
    /// v4.9: `JSON` — text-backed JSON document. We don't parse
    /// the content (no path operators or jsonb functions yet) —
    /// the column accepts any TEXT-compatible value and round-trips
    /// it verbatim. PG OID 114 on the wire.
    Json,
    /// v7.9.0: `JSONB` — semantically identical to `Json` on
    /// the storage side (same `Value::Json` cells, same
    /// row codec), but advertised as PG OID 3802 on the wire
    /// so `sqlx`-style clients that bind `jsonb` columns
    /// decode correctly. mailrs migration blocker #3.
    Jsonb,
    /// v7.10.4: `BYTES` / `BYTEA` — variable-length raw binary.
    /// Backed by `Value::Bytes(Vec<u8>)`. PG wire OID 17. Literal
    /// forms accepted by parser/engine: PG hex form `'\xDEADBEEF'`
    /// (case-insensitive hex pairs) and escape form
    /// `'foo\\000bar'` (the latter decoded at coercion time when
    /// the target column is BYTEA — TEXT columns leave the
    /// backslash sequence verbatim).
    Bytes,
    /// v7.10.9: `TEXT[]` — single-dimension TEXT array. Elements
    /// may be NULL (PG semantics). PG wire OID 1009. Literal
    /// forms: `ARRAY['a', 'b', NULL]` and the PG external form
    /// `'{a,b,NULL}'::TEXT[]`. Engine implements `= ANY(arr)`,
    /// `<> ALL(arr)`, and 1-based indexing `arr[i]`. Catalog
    /// `FILE_VERSION` 18+; older snapshots reject this `DataType`
    /// (forward-only by design — `TEXT[]` columns aren't readable
    /// on a pre-v7.10 binary).
    TextArray,
    /// v7.11.12: `INT[]` — single-dimension i32 array. PG wire
    /// OID 1007 (`_int4`). Same `ARRAY[...]` / `'{1,2,3}'::INT[]`
    /// literal surface as `TEXT[]`. Catalog `FILE_VERSION` 19+.
    IntArray,
    /// v7.11.12: `BIGINT[]` — single-dimension i64 array. PG
    /// wire OID 1016 (`_int8`). Catalog `FILE_VERSION` 19+.
    BigIntArray,
    /// v7.12.0: PG `tsvector` — ordered, deduplicated set of
    /// `(lexeme, positions, weight)` tuples. PG wire OID 3614.
    /// Catalog `FILE_VERSION` 20+. Storage shape is row-codec
    /// tag 22; the schema-agnostic `write_value` path emits tag
    /// 18. Literal: `'foo:1 bar:2,3'::tsvector` (PG external
    /// form). G-CRIT-3 entry — v7.12.0 only ships the type +
    /// codec; matching `@@` lands in v7.12.2.
    TsVector,
    /// v7.12.0: PG `tsquery` — parse tree of lexemes joined by
    /// `&` `|` `!` and phrase operators. PG wire OID 3615.
    /// Catalog `FILE_VERSION` 20+.
    TsQuery,
    /// v7.17.0: PG `uuid` — 128-bit identifier stored as
    /// `Value::Uuid([u8; 16])`. PG wire OID 2950. Canonical
    /// text form is lowercase 8-4-4-4-12 hyphenated; input
    /// also accepts uppercase, unhyphenated, and brace-wrapped
    /// forms (`{xxxx…}`). Catalog `FILE_VERSION` 36+; tag 24 on
    /// the dense type-tag side, tag 20 on the schema-agnostic
    /// value side. The drop-in PG/MySQL surface for Django /
    /// Rails / Hibernate "id UUID PRIMARY KEY DEFAULT
    /// `gen_random_uuid()`" default-PK pattern.
    Uuid,
    /// v7.17.0 Phase 3.P0-32: PG `time` (without time zone) — i64
    /// microseconds since 00:00:00. PG wire OID 1083. Display:
    /// canonical zero-padded `HH:MM:SS` when fractional is zero,
    /// `HH:MM:SS.ffffff` otherwise. Catalog `FILE_VERSION` 37+;
    /// tag 25 on the dense type-tag side, tag 21 on the schema-
    /// agnostic value side. The wall-clock-of-day half of PG's
    /// date/time triplet (date / time / timestamp).
    Time,
    /// v7.17.0 Phase 3.P0-33: MySQL `YEAR` — u16 in range
    /// 1901..=2155 plus the special zero-year sentinel 0. No
    /// dedicated PG OID (advertised as INT4 / OID 23 on the wire
    /// — psql renders integers, MySQL CLI renders 4-digit
    /// zero-padded text). Display always 4 digits: `0000` for the
    /// zero-year, `1985` / `2007` / etc otherwise. Catalog
    /// `FILE_VERSION` 38+; tag 26 on the dense type-tag side, tag
    /// 22 on the schema-agnostic value side.
    Year,
    /// v7.17.0 Phase 3.P0-34: PG `time with time zone` (TIMETZ) —
    /// i64 microseconds since 00:00:00 in the local wall clock
    /// PLUS i32 offset-from-UTC in seconds. PG wire OID 1266.
    /// Display: `HH:MM:SS[.ffffff]±HH[:MM]` (PG `timetz_out`).
    /// Range: offset in ±50400 seconds (±14 hours). Catalog
    /// `FILE_VERSION` 39+; tag 27 on the dense type-tag side, tag
    /// 23 on the schema-agnostic value side.
    TimeTz,
    /// v7.17.0 Phase 3.P0-35: PG `money` — i64 cents (locale-
    /// independent storage). PG wire OID 790. Display: `en_US`
    /// locale (`$N,NNN.CC`, negative → `-$1.23`). Input accepts
    /// `$N.NN`, `$N,NNN.NN`, bare integer (treated as major
    /// units), optional leading `-`. Range: full i64. Catalog
    /// `FILE_VERSION` 40+; tag 28 on the dense type-tag side, tag
    /// 24 on the schema-agnostic value side.
    Money,
    /// v7.17.0 Phase 3.P0-38: PG range type. The same `DataType`
    /// variant covers all six builtin ranges (int4range,
    /// int8range, numrange, tsrange, tstzrange, daterange) —
    /// `RangeKind` pins the element type so encode / decode /
    /// display can route off one switch. Catalog `FILE_VERSION`
    /// 43+; tag 29 + a 1-byte `RangeKind` on the dense type-tag
    /// side, tag 25 on the schema-agnostic value side.
    Range(RangeKind),
    /// v7.17.0 Phase 3.P0-39: PG `hstore` extension type — flat
    /// `text => text` map with NULL value support. Catalog
    /// `FILE_VERSION` 44+; tag 30 on the dense type-tag side, tag
    /// 26 on the schema-agnostic value side. The contrib OID is
    /// installation-dependent in real PG; SPG advertises it via
    /// dynamic lookup, falling back to TEXT (OID 25) on the wire
    /// when the installed `hstore` extension hasn't claimed an
    /// OID yet.
    Hstore,
    /// v7.17.0 Phase 3.P0-40: PG `int[][]` — 2-dimensional INT
    /// matrix. Storage: row-major `Vec<Vec<Option<i32>>>`. All
    /// rows must share the same column count. Wire OID 1007
    /// (same as `INT[]`; the dimension count travels in the data
    /// header, not the OID). Catalog `FILE_VERSION` 45+; tag 31
    /// on the dense type-tag side, tag 27 on the schema-agnostic
    /// value side.
    IntArray2D,
    /// v7.17.0 Phase 3.P0-40: PG `bigint[][]` — 2-dimensional
    /// BIGINT matrix. Storage / OID / tags mirror `IntArray2D`.
    /// Tag 32 dense, tag 28 schema-agnostic.
    BigIntArray2D,
    /// v7.17.0 Phase 3.P0-40: PG `text[][]` — 2-dimensional TEXT
    /// matrix. Storage: row-major `Vec<Vec<Option<String>>>`.
    /// Tag 33 dense, tag 29 schema-agnostic.
    TextArray2D,
}
254
/// v7.17.0 Phase 3.P0-38 — identifies which element type a range
/// value or range column is built over. Wire OIDs: Int4=3904,
/// Int8=3926, Num=3906, Ts=3908, TsTz=3910, Date=3912.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum RangeKind {
    Int4,
    Int8,
    Num,
    Ts,
    TsTz,
    Date,
}

impl RangeKind {
    /// One-byte tag for this kind (`Int4` = 0 … `Date` = 5).
    /// Spelled out arm by arm rather than cast from the discriminant
    /// so reordering the enum cannot silently renumber the tags.
    pub const fn tag(self) -> u8 {
        match self {
            Self::Int4 => 0,
            Self::Int8 => 1,
            Self::Num => 2,
            Self::Ts => 3,
            Self::TsTz => 4,
            Self::Date => 5,
        }
    }

    /// Inverse of [`RangeKind::tag`]. Returns `None` for any byte
    /// outside `0..=5`.
    pub const fn from_tag(t: u8) -> Option<Self> {
        match t {
            0 => Some(Self::Int4),
            1 => Some(Self::Int8),
            2 => Some(Self::Num),
            3 => Some(Self::Ts),
            4 => Some(Self::TsTz),
            5 => Some(Self::Date),
            _ => None,
        }
    }

    /// Upper-case SQL type keyword for this kind (e.g. `INT4RANGE`).
    pub const fn keyword(self) -> &'static str {
        match self {
            Self::Int4 => "INT4RANGE",
            Self::Int8 => "INT8RANGE",
            Self::Num => "NUMRANGE",
            Self::Ts => "TSRANGE",
            Self::TsTz => "TSTZRANGE",
            Self::Date => "DATERANGE",
        }
    }
}
301
302impl fmt::Display for DataType {
303 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
304 match self {
305 Self::SmallInt => f.write_str("SMALLINT"),
306 Self::Int => f.write_str("INT"),
307 Self::BigInt => f.write_str("BIGINT"),
308 Self::Float => f.write_str("FLOAT"),
309 Self::Text => f.write_str("TEXT"),
310 Self::Varchar(n) => write!(f, "VARCHAR({n})"),
311 Self::Char(n) => write!(f, "CHAR({n})"),
312 Self::Bool => f.write_str("BOOL"),
313 Self::Vector { dim, encoding } => match encoding {
314 VecEncoding::F32 => write!(f, "VECTOR({dim})"),
315 VecEncoding::Sq8 => write!(f, "VECTOR({dim}) USING SQ8"),
316 VecEncoding::F16 => write!(f, "VECTOR({dim}) USING HALF"),
317 },
318 Self::Numeric { precision, scale } => {
319 if *scale == 0 {
320 write!(f, "NUMERIC({precision})")
321 } else {
322 write!(f, "NUMERIC({precision}, {scale})")
323 }
324 }
325 Self::Date => f.write_str("DATE"),
326 Self::Timestamp => f.write_str("TIMESTAMP"),
327 Self::Timestamptz => f.write_str("TIMESTAMPTZ"),
328 Self::Interval => f.write_str("INTERVAL"),
329 Self::Json => f.write_str("JSON"),
330 Self::Jsonb => f.write_str("JSONB"),
331 Self::Bytes => f.write_str("BYTEA"),
332 Self::TextArray => f.write_str("TEXT[]"),
333 Self::IntArray => f.write_str("INT[]"),
334 Self::BigIntArray => f.write_str("BIGINT[]"),
335 Self::TsVector => f.write_str("TSVECTOR"),
336 Self::TsQuery => f.write_str("TSQUERY"),
337 Self::Uuid => f.write_str("UUID"),
338 Self::Time => f.write_str("TIME"),
339 Self::Year => f.write_str("YEAR"),
340 Self::TimeTz => f.write_str("TIMETZ"),
341 Self::Money => f.write_str("MONEY"),
342 Self::Range(k) => f.write_str(k.keyword()),
343 Self::Hstore => f.write_str("HSTORE"),
344 Self::IntArray2D => f.write_str("INT[][]"),
345 Self::BigIntArray2D => f.write_str("BIGINT[][]"),
346 Self::TextArray2D => f.write_str("TEXT[][]"),
347 }
348 }
349}
350
/// v7.12.0 — one entry in a `Value::TsVector`. The lexeme is the
/// (already-tokenised + stemmed in v7.12.1+) word; `positions` is
/// a strictly-ascending list of 1-based positions; `weight` is the
/// PG weight letter (A=3, B=2, C=1, D=0) — v7.12.0 defaults every
/// lexeme to D, the v7.12.2 ranking path consumes the weight.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TsLexeme {
    /// The lexeme text.
    pub word: String,
    /// 1-based positions of the lexeme, strictly ascending.
    pub positions: Vec<u16>,
    /// PG weight class encoded as A=3, B=2, C=1, D=0.
    pub weight: u8,
}
362
/// v7.12.0 — parse tree for a PG `tsquery`. v7.12.0 ships the
/// type + codec only; the `to_tsquery` / `plainto_tsquery` lexer
/// lands in v7.12.1 and the `@@` evaluator in v7.12.2.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TsQueryAst {
    /// Single lexeme term. The `weight_mask` is the PG-style
    /// bitmask of accepted weights (`A=1<<3`, `B=1<<2`, `C=1<<1`,
    /// `D=1<<0`); `0` = any weight. v7.12.0 always sets it to 0.
    Term {
        word: String,
        weight_mask: u8,
    },
    /// Binary node joining two sub-queries — presumably the tsquery
    /// `&` operator; the evaluator is not part of this file.
    And(Box<TsQueryAst>, Box<TsQueryAst>),
    /// Binary node joining two sub-queries — presumably the tsquery
    /// `|` operator.
    Or(Box<TsQueryAst>, Box<TsQueryAst>),
    /// Unary node wrapping one sub-query — presumably the tsquery
    /// `!` operator.
    Not(Box<TsQueryAst>),
    /// `phrase <distance> phrase`. v7.12.0 only persists this; the
    /// match semantics arrive in v7.12.2 alongside `@@`.
    Phrase {
        left: Box<TsQueryAst>,
        right: Box<TsQueryAst>,
        distance: u16,
    },
}
386
/// A row-cell value, including SQL `NULL`. `Float` uses `f64`, and
/// `PartialEq` is derived, so a NaN cell compares non-equal to itself
/// (IEEE-754 semantics). Note this differs from PostgreSQL, which
/// treats NaN as equal to NaN for comparison and sorting — callers
/// must opt into NaN-aware comparison if they need that guarantee.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum Value {
    /// 16-bit signed integer cell.
    SmallInt(i16),
    /// 32-bit signed integer cell.
    Int(i32),
    /// 64-bit signed integer cell.
    BigInt(i64),
    /// Double-precision float cell.
    Float(f64),
    /// Text cell. Also used for bounded `VARCHAR` / `CHAR` columns —
    /// the length constraint lives on the column schema.
    Text(String),
    /// Boolean cell.
    Bool(bool),
    /// Raw `f32` vector cell (the `F32` vector encoding).
    Vector(Vec<f32>),
    /// v6.0.1: 8-bit scalar-quantised vector cell. Lives in
    /// columns declared `VECTOR(N) USING SQ8`. Layout per cell:
    /// `Sq8Vector { min: f32, max: f32, bytes: Vec<u8> }` —
    /// 4× compression vs `Vector(Vec<f32>)`. The wire layer
    /// dequantises to `f32` on SELECT; INSERT path quantises
    /// incoming `Vector(Vec<f32>)` cells into this variant.
    Sq8Vector(crate::quantize::Sq8Vector),
    /// v6.0.3: IEEE-754 binary16 vector cell. Lives in columns
    /// declared `VECTOR(N) USING HALF`. Stores raw u16 LE bits
    /// (2× compression vs `Vector(Vec<f32>)`). Wire / display
    /// paths dequantise to f32 bit-exactly; INSERT path converts
    /// incoming f32 vectors at the engine boundary.
    HalfVector(crate::halfvec::HalfVector),
    /// Exact fixed-point decimal. `scaled` holds the value as
    /// `actual * 10^scale` so the storage type is always integral —
    /// arithmetic never falls back to floating-point.
    Numeric {
        scaled: i128,
        scale: u8,
    },
    /// Days since the Unix epoch (1970-01-01). Negative for earlier dates.
    Date(i32),
    /// Microseconds since the Unix epoch (1970-01-01T00:00:00Z).
    Timestamp(i64),
    /// Calendar span: `months` (variable-length) + `micros` (fixed-length).
    /// Runtime-only — cannot appear in a stored row in v2.11.
    Interval {
        months: i32,
        micros: i64,
    },
    /// v4.9 `JSON` — raw JSON text. No structural validation
    /// happens at the storage layer; whatever the parser hands us
    /// round-trips verbatim. Equality is byte-wise.
    Json(String),
    /// v7.10.4 `BYTEA` — raw binary blob. Equality is byte-wise.
    /// Layout matches `Text`'s length-prefixed shape (`[u32 LE
    /// len][bytes]`) under tag 18; the engine accepts PG hex
    /// literals (`'\xDEADBEEF'`) and escape literals at the
    /// coercion boundary.
    Bytes(Vec<u8>),
    /// v7.10.9 `TEXT[]` — single-dimension TEXT array with
    /// optional NULL elements. Equality is element-wise. PG's
    /// NULL-element comparison semantics: NULL ≠ NULL inside
    /// arrays under `=`, so `[NULL] != [NULL]` (the engine
    /// honours this).
    ///
    /// NOTE(review): the derived `PartialEq` on this enum compares
    /// `None == None` as equal, so the NULL rule above is not what
    /// Rust-level `==` does; it has to be applied by the engine's
    /// own comparison. Confirm the stated PG rule against the engine.
    TextArray(Vec<Option<String>>),
    /// v7.11.12 `INT[]` — single-dimension i32 array with optional
    /// NULL elements. Codec mirrors `TextArray` with i32 LE per
    /// element instead of length-prefixed UTF-8.
    IntArray(Vec<Option<i32>>),
    /// v7.11.12 `BIGINT[]` — single-dimension i64 array with optional
    /// NULL elements.
    BigIntArray(Vec<Option<i64>>),
    /// v7.12.0 `tsvector` — sorted-by-word, deduped lexeme set with
    /// positions + weights. The engine enforces sort/dedup on
    /// construction; consumers can rely on `lexemes.windows(2)`
    /// being strictly ascending by `word`.
    TsVector(Vec<TsLexeme>),
    /// v7.12.0 `tsquery` — boolean / phrase parse tree over
    /// lexemes. Engine builds via `to_tsquery` family.
    TsQuery(TsQueryAst),
    /// v7.17.0 `uuid` — 128-bit identifier. Stored as 16 bytes
    /// (big-endian / network-byte order, same as RFC 4122).
    /// Display normalises to canonical lowercase 8-4-4-4-12
    /// hyphenated form. Equality is byte-wise.
    Uuid([u8; 16]),
    /// v7.17.0 Phase 3.P0-32 — PG `time` (without time zone) —
    /// i64 microseconds since 00:00:00. Range `0..86_400_000_000`.
    /// Display: `HH:MM:SS` zero-padded, with optional `.ffffff`
    /// suffix when fractional is non-zero.
    Time(i64),
    /// v7.17.0 Phase 3.P0-33 — MySQL `YEAR` — u16 in range
    /// 1901..=2155 plus the special zero-year sentinel 0.
    /// Display always 4 digits zero-padded (`0000` for the
    /// sentinel; `1985`/`2007` otherwise).
    Year(u16),
    /// v7.17.0 Phase 3.P0-34 — PG `time with time zone` — i64
    /// microseconds since 00:00:00 in the LOCAL wall clock PLUS
    /// an i32 offset-from-UTC in seconds. PG preserves the
    /// offset on output, so the wall-clock value is NOT shifted
    /// to UTC at storage time. Offset range: ±50400 seconds
    /// (±14 hours).
    TimeTz {
        us: i64,
        offset_secs: i32,
    },
    /// v7.17.0 Phase 3.P0-35 — PG `money` — i64 cents
    /// (locale-independent storage; the `en_US` locale renders on
    /// display via `$N,NNN.CC`).
    Money(i64),
    /// v7.17.0 Phase 3.P0-39 — PG `hstore` value: flat
    /// `text => text` map with NULL value support. Insertion
    /// order preserved on input; duplicate keys take last-write-
    /// wins at parse time.
    Hstore(Vec<(String, Option<String>)>),
    /// v7.17.0 Phase 3.P0-40 — 2D INT matrix (row-major).
    IntArray2D(Vec<Vec<Option<i32>>>),
    /// v7.17.0 Phase 3.P0-40 — 2D BIGINT matrix (row-major).
    BigIntArray2D(Vec<Vec<Option<i64>>>),
    /// v7.17.0 Phase 3.P0-40 — 2D TEXT matrix (row-major).
    TextArray2D(Vec<Vec<Option<String>>>),
    /// v7.17.0 Phase 3.P0-38 — PG range value. One shape covers
    /// all six builtin range types; `kind` pins the element type
    /// (must match the column's `DataType::Range(kind)`).
    /// `lower` / `upper` are `None` for the unbounded sides;
    /// `lower_inc` / `upper_inc` mirror the canonical PG
    /// `[` / `(` / `]` / `)` bracket inclusivity. `empty=true`
    /// supersedes all other fields (the empty range has no
    /// bounds).
    Range {
        kind: RangeKind,
        lower: Option<alloc::boxed::Box<Value>>,
        upper: Option<alloc::boxed::Box<Value>>,
        lower_inc: bool,
        upper_inc: bool,
        empty: bool,
    },
    /// SQL `NULL`. Carries no type information of its own.
    Null,
}
519
520impl Value {
521 /// Type tag, or `None` for `NULL` (unknown at value level).
522 pub fn data_type(&self) -> Option<DataType> {
523 match self {
524 Self::SmallInt(_) => Some(DataType::SmallInt),
525 Self::Int(_) => Some(DataType::Int),
526 Self::BigInt(_) => Some(DataType::BigInt),
527 Self::Float(_) => Some(DataType::Float),
528 // `Text` covers both unbounded TEXT and bounded VARCHAR/CHAR
529 // — the constraint lives on the column schema, not the value.
530 Self::Text(_) => Some(DataType::Text),
531 Self::Bool(_) => Some(DataType::Bool),
532 Self::Vector(v) => Some(DataType::Vector {
533 dim: u32::try_from(v.len()).expect("vector dim ≤ u32"),
534 encoding: VecEncoding::F32,
535 }),
536 Self::Sq8Vector(q) => Some(DataType::Vector {
537 dim: u32::try_from(q.bytes.len()).expect("vector dim ≤ u32"),
538 encoding: VecEncoding::Sq8,
539 }),
540 Self::HalfVector(h) => Some(DataType::Vector {
541 dim: u32::try_from(h.dim()).expect("vector dim ≤ u32"),
542 encoding: VecEncoding::F16,
543 }),
544 // `Value::Numeric` doesn't carry its precision (the column
545 // schema does); we surface precision=0 as "unknown" and let
546 // the engine reconcile against the column type at coercion
547 // time.
548 Self::Numeric { scale, .. } => Some(DataType::Numeric {
549 precision: 0,
550 scale: *scale,
551 }),
552 Self::Date(_) => Some(DataType::Date),
553 Self::Timestamp(_) => Some(DataType::Timestamp),
554 Self::Interval { .. } => Some(DataType::Interval),
555 Self::Json(_) => Some(DataType::Json),
556 Self::Bytes(_) => Some(DataType::Bytes),
557 Self::TextArray(_) => Some(DataType::TextArray),
558 Self::IntArray(_) => Some(DataType::IntArray),
559 Self::BigIntArray(_) => Some(DataType::BigIntArray),
560 Self::TsVector(_) => Some(DataType::TsVector),
561 Self::TsQuery(_) => Some(DataType::TsQuery),
562 Self::Uuid(_) => Some(DataType::Uuid),
563 Self::Time(_) => Some(DataType::Time),
564 Self::Year(_) => Some(DataType::Year),
565 Self::TimeTz { .. } => Some(DataType::TimeTz),
566 Self::Money(_) => Some(DataType::Money),
567 Self::Range { kind, .. } => Some(DataType::Range(*kind)),
568 Self::Hstore(_) => Some(DataType::Hstore),
569 Self::IntArray2D(_) => Some(DataType::IntArray2D),
570 Self::BigIntArray2D(_) => Some(DataType::BigIntArray2D),
571 Self::TextArray2D(_) => Some(DataType::TextArray2D),
572 Self::Null => None,
573 }
574 }
575
576 pub const fn is_null(&self) -> bool {
577 matches!(self, Self::Null)
578 }
579}
580
581/// One table row — values are positional and must match
582/// `TableSchema.columns` in length and (modulo NULL) in `DataType`.
583#[derive(Debug, Clone, PartialEq)]
584pub struct Row {
585 pub values: Vec<Value>,
586}
587
588impl Row {
589 pub const fn new(values: Vec<Value>) -> Self {
590 Self { values }
591 }
592
593 pub fn len(&self) -> usize {
594 self.values.len()
595 }
596
597 pub fn is_empty(&self) -> bool {
598 self.values.is_empty()
599 }
600}
601
/// Schema of a single table column: its name and type plus the
/// optional attributes (defaults, enum / domain bindings, collation,
/// MySQL modifiers) described on each field.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnSchema {
    /// Column name.
    pub name: String,
    /// Declared storage type of the column.
    pub ty: DataType,
    /// Whether the column admits NULL; `false` corresponds to a
    /// `NOT NULL` column.
    pub nullable: bool,
    /// Optional `DEFAULT` value, frozen at CREATE TABLE time. `None`
    /// means "no default" (so omitted columns become NULL, or error
    /// out when the column is NOT NULL). Literal defaults take this
    /// path.
    pub default: Option<Value>,
    /// v7.9.21 — for DEFAULT expressions that need INSERT-time
    /// evaluation (e.g. `DEFAULT now()`, `DEFAULT CURRENT_TIMESTAMP`),
    /// the Display form of the expression. The engine re-parses
    /// it on each INSERT default-fill, evaluates against an empty
    /// row context, and coerces to the column type. mailrs G4.
    /// Persisted in catalog `FILE_VERSION` 15+; older catalogs
    /// deserialise with None.
    pub runtime_default: Option<String>,
    /// MySQL-style `AUTO_INCREMENT`. When set, an INSERT that leaves
    /// this column unbound (or sets it to NULL) gets the next integer
    /// computed from the column's current max + 1.
    pub auto_increment: bool,
    /// v7.17.0 Phase 1.4 — when the column is bound to a user-
    /// defined ENUM type (the parser saw an unknown type ident
    /// and the engine resolved it against `catalog.enum_types`),
    /// this carries the enum name so INSERT/UPDATE can validate
    /// the cell value against the enum's labels. `ty` is
    /// `DataType::Text` in that case. Persisted in catalog
    /// `FILE_VERSION` 29+; older catalogs deserialise with None.
    pub user_enum_type: Option<String>,
    /// v7.17.0 Phase 1.5 — when the column is bound to a user-
    /// defined DOMAIN (the parser saw an unknown type ident and
    /// the engine resolved it against `catalog.domain_types`),
    /// this carries the domain name. `ty` is the domain's base
    /// type; INSERT/UPDATE re-evaluates the domain's CHECK list
    /// and NOT NULL against the cell value. Persisted in catalog
    /// `FILE_VERSION` 30+; older catalogs deserialise with None.
    pub user_domain_type: Option<String>,
    /// v7.17.0 Phase 2.1 — MySQL `ON UPDATE CURRENT_TIMESTAMP`
    /// column attribute. When `Some(expr_src)`, an UPDATE that
    /// does NOT bind this column overrides the new value with
    /// the engine-evaluated expression (always `now()` in
    /// v7.17.0). Stored as Display-form source so storage
    /// stays free of spg-sql; the engine re-parses at UPDATE
    /// time. Persisted in catalog `FILE_VERSION` 32+; older
    /// catalogs deserialise with None — preserves the existing
    /// "silent ignore" behaviour for snapshots written before
    /// the upgrade.
    pub on_update_runtime: Option<String>,
    /// v7.17.0 Phase 2.5 — text collation. Pre-2.5 SPG accepted
    /// `COLLATE <name>` clauses but discarded the name, so a
    /// column declared `COLLATE "case_insensitive"` (or any
    /// MySQL `_ci` collation) still compared byte-wise — a
    /// Tier-S silent failure where `WHERE name = 'foo'` never
    /// matched stored `'Foo'`. This carries the parser-derived
    /// classification so the engine's WHERE evaluator can route
    /// text equality through a case-aware compare. `Binary` (the
    /// default) preserves the prior byte-wise behaviour. Only
    /// `CaseInsensitive` lands in the catalog appendix — `Binary`
    /// columns stay implicit, keeping snapshots compact.
    /// Persisted in catalog `FILE_VERSION` 34+; older catalogs
    /// deserialise every column as `Binary`.
    pub collation: Collation,
    /// v7.17.0 Phase 4.4 — MySQL `UNSIGNED` modifier flag. Drives
    /// engine-side INSERT / UPDATE range enforcement (rejects
    /// negative values on UNSIGNED int columns). Pre-4.4 the
    /// parser consumed and discarded the keyword silently, so
    /// every UNSIGNED column quietly accepted negatives — a
    /// Tier-A correctness drift. Sparse: only UNSIGNED columns
    /// land in the catalog appendix; the default `false` keeps
    /// snapshots compact for the common signed-int path.
    /// Persisted in catalog `FILE_VERSION` 35+; older catalogs
    /// deserialise every column as `is_unsigned = false`.
    pub is_unsigned: bool,
    /// v7.17.0 Phase 3.P0-36 — MySQL inline `ENUM('a','b','c')`
    /// value list. Distinct from `user_enum_type` (which points
    /// to a separately CREATE TYPE'd PG enum); this carries the
    /// column-local list MySQL DDL declares inline. When `Some`,
    /// `ty` is `DataType::Text` and INSERT/UPDATE validates the
    /// cell value against this list. Variant ORDER is preserved
    /// (MySQL uses it for `ORDER BY col`). Sparse: only ENUM
    /// columns land in the catalog appendix.
    /// Persisted in catalog `FILE_VERSION` 41+; older catalogs
    /// deserialise with None — preserves silent-drop behaviour
    /// for snapshots written before P0-36.
    pub inline_enum_variants: Option<Vec<String>>,
    /// v7.17.0 Phase 3.P0-37 — MySQL inline `SET('a','b','c')`
    /// variant list. Storage is TEXT (canonical comma-joined in
    /// definition order, de-duplicated). INSERT/UPDATE validates
    /// every comma-separated token against this list. Sparse:
    /// only SET columns land in the catalog appendix.
    /// Persisted in catalog `FILE_VERSION` 42+; older catalogs
    /// deserialise with None.
    pub inline_set_variants: Option<Vec<String>>,
}
697
/// v7.17.0 Phase 2.5 — column-level text collation. Drives the
/// engine's WHERE / GROUP BY equality routing for `Value::Text`.
/// Only two variants are modelled in v7.17; see the per-variant docs.
///
/// New variants append at the end — older catalogs read missing
/// columns as `Binary`.
///
/// `Default` is derived with `Binary` marked `#[default]`, matching
/// how `VecEncoding` declares its default in this crate.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Collation {
    /// Byte-wise comparison (the SPG default; matches PG
    /// `COLLATE "C"` / `pg_catalog.default` and MySQL `*_bin`).
    #[default]
    Binary,
    /// ASCII case-folded comparison (matches PG
    /// `COLLATE "case_insensitive"` and MySQL `*_ci` collations).
    /// Non-ASCII bytes still compare byte-wise; full ICU folding is
    /// out of v7.17 scope.
    CaseInsensitive,
}
723
impl Collation {
    /// Wire tag for `Binary`, persisted in the `FILE_VERSION` 34+
    /// catalog appendix. Stable: future variants append above the
    /// recognised range and unknown tags read back as `Binary` for
    /// forward-compat on rollback.
    pub const TAG_BINARY: u8 = 0;
    /// Wire tag for `CaseInsensitive` in the same catalog appendix.
    pub const TAG_CASE_INSENSITIVE: u8 = 1;
}
732
/// Schema of one table: its name, its ordered column list, and the
/// table-level settings and constraints described on each field.
#[derive(Debug, Clone, PartialEq)]
pub struct TableSchema {
    /// Table name.
    pub name: String,
    /// Column definitions in positional order; a `Row`'s values line
    /// up with these by index.
    pub columns: Vec<ColumnSchema>,
    /// v6.7.2 — per-table hot-tier byte budget override. `None`
    /// falls through to the global `SPG_HOT_TIER_BYTES` setting;
    /// `Some(n)` overrides it for this specific table. Set via
    /// `ALTER TABLE t SET hot_tier_bytes = X`. Persisted in
    /// catalog `FILE_VERSION` 11+.
    pub hot_tier_bytes: Option<u64>,
    /// v7.6.1 — FOREIGN KEY constraints declared on this table.
    /// Engine maintains this in lock-step with `spg-sql`'s parser
    /// AST; the storage layer carries the on-disk shape so a
    /// catalog snapshot round-trips without external mapping.
    /// Persisted in catalog `FILE_VERSION` 13+. Older catalogs
    /// deserialise with an empty vec.
    pub foreign_keys: Vec<ForeignKeyConstraint>,
    /// v7.9.19 — composite UNIQUE / PRIMARY KEY constraints
    /// declared at the table level. Each entry's leading column
    /// has a BTree index (created via the constraint), and INSERT
    /// path enforces the full-tuple uniqueness via a scan keyed
    /// by the leading column. Persisted in catalog `FILE_VERSION`
    /// 15+. Older catalogs (≤ 14) deserialise with an empty vec.
    pub uniqueness_constraints: Vec<UniquenessConstraint>,
    /// v7.13.0 — `CHECK (<expr>)` predicates declared on this
    /// table. Both column-level inline `CHECK (…)` and
    /// table-level `CHECK (…)` fold into this list. Each entry
    /// is the AST Expr's `Display` form, re-parsed on every
    /// INSERT/UPDATE and evaluated against the candidate row.
    /// A false / NULL result rejects the mutation (PG semantics).
    /// Persisted in catalog `FILE_VERSION` 23+. Older catalogs
    /// deserialise with an empty vec.
    pub checks: Vec<String>,
}
767
/// A composite UNIQUE / PRIMARY KEY constraint as persisted on the
/// table schema (v7.9.19).
///
/// The leading column always carries a BTree index, created at
/// CREATE TABLE time. INSERT enforcement scans that index looking
/// for collisions on the complete column tuple.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UniquenessConstraint {
    /// Whether the constraint was declared as `PRIMARY KEY`
    /// (`true`) or as `UNIQUE` (`false`). A primary key
    /// semantically implies NOT NULL on every referenced column;
    /// the engine enforces that at CREATE TABLE time.
    pub is_primary_key: bool,
    /// Positions of the constrained columns on the parent table;
    /// always at least one element. A single-column UNIQUE has
    /// exactly one position and is enforced by the BTree index
    /// alone.
    pub columns: Vec<usize>,
    /// The `UNIQUE NULLS NOT DISTINCT` modifier (v7.13.0; mailrs
    /// round-5 G10; PG 15+ surface). With `true`, two rows whose
    /// constrained columns are all NULL collide on the constraint.
    /// The default, `false`, is the SQL-standard `NULLS DISTINCT`
    /// behaviour in which any NULL passes. Persisted in catalog
    /// FILE_VERSION 23+.
    pub nulls_not_distinct: bool,
}
791
792/// v7.6.1 — Storage-layer mirror of `spg_sql::ast::ForeignKeyConstraint`.
793/// The engine's CREATE TABLE path translates between the two; keeping
794/// them separate preserves the no-deps boundary between
795/// `spg-storage` and `spg-sql`.
796#[derive(Debug, Clone, PartialEq, Eq)]
797pub struct ForeignKeyConstraint {
798 /// Optional user-supplied constraint name (`CONSTRAINT <name>`
799 /// prefix). Used by `ALTER TABLE DROP CONSTRAINT <name>` in
800 /// v7.6.8; ignored by enforcement.
801 pub name: Option<String>,
802 /// Positions of local columns in this table's column list.
803 /// Same arity as `parent_columns`.
804 pub local_columns: Vec<usize>,
805 /// Referenced parent table name.
806 pub parent_table: String,
807 /// Positions of parent columns in the parent's column list.
808 /// Engine resolves these at CREATE TABLE time (after the parent
809 /// schema is known) so enforcement paths can skip the name
810 /// lookup on every row.
811 pub parent_columns: Vec<usize>,
812 /// Referential action when a parent row is deleted.
813 pub on_delete: FkAction,
814 /// Referential action when a parent row's referenced columns
815 /// are updated.
816 pub on_update: FkAction,
817}
818
/// Referential action tag (v7.6.1); the storage-side mirror of
/// `spg_sql::ast::FkAction`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FkAction {
    Restrict,
    Cascade,
    SetNull,
    SetDefault,
    NoAction,
}

impl FkAction {
    /// Encodes the action as its on-disk tag byte (v13 catalog
    /// appendix). The mapping is written out explicitly rather than
    /// derived from declaration order.
    pub const fn tag(self) -> u8 {
        match self {
            Self::Restrict => 0,
            Self::Cascade => 1,
            Self::SetNull => 2,
            Self::SetDefault => 3,
            Self::NoAction => 4,
        }
    }

    /// Decodes an on-disk tag byte; the inverse of [`FkAction::tag`].
    /// Returns `None` for any byte outside `0..=4`.
    pub const fn from_tag(b: u8) -> Option<Self> {
        match b {
            0 => Some(Self::Restrict),
            1 => Some(Self::Cascade),
            2 => Some(Self::SetNull),
            3 => Some(Self::SetDefault),
            4 => Some(Self::NoAction),
            _ => None,
        }
    }
}
851
852impl TableSchema {
853 pub fn column_position(&self, name: &str) -> Option<usize> {
854 self.columns.iter().position(|c| c.name == name)
855 }
856}
857
/// Key type that secondary indices accept.
///
/// Float, NULL and Vector values cannot take part in a B-tree
/// index: `f64` is only `PartialOrd`, NULL carries SQL three-valued
/// semantics, and Vector belongs to the (future) HNSW path. Index
/// lookups on such columns fall back to a full scan.
///
/// The ordering is derived, so keys compare by variant declaration
/// order first and by payload second.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum IndexKey {
    Int(i64),
    Text(String),
    Bool(bool),
    /// Index key for `Value::Uuid` (v7.17.0). Compared byte-wise
    /// (RFC 4122 byte order), which puts PRIMARY KEY UUID lookups
    /// on the same fast path as Int / Text.
    Uuid([u8; 16]),
}
872
873impl IndexKey {
874 pub fn from_value(v: &Value) -> Option<Self> {
875 match v {
876 Value::SmallInt(n) => Some(Self::Int(i64::from(*n))),
877 Value::Int(n) => Some(Self::Int(i64::from(*n))),
878 Value::BigInt(n) => Some(Self::Int(*n)),
879 Value::Text(s) => Some(Self::Text(s.clone())),
880 Value::Bool(b) => Some(Self::Bool(*b)),
881 // Date/Timestamp use their integer storage repr as the
882 // index key — same order semantics, same comparison.
883 Value::Date(d) => Some(Self::Int(i64::from(*d))),
884 Value::Timestamp(t) => Some(Self::Int(*t)),
885 // v7.17.0: UUID indexable via byte-wise ordering. Lookup
886 // on `id = '...'::uuid` resolves through the secondary
887 // index rather than full-scan.
888 Value::Uuid(b) => Some(Self::Uuid(*b)),
889 // v7.17.0 Phase 3.P0-32: TIME indexable via i64 — same
890 // order semantics as Date/Timestamp.
891 Value::Time(us) => Some(Self::Int(*us)),
892 // v7.17.0 Phase 3.P0-33: YEAR indexable as i64 — u16
893 // widens losslessly and gives the natural calendar
894 // ordering.
895 Value::Year(y) => Some(Self::Int(i64::from(*y))),
896 // v7.17.0 Phase 3.P0-34: TIMETZ indexable by its
897 // UTC-equivalent microseconds (local wall - offset).
898 // Without normalising, two values for the same
899 // physical instant in different zones would sort
900 // wrong. Matches PG's TIMETZ index behaviour.
901 Value::TimeTz { us, offset_secs } => {
902 Some(Self::Int(us - i64::from(*offset_secs) * 1_000_000))
903 }
904 // v7.17.0 Phase 3.P0-35: MONEY indexable as i64 cents
905 // (no scaling needed — natural numeric ordering).
906 Value::Money(c) => Some(Self::Int(*c)),
907 // v7.17.0 Phase 3.P0-38: ranges are NOT indexable in
908 // v7.17.0 — they'd need a custom comparator (PG uses
909 // SP-GiST for this). Skip.
910 Value::Range { .. } => None,
911 // v7.17.0 Phase 3.P0-39: hstore is NOT indexable in
912 // v7.17.0 — map columns need GIN with bespoke ops.
913 Value::Hstore(_) => None,
914 // v7.17.0 Phase 3.P0-40: 2D arrays aren't indexable.
915 Value::IntArray2D(_) | Value::BigIntArray2D(_) | Value::TextArray2D(_) => None,
916 // Numeric isn't (yet) indexable — exact-decimal index keys
917 // would need a stable scale-normalised representation.
918 // Interval isn't index-eligible either (and can't reach this
919 // path through column storage anyway).
920 Value::Null
921 | Value::Float(_)
922 | Value::Vector(_)
923 | Value::Sq8Vector(_)
924 | Value::HalfVector(_)
925 | Value::Numeric { .. }
926 | Value::Interval { .. }
927 | Value::Json(_)
928 | Value::Bytes(_)
929 | Value::TextArray(_)
930 | Value::IntArray(_)
931 | Value::BigIntArray(_)
932 | Value::TsVector(_)
933 | Value::TsQuery(_) => None,
934 }
935 }
936}
937
938/// A single-column secondary index. v2.0 carries either a B-tree map
939/// (the default — used for equality / range lookups on scalar columns)
940/// or a navigable-small-world graph (used for kNN over vector
941/// columns).
942#[derive(Debug, Clone)]
943pub struct Index {
944 pub name: String,
945 pub column_position: usize,
946 pub kind: IndexKind,
947 /// v6.8.0 — column positions of `INCLUDE (col1, col2, …)`
948 /// non-key columns. Carries the planner's "this query is
949 /// covered by the index" signal; lookup paths still resolve
950 /// via the `RowLocator` to fetch the row body, but EXPLAIN
951 /// surfaces the covered-scan annotation so operators can
952 /// confirm the planner sees the coverage.
953 ///
954 /// Empty `Vec` = no `INCLUDE` clause (the legacy shape). v12
955 /// catalog snapshots deserialise with an empty vec.
956 pub included_columns: Vec<usize>,
957 /// v6.8.1 — partial-index predicate stored as its canonical
958 /// Display form (the engine re-parses it on the maintenance
959 /// path). `None` = unconditional index (the legacy shape).
960 /// Persisted as `[u8 has_pred][u16 LE len][bytes]` on the
961 /// catalog snapshot (FILE_VERSION 12, appended after
962 /// `included_columns`).
963 pub partial_predicate: Option<String>,
964 /// v6.8.2 — expression-index key, stored as the expression's
965 /// canonical Display form. `None` = bare column-reference
966 /// index (the legacy shape). Persisted alongside
967 /// `partial_predicate` on the v12 catalog snapshot.
968 pub expression: Option<String>,
969 /// v7.9.29 — `CREATE UNIQUE INDEX …`. When true the engine
970 /// rejects INSERTs whose key already appears in this index
971 /// (combined with `partial_predicate` when present — only
972 /// rows matching the predicate enter the uniqueness check).
973 /// Catalog FILE_VERSION 16+; older snapshots deserialise
974 /// with `false`. mailrs K1.
975 pub is_unique: bool,
976 /// v7.9.29 — extra (non-leading) column positions for
977 /// multi-column indexes (`CREATE INDEX … (a, b, c)`). The
978 /// planner today still only uses the leading
979 /// `column_position` for index seeks, but UNIQUE INDEX
980 /// enforcement walks the full tuple so partial-unique
981 /// invariants like CalDAV `(calendar_id, uid,
982 /// recurrence_id)` are enforced correctly. Catalog
983 /// FILE_VERSION 16+; older snapshots deserialise empty.
984 pub extra_column_positions: Vec<usize>,
985}
986
/// Default neighbour degree (M) of the NSW graph. The value is
/// chosen when an index is constructed and is persisted with it.
pub const NSW_DEFAULT_M: usize = 16;
990
/// Result of a successful [`Catalog::freeze_oldest_to_cold`] call
/// (v5.2.2).
///
/// By the time this value is handed back the catalog has already
/// been mutated: the hot rows are dropped, the segment is
/// registered and the Cold locators are flipped. What is left for
/// the caller is `segment_bytes` — write them to disk under
/// `<db>.spg/segments/seg_<id>.spg` so a later restart can reload
/// them through the v5.1 `SPG_PRELOAD_COLD_SEGMENT` path. (The
/// v5.3 manifest will subsume this manual step.)
#[derive(Debug, Clone)]
pub struct FreezeReport {
    /// Id that [`Catalog::load_segment_bytes`] allocated for the
    /// new cold-tier segment. Stable across the call's success
    /// path.
    pub segment_id: u32,
    /// How many rows moved hot → cold. Equal to the `max_rows` the
    /// caller requested (the API is strict on the count).
    pub frozen_rows: usize,
    /// Hot-tier bytes the freeze reclaimed, i.e. the
    /// [`Table::hot_bytes`] delta before vs after. Useful as input
    /// to the freezer's budget check on its next tick.
    pub bytes_freed: u64,
    /// The encoded segment, byte-identical to the output of
    /// [`encode_segment`]. The catalog already keeps its own copy
    /// in `cold_segments`; handing the bytes over lets the caller
    /// persist them without encoding a second time.
    pub segment_bytes: Vec<u8>,
}
1016
1017/// v6.7.4 — read-only output of [`Catalog::prepare_freeze_slice`].
1018/// Carries every row body + key in a contiguous hot-row range,
1019/// already encoded and sorted by PK so the coordinator's merge
1020/// step is a k-way merge over already-sorted streams.
1021///
1022/// `Vec<FreezeSlice>` from N independent workers feeds
1023/// [`Catalog::commit_freeze_slices`], which concats + encodes the
1024/// merged segment + atomically swaps the catalog state.
1025#[derive(Debug, Clone)]
1026pub struct FreezeSlice {
1027 /// Hot-row index range this slice covered (half-open, in the
1028 /// table's `rows: PersistentVec` ordering at call time). The
1029 /// commit step uses this to compute the union range that
1030 /// gets passed to [`Table::delete_rows`].
1031 pub row_range: core::ops::Range<usize>,
1032 /// `(pk_u64, encoded_row_body, IndexKey)` triples, sorted
1033 /// ascending by `pk_u64`. Per-slice sort happens inside
1034 /// `prepare_freeze_slice`; the coordinator does only a
1035 /// k-way merge to reach the global PK ordering
1036 /// [`encode_segment`] requires.
1037 pub rows: Vec<(u64, Vec<u8>, IndexKey)>,
1038}
1039
/// Result of a [`Catalog::compact_cold_segments`] call (v6.7.3).
///
/// When this value is returned the catalog has already been
/// mutated: the merged segment is loaded into `cold_segments`, the
/// source segment slots are tombstoned (`None`), and every
/// BTree-index `RowLocator::Cold` that used to point at a source
/// now points at the merged segment. The caller still has to
/// persist `merged_segment_bytes` under
/// `<db>.spg/segments/seg_<merged_segment_id>.spg` and to update
/// the in-memory `segment_id → path` map (drop the source ids, add
/// the merged id), so that the next CHECKPOINT writes a manifest
/// which no longer lists the retired sources.
///
/// A no-op (fewer than 2 candidate segments under the threshold)
/// is reported with `merged_segment_id == None` and an empty
/// `sources`; in that case the catalog was not mutated.
#[derive(Debug, Clone)]
pub struct CompactReport {
    /// Ids of the source segments that were merged + tombstoned.
    pub sources: Vec<u32>,
    /// Id allocated for the merged segment; `None` on a no-op.
    pub merged_segment_id: Option<u32>,
    /// Encoded bytes of the merged segment; empty on a no-op.
    pub merged_segment_bytes: Vec<u8>,
    /// Count of rows that ended up in the merged segment.
    pub merged_rows: usize,
    /// `Σ source.num_rows − merged_rows`: rows that exist in the
    /// source segment payloads but that no live BTree `Cold`
    /// locator references — DELETE'd-but-still-frozen rows which
    /// the compaction GC'd during the merge.
    pub deleted_rows_pruned: usize,
    /// `Σ source.bytes() − merged.bytes()`: an estimate of the
    /// on-disk space the merge reclaims once the source segment
    /// files are GC'd. Computed with a saturating subtract, so it
    /// is never negative.
    pub bytes_reclaimed_estimate: u64,
}
1075
1076#[derive(Debug, Clone)]
1077pub enum IndexKind {
1078 /// v4.40: structural-sharing B-tree over `IndexKey`. Replaces the v0.8
1079 /// `BTreeMap<IndexKey, Vec<usize>>` — `Index::clone` is now an `Arc`
1080 /// bump regardless of index size, so `Catalog::clone` inside the
1081 /// v4.34 auto-commit wrap stays O(1) even for tables with secondary
1082 /// indices (the case that bottlenecked v4.39 at 1M rows in the
1083 /// sweep).
1084 ///
1085 /// v5.1: value type widened from `Vec<usize>` to `Vec<RowLocator>` so
1086 /// a single key can point to a mix of hot-tier rows (`RowLocator::Hot`,
1087 /// equivalent to the pre-v5 `usize` row index) and cold-tier rows
1088 /// (`RowLocator::Cold { segment_id, page_offset }`) once the v5.2
1089 /// freezer starts producing them. Pre-v5.2 only `Hot` entries appear
1090 /// — the on-disk encoding stays at `FILE_VERSION` 8 (raw u64 row index)
1091 /// because every locator round-trips through `RowLocator::from_legacy_v8_u64`
1092 /// without information loss. `FILE_VERSION` 9 with tagged encoding lands
1093 /// alongside the first freezer commit (v5.1 step 2b / v5.2).
1094 BTree(PersistentBTreeMap<IndexKey, Vec<RowLocator>>),
1095 /// Navigable-small-world graph for vector kNN search.
1096 Nsw(NswGraph),
1097 /// v6.7.1 — BRIN (Block Range INdex). Pure metadata: BRIN
1098 /// indexes carry NO in-memory key→locator map. The (min,
1099 /// max) summaries live in each cold-tier segment's v2
1100 /// envelope sidecar; the BRIN entry in `Table.indices` only
1101 /// records THAT a BRIN index exists on this column so the
1102 /// segment encoder + planner can opt into the summary path.
1103 Brin {
1104 /// The cell type at `column_position` at CREATE INDEX time.
1105 /// Used by the planner to type-check WHERE-clause range
1106 /// predicates against the BRIN-indexed column.
1107 column_type: DataType,
1108 },
1109 /// v7.12.3 — GIN inverted index over a `tsvector` column.
1110 ///
1111 /// Storage shape: `lexeme word → Vec<RowLocator>`. The posting
1112 /// list per word is appended in row-order, so range scans are
1113 /// O(matching rows) once the per-word lookup is done. Multi-
1114 /// term queries intersect / union posting lists.
1115 ///
1116 /// `IndexKey::from_value(TsVector)` returns `None` — GIN doesn't
1117 /// participate in `try_index_seek` (which is BTree-equality-keyed).
1118 /// The engine consults this index through `try_gin_lookup` on
1119 /// `WHERE col @@ tsquery` predicates instead.
1120 ///
1121 /// Backed by a `PersistentBTreeMap` so `Catalog::clone` (the
1122 /// per-write snapshot) stays O(1) — same structural-sharing
1123 /// invariant as BTree.
1124 Gin(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1125 /// v7.15.0 — `USING gin (col gin_trgm_ops)` over a `TEXT`
1126 /// column. Posting lists map `trigram` (PG-compatible 3-byte
1127 /// shingle on the lower-cased + space-padded input) to row
1128 /// locators. The planner uses this index to accelerate
1129 /// `WHERE col LIKE '…'` / `ILIKE '…'` / `similarity(col, q) >
1130 /// t` — every literal run of length ≥ 1 in the pattern
1131 /// produces a trigram set, the engine intersects the posting
1132 /// lists, and the LIKE / similarity predicate is re-evaluated
1133 /// per candidate row to filter the over-approximation.
1134 /// Persisted via tag-4 index payload in `FILE_VERSION` 24+.
1135 GinTrgm(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1136 /// v7.17.0 Phase 2.2 — MySQL `FULLTEXT KEY (col)` over a
1137 /// `TEXT` / `VARCHAR` column. Posting lists map
1138 /// `tsvector('simple') lexeme` to row locators. At insert /
1139 /// build time the engine derives the lexemes from the cell
1140 /// via the same lower-case tokenisation rule as
1141 /// `to_tsvector('simple', ...)` — the column itself stays a
1142 /// plain text type on disk (mysqldump round-trips would be
1143 /// broken otherwise). The planner uses this index to
1144 /// accelerate MySQL-shape `MATCH(col) AGAINST('term')`
1145 /// queries by mapping them onto the existing tsquery `@@`
1146 /// walker. Persisted via tag-5 index payload in
1147 /// `FILE_VERSION` 33+.
1148 GinFulltext(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1149}
1150
1151/// Multi-layer HNSW graph (v2.13). Each node is assigned a `top_level`;
1152/// it appears in layers `0..=top_level`. Higher layers are sparser, so
1153/// search starts from the entry at the top layer, greedy-descends to
1154/// layer 0, and beam-searches there. Layer 0 keeps a larger neighbour
1155/// budget (`m_max_0 = 2 * m` per the HNSW paper); upper layers cap at
1156/// `m`. The struct name stays `NswGraph` so external users / on-disk
1157/// callers don't have to track a rename — the algorithm changed, the
1158/// data slot didn't.
1159#[derive(Debug, Clone)]
1160pub struct NswGraph {
1161 /// Max neighbours per node on layers ≥ 1.
1162 pub m: usize,
1163 /// Max neighbours on layer 0 (the dense bottom layer). HNSW
1164 /// convention: `m_max_0 = 2 * m`.
1165 pub m_max_0: usize,
1166 /// Entry point — the node that sits on the topmost layer. Search
1167 /// always starts here.
1168 pub entry: Option<usize>,
1169 /// Top layer of the entry node (== `layers.len() - 1` when populated).
1170 pub entry_level: u8,
1171 /// `levels[i]` = top layer of node `i`. Nodes whose vector cell is
1172 /// NULL / non-Vector have `levels[i] = 0` and no neighbour entries.
1173 ///
1174 /// v5.5.0: backed by `PersistentVec` so `NswGraph::clone` (and the
1175 /// `Catalog::clone` on every group-commit write that contains it) is O(1)
1176 /// structural-sharing instead of an O(N) element copy.
1177 pub levels: PersistentVec<u8>,
1178 /// `layers[l][i]` = neighbours of node `i` at layer `l`. Inner vec
1179 /// is empty when node `i` doesn't reach layer `l`.
1180 ///
1181 /// v5.5.0: the per-node middle dimension (the O(N) one) is a
1182 /// `PersistentVec`; the outer layer dimension stays a plain `Vec`
1183 /// (layer count ≤ 8, so its clone is O(1) in practice) and the inner
1184 /// neighbour list stays a `Vec` (bounded by `m_max_0`).
1185 ///
1186 /// v6.1.x: neighbour slot widened from `usize` (8 B on 64-bit) to
1187 /// `u32` (4 B). Row indices are catalog-bounded by `u32::MAX` (4G
1188 /// rows per table); the cast at the NSW boundary asserts this. At
1189 /// 1M dim-128 SQ8, layer 0 adjacency alone shrinks by ~128 MiB
1190 /// — the largest single contribution to the v6.0.5-measured
1191 /// 624 MiB ambition gap. On-disk format already used u32 LE, so
1192 /// this is a pure in-memory layout change; no `FILE_VERSION` bump.
1193 pub layers: Vec<PersistentVec<Vec<u32>>>,
1194}
1195
1196impl NswGraph {
1197 fn new(m: usize) -> Self {
1198 Self {
1199 m,
1200 m_max_0: m.saturating_mul(2),
1201 entry: None,
1202 entry_level: 0,
1203 levels: PersistentVec::new(),
1204 layers: alloc::vec![PersistentVec::new()],
1205 }
1206 }
1207
1208 /// Max-neighbour budget for layer `l`.
1209 pub const fn cap_for_layer(&self, layer: u8) -> usize {
1210 if layer == 0 { self.m_max_0 } else { self.m }
1211 }
1212}
1213
/// Assigns a node its top layer deterministically from the row
/// index, so replaying the same insert order rebuilds the same
/// topology.
///
/// The shape is roughly HNSW-flavoured with `mL ≈ 1/ln(M) ≈ 0.36`
/// for M=16: the row index is hashed, and every low-end 4-bit chunk
/// of the hash that is zero promotes the node by one layer, giving
/// P(level ≥ L) ≈ (1/16)^L. The result is capped at 7.
///
/// Note that `row_idx == 0` hashes to 0 (the mixer starts with a
/// multiplication), so row 0 always receives the cap.
#[allow(clippy::verbose_bit_mask)] // nibble tests are intentional: the explicit cap and the distribution shape must stay stable.
pub fn nsw_assign_level(row_idx: usize) -> u8 {
    // 7 ⇒ ~16^7 ≈ 2.7e8 expected nodes between promotions; ample.
    const MAX_LEVEL: u8 = 7;

    // SplitMix-style mixer — cheap and seedable.
    let mut hash = (row_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
    hash ^= hash >> 30;
    hash = hash.wrapping_mul(0xBF58_476D_1CE4_E5B9);
    hash ^= hash >> 27;
    hash = hash.wrapping_mul(0x94D0_49BB_1331_11EB);
    hash ^= hash >> 31;

    // The level is the position of the lowest non-zero nibble, i.e.
    // the number of contiguous zero nibbles at the low end. Only
    // nibbles 0..MAX_LEVEL are inspected; if they are all zero
    // (including the `hash == 0` case) the cap applies.
    (0..MAX_LEVEL)
        .find(|&nibble| (hash >> (4 * u32::from(nibble))) & 0xF != 0)
        .unwrap_or(MAX_LEVEL)
}
1240
1241impl Index {
1242 fn new_btree(name: String, column_position: usize) -> Self {
1243 Self {
1244 name,
1245 column_position,
1246 kind: IndexKind::BTree(PersistentBTreeMap::new()),
1247 included_columns: Vec::new(),
1248 partial_predicate: None,
1249 expression: None,
1250 is_unique: false,
1251 extra_column_positions: Vec::new(),
1252 }
1253 }
1254
1255 fn new_nsw(name: String, column_position: usize, m: usize) -> Self {
1256 Self {
1257 name,
1258 column_position,
1259 kind: IndexKind::Nsw(NswGraph::new(m)),
1260 included_columns: Vec::new(),
1261 partial_predicate: None,
1262 expression: None,
1263 is_unique: false,
1264 extra_column_positions: Vec::new(),
1265 }
1266 }
1267
1268 /// v6.7.1 — BRIN index constructor. BRIN carries no in-memory
1269 /// data; the `column_type` snapshot is used by the segment
1270 /// encoder + planner for type-checking range predicates.
1271 fn new_brin(name: String, column_position: usize, column_type: DataType) -> Self {
1272 Self {
1273 name,
1274 column_position,
1275 kind: IndexKind::Brin { column_type },
1276 included_columns: Vec::new(),
1277 partial_predicate: None,
1278 expression: None,
1279 is_unique: false,
1280 extra_column_positions: Vec::new(),
1281 }
1282 }
1283
1284 /// v7.12.3 — GIN inverted-index constructor. Empty posting-list
1285 /// map; caller (typically [`Table::add_gin_index`] or
1286 /// [`Table::restore_gin_index`]) populates it from existing rows
1287 /// or from a deserialised snapshot.
1288 fn new_gin(name: String, column_position: usize) -> Self {
1289 Self {
1290 name,
1291 column_position,
1292 kind: IndexKind::Gin(PersistentBTreeMap::new()),
1293 included_columns: Vec::new(),
1294 partial_predicate: None,
1295 expression: None,
1296 is_unique: false,
1297 extra_column_positions: Vec::new(),
1298 }
1299 }
1300
1301 /// v7.15.0 — `gin_trgm_ops`-flavoured GIN constructor. Same
1302 /// shape as `new_gin` but the posting-list keys are 3-byte
1303 /// trigram shingles (`pg_trgm`-compatible) and the column
1304 /// type is `TEXT` / `VARCHAR` (not `TSVECTOR`).
1305 fn new_gin_trgm(name: String, column_position: usize) -> Self {
1306 Self {
1307 name,
1308 column_position,
1309 kind: IndexKind::GinTrgm(PersistentBTreeMap::new()),
1310 included_columns: Vec::new(),
1311 partial_predicate: None,
1312 expression: None,
1313 is_unique: false,
1314 extra_column_positions: Vec::new(),
1315 }
1316 }
1317
1318 /// v7.17.0 Phase 2.2 — MySQL `FULLTEXT KEY` GIN constructor.
1319 /// Same shape as `new_gin_trgm` but the posting-list keys
1320 /// are lower-cased word lexemes (`to_tsvector('simple', col)`
1321 /// equivalent) instead of trigrams, and the column type is
1322 /// `TEXT` / `VARCHAR` (not `TSVECTOR`).
1323 fn new_gin_fulltext(name: String, column_position: usize) -> Self {
1324 Self {
1325 name,
1326 column_position,
1327 kind: IndexKind::GinFulltext(PersistentBTreeMap::new()),
1328 included_columns: Vec::new(),
1329 partial_predicate: None,
1330 expression: None,
1331 is_unique: false,
1332 extra_column_positions: Vec::new(),
1333 }
1334 }
1335
1336 /// Look up the locators stored under `key` (B-tree only). Returns
1337 /// an empty slice when the key is absent or the index isn't a
1338 /// BTree — callers can treat both cases uniformly.
1339 ///
1340 /// v5.1: return type widened from `&[usize]` to `&[RowLocator]`.
1341 /// Pre-v5.2 callers can read the slice and `.as_hot().unwrap()`
1342 /// each entry (no `Cold` variants exist until the freezer lands);
1343 /// post-v5.2 callers dispatch hot vs. cold per locator.
1344 pub fn lookup_eq(&self, key: &IndexKey) -> &[RowLocator] {
1345 match &self.kind {
1346 IndexKind::BTree(m) => m.get(key).map_or(&[][..], Vec::as_slice),
1347 // BRIN / NSW / GIN / trigram-GIN / fulltext-GIN have
1348 // no IndexKey-keyed map; lookup is a no-op. GIN uses
1349 // [`Index::gin_lookup_word`] instead.
1350 IndexKind::Nsw(_)
1351 | IndexKind::Brin { .. }
1352 | IndexKind::Gin(_)
1353 | IndexKind::GinTrgm(_)
1354 | IndexKind::GinFulltext(_) => &[][..],
1355 }
1356 }
1357
1358 /// v7.12.3 — GIN posting-list lookup. Returns the row locators
1359 /// whose `tsvector` cell contains `word`. Empty when the word is
1360 /// absent from the index or this isn't a GIN index.
1361 pub fn gin_lookup_word(&self, word: &str) -> &[RowLocator] {
1362 match &self.kind {
1363 // v7.17.0 Phase 2.2 — fulltext-GIN shares the same
1364 // lexeme-keyed posting list shape as the
1365 // tsvector-typed GIN, so the same lookup applies.
1366 IndexKind::Gin(m) | IndexKind::GinFulltext(m) => {
1367 m.get(&String::from(word)).map_or(&[][..], Vec::as_slice)
1368 }
1369 IndexKind::BTree(_)
1370 | IndexKind::Nsw(_)
1371 | IndexKind::Brin { .. }
1372 | IndexKind::GinTrgm(_) => &[][..],
1373 }
1374 }
1375
1376 /// v7.15.0 — trigram-GIN posting-list lookup. Returns the row
1377 /// locators whose indexed `TEXT` cell contains the trigram
1378 /// `tri`. Empty when the trigram is absent or this isn't a
1379 /// trigram-GIN index.
1380 pub fn gin_trgm_lookup(&self, tri: &str) -> &[RowLocator] {
1381 match &self.kind {
1382 IndexKind::GinTrgm(m) => m.get(&String::from(tri)).map_or(&[][..], Vec::as_slice),
1383 IndexKind::BTree(_)
1384 | IndexKind::Nsw(_)
1385 | IndexKind::Brin { .. }
1386 | IndexKind::Gin(_)
1387 | IndexKind::GinFulltext(_) => &[][..],
1388 }
1389 }
1390
1391 /// Borrow the NSW graph (if this is an NSW index). Callers that need
1392 /// the graph for a kNN search go through here.
1393 pub const fn nsw(&self) -> Option<&NswGraph> {
1394 match &self.kind {
1395 IndexKind::Nsw(g) => Some(g),
1396 IndexKind::BTree(_)
1397 | IndexKind::Brin { .. }
1398 | IndexKind::Gin(_)
1399 | IndexKind::GinTrgm(_)
1400 | IndexKind::GinFulltext(_) => None,
1401 }
1402 }
1403
1404 /// v6.7.1 — true when this index is a BRIN (block range) index.
1405 /// Used by the segment encoder to opt into BRIN sidecar emission
1406 /// at freeze time, and by the planner to opt into page-skipping
1407 /// on range predicates.
1408 pub const fn is_brin(&self) -> bool {
1409 matches!(self.kind, IndexKind::Brin { .. })
1410 }
1411
1412 /// v7.15.0 — true when this index is a trigram GIN
1413 /// (`gin_trgm_ops`-flavoured). Used by the LIKE planner to
1414 /// opt into trigram acceleration.
1415 pub const fn is_gin_trgm(&self) -> bool {
1416 matches!(self.kind, IndexKind::GinTrgm(_))
1417 }
1418
1419 /// v7.12.3 — true when this index is a GIN inverted index.
1420 /// Used by the planner to opt into posting-list acceleration on
1421 /// `WHERE col @@ tsquery` predicates.
1422 pub const fn is_gin(&self) -> bool {
1423 matches!(self.kind, IndexKind::Gin(_))
1424 }
1425
1426 /// v7.17.0 Phase 2.2 — true when this index is a fulltext
1427 /// GIN over a TEXT / VARCHAR column (MySQL `FULLTEXT KEY`
1428 /// surface). Used by the planner to opt the FULLTEXT-indexed
1429 /// column into MATCH AGAINST acceleration.
1430 pub const fn is_gin_fulltext(&self) -> bool {
1431 matches!(self.kind, IndexKind::GinFulltext(_))
1432 }
1433}
1434
1435/// In-memory table: schema + a persistent row vector + secondary indices.
1436///
1437/// v4.39: `rows` is a [`PersistentVec`] (Bitmapped Vector Trie, 32-way) so
1438/// `Table::clone()` is `O(1)` — the whole reason for v4.39's existence is
1439/// to make `Catalog::clone()` cheap inside the v4.34 auto-commit wrap.
1440///
1441/// v5.2.1: `hot_bytes` tracks the encoded byte size of every row currently
1442/// in [`Self::rows`], summed over rows. Updated incrementally by `insert`
1443/// (+= encoded row size), `delete_rows` (-= removed rows' encoded sizes),
1444/// and `update_row` (-= old size, += new size). The value is what the
1445/// v5.2 freezer reads to decide when to demote cold rows — when the
1446/// catalog-wide sum crosses `SPG_HOT_TIER_BYTES` (default 4 GiB) the
1447/// freezer thread wakes. v5.2.1 ships measurement only; the freezer
1448/// itself lands in v5.2.2. Stored as `u64` so a single field clone in
1449/// `Catalog::clone` stays at the O(1) invariant v4.39 built.
#[derive(Debug, Clone)]
pub struct Table {
    /// Column layout of this table. `insert` checks every incoming row's
    /// arity, nullability and value types against it.
    schema: TableSchema,
    /// Hot-tier rows in insertion order. `RowLocator::Hot(i)` entries
    /// stored in the indices are positions into this vector.
    rows: PersistentVec<Row>,
    /// Every index defined on the table, in creation order. Index names
    /// are unique within this list (the `add_*` / `restore_*` methods
    /// reject a duplicate name).
    indices: Vec<Index>,
    /// Running sum of the encoded size of the rows in `rows`; `insert`
    /// adds each new row's encoded length with saturating arithmetic.
    hot_bytes: u64,
    /// v6.7.0 — cached count of rows currently materialised in the
    /// cold tier via `RowLocator::Cold` entries across THIS table's
    /// indices. Populated by `ANALYZE` (walks every BTree index and
    /// counts Cold locators); the count survives until the next
    /// ANALYZE recomputes it. Surfaced via `spg_statistic.cold_row_count`
    /// and `spg_stat_segment.table_name`.
    ///
    /// Honest scope: this is a CACHED count, not a live one.
    /// Freezer / promote / DELETE don't currently update the cache
    /// incrementally — they invalidate it by setting the
    /// `cold_row_count_stale` flag, and the next ANALYZE re-walks.
    /// Incremental maintenance is a v6.7.x candidate if observation
    /// shows the ANALYZE walk cost dominates.
    cold_row_count: u64,
    /// v6.7.0 — set when the cached `cold_row_count` may be wrong
    /// because rows moved into / out of the cold tier since the last
    /// ANALYZE. The virtual-table surface reports the cached value
    /// regardless (operators run ANALYZE to refresh).
    cold_row_count_stale: bool,
}
1476
1477impl Table {
    /// Create an empty table for `schema`: no rows, no indices, a zero
    /// hot-tier byte counter, and a cold-row cache of `0` that is not
    /// marked stale.
    pub fn new(schema: TableSchema) -> Self {
        Self {
            schema,
            rows: PersistentVec::new(),
            indices: Vec::new(),
            hot_bytes: 0,
            cold_row_count: 0,
            cold_row_count_stale: false,
        }
    }
1488
    /// Total encoded byte size of every row currently in the hot tier
    /// (`self.rows`), as tracked by the incrementally maintained
    /// `hot_bytes` counter. See struct docs for the maintenance contract.
    /// Returns 0 for a freshly created table.
    #[must_use]
    pub const fn hot_bytes(&self) -> u64 {
        self.hot_bytes
    }
1496
    /// v6.7.0 — cached count of cold-tier rows, exactly as last stored
    /// by [`Self::set_cold_row_count`] (or `0` if it was never set).
    /// The value is returned even when the stale flag is raised; see
    /// the struct field docs for the staleness contract.
    #[must_use]
    pub const fn cold_row_count(&self) -> u64 {
        self.cold_row_count
    }
1503
    /// v6.7.0 — overwrite the cached count with `n` and clear the stale
    /// flag, since the cache is authoritative again. Called by the
    /// engine's `analyze_one_table` after walking the indices.
    pub fn set_cold_row_count(&mut self, n: u64) {
        self.cold_row_count = n;
        self.cold_row_count_stale = false;
    }
1510
    /// v6.7.0 — mark the cached count as potentially out of date.
    /// Called by freezer / promote / DELETE paths so a subsequent
    /// `spg_statistic` read knows the number may not reflect the
    /// current state. The cached number itself is left untouched;
    /// only the flag is raised.
    pub fn mark_cold_row_count_stale(&mut self) {
        self.cold_row_count_stale = true;
    }
1518
    /// v6.7.0 — report whether the cached count is known to be out
    /// of date, i.e. [`Self::mark_cold_row_count_stale`] has run since
    /// the last [`Self::set_cold_row_count`]. Exposed for completeness;
    /// the virtual table surface returns the cached value regardless.
    #[must_use]
    pub const fn cold_row_count_stale(&self) -> bool {
        self.cold_row_count_stale
    }
1526
1527 /// v6.7.0 — walk every BTree index and count `RowLocator::Cold`
1528 /// entries; return the MAX across indices. The freeze path
1529 /// (`freeze_oldest_to_cold`) writes cold locators to ONE
1530 /// designated index — that index ends up with the full per-row
1531 /// count. MAX-across-indices yields the precise count when a
1532 /// PK-style index exists; for multi-index tables without a
1533 /// covering index it's a lower bound (rare in practice).
1534 /// Caller responsibility: only invoke under `engine.write()`
1535 /// or after taking ownership; the walk is O(N) over every
1536 /// (key, locator) pair.
1537 #[must_use]
1538 pub fn count_cold_locators(&self) -> u64 {
1539 let mut best: u64 = 0;
1540 for idx in &self.indices {
1541 if let IndexKind::BTree(map) = &idx.kind {
1542 let n: u64 = map
1543 .iter()
1544 .map(|(_, locs)| locs.iter().filter(|l| l.is_cold()).count() as u64)
1545 .sum();
1546 if n > best {
1547 best = n;
1548 }
1549 }
1550 }
1551 best
1552 }
1553
    /// Borrow the table's schema (read-only).
    pub const fn schema(&self) -> &TableSchema {
        &self.schema
    }
1557
    /// v6.7.2 — mutable schema accessor for ALTER TABLE paths.
    /// Used by `Engine::exec_alter_table` to flip per-table
    /// settings like `hot_tier_bytes`. This accessor performs no
    /// validation and does not touch rows or indices; keeping them
    /// consistent with any schema edit is the caller's job.
    pub const fn schema_mut(&mut self) -> &mut TableSchema {
        &mut self.schema
    }
1564
    /// v4.39: returns the persistent row vector (the hot-tier rows) by
    /// reference. Callers that used to take `&[Row]` should switch to
    /// `.iter()` (via `IntoIterator for &PersistentVec`) or `.get(i)`
    /// for indexing.
    pub const fn rows(&self) -> &PersistentVec<Row> {
        &self.rows
    }
1571
    /// Number of rows currently held in `self.rows` (the hot tier).
    /// Rows reachable only through cold locators are not included.
    pub const fn row_count(&self) -> usize {
        self.rows.len()
    }
1575
    /// v6.8.0 — exposed for the engine layer to patch
    /// `Index::included_columns` post-creation. Could fold into
    /// `add_index` once the engine's IF-NOT-EXISTS guard moves up,
    /// but the patch shape is the minimal change for v6.8.0.
    ///
    /// Returns a mutable slice, so callers can edit existing index
    /// entries in place but cannot add or remove indices through it.
    pub fn indices_mut(&mut self) -> &mut [Index] {
        &mut self.indices
    }
1583
    /// Borrow every index defined on this table, in the order they are
    /// stored in `self.indices`.
    pub fn indices(&self) -> &[Index] {
        &self.indices
    }
1587
1588 /// Compute the next `AUTO_INCREMENT` value for the column at
1589 /// `col_pos`. Defined as `max(existing) + 1`, falling back to `1`
1590 /// when the column currently holds no integer values. NULL / non-
1591 /// integer cells are skipped. Returns `None` when the column isn't
1592 /// an integer type.
1593 pub fn next_auto_value(&self, col_pos: usize) -> Option<i64> {
1594 let ty = self.schema.columns.get(col_pos)?.ty;
1595 if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
1596 return None;
1597 }
1598 let mut max: Option<i64> = None;
1599 for row in &self.rows {
1600 match row.values.get(col_pos) {
1601 Some(Value::SmallInt(n)) => {
1602 let v = i64::from(*n);
1603 max = Some(max.map_or(v, |m| m.max(v)));
1604 }
1605 Some(Value::Int(n)) => {
1606 let v = i64::from(*n);
1607 max = Some(max.map_or(v, |m| m.max(v)));
1608 }
1609 Some(Value::BigInt(n)) => {
1610 max = Some(max.map_or(*n, |m| m.max(*n)));
1611 }
1612 _ => {}
1613 }
1614 }
1615 Some(max.map_or(1, |m| m + 1))
1616 }
1617
1618 /// Return the first index defined over `column_position`, if any.
1619 /// (`v0.8` supports at most one index per column logically; the search
1620 /// just picks the first match.)
1621 pub fn index_on(&self, column_position: usize) -> Option<&Index> {
1622 // v6.7.1 — prefer BTree (has the key→locator map needed
1623 // for `lookup_eq`) over BRIN (metadata-only). When only a
1624 // BRIN exists on the column, return None so the executor
1625 // falls back to the hot-tier row scan instead of trying
1626 // to use BRIN for an equality lookup (which would always
1627 // return an empty slice and look like "no rows matched").
1628 self.indices
1629 .iter()
1630 .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::BTree(_)))
1631 .or_else(|| {
1632 self.indices.iter().find(|i| {
1633 i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_))
1634 })
1635 })
1636 }
1637
    /// Insert one row after validating it matches the schema (length + type).
    /// Returns `StorageError` on mismatch — the table is left unchanged.
    /// Updates every defined index with the new row's key.
    ///
    /// Order of operations (each step relies on the previous one):
    /// 1. validate arity, nullability and per-column type compatibility;
    /// 2. append a `RowLocator::Hot` for the new row to every BTree / GIN
    ///    style index;
    /// 3. add the row's encoded size to `hot_bytes`;
    /// 4. push the row onto `self.rows`;
    /// 5. link the row into every NSW graph.
    ///
    /// # Errors
    /// - `StorageError::ArityMismatch` when the row's length differs from
    ///   the number of schema columns.
    /// - `StorageError::NullInNotNull` when a NULL lands in a
    ///   non-nullable column.
    /// - `StorageError::TypeMismatch` when a non-NULL value's type is not
    ///   compatible with its column type.
    pub fn insert(&mut self, row: Row) -> Result<(), StorageError> {
        if row.len() != self.schema.columns.len() {
            return Err(StorageError::ArityMismatch {
                expected: self.schema.columns.len(),
                actual: row.len(),
            });
        }
        for (i, (val, col)) in row.values.iter().zip(&self.schema.columns).enumerate() {
            if val.is_null() {
                if !col.nullable {
                    return Err(StorageError::NullInNotNull {
                        column: col.name.clone(),
                    });
                }
                // NULL in a nullable column: nothing further to type-check.
                continue;
            }
            // The NULL case was handled above, hence the "non-null" expect.
            let actual = val.data_type().expect("non-null");
            // Vector columns require both that the value's variant be Vector
            // *and* its dimension match. `actual == col.ty` already encodes
            // both because DataType::Vector carries the dim.
            //
            // VARCHAR(n) / CHAR(n) are storage-equivalent to TEXT — the
            // length / padding contract is enforced upstream by
            // `coerce_value`. Accept a `Text` value into either.
            //
            // NUMERIC's `Value::Numeric` carries its actual scale but the
            // column declares the *expected* scale (a scale-rescaled
            // Value::Numeric is produced upstream by `coerce_value`); the
            // structural check here only verifies "value is Numeric and
            // its scale equals the column scale".
            //
            // The middle clause also lets Text / Json / Jsonb values flow
            // into each other's columns, and Timestamp / Timestamptz
            // values into each other's columns.
            let compatible = actual == col.ty
                || matches!(
                    (actual, col.ty),
                    (
                        DataType::Text,
                        DataType::Varchar(_) | DataType::Char(_) | DataType::Json | DataType::Jsonb
                    ) | (DataType::Json | DataType::Jsonb, DataType::Text)
                        | (DataType::Json, DataType::Jsonb)
                        | (DataType::Jsonb, DataType::Json)
                        | (DataType::Timestamp, DataType::Timestamptz)
                        | (DataType::Timestamptz, DataType::Timestamp)
                )
                || matches!(
                    (actual, col.ty),
                    (
                        DataType::Numeric { scale: a, .. },
                        DataType::Numeric { scale: b, .. },
                    ) if a == b
                );
            if !compatible {
                return Err(StorageError::TypeMismatch {
                    column: col.name.clone(),
                    expected: col.ty,
                    actual,
                    position: i,
                });
            }
        }
        // Position the row will occupy once pushed; index entries are
        // written against it before the push happens.
        let new_row_idx = self.rows.len();
        // Pre-validate before mutating: ensure indices receive an IndexKey.
        // For NSW we defer the graph update to *after* the row is pushed
        // so the kNN search can see it in `self.rows`.
        for idx in &mut self.indices {
            match &mut idx.kind {
                IndexKind::BTree(map) => {
                    // Cells for which `IndexKey::from_value` yields `None`
                    // get no BTree entry.
                    if let Some(key) = IndexKey::from_value(&row.values[idx.column_position]) {
                        // v4.40: PersistentBTreeMap has no in-place entry-or-default.
                        // Clone-then-insert keeps the same semantics — for typical
                        // unique-key schemas the Vec is 1-element so the clone is
                        // O(1). For dup-heavy columns it's O(M) per insert, traded
                        // for the structural-sharing win at clone time.
                        let mut entries = map.get(&key).cloned().unwrap_or_default();
                        entries.push(RowLocator::Hot(new_row_idx));
                        map.insert_mut(key, entries);
                    }
                }
                IndexKind::Gin(map) => {
                    // v7.12.3 — extend posting list per lexeme word.
                    // NULL or non-TsVector cell → no-op (cell carries
                    // no lexemes to index).
                    if let Value::TsVector(lexemes) = &row.values[idx.column_position] {
                        for lex in lexemes {
                            let mut entries = map.get(&lex.word).cloned().unwrap_or_default();
                            entries.push(RowLocator::Hot(new_row_idx));
                            map.insert_mut(lex.word.clone(), entries);
                        }
                    }
                }
                IndexKind::GinTrgm(map) => {
                    // v7.15.0 — trigram GIN. Shingle the TEXT cell
                    // into PG-compatible 3-byte trigrams and extend
                    // each trigram's posting list.
                    if let Value::Text(s) = &row.values[idx.column_position] {
                        for tri in trgm::extract_trigrams(s) {
                            let mut entries = map.get(&tri).cloned().unwrap_or_default();
                            entries.push(RowLocator::Hot(new_row_idx));
                            map.insert_mut(tri, entries);
                        }
                    }
                }
                IndexKind::GinFulltext(map) => {
                    // v7.17.0 Phase 2.2 — MySQL FULLTEXT-shape
                    // GIN over a TEXT / VARCHAR cell. Tokenise
                    // via the storage-local `simple_lex` (same
                    // rule as `to_tsvector('simple', text)`) and
                    // extend each lexeme's posting list.
                    let text_cell = match &row.values[idx.column_position] {
                        Value::Text(s) => Some(s.as_str()),
                        // mysqldump-style mediumtext / longtext
                        // land as Value::Text on insert; varchar
                        // cells likewise. Anything else (NULL,
                        // integer, …) contributes no lexemes.
                        _ => None,
                    };
                    if let Some(s) = text_cell {
                        for lex in fts_simple::simple_lex(s) {
                            let mut entries = map.get(&lex).cloned().unwrap_or_default();
                            entries.push(RowLocator::Hot(new_row_idx));
                            map.insert_mut(lex, entries);
                        }
                    }
                }
                // NSW handled below after the row push (so the new row
                // is visible to the kNN-graph connect step). BRIN
                // carries no per-row state.
                IndexKind::Nsw(_) | IndexKind::Brin { .. } => {}
            }
        }
        // v5.2.1: maintain incremental hot-tier byte counter. Computed
        // before the move so we don't need to borrow `row` after push.
        // Saturating add: the counter pins at `u64::MAX` rather than
        // overflowing.
        self.hot_bytes = self
            .hot_bytes
            .saturating_add(row_body_encoded_len(&row, &self.schema) as u64);
        // v4.39.1: push_mut keeps streaming inserts at Vec::push speed when
        // the table is uniquely owned (the spg-embedded path); inside a TX
        // wrap where a Catalog snapshot exists, push_mut path-copies the
        // tail just like push() and the snapshot stays valid.
        self.rows.push_mut(row);
        // NSW updates after the push so the new row is visible to the
        // greedy search used during connect.
        // Same position as the pre-push `new_row_idx`, re-derived from
        // the post-push length.
        let new_row_idx = self.rows.len() - 1;
        // Collect the NSW slots first so the shared borrow of
        // `self.indices` ends before `nsw_insert_at` receives `self`.
        let nsw_targets: Vec<usize> = self
            .indices
            .iter()
            .enumerate()
            .filter_map(|(i, idx)| {
                if matches!(idx.kind, IndexKind::Nsw(_)) {
                    Some(i)
                } else {
                    None
                }
            })
            .collect();
        for idx_pos in nsw_targets {
            nsw_insert_at(self, idx_pos, new_row_idx);
        }
        Ok(())
    }
1799
1800 /// Build a new B-tree index over the named column. Rebuilds from
1801 /// existing rows. Errors if `column_name` doesn't exist or the index
1802 /// name is taken.
1803 pub fn add_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1804 if self.indices.iter().any(|i| i.name == name) {
1805 return Err(StorageError::DuplicateIndex { name });
1806 }
1807 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1808 StorageError::ColumnNotFound {
1809 column: column_name.into(),
1810 }
1811 })?;
1812 let mut idx = Index::new_btree(name, column_position);
1813 if let IndexKind::BTree(map) = &mut idx.kind {
1814 for (i, row) in self.rows.iter().enumerate() {
1815 if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
1816 let mut entries = map.get(&key).cloned().unwrap_or_default();
1817 entries.push(RowLocator::Hot(i));
1818 map.insert_mut(key, entries);
1819 }
1820 }
1821 }
1822 self.indices.push(idx);
1823 Ok(())
1824 }
1825
    /// Build a new NSW (HNSW-flavoured) index over the named column.
    /// Required for `ORDER BY col <-> literal LIMIT k` to plan as a
    /// graph traversal instead of a full scan. Column must be a Vector
    /// type. `m` is the maximum number of neighbours per node.
    ///
    /// Thin wrapper: delegates to `add_nsw_index_inner` with no
    /// pre-built graph (`None`), and returns whatever error that
    /// helper reports.
    pub fn add_nsw_index(
        &mut self,
        name: String,
        column_name: &str,
        m: usize,
    ) -> Result<(), StorageError> {
        self.add_nsw_index_inner(name, column_name, m, None)
    }
1838
1839 /// v6.0.4 — synchronous rebuild of the named NSW index. If
1840 /// `new_encoding` is `Some(target)` and differs from the column's
1841 /// current encoding, every stored cell at the indexed column is
1842 /// re-coded into the target encoding before the new graph
1843 /// builds. Returns `IndexNotFound` if no index by that name exists
1844 /// and `Unsupported` for non-NSW indexes (`BTree` REBUILD is a no-op
1845 /// the engine layer rejects, not a storage-level concept).
1846 ///
1847 /// Holds the caller's `&mut self` for the duration — no
1848 /// concurrency / staging / WAL-replay machinery in v6.0.4. The
1849 /// "live" optimisation lands as v6.0.4.1.
1850 pub fn rebuild_nsw_index(
1851 &mut self,
1852 name: &str,
1853 new_encoding: Option<VecEncoding>,
1854 ) -> Result<(), StorageError> {
1855 let idx_pos = self
1856 .indices
1857 .iter()
1858 .position(|i| i.name == name)
1859 .ok_or_else(|| StorageError::IndexNotFound {
1860 name: String::from(name),
1861 })?;
1862 let col_pos = self.indices[idx_pos].column_position;
1863 let m = match &self.indices[idx_pos].kind {
1864 IndexKind::Nsw(g) => g.m,
1865 IndexKind::BTree(_)
1866 | IndexKind::Brin { .. }
1867 | IndexKind::Gin(_)
1868 | IndexKind::GinTrgm(_)
1869 | IndexKind::GinFulltext(_) => {
1870 return Err(StorageError::Unsupported(format!(
1871 "ALTER INDEX REBUILD on non-NSW index {name:?} — only NSW indexes can rebuild"
1872 )));
1873 }
1874 };
1875 let col_name = self.schema.columns[col_pos].name.clone();
1876 // 1. Optional re-encoding pass. Done first so the cells
1877 // match the schema before the graph rebuild walks them.
1878 if let Some(target) = new_encoding {
1879 let current = match self.schema.columns[col_pos].ty {
1880 DataType::Vector { encoding, .. } => encoding,
1881 ref other => {
1882 return Err(StorageError::Unsupported(format!(
1883 "ALTER INDEX REBUILD WITH (encoding=…) on non-vector column type {other:?}"
1884 )));
1885 }
1886 };
1887 if target != current {
1888 let DataType::Vector { dim, .. } = self.schema.columns[col_pos].ty else {
1889 unreachable!("checked above")
1890 };
1891 let n = self.rows.len();
1892 for i in 0..n {
1893 let row = self
1894 .rows
1895 .get_mut(i)
1896 .expect("row index in bounds (we iterated up to len())");
1897 let cell = core::mem::replace(&mut row.values[col_pos], Value::Null);
1898 let recoded = recode_vector_cell(cell, target)?;
1899 row.values[col_pos] = recoded;
1900 }
1901 self.schema.columns[col_pos].ty = DataType::Vector {
1902 dim,
1903 encoding: target,
1904 };
1905 }
1906 }
1907 // 2. Drop the existing index slot + rebuild from row payload.
1908 self.indices.remove(idx_pos);
1909 self.add_nsw_index_inner(String::from(name), &col_name, m, None)?;
1910 Ok(())
1911 }
1912
    /// Restore an NSW index from a pre-built graph (used on
    /// deserialize). Skips the bulk-build pass since the topology is
    /// already known. Returns `DuplicateIndex` or `ColumnNotFound` on
    /// schema mismatch as usual.
    ///
    /// The neighbour limit passed to `add_nsw_index_inner` is read from
    /// the supplied graph (`graph.m`), which is then handed over as the
    /// pre-built graph.
    pub fn restore_nsw_index(
        &mut self,
        name: String,
        column_name: &str,
        graph: NswGraph,
    ) -> Result<(), StorageError> {
        self.add_nsw_index_inner(name, column_name, graph.m, Some(graph))
    }
1925
1926 /// Restore a `BTree` index from a pre-built `(IndexKey, Vec<RowLocator>)`
1927 /// map. Used by [`Catalog::deserialize`] when reading a v9 (or later)
1928 /// catalog snapshot — the map travels on disk so cold-tier locators
1929 /// survive a round-trip, instead of being rebuilt from `self.rows`
1930 /// (which would lose every Cold entry). Same error contract as
1931 /// [`Table::add_index`].
1932 pub fn restore_btree_index(
1933 &mut self,
1934 name: String,
1935 column_name: &str,
1936 map: PersistentBTreeMap<IndexKey, Vec<RowLocator>>,
1937 ) -> Result<(), StorageError> {
1938 if self.indices.iter().any(|i| i.name == name) {
1939 return Err(StorageError::DuplicateIndex { name });
1940 }
1941 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1942 StorageError::ColumnNotFound {
1943 column: column_name.into(),
1944 }
1945 })?;
1946 self.indices.push(Index {
1947 name,
1948 column_position,
1949 kind: IndexKind::BTree(map),
1950 included_columns: Vec::new(),
1951 partial_predicate: None,
1952 expression: None,
1953 is_unique: false,
1954 extra_column_positions: Vec::new(),
1955 });
1956 Ok(())
1957 }
1958
1959 /// v6.7.1 — public restore counterpart for BRIN indices. Used
1960 /// by `Catalog::deserialize` when a v10 snapshot carries a
1961 /// BRIN index entry. BRIN carries no in-memory data — only the
1962 /// `column_type` snapshot is restored.
1963 pub fn restore_brin_index(
1964 &mut self,
1965 name: String,
1966 column_name: &str,
1967 column_type: DataType,
1968 ) -> Result<(), StorageError> {
1969 if self.indices.iter().any(|i| i.name == name) {
1970 return Err(StorageError::DuplicateIndex { name });
1971 }
1972 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1973 StorageError::ColumnNotFound {
1974 column: column_name.into(),
1975 }
1976 })?;
1977 self.indices
1978 .push(Index::new_brin(name, column_position, column_type));
1979 Ok(())
1980 }
1981
1982 /// v6.7.1 — public CREATE INDEX counterpart for BRIN. Creates
1983 /// the index entry with a snapshot of the indexed column's
1984 /// current `DataType`.
1985 pub fn add_brin_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
1986 if self.indices.iter().any(|i| i.name == name) {
1987 return Err(StorageError::DuplicateIndex { name });
1988 }
1989 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
1990 StorageError::ColumnNotFound {
1991 column: column_name.into(),
1992 }
1993 })?;
1994 let column_type = self.schema.columns[column_position].ty;
1995 self.indices
1996 .push(Index::new_brin(name, column_position, column_type));
1997 Ok(())
1998 }
1999
2000 /// v7.12.3 — Build a new GIN inverted index over a `tsvector`
2001 /// column. Populates posting lists from existing rows. Errors
2002 /// if the column doesn't exist, isn't `TsVector`, or the index
2003 /// name is taken.
2004 pub fn add_gin_index(&mut self, name: String, column_name: &str) -> Result<(), StorageError> {
2005 if self.indices.iter().any(|i| i.name == name) {
2006 return Err(StorageError::DuplicateIndex { name });
2007 }
2008 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2009 StorageError::ColumnNotFound {
2010 column: column_name.into(),
2011 }
2012 })?;
2013 if self.schema.columns[column_position].ty != DataType::TsVector {
2014 return Err(StorageError::Corrupt(format!(
2015 "GIN index {name:?} requires a tsvector column; \
2016 {column_name:?} is {:?}",
2017 self.schema.columns[column_position].ty
2018 )));
2019 }
2020 let mut idx = Index::new_gin(name, column_position);
2021 if let IndexKind::Gin(map) = &mut idx.kind {
2022 for (i, row) in self.rows.iter().enumerate() {
2023 if let Value::TsVector(lexemes) = &row.values[column_position] {
2024 for lex in lexemes {
2025 let mut entries = map.get(&lex.word).cloned().unwrap_or_default();
2026 entries.push(RowLocator::Hot(i));
2027 map.insert_mut(lex.word.clone(), entries);
2028 }
2029 }
2030 }
2031 }
2032 self.indices.push(idx);
2033 Ok(())
2034 }
2035
2036 /// v7.12.3 — Restore a GIN index from a deserialised snapshot.
2037 /// Mirrors [`Self::restore_btree_index`] but takes the GIN's
2038 /// `word → Vec<RowLocator>` posting-list map (already populated
2039 /// from the catalog stream) instead of an `IndexKey` map.
2040 pub fn restore_gin_index(
2041 &mut self,
2042 name: String,
2043 column_name: &str,
2044 map: PersistentBTreeMap<String, Vec<RowLocator>>,
2045 ) -> Result<(), StorageError> {
2046 if self.indices.iter().any(|i| i.name == name) {
2047 return Err(StorageError::DuplicateIndex { name });
2048 }
2049 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2050 StorageError::ColumnNotFound {
2051 column: column_name.into(),
2052 }
2053 })?;
2054 let mut idx = Index::new_gin(name, column_position);
2055 idx.kind = IndexKind::Gin(map);
2056 self.indices.push(idx);
2057 Ok(())
2058 }
2059
2060 /// v7.15.0 — `gin_trgm_ops` GIN over a TEXT column. Walks
2061 /// every row, shingles the cell into PG-compatible trigrams,
2062 /// and builds the posting-list map. NULL / non-TEXT cells
2063 /// contribute nothing (no trigrams).
2064 pub fn add_gin_trgm_index(
2065 &mut self,
2066 name: String,
2067 column_name: &str,
2068 ) -> Result<(), StorageError> {
2069 if self.indices.iter().any(|i| i.name == name) {
2070 return Err(StorageError::DuplicateIndex { name });
2071 }
2072 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2073 StorageError::ColumnNotFound {
2074 column: column_name.into(),
2075 }
2076 })?;
2077 if !matches!(
2078 self.schema.columns[column_position].ty,
2079 DataType::Text | DataType::Varchar(_)
2080 ) {
2081 return Err(StorageError::Corrupt(format!(
2082 "trigram-GIN index {name:?} requires a TEXT/VARCHAR column; \
2083 {column_name:?} is {:?}",
2084 self.schema.columns[column_position].ty
2085 )));
2086 }
2087 let mut idx = Index::new_gin_trgm(name, column_position);
2088 if let IndexKind::GinTrgm(map) = &mut idx.kind {
2089 for (i, row) in self.rows.iter().enumerate() {
2090 if let Value::Text(s) = &row.values[column_position] {
2091 for tri in trgm::extract_trigrams(s) {
2092 let mut entries = map.get(&tri).cloned().unwrap_or_default();
2093 entries.push(RowLocator::Hot(i));
2094 map.insert_mut(tri, entries);
2095 }
2096 }
2097 }
2098 }
2099 self.indices.push(idx);
2100 Ok(())
2101 }
2102
2103 /// v7.15.0 — restore a trigram-GIN from its catalog snapshot
2104 /// payload. Mirrors [`Self::restore_gin_index`].
2105 pub fn restore_gin_trgm_index(
2106 &mut self,
2107 name: String,
2108 column_name: &str,
2109 map: PersistentBTreeMap<String, Vec<RowLocator>>,
2110 ) -> Result<(), StorageError> {
2111 if self.indices.iter().any(|i| i.name == name) {
2112 return Err(StorageError::DuplicateIndex { name });
2113 }
2114 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2115 StorageError::ColumnNotFound {
2116 column: column_name.into(),
2117 }
2118 })?;
2119 let mut idx = Index::new_gin_trgm(name, column_position);
2120 idx.kind = IndexKind::GinTrgm(map);
2121 self.indices.push(idx);
2122 Ok(())
2123 }
2124
2125 /// v7.17.0 Phase 2.2 — MySQL `FULLTEXT KEY` GIN over a TEXT
2126 /// column. Walks every row, tokenises the cell into lower-
2127 /// cased word lexemes (`fts_simple::simple_lex` — same rule
2128 /// as `to_tsvector('simple', text)`), and builds the
2129 /// posting-list map. NULL / non-TEXT cells contribute
2130 /// nothing (no lexemes).
2131 pub fn add_gin_fulltext_index(
2132 &mut self,
2133 name: String,
2134 column_name: &str,
2135 ) -> Result<(), StorageError> {
2136 if self.indices.iter().any(|i| i.name == name) {
2137 return Err(StorageError::DuplicateIndex { name });
2138 }
2139 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2140 StorageError::ColumnNotFound {
2141 column: column_name.into(),
2142 }
2143 })?;
2144 if !matches!(
2145 self.schema.columns[column_position].ty,
2146 DataType::Text | DataType::Varchar(_)
2147 ) {
2148 return Err(StorageError::Corrupt(format!(
2149 "fulltext-GIN index {name:?} requires a TEXT/VARCHAR column; \
2150 {column_name:?} is {:?}",
2151 self.schema.columns[column_position].ty
2152 )));
2153 }
2154 let mut idx = Index::new_gin_fulltext(name, column_position);
2155 if let IndexKind::GinFulltext(map) = &mut idx.kind {
2156 for (i, row) in self.rows.iter().enumerate() {
2157 if let Value::Text(s) = &row.values[column_position] {
2158 for lex in fts_simple::simple_lex(s) {
2159 let mut entries = map.get(&lex).cloned().unwrap_or_default();
2160 entries.push(RowLocator::Hot(i));
2161 map.insert_mut(lex, entries);
2162 }
2163 }
2164 }
2165 }
2166 self.indices.push(idx);
2167 Ok(())
2168 }
2169
2170 /// v7.17.0 Phase 2.2 — restore a fulltext-GIN from its
2171 /// catalog snapshot payload. Mirrors
2172 /// [`Self::restore_gin_trgm_index`].
2173 pub fn restore_gin_fulltext_index(
2174 &mut self,
2175 name: String,
2176 column_name: &str,
2177 map: PersistentBTreeMap<String, Vec<RowLocator>>,
2178 ) -> Result<(), StorageError> {
2179 if self.indices.iter().any(|i| i.name == name) {
2180 return Err(StorageError::DuplicateIndex { name });
2181 }
2182 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2183 StorageError::ColumnNotFound {
2184 column: column_name.into(),
2185 }
2186 })?;
2187 let mut idx = Index::new_gin_fulltext(name, column_position);
2188 idx.kind = IndexKind::GinFulltext(map);
2189 self.indices.push(idx);
2190 Ok(())
2191 }
2192
2193 /// v5.1: register cold-tier locators on a `BTree` index. Used
2194 /// after [`Catalog::load_segment_bytes`] to wire every cold-
2195 /// tier row's PK back to its segment so
2196 /// [`Catalog::lookup_by_pk`] can resolve it. Each call
2197 /// appends to the index — keys that already have hot or cold
2198 /// locators keep them. Returns the number of locators
2199 /// registered.
2200 ///
2201 /// Pre-v5.2 (freezer) this is the only path that adds Cold
2202 /// variants to a PB; post-freezer the background freezer
2203 /// thread produces these as a batch under the engine write
2204 /// lock and this API becomes its in-memory primitive.
2205 ///
2206 /// Errors if `index_name` doesn't exist or names an NSW graph
2207 /// (NSW indices don't carry per-key row locators — they're
2208 /// vector-search structures).
2209 pub fn register_cold_locators<I>(
2210 &mut self,
2211 index_name: &str,
2212 locators: I,
2213 ) -> Result<usize, StorageError>
2214 where
2215 I: IntoIterator<Item = (IndexKey, RowLocator)>,
2216 {
2217 let idx = self
2218 .indices
2219 .iter_mut()
2220 .find(|i| i.name == index_name)
2221 .ok_or_else(|| StorageError::Corrupt(format!("index {index_name:?} not found")))?;
2222 let map = match &mut idx.kind {
2223 IndexKind::BTree(map) => map,
2224 IndexKind::Nsw(_)
2225 | IndexKind::Brin { .. }
2226 | IndexKind::Gin(_)
2227 | IndexKind::GinTrgm(_)
2228 | IndexKind::GinFulltext(_) => {
2229 return Err(StorageError::Corrupt(format!(
2230 "index {index_name:?} is not BTree; cold locators apply only to BTree indices"
2231 )));
2232 }
2233 };
2234 let mut count = 0usize;
2235 for (key, locator) in locators {
2236 let mut entries = map.get(&key).cloned().unwrap_or_default();
2237 entries.push(locator);
2238 map.insert_mut(key, entries);
2239 count += 1;
2240 }
2241 Ok(count)
2242 }
2243
2244 /// v7.12.3 — GIN-side parallel to [`Self::register_cold_locators`].
2245 /// Re-attaches `word → cold RowLocator` posting-list entries after
2246 /// the from-rows rebuild loop. Errors when the index doesn't
2247 /// exist or isn't a GIN. Both tsvector-GIN and trigram-GIN
2248 /// variants share posting-list shape (`String → Vec<RowLocator>`),
2249 /// so this helper accepts either.
2250 pub fn register_gin_cold_locators<I>(
2251 &mut self,
2252 index_name: &str,
2253 locators: I,
2254 ) -> Result<usize, StorageError>
2255 where
2256 I: IntoIterator<Item = (String, RowLocator)>,
2257 {
2258 let idx = self
2259 .indices
2260 .iter_mut()
2261 .find(|i| i.name == index_name)
2262 .ok_or_else(|| StorageError::Corrupt(format!("index {index_name:?} not found")))?;
2263 let map = match &mut idx.kind {
2264 // v7.17.0 Phase 2.2 — fulltext-GIN posting lists are
2265 // shape-compatible with tsvector / trigram GINs, so
2266 // cold-locator re-attach handles all three.
2267 IndexKind::Gin(map) | IndexKind::GinTrgm(map) | IndexKind::GinFulltext(map) => map,
2268 IndexKind::BTree(_) | IndexKind::Nsw(_) | IndexKind::Brin { .. } => {
2269 return Err(StorageError::Corrupt(format!(
2270 "register_gin_cold_locators: index {index_name:?} is not GIN"
2271 )));
2272 }
2273 };
2274 let mut count = 0usize;
2275 for (word, locator) in locators {
2276 let mut entries = map.get(&word).cloned().unwrap_or_default();
2277 entries.push(locator);
2278 map.insert_mut(word, entries);
2279 count += 1;
2280 }
2281 Ok(count)
2282 }
2283
2284 /// v5.2.3: remove every `Cold` locator currently registered on
2285 /// `index_name` under the given `key`. `Hot` locators for the
2286 /// same key are left in place — useful when a row has just been
2287 /// promoted hot-side and the caller wants the old Cold pointer
2288 /// retired without losing the new hot entry.
2289 ///
2290 /// Returns the number of cold locators removed (0 when the key
2291 /// has only hot entries or the key isn't present at all).
2292 /// Errors when the index doesn't exist or isn't a `BTree`.
2293 pub fn remove_cold_locators_for_key(
2294 &mut self,
2295 index_name: &str,
2296 key: &IndexKey,
2297 ) -> Result<usize, StorageError> {
2298 let idx = self
2299 .indices
2300 .iter_mut()
2301 .find(|i| i.name == index_name)
2302 .ok_or_else(|| {
2303 StorageError::Corrupt(format!(
2304 "remove_cold_locators_for_key: index {index_name:?} not found"
2305 ))
2306 })?;
2307 let map = match &mut idx.kind {
2308 IndexKind::BTree(map) => map,
2309 IndexKind::Nsw(_)
2310 | IndexKind::Brin { .. }
2311 | IndexKind::Gin(_)
2312 | IndexKind::GinTrgm(_)
2313 | IndexKind::GinFulltext(_) => {
2314 return Err(StorageError::Corrupt(format!(
2315 "remove_cold_locators_for_key: index {index_name:?} is not BTree; \
2316 cold locators apply only to BTree indices"
2317 )));
2318 }
2319 };
2320 let Some(entries) = map.get(key) else {
2321 return Ok(0);
2322 };
2323 let mut kept: Vec<RowLocator> =
2324 entries.iter().copied().filter(RowLocator::is_hot).collect();
2325 let removed = entries.len() - kept.len();
2326 if removed == 0 {
2327 return Ok(0);
2328 }
2329 kept.shrink_to_fit();
2330 // PersistentBTreeMap has no remove API in v5.2; when every
2331 // locator for `key` was Cold, the key keeps an empty Vec
2332 // entry. `Index::lookup_eq` already treats `Some(&[])` and
2333 // `None` as the same empty slice (via `Vec::as_slice`), so
2334 // callers can't distinguish the two. The space cost is one
2335 // empty Vec per shadowed-then-promoted key — bounded and
2336 // recoverable when the future compaction job lands.
2337 map.insert_mut(key.clone(), kept);
2338 Ok(removed)
2339 }
2340
2341 /// v7.13.0 — append a new column to the schema and back-fill
2342 /// every existing row with `fill_value`. Used by the engine's
2343 /// `ALTER TABLE t ADD COLUMN …` handler (mailrs round-5 G1).
2344 /// Indices on existing columns keep working — column positions
2345 /// don't shift since the new column lands at the end — so no
2346 /// index rebuild is needed.
2347 pub fn add_column(&mut self, col: ColumnSchema, fill_value: Value) {
2348 self.schema.columns.push(col);
2349 let mut new_rows: PersistentVec<Row> = PersistentVec::new();
2350 for row in self.rows.iter() {
2351 let mut values = row.values.clone();
2352 values.push(fill_value.clone());
2353 new_rows.push_mut(Row::new(values));
2354 }
2355 self.rows = new_rows;
2356 }
2357
2358 /// v7.15.0 — replace the partial-index predicate source on
2359 /// the index at slot `idx`. Used by `ALTER TABLE … RENAME
2360 /// COLUMN` after the engine rewrites column-identifier
2361 /// references in the predicate source text. Pure metadata
2362 /// edit; index rows are unaffected (they're keyed by
2363 /// column position, not predicate text).
2364 pub fn set_partial_predicate(&mut self, idx: usize, pred: Option<String>) {
2365 debug_assert!(idx < self.indices.len());
2366 self.indices[idx].partial_predicate = pred;
2367 }
2368
2369 /// v7.15.0 — rename the column at `col_pos` to `new_name`.
2370 /// The on-disk row encoding is positional, so no row rewrite
2371 /// is needed; only the schema's column name changes. Indices,
2372 /// UCs, FKs all key off column positions and are unaffected.
2373 /// Source-text references that hold the column name (CHECK
2374 /// predicates, partial-index predicates, runtime DEFAULT
2375 /// expressions, trigger `UPDATE OF` lists) are rewritten by
2376 /// the engine before this helper is called — the storage
2377 /// layer doesn't depend on `spg-sql` and so can't re-parse the
2378 /// predicate sources itself.
2379 pub fn rename_column(&mut self, col_pos: usize, new_name: &str) {
2380 debug_assert!(col_pos < self.schema.columns.len());
2381 self.schema.columns[col_pos].name = new_name.to_string();
2382 }
2383
2384 /// v7.13.3 — drop the column at `col_pos`. Removes the entry
2385 /// from the schema, the value from every row, any index that
2386 /// references the column (pure drop, not shift), and shifts
2387 /// every remaining index/UC/FK column position that pointed
2388 /// past `col_pos` down by one. Used by `ALTER TABLE t DROP
2389 /// COLUMN <c>` (mailrs round-7 S8). FK dependents on this
2390 /// column must already have been removed by the caller (CASCADE
2391 /// path); the helper assumes only same-column index removal is
2392 /// needed.
2393 pub fn drop_column(&mut self, col_pos: usize) {
2394 debug_assert!(col_pos < self.schema.columns.len());
2395 // Strip the column from the schema.
2396 self.schema.columns.remove(col_pos);
2397 // Rewrite every row to omit the cell at col_pos.
2398 let mut new_rows: PersistentVec<Row> = PersistentVec::new();
2399 for row in self.rows.iter() {
2400 let mut values = row.values.clone();
2401 if col_pos < values.len() {
2402 values.remove(col_pos);
2403 }
2404 new_rows.push_mut(Row::new(values));
2405 }
2406 self.rows = new_rows;
2407 // Drop indices on the column outright; shift the rest.
2408 self.indices.retain(|idx| idx.column_position != col_pos);
2409 for idx in &mut self.indices {
2410 if idx.column_position > col_pos {
2411 idx.column_position -= 1;
2412 }
2413 // Same shift for any included-columns reference.
2414 for inc in &mut idx.included_columns {
2415 if *inc > col_pos {
2416 *inc -= 1;
2417 }
2418 }
2419 }
2420 // Shift uniqueness-constraint column positions (and drop
2421 // entries that lose all columns, though that shouldn't
2422 // happen in practice — caller has already CASCADE-removed
2423 // FKs and there's no general CASCADE for UCs).
2424 let mut surviving_ucs: Vec<UniquenessConstraint> = Vec::new();
2425 for mut uc in core::mem::take(&mut self.schema.uniqueness_constraints) {
2426 uc.columns.retain(|&c| c != col_pos);
2427 if uc.columns.is_empty() {
2428 continue;
2429 }
2430 for c in &mut uc.columns {
2431 if *c > col_pos {
2432 *c -= 1;
2433 }
2434 }
2435 surviving_ucs.push(uc);
2436 }
2437 self.schema.uniqueness_constraints = surviving_ucs;
2438 // Shift FK local_columns (parent-pointing column positions
2439 // are off-table and untouched).
2440 for fk in &mut self.schema.foreign_keys {
2441 for c in &mut fk.local_columns {
2442 if *c > col_pos {
2443 *c -= 1;
2444 }
2445 }
2446 }
2447 // Rebuild remaining indices' payload — the column-position
2448 // shift means existing IndexKey entries are still keyed by
2449 // the same column data but the position numbers changed;
2450 // existing key→locator maps stay valid because they're
2451 // keyed by Value not position. The rebuild is conservative
2452 // — same pattern delete_rows uses post-mutation.
2453 self.rebuild_indices();
2454 }
2455
2456 /// v4.4: delete the rows at the given positions in one pass.
2457 /// `positions` must be unique; ordering doesn't matter. Indices
2458 /// are rebuilt from scratch (cheaper than tracking incremental
2459 /// shifts across both B-tree and NSW). Returns the number of
2460 /// rows removed.
2461 /// v7.17.0 Phase 1.3 — wipe every row. Used by REFRESH
2462 /// MATERIALIZED VIEW; same effect as `delete_rows((0..N).into())`
2463 /// but skips the per-position bookkeeping for the all-removed
2464 /// fast path. Indices are rebuilt (empty).
2465 pub fn truncate(&mut self) {
2466 self.rows = PersistentVec::new();
2467 self.hot_bytes = 0;
2468 self.rebuild_indices();
2469 }
2470
2471 pub fn delete_rows(&mut self, positions: &[usize]) -> usize {
2472 if positions.is_empty() {
2473 return 0;
2474 }
2475 // Mark positions; v4.39: PV has no in-place retain, so we rebuild
2476 // a fresh PV by pushing the survivors. Still O(n log₃₂ n); the
2477 // structural-sharing win shows up at `Catalog::clone()`, not here.
2478 let mut to_remove = alloc::vec![false; self.rows.len()];
2479 let mut removed = 0;
2480 for &p in positions {
2481 if p < to_remove.len() && !to_remove[p] {
2482 to_remove[p] = true;
2483 removed += 1;
2484 }
2485 }
2486 let mut new_rows: PersistentVec<Row> = PersistentVec::new();
2487 let mut removed_bytes: u64 = 0;
2488 for (i, row) in self.rows.iter().enumerate() {
2489 if to_remove[i] {
2490 removed_bytes =
2491 removed_bytes.saturating_add(row_body_encoded_len(row, &self.schema) as u64);
2492 } else {
2493 new_rows.push_mut(row.clone());
2494 }
2495 }
2496 self.rows = new_rows;
2497 self.hot_bytes = self.hot_bytes.saturating_sub(removed_bytes);
2498 self.rebuild_indices();
2499 removed
2500 }
2501
2502 /// v4.4: replace the row at `position` with `new_values` (must
2503 /// match the schema arity + types). v7.20: index maintenance is
2504 /// incremental — only indices whose key value changed are
2505 /// touched (B-tree entry move in place; NSW / BRIN / GIN fall
2506 /// back to a full rebuild when their column changed).
2507 pub fn update_row(
2508 &mut self,
2509 position: usize,
2510 new_values: Vec<Value>,
2511 ) -> Result<(), StorageError> {
2512 if position >= self.rows.len() {
2513 return Err(StorageError::Corrupt(alloc::format!(
2514 "update_row: position {position} out of bounds (rows={})",
2515 self.rows.len()
2516 )));
2517 }
2518 if new_values.len() != self.schema.columns.len() {
2519 return Err(StorageError::ArityMismatch {
2520 expected: self.schema.columns.len(),
2521 actual: new_values.len(),
2522 });
2523 }
2524 // Reuse the per-cell type-compat validation that `insert`
2525 // applies. The body below mirrors that check intentionally —
2526 // factoring it would be more code than the duplication.
2527 for (i, (val, col)) in new_values.iter().zip(&self.schema.columns).enumerate() {
2528 if val.is_null() {
2529 if !col.nullable {
2530 return Err(StorageError::NullInNotNull {
2531 column: col.name.clone(),
2532 });
2533 }
2534 continue;
2535 }
2536 let actual = val.data_type().expect("non-null");
2537 let compatible = actual == col.ty
2538 || matches!(
2539 (actual, col.ty),
2540 (
2541 DataType::Text,
2542 DataType::Varchar(_) | DataType::Char(_) | DataType::Json | DataType::Jsonb
2543 ) | (DataType::Json | DataType::Jsonb, DataType::Text)
2544 | (DataType::Json, DataType::Jsonb)
2545 | (DataType::Jsonb, DataType::Json)
2546 | (DataType::Timestamp, DataType::Timestamptz)
2547 | (DataType::Timestamptz, DataType::Timestamp)
2548 )
2549 || matches!(
2550 (actual, col.ty),
2551 (
2552 DataType::Numeric { scale: a, .. },
2553 DataType::Numeric { scale: b, .. },
2554 ) if a == b
2555 );
2556 if !compatible {
2557 return Err(StorageError::TypeMismatch {
2558 column: col.name.clone(),
2559 expected: col.ty,
2560 actual,
2561 position: i,
2562 });
2563 }
2564 }
2565 let old_row = self
2566 .rows
2567 .get(position)
2568 .expect("position bounds-checked above");
2569 let old_bytes = row_body_encoded_len(old_row, &self.schema) as u64;
2570 let new_row = Row::new(new_values);
2571 let new_bytes = row_body_encoded_len(&new_row, &self.schema) as u64;
2572 // v7.20 P4 — incremental index maintenance. `rows.set`
2573 // replaces the row in place, so every OTHER row's Hot
2574 // locator stays valid; only indices whose key value
2575 // actually changed at `position` need touching. The
2576 // common OLTP shape (`UPDATE … SET non_indexed_col = …
2577 // WHERE pk = $1`) touches no index at all — pre-v7.20
2578 // this path paid a full rebuild_indices() (O(rows ×
2579 // indices)) per UPDATE, which dominated the profiled
2580 // write cost on a 5k-row table (~1 ms/stmt).
2581 //
2582 // BTree gets an in-place entry move (drop Hot(position)
2583 // from the old key's locator list, append to the new
2584 // key's). NSW graphs / BRIN summaries / GIN posting
2585 // lists have no cheap single-key move — a changed column
2586 // under one of those falls back to the full rebuild.
2587 enum IdxFix {
2588 BTreeMove {
2589 idx_pos: usize,
2590 old_key: Option<IndexKey>,
2591 new_key: Option<IndexKey>,
2592 },
2593 FullRebuild,
2594 }
2595 let mut fixes: Vec<IdxFix> = Vec::new();
2596 for (idx_pos, idx) in self.indices.iter().enumerate() {
2597 let col = idx.column_position;
2598 let old_v = &old_row.values[col];
2599 let new_v = &new_row.values[col];
2600 if old_v == new_v {
2601 continue;
2602 }
2603 match &idx.kind {
2604 IndexKind::BTree(_) => fixes.push(IdxFix::BTreeMove {
2605 idx_pos,
2606 old_key: IndexKey::from_value(old_v),
2607 new_key: IndexKey::from_value(new_v),
2608 }),
2609 IndexKind::Nsw(_)
2610 | IndexKind::Brin { .. }
2611 | IndexKind::Gin(_)
2612 | IndexKind::GinTrgm(_)
2613 | IndexKind::GinFulltext(_) => {
2614 fixes.clear();
2615 fixes.push(IdxFix::FullRebuild);
2616 break;
2617 }
2618 }
2619 }
2620 self.rows = self
2621 .rows
2622 .set(position, new_row)
2623 .expect("position bounds-checked above");
2624 self.hot_bytes = self
2625 .hot_bytes
2626 .saturating_sub(old_bytes)
2627 .saturating_add(new_bytes);
2628 for fix in fixes {
2629 match fix {
2630 IdxFix::FullRebuild => {
2631 self.rebuild_indices();
2632 break;
2633 }
2634 IdxFix::BTreeMove {
2635 idx_pos,
2636 old_key,
2637 new_key,
2638 } => {
2639 let IndexKind::BTree(map) = &mut self.indices[idx_pos].kind else {
2640 unreachable!("IdxFix::BTreeMove built from a BTree index");
2641 };
2642 // NULL keys never enter the B-tree (from_value
2643 // returns None), so a None on either side means
2644 // "no entry on that side".
2645 if let Some(k) = old_key
2646 && let Some(locs) = map.get(&k)
2647 {
2648 let mut locs = locs.clone();
2649 locs.retain(|l| *l != RowLocator::Hot(position));
2650 // No remove_mut on the persistent map: an
2651 // empty locator list is the tombstone —
2652 // lookup_eq returns an empty slice, and the
2653 // next rebuild_indices() drops the key.
2654 map.insert_mut(k, locs);
2655 }
2656 if let Some(k) = new_key {
2657 let mut entries = map.get(&k).cloned().unwrap_or_default();
2658 entries.push(RowLocator::Hot(position));
2659 map.insert_mut(k, entries);
2660 }
2661 }
2662 }
2663 }
2664 Ok(())
2665 }
2666
2667 /// v4.4 helper used by `delete_rows` / `update_row`: discard all
2668 /// index payloads and rebuild from `self.rows`. Cheap enough
2669 /// for typical SPG scale (catalogs in the docker-compose
2670 /// deployment shape are small); the alternative — incremental
2671 /// shift bookkeeping across B-tree + NSW — would be far more
2672 /// invasive than the savings justify.
2673 fn rebuild_indices(&mut self) {
2674 // v5.2.3: capture every `Cold` locator on every BTree index
2675 // before the rebuild, so the from-rows re-emission below
2676 // (which only produces `Hot` locators) doesn't drop cold-
2677 // tier entries on keys unrelated to the row that changed.
2678 // Pre-v5.2.3 this was a `freeze_oldest_to_cold` worry only
2679 // and the freezer did its own capture-then-reregister; v5.2.3
2680 // promotes that pattern into the base helper because UPDATE
2681 // / DELETE now run rebuild_indices on tables with cold rows.
2682 let preserved_cold: Vec<(String, Vec<(IndexKey, RowLocator)>)> = self
2683 .indices
2684 .iter()
2685 .filter_map(|idx| match &idx.kind {
2686 IndexKind::BTree(map) => {
2687 let cold: Vec<(IndexKey, RowLocator)> = map
2688 .iter()
2689 .flat_map(|(k, locs)| {
2690 locs.iter()
2691 .filter(|l| l.is_cold())
2692 .copied()
2693 .map(move |l| (k.clone(), l))
2694 })
2695 .collect();
2696 if cold.is_empty() {
2697 None
2698 } else {
2699 Some((idx.name.clone(), cold))
2700 }
2701 }
2702 // BRIN / NSW carry no key→locator map. GIN handles
2703 // its own cold preservation below in `preserved_gin_cold`.
2704 IndexKind::Nsw(_)
2705 | IndexKind::Brin { .. }
2706 | IndexKind::Gin(_)
2707 | IndexKind::GinTrgm(_)
2708 | IndexKind::GinFulltext(_) => None,
2709 })
2710 .collect();
2711
2712 // v7.12.3 — same cold-preservation pattern for GIN's
2713 // `word → Vec<RowLocator>` posting lists. Parallel to the
2714 // BTree pass above (different key type so a separate vec is
2715 // cleaner than a generic merge). v7.15.0: trigram-GIN
2716 // (`gin_trgm_ops`) shares the same posting-list shape, so
2717 // one pass handles both — the `RebuildKind` carries the
2718 // kind tag to drive resurrection.
2719 let preserved_gin_cold: Vec<(String, Vec<(String, RowLocator)>)> = self
2720 .indices
2721 .iter()
2722 .filter_map(|idx| match &idx.kind {
2723 // v7.17.0 Phase 2.2 — fulltext-GIN posting lists
2724 // share the `String → Vec<RowLocator>` shape, so
2725 // cold preservation handles all three GIN flavours
2726 // in one pass.
2727 IndexKind::Gin(map) | IndexKind::GinTrgm(map) | IndexKind::GinFulltext(map) => {
2728 let cold: Vec<(String, RowLocator)> = map
2729 .iter()
2730 .flat_map(|(w, locs)| {
2731 locs.iter()
2732 .filter(|l| l.is_cold())
2733 .copied()
2734 .map(move |l| (w.clone(), l))
2735 })
2736 .collect();
2737 if cold.is_empty() {
2738 None
2739 } else {
2740 Some((idx.name.clone(), cold))
2741 }
2742 }
2743 IndexKind::BTree(_) | IndexKind::Nsw(_) | IndexKind::Brin { .. } => None,
2744 })
2745 .collect();
2746
2747 // v6.7.1 — descriptor needs to capture index kind so the
2748 // rebuild loop can resurrect BTree / NSW / BRIN / GIN exactly
2749 // as they were. (NSW carries m; BRIN carries the column type
2750 // snapshot; BTree / GIN need no extra payload.)
2751 #[derive(Clone)]
2752 enum RebuildKind {
2753 BTree,
2754 Nsw(usize),
2755 Brin(DataType),
2756 Gin,
2757 GinTrgm,
2758 GinFulltext,
2759 }
2760 let descriptors: Vec<(String, usize, RebuildKind)> = self
2761 .indices
2762 .iter()
2763 .map(|idx| {
2764 let kind = match &idx.kind {
2765 IndexKind::Nsw(g) => RebuildKind::Nsw(g.m),
2766 IndexKind::Brin { column_type } => RebuildKind::Brin(*column_type),
2767 IndexKind::BTree(_) => RebuildKind::BTree,
2768 IndexKind::Gin(_) => RebuildKind::Gin,
2769 IndexKind::GinTrgm(_) => RebuildKind::GinTrgm,
2770 IndexKind::GinFulltext(_) => RebuildKind::GinFulltext,
2771 };
2772 (idx.name.clone(), idx.column_position, kind)
2773 })
2774 .collect();
2775 self.indices.clear();
2776 for (name, column_position, rebuild_kind) in descriptors {
2777 match rebuild_kind {
2778 RebuildKind::Nsw(m) => {
2779 let idx = Index::new_nsw(name, column_position, m);
2780 self.indices.push(idx);
2781 let idx_pos = self.indices.len() - 1;
2782 let row_indices: Vec<usize> = (0..self.rows.len()).collect();
2783 for row_idx in row_indices {
2784 nsw_insert_at(self, idx_pos, row_idx);
2785 }
2786 }
2787 RebuildKind::Brin(column_type) => {
2788 // BRIN has no in-memory rebuild — the summaries
2789 // live in cold segments which freeze emits.
2790 self.indices
2791 .push(Index::new_brin(name, column_position, column_type));
2792 }
2793 RebuildKind::BTree => {
2794 let mut idx = Index::new_btree(name, column_position);
2795 if let IndexKind::BTree(map) = &mut idx.kind {
2796 for (i, row) in self.rows.iter().enumerate() {
2797 if let Some(key) = IndexKey::from_value(&row.values[column_position]) {
2798 let mut entries = map.get(&key).cloned().unwrap_or_default();
2799 entries.push(RowLocator::Hot(i));
2800 map.insert_mut(key, entries);
2801 }
2802 }
2803 }
2804 self.indices.push(idx);
2805 }
2806 RebuildKind::Gin => {
2807 let mut idx = Index::new_gin(name, column_position);
2808 if let IndexKind::Gin(map) = &mut idx.kind {
2809 for (i, row) in self.rows.iter().enumerate() {
2810 if let Value::TsVector(lexemes) = &row.values[column_position] {
2811 for lex in lexemes {
2812 let mut entries =
2813 map.get(&lex.word).cloned().unwrap_or_default();
2814 entries.push(RowLocator::Hot(i));
2815 map.insert_mut(lex.word.clone(), entries);
2816 }
2817 }
2818 }
2819 }
2820 self.indices.push(idx);
2821 }
2822 RebuildKind::GinTrgm => {
2823 let mut idx = Index::new_gin_trgm(name, column_position);
2824 if let IndexKind::GinTrgm(map) = &mut idx.kind {
2825 for (i, row) in self.rows.iter().enumerate() {
2826 if let Value::Text(s) = &row.values[column_position] {
2827 for tri in trgm::extract_trigrams(s) {
2828 let mut entries = map.get(&tri).cloned().unwrap_or_default();
2829 entries.push(RowLocator::Hot(i));
2830 map.insert_mut(tri, entries);
2831 }
2832 }
2833 }
2834 }
2835 self.indices.push(idx);
2836 }
2837 RebuildKind::GinFulltext => {
2838 // v7.17.0 Phase 2.2 — re-derive the lexeme
2839 // posting list from each TEXT/VARCHAR cell.
2840 // Mirrors the GinTrgm rebuild shape but
2841 // tokenises via `fts_simple::simple_lex`
2842 // (same rule as `to_tsvector('simple')`).
2843 let mut idx = Index::new_gin_fulltext(name, column_position);
2844 if let IndexKind::GinFulltext(map) = &mut idx.kind {
2845 for (i, row) in self.rows.iter().enumerate() {
2846 if let Value::Text(s) = &row.values[column_position] {
2847 for lex in fts_simple::simple_lex(s) {
2848 let mut entries = map.get(&lex).cloned().unwrap_or_default();
2849 entries.push(RowLocator::Hot(i));
2850 map.insert_mut(lex, entries);
2851 }
2852 }
2853 }
2854 }
2855 self.indices.push(idx);
2856 }
2857 }
2858 }
2859
2860 // Re-attach preserved cold locators after the from-rows
2861 // rebuild. `register_cold_locators` handles the per-key
2862 // entries-vec append; no key collisions arise because the
2863 // rebuild loop above produced only Hot locators.
2864 for (idx_name, locators) in preserved_cold {
2865 // Errors here would only fire if the index disappeared
2866 // between snapshot and rebuild, which can't happen
2867 // because the rebuild restores the same descriptor set.
2868 let _ = self.register_cold_locators(&idx_name, locators);
2869 }
2870 // v7.12.3 — same for GIN posting-list cold locators.
2871 for (idx_name, locators) in preserved_gin_cold {
2872 let _ = self.register_gin_cold_locators(&idx_name, locators);
2873 }
2874 }
2875
2876 fn add_nsw_index_inner(
2877 &mut self,
2878 name: String,
2879 column_name: &str,
2880 m: usize,
2881 restore: Option<NswGraph>,
2882 ) -> Result<(), StorageError> {
2883 if self.indices.iter().any(|i| i.name == name) {
2884 return Err(StorageError::DuplicateIndex { name });
2885 }
2886 let column_position = self.schema.column_position(column_name).ok_or_else(|| {
2887 StorageError::ColumnNotFound {
2888 column: column_name.into(),
2889 }
2890 })?;
2891 if !matches!(
2892 self.schema.columns[column_position].ty,
2893 DataType::Vector { .. }
2894 ) {
2895 return Err(StorageError::TypeMismatch {
2896 column: column_name.into(),
2897 expected: DataType::Vector {
2898 dim: 0,
2899 encoding: VecEncoding::F32,
2900 },
2901 actual: self.schema.columns[column_position].ty,
2902 position: column_position,
2903 });
2904 }
2905 if let Some(graph) = restore {
2906 self.indices.push(Index {
2907 name,
2908 column_position,
2909 kind: IndexKind::Nsw(graph),
2910 included_columns: Vec::new(),
2911 partial_predicate: None,
2912 expression: None,
2913 is_unique: false,
2914 extra_column_positions: Vec::new(),
2915 });
2916 return Ok(());
2917 }
2918 let idx = Index::new_nsw(name, column_position, m);
2919 self.indices.push(idx);
2920 let idx_pos = self.indices.len() - 1;
2921 // Bulk-build by walking the existing rows in order — each insert
2922 // sees the partial graph and links into it.
2923 let row_indices: Vec<usize> = (0..self.rows.len()).collect();
2924 for row_idx in row_indices {
2925 nsw_insert_at(self, idx_pos, row_idx);
2926 }
2927 Ok(())
2928 }
2929}
2930
2931/// v6.0.4 — re-encode a single cell to the target `VecEncoding`.
2932/// Used by `Table::rebuild_nsw_index` when ALTER INDEX REBUILD
2933/// includes the optional `WITH (encoding = …)` clause. Round-trip
2934/// goes through f32: `current → Vec<f32> → target`, leaving NULL
2935/// cells untouched. Returns `Unsupported` on a non-vector cell —
2936/// the caller should have rejected the schema before reaching this.
2937fn recode_vector_cell(cell: Value, target: VecEncoding) -> Result<Value, StorageError> {
2938 if matches!(cell, Value::Null) {
2939 return Ok(cell);
2940 }
2941 // Step 1 — extract the f32 representation of the source cell.
2942 let as_f32: Vec<f32> = match &cell {
2943 Value::Vector(v) => v.clone(),
2944 Value::Sq8Vector(q) => quantize::dequantize(q),
2945 Value::HalfVector(h) => h.to_f32_vec(),
2946 other => {
2947 return Err(StorageError::Unsupported(format!(
2948 "ALTER INDEX REBUILD: cannot recode non-vector cell {:?}",
2949 other.data_type()
2950 )));
2951 }
2952 };
2953 // Step 2 — encode into the target shape. `F32` is the identity
2954 // path (saves one alloc round-trip when the source is already
2955 // F32 — but `Value::Vector(as_f32)` is the right answer
2956 // regardless).
2957 Ok(match target {
2958 VecEncoding::F32 => Value::Vector(as_f32),
2959 VecEncoding::Sq8 => Value::Sq8Vector(quantize::quantize(&as_f32)),
2960 VecEncoding::F16 => Value::HalfVector(halfvec::HalfVector::from_f32_slice(&as_f32)),
2961 })
2962}
2963
2964/// Insert one row into the HNSW graph held by index slot `idx_pos`.
2965/// No-op when the row's value at the indexed column isn't a vector.
2966/// v6.0.1: handles `Value::Sq8Vector` by dequantising into an f32
2967/// "query" surface — the existing greedy + beam-search machinery
2968/// then uses `cell_to_query_metric_distance` to route every
2969/// distance call through the cell's actual encoding.
2970fn nsw_insert_at(table: &mut Table, idx_pos: usize, new_row_idx: usize) {
2971 let col_pos = table.indices[idx_pos].column_position;
2972 let cell_dim: Option<usize> = match &table.rows[new_row_idx].values[col_pos] {
2973 Value::Vector(v) => Some(v.len()),
2974 Value::Sq8Vector(q) => Some(q.bytes.len()),
2975 Value::HalfVector(h) => Some(h.dim()),
2976 _ => None,
2977 };
2978 let Some(dim) = cell_dim else {
2979 // Even non-vector rows occupy a level slot so per-node Vec
2980 // lengths stay aligned with `table.rows.len()`.
2981 ensure_node_slot(table, idx_pos, new_row_idx, 0);
2982 return;
2983 };
2984 if dim == 0 {
2985 ensure_node_slot(table, idx_pos, new_row_idx, 0);
2986 return;
2987 }
2988 let level = nsw_assign_level(new_row_idx);
2989 ensure_node_slot(table, idx_pos, new_row_idx, level);
2990 let (entry, entry_level, m) = match &table.indices[idx_pos].kind {
2991 IndexKind::Nsw(g) => (g.entry, g.entry_level, g.m),
2992 IndexKind::BTree(_)
2993 | IndexKind::Brin { .. }
2994 | IndexKind::Gin(_)
2995 | IndexKind::GinTrgm(_)
2996 | IndexKind::GinFulltext(_) => {
2997 unreachable!("nsw_insert_at on a non-NSW index")
2998 }
2999 };
3000 // First node ever — declare it the entry (it gets its own level).
3001 if entry.is_none() {
3002 if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
3003 g.entry = Some(new_row_idx);
3004 g.entry_level = level;
3005 *g.levels
3006 .get_mut(new_row_idx)
3007 .expect("levels slot padded by ensure_node_slot") = level;
3008 }
3009 return;
3010 }
3011 // Set the node's recorded level.
3012 if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
3013 *g.levels
3014 .get_mut(new_row_idx)
3015 .expect("levels slot padded by ensure_node_slot") = level;
3016 }
3017 let query = match &table.rows[new_row_idx].values[col_pos] {
3018 Value::Vector(v) => v.clone(),
3019 // v6.0.1: dequantise the inserted SQ8 cell into an f32 query
3020 // surface so the existing greedy / beam machinery can route
3021 // distances through `cell_to_query_metric_distance`. The
3022 // small dequantisation error is what the recall@10 ≥ 0.95
3023 // envelope already accounts for (V6_DESIGN deliberation #3).
3024 Value::Sq8Vector(q) => quantize::dequantize(q),
3025 // v6.0.3: halfvec dequant is bit-exact at the storage layer,
3026 // so the inserted query is a faithful representation.
3027 Value::HalfVector(h) => h.to_f32_vec(),
3028 _ => return,
3029 };
3030 // Phase 1: greedy descend from `entry` down to `level + 1`, keeping
3031 // exactly one current best so the next layer starts from it.
3032 let mut current = entry.expect("entry was Some above");
3033 let mut current_d = vec_l2_sq(table, col_pos, current, &query);
3034 if entry_level > level {
3035 for layer in (level + 1..=entry_level).rev() {
3036 (current, current_d) =
3037 greedy_layer_walk(table, idx_pos, layer, current, current_d, &query);
3038 }
3039 }
3040 // Phase 2: from `min(level, entry_level)` down to 0, beam-search
3041 // `ef_construction` candidates, run the HNSW §4 heuristic neighbour
3042 // selection over them, and connect bidirectionally.
3043 let top = level.min(entry_level);
3044 let ef = (m * 2).max(8);
3045 for layer in (0..=top).rev() {
3046 let cap = if layer == 0 { m * 2 } else { m };
3047 let mut candidates = layer_beam_search(
3048 table,
3049 idx_pos,
3050 layer,
3051 current,
3052 current_d,
3053 &query,
3054 ef,
3055 NswMetric::L2,
3056 );
3057 candidates.retain(|&(_, n)| n != new_row_idx);
3058 // Take the closest as the entry for the next layer down — done
3059 // before heuristic narrowing because the heuristic can reorder.
3060 if let Some(&(d, n)) = candidates.first() {
3061 current = n;
3062 current_d = d;
3063 }
3064 let peers = select_neighbours_heuristic(&candidates, cap, table, col_pos);
3065 connect_at_layer(table, idx_pos, layer, new_row_idx, &peers);
3066 }
3067 // Phase 3: if the new node climbed above the current entry, take
3068 // over as entry so future inserts/searches start from the new top.
3069 if level > entry_level
3070 && let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
3071 {
3072 g.entry = Some(new_row_idx);
3073 g.entry_level = level;
3074 }
3075}
3076
3077/// Make sure `layers[*][new_row_idx]` and `levels[new_row_idx]` exist,
3078/// padding with empty/zero entries as needed. Also grows `layers` to
3079/// accommodate the node's top `level`.
3080fn ensure_node_slot(table: &mut Table, idx_pos: usize, new_row_idx: usize, level: u8) {
3081 let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind else {
3082 unreachable!("ensure_node_slot on a BTree index");
3083 };
3084 while g.layers.len() <= level as usize {
3085 g.layers.push(PersistentVec::new());
3086 }
3087 while g.levels.len() <= new_row_idx {
3088 g.levels.push_mut(0);
3089 }
3090 for layer_vec in &mut g.layers {
3091 while layer_vec.len() <= new_row_idx {
3092 layer_vec.push_mut(Vec::new());
3093 }
3094 }
3095}
3096
3097/// Single-step greedy walk on one layer: from `current` (with cached
3098/// distance `current_d`), inspect that node's neighbours at `layer` and
3099/// hop to the closest if it beats `current_d`. Repeat until no move
3100/// improves the distance. Cheap variant of beam-search used for the
3101/// "descend" phase that only needs one survivor per layer.
3102fn greedy_layer_walk(
3103 table: &Table,
3104 idx_pos: usize,
3105 layer: u8,
3106 mut current: usize,
3107 mut current_d: f32,
3108 query: &[f32],
3109) -> (usize, f32) {
3110 let g = match &table.indices[idx_pos].kind {
3111 IndexKind::Nsw(g) => g,
3112 IndexKind::BTree(_)
3113 | IndexKind::Brin { .. }
3114 | IndexKind::Gin(_)
3115 | IndexKind::GinTrgm(_)
3116 | IndexKind::GinFulltext(_) => {
3117 return (current, current_d);
3118 }
3119 };
3120 let col_pos = table.indices[idx_pos].column_position;
3121 loop {
3122 let neighbours: &[u32] = g
3123 .layers
3124 .get(layer as usize)
3125 .and_then(|layer_v| layer_v.get(current))
3126 .map_or(&[][..], Vec::as_slice);
3127 let mut best = current;
3128 let mut best_d = current_d;
3129 for &n in neighbours {
3130 let n = n as usize;
3131 let d = vec_l2_sq(table, col_pos, n, query);
3132 if d < best_d {
3133 best = n;
3134 best_d = d;
3135 }
3136 }
3137 if best == current {
3138 return (current, current_d);
3139 }
3140 current = best;
3141 current_d = best_d;
3142 }
3143}
3144
3145/// Beam search on one layer starting from `entry_node` with cached
3146/// `entry_d`. Returns the top `ef` candidates in ascending-distance
3147/// order. Caller picks the closest as the next layer's entry and / or
3148/// trims to M for connection.
3149///
3150/// v3.0.1: uses two `BinaryHeap`s (min-heap for the open frontier,
3151/// max-heap for the working top-`ef` results) and a `Vec<bool>` visited
3152/// bitmap, replacing the v2.x `Vec` + `partition_point` + `BTreeSet`
3153/// implementation. Same algorithm shape (HNSW search algorithm 2 from
3154/// the paper); the data-structure swap cuts per-visit cost from
3155/// `O(ef + log row_count)` to amortised `O(log ef)`.
3156#[allow(clippy::too_many_arguments)] // Beam search threads layer, entry, query, ef, metric — each is intrinsic. Bundling them into a config struct hides the call sites.
3157fn layer_beam_search(
3158 table: &Table,
3159 idx_pos: usize,
3160 layer: u8,
3161 entry_node: usize,
3162 entry_d: f32,
3163 query: &[f32],
3164 ef: usize,
3165 metric: NswMetric,
3166) -> Vec<(f32, usize)> {
3167 let g = match &table.indices[idx_pos].kind {
3168 IndexKind::Nsw(g) => g,
3169 IndexKind::BTree(_)
3170 | IndexKind::Brin { .. }
3171 | IndexKind::Gin(_)
3172 | IndexKind::GinTrgm(_)
3173 | IndexKind::GinFulltext(_) => return Vec::new(),
3174 };
3175 let col_pos = table.indices[idx_pos].column_position;
3176 let d0 = if matches!(metric, NswMetric::L2) {
3177 entry_d
3178 } else {
3179 cell_to_query_metric_distance(table, col_pos, entry_node, query, metric)
3180 };
3181 let row_count = table.rows.len();
3182 let mut visited: Vec<bool> = alloc::vec![false; row_count];
3183 if entry_node < row_count {
3184 visited[entry_node] = true;
3185 }
3186 // candidates: min-heap by distance (Closest wrapper) — frontier
3187 // results: max-heap by distance (Furthest wrapper) — top-ef working set
3188 let mut candidates: alloc::collections::BinaryHeap<NodeClosest> =
3189 alloc::collections::BinaryHeap::with_capacity(ef);
3190 let mut results: alloc::collections::BinaryHeap<NodeFurthest> =
3191 alloc::collections::BinaryHeap::with_capacity(ef);
3192 candidates.push(NodeClosest {
3193 dist: d0,
3194 node: entry_node,
3195 });
3196 results.push(NodeFurthest {
3197 dist: d0,
3198 node: entry_node,
3199 });
3200 while let Some(cur) = candidates.pop() {
3201 let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
3202 if cur.dist > worst && results.len() >= ef {
3203 break;
3204 }
3205 let neighbours: &[u32] = g
3206 .layers
3207 .get(layer as usize)
3208 .and_then(|layer_v| layer_v.get(cur.node))
3209 .map_or(&[][..], Vec::as_slice);
3210 for &n in neighbours {
3211 let n = n as usize;
3212 if n >= row_count || visited[n] {
3213 continue;
3214 }
3215 visited[n] = true;
3216 // v6.0.1: cell-aware distance — F32 cells take the
3217 // existing scalar metric, SQ8 cells route through
3218 // the asymmetric ADC variant for the same metric.
3219 let dn = cell_to_query_metric_distance(table, col_pos, n, query, metric);
3220 if !dn.is_finite() {
3221 continue;
3222 }
3223 let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
3224 if results.len() < ef || dn < worst {
3225 results.push(NodeFurthest { dist: dn, node: n });
3226 if results.len() > ef {
3227 results.pop();
3228 }
3229 candidates.push(NodeClosest { dist: dn, node: n });
3230 }
3231 }
3232 }
3233 // Drain results (max-heap order) and re-sort ascending so callers
3234 // can take `closest = result[0]` without flipping.
3235 let mut out: Vec<(f32, usize)> = results.into_iter().map(|c| (c.dist, c.node)).collect();
3236 out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
3237 out
3238}
3239
/// Min-heap wrapper: smaller `dist` → higher priority in a `BinaryHeap`
/// (which is a max-heap), so the distance comparison is flipped.
///
/// NaN distances sort last (lowest priority), which keeps the ordering
/// total. Ordering and equality both look at `dist` only, so `==` agrees
/// with `cmp` as the `Ord` / `Eq` contracts require; `node` is payload.
#[derive(Debug, Clone, Copy)]
struct NodeClosest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeClosest {
    fn eq(&self, other: &Self) -> bool {
        // Derived from `cmp` so the two can never disagree (and so a NaN
        // entry is equal to itself, as `Eq` demands).
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeClosest {}
impl PartialOrd for NodeClosest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeClosest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // Reversed: smaller dist = greater priority.
        // `partial_cmp` is `None` only when at least one side is NaN; in
        // that case the NaN side must rank lowest, so compare the NaN
        // flags in the same reversed direction.
        other
            .dist
            .partial_cmp(&self.dist)
            .unwrap_or_else(|| other.dist.is_nan().cmp(&self.dist.is_nan()))
    }
}
3268
/// Max-heap wrapper: larger `dist` sits at the top so the worst result
/// can be evicted in O(log n) when a better candidate arrives.
///
/// A NaN distance ranks as the furthest possible value, so it is the
/// first entry to be evicted. Ordering and equality both look at `dist`
/// only, so `==` agrees with `cmp` as the `Ord` / `Eq` contracts require;
/// `node` is payload.
#[derive(Debug, Clone, Copy)]
struct NodeFurthest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeFurthest {
    fn eq(&self, other: &Self) -> bool {
        // Derived from `cmp` so the two can never disagree (and so a NaN
        // entry is equal to itself, as `Eq` demands).
        self.cmp(other) == core::cmp::Ordering::Equal
    }
}
impl Eq for NodeFurthest {}
impl PartialOrd for NodeFurthest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeFurthest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // `partial_cmp` is `None` only when at least one side is NaN; the
        // NaN side then counts as the larger (worse) distance.
        self.dist
            .partial_cmp(&other.dist)
            .unwrap_or_else(|| self.dist.is_nan().cmp(&other.dist.is_nan()))
    }
}
3294
3295/// HNSW paper §4 algorithm 4: pick `m` neighbours from `candidates` so
3296/// that each chosen point isn't already covered by a closer chosen
3297/// point. Improves graph diversity → fewer hops needed at search time.
3298///
3299/// `candidates` arrives sorted ascending by distance-to-query. We walk
3300/// it in order, keeping a candidate only when no already-chosen point
3301/// is closer to it than the query is. Result is a vector of row
3302/// indices (length ≤ `m`).
3303fn select_neighbours_heuristic(
3304 candidates: &[(f32, usize)],
3305 m: usize,
3306 table: &Table,
3307 col_pos: usize,
3308) -> Vec<usize> {
3309 let mut chosen: Vec<usize> = Vec::with_capacity(m);
3310 for &(d_q, e) in candidates {
3311 if chosen.len() >= m {
3312 break;
3313 }
3314 // v6.0.1: works on either `Value::Vector` (F32) or
3315 // `Value::Sq8Vector` (Sq8) cells — `cell_l2_sq` dispatches
3316 // on encoding. A non-vector cell yields `f32::INFINITY`
3317 // which the `< d_q` test will never accept.
3318 if !matches!(
3319 table.rows.get(e).and_then(|r| r.values.get(col_pos)),
3320 Some(Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_))
3321 ) {
3322 continue;
3323 }
3324 let mut covered = false;
3325 for &r in &chosen {
3326 // dist(e, r) measured in the same metric the topology was
3327 // built with (L2). If a chosen `r` is closer to `e` than
3328 // the query is, `r` already "covers" `e` for navigation.
3329 if cell_l2_sq(table, col_pos, e, r) < d_q {
3330 covered = true;
3331 break;
3332 }
3333 }
3334 if !covered {
3335 chosen.push(e);
3336 }
3337 }
3338 chosen
3339}
3340
3341/// Bidirectionally connect `new_row_idx` to each of `peers` at `layer`,
3342/// trimming each endpoint's adjacency to that layer's degree cap by
3343/// keeping only the closest neighbours.
3344fn connect_at_layer(
3345 table: &mut Table,
3346 idx_pos: usize,
3347 layer: u8,
3348 new_row_idx: usize,
3349 peers: &[usize],
3350) {
3351 let col_pos = table.indices[idx_pos].column_position;
3352 let cap = match &table.indices[idx_pos].kind {
3353 IndexKind::Nsw(g) => g.cap_for_layer(layer),
3354 IndexKind::BTree(_)
3355 | IndexKind::Brin { .. }
3356 | IndexKind::Gin(_)
3357 | IndexKind::GinTrgm(_)
3358 | IndexKind::GinFulltext(_) => return,
3359 };
3360 // v6.1.x: NSW adjacency stores neighbour row indices as u32 (4 B
3361 // each) rather than usize (8 B on 64-bit). Boundary casts here
3362 // assert the row count fits in u32 — the catalog already enforces
3363 // ≤ 4G rows per table, so the conversion can't lose data.
3364 let new_row_u32 = u32::try_from(new_row_idx).expect("row index fits in u32");
3365 if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
3366 let layer_v = &mut g.layers[layer as usize];
3367 if let Some(slot) = layer_v.get_mut(new_row_idx) {
3368 *slot = peers
3369 .iter()
3370 .map(|&p| u32::try_from(p).expect("row index fits in u32"))
3371 .collect();
3372 }
3373 }
3374 for &peer in peers {
3375 // Skip peers whose indexed cell isn't a vector — same fence
3376 // as the F32 path; SQ8 cells flow through `cell_l2_sq`
3377 // below without dequantising.
3378 if !matches!(
3379 &table.rows[peer].values[col_pos],
3380 Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_)
3381 ) {
3382 continue;
3383 }
3384 // 1. add the new node to peer's adjacency
3385 if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
3386 let layer_v = &mut g.layers[layer as usize];
3387 if let Some(slot) = layer_v.get_mut(peer)
3388 && !slot.contains(&new_row_u32)
3389 {
3390 slot.push(new_row_u32);
3391 }
3392 }
3393 // 2. if peer is over budget, rebuild its adjacency with the
3394 // HNSW §4 heuristic — same diversity criterion as the
3395 // insert path so connectivity stays consistent.
3396 let needs_trim = match &table.indices[idx_pos].kind {
3397 IndexKind::Nsw(g) => g.layers[layer as usize][peer].len() > cap,
3398 IndexKind::BTree(_)
3399 | IndexKind::Brin { .. }
3400 | IndexKind::Gin(_)
3401 | IndexKind::GinTrgm(_)
3402 | IndexKind::GinFulltext(_) => false,
3403 };
3404 if needs_trim {
3405 let current_peers: Vec<usize> = match &table.indices[idx_pos].kind {
3406 IndexKind::Nsw(g) => g.layers[layer as usize][peer]
3407 .iter()
3408 .map(|&n| n as usize)
3409 .collect(),
3410 IndexKind::BTree(_)
3411 | IndexKind::Brin { .. }
3412 | IndexKind::Gin(_)
3413 | IndexKind::GinTrgm(_)
3414 | IndexKind::GinFulltext(_) => continue,
3415 };
3416 // Sort by distance from `peer`'s cell ascending so the
3417 // heuristic receives candidates closest-first. `cell_l2_sq`
3418 // dispatches on encoding so SQ8 columns trim using
3419 // symmetric ADC.
3420 let mut tagged: Vec<(f32, usize)> = current_peers
3421 .iter()
3422 .map(|&p| (cell_l2_sq(table, col_pos, peer, p), p))
3423 .collect();
3424 tagged.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
3425 let kept = select_neighbours_heuristic(&tagged, cap, table, col_pos);
3426 if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
3427 && let Some(slot) = g.layers[layer as usize].get_mut(peer)
3428 {
3429 *slot = kept
3430 .into_iter()
3431 .map(|p| u32::try_from(p).expect("row index fits in u32"))
3432 .collect();
3433 }
3434 }
3435 }
3436}
3437
3438/// Squared L2 distance from `query` (raw f32) to the cell at
3439/// `(row, col_pos)`. Dispatches on cell encoding: `Value::Vector`
3440/// (F32) uses `l2_distance_sq`; `Value::Sq8Vector` uses
3441/// `sq8_l2_distance_sq_asymmetric` (the v6.0.1 quantised path).
3442/// Returns `f32::INFINITY` for any non-vector cell so callers can
3443/// compare uniformly.
3444fn vec_l2_sq(table: &Table, col_pos: usize, row: usize, query: &[f32]) -> f32 {
3445 match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
3446 Some(Value::Vector(v)) if v.len() == query.len() => l2_distance_sq(v, query),
3447 Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => {
3448 quantize::sq8_l2_distance_sq_asymmetric(q, query)
3449 }
3450 // v6.0.6: halfvec → fused NEON SIMD kernel; no Vec<f32>
3451 // allocation. v6.0.3 used `to_f32_vec()` + f32 NEON which
3452 // was correct but allocated per call (5× slower than F32).
3453 Some(Value::HalfVector(h)) if h.dim() == query.len() => {
3454 halfvec::half_l2_distance_sq_asymmetric(h, query)
3455 }
3456 _ => f32::INFINITY,
3457 }
3458}
3459
3460/// Squared L2 distance between two stored cells (no f32 query in
3461/// sight). Used during HNSW graph build — both endpoints are
3462/// rows already in the table, so symmetric ADC applies for SQ8
3463/// columns. Mixed-encoding cells within one column are a
3464/// schema-level impossibility (INSERT-time coercion enforces
3465/// uniform encoding), so the catch-all is an abort.
3466fn cell_l2_sq(table: &Table, col_pos: usize, row_a: usize, row_b: usize) -> f32 {
3467 let Some(cell_a) = table.rows.get(row_a).and_then(|r| r.values.get(col_pos)) else {
3468 return f32::INFINITY;
3469 };
3470 let Some(cell_b) = table.rows.get(row_b).and_then(|r| r.values.get(col_pos)) else {
3471 return f32::INFINITY;
3472 };
3473 match (cell_a, cell_b) {
3474 (Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => l2_distance_sq(a, b),
3475 (Value::Sq8Vector(a), Value::Sq8Vector(b)) if a.bytes.len() == b.bytes.len() => {
3476 quantize::sq8_l2_distance_sq(a, b)
3477 }
3478 // v6.0.6: halfvec symmetric NEON — fused SIMD kernel that
3479 // loads both cells' raw u16 bits, expands to f32 lanes
3480 // inline, FMA-accumulates the squared diff. No Vec<f32>
3481 // allocation per call.
3482 (Value::HalfVector(a), Value::HalfVector(b)) if a.dim() == b.dim() => {
3483 halfvec::half_l2_distance_sq(a, b)
3484 }
3485 _ => f32::INFINITY,
3486 }
3487}
3488
3489/// kNN-search-time distance: stored cell → f32 query under the
3490/// caller's metric. Dispatches on cell encoding so SQ8 columns
3491/// take the ADC path with the right asymmetric variant. NaN /
3492/// dim-mismatch / non-vector → `f32::INFINITY`.
3493fn cell_to_query_metric_distance(
3494 table: &Table,
3495 col_pos: usize,
3496 row: usize,
3497 query: &[f32],
3498 metric: NswMetric,
3499) -> f32 {
3500 match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
3501 Some(Value::Vector(v)) if v.len() == query.len() => metric_distance(metric, v, query),
3502 Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => match metric {
3503 NswMetric::L2 => quantize::sq8_l2_distance_sq_asymmetric(q, query),
3504 NswMetric::InnerProduct => quantize::sq8_inner_product_asymmetric(q, query),
3505 NswMetric::Cosine => quantize::sq8_cosine_distance_asymmetric(q, query),
3506 },
3507 // v6.0.6: halfvec dispatches by metric to fused NEON
3508 // kernels — no Vec<f32> allocation per call.
3509 Some(Value::HalfVector(h)) if h.dim() == query.len() => match metric {
3510 NswMetric::L2 => halfvec::half_l2_distance_sq_asymmetric(h, query),
3511 NswMetric::InnerProduct => halfvec::half_inner_product_asymmetric(h, query),
3512 NswMetric::Cosine => halfvec::half_cosine_distance_asymmetric(h, query),
3513 },
3514 _ => f32::INFINITY,
3515 }
3516}
3517
/// Ranking metric applied when an NSW index is searched.
///
/// The graph itself is always built with `L2`. Searching with
/// `InnerProduct` or `Cosine` walks the same edges and only changes how
/// candidates are scored; for corpus-sized graphs the recall lost compared
/// with a dedicated per-metric graph is negligible.
///
/// All three variants follow the "smaller = closer" convention.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NswMetric {
    /// Squared Euclidean distance. The square root is skipped because it
    /// is monotonic and therefore irrelevant to ordering.
    L2,
    /// Negated dot product — the pgvector `<#>` convention, which keeps
    /// "smaller = more similar" true for this metric as well.
    InnerProduct,
    /// Cosine distance, `1 - cos(a, b)`. A zero-norm operand produces
    /// `f32::INFINITY` so it sorts last.
    Cosine,
}
3535
3536/// Multi-layer HNSW kNN search: greedy-descend from the entry to layer 0,
3537/// then beam-search there with the requested `ef` to return the top `k`
3538/// results under the caller-chosen metric. Topology was built with L2 —
3539/// upper-layer descent uses L2 as a coarse heuristic; final beam search
3540/// runs in the requested metric so rankings are correct for `<#>` / `<=>`.
3541fn nsw_search(
3542 table: &Table,
3543 idx_pos: usize,
3544 query: &[f32],
3545 k: usize,
3546 ef: usize,
3547 metric: NswMetric,
3548) -> Vec<(f32, usize)> {
3549 let (entry, entry_level) = match &table.indices[idx_pos].kind {
3550 IndexKind::Nsw(g) => (g.entry, g.entry_level),
3551 IndexKind::BTree(_)
3552 | IndexKind::Brin { .. }
3553 | IndexKind::Gin(_)
3554 | IndexKind::GinTrgm(_)
3555 | IndexKind::GinFulltext(_) => return Vec::new(),
3556 };
3557 let Some(entry) = entry else {
3558 return Vec::new();
3559 };
3560 let col_pos = table.indices[idx_pos].column_position;
3561 // v6.0.1 step 5: SQ8 columns over-fetch by `SQ8_RERANK_OVER_FETCH`
3562 // so the rerank pass below sees enough candidates to recover
3563 // recall after the ADC re-ordering. F32 + F16 columns skip the
3564 // over-fetch — F32 distances are exact, F16 dequant is
3565 // bit-exact at the storage layer so the beam search already
3566 // ranks under the column's full precision.
3567 let sq8 = matches!(
3568 table.schema.columns.get(col_pos).map(|c| c.ty),
3569 Some(DataType::Vector {
3570 encoding: VecEncoding::Sq8,
3571 ..
3572 })
3573 );
3574 let ef = if sq8 {
3575 ef.max(k).max(k * SQ8_RERANK_OVER_FETCH)
3576 } else {
3577 ef.max(k)
3578 };
3579 // Descend by L2 (the topology metric) so layers prune consistently.
3580 let entry_d = vec_l2_sq(table, col_pos, entry, query);
3581 let mut current = entry;
3582 let mut current_d = entry_d;
3583 for layer in (1..=entry_level).rev() {
3584 (current, current_d) = greedy_layer_walk(table, idx_pos, layer, current, current_d, query);
3585 }
3586 // Final beam search on layer 0 under the caller's metric.
3587 let mut results = layer_beam_search(table, idx_pos, 0, current, current_d, query, ef, metric);
3588 if sq8 {
3589 results = sq8_rerank(table, col_pos, &results, query, metric);
3590 }
3591 results.truncate(k);
3592 results
3593}
3594
3595/// v6.0.1 step 5: re-score ADC top-`K*3` candidates with the
3596/// dequantised cell vs the f32 query, then re-sort. Recovers the
3597/// recall the SQ8 ADC sacrifices for 4× compression — the design's
3598/// "f32 rerank step is on by default" path (deliberation #3).
3599/// `metric` is the same metric the beam search used; the rerank
3600/// arithmetic re-derives the exact distance under that metric.
3601fn sq8_rerank(
3602 table: &Table,
3603 col_pos: usize,
3604 candidates: &[(f32, usize)],
3605 query: &[f32],
3606 metric: NswMetric,
3607) -> Vec<(f32, usize)> {
3608 let mut out: Vec<(f32, usize)> = candidates
3609 .iter()
3610 .filter_map(|&(adc_d, row)| {
3611 let cell = table.rows.get(row).and_then(|r| r.values.get(col_pos))?;
3612 let Value::Sq8Vector(q) = cell else {
3613 // F32 cells shouldn't reach this path (sq8 fence
3614 // above), but stay defensive: pass through with
3615 // the ADC distance unchanged.
3616 return Some((adc_d, row));
3617 };
3618 let deq = quantize::dequantize(q);
3619 if deq.len() != query.len() {
3620 return None;
3621 }
3622 Some((metric_distance(metric, &deq, query), row))
3623 })
3624 .collect();
3625 out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
3626 out
3627}
3628
/// Over-fetch factor for SQ8 searches: `k` is multiplied by this so the
/// rerank pass has a wider candidate set to choose from. 3× is the
/// design-stage value; the v6.0.5 sweep work can re-tune it once full
/// corpus profiling is available.
const SQ8_RERANK_OVER_FETCH: usize = 3;
3633
3634fn metric_distance(metric: NswMetric, a: &[f32], b: &[f32]) -> f32 {
3635 match metric {
3636 NswMetric::L2 => l2_distance_sq(a, b),
3637 NswMetric::InnerProduct => -inner_product_f32(a, b),
3638 NswMetric::Cosine => {
3639 let (dot, na, nb) = cosine_dot_norms_f32(a, b);
3640 if na == 0.0 || nb == 0.0 {
3641 return f32::INFINITY;
3642 }
3643 // `f32::sqrt` lives in std, so hand-roll Newton-Raphson on
3644 // f64 — same trick the L2 binary op already uses.
3645 let denom = sqrt_newton_f32(na) * sqrt_newton_f32(nb);
3646 1.0 - dot / denom
3647 }
3648 }
3649}
3650
3651/// v6.0.2: dispatch wrapper for the f32 dot product (used by `<#>` +
3652/// the cosine numerator). NEON path when `len % 4 == 0 && len >= 4`,
3653/// scalar fallback otherwise. Returns the positive dot — callers
3654/// negate for the pgvector `<#>` "smaller = closer" convention.
3655///
3656/// Public so perf gates + downstream benches can microbenchmark the
3657/// dispatch directly; not part of the STABILITY contract — internal
3658/// SIMD layout can evolve in any release.
3659#[doc(hidden)]
3660#[inline]
3661pub fn inner_product_f32(a: &[f32], b: &[f32]) -> f32 {
3662 #[cfg(target_arch = "aarch64")]
3663 {
3664 if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3665 // SAFETY: NEON is a baseline aarch64 feature; preconditions
3666 // (matching lengths, ≥ 1 full lane group) are checked above.
3667 return unsafe { inner_product_neon(a, b) };
3668 }
3669 }
3670 inner_product_scalar(a, b)
3671}
3672
/// Scalar dot product: sums `a[i] * b[i]` left to right, starting from
/// `0.0`. If the slices differ in length, the extra tail of the longer one
/// is ignored.
fn inner_product_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b.iter())
        .fold(0.0_f32, |acc, (x, y)| acc + x * y)
}
3680
/// NEON dot product over whole 4-lane groups.
///
/// # Safety
///
/// The caller must guarantee `a.len() == b.len()`; the loops read `b` at
/// every index they read `a`. Elements past the last whole group of four
/// are not visited, so the length should be a multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn inner_product_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32,
    };
    let len = a.len();
    let lhs = a.as_ptr();
    let rhs = b.as_ptr();
    unsafe {
        // Two independent accumulators (same trick as the L2 kernel) keep
        // consecutive FMAs from forming one serial dependency chain.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut sum_lo = zero;
        let mut sum_hi = zero;
        let mut at = 0usize;
        // Main loop: eight floats per iteration, four per accumulator.
        while at + 8 <= len {
            sum_lo = vfmaq_f32(sum_lo, vld1q_f32(lhs.add(at)), vld1q_f32(rhs.add(at)));
            sum_hi = vfmaq_f32(
                sum_hi,
                vld1q_f32(lhs.add(at + 4)),
                vld1q_f32(rhs.add(at + 4)),
            );
            at += 8;
        }
        // At most one trailing group of four remains.
        while at + 4 <= len {
            sum_lo = vfmaq_f32(sum_lo, vld1q_f32(lhs.add(at)), vld1q_f32(rhs.add(at)));
            at += 4;
        }
        vaddvq_f32(vaddq_f32(sum_lo, sum_hi))
    }
}
3714
3715/// v6.0.2: dispatch wrapper for the three accumulators (`dot`, `||a||²`,
3716/// `||b||²`) cosine needs. Same NEON pre-condition as the L2 / IP
3717/// paths; same scalar fallback shape.
3718///
3719/// Public for benchmarking only (see `inner_product_f32`); not in the
3720/// STABILITY contract.
3721#[doc(hidden)]
3722#[inline]
3723pub fn cosine_dot_norms_f32(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
3724 #[cfg(target_arch = "aarch64")]
3725 {
3726 if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3727 // SAFETY: see `inner_product_neon`.
3728 return unsafe { cosine_dot_norms_neon(a, b) };
3729 }
3730 }
3731 cosine_dot_norms_scalar(a, b)
3732}
3733
/// Scalar fallback returning `(dot(a, b), ||a||², ||b||²)`, each summed
/// left to right from `0.0`. If the slices differ in length, only the
/// common prefix contributes to all three sums.
fn cosine_dot_norms_scalar(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    a.iter().zip(b.iter()).fold(
        (0.0_f32, 0.0_f32, 0.0_f32),
        |(dot, norm_sq_a, norm_sq_b), (x, y)| (dot + x * y, norm_sq_a + x * x, norm_sq_b + y * y),
    )
}
3745
/// NEON kernel returning `(dot(a, b), ||a||², ||b||²)` over whole 4-lane
/// groups.
///
/// # Safety
///
/// The caller must guarantee `a.len() == b.len()`; the loop reads `b` at
/// every index it reads `a`. Elements past the last whole group of four
/// are not visited, so the length should be a multiple of 4.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names, clippy::similar_names)]
unsafe fn cosine_dot_norms_neon(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    use core::arch::aarch64::{float32x4_t, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32};
    let len = a.len();
    let lhs = a.as_ptr();
    let rhs = b.as_ptr();
    unsafe {
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut dot_lanes = zero;
        let mut norm_a_lanes = zero;
        let mut norm_b_lanes = zero;
        let mut at = 0usize;
        // One load per operand feeds all three accumulators.
        while at + 4 <= len {
            let va = vld1q_f32(lhs.add(at));
            let vb = vld1q_f32(rhs.add(at));
            dot_lanes = vfmaq_f32(dot_lanes, va, vb);
            norm_a_lanes = vfmaq_f32(norm_a_lanes, va, va);
            norm_b_lanes = vfmaq_f32(norm_b_lanes, vb, vb);
            at += 4;
        }
        (
            vaddvq_f32(dot_lanes),
            vaddvq_f32(norm_a_lanes),
            vaddvq_f32(norm_b_lanes),
        )
    }
}
3769
/// `no_std` square root via Newton-Raphson.
///
/// Returns `0.0` for zero and negative inputs. NaN and `+∞` are returned
/// unchanged.
///
/// The iteration runs in `f64` from a bit-level initial estimate that
/// halves the exponent, which is within about 6% of the true root for any
/// finite positive input. From there four Newton steps converge
/// quadratically to full `f64` precision, so the result is accurate over
/// the whole `f32` range. Seeding the iteration with `x` itself only
/// halves the guess per step and is badly off after a fixed number of
/// steps once `x` is far from 1.
#[allow(clippy::cast_possible_truncation)] // Narrowing the converged f64 root back to f32 is the intended rounding step.
fn sqrt_newton_f32(x: f32) -> f32 {
    if x <= 0.0 {
        return 0.0;
    }
    if !x.is_finite() {
        return x;
    }
    let wide = f64::from(x);
    // Shifting the IEEE-754 bit pattern right by one halves the biased
    // exponent; adding half the bias (1023 << 51) restores it. Every
    // finite positive f32 is a normal f64, so this holds for subnormal
    // f32 inputs too.
    let mut g = f64::from_bits((wide.to_bits() >> 1) + 0x1ff8_0000_0000_0000);
    for _ in 0..4 {
        g = 0.5 * (g + wide / g);
    }
    g as f32
}
3780
3781/// Squared Euclidean distance — used for ordering inside NSW (the sqrt
3782/// preserves the order). Caller takes sqrt before reporting back to SQL.
3783///
3784/// v3.3.2: aarch64 NEON path for `len % 4 == 0` (which covers every
3785/// HNSW-indexed VECTOR(N) where N is a multiple of 4 — i.e. all
3786/// production-shaped embeddings: 64, 128, 256, 384, 512, 768, 1024,
3787/// 1536, ...). Other shapes fall back to the scalar loop.
3788#[inline]
3789fn l2_distance_sq(a: &[f32], b: &[f32]) -> f32 {
3790 #[cfg(target_arch = "aarch64")]
3791 {
3792 if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
3793 // SAFETY: NEON is a baseline aarch64 feature (ARMv8);
3794 // the precondition is checked above (matching lengths,
3795 // multiple of 4, at least one 128-bit lane group).
3796 return unsafe { l2_distance_sq_neon(a, b) };
3797 }
3798 }
3799 l2_distance_sq_scalar(a, b)
3800}
3801
/// Scalar squared Euclidean distance: sums `(a[i] - b[i])²` left to right
/// from `0.0`. If the slices differ in length, only the common prefix is
/// compared.
fn l2_distance_sq_scalar(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b.iter()).fold(0.0_f32, |acc, (x, y)| {
        let diff = *x - *y;
        acc + diff * diff
    })
}
3810
/// NEON squared Euclidean distance over whole 4-lane groups.
///
/// # Safety
///
/// The caller must guarantee `a.len() == b.len()`; the loops read `b` at
/// every index they read `a`. Elements past the last whole group of four
/// are not visited, so the length should be a multiple of 4 (and at least
/// 4 for the call to be worthwhile).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
unsafe fn l2_distance_sq_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32, vsubq_f32,
    };
    let len = a.len();
    let lhs = a.as_ptr();
    let rhs = b.as_ptr();
    unsafe {
        // Two accumulator registers: each FMA depends on the previous
        // value of its accumulator, so splitting the work keeps the two
        // chains independent of each other.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut sum_lo = zero;
        let mut sum_hi = zero;
        let mut at = 0usize;
        // Main loop: eight floats per iteration, four per accumulator.
        while at + 8 <= len {
            let diff_lo = vsubq_f32(vld1q_f32(lhs.add(at)), vld1q_f32(rhs.add(at)));
            sum_lo = vfmaq_f32(sum_lo, diff_lo, diff_lo);
            let diff_hi = vsubq_f32(vld1q_f32(lhs.add(at + 4)), vld1q_f32(rhs.add(at + 4)));
            sum_hi = vfmaq_f32(sum_hi, diff_hi, diff_hi);
            at += 8;
        }
        // At most one trailing group of four remains.
        while at + 4 <= len {
            let diff = vsubq_f32(vld1q_f32(lhs.add(at)), vld1q_f32(rhs.add(at)));
            sum_lo = vfmaq_f32(sum_lo, diff, diff);
            at += 4;
        }
        vaddvq_f32(vaddq_f32(sum_lo, sum_hi))
    }
}
3848
3849/// Public wrapper: run an NSW kNN search and return the top-k row
3850/// indices ordered by ascending distance under the given metric.
3851pub fn nsw_query(
3852 table: &Table,
3853 idx_name: &str,
3854 query: &[f32],
3855 k: usize,
3856 metric: NswMetric,
3857) -> Vec<usize> {
3858 let Some(idx_pos) = table.indices.iter().position(|i| i.name == idx_name) else {
3859 return Vec::new();
3860 };
3861 let ef = (k * 2).max(NSW_DEFAULT_M);
3862 let mut hits = nsw_search(table, idx_pos, query, k, ef, metric);
3863 hits.truncate(k);
3864 hits.into_iter().map(|(_, idx)| idx).collect()
3865}
3866
3867/// Find any NSW index on a column. Used by the planner to decide
3868/// whether an `ORDER BY col <-> literal LIMIT k` query can skip the
3869/// brute-force scan.
3870pub fn nsw_index_on(table: &Table, column_position: usize) -> Option<&Index> {
3871 table
3872 .indices
3873 .iter()
3874 .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_)))
3875}
3876
3877/// Catalog: insertion-ordered `Vec<Table>` for stable iter / serialize,
3878/// plus a `BTreeMap<String, usize>` sidecar index so `get` / `get_mut`
3879/// run in O(log n) instead of the old linear scan with per-element
3880/// string compares.
3881///
3882/// A pure `BTreeMap<String, Table>` was tried in an interim version
3883/// of v3.1.2 and regressed the single-table catalog benches by ~10%
3884/// (the per-element `BTreeMap` overhead outweighs the lookup win
3885/// when n is small). The sidecar shape preserves the insertion-order
3886/// iteration the on-disk encoding relies on and keeps `last_mut`
3887/// (used by the deserialize hot path) cheap.
3888#[derive(Debug, Clone, Default)]
3889pub struct Catalog {
3890 tables: Vec<Table>,
3891 /// `name → tables[index]`. Kept in lock-step with `tables`.
3892 /// `create_table` is the only write path.
3893 by_name: BTreeMap<String, usize>,
3894 /// v5.1: in-memory cold-tier segments. Side-loaded via
3895 /// [`Catalog::load_segment_bytes`] — they live outside the
3896 /// catalog snapshot (caller persists them as separate files
3897 /// and re-loads on boot, until v5.3's `CatalogManifest` makes
3898 /// that wiring automatic). `RowLocator::Cold { segment_id, .. }`
3899 /// indexes this `Vec`. Cleared on `Catalog::new` / fresh
3900 /// `deserialize`.
3901 ///
3902 /// `Arc` wrap keeps `Catalog::clone` at O(N segments) bumps
3903 /// (rather than O(total segment bytes) memcpy) so the v4.42
3904 /// group-commit pre-image rollback invariant — clone is
3905 /// effectively free — survives the cold-tier addition.
3906 ///
3907 /// v6.7.3 — slots became `Option<…>` so cold-segment compaction
3908 /// can tombstone merged sources without breaking the
3909 /// `segment_id = index_into_vec` contract that on-disk
3910 /// `RowLocator::Cold { segment_id }` already serialized.
3911 /// `None` slot = the segment was retired by compaction; the
3912 /// physical file may still be on disk (next CHECKPOINT writes
3913 /// a manifest that no longer lists it, and the file becomes
3914 /// an orphan eligible for offline cleanup).
3915 cold_segments: Vec<Option<Arc<OwnedSegment>>>,
3916 /// v7.12.4 — user-defined functions (PL/pgSQL + SQL).
3917 /// Keyed by function name (PG overloading is out of scope).
3918 /// Bodies are stored as the raw source text the parser saw
3919 /// between `$$ ... $$`; the engine re-parses on each
3920 /// invocation. This keeps `spg-storage` free of `spg-sql`
3921 /// dependency — same pattern as partial-index predicates.
3922 functions: BTreeMap<String, FunctionDef>,
3923 /// v7.12.4 — triggers in insertion order. Multiple triggers
3924 /// per table / event fire in this order (matching PG's
3925 /// alphabetical-by-default with insertion-stable tie-break
3926 /// behaviour — we just keep insertion order for now).
3927 triggers: Vec<TriggerDef>,
3928 /// v7.17.0 — catalogued SEQUENCE objects (Phase 1.1). Each
3929 /// `nextval(name)` reaches in here, atomically increments
3930 /// `last_value` / flips `is_called`, returns the new value.
3931 /// Persisted in catalog FILE_VERSION 26+; older catalogs
3932 /// deserialise with an empty map.
3933 sequences: BTreeMap<String, SequenceDef>,
3934 /// v7.17.0 — catalogued VIEW objects (Phase 1.2). Each
3935 /// `SELECT FROM v` at engine exec-time looks up `v` here and
3936 /// prepends the view body as a synthetic CTE. Persisted in
3937 /// catalog FILE_VERSION 27+; older catalogs deserialise with
3938 /// an empty map.
3939 views: BTreeMap<String, ViewDef>,
3940 /// v7.17.0 — catalogued MATERIALIZED VIEW source registry
3941 /// (Phase 1.3). Maps name → SELECT source. The materialised
3942 /// rows themselves live as a regular `Table` with the same
3943 /// name; REFRESH re-parses + re-executes the source against
3944 /// the table. Persisted in catalog FILE_VERSION 28+;
3945 /// older catalogs deserialise with an empty map.
3946 materialized_views: BTreeMap<String, String>,
3947 /// v7.17.0 — catalogued user-defined ENUM types (Phase 1.4).
3948 /// Maps name → label list. Columns reference these by name
3949 /// via `ColumnSchema.user_enum_type`. Persisted in catalog
3950 /// FILE_VERSION 29+; older catalogs deserialise with an empty
3951 /// map.
3952 enum_types: BTreeMap<String, EnumDef>,
3953 /// v7.17.0 — catalogued user-defined DOMAIN types (Phase 1.5).
3954 /// Maps name → base + CHECK constraints. Columns reference
3955 /// these by name via `ColumnSchema.user_domain_type`.
3956 /// Persisted in catalog FILE_VERSION 30+; older catalogs
3957 /// deserialise with an empty map.
3958 domain_types: BTreeMap<String, DomainDef>,
3959 /// v7.17.0 — schema-namespace registry (Phase 1.6). Tracks
3960 /// which schemas exist. `public`, `pg_catalog`, and
3961 /// `information_schema` are built-in and always present.
3962 /// Schema-qualified table references still strip the prefix
3963 /// at lookup time per v7.16-and-earlier — full
3964 /// schema-as-isolation is v7.18+ scope. Persisted in catalog
3965 /// FILE_VERSION 31+; older catalogs deserialise with just
3966 /// the built-ins.
3967 schemas: alloc::collections::BTreeSet<String>,
3968}
3969
/// v7.12.4 — a catalogued user-defined function.
///
/// `body` holds the raw source text from between `$$ ... $$`, and the
/// engine re-parses it on every invocation. Storing text rather than a
/// parsed form keeps the storage codec stable as the PL/pgSQL surface
/// grows, so the disk format carries no breaking-change risk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FunctionDef {
    /// Function name.
    pub name: String,
    /// Argument list in display form, e.g. `"(name TEXT, ts TIMESTAMP)"`.
    /// The trigger-function shape uses the empty list `"()"`. The parser
    /// canonicalises this before it is stored.
    pub args_repr: String,
    /// Return type in display form, e.g. `"TRIGGER"` / `"INT"` /
    /// `"SETOF text"`. The engine treats `"TRIGGER"` (case-insensitively)
    /// specially, to gate trigger-only semantics (NEW/OLD).
    pub returns: String,
    /// The `LANGUAGE` clause, lowercased: `"plpgsql"` / `"sql"`.
    pub language: String,
    /// Function source. For PL/pgSQL this includes the enclosing
    /// `BEGIN ... END;`; for SQL it is the statement(s). Because parsing
    /// happens at invocation, a malformed body surfaces as a parse error
    /// at CALL time, not at CREATE time.
    pub body: String,
}
3995
/// v7.12.4 — a catalogued trigger.
///
/// The trigger names its function rather than embedding it, and that
/// function has to exist when the trigger is created (forward references
/// are deferred to v7.12.5+).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TriggerDef {
    /// Trigger name.
    pub name: String,
    /// The table being watched. Dropping the table drops the trigger.
    pub table: String,
    /// `"BEFORE"` / `"AFTER"` / `"INSTEAD OF"`, stored as the uppercased
    /// keyword so a deserialised catalog round-trips without any
    /// canonicalisation surprises.
    pub timing: String,
    /// One entry per event, each being `"INSERT"` / `"UPDATE"` /
    /// `"DELETE"` / `"TRUNCATE"`. `INSERT OR UPDATE` becomes two entries.
    pub events: Vec<String>,
    /// `"ROW"` / `"STATEMENT"`. v7.12.4 only executes `"ROW"`:
    /// `"STATEMENT"` is parsed and persisted, but the executor refuses it
    /// when the trigger would fire.
    pub for_each: String,
    /// Name of the PL/pgSQL function to invoke.
    pub function: String,
    /// v7.13.0 — the `UPDATE OF col, col, …` column filter (mailrs
    /// round-5 G7). When non-empty, the trigger fires only if at least one
    /// listed column appears in the UPDATE's SET list; when empty, there
    /// is no column filter. Stored from catalog FILE_VERSION 23; older
    /// catalogs deserialise with an empty vec.
    pub update_columns: Vec<String>,
    /// v7.16.1 — whether the trigger fires when its watched event occurs.
    /// Toggled by `ALTER TABLE … { ENABLE | DISABLE } TRIGGER …`;
    /// `pg_dump --disable-triggers` wraps each data block in a
    /// DISABLE/ENABLE pair so rows already computed in prod are not
    /// rewritten again. `true` at CREATE TRIGGER time. Stored from catalog
    /// FILE_VERSION 25; older catalogs deserialise with `enabled = true`.
    pub enabled: bool,
}
4034
/// v7.17.0 — catalogued SEQUENCE. PG semantics: a counter object
/// handing out successive values via `nextval(name)`, stepping by
/// `increment` within `min_value ..= max_value`. `last_value` and
/// `is_called` together hold the counter state (see the field
/// docs). Stored separately from tables in the catalog.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SequenceDef {
    /// Sequence name; the key under which
    /// `Catalog::create_sequence` stores the definition.
    pub name: String,
    /// Data type — narrows the i64 range. PG default BIGINT.
    pub data_type: SequenceDataType,
    /// `START` value. `ALTER SEQUENCE … RESTART` (without `WITH`)
    /// rewinds `last_value` to this.
    pub start: i64,
    /// Step applied by `nextval` once `is_called` is set. A
    /// positive step is checked against `max_value`; any other
    /// step is checked against `min_value`.
    pub increment: i64,
    /// Lower bound. A descending step that would pass it errors,
    /// or wraps to `max_value` when `cycle` is set.
    pub min_value: i64,
    /// Upper bound. An ascending step that would pass it errors,
    /// or wraps to `min_value` when `cycle` is set.
    pub max_value: i64,
    /// `CACHE` option as declared. NOTE(review): no catalog
    /// method in this part of the file reads it — presumably
    /// informational only; confirm.
    pub cache: i64,
    /// Whether `nextval` wraps to the opposite bound instead of
    /// erroring when a bound is passed.
    pub cycle: bool,
    /// `OWNED BY` target — `(table, column)` or NONE.
    pub owned_by: Option<(String, String)>,
    /// Counter state. When `is_called` is true this is the value
    /// most recently handed out. When `is_called` is false it is
    /// the value the NEXT `nextval` returns as-is (the state left
    /// by `ALTER SEQUENCE … RESTART` or `setval(_, _, false)`).
    pub last_value: i64,
    /// True once `last_value` has been handed out. Set by
    /// `nextval`, set to the caller's choice by `setval`, and
    /// cleared by `ALTER SEQUENCE … RESTART`.
    pub is_called: bool,
}
4059
/// v7.17.0 — sequence integer width. Selects the default
/// MINVALUE / MAXVALUE pair returned by
/// `SequenceDataType::default_bounds`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SequenceDataType {
    /// 16-bit: default bounds are capped at the `i16` limits.
    SmallInt,
    /// 32-bit: default bounds are capped at the `i32` limits.
    Int,
    /// 64-bit: default bounds span the `i64` limits.
    BigInt,
}
4067
/// v7.17.0 Phase 1.6 — reports whether `name` is one of the schemas
/// every Catalog knows without an explicit CREATE SCHEMA:
/// `public`, `pg_catalog` or `information_schema`. The comparison
/// ignores ASCII case. Consumed by [`Catalog::schema_exists`] and
/// the engine's schema-qualified lookup path.
#[must_use]
pub fn is_builtin_schema(name: &str) -> bool {
    const BUILTIN_SCHEMAS: [&str; 3] = ["public", "pg_catalog", "information_schema"];
    BUILTIN_SCHEMAS
        .iter()
        .any(|builtin| name.eq_ignore_ascii_case(builtin))
}
4078
/// v7.17.0 — parse a PG-canonical UUID text representation into the
/// 16-byte network-order layout used by `Value::Uuid`. Accepted input
/// shapes (all case-insensitive, surrounding whitespace ignored):
///   * Canonical hyphenated 8-4-4-4-12 (`550e8400-e29b-41d4-a716-446655440000`)
///   * Unhyphenated 32-char hex (`550e8400e29b41d4a716446655440000`)
///   * Either form wrapped in `{ ... }`
///
/// Returns `None` for any malformed input (wrong length, non-hex
/// characters, misplaced hyphens). The caller surfaces a SQL error
/// at coercion time — silent acceptance of garbage would mask
/// application bugs and is exactly the divergence from PG that
/// breaks the 0-change cutover promise.
///
/// The digits are gathered into a fixed stack buffer and decoded
/// from there, so parsing performs no heap allocation.
#[must_use]
pub fn parse_uuid_str(input: &str) -> Option<[u8; 16]> {
    let trimmed = input.trim();
    // Braces are only stripped as a matched pair; a lone `{` or `}`
    // stays in the text and fails the length / hex checks below.
    let body = trimmed
        .strip_prefix('{')
        .and_then(|rest| rest.strip_suffix('}'))
        .unwrap_or(trimmed);
    let raw = body.as_bytes();

    // Collect the 32 hex digits, dropping the hyphens of the
    // canonical form. Lengths are byte lengths, so any multi-byte
    // character either breaks the length check or is rejected as a
    // non-hex byte by `hex_nibble`.
    let mut digits = [0u8; 32];
    match raw.len() {
        32 => digits.copy_from_slice(raw),
        36 => {
            // Hyphens must be exactly at positions 8, 13, 18, 23.
            if raw[8] != b'-' || raw[13] != b'-' || raw[18] != b'-' || raw[23] != b'-' {
                return None;
            }
            digits[..8].copy_from_slice(&raw[..8]);
            digits[8..12].copy_from_slice(&raw[9..13]);
            digits[12..16].copy_from_slice(&raw[14..18]);
            digits[16..20].copy_from_slice(&raw[19..23]);
            digits[20..].copy_from_slice(&raw[24..]);
        }
        _ => return None,
    }

    // Two hex digits per output byte, high nibble first.
    let mut out = [0u8; 16];
    for (byte, pair) in out.iter_mut().zip(digits.chunks_exact(2)) {
        *byte = (hex_nibble(pair[0])? << 4) | hex_nibble(pair[1])?;
    }
    Some(out)
}

/// Decodes one ASCII hex digit (either case) to its value `0..=15`;
/// `None` for any other byte.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(10 + b - b'a'),
        b'A'..=b'F' => Some(10 + b - b'A'),
        _ => None,
    }
}
4139
/// v7.17.0 — turn the 16 bytes of a `Value::Uuid` into the
/// canonical text PG's `text` cast produces: lowercase hex in the
/// hyphenated 8-4-4-4-12 layout (always 36 characters).
#[must_use]
pub fn format_uuid(b: &[u8; 16]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    // Byte offsets at which a new hyphen-separated group begins.
    const GROUP_STARTS: [usize; 4] = [4, 6, 8, 10];

    let mut text = String::with_capacity(36);
    for (index, &octet) in b.iter().enumerate() {
        if GROUP_STARTS.contains(&index) {
            text.push('-');
        }
        let high = usize::from(octet >> 4);
        let low = usize::from(octet & 0x0f);
        text.push(char::from(DIGITS[high]));
        text.push(char::from(DIGITS[low]));
    }
    text
}
4155
/// v7.17.0 Phase 1.5 — catalogued user-defined DOMAIN. A domain
/// is a named CHECK-constrained alias over a built-in type;
/// columns bound to it inherit the base type plus the CHECK
/// predicates + NOT NULL + DEFAULT at INSERT/UPDATE time.
/// `default` / `checks` are stored as Display-form source so
/// `spg-storage` stays free of `spg-sql` dependency — same
/// pattern as FunctionDef / ViewDef.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DomainDef {
    /// Domain name; the key under which
    /// `Catalog::create_domain_type` stores the definition.
    pub name: String,
    /// Built-in type the domain aliases.
    pub base_type: DataType,
    /// Presumably `false` when the domain was declared NOT NULL —
    /// TODO confirm against the engine's DDL path.
    pub nullable: bool,
    /// Display-form source of the DEFAULT expression, if any.
    pub default: Option<String>,
    /// Display-form source of each CHECK predicate.
    pub checks: Vec<String>,
}
4171
/// v7.17.0 Phase 1.4 — catalogued user-defined ENUM type. The
/// label vector is order-preserving (PG enum ordering follows the
/// declared order). At INSERT/UPDATE on a column bound to this
/// enum, the engine looks up the value against `labels` and
/// rejects non-members.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EnumDef {
    /// Type name; the key under which `Catalog::create_enum_type`
    /// stores the definition.
    pub name: String,
    /// Member labels in declaration order.
    pub labels: Vec<String>,
}
4182
/// v7.17.0 Phase 1.2 — catalogued VIEW. The body is stored as the
/// raw source text the parser saw between `AS` and the statement
/// terminator; the engine re-parses on each invocation. Same
/// pattern as `FunctionDef` — keeps `spg-storage` free of
/// `spg-sql` dependency.
///
/// NOTE(review): the paragraph above says "raw source text" while
/// the `body` field doc says the text is Display-rendered at
/// storage time; storage itself just keeps the string it is given
/// — confirm which form the engine passes in.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ViewDef {
    /// View name; the key under which `Catalog::create_view`
    /// stores the definition.
    pub name: String,
    /// Optional `(col, col, …)` rename list. Empty when the body's
    /// projected names are used directly.
    pub columns: Vec<String>,
    /// Raw SELECT source. Display-rendered at storage time so the
    /// catalog round-trips a deterministic form regardless of
    /// whitespace / comments in the original input. Re-parsed at
    /// SELECT-from-view time to materialise as a synthetic CTE.
    pub body: String,
}
4200
4201impl SequenceDataType {
4202 /// PG default min/max per AS clause.
4203 pub fn default_bounds(self, increment_positive: bool) -> (i64, i64) {
4204 match self {
4205 Self::SmallInt => {
4206 if increment_positive {
4207 (1, i64::from(i16::MAX))
4208 } else {
4209 (i64::from(i16::MIN), -1)
4210 }
4211 }
4212 Self::Int => {
4213 if increment_positive {
4214 (1, i64::from(i32::MAX))
4215 } else {
4216 (i64::from(i32::MIN), -1)
4217 }
4218 }
4219 Self::BigInt => {
4220 if increment_positive {
4221 (1, i64::MAX)
4222 } else {
4223 (i64::MIN, -1)
4224 }
4225 }
4226 }
4227 }
4228}
4229
4230impl Catalog {
4231 pub const fn new() -> Self {
4232 Self {
4233 tables: Vec::new(),
4234 by_name: BTreeMap::new(),
4235 cold_segments: Vec::new(),
4236 functions: BTreeMap::new(),
4237 triggers: Vec::new(),
4238 sequences: BTreeMap::new(),
4239 views: BTreeMap::new(),
4240 materialized_views: BTreeMap::new(),
4241 enum_types: BTreeMap::new(),
4242 domain_types: BTreeMap::new(),
4243 schemas: alloc::collections::BTreeSet::new(),
4244 }
4245 }
4246
    /// v7.12.4 — read-only view of catalogued user-defined
    /// functions, keyed by function name. Engine callers go
    /// through here to look up the function body before
    /// re-parsing it for invocation.
    pub const fn functions(&self) -> &BTreeMap<String, FunctionDef> {
        &self.functions
    }
4253
4254 /// v7.12.4 — register a new user-defined function. With
4255 /// `or_replace = false`, errors if the name is taken. The
4256 /// engine validates the body before passing it here.
4257 pub fn create_function(
4258 &mut self,
4259 def: FunctionDef,
4260 or_replace: bool,
4261 ) -> Result<(), StorageError> {
4262 if !or_replace && self.functions.contains_key(&def.name) {
4263 return Err(StorageError::Corrupt(format!(
4264 "function {:?} already exists (drop or use CREATE OR REPLACE)",
4265 def.name
4266 )));
4267 }
4268 self.functions.insert(def.name.clone(), def);
4269 Ok(())
4270 }
4271
4272 /// v7.12.4 — remove a user-defined function by name. Returns
4273 /// `true` if a function was removed, `false` if none matched.
4274 /// Caller decides whether to surface `if_exists` semantics.
4275 pub fn drop_function(&mut self, name: &str) -> bool {
4276 self.functions.remove(name).is_some()
4277 }
4278
    /// v7.17.0 — read-only handle to catalogued sequences, keyed
    /// by sequence name.
    pub const fn sequences(&self) -> &BTreeMap<String, SequenceDef> {
        &self.sequences
    }
4283
4284 /// v7.17.0 — register a new SEQUENCE. Errors if `name`
4285 /// collides with an existing sequence and `if_not_exists`
4286 /// is false.
4287 pub fn create_sequence(
4288 &mut self,
4289 def: SequenceDef,
4290 if_not_exists: bool,
4291 ) -> Result<(), StorageError> {
4292 if self.sequences.contains_key(&def.name) {
4293 if if_not_exists {
4294 return Ok(());
4295 }
4296 return Err(StorageError::Corrupt(format!(
4297 "sequence {:?} already exists",
4298 def.name
4299 )));
4300 }
4301 self.sequences.insert(def.name.clone(), def);
4302 Ok(())
4303 }
4304
4305 /// v7.17.0 — remove a SEQUENCE by name. Returns `true` if a
4306 /// sequence was removed, `false` if none matched. Caller
4307 /// surfaces IF EXISTS semantics.
4308 pub fn drop_sequence(&mut self, name: &str) -> bool {
4309 self.sequences.remove(name).is_some()
4310 }
4311
4312 /// v7.17.0 — atomic nextval. Increments `last_value` per
4313 /// `increment`, returns the new value, sets `is_called`.
4314 /// Returns an error on CYCLE-less overflow.
4315 pub fn sequence_next_value(&mut self, name: &str) -> Result<i64, StorageError> {
4316 let Some(seq) = self.sequences.get_mut(name) else {
4317 return Err(StorageError::Corrupt(format!(
4318 "sequence {name:?} does not exist"
4319 )));
4320 };
4321 // PG semantics: when !is_called (fresh sequence or
4322 // setval(_, false)), the next nextval returns the stored
4323 // `last_value`. When is_called, it advances by `increment`
4324 // and CYCLE-wraps on overflow.
4325 let candidate = if seq.is_called {
4326 let next = seq.last_value.checked_add(seq.increment).ok_or_else(|| {
4327 StorageError::Corrupt(format!("sequence {name:?} arithmetic overflow"))
4328 })?;
4329 if seq.increment > 0 {
4330 if next > seq.max_value {
4331 if seq.cycle {
4332 seq.min_value
4333 } else {
4334 return Err(StorageError::Corrupt(format!(
4335 "sequence {name:?} reached MAXVALUE ({})",
4336 seq.max_value
4337 )));
4338 }
4339 } else {
4340 next
4341 }
4342 } else if next < seq.min_value {
4343 if seq.cycle {
4344 seq.max_value
4345 } else {
4346 return Err(StorageError::Corrupt(format!(
4347 "sequence {name:?} reached MINVALUE ({})",
4348 seq.min_value
4349 )));
4350 }
4351 } else {
4352 next
4353 }
4354 } else {
4355 seq.last_value
4356 };
4357 seq.last_value = candidate;
4358 seq.is_called = true;
4359 Ok(candidate)
4360 }
4361
4362 /// v7.17.0 — currval. Errors if the session has never called
4363 /// nextval on this sequence (PG semantics). At the catalog
4364 /// level we approximate "session" with "is_called persisted";
4365 /// the engine session-tracking layer can wrap this for the
4366 /// strict per-session semantics later.
4367 pub fn sequence_current_value(&self, name: &str) -> Result<i64, StorageError> {
4368 let Some(seq) = self.sequences.get(name) else {
4369 return Err(StorageError::Corrupt(format!(
4370 "sequence {name:?} does not exist"
4371 )));
4372 };
4373 if !seq.is_called {
4374 return Err(StorageError::Corrupt(format!(
4375 "currval of sequence {name:?} is not yet defined in this session"
4376 )));
4377 }
4378 Ok(seq.last_value)
4379 }
4380
4381 /// v7.17.0 — setval(name, value [, is_called]). PG returns
4382 /// `value` regardless. `is_called=true` means the NEXT
4383 /// nextval will return `value + increment`; `is_called=false`
4384 /// means the next nextval will return `value`.
4385 pub fn sequence_set_value(
4386 &mut self,
4387 name: &str,
4388 value: i64,
4389 is_called: bool,
4390 ) -> Result<i64, StorageError> {
4391 let Some(seq) = self.sequences.get_mut(name) else {
4392 return Err(StorageError::Corrupt(format!(
4393 "sequence {name:?} does not exist"
4394 )));
4395 };
4396 seq.last_value = value;
4397 seq.is_called = is_called;
4398 Ok(value)
4399 }
4400
    /// v7.17.0 Phase 1.2 — read-only handle to catalogued views,
    /// keyed by view name.
    pub const fn views(&self) -> &BTreeMap<String, ViewDef> {
        &self.views
    }
4405
4406 /// v7.17.0 Phase 1.2 — install a VIEW. `or_replace=true`
4407 /// overwrites an existing entry; `if_not_exists=true` is a
4408 /// silent no-op when the name is taken. Errors if both flags
4409 /// are off and the name collides.
4410 pub fn create_view(
4411 &mut self,
4412 def: ViewDef,
4413 or_replace: bool,
4414 if_not_exists: bool,
4415 ) -> Result<(), StorageError> {
4416 if self.views.contains_key(&def.name) {
4417 if or_replace {
4418 self.views.insert(def.name.clone(), def);
4419 return Ok(());
4420 }
4421 if if_not_exists {
4422 return Ok(());
4423 }
4424 return Err(StorageError::Corrupt(format!(
4425 "view {:?} already exists",
4426 def.name
4427 )));
4428 }
4429 // Reject name collision with tables / sequences — same
4430 // namespace per PG.
4431 if self.by_name.contains_key(&def.name) {
4432 return Err(StorageError::Corrupt(format!(
4433 "view {:?} would shadow an existing table",
4434 def.name
4435 )));
4436 }
4437 if self.sequences.contains_key(&def.name) {
4438 return Err(StorageError::Corrupt(format!(
4439 "view {:?} would shadow an existing sequence",
4440 def.name
4441 )));
4442 }
4443 self.views.insert(def.name.clone(), def);
4444 Ok(())
4445 }
4446
4447 /// v7.17.0 Phase 1.2 — remove a view by name. Returns true if
4448 /// a view was removed.
4449 pub fn drop_view(&mut self, name: &str) -> bool {
4450 self.views.remove(name).is_some()
4451 }
4452
    /// v7.17.0 Phase 1.3 — read-only handle to the materialised-
    /// view source registry (view name → source text). Each entry
    /// pairs with a regular table of the same name that holds the
    /// cached rows.
    pub const fn materialized_views(&self) -> &BTreeMap<String, String> {
        &self.materialized_views
    }
4459
4460 /// v7.17.0 Phase 1.3 — register a source for a materialised
4461 /// view. Caller has already created the backing table.
4462 pub fn register_materialized_view(&mut self, name: String, body: String) {
4463 self.materialized_views.insert(name, body);
4464 }
4465
4466 /// v7.17.0 Phase 1.3 — drop the source registry entry. Returns
4467 /// true if a source was unregistered. Caller separately drops
4468 /// the backing table.
4469 pub fn drop_materialized_view_source(&mut self, name: &str) -> bool {
4470 self.materialized_views.remove(name).is_some()
4471 }
4472
    /// v7.17.0 Phase 1.4 — read-only handle to user-defined ENUM
    /// catalog, keyed by type name.
    pub const fn enum_types(&self) -> &BTreeMap<String, EnumDef> {
        &self.enum_types
    }
4478
4479 /// v7.17.0 Phase 1.4 — install a new ENUM type. Errors if
4480 /// `name` collides with an existing enum (no IF NOT EXISTS
4481 /// per PG semantics for CREATE TYPE).
4482 pub fn create_enum_type(&mut self, def: EnumDef) -> Result<(), StorageError> {
4483 if self.enum_types.contains_key(&def.name) {
4484 return Err(StorageError::Corrupt(format!(
4485 "type {:?} already exists",
4486 def.name
4487 )));
4488 }
4489 self.enum_types.insert(def.name.clone(), def);
4490 Ok(())
4491 }
4492
4493 /// v7.17.0 Phase 1.4 — drop an ENUM type by name. Returns
4494 /// true if a type was removed.
4495 pub fn drop_enum_type(&mut self, name: &str) -> bool {
4496 self.enum_types.remove(name).is_some()
4497 }
4498
    /// v7.17.0 Phase 1.5 — read-only handle to DOMAIN catalog,
    /// keyed by domain name.
    pub const fn domain_types(&self) -> &BTreeMap<String, DomainDef> {
        &self.domain_types
    }
4503
4504 /// v7.17.0 Phase 1.5 — install a DOMAIN. Errors on collision
4505 /// with an existing domain.
4506 pub fn create_domain_type(&mut self, def: DomainDef) -> Result<(), StorageError> {
4507 if self.domain_types.contains_key(&def.name) {
4508 return Err(StorageError::Corrupt(format!(
4509 "domain {:?} already exists",
4510 def.name
4511 )));
4512 }
4513 self.domain_types.insert(def.name.clone(), def);
4514 Ok(())
4515 }
4516
4517 /// v7.17.0 Phase 1.5 — drop a DOMAIN by name.
4518 pub fn drop_domain_type(&mut self, name: &str) -> bool {
4519 self.domain_types.remove(name).is_some()
4520 }
4521
    /// v7.17.0 Phase 1.6 — read-only handle to the user-created
    /// schema registry (the names added through
    /// `create_schema`). Built-in schemas (`public`, `pg_catalog`,
    /// `information_schema`) are NOT included here; use
    /// [`schema_exists`](Self::schema_exists) for the full
    /// check.
    pub const fn user_schemas(&self) -> &alloc::collections::BTreeSet<String> {
        &self.schemas
    }
4530
4531 /// v7.17.0 Phase 1.6 — schema-name resolver. Returns true
4532 /// for built-in schemas + every user-CREATEd one. Used by
4533 /// CREATE SCHEMA collision checks and (future) by
4534 /// information_schema.schemata.
4535 pub fn schema_exists(&self, name: &str) -> bool {
4536 is_builtin_schema(name) || self.schemas.contains(name)
4537 }
4538
4539 /// v7.17.0 Phase 1.6 — register a new schema. Errors if the
4540 /// name already exists and `if_not_exists=false`. Built-in
4541 /// names cannot be redeclared.
4542 pub fn create_schema(&mut self, name: String, if_not_exists: bool) -> Result<(), StorageError> {
4543 if is_builtin_schema(&name) {
4544 if if_not_exists {
4545 return Ok(());
4546 }
4547 return Err(StorageError::Corrupt(format!(
4548 "schema {name:?} is built-in and cannot be redeclared"
4549 )));
4550 }
4551 if self.schemas.contains(&name) {
4552 if if_not_exists {
4553 return Ok(());
4554 }
4555 return Err(StorageError::Corrupt(format!(
4556 "schema {name:?} already exists"
4557 )));
4558 }
4559 self.schemas.insert(name);
4560 Ok(())
4561 }
4562
4563 /// v7.17.0 Phase 1.6 — drop a user-created schema. Returns
4564 /// true if a schema was removed. Built-in names always
4565 /// return false (cannot be dropped). Tables that previously
4566 /// used the schema as a prefix keep their bare name and stay
4567 /// queryable — this is the "prefix routing, not isolation"
4568 /// posture documented in v7.17 Phase 1.6.
4569 pub fn drop_schema(&mut self, name: &str) -> Result<bool, StorageError> {
4570 if is_builtin_schema(name) {
4571 return Err(StorageError::Corrupt(format!(
4572 "schema {name:?} is built-in and cannot be dropped"
4573 )));
4574 }
4575 Ok(self.schemas.remove(name))
4576 }
4577
4578 /// v7.17.0 — ALTER SEQUENCE option merge. Caller-provided
4579 /// updates overwrite the matching fields; unset fields keep
4580 /// their stored values. RESTART variants update last_value
4581 /// directly per PG: `RESTART` resets to current `start`;
4582 /// `RESTART WITH n` resets to `n`.
4583 #[allow(clippy::too_many_arguments)]
4584 pub fn alter_sequence(
4585 &mut self,
4586 name: &str,
4587 increment: Option<i64>,
4588 min_value: Option<i64>,
4589 max_value: Option<i64>,
4590 start: Option<i64>,
4591 restart: Option<Option<i64>>,
4592 cache: Option<i64>,
4593 cycle: Option<bool>,
4594 owned_by: Option<Option<(String, String)>>,
4595 ) -> Result<(), StorageError> {
4596 let Some(seq) = self.sequences.get_mut(name) else {
4597 return Err(StorageError::Corrupt(format!(
4598 "sequence {name:?} does not exist"
4599 )));
4600 };
4601 if let Some(v) = increment {
4602 seq.increment = v;
4603 }
4604 if let Some(v) = min_value {
4605 seq.min_value = v;
4606 }
4607 if let Some(v) = max_value {
4608 seq.max_value = v;
4609 }
4610 if let Some(v) = start {
4611 seq.start = v;
4612 }
4613 if let Some(restart_value) = restart {
4614 seq.last_value = restart_value.unwrap_or(seq.start);
4615 seq.is_called = false;
4616 }
4617 if let Some(v) = cache {
4618 seq.cache = v;
4619 }
4620 if let Some(v) = cycle {
4621 seq.cycle = v;
4622 }
4623 if let Some(v) = owned_by {
4624 seq.owned_by = v;
4625 }
4626 Ok(())
4627 }
4628
    /// v7.12.4 — read-only slice of all catalogued triggers, in
    /// registration order (`create_trigger` appends new entries
    /// and replaces existing ones in place). Engine row-write
    /// paths filter this by (table, event, timing) and fire
    /// matches in slice order.
    pub fn triggers(&self) -> &[TriggerDef] {
        &self.triggers
    }
4635
    /// v7.15.0 — mutable handle to the trigger slice for
    /// `ALTER TABLE … RENAME COLUMN`, which rewrites every
    /// `update_columns` entry that referenced the renamed
    /// column. Edits made through this handle bypass the checks
    /// `create_trigger` performs (known table, known function,
    /// unique `(name, table)`), so callers must preserve them.
    pub fn triggers_mut(&mut self) -> &mut Vec<TriggerDef> {
        &mut self.triggers
    }
4643
4644 /// v7.12.4 — register a new trigger. With `or_replace = false`,
4645 /// errors when a trigger with the same name already exists on
4646 /// the same table (PG scoping rule — trigger names are
4647 /// per-table, not global). Trigger function must already
4648 /// exist in the catalog at registration time.
4649 pub fn create_trigger(
4650 &mut self,
4651 def: TriggerDef,
4652 or_replace: bool,
4653 ) -> Result<(), StorageError> {
4654 if !self.by_name.contains_key(&def.table) {
4655 return Err(StorageError::TableNotFound {
4656 name: def.table.clone(),
4657 });
4658 }
4659 if !self.functions.contains_key(&def.function) {
4660 return Err(StorageError::Corrupt(format!(
4661 "trigger {:?} references unknown function {:?}",
4662 def.name, def.function
4663 )));
4664 }
4665 let dup = self
4666 .triggers
4667 .iter()
4668 .position(|t| t.name == def.name && t.table == def.table);
4669 match (dup, or_replace) {
4670 (Some(_), false) => Err(StorageError::Corrupt(format!(
4671 "trigger {:?} already exists on table {:?}",
4672 def.name, def.table
4673 ))),
4674 (Some(i), true) => {
4675 self.triggers[i] = def;
4676 Ok(())
4677 }
4678 (None, _) => {
4679 self.triggers.push(def);
4680 Ok(())
4681 }
4682 }
4683 }
4684
4685 /// v7.12.4 — remove a trigger by `(name, table)`. Returns
4686 /// `true` if one was removed.
4687 pub fn drop_trigger(&mut self, name: &str, table: &str) -> bool {
4688 let before = self.triggers.len();
4689 self.triggers
4690 .retain(|t| !(t.name == name && t.table == table));
4691 before != self.triggers.len()
4692 }
4693
4694 pub fn create_table(&mut self, schema: TableSchema) -> Result<(), StorageError> {
4695 if self.by_name.contains_key(&schema.name) {
4696 return Err(StorageError::DuplicateTable {
4697 name: schema.name.clone(),
4698 });
4699 }
4700 let idx = self.tables.len();
4701 let name = schema.name.clone();
4702 self.tables.push(Table::new(schema));
4703 self.by_name.insert(name, idx);
4704 Ok(())
4705 }
4706
4707 pub fn get(&self, name: &str) -> Option<&Table> {
4708 let idx = *self.by_name.get(name)?;
4709 self.tables.get(idx)
4710 }
4711
4712 pub fn get_mut(&mut self, name: &str) -> Option<&mut Table> {
4713 let idx = *self.by_name.get(name)?;
4714 self.tables.get_mut(idx)
4715 }
4716
    /// Number of tables currently in the catalog.
    pub fn table_count(&self) -> usize {
        self.tables.len()
    }
4720
4721 /// v7.14.0 — remove a table by name. Returns `true` when the
4722 /// table existed (and is now gone), `false` when it didn't.
4723 /// Used by `DROP TABLE` from pg_dump / mysqldump preambles
4724 /// where the dump re-creates schema and starts with
4725 /// `DROP TABLE IF EXISTS`.
4726 pub fn drop_table(&mut self, name: &str) -> bool {
4727 let Some(idx) = self.by_name.remove(name) else {
4728 return false;
4729 };
4730 // swap_remove invalidates the trailing index → rebuild
4731 // by_name for affected entries.
4732 self.tables.swap_remove(idx);
4733 // Re-stamp moved table's index slot in by_name.
4734 if idx < self.tables.len() {
4735 let moved_name = self.tables[idx].schema.name.clone();
4736 self.by_name.insert(moved_name, idx);
4737 }
4738 true
4739 }
4740
4741 /// v7.16.2 — rename a table (mailrs round-10 A.5). Updates
4742 /// the schema name, the catalog name → index map, and
4743 /// rewrites every reference dangling at the table name:
4744 /// * every FK on every OTHER table whose `parent_table`
4745 /// pointed at the old name now points at the new
4746 /// name, so FK enforcement keeps working
4747 /// * every trigger watching the table updates its `table`
4748 /// field
4749 /// Returns `Ok` on success; `Err(StorageError::TableNotFound)`
4750 /// when the old name isn't in the catalog and
4751 /// `Err(StorageError::DuplicateTable)` when the new name is
4752 /// already taken.
4753 pub fn rename_table(&mut self, old: &str, new: &str) -> Result<(), StorageError> {
4754 if old == new {
4755 return Ok(());
4756 }
4757 if self.by_name.contains_key(new) {
4758 return Err(StorageError::Corrupt(format!(
4759 "rename_table: target name {new:?} already exists"
4760 )));
4761 }
4762 let idx = self
4763 .by_name
4764 .remove(old)
4765 .ok_or_else(|| StorageError::TableNotFound { name: old.into() })?;
4766 self.tables[idx].schema.name = new.to_string();
4767 self.by_name.insert(new.to_string(), idx);
4768 for t in &mut self.tables {
4769 for fk in &mut t.schema.foreign_keys {
4770 if fk.parent_table == old {
4771 fk.parent_table = new.to_string();
4772 }
4773 }
4774 }
4775 for trig in &mut self.triggers {
4776 if trig.table == old {
4777 trig.table = new.to_string();
4778 }
4779 }
4780 Ok(())
4781 }
4782
4783 /// v7.16.2 — rename an index by name. Walks every table
4784 /// since the index lives on its owning table; updates the
4785 /// name in place. Errors with `IndexNotFound` when no
4786 /// index matches. mailrs round-10 A.5.
4787 pub fn rename_index(&mut self, old: &str, new: &str) -> Result<(), StorageError> {
4788 if old == new {
4789 return Ok(());
4790 }
4791 // Reject the new name if it already exists anywhere.
4792 for t in &self.tables {
4793 if t.indices.iter().any(|i| i.name == new) {
4794 return Err(StorageError::Corrupt(format!(
4795 "rename_index: target name {new:?} already exists"
4796 )));
4797 }
4798 }
4799 for t in &mut self.tables {
4800 for i in &mut t.indices {
4801 if i.name == old {
4802 i.name = new.to_string();
4803 return Ok(());
4804 }
4805 }
4806 }
4807 Err(StorageError::IndexNotFound { name: old.into() })
4808 }
4809
4810 /// v7.14.0 — remove a named index across the catalog.
4811 /// Returns `true` when found + dropped.
4812 pub fn drop_named_index(&mut self, name: &str) -> bool {
4813 for t in &mut self.tables {
4814 let before = t.indices.len();
4815 t.indices.retain(|i| i.name != name);
4816 if t.indices.len() != before {
4817 return true;
4818 }
4819 }
4820 false
4821 }
4822
4823 /// Borrow-free copy of every table's name in catalog order
4824 /// (= insertion order, matching the on-disk encoding).
4825 pub fn table_names(&self) -> Vec<String> {
4826 self.tables.iter().map(|t| t.schema.name.clone()).collect()
4827 }
4828
4829 /// v5.1: register a cold-tier segment that already lives in
4830 /// memory (caller did the file read). Returns the
4831 /// `segment_id` that `RowLocator::Cold { segment_id, .. }`
4832 /// will reference — currently this is just the index into
4833 /// `cold_segments`, but treat it as an opaque token.
4834 ///
4835 /// Storage is `no_std`, so file I/O is the caller's
4836 /// responsibility — `spg-server` reads the file and forwards
4837 /// the bytes here. The bytes stay resident in the catalog
4838 /// for the life of the `Catalog`, parsed only once.
4839 pub fn load_segment_bytes(&mut self, bytes: Vec<u8>) -> Result<u32, StorageError> {
4840 let id = u32::try_from(self.cold_segments.len()).map_err(|_| {
4841 StorageError::Corrupt("cold segment count would exceed u32::MAX".into())
4842 })?;
4843 let seg = OwnedSegment::from_bytes(bytes)
4844 .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
4845 self.cold_segments.push(Some(Arc::new(seg)));
4846 Ok(id)
4847 }
4848
4849 /// v6.7.3 — register a cold-tier segment at a specific id. Used
4850 /// by the spg-server manifest-boot path so segments whose
4851 /// neighbouring ids were retired by compaction still get back
4852 /// the same `segment_id` they had pre-restart (the
4853 /// `RowLocator::Cold { segment_id }` baked into the BTree-index
4854 /// snapshot persists across restart and must continue to
4855 /// resolve).
4856 ///
4857 /// Pads the Vec with `None` slots up to `target_id` if needed.
4858 /// Errors when the target slot is already occupied (would
4859 /// stomp another segment), the parse fails, or `target_id`
4860 /// exceeds `u32::MAX`.
4861 pub fn load_segment_bytes_at(
4862 &mut self,
4863 target_id: u32,
4864 bytes: Vec<u8>,
4865 ) -> Result<(), StorageError> {
4866 let seg = OwnedSegment::from_bytes(bytes)
4867 .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
4868 let idx = target_id as usize;
4869 while self.cold_segments.len() <= idx {
4870 self.cold_segments.push(None);
4871 }
4872 if self.cold_segments[idx].is_some() {
4873 return Err(StorageError::Corrupt(format!(
4874 "load_segment_bytes_at: segment_id {target_id} already occupied"
4875 )));
4876 }
4877 self.cold_segments[idx] = Some(Arc::new(seg));
4878 Ok(())
4879 }
4880
4881 /// v6.7.3 — retire a cold-tier segment slot (compaction-driven).
4882 /// The physical file is the caller's concern (typically kept
4883 /// on disk until the next CHECKPOINT writes a manifest that
4884 /// no longer lists it); this just flips the in-memory slot
4885 /// to `None` so later cold lookups for `segment_id` resolve
4886 /// as "unknown" instead of returning a stale row.
4887 ///
4888 /// No-op when the slot is already `None`. Errors only when
4889 /// `segment_id` is out of bounds.
4890 pub fn tombstone_segment(&mut self, segment_id: u32) -> Result<(), StorageError> {
4891 let idx = segment_id as usize;
4892 if idx >= self.cold_segments.len() {
4893 return Err(StorageError::Corrupt(format!(
4894 "tombstone_segment: segment_id {segment_id} out of bounds (len={})",
4895 self.cold_segments.len()
4896 )));
4897 }
4898 self.cold_segments[idx] = None;
4899 Ok(())
4900 }
4901
4902 /// Number of *active* (non-tombstoned) cold segments.
4903 #[must_use]
4904 pub fn cold_segment_count(&self) -> usize {
4905 self.cold_segments.iter().filter(|s| s.is_some()).count()
4906 }
4907
    /// Slot count including tombstones (= the next id the
    /// no-arg `load_segment_bytes` would allocate). Compare with
    /// `cold_segment_count`, which skips tombstoned slots.
    #[must_use]
    pub fn cold_segment_slot_count(&self) -> usize {
        self.cold_segments.len()
    }
4914
4915 /// v6.2.7 — list every *active* cold-tier segment id known to
4916 /// this catalog (skips compaction tombstones since v6.7.3).
4917 /// Used by EXPLAIN ANALYZE to annotate scan nodes with the
4918 /// segments they could have walked.
4919 #[must_use]
4920 pub fn cold_segment_ids_global(&self) -> Vec<u32> {
4921 self.cold_segments
4922 .iter()
4923 .enumerate()
4924 .filter_map(|(i, s)| s.as_ref().map(|_| i as u32))
4925 .collect()
4926 }
4927
4928 /// v5.2.1: sum of `Table::hot_bytes` across every table. The v5.2
4929 /// freezer compares this against `SPG_HOT_TIER_BYTES` (parsed at
4930 /// server startup; default 4 GiB) and wakes when the budget is
4931 /// crossed. Pre-freezer (v5.2.1) this is measurement-only — the
4932 /// counter exposes whether the budget is being approached without
4933 /// triggering any demotion.
4934 #[must_use]
4935 pub fn hot_tier_bytes(&self) -> u64 {
4936 self.tables
4937 .iter()
4938 .map(Table::hot_bytes)
4939 .fold(0u64, u64::saturating_add)
4940 }
4941
4942 /// v5.2.2: freeze the **first** `max_rows` rows of `table_name`'s
4943 /// hot tier into a brand-new cold-tier segment. The named `BTree`
4944 /// index supplies the per-row PK (its column must be an integer
4945 /// type — v5.2.2 only supports `IndexKey::Int` PKs, matching the
4946 /// `index_key_as_u64` constraint used by the cold-tier lookup
4947 /// path). On success returns a [`FreezeReport`] with the
4948 /// freshly-allocated segment id, the count of rows that moved,
4949 /// the encoded segment bytes (so the caller can persist them to
4950 /// disk for later reload via `SPG_PRELOAD_COLD_SEGMENT`), and the
4951 /// hot-tier byte delta that was reclaimed.
4952 ///
4953 /// **Semantics**:
4954 /// 1. The first `max_rows` rows (by hot-tier position — same as
4955 /// insertion order under v4.39 `PersistentVec`) are read.
4956 /// 2. Rows are sorted ascending by PK and serialised into a new
4957 /// segment via [`encode_segment`].
4958 /// 3. The hot rows are dropped via [`Table::delete_rows`]; the
4959 /// `rebuild_indices` it triggers regenerates `Hot` locators
4960 /// for every remaining row (their positions shift down by
4961 /// `max_rows`). Existing `Cold` locators in this index — from
4962 /// a previous freeze — are also rebuilt **but with empty
4963 /// payload** since rebuild reads only `self.rows`; this
4964 /// routine re-registers them at the end of the call so the
4965 /// user-visible state preserves all prior cold locators.
4966 /// 4. The new segment is loaded into `self.cold_segments` via
4967 /// [`Catalog::load_segment_bytes`] (allocating a fresh
4968 /// `segment_id`). New `Cold` locators are registered on the
4969 /// named index — one per frozen row.
4970 ///
4971 /// **v5.2.2 limits** (relaxed in later sub-versions):
4972 /// - INSERT-only flow: subsequent UPDATE/DELETE on a frozen row
4973 /// returns a stale-locator error (no promote-on-write until
4974 /// v5.2.3).
4975 /// - Single-table scope: callers iterate tables themselves.
4976 /// - All-or-nothing: returns `Err` and leaves catalog unchanged
4977 /// if any step fails before the atomic swap point.
4978 ///
4979 /// Errors:
4980 /// - [`StorageError::Corrupt`] for missing table/index, non-`BTree`
4981 /// index, non-integer PK column, `max_rows == 0`, or
4982 /// `max_rows > row_count`.
4983 /// - The encoder's [`SegmentError`] surfaces as `Corrupt` (the
4984 /// only realistic source is "a single row is larger than the
4985 /// page size"; SPG schemas don't hit it in practice).
4986 pub fn freeze_oldest_to_cold(
4987 &mut self,
4988 table_name: &str,
4989 index_name: &str,
4990 max_rows: usize,
4991 ) -> Result<FreezeReport, StorageError> {
4992 // --- validation phase: never mutates ---------------------
4993 if max_rows == 0 {
4994 return Err(StorageError::Corrupt(
4995 "freeze_oldest_to_cold: max_rows must be > 0".into(),
4996 ));
4997 }
4998 let table = self.get(table_name).ok_or_else(|| {
4999 StorageError::Corrupt(format!(
5000 "freeze_oldest_to_cold: table {table_name:?} not found"
5001 ))
5002 })?;
5003 if max_rows > table.rows.len() {
5004 return Err(StorageError::Corrupt(format!(
5005 "freeze_oldest_to_cold: max_rows {max_rows} > row_count {}",
5006 table.rows.len()
5007 )));
5008 }
5009 let idx = table
5010 .indices
5011 .iter()
5012 .find(|i| i.name == index_name)
5013 .ok_or_else(|| {
5014 StorageError::Corrupt(format!(
5015 "freeze_oldest_to_cold: index {index_name:?} not found on {table_name:?}"
5016 ))
5017 })?;
5018 if !matches!(idx.kind, IndexKind::BTree(_)) {
5019 return Err(StorageError::Corrupt(format!(
5020 "freeze_oldest_to_cold: index {index_name:?} is NSW; only BTree indices may freeze"
5021 )));
5022 }
5023 let column_position = idx.column_position;
5024
5025 // --- segment build phase: reads only --------------------
5026 let schema = table.schema.clone();
5027 let mut to_freeze: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(max_rows);
5028 for row_idx in 0..max_rows {
5029 let row = table.rows.get(row_idx).expect("bounds-checked above");
5030 let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
5031 StorageError::Corrupt(format!(
5032 "freeze_oldest_to_cold: row {row_idx} has NULL / non-key value in index column"
5033 ))
5034 })?;
5035 let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
5036 StorageError::Corrupt(format!(
5037 "freeze_oldest_to_cold: index {index_name:?} column type is non-integer; \
5038 v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
5039 ))
5040 })?;
5041 to_freeze.push((pk_u64, encode_row_body_dense(row, &schema), key));
5042 }
5043 // encode_segment requires ascending u64 keys. Sort by PK
5044 // before encoding; the caller's row-position order is not
5045 // necessarily PK order (e.g. workloads that insert random
5046 // PKs).
5047 to_freeze.sort_by_key(|(k, _, _)| *k);
5048 // Reject duplicate PKs — encode_segment also rejects them
5049 // (`SegmentError::UnsortedKey`), but the resulting error
5050 // message there is misleading. Surface a clearer one.
5051 for w in to_freeze.windows(2) {
5052 if w[0].0 == w[1].0 {
5053 return Err(StorageError::Corrupt(format!(
5054 "freeze_oldest_to_cold: duplicate PK {} in freeze batch",
5055 w[0].0
5056 )));
5057 }
5058 }
5059 // Snapshot the (key, locator) pairs that will be registered
5060 // post-swap. Cloning the IndexKey out before the move makes
5061 // the registration loop borrow-free.
5062 let post_swap_keys: Vec<IndexKey> = to_freeze.iter().map(|(_, _, k)| k.clone()).collect();
5063 // Segment encode is now infallible w.r.t. ordering. Map the
5064 // `SegmentError` into a `StorageError::Corrupt` so the
5065 // public surface stays one error type.
5066 let seg_rows: Vec<(u64, Vec<u8>)> = to_freeze
5067 .into_iter()
5068 .map(|(k, body, _)| (k, body))
5069 .collect();
5070 let frozen_rows = seg_rows.len();
5071 let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
5072 .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: encode: {e}")))?;
5073
5074 // --- atomic swap phase: mutations only past this point ---
5075 // v5.2.3 made `Table::rebuild_indices` preserve every Cold
5076 // locator across the per-table rebuild, so `delete_rows`
5077 // below no longer wipes prior-freeze cold entries. The pre-
5078 // v5.2.3 capture-then-re-register that used to live here
5079 // was removed in v5.3.1 — keeping it would double-count
5080 // every prior-frozen key's Cold locator on each subsequent
5081 // freeze.
5082 let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
5083 let positions: Vec<usize> = (0..max_rows).collect();
5084 let t_mut = self
5085 .get_mut(table_name)
5086 .expect("just validated; still present");
5087 let removed = t_mut.delete_rows(&positions);
5088 debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
5089 let bytes_after = t_mut.hot_bytes();
5090 let bytes_freed = bytes_before.saturating_sub(bytes_after);
5091
5092 let segment_id = self
5093 .load_segment_bytes(seg_bytes.clone())
5094 .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: load: {e}")))?;
5095 let new_cold = post_swap_keys.into_iter().map(|k| {
5096 (
5097 k,
5098 RowLocator::Cold {
5099 segment_id,
5100 page_offset: 0,
5101 },
5102 )
5103 });
5104 let t_mut = self.get_mut(table_name).expect("still present");
5105 t_mut.register_cold_locators(index_name, new_cold)?;
5106
5107 Ok(FreezeReport {
5108 segment_id,
5109 frozen_rows,
5110 bytes_freed,
5111 segment_bytes: seg_bytes,
5112 })
5113 }
5114
5115 /// v5.1: borrow the cold segment at `segment_id`. Used by the
5116 /// spg-server preload path to enumerate (key, locator) pairs
5117 /// after loading a segment, so it can call
5118 /// [`Table::register_cold_locators`] without re-parsing the
5119 /// bytes.
5120 #[must_use]
5121 pub fn cold_segment(&self, segment_id: u32) -> Option<&OwnedSegment> {
5122 self.cold_segments
5123 .get(segment_id as usize)
5124 .and_then(|s| s.as_deref())
5125 }
5126
5127 /// v5.1: resolve a single `RowLocator::Cold` to its underlying
5128 /// `Row`. Decoupled from [`Catalog::lookup_by_pk`] so callers
5129 /// iterating a multi-locator slice (e.g. the engine's index
5130 /// seek path) can dispatch per locator instead of getting back
5131 /// only the first row for a key. Returns `None` when the
5132 /// segment isn't registered, the key isn't `u64`-coercible, or
5133 /// the segment doesn't actually carry the key (bloom or page-
5134 /// index reject).
5135 pub fn resolve_cold_locator(
5136 &self,
5137 table_name: &str,
5138 segment_id: u32,
5139 key: &IndexKey,
5140 ) -> Option<Row> {
5141 let t = self.get(table_name)?;
5142 let u64_key = index_key_as_u64(key)?;
5143 let seg = self.cold_segments.get(segment_id as usize)?.as_ref()?;
5144 let payload = seg.lookup(u64_key)?;
5145 let (row, _) = decode_row_body_dense(&payload, &t.schema, seg.long_strings()).ok()?;
5146 Some(row)
5147 }
5148
5149 /// v5.1: indexed PK lookup that dispatches per locator,
5150 /// returning the first matching row from either the hot tier
5151 /// (`Table::rows`) or a registered cold segment.
5152 ///
5153 /// The cold path requires the index column to be coercible to
5154 /// a `u64` (the segment's PK type) and the segment payload to
5155 /// be a [`encode_row_body_dense`]-encoded row body for the
5156 /// same schema. v5.1 ships this for BIGINT / INT / SMALLINT
5157 /// PKs; other types fall through to hot-only behavior.
5158 ///
5159 /// Returns `None` if (a) the table or index doesn't exist,
5160 /// (b) the key isn't in the index at all, or (c) the key was
5161 /// resolved to a stale locator (Hot index out of range, Cold
5162 /// segment id unknown, segment lookup miss). Does not surface
5163 /// segment-decode errors — those would indicate corrupted
5164 /// cold-tier files and should be caught at
5165 /// [`Catalog::load_segment_bytes`] time.
5166 pub fn lookup_by_pk(&self, table: &str, index_name: &str, key: &IndexKey) -> Option<Row> {
5167 let t = self.get(table)?;
5168 let idx = t.indices.iter().find(|i| i.name == index_name)?;
5169 let locators = idx.lookup_eq(key);
5170 let cold_u64_key = index_key_as_u64(key);
5171 for loc in locators {
5172 match *loc {
5173 RowLocator::Hot(i) => {
5174 if let Some(row) = t.rows.get(i) {
5175 return Some(row.clone());
5176 }
5177 }
5178 RowLocator::Cold {
5179 segment_id,
5180 page_offset: _,
5181 } => {
5182 let Some(u64_key) = cold_u64_key else {
5183 // Key type not coercible to u64 — cold tier
5184 // only handles BIGINT/INT/SMALLINT in v5.1.
5185 continue;
5186 };
5187 let Some(seg) = self
5188 .cold_segments
5189 .get(segment_id as usize)
5190 .and_then(|s| s.as_deref())
5191 else {
5192 // v6.7.3 — `None` slot = compaction
5193 // retired this segment; the live locator
5194 // on a freshly-compacted index points to
5195 // the merged segment_id, so a Cold hit
5196 // here against a tombstone means the BTree
5197 // entry hasn't been swapped yet (mid-
5198 // compaction reader race) or the caller is
5199 // looking up a stale snapshot. Skip — the
5200 // next locator in the list, if any, is
5201 // typically the merged segment.
5202 continue;
5203 };
5204 let Some(payload) = seg.lookup(u64_key) else {
5205 continue;
5206 };
5207 let (row, _) =
5208 decode_row_body_dense(&payload, &t.schema, seg.long_strings()).ok()?;
5209 return Some(row);
5210 }
5211 }
5212 }
5213 None
5214 }
5215
5216 /// v5.2.3: promote a frozen row back to the hot tier so an
5217 /// UPDATE / DELETE can mutate it. Reads the cold-tier row body
5218 /// (decoded from its registered segment), pushes it into
5219 /// `table.rows` via [`Table::insert`] (which also adds a fresh
5220 /// `Hot(new_idx)` locator on `index_name`), then retires the
5221 /// shadowed `Cold` locator via
5222 /// [`Table::remove_cold_locators_for_key`]. The cold-tier row
5223 /// in the segment file becomes garbage — recoverable when a
5224 /// future cold-segment compaction job lands.
5225 ///
5226 /// Returns:
5227 /// - `Ok(Some(new_hot_idx))` when the key resolved through a
5228 /// cold locator and the promote completed. `new_hot_idx` is
5229 /// the position the row now occupies in `table.rows`.
5230 /// - `Ok(None)` when the key has no Cold locator on the index
5231 /// (already hot, or wasn't present at all). Callers treat this
5232 /// as "nothing to do here, fall back to the hot-only path".
5233 ///
5234 /// Errors when the table / index doesn't exist, the index isn't
5235 /// `BTree`, the cold segment is missing / can't decode the row,
5236 /// or the inferred row body fails `Table::insert` validation.
5237 pub fn promote_cold_row(
5238 &mut self,
5239 table_name: &str,
5240 index_name: &str,
5241 key: &IndexKey,
5242 ) -> Result<Option<usize>, StorageError> {
5243 let cold_loc = self.find_cold_locator(table_name, index_name, key)?;
5244 let Some((segment_id, _page_offset)) = cold_loc else {
5245 return Ok(None);
5246 };
5247 let u64_key = index_key_as_u64(key).ok_or_else(|| {
5248 StorageError::Corrupt(
5249 "promote_cold_row: key type not coercible to u64 (cold tier requires integer PK)"
5250 .into(),
5251 )
5252 })?;
5253 // Read the row body from the segment. Borrow the segment +
5254 // schema short-term so we can then take `&mut self` for the
5255 // hot-side insert.
5256 let schema = self
5257 .get(table_name)
5258 .ok_or_else(|| {
5259 StorageError::Corrupt(format!("promote_cold_row: table {table_name:?} not found"))
5260 })?
5261 .schema
5262 .clone();
5263 let seg = self
5264 .cold_segments
5265 .get(segment_id as usize)
5266 .and_then(|s| s.as_ref())
5267 .ok_or_else(|| {
5268 StorageError::Corrupt(format!(
5269 "promote_cold_row: segment {segment_id} not registered on catalog"
5270 ))
5271 })?;
5272 let payload = seg.lookup(u64_key).ok_or_else(|| {
5273 StorageError::Corrupt(format!(
5274 "promote_cold_row: key {u64_key} resolves to segment {segment_id} \
5275 but the segment's bloom/page lookup didn't return a row"
5276 ))
5277 })?;
5278 let (row, _consumed) = decode_row_body_dense(&payload, &schema, seg.long_strings())?;
5279 // Insert the promoted row into the hot tier. `Table::insert`
5280 // appends to `self.rows`, adds a `Hot(new_idx)` locator to
5281 // every BTree index covering the row's keyed columns, and
5282 // increments `hot_bytes`.
5283 let t = self
5284 .get_mut(table_name)
5285 .expect("table existed at lookup time");
5286 t.insert(row)?;
5287 let new_hot_idx =
5288 t.rows.len().checked_sub(1).ok_or_else(|| {
5289 StorageError::Corrupt("promote_cold_row: empty after insert".into())
5290 })?;
5291 // The hot insert added Hot(new_idx) alongside the still-
5292 // present Cold locator. Drop the Cold entry so future
5293 // lookups return only the fresh hot row.
5294 t.remove_cold_locators_for_key(index_name, key)?;
5295 Ok(Some(new_hot_idx))
5296 }
5297
5298 /// v5.2.3: shadow a frozen row's index entry. Used by DELETE
5299 /// when the row to remove lives in a cold-tier segment — the
5300 /// row body stays in the segment file (becoming garbage) but
5301 /// every `Cold` locator for `key` on `index_name` is removed
5302 /// so PK lookups stop returning it.
5303 ///
5304 /// Returns the number of cold locators retired (0 when the key
5305 /// has no cold entries — the DELETE fell on a hot row or a
5306 /// key that was already absent). Errors when the table /
5307 /// index doesn't exist or the index isn't `BTree`.
5308 ///
5309 /// Cold-segment compaction (which merges shadowed-heavy
5310 /// segments and reclaims their disk footprint) lands in a
5311 /// later v5.x sub-version; until then, repeated UPDATE/DELETE
5312 /// of cold rows can amplify cold-segment disk usage by up to
5313 /// 1-2× — still well under typical LSM-tree shadowing because
5314 /// SPG segments are bulk-baked, not write-merged.
5315 pub fn shadow_cold_row(
5316 &mut self,
5317 table_name: &str,
5318 index_name: &str,
5319 key: &IndexKey,
5320 ) -> Result<usize, StorageError> {
5321 let t = self.get_mut(table_name).ok_or_else(|| {
5322 StorageError::Corrupt(format!("shadow_cold_row: table {table_name:?} not found"))
5323 })?;
5324 t.remove_cold_locators_for_key(index_name, key)
5325 }
5326
5327 /// v6.7.4 — read-only slice preparation for the parallel
5328 /// freezer. Walks rows in `row_range`, builds the
5329 /// `(pk_u64, encoded_body, IndexKey)` triples that the
5330 /// coordinator's k-way merge consumes, sorts the slice by
5331 /// `pk_u64`, and returns a [`FreezeSlice`].
5332 ///
5333 /// Caller invariants:
5334 /// - `row_range.end <= table.rows.len()` (caller's job to
5335 /// compute the partition).
5336 /// - All slices passed to `commit_freeze_slices` must cover a
5337 /// contiguous half-open range `[0, total_max_rows)` with no
5338 /// gaps and no overlaps. The coordinator validates this
5339 /// invariant before committing.
5340 ///
5341 /// `&self`-only — multiple workers can run this concurrently
5342 /// against the same `Catalog` reference under the engine's
5343 /// write lock (workers don't mutate; the coordinator does).
5344 pub fn prepare_freeze_slice(
5345 &self,
5346 table_name: &str,
5347 index_name: &str,
5348 row_range: core::ops::Range<usize>,
5349 ) -> Result<FreezeSlice, StorageError> {
5350 let table = self.get(table_name).ok_or_else(|| {
5351 StorageError::Corrupt(format!(
5352 "prepare_freeze_slice: table {table_name:?} not found"
5353 ))
5354 })?;
5355 let idx = table
5356 .indices
5357 .iter()
5358 .find(|i| i.name == index_name)
5359 .ok_or_else(|| {
5360 StorageError::Corrupt(format!(
5361 "prepare_freeze_slice: index {index_name:?} not found on {table_name:?}"
5362 ))
5363 })?;
5364 if !matches!(idx.kind, IndexKind::BTree(_)) {
5365 return Err(StorageError::Corrupt(format!(
5366 "prepare_freeze_slice: index {index_name:?} is NSW; only BTree indices may freeze"
5367 )));
5368 }
5369 if row_range.end > table.rows.len() {
5370 return Err(StorageError::Corrupt(format!(
5371 "prepare_freeze_slice: row_range end {} > row_count {}",
5372 row_range.end,
5373 table.rows.len()
5374 )));
5375 }
5376 let column_position = idx.column_position;
5377 let schema = table.schema.clone();
5378 let mut rows: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(row_range.len());
5379 for row_idx in row_range.clone() {
5380 let row = table.rows.get(row_idx).expect("bounds-checked above");
5381 let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
5382 StorageError::Corrupt(format!(
5383 "prepare_freeze_slice: row {row_idx} has NULL / non-key value in index column"
5384 ))
5385 })?;
5386 let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
5387 StorageError::Corrupt(format!(
5388 "prepare_freeze_slice: index {index_name:?} column type is non-integer; \
5389 v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
5390 ))
5391 })?;
5392 rows.push((pk_u64, encode_row_body_dense(row, &schema), key));
5393 }
5394 rows.sort_by_key(|(k, _, _)| *k);
5395 Ok(FreezeSlice { row_range, rows })
5396 }
5397
5398 /// v6.7.4 — coordinator commit step. Merges N
5399 /// [`FreezeSlice`]s into one segment via the standard
5400 /// [`encode_segment`] path, atomically swaps the catalog
5401 /// state (delete the union row range + register Cold
5402 /// locators + load the segment).
5403 ///
5404 /// Validates that the slices cover a contiguous, gap-free,
5405 /// overlap-free half-open range starting at index 0 (the
5406 /// freezer always freezes "oldest first" — same semantics as
5407 /// the single-threaded [`Catalog::freeze_oldest_to_cold`]).
5408 ///
5409 /// Empty `slices` → no-op success (returns a zero-row report
5410 /// without mutating). Total row count = `Σ slice.rows.len()`.
5411 pub fn commit_freeze_slices(
5412 &mut self,
5413 table_name: &str,
5414 index_name: &str,
5415 slices: Vec<FreezeSlice>,
5416 ) -> Result<FreezeReport, StorageError> {
5417 // --- validation phase: never mutates ---------------------
5418 let table = self.get(table_name).ok_or_else(|| {
5419 StorageError::Corrupt(format!(
5420 "commit_freeze_slices: table {table_name:?} not found"
5421 ))
5422 })?;
5423 let idx = table
5424 .indices
5425 .iter()
5426 .find(|i| i.name == index_name)
5427 .ok_or_else(|| {
5428 StorageError::Corrupt(format!(
5429 "commit_freeze_slices: index {index_name:?} not found on {table_name:?}"
5430 ))
5431 })?;
5432 if !matches!(idx.kind, IndexKind::BTree(_)) {
5433 return Err(StorageError::Corrupt(format!(
5434 "commit_freeze_slices: index {index_name:?} is NSW; only BTree indices may freeze"
5435 )));
5436 }
5437 // Validate slice coverage: contiguous from 0, no gaps, no
5438 // overlaps. Allow the caller to pass slices in any order —
5439 // sort by row_range.start first.
5440 let mut ordered = slices;
5441 ordered.sort_by_key(|s| s.row_range.start);
5442 // Drop fully-empty slices that fell out of an uneven
5443 // partition; they carry no data but contribute to the
5444 // contiguity check, so keep them in line.
5445 let mut expected_start = 0usize;
5446 for s in &ordered {
5447 if s.row_range.start != expected_start {
5448 return Err(StorageError::Corrupt(format!(
5449 "commit_freeze_slices: gap/overlap at row {}; expected start {}",
5450 s.row_range.start, expected_start
5451 )));
5452 }
5453 expected_start = s.row_range.end;
5454 }
5455 let max_rows = expected_start;
5456 if max_rows > table.rows.len() {
5457 return Err(StorageError::Corrupt(format!(
5458 "commit_freeze_slices: total row range {} exceeds row_count {}",
5459 max_rows,
5460 table.rows.len()
5461 )));
5462 }
5463 if max_rows == 0 {
5464 return Ok(FreezeReport {
5465 segment_id: u32::MAX,
5466 frozen_rows: 0,
5467 bytes_freed: 0,
5468 segment_bytes: Vec::new(),
5469 });
5470 }
5471
5472 // --- segment build phase: reads only --------------------
5473 // K-way merge of already-sorted slices. Each slice's rows
5474 // are ascending by pk_u64; we keep a per-slice cursor and
5475 // pull the next-smallest head until every cursor drains.
5476 let total_rows: usize = ordered.iter().map(|s| s.rows.len()).sum();
5477 if total_rows != max_rows {
5478 return Err(StorageError::Corrupt(format!(
5479 "commit_freeze_slices: total slice rows {total_rows} ≠ row_range coverage {max_rows}"
5480 )));
5481 }
5482 let mut cursors: Vec<usize> = alloc::vec![0; ordered.len()];
5483 let mut merged: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(total_rows);
5484 loop {
5485 // Pick the slice whose head row has the smallest key
5486 // and isn't yet exhausted.
5487 let mut pick: Option<usize> = None;
5488 for (i, c) in cursors.iter().enumerate() {
5489 let slice = &ordered[i];
5490 if *c >= slice.rows.len() {
5491 continue;
5492 }
5493 match pick {
5494 None => pick = Some(i),
5495 Some(j) => {
5496 if slice.rows[*c].0 < ordered[j].rows[cursors[j]].0 {
5497 pick = Some(i);
5498 }
5499 }
5500 }
5501 }
5502 let Some(i) = pick else { break };
5503 let row = ordered[i].rows[cursors[i]].clone();
5504 cursors[i] += 1;
5505 merged.push(row);
5506 }
5507 // Reject duplicate PKs — same error as the single-threaded
5508 // path so callers get a uniform surface.
5509 for w in merged.windows(2) {
5510 if w[0].0 == w[1].0 {
5511 return Err(StorageError::Corrupt(format!(
5512 "commit_freeze_slices: duplicate PK {} across slices",
5513 w[0].0
5514 )));
5515 }
5516 }
5517 let post_swap_keys: Vec<IndexKey> = merged.iter().map(|(_, _, k)| k.clone()).collect();
5518 let seg_rows: Vec<(u64, Vec<u8>)> =
5519 merged.into_iter().map(|(k, body, _)| (k, body)).collect();
5520 let frozen_rows = seg_rows.len();
5521 let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
5522 .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: encode: {e}")))?;
5523
5524 // --- atomic swap phase: mutations only past this point ---
5525 let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
5526 let positions: Vec<usize> = (0..max_rows).collect();
5527 let t_mut = self
5528 .get_mut(table_name)
5529 .expect("just validated; still present");
5530 let removed = t_mut.delete_rows(&positions);
5531 debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
5532 let bytes_after = t_mut.hot_bytes();
5533 let bytes_freed = bytes_before.saturating_sub(bytes_after);
5534
5535 let segment_id = self
5536 .load_segment_bytes(seg_bytes.clone())
5537 .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: load: {e}")))?;
5538 let new_cold = post_swap_keys.into_iter().map(|k| {
5539 (
5540 k,
5541 RowLocator::Cold {
5542 segment_id,
5543 page_offset: 0,
5544 },
5545 )
5546 });
5547 let t_mut = self.get_mut(table_name).expect("still present");
5548 t_mut.register_cold_locators(index_name, new_cold)?;
5549
5550 Ok(FreezeReport {
5551 segment_id,
5552 frozen_rows,
5553 bytes_freed,
5554 segment_bytes: seg_bytes,
5555 })
5556 }
5557
5558 /// v6.7.3 — compact every cold segment on `(table, index)` whose
5559 /// `OwnedSegment::bytes().len()` is below `target_segment_bytes`
5560 /// into a single larger merged segment. Rows present in source
5561 /// segment payloads but no longer referenced by any
5562 /// `RowLocator::Cold` on the index (DELETE'd + frozen rows
5563 /// retired via [`Catalog::shadow_cold_row`]) are GC'd in the
5564 /// merge.
5565 ///
5566 /// **Semantics**:
5567 /// 1. Walk the BTree index to collect every Cold locator that
5568 /// targets a small (< threshold) segment. Each such
5569 /// `(key, segment_id)` becomes a row in the merged segment;
5570 /// payload is looked up from the source segment in-place.
5571 /// 2. Encode the collected rows into one new segment via
5572 /// [`encode_segment`]; register it via
5573 /// [`Catalog::load_segment_bytes`] (allocating a fresh
5574 /// `merged_segment_id` at the end of `cold_segments`).
5575 /// 3. Rewrite the BTree index in one pass: every
5576 /// `RowLocator::Cold { segment_id ∈ sources }` becomes
5577 /// `RowLocator::Cold { segment_id = merged_id, page_offset = 0 }`.
5578 /// Hot locators are untouched.
5579 /// 4. Tombstone every source slot via
5580 /// [`Catalog::tombstone_segment`]. Source segment payloads
5581 /// are no longer reachable through the catalog; the on-disk
5582 /// files are the caller's concern.
5583 ///
5584 /// On fewer than 2 candidate segments the catalog is **not**
5585 /// mutated and a no-op report (`merged_segment_id: None`,
5586 /// `sources: []`) is returned. This is the routine case — a
5587 /// freshly-frozen table has at most 1 small segment, no merge
5588 /// possible.
5589 ///
5590 /// Atomicity: every mutating step runs after the read-only
5591 /// gather phase, so a panic before the merge encode leaves the
5592 /// catalog unchanged. The mutation block itself (load + rewrite +
5593 /// tombstone) takes only `&mut self` — callers serialise the
5594 /// engine write lock outside this function.
5595 ///
5596 /// Errors when the table / index doesn't exist, the index isn't
5597 /// `BTree`, the index column type isn't u64-coercible (cold-tier
5598 /// pre-condition), or a source segment fails its in-place
5599 /// row-body lookup (would indicate prior catalog corruption).
5600 pub fn compact_cold_segments(
5601 &mut self,
5602 table_name: &str,
5603 index_name: &str,
5604 target_segment_bytes: u64,
5605 ) -> Result<CompactReport, StorageError> {
5606 // --- validation phase ----------------------------------
5607 let t = self.get(table_name).ok_or_else(|| {
5608 StorageError::Corrupt(format!(
5609 "compact_cold_segments: table {table_name:?} not found"
5610 ))
5611 })?;
5612 let idx = t
5613 .indices
5614 .iter()
5615 .find(|i| i.name == index_name)
5616 .ok_or_else(|| {
5617 StorageError::Corrupt(format!(
5618 "compact_cold_segments: index {index_name:?} not found on {table_name:?}"
5619 ))
5620 })?;
5621 let map = match &idx.kind {
5622 IndexKind::BTree(m) => m,
5623 IndexKind::Nsw(_)
5624 | IndexKind::Brin { .. }
5625 | IndexKind::Gin(_)
5626 | IndexKind::GinTrgm(_)
5627 | IndexKind::GinFulltext(_) => {
5628 return Err(StorageError::Corrupt(format!(
5629 "compact_cold_segments: index {index_name:?} is not BTree; \
5630 compaction applies only to BTree cold-tier indices"
5631 )));
5632 }
5633 };
5634
5635 // --- gather phase --------------------------------------
5636 // Step A: every segment_id this BTree index Cold-references.
5637 let mut referenced_ids: BTreeSet<u32> = BTreeSet::new();
5638 for (_key, locators) in map.iter() {
5639 for loc in locators {
5640 if let RowLocator::Cold { segment_id, .. } = loc {
5641 referenced_ids.insert(*segment_id);
5642 }
5643 }
5644 }
5645 // Step B: keep only the small + still-active ones.
5646 let candidate_set: BTreeSet<u32> = referenced_ids
5647 .into_iter()
5648 .filter(|id| {
5649 self.cold_segments
5650 .get(*id as usize)
5651 .and_then(|s| s.as_deref())
5652 .is_some_and(|s| (s.bytes().len() as u64) < target_segment_bytes)
5653 })
5654 .collect();
5655 if candidate_set.len() < 2 {
5656 return Ok(CompactReport {
5657 sources: Vec::new(),
5658 merged_segment_id: None,
5659 merged_segment_bytes: Vec::new(),
5660 merged_rows: 0,
5661 deleted_rows_pruned: 0,
5662 bytes_reclaimed_estimate: 0,
5663 });
5664 }
5665 // Step C: pre-count source rows for the deleted-pruned metric.
5666 let mut source_row_count: usize = 0;
5667 let mut source_byte_total: u64 = 0;
5668 for &id in &candidate_set {
5669 let seg = self.cold_segments[id as usize]
5670 .as_ref()
5671 .expect("candidate selected only when slot is Some");
5672 source_row_count = source_row_count.saturating_add(seg.meta().num_rows as usize);
5673 source_byte_total = source_byte_total.saturating_add(seg.bytes().len() as u64);
5674 }
5675 // Step D: collect (key, body) pairs from every live Cold
5676 // locator pointing at a candidate. dedupe by key — one
5677 // BTree key resolves to at most one cold payload (the
5678 // freezer + promote/shadow flow keeps Cold locators
5679 // unique per key).
5680 let mut collected: BTreeMap<u64, (Vec<u8>, IndexKey)> = BTreeMap::new();
5681 for (key, locators) in map.iter() {
5682 for loc in locators {
5683 let RowLocator::Cold { segment_id, .. } = loc else {
5684 continue;
5685 };
5686 if !candidate_set.contains(segment_id) {
5687 continue;
5688 }
5689 let u64_key = index_key_as_u64(key).ok_or_else(|| {
5690 StorageError::Corrupt(format!(
5691 "compact_cold_segments: index {index_name:?} has non-integer Cold key; \
5692 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
5693 ))
5694 })?;
5695 let seg = self.cold_segments[*segment_id as usize]
5696 .as_ref()
5697 .expect("candidate slot guaranteed Some above");
5698 let payload = seg.lookup(u64_key).ok_or_else(|| {
5699 StorageError::Corrupt(format!(
5700 "compact_cold_segments: BTree {index_name:?} points key={u64_key} \
5701 at segment {segment_id} but the segment lookup missed"
5702 ))
5703 })?;
5704 collected.insert(u64_key, (payload, key.clone()));
5705 break;
5706 }
5707 }
5708 let merged_rows = collected.len();
5709 let deleted_rows_pruned = source_row_count.saturating_sub(merged_rows);
5710
5711 // Step E: encode the merged segment. `BTreeMap<u64, _>`
5712 // iteration is ascending by key, which is what
5713 // `encode_segment` requires.
5714 let seg_rows: Vec<(u64, Vec<u8>)> = collected
5715 .iter()
5716 .map(|(k, (body, _))| (*k, body.clone()))
5717 .collect();
5718 let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
5719 .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: encode: {e}")))?;
5720 let merged_bytes_len = seg_bytes.len() as u64;
5721
5722 // --- atomic mutation phase ------------------------------
5723 let merged_segment_id = self
5724 .load_segment_bytes(seg_bytes.clone())
5725 .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: load: {e}")))?;
5726
5727 // Rewrite the BTree index: every Cold locator pointing at
5728 // a candidate source becomes a Cold locator pointing at
5729 // the merged segment. Use a flat collect-then-replace
5730 // pattern so we never hold a `&self` borrow across the
5731 // `&mut self` write.
5732 let entries: Vec<(IndexKey, Vec<RowLocator>)> = {
5733 let t = self
5734 .get(table_name)
5735 .expect("table existed at the start of this fn");
5736 let idx = t
5737 .indices
5738 .iter()
5739 .find(|i| i.name == index_name)
5740 .expect("index existed at the start of this fn");
5741 let IndexKind::BTree(map) = &idx.kind else {
5742 unreachable!("validated above");
5743 };
5744 map.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
5745 };
5746 let t_mut = self
5747 .get_mut(table_name)
5748 .expect("table existed at the start of this fn");
5749 let idx_mut = t_mut
5750 .indices
5751 .iter_mut()
5752 .find(|i| i.name == index_name)
5753 .expect("index existed at the start of this fn");
5754 let IndexKind::BTree(map_mut) = &mut idx_mut.kind else {
5755 unreachable!("validated above");
5756 };
5757 for (key, locators) in entries {
5758 let mut new_locs: Vec<RowLocator> = Vec::with_capacity(locators.len());
5759 let mut changed = false;
5760 for loc in &locators {
5761 match *loc {
5762 RowLocator::Cold {
5763 segment_id,
5764 page_offset: _,
5765 } if candidate_set.contains(&segment_id) => {
5766 let replacement = RowLocator::Cold {
5767 segment_id: merged_segment_id,
5768 page_offset: 0,
5769 };
5770 if !new_locs.contains(&replacement) {
5771 new_locs.push(replacement);
5772 }
5773 changed = true;
5774 }
5775 other => new_locs.push(other),
5776 }
5777 }
5778 if changed {
5779 map_mut.insert_mut(key, new_locs);
5780 }
5781 }
5782
5783 // Tombstone every source slot. Last step — failures here
5784 // would leave the segment double-referenced in both
5785 // memory + manifest, but `tombstone_segment` only errors
5786 // on out-of-bounds, which we've already validated.
5787 for &id in &candidate_set {
5788 self.tombstone_segment(id)?;
5789 }
5790
5791 let bytes_reclaimed_estimate = source_byte_total.saturating_sub(merged_bytes_len);
5792 Ok(CompactReport {
5793 sources: candidate_set.into_iter().collect(),
5794 merged_segment_id: Some(merged_segment_id),
5795 merged_segment_bytes: seg_bytes,
5796 merged_rows,
5797 deleted_rows_pruned,
5798 bytes_reclaimed_estimate,
5799 })
5800 }
5801
5802 /// Internal helper: scan `(table, index)` for a `Cold` locator
5803 /// keyed by `key`. Returns `Ok(Some((segment_id, page_offset)))`
5804 /// when found, `Ok(None)` when the key has only hot entries
5805 /// or no entries at all, `Err` on the same input-validation
5806 /// errors as the public `promote_cold_row` / `shadow_cold_row`.
5807 fn find_cold_locator(
5808 &self,
5809 table_name: &str,
5810 index_name: &str,
5811 key: &IndexKey,
5812 ) -> Result<Option<(u32, u32)>, StorageError> {
5813 let t = self.get(table_name).ok_or_else(|| {
5814 StorageError::Corrupt(format!("find_cold_locator: table {table_name:?} not found"))
5815 })?;
5816 let idx = t
5817 .indices
5818 .iter()
5819 .find(|i| i.name == index_name)
5820 .ok_or_else(|| {
5821 StorageError::Corrupt(format!(
5822 "find_cold_locator: index {index_name:?} not found on {table_name:?}"
5823 ))
5824 })?;
5825 if !matches!(idx.kind, IndexKind::BTree(_)) {
5826 return Err(StorageError::Corrupt(format!(
5827 "find_cold_locator: index {index_name:?} is NSW; promote-on-write only applies to BTree indices"
5828 )));
5829 }
5830 for loc in idx.lookup_eq(key) {
5831 if let RowLocator::Cold {
5832 segment_id,
5833 page_offset,
5834 } = *loc
5835 {
5836 return Ok(Some((segment_id, page_offset)));
5837 }
5838 }
5839 Ok(None)
5840 }
5841}
5842
5843/// Coerce an [`IndexKey`] to the `u64` that v5.1 cold-tier
5844/// segments use as their on-disk PK. Returns `None` for keys that
5845/// aren't representable as `u64` — Text PKs need a hash mapping
5846/// the segment writer baked in (deferred to v5.2+), Bool PKs are
5847/// almost never wide enough to be sharded into a cold tier.
5848fn index_key_as_u64(key: &IndexKey) -> Option<u64> {
5849 match key {
5850 // Reinterpret the i64 bit pattern as u64. Cold-tier segments
5851 // are sorted by this u64 view, so the chosen interpretation
5852 // only has to match between insert (bake_segment / freezer)
5853 // and lookup — using cast_unsigned keeps both sides honest
5854 // and silences clippy::cast_sign_loss.
5855 IndexKey::Int(n) => Some(n.cast_unsigned()),
5856 // Text / Bool / Uuid PKs aren't representable as u64 and so
5857 // can't participate in the u64-sorted cold-tier segment
5858 // PK layout. Same deferral story as Text — lookup falls
5859 // through the in-memory btree.
5860 IndexKey::Text(_) | IndexKey::Bool(_) | IndexKey::Uuid(_) => None,
5861 }
5862}
5863
/// Error type returned by the storage operations in this crate.
///
/// Marked `#[non_exhaustive]`, so new variants can be added without
/// breaking downstream matches that carry a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum StorageError {
    /// A table with this name already exists.
    DuplicateTable {
        name: String,
    },
    /// No table with this name exists.
    TableNotFound {
        name: String,
    },
    /// A row carried `actual` values where `expected` columns were
    /// required.
    ArityMismatch {
        expected: usize,
        actual: usize,
    },
    /// A value of type `actual` was supplied for `column`, which
    /// expects `expected`. `position` is reported next to the column
    /// name in the rendered message (presumably the column's index in
    /// the row — confirm against the producer).
    TypeMismatch {
        column: String,
        expected: DataType,
        actual: DataType,
        position: usize,
    },
    /// A NULL value was supplied for a NOT NULL column.
    NullInNotNull {
        column: String,
    },
    /// Index with this name already exists on the table.
    DuplicateIndex {
        name: String,
    },
    /// Column referenced by an index doesn't exist on the table.
    ColumnNotFound {
        column: String,
    },
    /// On-disk format failed to parse — corrupted file, wrong magic, truncated
    /// payload, or unknown tag bytes.
    ///
    /// The cold-tier helpers in this file (`compact_cold_segments`,
    /// `find_cold_locator`) also use this variant for failed lookups
    /// and validation, so the detail string is not always about raw
    /// bytes.
    Corrupt(String),
    /// v6.0.4 — ALTER INDEX targeted an index name that doesn't
    /// exist on any table in this catalog.
    IndexNotFound {
        name: String,
    },
    /// v6.0.4 — operation requested isn't supported on this index
    /// kind / column type (e.g. ALTER INDEX REBUILD on a `BTree`
    /// index, or REBUILD WITH (encoding=…) on a non-vector column).
    Unsupported(String),
}
5907
5908impl fmt::Display for StorageError {
5909 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
5910 match self {
5911 Self::DuplicateTable { name } => write!(f, "table already exists: {name}"),
5912 Self::TableNotFound { name } => write!(f, "table not found: {name}"),
5913 Self::ArityMismatch { expected, actual } => write!(
5914 f,
5915 "row arity mismatch: expected {expected} columns, got {actual}"
5916 ),
5917 Self::TypeMismatch {
5918 column,
5919 expected,
5920 actual,
5921 position,
5922 } => write!(
5923 f,
5924 "type mismatch in column {column:?} (position {position}): expected {expected}, got {actual}"
5925 ),
5926 Self::NullInNotNull { column } => {
5927 write!(f, "NULL value in NOT NULL column {column:?}")
5928 }
5929 Self::DuplicateIndex { name } => write!(f, "index already exists: {name}"),
5930 Self::ColumnNotFound { column } => write!(f, "column not found: {column}"),
5931 Self::Corrupt(detail) => write!(f, "corrupt on-disk format: {detail}"),
5932 Self::IndexNotFound { name } => write!(f, "index not found: {name}"),
5933 Self::Unsupported(detail) => write!(f, "unsupported: {detail}"),
5934 }
5935 }
5936}
5937
5938impl ColumnSchema {
5939 pub fn new(name: impl Into<String>, ty: DataType, nullable: bool) -> Self {
5940 Self {
5941 name: name.into(),
5942 ty,
5943 nullable,
5944 default: None,
5945 runtime_default: None,
5946 auto_increment: false,
5947 user_enum_type: None,
5948 user_domain_type: None,
5949 on_update_runtime: None,
5950 collation: Collation::Binary,
5951 is_unsigned: false,
5952 inline_enum_variants: None,
5953 inline_set_variants: None,
5954 }
5955 }
5956
5957 /// Builder-style helper to attach a default value to an otherwise
5958 /// plain column schema. Used by the engine when CREATE TABLE
5959 /// specifies `column TYPE DEFAULT <expr>`.
5960 #[must_use]
5961 pub fn with_default(mut self, default: Value) -> Self {
5962 self.default = Some(default);
5963 self
5964 }
5965
5966 /// v7.9.21 — builder for runtime-evaluated defaults
5967 /// (`DEFAULT now()`, `DEFAULT CURRENT_TIMESTAMP`, …).
5968 /// `expr` is the Expr's `Display` form, re-parsed by the
5969 /// engine at each INSERT.
5970 #[must_use]
5971 pub fn with_runtime_default(mut self, expr: impl Into<String>) -> Self {
5972 self.runtime_default = Some(expr.into());
5973 self
5974 }
5975
5976 /// Builder-style helper to mark a column as `AUTO_INCREMENT`.
5977 #[must_use]
5978 pub const fn with_auto_increment(mut self) -> Self {
5979 self.auto_increment = true;
5980 self
5981 }
5982}
5983
5984impl TableSchema {
5985 pub fn new(name: impl Into<String>, columns: Vec<ColumnSchema>) -> Self {
5986 Self {
5987 name: name.into(),
5988 columns,
5989 hot_tier_bytes: None,
5990 foreign_keys: Vec::new(),
5991 uniqueness_constraints: Vec::new(),
5992 checks: Vec::new(),
5993 }
5994 }
5995}
5996
5997// =========================================================================
5998// Persistent binary format for the catalog.
5999//
6000// Layout (little-endian throughout):
6001//
6002// [magic "SPGDB001" 8 bytes][version u8]
6003// [table_count u32]
6004// for each table:
6005// [name_len u16][name bytes]
6006// [col_count u16]
6007// for each col:
6008// [name_len u16][name bytes]
6009// [type_tag u8 + optional payload]
6010// 1=Int 2=BigInt 3=Float 4=Text 5=Bool
6011// 6=Vector(u32 dim)
6012// 7=SmallInt
6013// 8=Varchar(u32 max)
6014// 9=Char(u32 size)
6015// 10=Numeric(u8 precision, u8 scale)
6016// 11=Date
6017// 12=Timestamp
6018// [nullable u8] 0/1
6019// [default_tag u8] 0=none 1=value (followed by [value_tag u8] + bytes)
6020// [row_count u32]
6021// for each row, for each col, one [value_tag u8] + value bytes:
6022// tag 0 (Null) → no body
6023// tag 1 (Int) → i32 LE
6024// tag 2 (BigInt) → i64 LE
6025// tag 3 (Float) → f64 LE
6026// tag 4 (Text) → u16 LE len + UTF-8 bytes
6027// tag 5 (Bool) → u8 0/1
6028// tag 6 (Vector) → u32 LE dim + dim×f32 LE
6029// tag 7 (SmallInt) → i16 LE
6030// tag 8 (Numeric) → i128 LE (16 bytes) + u8 scale
6031// tag 9 (Date) → i32 LE (days since Unix epoch)
6032// tag 10 (Timestamp) → i64 LE (microseconds since Unix epoch)
6033//
6034// Bumped to version 3 when NUMERIC was added; to version 4 when
6035// AUTO_INCREMENT (per-column flag) + NSW index `kind` byte landed;
6036// to version 5 when DATE / TIMESTAMP were added; to version 6 when
6037// NSW graph topology started travelling on disk (v2.7); to version 7
6038// when the NSW topology became multi-layer HNSW (v2.13); to version 8
6039// when row encoding switched to schema-driven dense layout (v3.0.2 —
6040// per-row NULL bitmap + per-column fixed-width body, no per-cell type
6041// tag).
6042// =========================================================================
6043
/// Eight-byte magic that [`Catalog::serialize`] writes at the very
/// start of a snapshot, immediately before the [`FILE_VERSION`] byte.
const FILE_MAGIC: &[u8; 8] = b"SPGDB001";
6045/// Current catalog snapshot format version emitted by [`Catalog::serialize`].
6046///
6047/// v9 (v5.2) extends v8 by serialising `BTree` index entries directly — every
6048/// `(IndexKey, Vec<RowLocator>)` pair travels on disk with the v5.1
6049/// `RowLocator::write_le` tag-prefixed codec. v8 `BTree` indices stored no
6050/// entries at all (the map was rebuilt from `Table::rows` on load); v9
6051/// preserves on-disk Cold locators so freezer-produced cold-tier index
/// entries survive a catalog snapshot round-trip. v8 snapshots are still
/// accepted by version dispatch in [`Catalog::deserialize`] — every entry
/// decodes as `RowLocator::Hot(_)` via `add_index` rebuild, identical to
/// v5.1 behaviour.
6056/// v6.7.2 — bumped from 10 to 11 to append per-table
6057/// `hot_tier_bytes: Option<u64>` after the per-table indices
6058/// section. v10 catalogs (v6.7.1) load with `hot_tier_bytes =
6059/// None` for every table (the deserialiser short-circuits when
/// version < 11). v11 snapshots opened by a pre-v6.7.2 binary
/// fail loudly at the version check, matching the v6.1.2 /
6062/// v6.1.4 / v6.2.0 / v6.7.1 envelope-bump upgrade fences.
6063///
6064/// v6.8.0 — bumped from 11 to 12: per-index
6065/// `included_columns: Vec<u16>` appended at the tail of each
6066/// index payload. v11 (= v6.7.2) catalogs load with
6067/// `included_columns = Vec::new()` for every index — same
6068/// "older readers, append-only extension" pattern as the v6.7.2
6069/// hot_tier_bytes byte.
6070/// v7.13.0 — bumped from 22 to 23. mailrs round-5 G3 / G10.
6071/// Per-table appendix gains two new sections:
6072/// * `checks: Vec<String>` — CHECK predicate sources (Display
6073/// form of the AST Expr); re-parsed on INSERT/UPDATE to
6074/// enforce against candidate rows. Same persistence pattern
6075/// as `Index::partial_predicate`.
6076/// * Per `UniquenessConstraint`: trailing `nulls_not_distinct:
6077/// u8` flag for PG 15+ `UNIQUE NULLS NOT DISTINCT (cols)`
6078/// semantics.
6079/// v22 catalogs deserialise with empty `checks` and every UC
6080/// at `nulls_not_distinct = false`.
6081/// v24 introduces:
6082/// * Index kind tag 4 = trigram-GIN (`gin_trgm_ops`-flavoured
6083/// `USING gin` over a TEXT/VARCHAR column). Payload shape is
6084/// identical to tag-3 GIN (String → Vec<RowLocator>); the
6085/// keys are PG-compatible 3-byte trigram shingles instead of
6086/// tsvector lexemes. v23 catalogs deserialise unchanged — no
6087/// v23 writer ever emitted tag 4.
6088/// v25 introduces:
6089/// * Per `TriggerDef`: trailing `enabled: u8` flag (mailrs
6090/// round-9 A.2.b — `ALTER TABLE … { ENABLE | DISABLE }
6091/// TRIGGER …`). v24 catalogs deserialise with every trigger
6092/// `enabled = true`, matching pre-v7.16.1 behaviour.
6093/// v26 introduces (v7.17.0 Phase 1.1):
6094/// * Trailing SEQUENCE catalog block after triggers. Encoded
6095/// as `u32 count` followed by per-sequence:
6096/// `name`, `data_type: u8` (0=SmallInt,1=Int,2=BigInt),
6097/// `start i64`, `increment i64`, `min_value i64`,
6098/// `max_value i64`, `cache i64`, `cycle u8`,
6099/// `owned_by_tag u8` (0=NONE, 1=Column → `table`,`column`),
6100/// `last_value i64`, `is_called u8`. v25-and-below catalogs
6101/// deserialise with an empty sequences map.
6102/// v27 introduces (v7.17.0 Phase 1.2):
6103/// * Trailing VIEW catalog block after sequences. Encoded as
6104/// `u32 count` followed by per-view:
6105/// `name`, `column_count u16`, then column names, then
6106/// `body` long-string. v26-and-below catalogs deserialise
6107/// with an empty views map.
6108/// v28 introduces (v7.17.0 Phase 1.3):
6109/// * Trailing MATERIALIZED VIEW source registry block after
6110/// views. Encoded as `u32 count` followed by per-entry:
6111/// `name`, `body` long-string. The materialised rows live
6112/// as a regular Table of the same name (already covered by
6113/// the pre-existing tables block). v27-and-below catalogs
6114/// deserialise with an empty map.
6115/// v29 introduces (v7.17.0 Phase 1.4):
6116/// * Per-table user_enum_type appendix (after the CHECK
6117/// appendix). Layout: `u16 count` followed by per-binding
6118/// `[u16 col_pos][str enum_name]`. Only columns whose
6119/// `user_enum_type` is Some land here; the catalog stays
6120/// compact for the common no-enum case.
6121/// * Trailing ENUM types catalog block after materialized
6122/// views. Encoded as `u32 count` followed by per-entry:
6123/// `name`, `u16 label_count`, then `label_count` short
6124/// strings. v28-and-below catalogs deserialise with an
6125/// empty enum_types map and every column's
6126/// `user_enum_type = None`.
6127/// v30 introduces (v7.17.0 Phase 1.5):
6128/// * Per-table user_domain_type appendix (after the
6129/// user_enum_type appendix). Same shape as the enum one.
6130/// * Trailing DOMAIN types catalog block after the enum
6131/// block. Encoded as `u32 count` followed by per-entry:
6132/// `name`, `data_type` byte, `nullable u8`,
6133/// `default_present u8` + optional default string,
6134/// `u16 check_count` then `check_count` Display-form
6135/// CHECK strings. v29-and-below catalogs deserialise with
6136/// an empty domain_types map and `user_domain_type = None`.
6137/// v31 introduces (v7.17.0 Phase 1.6):
6138/// * Trailing user-schemas block after the DOMAIN block.
6139/// Encoded as `u32 count` followed by `count` schema-name
6140/// short strings. Built-in schemas (`public`, `pg_catalog`,
6141/// `information_schema`) are NOT serialised — they're
6142/// hardcoded in `is_builtin_schema`. v30-and-below catalogs
6143/// deserialise with an empty user-schemas set.
6144/// v32 introduces (v7.17.0 Phase 2.1):
6145/// * Per-table on_update_runtime appendix (after the
6146/// user_domain_type appendix). Layout: `u16 count` followed
6147/// by per-binding `[u16 col_pos][str expr_src]`. Only
6148/// columns whose `on_update_runtime` is Some land here;
6149/// the catalog stays compact when no MySQL-shaped table
6150/// uses the attribute. v31-and-below catalogs deserialise
6151/// with every column's `on_update_runtime = None`.
6152/// v33 introduces (v7.17.0 Phase 2.2):
6153/// * Index kind tag 5 = fulltext-GIN (MySQL `FULLTEXT KEY`
6154/// surface over a TEXT / VARCHAR column). Payload shape is
6155/// identical to tag-3 / tag-4 GIN (`String → Vec<RowLocator>`);
6156/// the keys are lower-cased word lexemes (same rule as
6157/// `to_tsvector('simple', text)`). v32 catalogs deserialise
6158/// unchanged — no v32 writer ever emitted tag 5, and FULLTEXT
6159/// KEY was silently dropped pre-v7.17 so no rebuild shim is
6160/// needed for round-tripped catalogs.
6161/// v34 introduces (v7.17.0 Phase 2.5):
6162/// * Per-table collation appendix (after the on_update_runtime
6163/// appendix). Sparse layout: only columns whose `collation`
6164/// is non-Binary land here. `u16 count` then per-binding
6165/// `[u16 col_pos][u8 collation_tag]` where the tag matches
///     `Collation::TAG_*`. Snapshots written at v33-and-below
///     deserialise every column with `collation =
6168/// Binary`, preserving the prior byte-wise compare
6169/// semantics. Unknown tags read back as Binary too — keeps
6170/// a forward-compat path if a future v35 adds variants
6171/// and someone rolls back to a v34 reader.
6172/// v35 introduces (v7.17.0 Phase 4.4):
6173/// * Per-table is_unsigned appendix (after the collation
6174/// appendix). Sparse layout: only `is_unsigned = true`
6175/// columns land. `u16 count` then per-binding `[u16 col_pos]`.
6176/// v34-and-below catalogs deserialise every column as
6177/// `is_unsigned = false`, preserving the prior silent-
6178/// accept behaviour for negative inserts on UNSIGNED columns.
6179/// v46 introduces (v7.23, mailrs round-14):
6180/// * Escaped short-string codec — `write_str` lengths >= 0xFFFF
6181/// emit `[u16 0xFFFF][u32 real_len]` so TEXT cells (mail bodies,
6182/// document text) above 64 KiB encode instead of panicking.
6183/// One-way upgrade: v45-and-below readers reject v46 catalogs
6184/// loudly via the version gate; v46 readers decode v45 catalogs
6185/// with the plain-u16 rules (0xFFFF is a legitimate length
6186/// there).
6187const FILE_VERSION: u8 = 46;
/// Oldest format version [`Catalog::deserialize`] still accepts. v8 is the
/// v3.0.2 dense-row layout; pre-v8 catalogs require an offline migration.
const MIN_SUPPORTED_FILE_VERSION: u8 = 8;

// IndexKey wire format (v9; tag 3 is only persisted in FILE_VERSION 36+):
//   tag 0 = Int   → [i64 LE]
//   tag 1 = Text  → [u16 LE len + UTF-8 bytes] (via write_str / read_str)
//   tag 2 = Bool  → [u8 0/1]
//   tag 3 = Uuid  → [16 raw bytes, RFC 4122 byte order]
/// Tag byte for `IndexKey::Int`.
const INDEX_KEY_TAG_INT: u8 = 0;
/// Tag byte for `IndexKey::Text`.
const INDEX_KEY_TAG_TEXT: u8 = 1;
/// Tag byte for `IndexKey::Bool`.
const INDEX_KEY_TAG_BOOL: u8 = 2;
/// v7.17.0 — `IndexKey::Uuid([u8; 16])`. Body = raw 16 bytes
/// (RFC 4122 byte order). Persisted only in FILE_VERSION 36+
/// catalogs.
const INDEX_KEY_TAG_UUID: u8 = 3;
6203
6204impl Catalog {
6205 /// Serialize the whole catalog (schema + every row) into a self-contained
6206 /// byte buffer. Format is documented above the impl block.
6207 pub fn serialize(&self) -> Vec<u8> {
6208 let mut out = Vec::with_capacity(64);
6209 out.extend_from_slice(FILE_MAGIC);
6210 out.push(FILE_VERSION);
6211 write_u32(
6212 &mut out,
6213 u32::try_from(self.tables.len()).expect("≤ 4G tables"),
6214 );
6215 for t in &self.tables {
6216 write_str(&mut out, &t.schema.name);
6217 write_u16(
6218 &mut out,
6219 u16::try_from(t.schema.columns.len()).expect("≤ 65k columns/table"),
6220 );
6221 for c in &t.schema.columns {
6222 write_str(&mut out, &c.name);
6223 write_data_type(&mut out, c.ty);
6224 out.push(u8::from(c.nullable));
6225 match &c.default {
6226 None => out.push(0),
6227 Some(v) => {
6228 out.push(1);
6229 write_value(&mut out, v);
6230 }
6231 }
6232 out.push(u8::from(c.auto_increment));
6233 }
6234 write_u32(
6235 &mut out,
6236 u32::try_from(t.rows.len()).expect("≤ 4G rows/table"),
6237 );
6238 // v3.0.2 dense row encoding (FILE_VERSION 8): per-row NULL
6239 // bitmap, then tightly-packed bodies. Identical wire format
6240 // as before — extracted into `encode_row_body_dense` so cold-
6241 // tier segments (v5.1+) can share the encoding.
6242 for row in &t.rows {
6243 out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
6244 }
6245 // Index definitions. Per-index payload:
6246 // [name][col_pos u16][kind u8]
6247 // kind 0 = B-tree (no params — rebuilt on load)
6248 // kind 1 = NSW graph (u16 M + serialized graph)
6249 // For NSW the graph topology travels on disk so startup
6250 // doesn't re-run the O(n²M) rebuild — see v2.7 notes.
6251 write_u16(
6252 &mut out,
6253 u16::try_from(t.indices.len()).expect("≤ 65k indices/table"),
6254 );
6255 for idx in &t.indices {
6256 write_str(&mut out, &idx.name);
6257 write_u16(
6258 &mut out,
6259 u16::try_from(idx.column_position).expect("≤ 65k columns/table"),
6260 );
6261 match &idx.kind {
6262 IndexKind::BTree(map) => {
6263 out.push(0);
6264 // v9: serialise the full PB map. Each entry's
6265 // RowLocator list travels with the tag-prefixed
6266 // codec from `row_locator::write_le`, so freezer-
6267 // produced Cold locators survive a snapshot
6268 // round-trip. v8 BTree wrote nothing here and
6269 // rebuilt from rows — v9 readers tolerate v8 by
6270 // version dispatch in `Catalog::deserialize`.
6271 write_u32(
6272 &mut out,
6273 u32::try_from(map.len()).expect("≤ 4G index entries/index"),
6274 );
6275 for (key, locators) in map {
6276 write_index_key(&mut out, key);
6277 write_u32(
6278 &mut out,
6279 u32::try_from(locators.len()).expect("≤ 4G locators/key"),
6280 );
6281 for loc in locators {
6282 loc.write_le(&mut out);
6283 }
6284 }
6285 }
6286 IndexKind::Nsw(g) => {
6287 out.push(1);
6288 write_u16(&mut out, u16::try_from(g.m).expect("≤ 65k NSW neighbours"));
6289 write_nsw_graph(&mut out, g);
6290 }
6291 IndexKind::Brin { column_type } => {
6292 // v6.7.1 — tag byte 2 = BRIN. Payload is the
6293 // column type code (1 byte mapping to the
6294 // shared DataType numeric encoding); no
6295 // further data — BRIN summaries live in
6296 // cold segments, not the catalog.
6297 out.push(2);
6298 write_data_type(&mut out, *column_type);
6299 }
6300 IndexKind::Gin(map) => {
6301 // v7.12.3 — tag byte 3 = GIN. Payload mirrors
6302 // the BTree encoding but with String (lexeme
6303 // word) keys instead of IndexKey. Tag-prefixed
6304 // RowLocator codec so freezer-produced Cold
6305 // locators survive snapshot round-trip.
6306 // FILE_VERSION 21+; v20 catalogs never wrote a
6307 // GIN index (the AM degraded to BTree fallback
6308 // pre-v7.12.3), so no migration shim is needed.
6309 out.push(3);
6310 write_u32(
6311 &mut out,
6312 u32::try_from(map.len()).expect("≤ 4G GIN posting lists"),
6313 );
6314 for (word, locators) in map {
6315 write_str(&mut out, word);
6316 write_u32(
6317 &mut out,
6318 u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
6319 );
6320 for loc in locators {
6321 loc.write_le(&mut out);
6322 }
6323 }
6324 }
6325 IndexKind::GinTrgm(map) => {
6326 // v7.15.0 — tag byte 4 = GinTrgm
6327 // (`gin_trgm_ops` GIN over a TEXT column).
6328 // Payload shape is identical to tag-3 GIN —
6329 // `String → Vec<RowLocator>` posting lists.
6330 // The String keys are 3-byte trigrams instead
6331 // of tsvector lexemes; the deserializer
6332 // dispatches on the tag, not the key shape.
6333 // FILE_VERSION 24+; v23 catalogs never wrote
6334 // a trigram-GIN.
6335 out.push(4);
6336 write_u32(
6337 &mut out,
6338 u32::try_from(map.len()).expect("≤ 4G trigram-GIN posting lists"),
6339 );
6340 for (tri, locators) in map {
6341 write_str(&mut out, tri);
6342 write_u32(
6343 &mut out,
6344 u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
6345 );
6346 for loc in locators {
6347 loc.write_le(&mut out);
6348 }
6349 }
6350 }
6351 IndexKind::GinFulltext(map) => {
6352 // v7.17.0 Phase 2.2 — tag byte 5 =
6353 // GinFulltext (MySQL `FULLTEXT KEY` GIN
6354 // over a TEXT/VARCHAR column). Payload
6355 // shape mirrors tag-3 / tag-4 GIN —
6356 // `String → Vec<RowLocator>` posting
6357 // lists keyed by lower-cased word
6358 // lexemes. FILE_VERSION 33+; v32 catalogs
6359 // never wrote a fulltext-GIN (FULLTEXT
6360 // KEY was silently dropped pre-v7.17).
6361 out.push(5);
6362 write_u32(
6363 &mut out,
6364 u32::try_from(map.len()).expect("≤ 4G fulltext-GIN posting lists"),
6365 );
6366 for (lex, locators) in map {
6367 write_str(&mut out, lex);
6368 write_u32(
6369 &mut out,
6370 u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
6371 );
6372 for loc in locators {
6373 loc.write_le(&mut out);
6374 }
6375 }
6376 }
6377 }
6378 // v6.8.0 — included_columns appendix per index.
6379 // Layout: [u16 num_included][num × u16 column_position].
6380 // v11 readers stop before this u16 (deserialise loop
6381 // gated on version >= 12); v12+ readers always
6382 // consume it. Empty Vec serialises as a bare 0u16.
6383 write_u16(
6384 &mut out,
6385 u16::try_from(idx.included_columns.len()).expect("≤ 65k INCLUDE columns/index"),
6386 );
6387 for col_pos in &idx.included_columns {
6388 write_u16(
6389 &mut out,
6390 u16::try_from(*col_pos).expect("≤ 65k columns/table"),
6391 );
6392 }
6393 // v6.8.1 — partial_predicate appendix per index.
6394 // Layout: [u8 has_pred][u16 LE len][bytes (if has_pred)].
6395 // Same v12 gate as included_columns.
6396 match &idx.partial_predicate {
6397 None => out.push(0),
6398 Some(pred) => {
6399 out.push(1);
6400 write_str(&mut out, pred);
6401 }
6402 }
6403 // v6.8.2 — expression appendix. Same shape as
6404 // partial_predicate.
6405 match &idx.expression {
6406 None => out.push(0),
6407 Some(expr) => {
6408 out.push(1);
6409 write_str(&mut out, expr);
6410 }
6411 }
6412 // v7.9.29 — is_unique appendix (FILE_VERSION 16+).
6413 // Single byte 0/1. v15-and-below readers stop before
6414 // this byte; v16 readers always consume it. mailrs K1.
6415 out.push(u8::from(idx.is_unique));
6416 // v7.9.29 — extra_column_positions appendix.
6417 // Layout: [u16 count][count × u16 column_position].
6418 write_u16(
6419 &mut out,
6420 u16::try_from(idx.extra_column_positions.len())
6421 .expect("≤ 65k extra cols / index"),
6422 );
6423 for cp in &idx.extra_column_positions {
6424 write_u16(&mut out, u16::try_from(*cp).expect("≤ 65k columns/table"));
6425 }
6426 }
6427 // v6.7.2 — per-table hot_tier_bytes Option<u64>.
6428 // Layout: [u8 has_value][u64 LE value (if has_value)].
6429 // v10 readers stop before this byte (deserialise loop
6430 // gated on version >= 11); v11+ readers always
6431 // consume it.
6432 match t.schema.hot_tier_bytes {
6433 None => out.push(0),
6434 Some(n) => {
6435 out.push(1);
6436 out.extend_from_slice(&n.to_le_bytes());
6437 }
6438 }
6439 // v7.6.1 — FOREIGN KEY appendix (catalog FILE_VERSION 13+).
6440 // Layout: [u16 LE fk_count]
6441 // per fk:
6442 // [u8 has_name] [str name (if has_name)]
6443 // [u16 LE local_arity] [u16 LE local_pos]*arity
6444 // [str parent_table]
6445 // [u16 LE parent_arity] [u16 LE parent_pos]*arity
6446 // [u8 on_delete_tag] [u8 on_update_tag]
6447 // Older catalogs (v12 and below) skip this block entirely;
6448 // their reader stops before this byte.
6449 write_u16(
6450 &mut out,
6451 u16::try_from(t.schema.foreign_keys.len()).expect("≤ 65k FKs/table"),
6452 );
6453 for fk in &t.schema.foreign_keys {
6454 match &fk.name {
6455 None => out.push(0),
6456 Some(n) => {
6457 out.push(1);
6458 write_str(&mut out, n);
6459 }
6460 }
6461 write_u16(
6462 &mut out,
6463 u16::try_from(fk.local_columns.len()).expect("≤ 65k FK columns"),
6464 );
6465 for &p in &fk.local_columns {
6466 write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
6467 }
6468 write_str(&mut out, &fk.parent_table);
6469 write_u16(
6470 &mut out,
6471 u16::try_from(fk.parent_columns.len()).expect("≤ 65k FK parent columns"),
6472 );
6473 for &p in &fk.parent_columns {
6474 write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
6475 }
6476 out.push(fk.on_delete.tag());
6477 out.push(fk.on_update.tag());
6478 }
6479 // v7.9.19 — UniquenessConstraint appendix (catalog
6480 // FILE_VERSION 15+). Layout per table after the FK
6481 // block:
6482 // [u16 count]
6483 // per constraint:
6484 // [u8 is_primary_key]
6485 // [u16 arity][u16 col_pos]*arity
6486 // Older catalogs (v14 and below) skip this block.
6487 write_u16(
6488 &mut out,
6489 u16::try_from(t.schema.uniqueness_constraints.len())
6490 .expect("≤ 65k uniqueness constraints/table"),
6491 );
6492 for uc in &t.schema.uniqueness_constraints {
6493 out.push(u8::from(uc.is_primary_key));
6494 write_u16(
6495 &mut out,
6496 u16::try_from(uc.columns.len()).expect("≤ 65k cols in uniqueness constraint"),
6497 );
6498 for &p in &uc.columns {
6499 write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
6500 }
6501 // v7.13.0 — `nulls_not_distinct` flag
6502 // (FILE_VERSION 23+). Always written by writers at
6503 // version 23+; deserialise gates on `version >= 23`
6504 // so v22-and-below catalogs round-trip cleanly.
6505 out.push(u8::from(uc.nulls_not_distinct));
6506 }
6507 // v7.9.21 — runtime_default appendix per table.
6508 // Layout: [u16 count] then for each:
6509 // [u16 col_pos][str expr]
6510 // Only columns whose runtime_default is Some land here;
6511 // catalog stays compact for the common literal-default
6512 // case.
6513 let mut rt_defaults: Vec<(usize, &str)> = Vec::new();
6514 for (i, c) in t.schema.columns.iter().enumerate() {
6515 if let Some(e) = &c.runtime_default {
6516 rt_defaults.push((i, e.as_str()));
6517 }
6518 }
6519 write_u16(
6520 &mut out,
6521 u16::try_from(rt_defaults.len()).expect("≤ 65k runtime defaults/table"),
6522 );
6523 for (pos, expr) in rt_defaults {
6524 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6525 write_str(&mut out, expr);
6526 }
6527 // v7.13.0 — CHECK constraint appendix per table.
6528 // Layout: [u16 count] then `count` Display-form
6529 // expression strings. Re-parsed on every INSERT/UPDATE
6530 // by the engine. FILE_VERSION 23+ only; v22 readers
6531 // never reach this block because the writer also moves
6532 // to v23 in lock-step.
6533 write_u16(
6534 &mut out,
6535 u16::try_from(t.schema.checks.len()).expect("≤ 65k CHECK constraints/table"),
6536 );
6537 for c in &t.schema.checks {
6538 write_str(&mut out, c.as_str());
6539 }
6540 // v7.17.0 Phase 1.4 — per-table user_enum_type
6541 // appendix. Layout: [u16 count] then
6542 // [u16 col_pos][str enum_name] per binding. Only
6543 // columns whose user_enum_type is Some land here.
6544 let mut enum_bindings: Vec<(usize, &str)> = Vec::new();
6545 for (i, c) in t.schema.columns.iter().enumerate() {
6546 if let Some(e) = &c.user_enum_type {
6547 enum_bindings.push((i, e.as_str()));
6548 }
6549 }
6550 write_u16(
6551 &mut out,
6552 u16::try_from(enum_bindings.len()).expect("≤ 65k enum-typed columns/table"),
6553 );
6554 for (pos, ename) in enum_bindings {
6555 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6556 write_str(&mut out, ename);
6557 }
6558 // v7.17.0 Phase 1.5 — per-table user_domain_type
6559 // appendix. Same layout as the enum one. v29-and-
6560 // below readers stop after the enum appendix.
6561 let mut domain_bindings: Vec<(usize, &str)> = Vec::new();
6562 for (i, c) in t.schema.columns.iter().enumerate() {
6563 if let Some(d) = &c.user_domain_type {
6564 domain_bindings.push((i, d.as_str()));
6565 }
6566 }
6567 write_u16(
6568 &mut out,
6569 u16::try_from(domain_bindings.len()).expect("≤ 65k domain-typed columns/table"),
6570 );
6571 for (pos, dname) in domain_bindings {
6572 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6573 write_str(&mut out, dname);
6574 }
6575 // v7.17.0 Phase 2.1 — per-table on_update_runtime
6576 // appendix. Sparse: only ON UPDATE-bound columns.
6577 let mut on_update_bindings: Vec<(usize, &str)> = Vec::new();
6578 for (i, c) in t.schema.columns.iter().enumerate() {
6579 if let Some(e) = &c.on_update_runtime {
6580 on_update_bindings.push((i, e.as_str()));
6581 }
6582 }
6583 write_u16(
6584 &mut out,
6585 u16::try_from(on_update_bindings.len()).expect("≤ 65k ON UPDATE columns/table"),
6586 );
6587 for (pos, expr_src) in on_update_bindings {
6588 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6589 write_str(&mut out, expr_src);
6590 }
6591 // v7.17.0 Phase 2.5 — per-table collation appendix.
6592 // Sparse: only non-Binary columns land. Layout:
6593 // `[u16 count][u16 col_pos][u8 tag] × count`.
6594 let mut coll_bindings: Vec<(usize, u8)> = Vec::new();
6595 for (i, c) in t.schema.columns.iter().enumerate() {
6596 let tag = match c.collation {
6597 Collation::Binary => continue,
6598 Collation::CaseInsensitive => Collation::TAG_CASE_INSENSITIVE,
6599 };
6600 coll_bindings.push((i, tag));
6601 }
6602 write_u16(
6603 &mut out,
6604 u16::try_from(coll_bindings.len()).expect("≤ 65k collation bindings/table"),
6605 );
6606 for (pos, tag) in coll_bindings {
6607 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6608 out.push(tag);
6609 }
6610 // v7.17.0 Phase 4.4 — per-table is_unsigned appendix.
6611 // Sparse: only UNSIGNED columns land. Layout:
6612 // `[u16 count][u16 col_pos] × count`.
6613 let mut unsigned_bindings: Vec<usize> = Vec::new();
6614 for (i, c) in t.schema.columns.iter().enumerate() {
6615 if c.is_unsigned {
6616 unsigned_bindings.push(i);
6617 }
6618 }
6619 write_u16(
6620 &mut out,
6621 u16::try_from(unsigned_bindings.len()).expect("≤ 65k UNSIGNED columns/table"),
6622 );
6623 for pos in unsigned_bindings {
6624 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6625 }
6626 // v7.17.0 Phase 3.P0-36 — per-table inline_enum_variants
6627 // appendix. Sparse: only ENUM columns land. Layout:
6628 // `[u16 count] then per binding [u16 col_pos]
6629 // [u16 variant_count] then variant strings`.
6630 // FILE_VERSION 41+; v40 readers never reach this block.
6631 let mut enum_inline_bindings: Vec<(usize, &[String])> = Vec::new();
6632 for (i, c) in t.schema.columns.iter().enumerate() {
6633 if let Some(vs) = &c.inline_enum_variants {
6634 enum_inline_bindings.push((i, vs.as_slice()));
6635 }
6636 }
6637 write_u16(
6638 &mut out,
6639 u16::try_from(enum_inline_bindings.len()).expect("≤ 65k inline-ENUM columns/table"),
6640 );
6641 for (pos, variants) in enum_inline_bindings {
6642 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6643 write_u16(
6644 &mut out,
6645 u16::try_from(variants.len()).expect("≤ 65k variants/ENUM"),
6646 );
6647 for v in variants {
6648 write_str(&mut out, v.as_str());
6649 }
6650 }
6651 // v7.17.0 Phase 3.P0-37 — per-table inline_set_variants
6652 // appendix. Same layout as the inline ENUM block.
6653 // FILE_VERSION 42+; v41 readers never reach this block.
6654 let mut set_inline_bindings: Vec<(usize, &[String])> = Vec::new();
6655 for (i, c) in t.schema.columns.iter().enumerate() {
6656 if let Some(vs) = &c.inline_set_variants {
6657 set_inline_bindings.push((i, vs.as_slice()));
6658 }
6659 }
6660 write_u16(
6661 &mut out,
6662 u16::try_from(set_inline_bindings.len()).expect("≤ 65k inline-SET columns/table"),
6663 );
6664 for (pos, variants) in set_inline_bindings {
6665 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
6666 write_u16(
6667 &mut out,
6668 u16::try_from(variants.len()).expect("≤ 65k variants/SET"),
6669 );
6670 for v in variants {
6671 write_str(&mut out, v.as_str());
6672 }
6673 }
6674 }
6675 // v7.12.4 — catalog-wide appendix: user-defined functions
6676 // then triggers. FILE_VERSION 22+ only. v21 and earlier
6677 // readers stop after the last table; v22 readers always
6678 // consume two `u32` counts (possibly zero).
6679 //
6680 // Function entry layout:
6681 // [str name] [str args_repr] [str returns]
6682 // [str language] [str body]
6683 // Trigger entry layout:
6684 // [str name] [str table] [str timing]
6685 // [u16 event_count] (event_count × str)
6686 // [str for_each] [str function]
6687 write_u32(
6688 &mut out,
6689 u32::try_from(self.functions.len()).expect("≤ 4G functions"),
6690 );
6691 for fd in self.functions.values() {
6692 write_str(&mut out, &fd.name);
6693 write_str(&mut out, &fd.args_repr);
6694 write_str(&mut out, &fd.returns);
6695 write_str(&mut out, &fd.language);
6696 write_str_long(&mut out, &fd.body);
6697 }
6698 write_u32(
6699 &mut out,
6700 u32::try_from(self.triggers.len()).expect("≤ 4G triggers"),
6701 );
6702 for td in &self.triggers {
6703 write_str(&mut out, &td.name);
6704 write_str(&mut out, &td.table);
6705 write_str(&mut out, &td.timing);
6706 write_u16(
6707 &mut out,
6708 u16::try_from(td.events.len()).expect("≤ 65k events / trigger"),
6709 );
6710 for ev in &td.events {
6711 write_str(&mut out, ev);
6712 }
6713 write_str(&mut out, &td.for_each);
6714 write_str(&mut out, &td.function);
6715 // v7.13.0 — `UPDATE OF cols` filter
6716 // (FILE_VERSION 23+). v22 readers omit; v23 writers
6717 // always emit (possibly zero).
6718 write_u16(
6719 &mut out,
6720 u16::try_from(td.update_columns.len()).expect("≤ 65k cols / trigger"),
6721 );
6722 for c in &td.update_columns {
6723 write_str(&mut out, c);
6724 }
6725 // v7.16.1 — TriggerDef.enabled (FILE_VERSION 25+).
6726 out.push(u8::from(td.enabled));
6727 }
6728 // v7.17.0 Phase 1.1 — SEQUENCE catalog block (FILE_VERSION 26+).
6729 write_u32(
6730 &mut out,
6731 u32::try_from(self.sequences.len()).expect("≤ 4G sequences"),
6732 );
6733 for seq in self.sequences.values() {
6734 write_str(&mut out, &seq.name);
6735 out.push(match seq.data_type {
6736 SequenceDataType::SmallInt => 0,
6737 SequenceDataType::Int => 1,
6738 SequenceDataType::BigInt => 2,
6739 });
6740 out.extend_from_slice(&seq.start.to_le_bytes());
6741 out.extend_from_slice(&seq.increment.to_le_bytes());
6742 out.extend_from_slice(&seq.min_value.to_le_bytes());
6743 out.extend_from_slice(&seq.max_value.to_le_bytes());
6744 out.extend_from_slice(&seq.cache.to_le_bytes());
6745 out.push(u8::from(seq.cycle));
6746 match &seq.owned_by {
6747 None => out.push(0),
6748 Some((table, column)) => {
6749 out.push(1);
6750 write_str(&mut out, table);
6751 write_str(&mut out, column);
6752 }
6753 }
6754 out.extend_from_slice(&seq.last_value.to_le_bytes());
6755 out.push(u8::from(seq.is_called));
6756 }
6757 // v7.17.0 Phase 1.2 — VIEW catalog block (FILE_VERSION 27+).
6758 write_u32(
6759 &mut out,
6760 u32::try_from(self.views.len()).expect("≤ 4G views"),
6761 );
6762 for view in self.views.values() {
6763 write_str(&mut out, &view.name);
6764 write_u16(
6765 &mut out,
6766 u16::try_from(view.columns.len()).expect("≤ 65k cols / view"),
6767 );
6768 for c in &view.columns {
6769 write_str(&mut out, c);
6770 }
6771 write_str_long(&mut out, &view.body);
6772 }
6773 // v7.17.0 Phase 1.3 — MATERIALIZED VIEW source registry
6774 // (FILE_VERSION 28+). The backing rows live as a regular
6775 // table of the same name already in the tables block.
6776 write_u32(
6777 &mut out,
6778 u32::try_from(self.materialized_views.len()).expect("≤ 4G materialized views"),
6779 );
6780 for (name, body) in &self.materialized_views {
6781 write_str(&mut out, name);
6782 write_str_long(&mut out, body);
6783 }
6784 // v7.17.0 Phase 1.4 — ENUM types catalog block
6785 // (FILE_VERSION 29+).
6786 write_u32(
6787 &mut out,
6788 u32::try_from(self.enum_types.len()).expect("≤ 4G enum types"),
6789 );
6790 for e in self.enum_types.values() {
6791 write_str(&mut out, &e.name);
6792 write_u16(
6793 &mut out,
6794 u16::try_from(e.labels.len()).expect("≤ 65k labels / enum"),
6795 );
6796 for l in &e.labels {
6797 write_str(&mut out, l);
6798 }
6799 }
6800 // v7.17.0 Phase 1.5 — DOMAIN types catalog block
6801 // (FILE_VERSION 30+).
6802 write_u32(
6803 &mut out,
6804 u32::try_from(self.domain_types.len()).expect("≤ 4G domain types"),
6805 );
6806 for d in self.domain_types.values() {
6807 write_str(&mut out, &d.name);
6808 write_data_type(&mut out, d.base_type);
6809 out.push(u8::from(d.nullable));
6810 match &d.default {
6811 None => out.push(0),
6812 Some(s) => {
6813 out.push(1);
6814 write_str(&mut out, s);
6815 }
6816 }
6817 write_u16(
6818 &mut out,
6819 u16::try_from(d.checks.len()).expect("≤ 65k CHECKs / domain"),
6820 );
6821 for c in &d.checks {
6822 write_str(&mut out, c);
6823 }
6824 }
6825 // v7.17.0 Phase 1.6 — user-schemas registry
6826 // (FILE_VERSION 31+). Built-ins are hardcoded in
6827 // `is_builtin_schema` and not persisted.
6828 write_u32(
6829 &mut out,
6830 u32::try_from(self.schemas.len()).expect("≤ 4G schemas"),
6831 );
6832 for name in &self.schemas {
6833 write_str(&mut out, name);
6834 }
6835 out
6836 }
6837
6838 /// Deserialize a previously-serialized catalog. Rejects bad magic, version
6839 /// mismatch, unknown tags, truncation, and trailing bytes.
6840 pub fn deserialize(buf: &[u8]) -> Result<Self, StorageError> {
6841 let mut cur = Cursor::new(buf);
6842 let magic = cur.take(8)?;
6843 if magic != FILE_MAGIC {
6844 return Err(StorageError::Corrupt(format!(
6845 "bad magic: expected SPGDB001, got {magic:?}"
6846 )));
6847 }
6848 let version = cur.read_u8()?;
6849 if !(MIN_SUPPORTED_FILE_VERSION..=FILE_VERSION).contains(&version) {
6850 return Err(StorageError::Corrupt(format!(
6851 "unsupported file version: {version} (supported: {MIN_SUPPORTED_FILE_VERSION}..={FILE_VERSION})"
6852 )));
6853 }
6854 // v7.23 — string decoding is version-gated (see
6855 // STR_LEN_ESCAPE).
6856 cur.long_strings = version >= 46;
6857 let table_count = cur.read_u32()? as usize;
6858 let mut cat = Self::new();
6859 for _ in 0..table_count {
6860 deserialize_table(&mut cur, &mut cat, version)?;
6861 }
6862 // v7.12.4 — catalog-wide function + trigger appendix.
6863 // FILE_VERSION 22+ only; v21 and earlier catalogs stop
6864 // after the last table.
6865 if version >= 22 {
6866 let fn_count = cur.read_u32()? as usize;
6867 for _ in 0..fn_count {
6868 let name = cur.read_str()?;
6869 let args_repr = cur.read_str()?;
6870 let returns = cur.read_str()?;
6871 let language = cur.read_str()?;
6872 let body = cur.read_str_long()?;
6873 cat.functions.insert(
6874 name.clone(),
6875 FunctionDef {
6876 name,
6877 args_repr,
6878 returns,
6879 language,
6880 body,
6881 },
6882 );
6883 }
6884 let trg_count = cur.read_u32()? as usize;
6885 for _ in 0..trg_count {
6886 let name = cur.read_str()?;
6887 let table = cur.read_str()?;
6888 let timing = cur.read_str()?;
6889 let ev_count = cur.read_u16()? as usize;
6890 let mut events = Vec::with_capacity(ev_count);
6891 for _ in 0..ev_count {
6892 events.push(cur.read_str()?);
6893 }
6894 let for_each = cur.read_str()?;
6895 let function = cur.read_str()?;
6896 // v7.13.0 — trailing `UPDATE OF cols` filter
6897 // (FILE_VERSION 23+ only; v22 catalogs omit and
6898 // deserialise with an empty vec).
6899 let update_columns = if version >= 23 {
6900 let n = cur.read_u16()? as usize;
6901 let mut cols = Vec::with_capacity(n);
6902 for _ in 0..n {
6903 cols.push(cur.read_str()?);
6904 }
6905 cols
6906 } else {
6907 Vec::new()
6908 };
6909 // v7.16.1 — TriggerDef.enabled (FILE_VERSION 25+).
6910 // v24-and-below catalogs deserialise with `true`
6911 // — pre-v7.16.1 every trigger always fired.
6912 let enabled = if version >= 25 {
6913 cur.read_u8()? != 0
6914 } else {
6915 true
6916 };
6917 cat.triggers.push(TriggerDef {
6918 name,
6919 table,
6920 timing,
6921 events,
6922 for_each,
6923 function,
6924 update_columns,
6925 enabled,
6926 });
6927 }
6928 }
6929 // v7.17.0 Phase 1.1 — SEQUENCE block (FILE_VERSION 26+).
6930 // v25-and-below catalogs omit; we leave the map empty.
6931 if version >= 26 {
6932 let seq_count = cur.read_u32()? as usize;
6933 for _ in 0..seq_count {
6934 let name = cur.read_str()?;
6935 let data_type = match cur.read_u8()? {
6936 0 => SequenceDataType::SmallInt,
6937 1 => SequenceDataType::Int,
6938 2 => SequenceDataType::BigInt,
6939 other => {
6940 return Err(StorageError::Corrupt(format!(
6941 "unknown SEQUENCE data-type tag {other}"
6942 )));
6943 }
6944 };
6945 let start = cur.read_i64()?;
6946 let increment = cur.read_i64()?;
6947 let min_value = cur.read_i64()?;
6948 let max_value = cur.read_i64()?;
6949 let cache = cur.read_i64()?;
6950 let cycle = cur.read_u8()? != 0;
6951 let owned_by = match cur.read_u8()? {
6952 0 => None,
6953 1 => {
6954 let t = cur.read_str()?;
6955 let c = cur.read_str()?;
6956 Some((t, c))
6957 }
6958 other => {
6959 return Err(StorageError::Corrupt(format!(
6960 "unknown SEQUENCE owned-by tag {other}"
6961 )));
6962 }
6963 };
6964 let last_value = cur.read_i64()?;
6965 let is_called = cur.read_u8()? != 0;
6966 cat.sequences.insert(
6967 name.clone(),
6968 SequenceDef {
6969 name,
6970 data_type,
6971 start,
6972 increment,
6973 min_value,
6974 max_value,
6975 cache,
6976 cycle,
6977 owned_by,
6978 last_value,
6979 is_called,
6980 },
6981 );
6982 }
6983 }
6984 // v7.17.0 Phase 1.2 — VIEW block (FILE_VERSION 27+).
6985 // v26-and-below catalogs omit; we leave the map empty.
6986 if version >= 27 {
6987 let view_count = cur.read_u32()? as usize;
6988 for _ in 0..view_count {
6989 let name = cur.read_str()?;
6990 let col_count = cur.read_u16()? as usize;
6991 let mut columns = Vec::with_capacity(col_count);
6992 for _ in 0..col_count {
6993 columns.push(cur.read_str()?);
6994 }
6995 let body = cur.read_str_long()?;
6996 cat.views.insert(
6997 name.clone(),
6998 ViewDef {
6999 name,
7000 columns,
7001 body,
7002 },
7003 );
7004 }
7005 }
7006 // v7.17.0 Phase 1.3 — MATERIALIZED VIEW source registry
7007 // (FILE_VERSION 28+). v27-and-below catalogs omit.
7008 if version >= 28 {
7009 let mv_count = cur.read_u32()? as usize;
7010 for _ in 0..mv_count {
7011 let name = cur.read_str()?;
7012 let body = cur.read_str_long()?;
7013 cat.materialized_views.insert(name, body);
7014 }
7015 }
7016 // v7.17.0 Phase 1.4 — ENUM types catalog block
7017 // (FILE_VERSION 29+).
7018 if version >= 29 {
7019 let etype_count = cur.read_u32()? as usize;
7020 for _ in 0..etype_count {
7021 let name = cur.read_str()?;
7022 let label_count = cur.read_u16()? as usize;
7023 let mut labels = Vec::with_capacity(label_count);
7024 for _ in 0..label_count {
7025 labels.push(cur.read_str()?);
7026 }
7027 cat.enum_types
7028 .insert(name.clone(), EnumDef { name, labels });
7029 }
7030 }
7031 // v7.17.0 Phase 1.5 — DOMAIN types catalog block
7032 // (FILE_VERSION 30+).
7033 if version >= 30 {
7034 let dtype_count = cur.read_u32()? as usize;
7035 for _ in 0..dtype_count {
7036 let name = cur.read_str()?;
7037 let base_type = cur.read_data_type()?;
7038 let nullable = cur.read_u8()? != 0;
7039 let default = match cur.read_u8()? {
7040 0 => None,
7041 1 => Some(cur.read_str()?),
7042 other => {
7043 return Err(StorageError::Corrupt(format!(
7044 "unknown DOMAIN default tag {other}"
7045 )));
7046 }
7047 };
7048 let check_count = cur.read_u16()? as usize;
7049 let mut checks = Vec::with_capacity(check_count);
7050 for _ in 0..check_count {
7051 checks.push(cur.read_str()?);
7052 }
7053 cat.domain_types.insert(
7054 name.clone(),
7055 DomainDef {
7056 name,
7057 base_type,
7058 nullable,
7059 default,
7060 checks,
7061 },
7062 );
7063 }
7064 }
7065 // v7.17.0 Phase 1.6 — user-schemas registry
7066 // (FILE_VERSION 31+).
7067 if version >= 31 {
7068 let sch_count = cur.read_u32()? as usize;
7069 for _ in 0..sch_count {
7070 let name = cur.read_str()?;
7071 cat.schemas.insert(name);
7072 }
7073 }
7074 if cur.pos < buf.len() {
7075 return Err(StorageError::Corrupt(format!(
7076 "trailing bytes: {} unread",
7077 buf.len() - cur.pos
7078 )));
7079 }
7080 Ok(cat)
7081 }
7082}
7083
7084/// Per-table deserialize body — schema, rows, indices. Pulled out of
7085/// `Catalog::deserialize` to keep the latter under the line-budget lint
7086/// and to give the row hot loop its own scope (so the borrow on `t`
7087/// stays scoped here rather than across the whole catalog loop).
7088fn deserialize_table(
7089 cur: &mut Cursor<'_>,
7090 cat: &mut Catalog,
7091 version: u8,
7092) -> Result<(), StorageError> {
7093 let table_name = cur.read_str()?;
7094 let name = table_name.clone();
7095 let col_count = cur.read_u16()? as usize;
7096 let mut cols = Vec::with_capacity(col_count);
7097 for _ in 0..col_count {
7098 let c_name = cur.read_str()?;
7099 let ty = cur.read_data_type()?;
7100 let nullable = cur.read_u8()? != 0;
7101 let default = match cur.read_u8()? {
7102 0 => None,
7103 1 => Some(cur.read_value()?),
7104 other => {
7105 return Err(StorageError::Corrupt(format!(
7106 "unknown default tag: {other}"
7107 )));
7108 }
7109 };
7110 let auto_increment = cur.read_u8()? != 0;
7111 // Note: deserialiser sets runtime_default = None for
7112 // older catalogs (≤ v14). v15+ reads it from the
7113 // per-column appendix below.
7114 cols.push(ColumnSchema {
7115 name: c_name,
7116 ty,
7117 nullable,
7118 default,
7119 runtime_default: None,
7120 auto_increment,
7121 user_enum_type: None,
7122 user_domain_type: None,
7123 on_update_runtime: None,
7124 collation: Collation::Binary,
7125 is_unsigned: false,
7126 inline_enum_variants: None,
7127 inline_set_variants: None,
7128 });
7129 }
7130 let n_cols = cols.len();
7131 cat.create_table(TableSchema::new(name, cols))?;
7132 // Vec<Table> with insertion-order semantics — the just-pushed
7133 // table is at the end. Sidecar `by_name` is already wired up but
7134 // we skip the map lookup here since we know the position.
7135 let t = cat.tables.last_mut().expect("create_table just pushed");
7136 deserialize_rows(cur, t, n_cols)?;
7137 deserialize_indices(cur, t, version)?;
7138 // v6.7.2 — per-table hot_tier_bytes appendix. v11+ writes
7139 // `[u8 has_value][u64 LE value (if has_value)]`. v10 / v9 / v8
7140 // catalogs skip this entirely (the deserialiser reads no extra
7141 // bytes; the table's hot_tier_bytes stays None from
7142 // TableSchema::new).
7143 if version >= 11 {
7144 let has = cur.read_u8()?;
7145 let hot_tier_bytes = match has {
7146 0 => None,
7147 1 => Some(cur.read_u64()?),
7148 other => {
7149 return Err(StorageError::Corrupt(format!(
7150 "hot_tier_bytes appendix: unknown has-value byte {other}"
7151 )));
7152 }
7153 };
7154 t.schema_mut().hot_tier_bytes = hot_tier_bytes;
7155 }
7156 // v7.6.1 — FOREIGN KEY appendix (FILE_VERSION 13+). v12 / v11 / …
7157 // catalogs skip this entirely.
7158 if version >= 13 {
7159 let fk_count = cur.read_u16()? as usize;
7160 let mut fks = Vec::with_capacity(fk_count);
7161 for _ in 0..fk_count {
7162 let name = match cur.read_u8()? {
7163 0 => None,
7164 1 => Some(cur.read_str()?),
7165 other => {
7166 return Err(StorageError::Corrupt(format!(
7167 "FK appendix: unknown has-name byte {other}"
7168 )));
7169 }
7170 };
7171 let local_arity = cur.read_u16()? as usize;
7172 let mut local_columns = Vec::with_capacity(local_arity);
7173 for _ in 0..local_arity {
7174 local_columns.push(cur.read_u16()? as usize);
7175 }
7176 let parent_table = cur.read_str()?;
7177 let parent_arity = cur.read_u16()? as usize;
7178 if parent_arity != local_arity {
7179 return Err(StorageError::Corrupt(format!(
7180 "FK arity mismatch in catalog: local {local_arity} vs parent {parent_arity}"
7181 )));
7182 }
7183 let mut parent_columns = Vec::with_capacity(parent_arity);
7184 for _ in 0..parent_arity {
7185 parent_columns.push(cur.read_u16()? as usize);
7186 }
7187 let on_delete = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
7188 StorageError::Corrupt("FK appendix: unknown on_delete tag".into())
7189 })?;
7190 let on_update = FkAction::from_tag(cur.read_u8()?).ok_or_else(|| {
7191 StorageError::Corrupt("FK appendix: unknown on_update tag".into())
7192 })?;
7193 fks.push(ForeignKeyConstraint {
7194 name,
7195 local_columns,
7196 parent_table,
7197 parent_columns,
7198 on_delete,
7199 on_update,
7200 });
7201 }
7202 t.schema_mut().foreign_keys = fks;
7203 }
7204 // v7.9.19 — UniquenessConstraint appendix (FILE_VERSION 15+).
7205 // v14 and below skip this entirely.
7206 if version >= 15 {
7207 let uc_count = cur.read_u16()? as usize;
7208 let mut ucs = Vec::with_capacity(uc_count);
7209 for _ in 0..uc_count {
7210 let is_pk = cur.read_u8()? != 0;
7211 let arity = cur.read_u16()? as usize;
7212 let mut cols = Vec::with_capacity(arity);
7213 for _ in 0..arity {
7214 cols.push(cur.read_u16()? as usize);
7215 }
7216 // v7.13.0 — trailing `nulls_not_distinct` flag
7217 // (FILE_VERSION 23+). v22 and below skip — flag
7218 // defaults to false (= NULLS DISTINCT).
7219 let nulls_not_distinct = if version >= 23 {
7220 cur.read_u8()? != 0
7221 } else {
7222 false
7223 };
7224 ucs.push(UniquenessConstraint {
7225 is_primary_key: is_pk,
7226 columns: cols,
7227 nulls_not_distinct,
7228 });
7229 }
7230 t.schema_mut().uniqueness_constraints = ucs;
7231 // v7.9.21 — runtime_default appendix (FILE_VERSION 15+).
7232 let rt_count = cur.read_u16()? as usize;
7233 for _ in 0..rt_count {
7234 let pos = cur.read_u16()? as usize;
7235 let expr = cur.read_str()?;
7236 if let Some(col) = t.schema_mut().columns.get_mut(pos) {
7237 col.runtime_default = Some(expr);
7238 }
7239 }
7240 }
7241 // v7.13.0 — CHECK constraints appendix (FILE_VERSION 23+).
7242 // v22 and below leave the vec empty.
7243 if version >= 23 {
7244 let check_count = cur.read_u16()? as usize;
7245 let mut checks = Vec::with_capacity(check_count);
7246 for _ in 0..check_count {
7247 checks.push(cur.read_str()?);
7248 }
7249 t.schema_mut().checks = checks;
7250 }
7251 // v7.17.0 Phase 1.4 — per-table user_enum_type appendix
7252 // (FILE_VERSION 29+). Layout: [u16 count] then
7253 // [u16 col_pos][str enum_name] per binding.
7254 if version >= 29 {
7255 let binding_count = cur.read_u16()? as usize;
7256 for _ in 0..binding_count {
7257 let col_pos = cur.read_u16()? as usize;
7258 let ename = cur.read_str()?;
7259 if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7260 col.user_enum_type = Some(ename);
7261 }
7262 }
7263 }
7264 // v7.17.0 Phase 1.5 — per-table user_domain_type appendix
7265 // (FILE_VERSION 30+). Same shape as the enum one.
7266 if version >= 30 {
7267 let binding_count = cur.read_u16()? as usize;
7268 for _ in 0..binding_count {
7269 let col_pos = cur.read_u16()? as usize;
7270 let dname = cur.read_str()?;
7271 if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7272 col.user_domain_type = Some(dname);
7273 }
7274 }
7275 }
7276 // v7.17.0 Phase 2.1 — per-table on_update_runtime appendix
7277 // (FILE_VERSION 32+). Sparse layout matches the enum/
7278 // domain bindings.
7279 if version >= 32 {
7280 let binding_count = cur.read_u16()? as usize;
7281 for _ in 0..binding_count {
7282 let col_pos = cur.read_u16()? as usize;
7283 let expr_src = cur.read_str()?;
7284 if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7285 col.on_update_runtime = Some(expr_src);
7286 }
7287 }
7288 }
7289 // v7.17.0 Phase 2.5 — per-table collation appendix
7290 // (FILE_VERSION 34+). Sparse: only non-Binary columns
7291 // land. v33-and-below readers leave every column at its
7292 // ColumnSchema::new default (Binary). Unknown tags from a
7293 // forward-incompat snapshot read back as Binary.
7294 if version >= 34 {
7295 let binding_count = cur.read_u16()? as usize;
7296 for _ in 0..binding_count {
7297 let col_pos = cur.read_u16()? as usize;
7298 let tag = cur.read_u8()?;
7299 let collation = match tag {
7300 Collation::TAG_CASE_INSENSITIVE => Collation::CaseInsensitive,
7301 _ => Collation::Binary,
7302 };
7303 if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7304 col.collation = collation;
7305 }
7306 }
7307 }
7308 // v7.17.0 Phase 4.4 — per-table is_unsigned appendix
7309 // (FILE_VERSION 35+). Sparse: only UNSIGNED columns land.
7310 // v34-and-below readers leave every column at
7311 // `is_unsigned = false`.
7312 if version >= 35 {
7313 let binding_count = cur.read_u16()? as usize;
7314 for _ in 0..binding_count {
7315 let col_pos = cur.read_u16()? as usize;
7316 if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7317 col.is_unsigned = true;
7318 }
7319 }
7320 }
7321 // v7.17.0 Phase 3.P0-36 — per-table inline_enum_variants
7322 // appendix (FILE_VERSION 41+). Sparse: only ENUM columns land.
7323 // v40-and-below readers leave every column at
7324 // `inline_enum_variants = None`.
7325 if version >= 41 {
7326 let binding_count = cur.read_u16()? as usize;
7327 for _ in 0..binding_count {
7328 let col_pos = cur.read_u16()? as usize;
7329 let variant_count = cur.read_u16()? as usize;
7330 let mut variants = Vec::with_capacity(variant_count);
7331 for _ in 0..variant_count {
7332 variants.push(cur.read_str()?);
7333 }
7334 if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7335 col.inline_enum_variants = Some(variants);
7336 }
7337 }
7338 }
7339 // v7.17.0 Phase 3.P0-37 — per-table inline_set_variants
7340 // appendix (FILE_VERSION 42+). Sparse: only SET columns land.
7341 if version >= 42 {
7342 let binding_count = cur.read_u16()? as usize;
7343 for _ in 0..binding_count {
7344 let col_pos = cur.read_u16()? as usize;
7345 let variant_count = cur.read_u16()? as usize;
7346 let mut variants = Vec::with_capacity(variant_count);
7347 for _ in 0..variant_count {
7348 variants.push(cur.read_str()?);
7349 }
7350 if let Some(col) = t.schema_mut().columns.get_mut(col_pos) {
7351 col.inline_set_variants = Some(variants);
7352 }
7353 }
7354 }
7355 let _ = table_name;
7356 Ok(())
7357}
7358
7359fn deserialize_rows(
7360 cur: &mut Cursor<'_>,
7361 t: &mut Table,
7362 _n_cols: usize,
7363) -> Result<(), StorageError> {
7364 let row_count = cur.read_u32()? as usize;
7365 // v4.39: PV has no `reserve` (the BVT doesn't preallocate a
7366 // contiguous buffer); we just push directly and let the trie
7367 // grow. v5.1: row decode reuses `decode_row_body_dense` so the
7368 // catalog and cold-tier segments share one row codec.
7369 let mut hot_bytes: u64 = 0;
7370 for _ in 0..row_count {
7371 let tail = &cur.buf[cur.pos..];
7372 let (row, consumed) = decode_row_body_dense(tail, &t.schema, cur.long_strings)?;
7373 cur.pos += consumed;
7374 // v5.2.1: account for hot bytes as we go; the snapshot's row
7375 // block bytes are exactly what `encode_row_body_dense` would
7376 // produce, so `consumed` would do too — but going via the
7377 // helper keeps the counter's definition coupled to the
7378 // encoder rather than the snapshot's row prefix layout.
7379 hot_bytes = hot_bytes.saturating_add(row_body_encoded_len(&row, &t.schema) as u64);
7380 t.rows.push_mut(row);
7381 }
7382 t.hot_bytes = hot_bytes;
7383 Ok(())
7384}
7385
7386fn deserialize_indices(
7387 cur: &mut Cursor<'_>,
7388 t: &mut Table,
7389 version: u8,
7390) -> Result<(), StorageError> {
7391 let index_count = cur.read_u16()? as usize;
7392 for _ in 0..index_count {
7393 let idx_name = cur.read_str()?;
7394 let col_pos = cur.read_u16()? as usize;
7395 let column_name = t
7396 .schema
7397 .columns
7398 .get(col_pos)
7399 .ok_or_else(|| {
7400 StorageError::Corrupt(format!(
7401 "index {idx_name:?} points at non-existent column position {col_pos}"
7402 ))
7403 })?
7404 .name
7405 .clone();
7406 let kind_tag = cur.read_u8()?;
7407 match kind_tag {
7408 0 => {
7409 if version >= 9 {
7410 // v9+: BTree entries serialised inline (tag-prefixed
7411 // locator codec). Restore the map directly so any
7412 // freezer-produced Cold locators come back exactly
7413 // as they went out.
7414 let map = read_btree_map(cur)?;
7415 t.restore_btree_index(idx_name, &column_name, map)?;
7416 } else {
7417 // v8: no entries on disk; rebuild from rows. Every
7418 // entry is materialised as `RowLocator::Hot(i)` —
7419 // semantically identical to the v5.1 in-memory state
7420 // since v8 catalogs never produced Cold locators.
7421 t.add_index(idx_name, &column_name)?;
7422 }
7423 }
7424 1 => {
7425 let m = cur.read_u16()? as usize;
7426 let graph = cur.read_nsw_graph(m)?;
7427 t.restore_nsw_index(idx_name, &column_name, graph)?;
7428 }
7429 2 => {
7430 // v6.7.1 — BRIN tag. Payload is the column type
7431 // tag. No further data — summaries live in cold
7432 // segments.
7433 let column_type = cur.read_data_type()?;
7434 t.restore_brin_index(idx_name, &column_name, column_type)?;
7435 }
7436 3 => {
7437 // v7.12.3 — GIN tag. Payload mirrors the BTree
7438 // encoding but with String (lexeme word) keys.
7439 // Only emitted by FILE_VERSION 21+ writers — v20
7440 // and earlier degraded `USING gin` to BTree.
7441 let map = read_gin_map(cur)?;
7442 t.restore_gin_index(idx_name, &column_name, map)?;
7443 }
7444 4 => {
7445 // v7.15.0 — trigram-GIN tag (`gin_trgm_ops`).
7446 // Same payload shape as tag 3 (String → posting
7447 // list); only emitted by FILE_VERSION 24+ writers.
7448 if version < 24 {
7449 return Err(StorageError::Corrupt(format!(
7450 "trigram-GIN index tag 4 found in catalog FILE_VERSION {version}; \
7451 FILE_VERSION 24+ required (v7.15.0 introduced this tag)"
7452 )));
7453 }
7454 let map = read_gin_map(cur)?;
7455 t.restore_gin_trgm_index(idx_name, &column_name, map)?;
7456 }
7457 5 => {
7458 // v7.17.0 Phase 2.2 — fulltext-GIN tag (MySQL
7459 // `FULLTEXT KEY` surface). Same payload shape as
7460 // tag 3 / tag 4 (String → posting list); only
7461 // emitted by FILE_VERSION 33+ writers.
7462 if version < 33 {
7463 return Err(StorageError::Corrupt(format!(
7464 "fulltext-GIN index tag 5 found in catalog FILE_VERSION {version}; \
7465 FILE_VERSION 33+ required (v7.17.0 Phase 2.2 introduced this tag)"
7466 )));
7467 }
7468 let map = read_gin_map(cur)?;
7469 t.restore_gin_fulltext_index(idx_name, &column_name, map)?;
7470 }
7471 other => {
7472 return Err(StorageError::Corrupt(format!(
7473 "unknown index kind tag: {other}"
7474 )));
7475 }
7476 }
7477 // v6.8.0 — included_columns appendix per index. v11- snapshots
7478 // stop before this u16; v12+ always carries it (possibly 0).
7479 if version >= 12 {
7480 let num_included = cur.read_u16()? as usize;
7481 if num_included > 0 {
7482 let mut included: Vec<usize> = Vec::with_capacity(num_included);
7483 for _ in 0..num_included {
7484 let cp = cur.read_u16()? as usize;
7485 if cp >= t.schema.columns.len() {
7486 return Err(StorageError::Corrupt(format!(
7487 "INCLUDE column position {cp} out of range \
7488 ({} schema columns)",
7489 t.schema.columns.len()
7490 )));
7491 }
7492 included.push(cp);
7493 }
7494 if let Some(last) = t.indices.last_mut() {
7495 last.included_columns = included;
7496 }
7497 }
7498 // v6.8.1 — partial_predicate appendix.
7499 match cur.read_u8()? {
7500 0 => {}
7501 1 => {
7502 let pred = cur.read_str()?;
7503 if let Some(last) = t.indices.last_mut() {
7504 last.partial_predicate = Some(pred);
7505 }
7506 }
7507 other => {
7508 return Err(StorageError::Corrupt(format!(
7509 "partial_predicate tag: unknown byte {other}"
7510 )));
7511 }
7512 }
7513 // v6.8.2 — expression appendix.
7514 match cur.read_u8()? {
7515 0 => {}
7516 1 => {
7517 let expr = cur.read_str()?;
7518 if let Some(last) = t.indices.last_mut() {
7519 last.expression = Some(expr);
7520 }
7521 }
7522 other => {
7523 return Err(StorageError::Corrupt(format!(
7524 "expression tag: unknown byte {other}"
7525 )));
7526 }
7527 }
7528 // v7.9.29 — is_unique appendix (FILE_VERSION 16+).
7529 // v15-and-below catalogs stop before this byte. mailrs K1.
7530 if version >= 16 {
7531 match cur.read_u8()? {
7532 0 => {}
7533 1 => {
7534 if let Some(last) = t.indices.last_mut() {
7535 last.is_unique = true;
7536 }
7537 }
7538 other => {
7539 return Err(StorageError::Corrupt(format!(
7540 "is_unique tag: unknown byte {other}"
7541 )));
7542 }
7543 }
7544 // v7.9.29 — extra_column_positions appendix.
7545 let n = cur.read_u16()? as usize;
7546 if n > 0 {
7547 let mut extras: Vec<usize> = Vec::with_capacity(n);
7548 for _ in 0..n {
7549 let cp = cur.read_u16()? as usize;
7550 if cp >= t.schema.columns.len() {
7551 return Err(StorageError::Corrupt(format!(
7552 "extra column position {cp} out of range \
7553 ({} schema columns)",
7554 t.schema.columns.len()
7555 )));
7556 }
7557 extras.push(cp);
7558 }
7559 if let Some(last) = t.indices.last_mut() {
7560 last.extra_column_positions = extras;
7561 }
7562 }
7563 }
7564 }
7565 }
7566 Ok(())
7567}
7568
7569/// Parse a v9 `BTree` index payload — `[u32 entry_count]` followed by
7570/// `entry_count` `(IndexKey, Vec<RowLocator>)` pairs. The locator list
7571/// uses the v5.1 tag-prefixed wire format (`RowLocator::read_le`).
7572fn read_btree_map(
7573 cur: &mut Cursor<'_>,
7574) -> Result<PersistentBTreeMap<IndexKey, Vec<RowLocator>>, StorageError> {
7575 let entry_count = cur.read_u32()? as usize;
7576 let mut map = PersistentBTreeMap::new();
7577 for _ in 0..entry_count {
7578 let key = cur.read_index_key()?;
7579 let locator_count = cur.read_u32()? as usize;
7580 let mut locators = Vec::with_capacity(locator_count);
7581 for _ in 0..locator_count {
7582 let tail = &cur.buf[cur.pos..];
7583 let (loc, consumed) = RowLocator::read_le(tail).map_err(|e| {
7584 StorageError::Corrupt(format!("row_locator decode at offset {}: {e}", cur.pos))
7585 })?;
7586 cur.pos += consumed;
7587 locators.push(loc);
7588 }
7589 map.insert_mut(key, locators);
7590 }
7591 Ok(map)
7592}
7593
7594/// v7.12.3 — parse a `Gin` index payload. Mirrors [`read_btree_map`]
7595/// but with `String` (lexeme word) keys instead of `IndexKey`.
7596/// FILE_VERSION 21+ only.
7597fn read_gin_map(
7598 cur: &mut Cursor<'_>,
7599) -> Result<PersistentBTreeMap<String, Vec<RowLocator>>, StorageError> {
7600 let entry_count = cur.read_u32()? as usize;
7601 let mut map = PersistentBTreeMap::new();
7602 for _ in 0..entry_count {
7603 let word = cur.read_str()?;
7604 let locator_count = cur.read_u32()? as usize;
7605 let mut locators = Vec::with_capacity(locator_count);
7606 for _ in 0..locator_count {
7607 let tail = &cur.buf[cur.pos..];
7608 let (loc, consumed) = RowLocator::read_le(tail).map_err(|e| {
7609 StorageError::Corrupt(format!("row_locator decode at offset {}: {e}", cur.pos))
7610 })?;
7611 cur.pos += consumed;
7612 locators.push(loc);
7613 }
7614 map.insert_mut(word, locators);
7615 }
7616 Ok(map)
7617}
7618
7619// --- low-level binary helpers ---------------------------------------------
7620
7621/// Write a `DataType` as a tag byte + optional payload (Vector carries its
7622/// `u32` dimension). Inverse: [`read_data_type`].
7623/// Serialize an HNSW graph after the `[kind=1][u16 M]` header (v7).
7624/// Layout:
7625/// - `[u16 m_max_0]`
7626/// - `[entry u32]` — `u32::MAX` means `None`, else the entry node index
7627/// - `[u8 entry_level]`
7628/// - `[node_count u32]`
7629/// - for each node: `[u8 level]` (top layer for this node)
7630/// - `[layer_count u8]`
7631/// - for each layer `0..layer_count`:
7632/// - `[u32 layer_node_count]` (== `node_count`; per-layer slot)
7633/// - for each node: `[u16 neighbor_count] [u32 neighbor]*`
7634fn write_nsw_graph(out: &mut Vec<u8>, g: &NswGraph) {
7635 let entry = g.entry.map_or(u32::MAX, |e| {
7636 u32::try_from(e).expect("NSW entry fits in u32")
7637 });
7638 write_u16(
7639 out,
7640 u16::try_from(g.m_max_0).expect("HNSW m_max_0 fits in u16"),
7641 );
7642 out.extend_from_slice(&entry.to_le_bytes());
7643 out.push(g.entry_level);
7644 let node_count = g.levels.len();
7645 write_u32(
7646 out,
7647 u32::try_from(node_count).expect("HNSW node count fits in u32"),
7648 );
7649 for &lvl in &g.levels {
7650 out.push(lvl);
7651 }
7652 let layer_count = u8::try_from(g.layers.len()).expect("HNSW layer count ≤ 255");
7653 out.push(layer_count);
7654 for layer in &g.layers {
7655 write_u32(
7656 out,
7657 u32::try_from(layer.len()).expect("HNSW per-layer node count fits in u32"),
7658 );
7659 for neighbors in layer {
7660 write_u16(
7661 out,
7662 u16::try_from(neighbors.len()).expect("HNSW neighbour list fits in u16"),
7663 );
7664 // v6.1.x: neighbour slot is already u32 in memory; just
7665 // emit the raw bytes. (v6.0 stored usize and converted
7666 // here.)
7667 for &peer in neighbors {
7668 write_u32(out, peer);
7669 }
7670 }
7671 }
7672}
7673
7674fn write_data_type(out: &mut Vec<u8>, t: DataType) {
7675 match t {
7676 DataType::Int => out.push(1),
7677 DataType::BigInt => out.push(2),
7678 DataType::Float => out.push(3),
7679 DataType::Text => out.push(4),
7680 DataType::Bool => out.push(5),
7681 DataType::Vector { dim, encoding } => match encoding {
7682 // Tag 6: pre-v6 F32 vector. Layout unchanged; pre-v6
7683 // binaries continue to deserialise this exactly as
7684 // before.
7685 VecEncoding::F32 => {
7686 out.push(6);
7687 out.extend_from_slice(&dim.to_le_bytes());
7688 }
7689 // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
7690 // forward-compat fence story as SQ8 below.
7691 VecEncoding::F16 => {
7692 out.push(15);
7693 out.extend_from_slice(&dim.to_le_bytes());
7694 }
7695 // v6.0.1: new tag 14 for `VECTOR(N) USING SQ8` column
7696 // type. Pre-v6 readers fall through `read_data_type`'s
7697 // catch-all and surface `Corrupt("unknown data type tag")`
7698 // — the explicit forward-compat fence called out in
7699 // V6_DESIGN deliberation #5.
7700 VecEncoding::Sq8 => {
7701 out.push(14);
7702 out.extend_from_slice(&dim.to_le_bytes());
7703 }
7704 },
7705 DataType::SmallInt => out.push(7),
7706 DataType::Varchar(max) => {
7707 out.push(8);
7708 out.extend_from_slice(&max.to_le_bytes());
7709 }
7710 DataType::Char(size) => {
7711 out.push(9);
7712 out.extend_from_slice(&size.to_le_bytes());
7713 }
7714 DataType::Numeric { precision, scale } => {
7715 out.push(10);
7716 out.push(precision);
7717 out.push(scale);
7718 }
7719 DataType::Date => out.push(11),
7720 DataType::Timestamp => out.push(12),
7721 // v7.9.2 — tag 17 for TIMESTAMPTZ. Body = i64 microseconds
7722 // UTC, identical to tag 12. Only the schema-side type tag
7723 // differs (for wire OID advertisement).
7724 DataType::Timestamptz => out.push(17),
7725 // INTERVAL is runtime-only — CREATE TABLE never produces a
7726 // column with this type, so write_data_type must not be called
7727 // on it. (Disk-format codepoint reserved for a future v3 where
7728 // INTERVAL becomes storable.)
7729 DataType::Interval => {
7730 unreachable!("DataType::Interval has no on-disk encoding in v2.11")
7731 }
7732 DataType::Json => out.push(13),
7733 // v7.9.0: tag 16 for `JSONB`. Same on-disk layout as
7734 // tag 13 — only the wire OID differs.
7735 DataType::Jsonb => out.push(16),
7736 // v7.10.4: tag 18 for `BYTEA`. Body = [u16 len][bytes].
7737 DataType::Bytes => out.push(18),
7738 // v7.10.9: tag 19 for `TEXT[]`. Body = [u16 count][per
7739 // element: u8 null + (if non-null) u16 len + utf-8].
7740 DataType::TextArray => out.push(19),
7741 // v7.11.12: tag 20 for `INT[]`. Body = [u16 count][per
7742 // element: u8 null + (if non-null) i32 LE].
7743 DataType::IntArray => out.push(20),
7744 // v7.11.12: tag 21 for `BIGINT[]`. Body = [u16 count][per
7745 // element: u8 null + (if non-null) i64 LE].
7746 DataType::BigIntArray => out.push(21),
7747 // v7.12.0: tag 22 for `tsvector`. No body — type identity
7748 // alone. Catalog FILE_VERSION 20+.
7749 DataType::TsVector => out.push(22),
7750 // v7.12.0: tag 23 for `tsquery`. No body. Catalog
7751 // FILE_VERSION 20+.
7752 DataType::TsQuery => out.push(23),
7753 // v7.17.0: tag 24 for `UUID`. No body — type identity
7754 // alone. Catalog FILE_VERSION 36+.
7755 DataType::Uuid => out.push(24),
7756 // v7.17.0 Phase 3.P0-32: tag 25 for `TIME`. No body — type
7757 // identity alone. Catalog FILE_VERSION 37+.
7758 DataType::Time => out.push(25),
7759 // v7.17.0 Phase 3.P0-33: tag 26 for `YEAR`. No body — type
7760 // identity alone. Catalog FILE_VERSION 38+.
7761 DataType::Year => out.push(26),
7762 // v7.17.0 Phase 3.P0-34: tag 27 for `TIMETZ`. No body —
7763 // type identity alone. Catalog FILE_VERSION 39+.
7764 DataType::TimeTz => out.push(27),
7765 // v7.17.0 Phase 3.P0-35: tag 28 for `MONEY`. No body —
7766 // type identity alone. Catalog FILE_VERSION 40+.
7767 DataType::Money => out.push(28),
7768 // v7.17.0 Phase 3.P0-38: tag 29 for range types. Body
7769 // = `[u8 RangeKind tag]`. Catalog FILE_VERSION 43+.
7770 DataType::Range(k) => {
7771 out.push(29);
7772 out.push(k.tag());
7773 }
7774 // v7.17.0 Phase 3.P0-39: tag 30 for hstore. No body —
7775 // type identity alone. Catalog FILE_VERSION 44+.
7776 DataType::Hstore => out.push(30),
7777 // v7.17.0 Phase 3.P0-40: tag 31/32/33 for 2D arrays.
7778 // No body — type identity alone. Catalog FILE_VERSION 45+.
7779 DataType::IntArray2D => out.push(31),
7780 DataType::BigIntArray2D => out.push(32),
7781 DataType::TextArray2D => out.push(33),
7782 }
7783}
7784
7785impl Cursor<'_> {
7786 fn read_data_type(&mut self) -> Result<DataType, StorageError> {
7787 let tag = self.read_u8()?;
7788 match tag {
7789 1 => Ok(DataType::Int),
7790 2 => Ok(DataType::BigInt),
7791 3 => Ok(DataType::Float),
7792 4 => Ok(DataType::Text),
7793 5 => Ok(DataType::Bool),
7794 6 => Ok(DataType::Vector {
7795 dim: self.read_u32()?,
7796 encoding: VecEncoding::F32,
7797 }),
7798 7 => Ok(DataType::SmallInt),
7799 8 => Ok(DataType::Varchar(self.read_u32()?)),
7800 9 => Ok(DataType::Char(self.read_u32()?)),
7801 10 => {
7802 let precision = self.read_u8()?;
7803 let scale = self.read_u8()?;
7804 Ok(DataType::Numeric { precision, scale })
7805 }
7806 11 => Ok(DataType::Date),
7807 12 => Ok(DataType::Timestamp),
7808 13 => Ok(DataType::Json),
7809 14 => Ok(DataType::Vector {
7810 dim: self.read_u32()?,
7811 encoding: VecEncoding::Sq8,
7812 }),
7813 // v6.0.3: tag 15 for `VECTOR(N) USING HALF`. Same
7814 // [u32 dim] type-tag payload as F32 / SQ8; the encoding
7815 // lives in the tag byte itself.
7816 15 => Ok(DataType::Vector {
7817 dim: self.read_u32()?,
7818 encoding: VecEncoding::F16,
7819 }),
7820 // v7.9.0: tag 16 for `JSONB`. Storage shape == Json;
7821 // we only carry the type tag so the wire layer can
7822 // emit PG OID 3802 instead of 114.
7823 16 => Ok(DataType::Jsonb),
7824 // v7.9.2: tag 17 for `TIMESTAMPTZ`. Storage shape ==
7825 // Timestamp (i64 microseconds UTC); only the wire OID
7826 // (1184) differs.
7827 17 => Ok(DataType::Timestamptz),
7828 // v7.10.4: tag 18 for `BYTEA`. Catalog FILE_VERSION 17+.
7829 18 => Ok(DataType::Bytes),
7830 // v7.10.9: tag 19 for `TEXT[]`. Catalog FILE_VERSION 18+.
7831 19 => Ok(DataType::TextArray),
7832 // v7.11.12: tags 20/21 for INT[]/BIGINT[]. FILE_VERSION 19+.
7833 20 => Ok(DataType::IntArray),
7834 21 => Ok(DataType::BigIntArray),
7835 // v7.12.0: tags 22/23 for tsvector / tsquery. Catalog
7836 // FILE_VERSION 20+.
7837 22 => Ok(DataType::TsVector),
7838 23 => Ok(DataType::TsQuery),
7839 // v7.17.0: tag 24 — UUID. Catalog FILE_VERSION 36+.
7840 24 => Ok(DataType::Uuid),
7841 // v7.17.0 Phase 3.P0-32: tag 25 — TIME. Catalog
7842 // FILE_VERSION 37+.
7843 25 => Ok(DataType::Time),
7844 // v7.17.0 Phase 3.P0-33: tag 26 — YEAR. Catalog
7845 // FILE_VERSION 38+.
7846 26 => Ok(DataType::Year),
7847 // v7.17.0 Phase 3.P0-34: tag 27 — TIMETZ. Catalog
7848 // FILE_VERSION 39+.
7849 27 => Ok(DataType::TimeTz),
7850 // v7.17.0 Phase 3.P0-35: tag 28 — MONEY. Catalog
7851 // FILE_VERSION 40+.
7852 28 => Ok(DataType::Money),
7853 // v7.17.0 Phase 3.P0-38: tag 29 + RangeKind tag.
7854 29 => {
7855 let kt = self.read_u8()?;
7856 let k = RangeKind::from_tag(kt)
7857 .ok_or_else(|| StorageError::Corrupt(format!("unknown RangeKind tag: {kt}")))?;
7858 Ok(DataType::Range(k))
7859 }
7860 // v7.17.0 Phase 3.P0-39: tag 30 — HSTORE.
7861 30 => Ok(DataType::Hstore),
7862 // v7.17.0 Phase 3.P0-40: tag 31/32/33 — 2D arrays.
7863 31 => Ok(DataType::IntArray2D),
7864 32 => Ok(DataType::BigIntArray2D),
7865 33 => Ok(DataType::TextArray2D),
7866 other => Err(StorageError::Corrupt(format!(
7867 "unknown data type tag: {other}"
7868 ))),
7869 }
7870 }
7871}
7872
7873/// Fast computation of the byte length [`encode_row_body_dense`]
7874/// would produce, without allocating the output buffer. Mirrors the
7875/// encoder's per-column body sizing so the v5.2.1 `Table::hot_bytes`
7876/// incremental counter doesn't pay an alloc-per-insert tax. Returns
7877/// the exact same `usize` as `encode_row_body_dense(row, schema).len()`.
7878pub fn row_body_encoded_len(row: &Row, schema: &TableSchema) -> usize {
7879 debug_assert_eq!(
7880 row.values.len(),
7881 schema.columns.len(),
7882 "row_body_encoded_len: row arity must match schema"
7883 );
7884 let bitmap_bytes = schema.columns.len().div_ceil(8);
7885 let mut n = bitmap_bytes;
7886 for (col_idx, v) in row.values.iter().enumerate() {
7887 if matches!(v, Value::Null) {
7888 continue;
7889 }
7890 n += value_body_encoded_len(v, schema.columns[col_idx].ty);
7891 }
7892 n
7893}
7894
7895/// Byte length a single cell consumes when written by
7896/// `write_value_body`. Used by [`row_body_encoded_len`]; kept in
7897/// lock-step with the encoder. The `_ty` slot is reserved for future
7898/// type-dependent encodings — every variant currently writes a fixed
7899/// body shape regardless of the declared column type.
7900fn value_body_encoded_len(v: &Value, _ty: DataType) -> usize {
7901 match v {
7902 Value::SmallInt(_) => 2,
7903 // 4-byte body: i32 / Date.
7904 Value::Int(_) | Value::Date(_) => 4,
7905 // 8-byte body: i64 / f64 / Timestamp.
7906 Value::BigInt(_) | Value::Float(_) | Value::Timestamp(_) => 8,
7907 Value::Bool(_) => 1,
7908 // Text/Varchar/Char/Json share the [u16 len][utf-8] layout;
7909 // v7.23 — texts >= 64 KiB take the 6-byte escape header
7910 // (these sizes feed the freezer's hot-bytes budget, so the
7911 // estimate must not undercount).
7912 Value::Text(s) | Value::Json(s) => {
7913 if s.len() >= STR_LEN_ESCAPE as usize {
7914 6 + s.len()
7915 } else {
7916 2 + s.len()
7917 }
7918 }
7919 // [u32 dim][f32 * dim]
7920 Value::Vector(vec) => 4 + 4 * vec.len(),
7921 // v6.0.1: SQ8 cell on-disk shape — [u32 dim][f32 min]
7922 // [f32 max][u8 * dim] = 12 + dim bytes. `hot_bytes`
7923 // tracking on `Table::insert` calls this every row, so
7924 // returning the real size now (even though the actual
7925 // `write_value_body` writer lands in step 6) keeps the
7926 // sizing arithmetic honest for in-memory benches.
7927 Value::Sq8Vector(q) => 4 + 4 + 4 + q.bytes.len(),
7928 // v6.0.3: halfvec on-disk shape — [u32 dim][u16 LE * dim]
7929 // = 4 + 2 * dim bytes.
7930 Value::HalfVector(h) => 4 + h.bytes.len(),
7931 // [i128 scaled][u8 scale]
7932 Value::Numeric { .. } => 16 + 1,
7933 // v7.10.4: BYTEA on-disk shape mirrors Text — [u16 len][bytes].
7934 // The 16-bit length cap is the same TEXT/JSON limit (~65 KB);
7935 // larger blobs need toast-style chunking which is a v7.11
7936 // carve-out (kept aligned with TEXT for now so the catalog
7937 // snapshot stays simple).
7938 Value::Bytes(b) => 2 + b.len(),
7939 // v7.10.9: TEXT[] on-disk shape — [u16 count][per element:
7940 // u8 null flag + (when non-null) u16 len + utf-8 bytes].
7941 Value::TextArray(items) => {
7942 let mut n = 2; // count prefix
7943 for item in items {
7944 n += 1; // null flag
7945 if let Some(s) = item {
7946 n += 2 + s.len();
7947 }
7948 }
7949 n
7950 }
7951 // v7.11.12: INT[] / BIGINT[] — [u16 count][per element:
7952 // u8 null + (when non-null) fixed-width LE].
7953 Value::IntArray(items) => {
7954 2 + items
7955 .iter()
7956 .map(|x| if x.is_some() { 5 } else { 1 })
7957 .sum::<usize>()
7958 }
7959 Value::BigIntArray(items) => {
7960 2 + items
7961 .iter()
7962 .map(|x| if x.is_some() { 9 } else { 1 })
7963 .sum::<usize>()
7964 }
7965 // v7.12.0: tsvector dense body — [u16 lexeme_count][per
7966 // lex: u16 word_len + utf-8 word + u16 pos_count + (u16
7967 // LE * pos_count) + u8 weight].
7968 Value::TsVector(lexs) => {
7969 let mut n = 2;
7970 for l in lexs {
7971 n += 2 + l.word.len() + 2 + 2 * l.positions.len() + 1;
7972 }
7973 n
7974 }
7975 // v7.12.0: tsquery dense body — prefix-coded tree.
7976 // Sizing must match `write_tsquery_body` walker.
7977 Value::TsQuery(ast) => tsquery_encoded_len(ast),
7978 // v7.17.0: UUID dense body — fixed 16 bytes, no prefix.
7979 Value::Uuid(_) => 16,
7980 // v7.17.0 Phase 3.P0-32: TIME dense body — fixed i64 LE.
7981 Value::Time(_) => 8,
7982 // v7.17.0 Phase 3.P0-33: YEAR dense body — fixed u16 LE.
7983 Value::Year(_) => 2,
7984 // v7.17.0 Phase 3.P0-34: TIMETZ dense body — i64 LE + i32 LE.
7985 Value::TimeTz { .. } => 12,
7986 // v7.17.0 Phase 3.P0-35: MONEY dense body — i64 LE cents.
7987 Value::Money(_) => 8,
7988 // v7.17.0 Phase 3.P0-38: range dense body — `[u8 flags]
7989 // [if lower: write_value(lower)] [if upper: write_value(upper)]`.
7990 // Element uses the schema-agnostic write_value codec
7991 // (which carries its own tag byte). The flags byte
7992 // captures empty/lower_some/upper_some/lower_inc/upper_inc.
7993 Value::Range { lower, upper, .. } => {
7994 1 + lower
7995 .as_ref()
7996 .map(|v| write_value_encoded_len(v))
7997 .unwrap_or(0)
7998 + upper
7999 .as_ref()
8000 .map(|v| write_value_encoded_len(v))
8001 .unwrap_or(0)
8002 }
8003 // v7.17.0 Phase 3.P0-39: hstore dense body — `[u32 count]
8004 // then per pair [u32 klen][k bytes][u8 has_val][if has_val:
8005 // u32 vlen][v bytes]`.
8006 Value::Hstore(pairs) => {
8007 let mut n = 4;
8008 for (k, v) in pairs {
8009 n += 4 + k.len() + 1;
8010 if let Some(val) = v {
8011 n += 4 + val.len();
8012 }
8013 }
8014 n
8015 }
8016 // v7.17.0 Phase 3.P0-40: 2D arrays dense body — `[u32 rows]
8017 // [u32 cols] then row-major elements with per-element
8018 // `[u8 null_flag][if non-null: element body]`.
8019 Value::IntArray2D(rows) => {
8020 let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8021 8 + rows.len() * cols * (1 + 4)
8022 }
8023 Value::BigIntArray2D(rows) => {
8024 let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8025 8 + rows.len() * cols * (1 + 8)
8026 }
8027 Value::TextArray2D(rows) => {
8028 let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8029 let mut n = 8 + rows.len() * cols;
8030 for row in rows {
8031 for s in row.iter().flatten() {
8032 n += 4 + s.len();
8033 }
8034 }
8035 n
8036 }
8037 // NULL is encoded only in the bitmap, never in the body.
8038 Value::Null => 0,
8039 // INTERVAL has no on-disk encoding (see write_value_body).
8040 Value::Interval { .. } => {
8041 unreachable!("Value::Interval has no on-disk encoding")
8042 }
8043 }
8044}
8045
8046/// Encode one row's body in the v3.0.2 dense format (`FILE_VERSION`
8047/// 8): per-row NULL bitmap (1 bit/col, ceil(cols/8) bytes), then
8048/// each non-NULL cell as `write_value_body`. Same wire shape the
8049/// catalog snapshot writes per row inside its rows-block. Exposed
8050/// pub so v5.1+ cold-tier segment writers can produce row payloads
8051/// that the catalog [`decode_row_body_dense`] decodes 1:1.
8052///
8053/// `row.values.len()` must equal `schema.columns.len()` — the row
8054/// is expected to have been validated by `Table::insert` (the
8055/// engine's INSERT path) before reaching this function.
8056pub fn encode_row_body_dense(row: &Row, schema: &TableSchema) -> Vec<u8> {
8057 debug_assert_eq!(
8058 row.values.len(),
8059 schema.columns.len(),
8060 "dense encode: row arity must match schema"
8061 );
8062 let bitmap_bytes = schema.columns.len().div_ceil(8);
8063 // 8 B per fixed-width cell is a reasonable average; the buffer
8064 // grows past this for variable-width Text/Vector cells.
8065 let mut out = Vec::with_capacity(bitmap_bytes + schema.columns.len() * 8);
8066 let bitmap_offset = out.len();
8067 out.resize(bitmap_offset + bitmap_bytes, 0);
8068 for (i, v) in row.values.iter().enumerate() {
8069 if matches!(v, Value::Null) {
8070 out[bitmap_offset + i / 8] |= 1 << (i % 8);
8071 }
8072 }
8073 for (col_idx, v) in row.values.iter().enumerate() {
8074 if matches!(v, Value::Null) {
8075 continue;
8076 }
8077 write_value_body(&mut out, v, schema.columns[col_idx].ty);
8078 }
8079 out
8080}
8081
8082/// Inverse of [`encode_row_body_dense`]. Reads one row's body from
8083/// `bytes` and returns it plus the number of bytes consumed (so a
8084/// caller decoding a back-to-back stream of rows can advance its
8085/// cursor). Returns `StorageError::Corrupt` on truncation, bad
8086/// UTF-8, or unknown cell tags.
8087pub fn decode_row_body_dense(
8088 bytes: &[u8],
8089 schema: &TableSchema,
8090 long_strings: bool,
8091) -> Result<(Row, usize), StorageError> {
8092 let mut cur = Cursor::new(bytes).with_long_strings(long_strings);
8093 let bitmap_bytes = schema.columns.len().div_ceil(8);
8094 let mut bitmap_buf = [0u8; 32];
8095 if bitmap_bytes > bitmap_buf.len() {
8096 return Err(StorageError::Corrupt(format!(
8097 "row NULL bitmap {bitmap_bytes} B exceeds 32 B cap"
8098 )));
8099 }
8100 let slice = cur.take(bitmap_bytes)?;
8101 bitmap_buf[..bitmap_bytes].copy_from_slice(slice);
8102 let mut values = Vec::with_capacity(schema.columns.len());
8103 for (col_idx, col) in schema.columns.iter().enumerate() {
8104 if (bitmap_buf[col_idx / 8] >> (col_idx % 8)) & 1 == 1 {
8105 values.push(Value::Null);
8106 } else {
8107 values.push(cur.read_value_body(col.ty)?);
8108 }
8109 }
8110 Ok((Row { values }, cur.pos))
8111}
8112
8113/// Schema-driven dense value encoding (`FILE_VERSION` 8). Caller already
8114/// knows the column type and has decided this cell is non-NULL, so we
8115/// skip the per-cell type tag the v7 `write_value` was writing. NULL
8116/// is encoded via the per-row bitmap before this function runs, never
8117/// reaches here. Used only inside the row-encoding hot loop; the
8118/// schema-default path still goes through the legacy `write_value` so
8119/// DEFAULT values keep their self-describing tag and remain decodable
8120/// without consulting a column type.
8121fn write_value_body(out: &mut Vec<u8>, v: &Value, ty: DataType) {
8122 match (v, ty) {
8123 (Value::SmallInt(n), DataType::SmallInt) => out.extend_from_slice(&n.to_le_bytes()),
8124 (Value::Int(n), DataType::Int) => out.extend_from_slice(&n.to_le_bytes()),
8125 (Value::BigInt(n), DataType::BigInt) => out.extend_from_slice(&n.to_le_bytes()),
8126 (Value::Float(x), DataType::Float) => out.extend_from_slice(&x.to_le_bytes()),
8127 (Value::Bool(b), DataType::Bool) => out.push(u8::from(*b)),
8128 (Value::Text(s), DataType::Text | DataType::Varchar(_) | DataType::Char(_)) => {
8129 write_str(out, s);
8130 }
8131 (
8132 Value::Vector(v),
8133 DataType::Vector {
8134 encoding: VecEncoding::F32,
8135 ..
8136 },
8137 ) => {
8138 let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
8139 out.extend_from_slice(&dim.to_le_bytes());
8140 for x in v {
8141 out.extend_from_slice(&x.to_le_bytes());
8142 }
8143 }
8144 // v6.0.1: SQ8 dense body — [u32 dim][f32 min][f32 max]
8145 // [u8 * dim]. Self-describes its length so v6 readers
8146 // walking rows of a v6 catalog stay aligned even if the
8147 // declared column dim drifts (defensive, not normally
8148 // possible since CREATE TABLE pins the dim).
8149 (
8150 Value::Sq8Vector(q),
8151 DataType::Vector {
8152 encoding: VecEncoding::Sq8,
8153 ..
8154 },
8155 ) => {
8156 let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
8157 out.extend_from_slice(&dim.to_le_bytes());
8158 out.extend_from_slice(&q.min.to_le_bytes());
8159 out.extend_from_slice(&q.max.to_le_bytes());
8160 out.extend_from_slice(&q.bytes);
8161 }
8162 // v6.0.3: halfvec dense body — [u32 dim][u16 LE * dim].
8163 // The raw u16 bytes already live in `h.bytes` little-
8164 // endian, so we just splat them.
8165 (
8166 Value::HalfVector(h),
8167 DataType::Vector {
8168 encoding: VecEncoding::F16,
8169 ..
8170 },
8171 ) => {
8172 let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
8173 out.extend_from_slice(&dim.to_le_bytes());
8174 out.extend_from_slice(&h.bytes);
8175 }
8176 (Value::Numeric { scaled, .. }, DataType::Numeric { scale, .. }) => {
8177 out.extend_from_slice(&scaled.to_le_bytes());
8178 out.push(scale);
8179 }
8180 (Value::Date(d), DataType::Date) => out.extend_from_slice(&d.to_le_bytes()),
8181 (Value::Timestamp(t), DataType::Timestamp | DataType::Timestamptz) => {
8182 out.extend_from_slice(&t.to_le_bytes())
8183 }
8184 // v4.9: JSON stores as length-prefixed text; same shape as
8185 // Text — the type tag lives in the column schema, not the
8186 // per-cell body.
8187 (Value::Json(s), DataType::Json | DataType::Jsonb) => write_str(out, s),
8188 // v7.10.4: BYTEA shares the [u16 len][bytes] shape with
8189 // Text but writes raw bytes (no UTF-8 invariant).
8190 (Value::Bytes(b), DataType::Bytes) => {
8191 let len = u16::try_from(b.len()).expect("BYTEA cell ≤ 64 KiB");
8192 out.extend_from_slice(&len.to_le_bytes());
8193 out.extend_from_slice(b);
8194 }
8195 // v7.10.9: TEXT[] dense body — [u16 count][per element:
8196 // u8 null flag + (when non-null) u16 len + utf-8 bytes].
8197 (Value::TextArray(items), DataType::TextArray) => {
8198 let count = u16::try_from(items.len()).expect("TEXT[] ≤ 65k elements");
8199 out.extend_from_slice(&count.to_le_bytes());
8200 for item in items {
8201 match item {
8202 None => out.push(1),
8203 Some(s) => {
8204 out.push(0);
8205 let len = u16::try_from(s.len()).expect("TEXT[] element ≤ 64 KiB");
8206 out.extend_from_slice(&len.to_le_bytes());
8207 out.extend_from_slice(s.as_bytes());
8208 }
8209 }
8210 }
8211 }
8212 // v7.11.12: INT[] dense body — [u16 count][per element:
8213 // u8 null + (when non-null) i32 LE].
8214 (Value::IntArray(items), DataType::IntArray) => {
8215 let count = u16::try_from(items.len()).expect("INT[] ≤ 65k elements");
8216 out.extend_from_slice(&count.to_le_bytes());
8217 for item in items {
8218 match item {
8219 None => out.push(1),
8220 Some(n) => {
8221 out.push(0);
8222 out.extend_from_slice(&n.to_le_bytes());
8223 }
8224 }
8225 }
8226 }
8227 // v7.11.12: BIGINT[] dense body — [u16 count][per element:
8228 // u8 null + (when non-null) i64 LE].
8229 (Value::BigIntArray(items), DataType::BigIntArray) => {
8230 let count = u16::try_from(items.len()).expect("BIGINT[] ≤ 65k elements");
8231 out.extend_from_slice(&count.to_le_bytes());
8232 for item in items {
8233 match item {
8234 None => out.push(1),
8235 Some(n) => {
8236 out.push(0);
8237 out.extend_from_slice(&n.to_le_bytes());
8238 }
8239 }
8240 }
8241 }
8242 // v7.12.0: tsvector dense body — see `value_body_encoded_len`
8243 // for layout. Lexemes are written in their already-sorted order.
8244 (Value::TsVector(lexs), DataType::TsVector) => write_tsvector_body(out, lexs),
8245 // v7.12.0: tsquery dense body — prefix-coded tree.
8246 (Value::TsQuery(ast), DataType::TsQuery) => write_tsquery_body(out, ast),
8247 // v7.17.0: UUID dense body — raw 16 bytes (RFC 4122 byte
8248 // order). No length prefix; the type's fixed width makes
8249 // the codec stateless.
8250 (Value::Uuid(b), DataType::Uuid) => out.extend_from_slice(&b[..]),
8251 // v7.17.0 Phase 3.P0-32: TIME dense body — i64 LE
8252 // microseconds since 00:00:00.
8253 (Value::Time(us), DataType::Time) => out.extend_from_slice(&us.to_le_bytes()),
8254 // v7.17.0 Phase 3.P0-33: YEAR dense body — u16 LE.
8255 (Value::Year(y), DataType::Year) => out.extend_from_slice(&y.to_le_bytes()),
8256 // v7.17.0 Phase 3.P0-34: TIMETZ dense body — i64 LE us +
8257 // i32 LE offset_secs.
8258 (Value::TimeTz { us, offset_secs }, DataType::TimeTz) => {
8259 out.extend_from_slice(&us.to_le_bytes());
8260 out.extend_from_slice(&offset_secs.to_le_bytes());
8261 }
8262 // v7.17.0 Phase 3.P0-35: MONEY dense body — i64 LE cents.
8263 (Value::Money(c), DataType::Money) => out.extend_from_slice(&c.to_le_bytes()),
8264 // v7.17.0 Phase 3.P0-38: range dense body — see
8265 // value_body_encoded_len for layout. `kind` is implicit
8266 // from the column DataType.
8267 (
8268 Value::Range {
8269 lower,
8270 upper,
8271 lower_inc,
8272 upper_inc,
8273 empty,
8274 ..
8275 },
8276 DataType::Range(_),
8277 ) => {
8278 let mut flags: u8 = 0;
8279 if *empty {
8280 flags |= 0b0000_0001;
8281 }
8282 if lower.is_some() {
8283 flags |= 0b0000_0010;
8284 }
8285 if upper.is_some() {
8286 flags |= 0b0000_0100;
8287 }
8288 if *lower_inc {
8289 flags |= 0b0000_1000;
8290 }
8291 if *upper_inc {
8292 flags |= 0b0001_0000;
8293 }
8294 out.push(flags);
8295 if let Some(l) = lower {
8296 write_value(out, l);
8297 }
8298 if let Some(u) = upper {
8299 write_value(out, u);
8300 }
8301 }
8302 // v7.17.0 Phase 3.P0-39: hstore dense body — same shape
8303 // as write_value_body for hstore (no leading tag — that
8304 // lives on the data type).
8305 (Value::Hstore(pairs), DataType::Hstore) => write_hstore_body(out, pairs),
8306 // v7.17.0 Phase 3.P0-40: 2D array dense body.
8307 (Value::IntArray2D(rows), DataType::IntArray2D) => write_int_2d_body(out, rows),
8308 (Value::BigIntArray2D(rows), DataType::BigIntArray2D) => write_bigint_2d_body(out, rows),
8309 (Value::TextArray2D(rows), DataType::TextArray2D) => write_text_2d_body(out, rows),
8310 // Type mismatch shouldn't happen — `Table::insert` validates
8311 // value type against column type before pushing. Treat as a
8312 // bug, not a runtime error.
8313 (other, ty) => unreachable!(
8314 "schema-driven encode received mismatched value/type pair: \
8315 value tag={:?}, column type={:?}",
8316 other.data_type(),
8317 ty
8318 ),
8319 }
8320}
8321
8322/// v7.17.0 Phase 3.P0-38 — length the schema-agnostic
8323/// `write_value` would emit for `v`. Used by the range codec to
8324/// pre-size cells. We mirror the tag-byte + body shape from
8325/// `write_value` rather than serialising to a temp Vec.
8326fn write_value_encoded_len(v: &Value) -> usize {
8327 match v {
8328 Value::Null => 1,
8329 Value::SmallInt(_) => 1 + 2,
8330 Value::Int(_) | Value::Date(_) => 1 + 4,
8331 Value::BigInt(_)
8332 | Value::Float(_)
8333 | Value::Timestamp(_)
8334 | Value::Time(_)
8335 | Value::Money(_) => 1 + 8,
8336 Value::Bool(_) => 1 + 1,
8337 Value::Year(_) => 1 + 2,
8338 Value::Text(s) | Value::Json(s) => 1 + 4 + s.len(),
8339 Value::Bytes(b) => 1 + 4 + b.len(),
8340 Value::Numeric { .. } => 1 + 16 + 1,
8341 Value::Uuid(_) => 1 + 16,
8342 Value::TimeTz { .. } => 1 + 12,
8343 Value::Hstore(pairs) => {
8344 let mut n = 1 + 4;
8345 for (k, v) in pairs {
8346 n += 4 + k.len() + 1;
8347 if let Some(val) = v {
8348 n += 4 + val.len();
8349 }
8350 }
8351 n
8352 }
8353 Value::IntArray2D(rows) => {
8354 let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8355 1 + 8 + rows.len() * cols * (1 + 4)
8356 }
8357 Value::BigIntArray2D(rows) => {
8358 let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8359 1 + 8 + rows.len() * cols * (1 + 8)
8360 }
8361 Value::TextArray2D(rows) => {
8362 let cols = rows.first().map(|r| r.len()).unwrap_or(0);
8363 let mut n = 1 + 8 + rows.len() * cols;
8364 for row in rows {
8365 for s in row.iter().flatten() {
8366 n += 4 + s.len();
8367 }
8368 }
8369 n
8370 }
8371 // Range-of-range and other nested cases — not currently
8372 // representable but defensively measured via the dense
8373 // body when the data_type is known.
8374 other => {
8375 let ty = other.data_type().unwrap_or(DataType::Int);
8376 1 + value_body_encoded_len(other, ty)
8377 }
8378 }
8379}
8380
8381fn write_value(out: &mut Vec<u8>, v: &Value) {
8382 match v {
8383 Value::Null => out.push(0),
8384 Value::SmallInt(n) => {
8385 out.push(7);
8386 out.extend_from_slice(&n.to_le_bytes());
8387 }
8388 Value::Int(n) => {
8389 out.push(1);
8390 out.extend_from_slice(&n.to_le_bytes());
8391 }
8392 Value::BigInt(n) => {
8393 out.push(2);
8394 out.extend_from_slice(&n.to_le_bytes());
8395 }
8396 Value::Float(x) => {
8397 out.push(3);
8398 out.extend_from_slice(&x.to_le_bytes());
8399 }
8400 // v4.9: JSON shares the tag-4 (Text) on-disk encoding —
8401 // schema decides which variant comes back on read. The
8402 // bodies are byte-identical so collapsing the match keeps
8403 // clippy::match_same_arms quiet.
8404 Value::Text(s) | Value::Json(s) => {
8405 out.push(4);
8406 write_str(out, s);
8407 }
8408 Value::Bool(b) => {
8409 out.push(5);
8410 out.push(u8::from(*b));
8411 }
8412 Value::Vector(v) => {
8413 out.push(6);
8414 let dim = u32::try_from(v.len()).expect("vector dim fits in u32");
8415 out.extend_from_slice(&dim.to_le_bytes());
8416 for x in v {
8417 out.extend_from_slice(&x.to_le_bytes());
8418 }
8419 }
8420 // v6.0.1: new tag 11 for an SQ8 cell carried with its full
8421 // header. Layout matches the dense row body shape so a
8422 // round-trip through write_value → read_value bit-equals
8423 // the original `Value::Sq8Vector`.
8424 Value::Sq8Vector(q) => {
8425 out.push(11);
8426 let dim = u32::try_from(q.bytes.len()).expect("vector dim fits in u32");
8427 out.extend_from_slice(&dim.to_le_bytes());
8428 out.extend_from_slice(&q.min.to_le_bytes());
8429 out.extend_from_slice(&q.max.to_le_bytes());
8430 out.extend_from_slice(&q.bytes);
8431 }
8432 // v6.0.3: tag 12 for a HalfVector cell.
8433 // Layout: `[u32 dim][u16 LE × dim]` — bit-identical to the
8434 // dense row body so `write_value` / `read_value` bit-equal
8435 // the original `Value::HalfVector`.
8436 Value::HalfVector(h) => {
8437 out.push(12);
8438 let dim = u32::try_from(h.dim()).expect("vector dim fits in u32");
8439 out.extend_from_slice(&dim.to_le_bytes());
8440 out.extend_from_slice(&h.bytes);
8441 }
8442 Value::Numeric { scaled, scale } => {
8443 out.push(8);
8444 out.extend_from_slice(&scaled.to_le_bytes());
8445 out.push(*scale);
8446 }
8447 Value::Date(d) => {
8448 out.push(9);
8449 out.extend_from_slice(&d.to_le_bytes());
8450 }
8451 Value::Timestamp(t) => {
8452 out.push(10);
8453 out.extend_from_slice(&t.to_le_bytes());
8454 }
8455 // Interval is a runtime-only value (no on-disk representation in
8456 // v2.11). CREATE TABLE rejects `DataType::Interval` columns, so a
8457 // Value::Interval here would mean the engine bypassed that gate.
8458 Value::Interval { .. } => {
8459 unreachable!(
8460 "Value::Interval has no on-disk encoding; engine must reject it before write"
8461 )
8462 }
8463 // v7.10.4: BYTEA — [u8 tag=13_b][u16 len][bytes]. Tag
8464 // distinct from Text (4) so the schema-agnostic
8465 // read_value path can disambiguate. (Tag 11 is taken by
8466 // the WAL `auto_commit_sql` shape elsewhere, hence 14.)
8467 Value::Bytes(b) => {
8468 out.push(14);
8469 let len = u16::try_from(b.len()).expect("BYTEA value ≤ 64 KiB");
8470 out.extend_from_slice(&len.to_le_bytes());
8471 out.extend_from_slice(b);
8472 }
8473 // v7.10.9: TEXT[] — [u8 tag=15][u16 count][per elem: u8
8474 // null + (if non-null) u16 len + utf-8 bytes].
8475 Value::TextArray(items) => {
8476 out.push(15);
8477 let count = u16::try_from(items.len()).expect("TEXT[] ≤ 65k elements");
8478 out.extend_from_slice(&count.to_le_bytes());
8479 for item in items {
8480 match item {
8481 None => out.push(1),
8482 Some(s) => {
8483 out.push(0);
8484 let len = u16::try_from(s.len()).expect("TEXT[] element ≤ 64 KiB");
8485 out.extend_from_slice(&len.to_le_bytes());
8486 out.extend_from_slice(s.as_bytes());
8487 }
8488 }
8489 }
8490 }
8491 // v7.11.12: INT[] — tag 16. [u16 count][per elem: u8 null +
8492 // (if non-null) i32 LE].
8493 Value::IntArray(items) => {
8494 out.push(16);
8495 let count = u16::try_from(items.len()).expect("INT[] ≤ 65k elements");
8496 out.extend_from_slice(&count.to_le_bytes());
8497 for item in items {
8498 match item {
8499 None => out.push(1),
8500 Some(n) => {
8501 out.push(0);
8502 out.extend_from_slice(&n.to_le_bytes());
8503 }
8504 }
8505 }
8506 }
8507 // v7.11.12: BIGINT[] — tag 17. [u16 count][per elem: u8 null +
8508 // (if non-null) i64 LE].
8509 Value::BigIntArray(items) => {
8510 out.push(17);
8511 let count = u16::try_from(items.len()).expect("BIGINT[] ≤ 65k elements");
8512 out.extend_from_slice(&count.to_le_bytes());
8513 for item in items {
8514 match item {
8515 None => out.push(1),
8516 Some(n) => {
8517 out.push(0);
8518 out.extend_from_slice(&n.to_le_bytes());
8519 }
8520 }
8521 }
8522 }
8523 // v7.12.0: tsvector — tag 18. Body shape matches
8524 // `write_tsvector_body`.
8525 Value::TsVector(lexs) => {
8526 out.push(18);
8527 write_tsvector_body(out, lexs);
8528 }
8529 // v7.12.0: tsquery — tag 19. Body shape matches
8530 // `write_tsquery_body`.
8531 Value::TsQuery(ast) => {
8532 out.push(19);
8533 write_tsquery_body(out, ast);
8534 }
8535 // v7.17.0: UUID — tag 20. Body = raw 16 bytes (RFC 4122
8536 // byte order).
8537 Value::Uuid(b) => {
8538 out.push(20);
8539 out.extend_from_slice(&b[..]);
8540 }
8541 // v7.17.0 Phase 3.P0-32: TIME — tag 21. Body = i64 LE
8542 // microseconds since 00:00:00.
8543 Value::Time(us) => {
8544 out.push(21);
8545 out.extend_from_slice(&us.to_le_bytes());
8546 }
8547 // v7.17.0 Phase 3.P0-33: YEAR — tag 22. Body = u16 LE.
8548 Value::Year(y) => {
8549 out.push(22);
8550 out.extend_from_slice(&y.to_le_bytes());
8551 }
8552 // v7.17.0 Phase 3.P0-34: TIMETZ — tag 23. Body = i64 LE
8553 // us + i32 LE offset_secs.
8554 Value::TimeTz { us, offset_secs } => {
8555 out.push(23);
8556 out.extend_from_slice(&us.to_le_bytes());
8557 out.extend_from_slice(&offset_secs.to_le_bytes());
8558 }
8559 // v7.17.0 Phase 3.P0-35: MONEY — tag 24. Body = i64 LE cents.
8560 Value::Money(c) => {
8561 out.push(24);
8562 out.extend_from_slice(&c.to_le_bytes());
8563 }
8564 // v7.17.0 Phase 3.P0-38: range — tag 25. Body =
8565 // [u8 RangeKind tag][u8 flags][if lower: write_value(lower)]
8566 // [if upper: write_value(upper)].
8567 Value::Range {
8568 kind,
8569 lower,
8570 upper,
8571 lower_inc,
8572 upper_inc,
8573 empty,
8574 } => {
8575 out.push(25);
8576 out.push(kind.tag());
8577 let mut flags: u8 = 0;
8578 if *empty {
8579 flags |= 0b0000_0001;
8580 }
8581 if lower.is_some() {
8582 flags |= 0b0000_0010;
8583 }
8584 if upper.is_some() {
8585 flags |= 0b0000_0100;
8586 }
8587 if *lower_inc {
8588 flags |= 0b0000_1000;
8589 }
8590 if *upper_inc {
8591 flags |= 0b0001_0000;
8592 }
8593 out.push(flags);
8594 if let Some(l) = lower {
8595 write_value(out, l);
8596 }
8597 if let Some(u) = upper {
8598 write_value(out, u);
8599 }
8600 }
8601 // v7.17.0 Phase 3.P0-39: hstore — tag 26. Body =
8602 // [u32 count] then per pair `[u32 klen][k bytes][u8 has_val]
8603 // [if has_val: u32 vlen][v bytes]`.
8604 Value::Hstore(pairs) => {
8605 out.push(26);
8606 write_hstore_body(out, pairs);
8607 }
8608 // v7.17.0 Phase 3.P0-40: 2D arrays — tag 27/28/29.
8609 Value::IntArray2D(rows) => {
8610 out.push(27);
8611 write_int_2d_body(out, rows);
8612 }
8613 Value::BigIntArray2D(rows) => {
8614 out.push(28);
8615 write_bigint_2d_body(out, rows);
8616 }
8617 Value::TextArray2D(rows) => {
8618 out.push(29);
8619 write_text_2d_body(out, rows);
8620 }
8621 }
8622}
8623
/// v7.17.0 Phase 3.P0-40 — body encoder shared by every 2D INT writer.
///
/// Emits `[u32 nrows][u32 ncols]` (both little-endian) followed by the
/// cells in row-major order. A cell is `[1]` for NULL or `[0][i32 LE]`
/// otherwise. `ncols` is taken from the first row (0 when there are no
/// rows); every cell of every row is written as-is.
///
/// # Panics
/// Panics if the row count or the first row's length exceeds `u32::MAX`.
fn write_int_2d_body(out: &mut Vec<u8>, rows: &[Vec<Option<i32>>]) {
    let row_count = u32::try_from(rows.len()).expect("≤ 4G rows");
    let col_count = u32::try_from(rows.first().map_or(0, Vec::len)).expect("≤ 4G cols");
    out.extend_from_slice(&row_count.to_le_bytes());
    out.extend_from_slice(&col_count.to_le_bytes());
    for cell in rows.iter().flatten() {
        if let Some(n) = cell {
            out.push(0);
            out.extend_from_slice(&n.to_le_bytes());
        } else {
            out.push(1);
        }
    }
}
8642
/// v7.17.0 Phase 3.P0-40 — body encoder shared by every 2D BIGINT writer.
///
/// Emits `[u32 nrows][u32 ncols]` (both little-endian) followed by the
/// cells in row-major order. A cell is `[1]` for NULL or `[0][i64 LE]`
/// otherwise. `ncols` is taken from the first row (0 when there are no
/// rows); every cell of every row is written as-is.
///
/// # Panics
/// Panics if the row count or the first row's length exceeds `u32::MAX`.
fn write_bigint_2d_body(out: &mut Vec<u8>, rows: &[Vec<Option<i64>>]) {
    let row_count = u32::try_from(rows.len()).expect("≤ 4G rows");
    let col_count = u32::try_from(rows.first().map_or(0, Vec::len)).expect("≤ 4G cols");
    out.extend_from_slice(&row_count.to_le_bytes());
    out.extend_from_slice(&col_count.to_le_bytes());
    for cell in rows.iter().flatten() {
        if let Some(n) = cell {
            out.push(0);
            out.extend_from_slice(&n.to_le_bytes());
        } else {
            out.push(1);
        }
    }
}
8661
/// v7.17.0 Phase 3.P0-40 — body encoder shared by every 2D TEXT writer.
///
/// Emits `[u32 nrows][u32 ncols]` (both little-endian) followed by the
/// cells in row-major order. A cell is `[1]` for NULL or
/// `[0][u32 LE len][utf-8 bytes]` otherwise. `ncols` is taken from the
/// first row (0 when there are no rows).
///
/// # Panics
/// Panics if the row count, the first row's length, or any cell's byte
/// length exceeds `u32::MAX`.
fn write_text_2d_body(out: &mut Vec<u8>, rows: &[Vec<Option<String>>]) {
    let row_count = u32::try_from(rows.len()).expect("≤ 4G rows");
    let col_count = u32::try_from(rows.first().map_or(0, Vec::len)).expect("≤ 4G cols");
    out.extend_from_slice(&row_count.to_le_bytes());
    out.extend_from_slice(&col_count.to_le_bytes());
    for cell in rows.iter().flatten() {
        if let Some(text) = cell {
            out.push(0);
            let byte_len = u32::try_from(text.len()).expect("≤ 4 GiB cell");
            out.extend_from_slice(&byte_len.to_le_bytes());
            out.extend_from_slice(text.as_bytes());
        } else {
            out.push(1);
        }
    }
}
8683
/// v7.17.0 Phase 3.P0-39 — body encoder shared by the hstore writers.
///
/// Emits `[u32 count]`, then for each pair
/// `[u32 klen][key bytes][u8 has_val]` followed by
/// `[u32 vlen][value bytes]` only when the value is present
/// (`has_val == 1`). All integers are little-endian.
///
/// # Panics
/// Panics if the pair count or any key/value byte length exceeds
/// `u32::MAX`.
fn write_hstore_body(out: &mut Vec<u8>, pairs: &[(String, Option<String>)]) {
    let pair_count = u32::try_from(pairs.len()).expect("hstore ≤ u32::MAX pairs");
    out.extend_from_slice(&pair_count.to_le_bytes());
    for (key, value) in pairs {
        let key_len = u32::try_from(key.len()).expect("hstore key ≤ 4 GiB");
        out.extend_from_slice(&key_len.to_le_bytes());
        out.extend_from_slice(key.as_bytes());
        if let Some(text) = value {
            out.push(1);
            let text_len = u32::try_from(text.len()).expect("hstore val ≤ 4 GiB");
            out.extend_from_slice(&text_len.to_le_bytes());
            out.extend_from_slice(text.as_bytes());
        } else {
            out.push(0);
        }
    }
}
8703
8704/// v7.12.0: shared tsvector body writer (used by both dense and
8705/// schema-agnostic codecs).
8706fn write_tsvector_body(out: &mut Vec<u8>, lexs: &[TsLexeme]) {
8707 let count = u16::try_from(lexs.len()).expect("tsvector ≤ 65k lexemes");
8708 out.extend_from_slice(&count.to_le_bytes());
8709 for l in lexs {
8710 let wlen = u16::try_from(l.word.len()).expect("tsvector word ≤ 64 KiB");
8711 out.extend_from_slice(&wlen.to_le_bytes());
8712 out.extend_from_slice(l.word.as_bytes());
8713 let plen = u16::try_from(l.positions.len()).expect("tsvector pos count ≤ 65k");
8714 out.extend_from_slice(&plen.to_le_bytes());
8715 for p in &l.positions {
8716 out.extend_from_slice(&p.to_le_bytes());
8717 }
8718 out.push(l.weight);
8719 }
8720}
8721
8722/// v7.12.0: shared tsquery body writer. Prefix-coded tree: each
8723/// node starts with `[u8 tag]` then a tag-specific payload. Tags:
8724/// 0=Term, 1=And, 2=Or, 3=Not, 4=Phrase.
8725fn write_tsquery_body(out: &mut Vec<u8>, ast: &TsQueryAst) {
8726 match ast {
8727 TsQueryAst::Term { word, weight_mask } => {
8728 out.push(0);
8729 let len = u16::try_from(word.len()).expect("tsquery term ≤ 64 KiB");
8730 out.extend_from_slice(&len.to_le_bytes());
8731 out.extend_from_slice(word.as_bytes());
8732 out.push(*weight_mask);
8733 }
8734 TsQueryAst::And(a, b) => {
8735 out.push(1);
8736 write_tsquery_body(out, a);
8737 write_tsquery_body(out, b);
8738 }
8739 TsQueryAst::Or(a, b) => {
8740 out.push(2);
8741 write_tsquery_body(out, a);
8742 write_tsquery_body(out, b);
8743 }
8744 TsQueryAst::Not(x) => {
8745 out.push(3);
8746 write_tsquery_body(out, x);
8747 }
8748 TsQueryAst::Phrase {
8749 left,
8750 right,
8751 distance,
8752 } => {
8753 out.push(4);
8754 out.extend_from_slice(&distance.to_le_bytes());
8755 write_tsquery_body(out, left);
8756 write_tsquery_body(out, right);
8757 }
8758 }
8759}
8760
8761/// v7.12.0: byte length that `write_tsquery_body` would emit.
8762fn tsquery_encoded_len(ast: &TsQueryAst) -> usize {
8763 match ast {
8764 TsQueryAst::Term { word, .. } => 1 + 2 + word.len() + 1,
8765 TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
8766 1 + tsquery_encoded_len(a) + tsquery_encoded_len(b)
8767 }
8768 TsQueryAst::Not(x) => 1 + tsquery_encoded_len(x),
8769 TsQueryAst::Phrase { left, right, .. } => {
8770 1 + 2 + tsquery_encoded_len(left) + tsquery_encoded_len(right)
8771 }
8772 }
8773}
8774
/// Append `n` to `out` as two little-endian bytes.
fn write_u16(out: &mut Vec<u8>, n: u16) {
    let [lo, hi] = n.to_le_bytes();
    out.push(lo);
    out.push(hi);
}
/// Append `n` to `out` as four little-endian bytes.
fn write_u32(out: &mut Vec<u8>, n: u32) {
    out.extend(n.to_le_bytes());
}
8781/// v7.23 (mailrs round-14) — sentinel for the escape form of the
8782/// short-string codec: a u16 length of `0xFFFF` means "the REAL
8783/// length follows as a u32". Strings of length `>= 0xFFFF` take the
8784/// escape form (including exactly 65 535, so the sentinel is
8785/// unambiguous within v46+ payloads); shorter strings keep the
8786/// 2-byte header — zero overhead for identifiers and typical text.
8787/// Pre-v46 catalogs (and pre-V3 segments) may legitimately contain
8788/// a plain length of 0xFFFF, so DECODING is gated on the container
8789/// version (`Cursor::long_strings`); encoding always emits the v46
8790/// form because every new container carries the new version mark.
8791const STR_LEN_ESCAPE: u16 = u16::MAX;
8792
8793fn write_str(out: &mut Vec<u8>, s: &str) {
8794 if s.len() >= STR_LEN_ESCAPE as usize {
8795 // Real mail bodies / document text routinely exceed 64 KiB
8796 // (mailrs round-14: the old `fits in u16` expect PANICKED —
8797 // after the INSERT was acknowledged — at the next snapshot
8798 // encode).
8799 let len = u32::try_from(s.len()).expect("text fits in u32 (4 GiB cap)");
8800 write_u16(out, STR_LEN_ESCAPE);
8801 write_u32(out, len);
8802 } else {
8803 write_u16(out, s.len() as u16);
8804 }
8805 out.extend_from_slice(s.as_bytes());
8806}
8807
8808/// v7.12.4 — long-string variant: `[u32 LE len][bytes]`. For
8809/// payloads that can plausibly exceed 64 KiB (notably PL/pgSQL
8810/// function bodies). Identifiers + short text continue to use
8811/// the u16 [`write_str`] codec.
8812fn write_str_long(out: &mut Vec<u8>, s: &str) {
8813 let len = u32::try_from(s.len()).expect("function body fits in u32");
8814 write_u32(out, len);
8815 out.extend_from_slice(s.as_bytes());
8816}
8817
8818/// Serialise an [`IndexKey`] using the v9 tagged codec. `read_index_key`
8819/// is the inverse. v8 catalogs never wrote index keys (`BTree` entries were
8820/// rebuilt from `Table::rows`), so this codec is v9+ only.
8821fn write_index_key(out: &mut Vec<u8>, key: &IndexKey) {
8822 match key {
8823 IndexKey::Int(n) => {
8824 out.push(INDEX_KEY_TAG_INT);
8825 out.extend_from_slice(&n.to_le_bytes());
8826 }
8827 IndexKey::Text(s) => {
8828 out.push(INDEX_KEY_TAG_TEXT);
8829 write_str(out, s);
8830 }
8831 IndexKey::Bool(b) => {
8832 out.push(INDEX_KEY_TAG_BOOL);
8833 out.push(u8::from(*b));
8834 }
8835 IndexKey::Uuid(b) => {
8836 out.push(INDEX_KEY_TAG_UUID);
8837 out.extend_from_slice(&b[..]);
8838 }
8839 }
8840}
8841
/// Forward-only reader over an encoded byte buffer.
///
/// The `read_*` helpers on this type pull their bytes through
/// `Cursor::take`, which bounds-checks against `buf` and advances
/// `pos`; decoding failures surface as `StorageError::Corrupt`.
struct Cursor<'a> {
    /// The encoded input being decoded; returned slices borrow from it.
    buf: &'a [u8],
    /// Offset into `buf` of the next unread byte.
    pos: usize,
    /// v7.23 (round-14) — true when the container declares the v46+
    /// string codec (catalog `FILE_VERSION >= 46` / segment magic
    /// V3): a u16 length of [`STR_LEN_ESCAPE`] escapes to a u32 real
    /// length. False for older containers, where 0xFFFF is a
    /// legitimate plain length.
    long_strings: bool,
}
8852
8853impl<'a> Cursor<'a> {
8854 const fn new(buf: &'a [u8]) -> Self {
8855 Self {
8856 buf,
8857 pos: 0,
8858 long_strings: false,
8859 }
8860 }
8861
8862 /// v7.23 — builder for version-gated string decoding.
8863 const fn with_long_strings(mut self, on: bool) -> Self {
8864 self.long_strings = on;
8865 self
8866 }
8867
8868 fn take(&mut self, n: usize) -> Result<&'a [u8], StorageError> {
8869 let end = self
8870 .pos
8871 .checked_add(n)
8872 .ok_or_else(|| StorageError::Corrupt(format!("length overflow taking {n} bytes")))?;
8873 if end > self.buf.len() {
8874 return Err(StorageError::Corrupt(format!(
8875 "unexpected EOF at offset {} (wanted {n} more bytes)",
8876 self.pos
8877 )));
8878 }
8879 let s = &self.buf[self.pos..end];
8880 self.pos = end;
8881 Ok(s)
8882 }
8883
8884 fn read_u8(&mut self) -> Result<u8, StorageError> {
8885 Ok(self.take(1)?[0])
8886 }
8887 fn read_u16(&mut self) -> Result<u16, StorageError> {
8888 let s = self.take(2)?;
8889 Ok(u16::from_le_bytes([s[0], s[1]]))
8890 }
8891 fn read_u32(&mut self) -> Result<u32, StorageError> {
8892 let s = self.take(4)?;
8893 Ok(u32::from_le_bytes([s[0], s[1], s[2], s[3]]))
8894 }
8895 fn read_i32(&mut self) -> Result<i32, StorageError> {
8896 let s = self.take(4)?;
8897 Ok(i32::from_le_bytes([s[0], s[1], s[2], s[3]]))
8898 }
8899 /// v6.7.2 — u64 LE read for the per-table `hot_tier_bytes`
8900 /// catalog appendix.
8901 fn read_u64(&mut self) -> Result<u64, StorageError> {
8902 let s = self.take(8)?;
8903 Ok(u64::from_le_bytes([
8904 s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7],
8905 ]))
8906 }
8907 fn read_i64(&mut self) -> Result<i64, StorageError> {
8908 let s = self.take(8)?;
8909 let arr: [u8; 8] = s.try_into().expect("checked");
8910 Ok(i64::from_le_bytes(arr))
8911 }
8912 fn read_f64(&mut self) -> Result<f64, StorageError> {
8913 let s = self.take(8)?;
8914 let arr: [u8; 8] = s.try_into().expect("checked");
8915 Ok(f64::from_le_bytes(arr))
8916 }
8917 fn read_f32(&mut self) -> Result<f32, StorageError> {
8918 let s = self.take(4)?;
8919 Ok(f32::from_le_bytes([s[0], s[1], s[2], s[3]]))
8920 }
8921 fn read_str(&mut self) -> Result<String, StorageError> {
8922 let short = self.read_u16()?;
8923 let len = if self.long_strings && short == STR_LEN_ESCAPE {
8924 // v7.23 escape form — real length follows as u32.
8925 self.read_u32()? as usize
8926 } else {
8927 short as usize
8928 };
8929 let bytes = self.take(len)?;
8930 core::str::from_utf8(bytes)
8931 .map(String::from)
8932 .map_err(|_| StorageError::Corrupt("invalid UTF-8 in identifier or text".into()))
8933 }
8934
8935 /// v7.12.4 — long-string variant for payloads written via
8936 /// [`write_str_long`] (u32-length prefix). Used for PL/pgSQL
8937 /// function bodies which can plausibly exceed 64 KiB.
8938 fn read_str_long(&mut self) -> Result<String, StorageError> {
8939 let len = self.read_u32()? as usize;
8940 let bytes = self.take(len)?;
8941 core::str::from_utf8(bytes)
8942 .map(String::from)
8943 .map_err(|_| StorageError::Corrupt("invalid UTF-8 in long-string payload".into()))
8944 }
8945
8946 /// Parse an [`IndexKey`] emitted by `write_index_key` (v9 tagged
8947 /// codec). Returns `StorageError::Corrupt` on unknown tag or
8948 /// truncated payload.
8949 fn read_index_key(&mut self) -> Result<IndexKey, StorageError> {
8950 let tag = self.read_u8()?;
8951 match tag {
8952 INDEX_KEY_TAG_INT => Ok(IndexKey::Int(self.read_i64()?)),
8953 INDEX_KEY_TAG_TEXT => Ok(IndexKey::Text(self.read_str()?)),
8954 INDEX_KEY_TAG_BOOL => Ok(IndexKey::Bool(self.read_u8()? != 0)),
8955 INDEX_KEY_TAG_UUID => {
8956 let s = self.take(16)?;
8957 let mut b = [0u8; 16];
8958 b.copy_from_slice(s);
8959 Ok(IndexKey::Uuid(b))
8960 }
8961 other => Err(StorageError::Corrupt(format!(
8962 "unknown index key tag: {other}"
8963 ))),
8964 }
8965 }
8966 /// Schema-driven dense value decode (`FILE_VERSION` 8). Caller has
8967 /// already cleared the NULL bit from the row bitmap; we read the
8968 /// fixed-width body for the given column type. Used inside the row
8969 /// hot loop; column defaults still go through `read_value` (which
8970 /// reads its own type tag) so DEFAULT round-trips without a schema.
8971 fn read_value_body(&mut self, ty: DataType) -> Result<Value, StorageError> {
8972 match ty {
8973 DataType::SmallInt => {
8974 let s = self.take(2)?;
8975 Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
8976 }
8977 DataType::Int => Ok(Value::Int(self.read_i32()?)),
8978 DataType::BigInt => Ok(Value::BigInt(self.read_i64()?)),
8979 DataType::Float => Ok(Value::Float(self.read_f64()?)),
8980 DataType::Bool => Ok(Value::Bool(self.read_u8()? != 0)),
8981 DataType::Text | DataType::Varchar(_) | DataType::Char(_) => {
8982 Ok(Value::Text(self.read_str()?))
8983 }
8984 DataType::Vector {
8985 encoding: VecEncoding::F32,
8986 ..
8987 } => {
8988 let dim = self.read_u32()? as usize;
8989 let mut v = Vec::with_capacity(dim);
8990 for _ in 0..dim {
8991 let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
8992 v.push(f32::from_le_bytes(bytes));
8993 }
8994 Ok(Value::Vector(v))
8995 }
8996 DataType::Vector {
8997 encoding: VecEncoding::Sq8,
8998 ..
8999 } => {
9000 let dim = self.read_u32()? as usize;
9001 let min = self.read_f32()?;
9002 let max = self.read_f32()?;
9003 let bytes = self.take(dim)?.to_vec();
9004 Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
9005 }
9006 DataType::Vector {
9007 encoding: VecEncoding::F16,
9008 ..
9009 } => {
9010 let dim = self.read_u32()? as usize;
9011 let bytes = self.take(dim * 2)?.to_vec();
9012 Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
9013 }
9014 DataType::Numeric { .. } => {
9015 let s = self.take(16)?;
9016 let arr: [u8; 16] = s.try_into().expect("checked");
9017 let scaled = i128::from_le_bytes(arr);
9018 let scale = self.read_u8()?;
9019 Ok(Value::Numeric { scaled, scale })
9020 }
9021 DataType::Date => Ok(Value::Date(self.read_i32()?)),
9022 DataType::Timestamp => Ok(Value::Timestamp(self.read_i64()?)),
9023 DataType::Timestamptz => Ok(Value::Timestamp(self.read_i64()?)),
9024 DataType::Jsonb => Ok(Value::Json(self.read_str()?)),
9025 DataType::Interval => {
9026 // Defensive — schema gate (CREATE TABLE rejects Interval
9027 // columns) means this branch can't be hit through normal
9028 // flow; reject corrupt files explicitly rather than
9029 // panic.
9030 Err(StorageError::Corrupt(
9031 "INTERVAL column found on disk — runtime-only type, v3.0.2 rejects it".into(),
9032 ))
9033 }
9034 DataType::Json => Ok(Value::Json(self.read_str()?)),
9035 // v7.10.4: BYTEA on-disk is [u16 len][bytes]. Same wire
9036 // shape as Text, but read as raw Vec<u8>.
9037 DataType::Bytes => {
9038 let len = self.read_u16()? as usize;
9039 let bytes = self.take(len)?.to_vec();
9040 Ok(Value::Bytes(bytes))
9041 }
9042 // v7.10.9: TEXT[] dense body.
9043 DataType::TextArray => {
9044 let count = self.read_u16()? as usize;
9045 let mut items: Vec<Option<String>> = Vec::with_capacity(count);
9046 for _ in 0..count {
9047 match self.read_u8()? {
9048 0 => items.push(Some(self.read_str()?)),
9049 1 => items.push(None),
9050 other => {
9051 return Err(StorageError::Corrupt(format!(
9052 "TEXT[] null flag: unknown byte {other}"
9053 )));
9054 }
9055 }
9056 }
9057 Ok(Value::TextArray(items))
9058 }
9059 // v7.11.12: INT[] dense body.
9060 DataType::IntArray => {
9061 let count = self.read_u16()? as usize;
9062 let mut items: Vec<Option<i32>> = Vec::with_capacity(count);
9063 for _ in 0..count {
9064 match self.read_u8()? {
9065 0 => items.push(Some(self.read_i32()?)),
9066 1 => items.push(None),
9067 other => {
9068 return Err(StorageError::Corrupt(format!(
9069 "INT[] null flag: unknown byte {other}"
9070 )));
9071 }
9072 }
9073 }
9074 Ok(Value::IntArray(items))
9075 }
9076 // v7.11.12: BIGINT[] dense body.
9077 DataType::BigIntArray => {
9078 let count = self.read_u16()? as usize;
9079 let mut items: Vec<Option<i64>> = Vec::with_capacity(count);
9080 for _ in 0..count {
9081 match self.read_u8()? {
9082 0 => items.push(Some(self.read_i64()?)),
9083 1 => items.push(None),
9084 other => {
9085 return Err(StorageError::Corrupt(format!(
9086 "BIGINT[] null flag: unknown byte {other}"
9087 )));
9088 }
9089 }
9090 }
9091 Ok(Value::BigIntArray(items))
9092 }
9093 // v7.12.0: tsvector dense body — [u16 lex_count]
9094 // [per lex: u16 word_len + utf-8 word + u16 pos_count
9095 // + (u16 LE * pos_count) + u8 weight].
9096 DataType::TsVector => Ok(Value::TsVector(self.read_tsvector_body()?)),
9097 DataType::TsQuery => Ok(Value::TsQuery(self.read_tsquery_body()?)),
9098 // v7.17.0: UUID dense body — raw 16 bytes.
9099 DataType::Uuid => {
9100 let s = self.take(16)?;
9101 let mut b = [0u8; 16];
9102 b.copy_from_slice(s);
9103 Ok(Value::Uuid(b))
9104 }
9105 // v7.17.0 Phase 3.P0-32: TIME dense body — i64 LE.
9106 DataType::Time => Ok(Value::Time(self.read_i64()?)),
9107 // v7.17.0 Phase 3.P0-33: YEAR dense body — u16 LE.
9108 DataType::Year => Ok(Value::Year(self.read_u16()?)),
9109 // v7.17.0 Phase 3.P0-34: TIMETZ dense body —
9110 // i64 LE us + i32 LE offset_secs.
9111 DataType::TimeTz => {
9112 let us = self.read_i64()?;
9113 let offset_secs = self.read_i32()?;
9114 Ok(Value::TimeTz { us, offset_secs })
9115 }
9116 // v7.17.0 Phase 3.P0-35: MONEY dense body — i64 LE cents.
9117 DataType::Money => Ok(Value::Money(self.read_i64()?)),
9118 // v7.17.0 Phase 3.P0-39: hstore dense body. Body
9119 // shape == read_hstore_body.
9120 DataType::Hstore => Ok(Value::Hstore(self.read_hstore_body()?)),
9121 // v7.17.0 Phase 3.P0-40: 2D arrays dense body.
9122 DataType::IntArray2D => Ok(Value::IntArray2D(self.read_int_2d_body()?)),
9123 DataType::BigIntArray2D => Ok(Value::BigIntArray2D(self.read_bigint_2d_body()?)),
9124 DataType::TextArray2D => Ok(Value::TextArray2D(self.read_text_2d_body()?)),
9125 // v7.17.0 Phase 3.P0-38: range dense body. Element
9126 // type is determined by the surrounding RangeKind.
9127 DataType::Range(kind) => {
9128 let flags = self.read_u8()?;
9129 let empty = flags & 0b0000_0001 != 0;
9130 let has_lower = flags & 0b0000_0010 != 0;
9131 let has_upper = flags & 0b0000_0100 != 0;
9132 let lower_inc = flags & 0b0000_1000 != 0;
9133 let upper_inc = flags & 0b0001_0000 != 0;
9134 let lower = if has_lower {
9135 Some(alloc::boxed::Box::new(self.read_value()?))
9136 } else {
9137 None
9138 };
9139 let upper = if has_upper {
9140 Some(alloc::boxed::Box::new(self.read_value()?))
9141 } else {
9142 None
9143 };
9144 Ok(Value::Range {
9145 kind,
9146 lower,
9147 upper,
9148 lower_inc,
9149 upper_inc,
9150 empty,
9151 })
9152 }
9153 }
9154 }
9155
9156 /// v7.17.0 Phase 3.P0-40 — read a 2D INT array body emitted
9157 /// by `write_int_2d_body`.
9158 fn read_int_2d_body(&mut self) -> Result<Vec<Vec<Option<i32>>>, StorageError> {
9159 let nrows = self.read_u32()? as usize;
9160 let ncols = self.read_u32()? as usize;
9161 let mut rows = Vec::with_capacity(nrows);
9162 for _ in 0..nrows {
9163 let mut row = Vec::with_capacity(ncols);
9164 for _ in 0..ncols {
9165 let null = self.read_u8()?;
9166 row.push(if null == 1 {
9167 None
9168 } else {
9169 Some(self.read_i32()?)
9170 });
9171 }
9172 rows.push(row);
9173 }
9174 Ok(rows)
9175 }
9176
9177 /// v7.17.0 Phase 3.P0-40 — read a 2D BIGINT array body.
9178 fn read_bigint_2d_body(&mut self) -> Result<Vec<Vec<Option<i64>>>, StorageError> {
9179 let nrows = self.read_u32()? as usize;
9180 let ncols = self.read_u32()? as usize;
9181 let mut rows = Vec::with_capacity(nrows);
9182 for _ in 0..nrows {
9183 let mut row = Vec::with_capacity(ncols);
9184 for _ in 0..ncols {
9185 let null = self.read_u8()?;
9186 row.push(if null == 1 {
9187 None
9188 } else {
9189 Some(self.read_i64()?)
9190 });
9191 }
9192 rows.push(row);
9193 }
9194 Ok(rows)
9195 }
9196
9197 /// v7.17.0 Phase 3.P0-40 — read a 2D TEXT array body. Each
9198 /// cell is `[u8 null_flag][if non-null: u32 len + utf-8 bytes]`.
9199 fn read_text_2d_body(&mut self) -> Result<Vec<Vec<Option<String>>>, StorageError> {
9200 let nrows = self.read_u32()? as usize;
9201 let ncols = self.read_u32()? as usize;
9202 let mut rows = Vec::with_capacity(nrows);
9203 for _ in 0..nrows {
9204 let mut row = Vec::with_capacity(ncols);
9205 for _ in 0..ncols {
9206 let null = self.read_u8()?;
9207 if null == 1 {
9208 row.push(None);
9209 } else {
9210 let l = self.read_u32()? as usize;
9211 let bytes = self.take(l)?.to_vec();
9212 let s = String::from_utf8(bytes).map_err(|_| {
9213 StorageError::Corrupt("2D TEXT cell is not valid UTF-8".into())
9214 })?;
9215 row.push(Some(s));
9216 }
9217 }
9218 rows.push(row);
9219 }
9220 Ok(rows)
9221 }
9222
9223 /// v7.17.0 Phase 3.P0-39 — read a hstore body emitted by
9224 /// `write_hstore_body`.
9225 fn read_hstore_body(&mut self) -> Result<Vec<(String, Option<String>)>, StorageError> {
9226 let count = self.read_u32()? as usize;
9227 let mut out = Vec::with_capacity(count);
9228 for _ in 0..count {
9229 let klen = self.read_u32()? as usize;
9230 let k_bytes = self.take(klen)?.to_vec();
9231 let k = String::from_utf8(k_bytes)
9232 .map_err(|_| StorageError::Corrupt("hstore key is not valid UTF-8".into()))?;
9233 let has_val = self.read_u8()? != 0;
9234 let v =
9235 if has_val {
9236 let vlen = self.read_u32()? as usize;
9237 let v_bytes = self.take(vlen)?.to_vec();
9238 Some(String::from_utf8(v_bytes).map_err(|_| {
9239 StorageError::Corrupt("hstore value is not valid UTF-8".into())
9240 })?)
9241 } else {
9242 None
9243 };
9244 out.push((k, v));
9245 }
9246 Ok(out)
9247 }
9248
9249 /// v7.12.0 — read a tsvector body emitted by `write_tsvector_body`.
9250 fn read_tsvector_body(&mut self) -> Result<Vec<TsLexeme>, StorageError> {
9251 let count = self.read_u16()? as usize;
9252 let mut out = Vec::with_capacity(count);
9253 for _ in 0..count {
9254 let word = self.read_str()?;
9255 let pos_count = self.read_u16()? as usize;
9256 let mut positions = Vec::with_capacity(pos_count);
9257 for _ in 0..pos_count {
9258 positions.push(self.read_u16()?);
9259 }
9260 let weight = self.read_u8()?;
9261 out.push(TsLexeme {
9262 word,
9263 positions,
9264 weight,
9265 });
9266 }
9267 Ok(out)
9268 }
9269
9270 /// v7.12.0 — read a tsquery body emitted by `write_tsquery_body`.
9271 fn read_tsquery_body(&mut self) -> Result<TsQueryAst, StorageError> {
9272 let tag = self.read_u8()?;
9273 match tag {
9274 0 => {
9275 let word = self.read_str()?;
9276 let weight_mask = self.read_u8()?;
9277 Ok(TsQueryAst::Term { word, weight_mask })
9278 }
9279 1 => {
9280 let a = self.read_tsquery_body()?;
9281 let b = self.read_tsquery_body()?;
9282 Ok(TsQueryAst::And(Box::new(a), Box::new(b)))
9283 }
9284 2 => {
9285 let a = self.read_tsquery_body()?;
9286 let b = self.read_tsquery_body()?;
9287 Ok(TsQueryAst::Or(Box::new(a), Box::new(b)))
9288 }
9289 3 => {
9290 let x = self.read_tsquery_body()?;
9291 Ok(TsQueryAst::Not(Box::new(x)))
9292 }
9293 4 => {
9294 let distance = self.read_u16()?;
9295 let left = self.read_tsquery_body()?;
9296 let right = self.read_tsquery_body()?;
9297 Ok(TsQueryAst::Phrase {
9298 left: Box::new(left),
9299 right: Box::new(right),
9300 distance,
9301 })
9302 }
9303 other => Err(StorageError::Corrupt(format!(
9304 "tsquery: unknown node tag {other}"
9305 ))),
9306 }
9307 }
9308
9309 fn read_value(&mut self) -> Result<Value, StorageError> {
9310 let tag = self.read_u8()?;
9311 match tag {
9312 0 => Ok(Value::Null),
9313 1 => Ok(Value::Int(self.read_i32()?)),
9314 2 => Ok(Value::BigInt(self.read_i64()?)),
9315 3 => Ok(Value::Float(self.read_f64()?)),
9316 4 => Ok(Value::Text(self.read_str()?)),
9317 5 => Ok(Value::Bool(self.read_u8()? != 0)),
9318 6 => {
9319 let dim = self.read_u32()? as usize;
9320 let mut v = Vec::with_capacity(dim);
9321 for _ in 0..dim {
9322 let bytes: [u8; 4] = self.take(4)?.try_into().expect("checked");
9323 v.push(f32::from_le_bytes(bytes));
9324 }
9325 Ok(Value::Vector(v))
9326 }
9327 7 => {
9328 let s = self.take(2)?;
9329 Ok(Value::SmallInt(i16::from_le_bytes([s[0], s[1]])))
9330 }
9331 8 => {
9332 let s = self.take(16)?;
9333 let arr: [u8; 16] = s.try_into().expect("checked");
9334 let scaled = i128::from_le_bytes(arr);
9335 let scale = self.read_u8()?;
9336 Ok(Value::Numeric { scaled, scale })
9337 }
9338 9 => Ok(Value::Date(self.read_i32()?)),
9339 10 => Ok(Value::Timestamp(self.read_i64()?)),
9340 // v6.0.1: tag 11 — Sq8Vector. Pre-v6 readers fall
9341 // through to the catch-all and surface
9342 // `Corrupt("unknown value tag")`, matching the
9343 // forward-compat fence on the column-type side.
9344 11 => {
9345 let dim = self.read_u32()? as usize;
9346 let min = self.read_f32()?;
9347 let max = self.read_f32()?;
9348 let bytes = self.take(dim)?.to_vec();
9349 Ok(Value::Sq8Vector(quantize::Sq8Vector { min, max, bytes }))
9350 }
9351 // v6.0.3: tag 12 — HalfVector. Same forward-compat
9352 // fence story as tag 11.
9353 12 => {
9354 let dim = self.read_u32()? as usize;
9355 let bytes = self.take(dim * 2)?.to_vec();
9356 Ok(Value::HalfVector(halfvec::HalfVector { bytes }))
9357 }
9358 // v7.10.4: tag 14 — BYTEA. [u16 len][bytes].
9359 14 => {
9360 let len = self.read_u16()? as usize;
9361 let bytes = self.take(len)?.to_vec();
9362 Ok(Value::Bytes(bytes))
9363 }
9364 // v7.10.9: tag 15 — TEXT[]. [u16 count][per elem: u8
9365 // null + (when non-null) u16 len + utf-8 bytes].
9366 15 => {
9367 let count = self.read_u16()? as usize;
9368 let mut items: Vec<Option<String>> = Vec::with_capacity(count);
9369 for _ in 0..count {
9370 match self.read_u8()? {
9371 0 => items.push(Some(self.read_str()?)),
9372 1 => items.push(None),
9373 other => {
9374 return Err(StorageError::Corrupt(format!(
9375 "TEXT[] null flag in value tag: unknown byte {other}"
9376 )));
9377 }
9378 }
9379 }
9380 Ok(Value::TextArray(items))
9381 }
9382 // v7.11.12: tags 16/17 — INT[] / BIGINT[].
9383 16 => {
9384 let count = self.read_u16()? as usize;
9385 let mut items: Vec<Option<i32>> = Vec::with_capacity(count);
9386 for _ in 0..count {
9387 match self.read_u8()? {
9388 0 => items.push(Some(self.read_i32()?)),
9389 1 => items.push(None),
9390 other => {
9391 return Err(StorageError::Corrupt(format!(
9392 "INT[] null flag in value tag: unknown byte {other}"
9393 )));
9394 }
9395 }
9396 }
9397 Ok(Value::IntArray(items))
9398 }
9399 17 => {
9400 let count = self.read_u16()? as usize;
9401 let mut items: Vec<Option<i64>> = Vec::with_capacity(count);
9402 for _ in 0..count {
9403 match self.read_u8()? {
9404 0 => items.push(Some(self.read_i64()?)),
9405 1 => items.push(None),
9406 other => {
9407 return Err(StorageError::Corrupt(format!(
9408 "BIGINT[] null flag in value tag: unknown byte {other}"
9409 )));
9410 }
9411 }
9412 }
9413 Ok(Value::BigIntArray(items))
9414 }
9415 // v7.12.0: tag 18 — tsvector. Body matches the dense
9416 // form (`read_tsvector_body`).
9417 18 => Ok(Value::TsVector(self.read_tsvector_body()?)),
9418 // v7.12.0: tag 19 — tsquery.
9419 19 => Ok(Value::TsQuery(self.read_tsquery_body()?)),
9420 // v7.17.0: tag 20 — UUID. Raw 16 bytes.
9421 20 => {
9422 let s = self.take(16)?;
9423 let mut b = [0u8; 16];
9424 b.copy_from_slice(s);
9425 Ok(Value::Uuid(b))
9426 }
9427 // v7.17.0 Phase 3.P0-32: tag 21 — TIME. i64 LE.
9428 21 => Ok(Value::Time(self.read_i64()?)),
9429 // v7.17.0 Phase 3.P0-33: tag 22 — YEAR. u16 LE.
9430 22 => Ok(Value::Year(self.read_u16()?)),
9431 // v7.17.0 Phase 3.P0-34: tag 23 — TIMETZ. i64 LE us +
9432 // i32 LE offset_secs.
9433 23 => {
9434 let us = self.read_i64()?;
9435 let offset_secs = self.read_i32()?;
9436 Ok(Value::TimeTz { us, offset_secs })
9437 }
9438 // v7.17.0 Phase 3.P0-35: tag 24 — MONEY. i64 LE cents.
9439 24 => Ok(Value::Money(self.read_i64()?)),
9440 // v7.17.0 Phase 3.P0-39: tag 26 — Hstore. Body shape
9441 // == read_hstore_body.
9442 26 => Ok(Value::Hstore(self.read_hstore_body()?)),
9443 // v7.17.0 Phase 3.P0-40: tag 27/28/29 — 2D arrays.
9444 27 => Ok(Value::IntArray2D(self.read_int_2d_body()?)),
9445 28 => Ok(Value::BigIntArray2D(self.read_bigint_2d_body()?)),
9446 29 => Ok(Value::TextArray2D(self.read_text_2d_body()?)),
9447 // v7.17.0 Phase 3.P0-38: tag 25 — Range.
9448 // [u8 RangeKind tag][u8 flags][opt lower][opt upper].
9449 25 => {
9450 let kt = self.read_u8()?;
9451 let kind = RangeKind::from_tag(kt)
9452 .ok_or_else(|| StorageError::Corrupt(format!("unknown RangeKind tag: {kt}")))?;
9453 let flags = self.read_u8()?;
9454 let empty = flags & 0b0000_0001 != 0;
9455 let has_lower = flags & 0b0000_0010 != 0;
9456 let has_upper = flags & 0b0000_0100 != 0;
9457 let lower_inc = flags & 0b0000_1000 != 0;
9458 let upper_inc = flags & 0b0001_0000 != 0;
9459 let lower = if has_lower {
9460 Some(alloc::boxed::Box::new(self.read_value()?))
9461 } else {
9462 None
9463 };
9464 let upper = if has_upper {
9465 Some(alloc::boxed::Box::new(self.read_value()?))
9466 } else {
9467 None
9468 };
9469 Ok(Value::Range {
9470 kind,
9471 lower,
9472 upper,
9473 lower_inc,
9474 upper_inc,
9475 empty,
9476 })
9477 }
9478 other => Err(StorageError::Corrupt(format!("unknown value tag: {other}"))),
9479 }
9480 }
9481
9482 /// Read an NSW graph that was emitted via `write_nsw_graph`. `m`
9483 /// is passed in because it was already consumed from the per-
9484 /// index header. Returns the reconstituted `NswGraph`.
9485 fn read_nsw_graph(&mut self, m: usize) -> Result<NswGraph, StorageError> {
9486 let m_max_0 = self.read_u16()? as usize;
9487 let entry_raw = self.read_u32()?;
9488 let entry = if entry_raw == u32::MAX {
9489 None
9490 } else {
9491 Some(entry_raw as usize)
9492 };
9493 let entry_level = self.read_u8()?;
9494 let node_count = self.read_u32()? as usize;
9495 // v5.5.0: levels/per-layer are PV-backed in memory, but the wire
9496 // format is unchanged — decode element-by-element into a PV via
9497 // push_mut (transient in-place, no per-element path-copy here since
9498 // the freshly-built PV is uniquely owned).
9499 let mut levels: PersistentVec<u8> = PersistentVec::new();
9500 for _ in 0..node_count {
9501 levels.push_mut(self.read_u8()?);
9502 }
9503 let layer_count = self.read_u8()? as usize;
9504 let mut layers: Vec<PersistentVec<Vec<u32>>> = Vec::with_capacity(layer_count);
9505 for _ in 0..layer_count {
9506 let n = self.read_u32()? as usize;
9507 let mut per_layer: PersistentVec<Vec<u32>> = PersistentVec::new();
9508 for _ in 0..n {
9509 let cnt = self.read_u16()? as usize;
9510 let mut row: Vec<u32> = Vec::with_capacity(cnt);
9511 for _ in 0..cnt {
9512 row.push(self.read_u32()?);
9513 }
9514 per_layer.push_mut(row);
9515 }
9516 layers.push(per_layer);
9517 }
9518 Ok(NswGraph {
9519 m,
9520 m_max_0,
9521 entry,
9522 entry_level,
9523 levels,
9524 layers,
9525 })
9526 }
9527}
9528
9529#[cfg(test)]
9530mod tests {
9531 use super::*;
9532 use alloc::string::ToString;
9533 use alloc::vec;
9534
9535 /// v7.23 (mailrs round-14) — the escaped short-string codec.
9536 /// Boundary cases: 0xFFFE stays plain-u16, 0xFFFF and above take
9537 /// the escape form, round-trips are exact at 1 MiB.
9538 #[test]
9539 fn escaped_string_codec_round_trips_large_text() {
9540 for len in [0usize, 1, 65_534, 65_535, 65_536, 1_048_576] {
9541 let s: String = "x".repeat(len);
9542 let mut buf = Vec::new();
9543 write_str(&mut buf, &s);
9544 let expected_header = if len >= STR_LEN_ESCAPE as usize { 6 } else { 2 };
9545 assert_eq!(buf.len(), expected_header + len, "header width for {len}");
9546 let mut cur = Cursor::new(&buf).with_long_strings(true);
9547 assert_eq!(cur.read_str().unwrap().len(), len, "round-trip {len}");
9548 }
9549 }
9550
9551 /// Pre-v46 containers may carry a PLAIN length of exactly 0xFFFF
9552 /// — the decoder must not treat it as an escape there.
9553 #[test]
9554 fn plain_u16_len_ffff_decodes_under_old_rules() {
9555 let s = "y".repeat(65_535);
9556 let mut buf = Vec::new();
9557 // Hand-encode the OLD form: plain u16 length.
9558 write_u16(&mut buf, 65_535);
9559 buf.extend_from_slice(s.as_bytes());
9560 let mut old = Cursor::new(&buf); // long_strings = false
9561 assert_eq!(old.read_str().unwrap(), s);
9562 }
9563
9564 /// End-to-end: a catalog holding a 1 MiB TEXT row snapshots and
9565 /// reloads — the exact shape that panicked at 7.22's graceful
9566 /// close ("identifier / text fits in u16").
9567 #[test]
9568 fn snapshot_round_trips_megabyte_text_row() {
9569 let mut cat = Catalog::new();
9570 cat.create_table(TableSchema::new(
9571 "mail",
9572 vec![
9573 ColumnSchema::new("id", DataType::BigInt, false),
9574 ColumnSchema::new("body", DataType::Text, false),
9575 ],
9576 ))
9577 .unwrap();
9578 let body = "m".repeat(1_048_576);
9579 cat.get_mut("mail")
9580 .unwrap()
9581 .insert(Row::new(vec![Value::BigInt(1), Value::Text(body.clone())]))
9582 .unwrap();
9583 let bytes = cat.serialize();
9584 let re = Catalog::deserialize(&bytes).unwrap();
9585 let t = re.get("mail").unwrap();
9586 match &t.rows.get(0).unwrap().values[1] {
9587 Value::Text(s) => assert_eq!(s.len(), body.len()),
9588 other => panic!("expected Text, got {other:?}"),
9589 }
9590 }
9591
9592 /// Cold tier: a segment holding a > 64 KiB TEXT row encodes (V3
9593 /// magic) and looks up; a hand-built V1 segment with a legal
9594 /// 0xFFFF-length text still decodes under old rules.
9595 #[test]
9596 fn segment_v3_round_trips_large_text_rows() {
9597 let schema = TableSchema::new(
9598 "mail",
9599 vec![
9600 ColumnSchema::new("id", DataType::BigInt, false),
9601 ColumnSchema::new("body", DataType::Text, false),
9602 ],
9603 );
9604 let big = "b".repeat(200_000);
9605 let rows: Vec<(u64, Vec<u8>)> = (0u64..3)
9606 .map(|i| {
9607 let row = Row::new(vec![
9608 Value::BigInt(i.cast_signed()),
9609 Value::Text(big.clone()),
9610 ]);
9611 (i, encode_row_body_dense(&row, &schema))
9612 })
9613 .collect();
9614 let (bytes, _) = encode_segment(rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
9615 assert_eq!(&bytes[..8], b"SPGSEG\x03\x00", "new segments are V3");
9616 let seg = OwnedSegment::from_bytes(bytes).unwrap();
9617 assert!(seg.long_strings());
9618 let payload = seg.lookup(1).expect("pk 1 present");
9619 let (row, _) = decode_row_body_dense(&payload, &schema, seg.long_strings()).unwrap();
9620 match &row.values[1] {
9621 Value::Text(s) => assert_eq!(s.len(), big.len()),
9622 other => panic!("expected Text, got {other:?}"),
9623 }
9624 }
9625
9626 /// Index keys derive from TEXT columns — a > 64 KiB key must
9627 /// round-trip through the v9 tagged index-key codec too.
9628 #[test]
9629 fn index_key_round_trips_large_text() {
9630 let key = IndexKey::Text("k".repeat(100_000));
9631 let mut buf = Vec::new();
9632 write_index_key(&mut buf, &key);
9633 let mut cur = Cursor::new(&buf).with_long_strings(true);
9634 let back = cur.read_index_key().unwrap();
9635 assert_eq!(back, key);
9636 }
9637
9638 #[cfg(target_arch = "aarch64")]
9639 #[test]
9640 fn neon_l2_matches_scalar() {
9641 // For every dim that's a multiple of 4 (4, 8, 12, 16, 64,
9642 // 128, 256, 384, 512, 768, 1024, 1536), the NEON impl must
9643 // agree with the scalar reference within tight float
9644 // tolerance (FMA rounding differs from separate * + +).
9645 let dims = [4usize, 8, 12, 16, 64, 128, 256, 384, 512, 768, 1024, 1536];
9646 for &d in &dims {
9647 let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
9648 let mut a = Vec::with_capacity(d);
9649 let mut b = Vec::with_capacity(d);
9650 for _ in 0..d {
9651 state = state
9652 .wrapping_mul(6_364_136_223_846_793_005)
9653 .wrapping_add(1);
9654 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9655 let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9656 state = state
9657 .wrapping_mul(6_364_136_223_846_793_005)
9658 .wrapping_add(1);
9659 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9660 let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9661 a.push(x);
9662 b.push(y);
9663 }
9664 let scalar = l2_distance_sq_scalar(&a, &b);
9665 let neon = unsafe { l2_distance_sq_neon(&a, &b) };
9666 let tol = (scalar.abs().max(1e-6)) * 1e-4;
9667 assert!(
9668 (scalar - neon).abs() <= tol,
9669 "dim={d}: scalar={scalar} neon={neon} diff={}",
9670 (scalar - neon).abs()
9671 );
9672 }
9673 }
9674
9675 #[cfg(target_arch = "aarch64")]
9676 #[test]
9677 fn neon_inner_product_matches_scalar() {
9678 // v6.0.2 step 1: NEON IP must agree with scalar across every
9679 // production-shaped dim. FMA rounding differs from
9680 // separate * + +, so the tolerance scales with magnitude.
9681 let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
9682 for &d in &dims {
9683 let mut state: u64 = (d as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
9684 let mut a = Vec::with_capacity(d);
9685 let mut b = Vec::with_capacity(d);
9686 for _ in 0..d {
9687 state = state
9688 .wrapping_mul(6_364_136_223_846_793_005)
9689 .wrapping_add(1);
9690 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9691 let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9692 state = state
9693 .wrapping_mul(6_364_136_223_846_793_005)
9694 .wrapping_add(1);
9695 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9696 let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9697 a.push(x);
9698 b.push(y);
9699 }
9700 let scalar = inner_product_scalar(&a, &b);
9701 let neon = unsafe { inner_product_neon(&a, &b) };
9702 #[allow(clippy::cast_precision_loss)]
9703 let tol = (scalar.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
9704 assert!(
9705 (scalar - neon).abs() <= tol,
9706 "IP dim={d}: scalar={scalar} neon={neon} diff={}",
9707 (scalar - neon).abs()
9708 );
9709 }
9710 }
9711
9712 #[cfg(target_arch = "aarch64")]
9713 #[allow(clippy::similar_names)]
9714 #[test]
9715 fn neon_cosine_dot_norms_matches_scalar() {
9716 let dims = [4usize, 8, 12, 16, 64, 128, 256, 512, 1024];
9717 for &d in &dims {
9718 let mut state: u64 = (d as u64).wrapping_mul(0xBF58_476D_1CE4_E5B9);
9719 let mut a = Vec::with_capacity(d);
9720 let mut b = Vec::with_capacity(d);
9721 for _ in 0..d {
9722 state = state
9723 .wrapping_mul(6_364_136_223_846_793_005)
9724 .wrapping_add(1);
9725 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9726 let x = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9727 state = state
9728 .wrapping_mul(6_364_136_223_846_793_005)
9729 .wrapping_add(1);
9730 #[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
9731 let y = (((state >> 32) & 0x00FF_FFFF) as f32) / (0x80_0000_u32 as f32) - 1.0;
9732 a.push(x);
9733 b.push(y);
9734 }
9735 let (dot_s, na_s, nb_s) = cosine_dot_norms_scalar(&a, &b);
9736 let (dot_n, na_n, nb_n) = unsafe { cosine_dot_norms_neon(&a, &b) };
9737 #[allow(clippy::cast_precision_loss)]
9738 let tol_d = (dot_s.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
9739 #[allow(clippy::cast_precision_loss)]
9740 let tol_n = (na_s.abs().max(1e-6)) * 1e-4 + (d as f32) * 1e-6;
9741 assert!(
9742 (dot_s - dot_n).abs() <= tol_d,
9743 "cosine dot dim={d}: scalar={dot_s} neon={dot_n}"
9744 );
9745 assert!(
9746 (na_s - na_n).abs() <= tol_n,
9747 "cosine na dim={d}: scalar={na_s} neon={na_n}"
9748 );
9749 assert!(
9750 (nb_s - nb_n).abs() <= tol_n,
9751 "cosine nb dim={d}: scalar={nb_s} neon={nb_n}"
9752 );
9753 }
9754 }
9755
9756 fn make_users_schema() -> TableSchema {
9757 TableSchema::new(
9758 "users",
9759 vec![
9760 ColumnSchema::new("id", DataType::Int, false),
9761 ColumnSchema::new("name", DataType::Text, false),
9762 ColumnSchema::new("score", DataType::Float, true),
9763 ],
9764 )
9765 }
9766
9767 #[test]
9768 fn value_type_tag_matches_variant() {
9769 assert_eq!(Value::Int(1).data_type(), Some(DataType::Int));
9770 assert_eq!(Value::BigInt(1).data_type(), Some(DataType::BigInt));
9771 assert_eq!(Value::Float(1.0).data_type(), Some(DataType::Float));
9772 assert_eq!(Value::Text("x".into()).data_type(), Some(DataType::Text));
9773 assert_eq!(Value::Bool(true).data_type(), Some(DataType::Bool));
9774 assert_eq!(Value::Null.data_type(), None);
9775 assert!(Value::Null.is_null());
9776 assert!(!Value::Int(0).is_null());
9777 }
9778
9779 #[test]
9780 fn sq8_value_reports_sq8_data_type() {
9781 // v6.0.1: a `Value::Sq8Vector` cell surfaces its dim
9782 // (= bytes.len()) and encoding through `data_type()` so
9783 // INSERT-time column type-checks (step 3) can route on
9784 // both shape and encoding.
9785 let q = crate::quantize::quantize(&[0.0, 0.25, 0.5, 0.75, 1.0]);
9786 let v = Value::Sq8Vector(q);
9787 assert_eq!(
9788 v.data_type(),
9789 Some(DataType::Vector {
9790 dim: 5,
9791 encoding: VecEncoding::Sq8,
9792 }),
9793 );
9794 }
9795
9796 #[test]
9797 fn datatype_display_matches_pg_keyword() {
9798 assert_eq!(DataType::Int.to_string(), "INT");
9799 assert_eq!(DataType::BigInt.to_string(), "BIGINT");
9800 assert_eq!(DataType::Float.to_string(), "FLOAT");
9801 assert_eq!(DataType::Text.to_string(), "TEXT");
9802 assert_eq!(DataType::Bool.to_string(), "BOOL");
9803 }
9804
9805 #[test]
9806 fn row_len_and_emptiness() {
9807 let r = Row::new(vec![Value::Int(1), Value::Null]);
9808 assert_eq!(r.len(), 2);
9809 assert!(!r.is_empty());
9810 assert!(Row::new(Vec::new()).is_empty());
9811 }
9812
9813 #[test]
9814 fn table_schema_column_position() {
9815 let s = make_users_schema();
9816 assert_eq!(s.column_position("id"), Some(0));
9817 assert_eq!(s.column_position("score"), Some(2));
9818 assert_eq!(s.column_position("missing"), None);
9819 }
9820
9821 #[test]
9822 fn catalog_create_table_then_lookup() {
9823 let mut cat = Catalog::new();
9824 cat.create_table(make_users_schema()).unwrap();
9825 assert_eq!(cat.table_count(), 1);
9826 assert!(cat.get("users").is_some());
9827 assert!(cat.get("nope").is_none());
9828 }
9829
9830 #[test]
9831 fn catalog_duplicate_table_is_rejected() {
9832 let mut cat = Catalog::new();
9833 cat.create_table(make_users_schema()).unwrap();
9834 let err = cat.create_table(make_users_schema()).unwrap_err();
9835 assert!(matches!(err, StorageError::DuplicateTable { ref name } if name == "users"));
9836 }
9837
9838 #[test]
9839 fn table_insert_happy_path_appends_row() {
9840 let mut cat = Catalog::new();
9841 cat.create_table(make_users_schema()).unwrap();
9842 let t = cat.get_mut("users").unwrap();
9843 t.insert(Row::new(vec![
9844 Value::Int(1),
9845 Value::Text("alice".into()),
9846 Value::Float(99.5),
9847 ]))
9848 .unwrap();
9849 assert_eq!(t.row_count(), 1);
9850 assert_eq!(t.rows()[0].values[1], Value::Text("alice".into()));
9851 }
9852
9853 #[test]
9854 fn table_insert_arity_mismatch() {
9855 let mut cat = Catalog::new();
9856 cat.create_table(make_users_schema()).unwrap();
9857 let t = cat.get_mut("users").unwrap();
9858 let err = t.insert(Row::new(vec![Value::Int(1)])).unwrap_err();
9859 assert!(matches!(
9860 err,
9861 StorageError::ArityMismatch {
9862 expected: 3,
9863 actual: 1
9864 }
9865 ));
9866 assert_eq!(t.row_count(), 0);
9867 }
9868
9869 #[test]
9870 fn table_insert_type_mismatch_reports_column() {
9871 let mut cat = Catalog::new();
9872 cat.create_table(make_users_schema()).unwrap();
9873 let t = cat.get_mut("users").unwrap();
9874 let err = t
9875 .insert(Row::new(vec![
9876 Value::Int(1),
9877 Value::Int(42), // name expects Text
9878 Value::Float(0.0),
9879 ]))
9880 .unwrap_err();
9881 match err {
9882 StorageError::TypeMismatch {
9883 ref column,
9884 expected,
9885 actual,
9886 position,
9887 } => {
9888 assert_eq!(column, "name");
9889 assert_eq!(expected, DataType::Text);
9890 assert_eq!(actual, DataType::Int);
9891 assert_eq!(position, 1);
9892 }
9893 other => panic!("unexpected: {other:?}"),
9894 }
9895 assert_eq!(t.row_count(), 0);
9896 }
9897
9898 #[test]
9899 fn table_insert_null_into_not_null_rejected() {
9900 let mut cat = Catalog::new();
9901 cat.create_table(make_users_schema()).unwrap();
9902 let t = cat.get_mut("users").unwrap();
9903 let err = t
9904 .insert(Row::new(vec![
9905 Value::Int(1),
9906 Value::Null, // name is NOT NULL
9907 Value::Float(1.0),
9908 ]))
9909 .unwrap_err();
9910 assert!(matches!(err, StorageError::NullInNotNull { ref column } if column == "name"));
9911 }
9912
9913 #[test]
9914 fn table_insert_null_into_nullable_ok() {
9915 let mut cat = Catalog::new();
9916 cat.create_table(make_users_schema()).unwrap();
9917 let t = cat.get_mut("users").unwrap();
9918 t.insert(Row::new(vec![
9919 Value::Int(1),
9920 Value::Text("bob".into()),
9921 Value::Null,
9922 ]))
9923 .unwrap();
9924 assert_eq!(t.row_count(), 1);
9925 }
9926
9927 #[test]
9928 fn catalog_get_mut_independent_per_table() {
9929 let mut cat = Catalog::new();
9930 cat.create_table(TableSchema::new(
9931 "a",
9932 vec![ColumnSchema::new("v", DataType::Int, false)],
9933 ))
9934 .unwrap();
9935 cat.create_table(TableSchema::new(
9936 "b",
9937 vec![ColumnSchema::new("v", DataType::Int, false)],
9938 ))
9939 .unwrap();
9940 cat.get_mut("a")
9941 .unwrap()
9942 .insert(Row::new(vec![Value::Int(1)]))
9943 .unwrap();
9944 assert_eq!(cat.get("a").unwrap().row_count(), 1);
9945 assert_eq!(cat.get("b").unwrap().row_count(), 0);
9946 }
9947
9948 // --- v0.6 persistence round-trips --------------------------------------
9949
9950 fn assert_round_trip(cat: &Catalog) {
9951 let bytes = cat.serialize();
9952 let restored = Catalog::deserialize(&bytes).expect("deserialize");
9953 // Compare semantic state: same tables in same order, same schema +
9954 // rows in each.
9955 assert_eq!(restored.table_count(), cat.table_count());
9956 for (a, b) in cat.tables.iter().zip(restored.tables.iter()) {
9957 assert_eq!(a.schema, b.schema);
9958 assert_eq!(a.rows, b.rows);
9959 }
9960 }
9961
9962 #[test]
9963 fn serialize_empty_catalog_round_trips() {
9964 assert_round_trip(&Catalog::new());
9965 }
9966
9967 #[test]
9968 fn serialize_single_empty_table_round_trips() {
9969 let mut cat = Catalog::new();
9970 cat.create_table(make_users_schema()).unwrap();
9971 assert_round_trip(&cat);
9972 }
9973
9974 #[test]
9975 fn nsw_clone_is_o1() {
9976 // v5.5.0: NswGraph::clone must be O(1) structural sharing, not the
9977 // pre-v5.5 O(N) element copy — it rides on Catalog::clone for every
9978 // group-commit write on a vector table. Build a non-trivial multi-
9979 // layer graph, clone it, and prove the clone shares the very same PV
9980 // storage (root+tail Arc) for `levels` and every `layers[l]`. Sharing
9981 // ⇒ no per-node element copy ⇒ clone cost independent of N (node
9982 // count); only the outer layer Vec (len ≤ 8) is copied, O(1) in
9983 // practice.
9984 let mut cat = Catalog::new();
9985 cat.create_table(TableSchema::new(
9986 "docs",
9987 alloc::vec![
9988 ColumnSchema::new("id", DataType::Int, false),
9989 ColumnSchema::new(
9990 "v",
9991 DataType::Vector {
9992 dim: 3,
9993 encoding: VecEncoding::F32
9994 },
9995 true
9996 ),
9997 ],
9998 ))
9999 .unwrap();
10000 let t = cat.get_mut("docs").unwrap();
10001 for i in 0..1500_i32 {
10002 #[allow(clippy::cast_precision_loss)] // 0..1500 — no precision lost
10003 let base = (i as f32) * 0.01;
10004 t.insert(Row::new(alloc::vec![
10005 Value::Int(i),
10006 Value::Vector(alloc::vec![base, base + 0.05, base + 0.1]),
10007 ]))
10008 .unwrap();
10009 }
10010 t.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
10011 .unwrap();
10012 let g = match &cat.get("docs").unwrap().indices()[0].kind {
10013 IndexKind::Nsw(g) => g,
10014 IndexKind::BTree(_)
10015 | IndexKind::Brin { .. }
10016 | IndexKind::Gin(_)
10017 | IndexKind::GinTrgm(_)
10018 | IndexKind::GinFulltext(_) => {
10019 panic!("expected NSW")
10020 }
10021 };
10022 // Non-trivial graph: one level slot per row, and the geometric level
10023 // distribution puts some nodes above layer 0.
10024 assert_eq!(g.levels.len(), 1500, "one level slot per inserted row");
10025 assert!(
10026 g.layers.len() >= 2,
10027 "1500 nodes should populate at least two HNSW layers, got {}",
10028 g.layers.len()
10029 );
10030
10031 let cloned = g.clone();
10032
10033 assert!(
10034 g.levels.shares_storage_with(&cloned.levels),
10035 "levels PV not shared after clone — clone copied elements (O(N))"
10036 );
10037 assert_eq!(g.layers.len(), cloned.layers.len());
10038 for (l, (orig, cl)) in g.layers.iter().zip(cloned.layers.iter()).enumerate() {
10039 assert!(
10040 orig.shares_storage_with(cl),
10041 "layer {l} PV not shared after clone — clone copied elements (O(N))"
10042 );
10043 }
10044 }
10045
10046 #[test]
10047 fn sq8_catalog_serialise_roundtrip_preserves_cells_and_index() {
10048 // v6.0.1 step 6 verify: a catalog with an `VECTOR(N)
10049 // USING SQ8` column + NSW index survives a full
10050 // serialise → deserialise cycle. Cells re-decode bit-
10051 // identically (per-vector affine triple), the NSW
10052 // topology stays intact, and kNN search still routes
10053 // through the SQ8 ADC dispatcher after the catalog hop.
10054 let mut cat = Catalog::new();
10055 cat.create_table(TableSchema::new(
10056 "vecs",
10057 alloc::vec![
10058 ColumnSchema::new("id", DataType::Int, false),
10059 ColumnSchema::new(
10060 "v",
10061 DataType::Vector {
10062 dim: 8,
10063 encoding: VecEncoding::Sq8,
10064 },
10065 false,
10066 ),
10067 ],
10068 ))
10069 .unwrap();
10070 let t = cat.get_mut("vecs").unwrap();
10071 for i in 0..32_i32 {
10072 #[allow(clippy::cast_precision_loss)]
10073 let base = (i as f32) * 0.03;
10074 let v: Vec<f32> = (0..8_i32)
10075 .map(|j| {
10076 #[allow(clippy::cast_precision_loss)]
10077 let off = (j as f32) * 0.01;
10078 base + off
10079 })
10080 .collect();
10081 t.insert(Row::new(alloc::vec![
10082 Value::Int(i),
10083 Value::Sq8Vector(quantize::quantize(&v)),
10084 ]))
10085 .unwrap();
10086 }
10087 t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
10088 // Capture a pre-serialise reference cell + nsw hits to
10089 // compare against the restored catalog.
10090 let query = alloc::vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
10091 let (before_cell, before_ty, before_hits) = {
10092 let t_ref = cat.get("vecs").unwrap();
10093 (
10094 t_ref.rows()[5].values[1].clone(),
10095 t_ref.schema().columns[1].ty,
10096 nsw_query(t_ref, "v_idx", &query, 5, NswMetric::L2),
10097 )
10098 };
10099
10100 let bytes = cat.serialize();
10101 let restored = Catalog::deserialize(&bytes).expect("deserialize ok");
10102 let rt = restored.get("vecs").unwrap();
10103 assert_eq!(rt.schema().columns[1].ty, before_ty);
10104 assert_eq!(rt.rows()[5].values[1], before_cell);
10105 let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
10106 assert_eq!(before_hits, after_hits);
10107 }
10108
10109 #[test]
10110 fn half_catalog_serialise_roundtrip_preserves_cells_and_index() {
10111 // v6.0.3 step 4 verify: a catalog with a `VECTOR(N) USING
10112 // HALF` column + NSW index survives a full serialise →
10113 // deserialise cycle. Cells re-decode bit-identically (raw
10114 // u16 LE bytes), the NSW topology stays intact, and kNN
10115 // search still returns the same hit IDs against the
10116 // restored catalog.
10117 use crate::halfvec;
10118 let mut cat = Catalog::new();
10119 cat.create_table(TableSchema::new(
10120 "vecs",
10121 alloc::vec![
10122 ColumnSchema::new("id", DataType::Int, false),
10123 ColumnSchema::new(
10124 "v",
10125 DataType::Vector {
10126 dim: 8,
10127 encoding: VecEncoding::F16,
10128 },
10129 false,
10130 ),
10131 ],
10132 ))
10133 .unwrap();
10134 let t = cat.get_mut("vecs").unwrap();
10135 for i in 0..32_i32 {
10136 #[allow(clippy::cast_precision_loss)]
10137 let base = (i as f32) * 0.03;
10138 let v: Vec<f32> = (0..8_i32)
10139 .map(|j| {
10140 #[allow(clippy::cast_precision_loss)]
10141 let off = (j as f32) * 0.01;
10142 base + off
10143 })
10144 .collect();
10145 t.insert(Row::new(alloc::vec![
10146 Value::Int(i),
10147 Value::HalfVector(halfvec::HalfVector::from_f32_slice(&v)),
10148 ]))
10149 .unwrap();
10150 }
10151 t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
10152 let query = alloc::vec![0.15_f32, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22];
10153 let (before_cell, before_ty, before_hits) = {
10154 let t_ref = cat.get("vecs").unwrap();
10155 (
10156 t_ref.rows()[5].values[1].clone(),
10157 t_ref.schema().columns[1].ty,
10158 nsw_query(t_ref, "v_idx", &query, 5, NswMetric::L2),
10159 )
10160 };
10161 let bytes = cat.serialize();
10162 let restored = Catalog::deserialize(&bytes).expect("deserialize ok");
10163 let rt = restored.get("vecs").unwrap();
10164 assert_eq!(rt.schema().columns[1].ty, before_ty);
10165 assert_eq!(rt.rows()[5].values[1], before_cell);
10166 let after_hits = nsw_query(rt, "v_idx", &query, 5, NswMetric::L2);
10167 assert_eq!(before_hits, after_hits);
10168 }
10169
10170 #[test]
10171 #[allow(clippy::similar_names)]
10172 fn hnsw_half_recall_at_10_matches_f32_groundtruth() {
10173 // v6.0.3 step 3 verify: HALF column NSW retrieves ≥ 95%
10174 // top-10 overlap vs brute-force F32 ground truth.
10175 // Half-precision dequantises bit-exactly at the storage
10176 // layer (no rerank pass), so the recall floor is tighter
10177 // than the SQ8 case — only the rounding noise from f32 →
10178 // f16 quantisation contributes.
10179 use crate::halfvec;
10180 fn next(state: &mut u64) -> f32 {
10181 *state = state
10182 .wrapping_add(0x9E37_79B9_7F4A_7C15)
10183 .wrapping_mul(0xBF58_476D_1CE4_E5B9);
10184 #[allow(clippy::cast_precision_loss)]
10185 let u = ((*state >> 32) as u32 as f32) / (u32::MAX as f32);
10186 2.0 * u - 1.0
10187 }
10188 let dim: u32 = 32;
10189 let n: usize = 512;
10190 let dim_us = dim as usize;
10191 let mut seed: u64 = 0xF16_F16_F16_F16_u64;
10192 let corpus: Vec<Vec<f32>> = (0..n)
10193 .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
10194 .collect();
10195 let queries: Vec<Vec<f32>> = (0..32)
10196 .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
10197 .collect();
10198 let exact_top10: Vec<Vec<usize>> = queries
10199 .iter()
10200 .map(|q| {
10201 let mut scored: Vec<(f32, usize)> = corpus
10202 .iter()
10203 .enumerate()
10204 .map(|(i, v)| (l2_distance_sq(v, q), i))
10205 .collect();
10206 scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
10207 scored.into_iter().take(10).map(|(_, i)| i).collect()
10208 })
10209 .collect();
10210 let mut cat = Catalog::new();
10211 cat.create_table(TableSchema::new(
10212 "vecs",
10213 alloc::vec![
10214 ColumnSchema::new("id", DataType::Int, false),
10215 ColumnSchema::new(
10216 "v",
10217 DataType::Vector {
10218 dim,
10219 encoding: VecEncoding::F16,
10220 },
10221 false,
10222 ),
10223 ],
10224 ))
10225 .unwrap();
10226 let t = cat.get_mut("vecs").unwrap();
10227 for (i, v) in corpus.iter().enumerate() {
10228 t.insert(Row::new(alloc::vec![
10229 Value::Int(i32::try_from(i).unwrap()),
10230 Value::HalfVector(halfvec::HalfVector::from_f32_slice(v)),
10231 ]))
10232 .unwrap();
10233 }
10234 t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
10235 let table = cat.get("vecs").unwrap();
10236 let mut total_overlap = 0_usize;
10237 for (q, exact) in queries.iter().zip(exact_top10.iter()) {
10238 let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
10239 for h in &hits {
10240 if exact.contains(h) {
10241 total_overlap += 1;
10242 }
10243 }
10244 }
10245 #[allow(clippy::cast_precision_loss)]
10246 let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
10247 assert!(
10248 recall >= 0.95,
10249 "HALF HNSW recall@10 = {recall:.3}, below floor 0.95 — \
10250 check halfvec dispatch in `cell_to_query_metric_distance`"
10251 );
10252 }
10253
#[test]
fn hnsw_sq8_recall_at_10_above_0_95_vs_f32_groundtruth() {
    // v6.0.1 step 5 verify: store a corpus as SQ8 cells, index it,
    // and confirm the index query reaches ≥ 95% top-10 overlap with
    // a brute-force F32 ranking of the same (unquantised) corpus.
    // Only ONE catalog is built here (the SQ8 one); the F32 side is
    // a plain in-memory scan over `corpus`. Per the assertion
    // message at the bottom, hitting the floor is expected to depend
    // on `sq8_rerank` being wired into `nsw_search` for SQ8 columns.
    use crate::quantize;
    // Deterministic pseudo-random generator: a splitmix64-style
    // add-then-multiply state step (no xor-shift finaliser). The top
    // 32 bits of the state are scaled to [0, 1] and then mapped to
    // [-1, 1]. The generated vectors are used as-is — nothing in
    // this test normalises them.
    // NOTE(review): the original comment said this matches the
    // v6.0.0 fuzz harness — confirm against that harness.
    fn next(state: &mut u64) -> f32 {
        *state = state
            .wrapping_add(0x9E37_79B9_7F4A_7C15)
            .wrapping_mul(0xBF58_476D_1CE4_E5B9);
        #[allow(clippy::cast_precision_loss)]
        let u = ((*state >> 32) as u32 as f32) / (u32::MAX as f32);
        2.0 * u - 1.0
    }
    let dim: u32 = 32;
    let n: usize = 512;
    let dim_us = dim as usize;
    let mut seed: u64 = 0xCAFE_BABE_DEAD_BEEFu64;
    // Corpus is drawn first, queries second, from the same seed
    // stream — the draw order fixes the data set.
    let corpus: Vec<Vec<f32>> = (0..n)
        .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
        .collect();
    let queries: Vec<Vec<f32>> = (0..32)
        .map(|_| (0..dim_us).map(|_| next(&mut seed)).collect())
        .collect();
    // F32 ground truth — brute force: score every corpus vector
    // against each query, sort ascending by squared L2 distance and
    // keep the ten closest corpus positions.
    let exact_top10: Vec<Vec<usize>> = queries
        .iter()
        .map(|q| {
            let mut scored: Vec<(f32, usize)> = corpus
                .iter()
                .enumerate()
                .map(|(i, v)| (l2_distance_sq(v, q), i))
                .collect();
            // Incomparable pairs (NaN) are treated as equal.
            scored.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
            scored.into_iter().take(10).map(|(_, i)| i).collect()
        })
        .collect();
    // SQ8 catalog — rows are inserted as `Value::Sq8Vector` cells
    // produced by `quantize::quantize`, in corpus order, so row
    // position i corresponds to `corpus[i]`.
    let mut cat = Catalog::new();
    cat.create_table(TableSchema::new(
        "vecs",
        alloc::vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim,
                    encoding: VecEncoding::Sq8,
                },
                false,
            ),
        ],
    ))
    .unwrap();
    let t = cat.get_mut("vecs").unwrap();
    for (i, v) in corpus.iter().enumerate() {
        t.insert(Row::new(alloc::vec![
            Value::Int(i32::try_from(i).unwrap()),
            Value::Sq8Vector(quantize::quantize(v)),
        ]))
        .unwrap();
    }
    t.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();
    let table = cat.get("vecs").unwrap();
    // Count every returned hit that also appears in the exact
    // top-10 for the same query.
    let mut total_overlap = 0_usize;
    for (q, exact) in queries.iter().zip(exact_top10.iter()) {
        let hits = nsw_query(table, "v_idx", q, 10, NswMetric::L2);
        for h in &hits {
            if exact.contains(h) {
                total_overlap += 1;
            }
        }
    }
    // recall@10 = matched hits / (10 expected hits per query).
    #[allow(clippy::cast_precision_loss)]
    let recall = total_overlap as f32 / (10.0 * queries.len() as f32);
    assert!(
        recall >= 0.95,
        "SQ8 HNSW recall@10 = {recall:.3}, below floor 0.95 — \
         check `sq8_rerank` is wired in `nsw_search` for SQ8 columns"
    );
}
10342
#[test]
fn nsw_index_topology_persists_through_round_trip() {
    // Build an NSW index, snapshot the graph, push the whole catalog
    // through serialize → deserialize and check the restored graph
    // matches the original field for field. The point of v2.7 is
    // that startup skips the rebuild, so the topology has to survive
    // the disk hop.
    let schema = TableSchema::new(
        "docs",
        alloc::vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim: 3,
                    encoding: VecEncoding::F32,
                },
                true,
            ),
        ],
    );
    let mut catalog = Catalog::new();
    catalog.create_table(schema).unwrap();

    let docs = catalog.get_mut("docs").unwrap();
    for id in 0..6_i32 {
        #[allow(clippy::cast_precision_loss)] // 0..6 — no precision lost
        let origin = (id as f32) * 0.1;
        let cells = alloc::vec![
            Value::Int(id),
            Value::Vector(alloc::vec![origin, origin + 0.05, origin + 0.1]),
        ];
        docs.insert(Row::new(cells)).unwrap();
    }
    docs.add_nsw_index("docs_nsw".into(), "v", NSW_DEFAULT_M)
        .unwrap();

    // Clone the graph out of the first index of `docs`; any other
    // index kind means the fixture is broken.
    let first_nsw_graph = |source: &Catalog| match &source.get("docs").unwrap().indices()[0].kind {
        IndexKind::Nsw(graph) => graph.clone(),
        IndexKind::BTree(_)
        | IndexKind::Brin { .. }
        | IndexKind::Gin(_)
        | IndexKind::GinTrgm(_)
        | IndexKind::GinFulltext(_) => {
            panic!("expected NSW")
        }
    };

    let before = first_nsw_graph(&catalog);
    let snapshot = catalog.serialize();
    let reloaded = Catalog::deserialize(&snapshot).expect("deserialize");
    let after = first_nsw_graph(&reloaded);

    assert_eq!(after.m, before.m);
    assert_eq!(after.m_max_0, before.m_max_0);
    assert_eq!(after.entry, before.entry);
    assert_eq!(after.entry_level, before.entry_level);
    assert_eq!(after.levels, before.levels);
    assert_eq!(after.layers, before.layers);
}
10407
#[test]
fn hnsw_level_assignment_is_deterministic() {
    // Asking twice for the level of the same row index must give
    // the same answer — the topology has to be reproducible
    // (matters for the serialize round-trip).
    for row in 0..32usize {
        let first = nsw_assign_level(row);
        let second = nsw_assign_level(row);
        assert_eq!(first, second);
    }
}
10416
#[test]
fn hnsw_layer_0_dominates_population() {
    // Sanity: of the first 200 row indices, the clear majority must
    // be assigned level 0. The earlier note on this test describes a
    // 4-bit-clear promotion rule (roughly 1/16 promoted to layer
    // ≥ 1), which for 200 indices would mean about a dozen
    // promotions; the floor of 150 leaves generous slack.
    let mut on_zero = 0_usize;
    for row in 0..200usize {
        if nsw_assign_level(row) == 0 {
            on_zero += 1;
        }
    }
    assert!(on_zero > 150, "level-0 nodes too few: {on_zero}");
}
10426
#[test]
fn hnsw_search_matches_brute_force_for_l2_top1() {
    // Index ten well-separated 3-d points, then for a handful of
    // probe vectors check that the best hit from the graph search
    // is the same row a linear squared-L2 scan picks. Topology
    // variability shouldn't break recall at k=1 on data like this.
    let schema = TableSchema::new(
        "vecs",
        alloc::vec![
            ColumnSchema::new("id", DataType::Int, false),
            ColumnSchema::new(
                "v",
                DataType::Vector {
                    dim: 3,
                    encoding: VecEncoding::F32,
                },
                true,
            ),
        ],
    );
    let mut catalog = Catalog::new();
    catalog.create_table(schema).unwrap();

    let points: [(i32, [f32; 3]); 10] = [
        (1, [0.0, 0.0, 0.0]),
        (2, [1.0, 0.0, 0.0]),
        (3, [0.0, 1.0, 0.0]),
        (4, [0.0, 0.0, 1.0]),
        (5, [1.0, 1.0, 0.0]),
        (6, [1.0, 0.0, 1.0]),
        (7, [0.0, 1.0, 1.0]),
        (8, [1.0, 1.0, 1.0]),
        (9, [0.5, 0.5, 0.5]),
        (10, [0.2, 0.8, 0.5]),
    ];
    let vecs = catalog.get_mut("vecs").unwrap();
    for (id, coords) in points {
        let cells = alloc::vec![Value::Int(id), Value::Vector(coords.to_vec())];
        vecs.insert(Row::new(cells)).unwrap();
    }
    vecs.add_nsw_index("v_idx".into(), "v", NSW_DEFAULT_M).unwrap();

    let table = catalog.get("vecs").unwrap();
    let idx_pos = table
        .indices()
        .iter()
        .position(|index| index.name == "v_idx")
        .unwrap();

    let probes: [[f32; 3]; 3] = [[0.4, 0.4, 0.4], [0.9, 0.1, 0.0], [0.0, 0.9, 0.9]];
    for query in probes {
        let hnsw_top = nsw_search(table, idx_pos, &query, 1, 16, NswMetric::L2);

        // Linear scan: (distance, row position) for every row; a
        // non-vector cell is pushed to the far end.
        let mut brute: alloc::vec::Vec<(f32, usize)> =
            alloc::vec::Vec::with_capacity(table.rows.len());
        for pos in 0..table.rows.len() {
            let dist = if let Value::Vector(cell) = &table.rows[pos].values[1] {
                l2_distance_sq(cell, &query)
            } else {
                f32::INFINITY
            };
            brute.push((dist, pos));
        }
        brute.sort_by(|lhs, rhs| {
            lhs.0
                .partial_cmp(&rhs.0)
                .unwrap_or(core::cmp::Ordering::Equal)
        });

        assert!(!hnsw_top.is_empty(), "HNSW returned no results");
        assert_eq!(
            hnsw_top[0].1, brute[0].1,
            "HNSW top-1 != brute-force top-1 for {query:?}"
        );
    }
}
10495
#[test]
fn serialize_table_with_rows_round_trips() {
    // Two rows — one fully populated, one with a NULL score — must
    // survive the catalog round-trip helper.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();
    let fixtures = [
        vec![
            Value::Int(1),
            Value::Text("alice".into()),
            Value::Float(95.5),
        ],
        vec![Value::Int(2), Value::Text("bob".into()), Value::Null],
    ];
    for cells in fixtures {
        users.insert(Row::new(cells)).unwrap();
    }
    assert_round_trip(&catalog);
}
10515
#[test]
fn serialize_multiple_tables_round_trips() {
    // A catalog holding two tables (one empty, one with a single
    // row) must survive the round-trip helper.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();

    let flag_columns = vec![
        ColumnSchema::new("id", DataType::BigInt, false),
        ColumnSchema::new("active", DataType::Bool, false),
    ];
    catalog
        .create_table(TableSchema::new("flags", flag_columns))
        .unwrap();

    let flags = catalog.get_mut("flags").unwrap();
    let only_row = Row::new(vec![Value::BigInt(7), Value::Bool(true)]);
    flags.insert(only_row).unwrap();

    assert_round_trip(&catalog);
}
10534
#[test]
fn deserialize_rejects_bad_magic() {
    // Header layout used here: magic bytes, version byte, then a
    // little-endian u32 of zero. Only the magic is wrong.
    let mut header = Vec::new();
    header.extend_from_slice(b"BADMAGIC");
    header.push(FILE_VERSION);
    header.extend_from_slice(&0u32.to_le_bytes());

    let failure = Catalog::deserialize(&header).unwrap_err();
    assert!(matches!(failure, StorageError::Corrupt(_)));
}
10543
#[test]
fn deserialize_rejects_unsupported_version() {
    // Correct magic, but a version byte (99) no reader knows; the
    // error text must mention "version".
    let mut header = Vec::new();
    header.extend_from_slice(FILE_MAGIC);
    header.push(99); // future version
    header.extend_from_slice(&0u32.to_le_bytes());

    let failure = Catalog::deserialize(&header).unwrap_err();
    assert!(matches!(failure, StorageError::Corrupt(ref s) if s.contains("version")));
}
10552
#[test]
fn deserialize_rejects_truncated_file() {
    // Serialize a one-table catalog, chop off the final byte and
    // expect the reader to report corruption.
    let mut catalog = Catalog::new();
    catalog.create_table(make_users_schema()).unwrap();
    let snapshot = catalog.serialize();

    let keep = snapshot.len() - 1;
    let truncated = &snapshot[..keep];

    let outcome = Catalog::deserialize(truncated);
    assert!(matches!(outcome, Err(StorageError::Corrupt(_))));
}
10565
#[test]
fn deserialize_rejects_trailing_garbage() {
    // One stray byte after an otherwise valid (empty) catalog must
    // be rejected with an error that mentions "trailing".
    let mut snapshot = Catalog::new().serialize();
    snapshot.push(0xFF);

    let outcome = Catalog::deserialize(&snapshot);
    assert!(matches!(
        outcome,
        Err(StorageError::Corrupt(ref s)) if s.contains("trailing")
    ));
}
10576
10577 // --- v0.8 indices ------------------------------------------------------
10578
10579 fn populated_users() -> Catalog {
10580 let mut cat = Catalog::new();
10581 cat.create_table(make_users_schema()).unwrap();
10582 let t = cat.get_mut("users").unwrap();
10583 for (id, name, score) in [
10584 (1, "alice", Some(90.0)),
10585 (2, "bob", None),
10586 (3, "alice", Some(70.0)), // duplicate name → maps to two row idxs
10587 ] {
10588 t.insert(Row::new(vec![
10589 Value::Int(id),
10590 Value::Text(name.into()),
10591 score.map_or(Value::Null, Value::Float),
10592 ]))
10593 .unwrap();
10594 }
10595 cat
10596 }
10597
#[test]
fn add_index_builds_from_existing_rows() {
    // Creating an index on a populated table must index the rows
    // that are already there: id 2 lives at row position 1, id 99
    // does not exist.
    let mut catalog = populated_users();
    let users = catalog.get_mut("users").unwrap();
    users.add_index("by_id".into(), "id").unwrap();

    let users = catalog.get("users").unwrap();
    let by_id = users.index_on(0).expect("index_on(0)");
    let present = IndexKey::Int(2);
    let absent = IndexKey::Int(99);
    assert_eq!(by_id.lookup_eq(&present), &[RowLocator::Hot(1)]);
    assert_eq!(by_id.lookup_eq(&absent), &[] as &[RowLocator]);
}
10610
#[test]
fn add_index_dup_name_rejected() {
    // Index names are unique per table, even when the second index
    // targets a different column.
    let mut catalog = populated_users();
    let users = catalog.get_mut("users").unwrap();
    users.add_index("ix".into(), "id").unwrap();

    let second_attempt = users.add_index("ix".into(), "name");
    let failure = second_attempt.unwrap_err();
    assert!(matches!(failure, StorageError::DuplicateIndex { ref name } if name == "ix"));
}
10619
#[test]
fn add_index_unknown_column_rejected() {
    // Indexing a column the schema doesn't have must fail and name
    // the offending column.
    let mut catalog = populated_users();
    let users = catalog.get_mut("users").unwrap();
    let failure = users.add_index("ix".into(), "ghost").unwrap_err();
    assert!(matches!(failure, StorageError::ColumnNotFound { ref column } if column == "ghost"));
}
10630
#[test]
fn insert_after_create_index_updates_it() {
    // Rows inserted after the index exists must show up in it,
    // without disturbing entries built from pre-existing rows.
    let mut catalog = populated_users();
    let users = catalog.get_mut("users").unwrap();
    users.add_index("by_name".into(), "name").unwrap();

    let newcomer = Row::new(vec![
        Value::Int(4),
        Value::Text("dave".into()),
        Value::Null,
    ]);
    users.insert(newcomer).unwrap();

    let by_name = users.index_on(1).unwrap();
    let dave = IndexKey::Text("dave".into());
    let alice = IndexKey::Text("alice".into());
    // The fourth row lands at position 3.
    assert_eq!(by_name.lookup_eq(&dave), &[RowLocator::Hot(3)]);
    // Pre-existing duplicates remain mapped to the two original row idxs.
    assert_eq!(
        by_name.lookup_eq(&alice),
        &[RowLocator::Hot(0), RowLocator::Hot(2)]
    );
}
10653
#[test]
fn null_or_float_values_are_not_indexed() {
    // Index the Float `score` column. Per the spec referenced by the
    // original note, NULLs and NaN-prone Float values are not
    // indexed, so neither bob's NULL nor the present scores produce
    // entries. The probe below uses an Int key, which also
    // mismatches the column type, so it must come back empty.
    let mut catalog = populated_users();
    let users = catalog.get_mut("users").unwrap();
    users.add_index("by_score".into(), "score").unwrap();

    let by_score = users.index_on(2).unwrap();
    let probe = IndexKey::Int(90);
    assert_eq!(by_score.lookup_eq(&probe), &[] as &[RowLocator]);
}
10666
10667 // --- v0.11 vector type -------------------------------------------------
10668
#[test]
fn vector_value_data_type_carries_dim() {
    // A three-element `Value::Vector` reports itself as a
    // 3-dimensional F32-encoded vector type.
    let cell = Value::Vector(vec![1.0, 2.0, 3.0]);
    let expected = DataType::Vector {
        dim: 3,
        encoding: VecEncoding::F32,
    };
    assert_eq!(cell.data_type(), Some(expected));
}
10680
#[test]
fn vector_column_insert_matching_dim_ok() {
    // Inserting a 3-element vector into a dim-3 column succeeds.
    let column = ColumnSchema::new(
        "v",
        DataType::Vector {
            dim: 3,
            encoding: VecEncoding::F32,
        },
        false,
    );
    let mut catalog = Catalog::new();
    catalog
        .create_table(TableSchema::new("emb", vec![column]))
        .unwrap();

    let emb = catalog.get_mut("emb").unwrap();
    let row = Row::new(vec![Value::Vector(vec![1.0, 2.0, 3.0])]);
    emb.insert(row).unwrap();
}
10701
#[test]
fn vector_column_insert_dim_mismatch_rejected() {
    // A 2-element vector offered to a dim-3 column is a type
    // mismatch.
    let column = ColumnSchema::new(
        "v",
        DataType::Vector {
            dim: 3,
            encoding: VecEncoding::F32,
        },
        false,
    );
    let mut catalog = Catalog::new();
    catalog
        .create_table(TableSchema::new("emb", vec![column]))
        .unwrap();

    let emb = catalog.get_mut("emb").unwrap();
    let too_short = Row::new(vec![Value::Vector(vec![1.0, 2.0])]);
    let failure = emb.insert(too_short).unwrap_err();
    assert!(matches!(failure, StorageError::TypeMismatch { .. }));
}
10724
#[test]
fn vector_value_survives_catalog_round_trip() {
    // Both the column's vector type and the stored cell must come
    // back unchanged from serialize → deserialize.
    let columns = vec![
        ColumnSchema::new("id", DataType::Int, false),
        ColumnSchema::new(
            "v",
            DataType::Vector {
                dim: 4,
                encoding: VecEncoding::F32,
            },
            false,
        ),
    ];
    let mut catalog = Catalog::new();
    catalog
        .create_table(TableSchema::new("emb", columns))
        .unwrap();

    let emb = catalog.get_mut("emb").unwrap();
    let row = Row::new(vec![
        Value::Int(1),
        Value::Vector(vec![0.5, -1.25, 3.0, 7.0]),
    ]);
    emb.insert(row).unwrap();

    let snapshot = catalog.serialize();
    let restored = Catalog::deserialize(&snapshot).expect("round-trip");
    let table = restored.get("emb").unwrap();

    let expected_type = DataType::Vector {
        dim: 4,
        encoding: VecEncoding::F32,
    };
    assert_eq!(table.schema().columns[1].ty, expected_type);

    let expected_cell = Value::Vector(vec![0.5, -1.25, 3.0, 7.0]);
    assert_eq!(table.rows()[0].values[1], expected_cell);
}
10764
#[test]
fn index_survives_serialize_deserialize_round_trip() {
    // An index defined before the snapshot must exist, under the
    // same name and with the same lookups, after the restore.
    let mut original = populated_users();
    let users = original.get_mut("users").unwrap();
    users.add_index("by_name".into(), "name").unwrap();

    let snapshot = original.serialize();
    let restored = Catalog::deserialize(&snapshot).unwrap();
    let by_name = restored
        .get("users")
        .unwrap()
        .index_on(1)
        .expect("index_on(1) after restore");
    assert_eq!(by_name.name, "by_name");

    // NOTE(review): the earlier comment here said index data is
    // rebuilt from rows rather than deserialized directly; that may
    // predate the inline BTree entries of the v9 format — confirm.
    // Either way the lookup result must be the same.
    let alice = IndexKey::Text("alice".into());
    assert_eq!(
        by_name.lookup_eq(&alice),
        &[RowLocator::Hot(0), RowLocator::Hot(2)]
    );
}
10785
10786 // --- v5.1 cold-tier integration tests ----------------------
10787
10788 /// Schema with a BIGINT PK column matching what the v5.1 cold-
10789 /// tier path supports (`IndexKey::Int` → `u64` cast).
10790 fn bigint_pk_users_schema() -> TableSchema {
10791 TableSchema::new(
10792 "users",
10793 vec![
10794 ColumnSchema::new("id", DataType::BigInt, false),
10795 ColumnSchema::new("name", DataType::Text, false),
10796 ],
10797 )
10798 }
10799
10800 fn make_user_row(id: i64, name: &str) -> Row {
10801 Row::new(vec![Value::BigInt(id), Value::Text(name.into())])
10802 }
10803
10804 // v7.20 P4 — update_row incremental index maintenance.
10805
#[test]
fn update_row_non_indexed_column_keeps_index_intact() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();
    let seed = [(1i64, "alice"), (2, "bob"), (3, "carol")];
    for (id, name) in seed {
        users.insert(make_user_row(id, name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // Rewrite row 1 with the same id but a new `name`. Only the
    // non-indexed column changes, so key 2 must still point at
    // position 1.
    let replacement = vec![Value::BigInt(2), Value::Text("bobby".into())];
    users.update_row(1, replacement).unwrap();

    let by_id = users.index_on(0).unwrap();
    let key = IndexKey::Int(2);
    assert_eq!(
        by_id.lookup_eq(&key),
        &[RowLocator::Hot(1)],
        "old key still resolves the in-place position"
    );
    assert_eq!(users.rows()[1].values[1], Value::Text("bobby".into()));
}
10827
#[test]
fn update_row_indexed_column_moves_entry() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();
    let seed = [(1i64, "alice"), (2, "bob"), (3, "carol")];
    for (id, name) in seed {
        users.insert(make_user_row(id, name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // Row 1 changes its indexed key from 2 to 20.
    let replacement = vec![Value::BigInt(20), Value::Text("bob".into())];
    users.update_row(1, replacement).unwrap();

    let by_id = users.index_on(0).unwrap();
    let old_key = IndexKey::Int(2);
    let new_key = IndexKey::Int(20);
    assert!(by_id.lookup_eq(&old_key).is_empty(), "old key entry removed");
    assert_eq!(
        by_id.lookup_eq(&new_key),
        &[RowLocator::Hot(1)],
        "new key entry resolves the position"
    );
    // Rows that were not updated keep their entries.
    let first = IndexKey::Int(1);
    let third = IndexKey::Int(3);
    assert_eq!(by_id.lookup_eq(&first), &[RowLocator::Hot(0)]);
    assert_eq!(by_id.lookup_eq(&third), &[RowLocator::Hot(2)]);
}
10854
#[test]
fn update_row_duplicate_key_moves_only_target_position() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();
    // Positions 0 and 1 both carry key 7.
    let seed = [(7i64, "a"), (7, "b"), (9, "c")];
    for (id, name) in seed {
        users.insert(make_user_row(id, name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // Re-key position 1 from 7 to 8; position 0 has to stay under 7.
    let replacement = vec![Value::BigInt(8), Value::Text("b".into())];
    users.update_row(1, replacement).unwrap();

    let by_id = users.index_on(0).unwrap();
    let expectations = [(7, 0), (8, 1), (9, 2)];
    for (key, position) in expectations {
        let key = IndexKey::Int(key);
        assert_eq!(by_id.lookup_eq(&key), &[RowLocator::Hot(position)]);
    }
}
10873
#[test]
fn update_row_null_transition_on_indexed_nullable_column() {
    let columns = vec![
        ColumnSchema::new("id", DataType::BigInt, false),
        ColumnSchema::new("tag", DataType::BigInt, true),
    ];
    let mut catalog = Catalog::new();
    catalog.create_table(TableSchema::new("n", columns)).unwrap();

    let table = catalog.get_mut("n").unwrap();
    let initial = Row::new(vec![Value::BigInt(1), Value::BigInt(5)]);
    table.insert(initial).unwrap();
    table.add_index("by_tag".into(), "tag").unwrap();

    // Step 1 — tag 5 becomes NULL: the entry must drop out of the
    // index (NULL never enters a B-tree).
    let to_null = vec![Value::BigInt(1), Value::Null];
    table.update_row(0, to_null).unwrap();
    let by_tag = table.index_on(1).unwrap();
    let old_key = IndexKey::Int(5);
    assert!(by_tag.lookup_eq(&old_key).is_empty());

    // Step 2 — NULL becomes 6: the row re-enters under the new key.
    let to_six = vec![Value::BigInt(1), Value::BigInt(6)];
    table.update_row(0, to_six).unwrap();
    let by_tag = table.index_on(1).unwrap();
    let new_key = IndexKey::Int(6);
    assert_eq!(by_tag.lookup_eq(&new_key), &[RowLocator::Hot(0)]);
}
10900
#[test]
fn lookup_by_pk_finds_row_via_hot_index() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();
    let seed = [(1i64, "alice"), (2, "bob"), (3, "carol")];
    for (id, name) in seed {
        users.insert(make_user_row(id, name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // No segment was loaded, so the lookup can only be served from
    // the hot rows.
    let key = IndexKey::Int(2);
    let found = catalog.lookup_by_pk("users", "by_id", &key).unwrap();
    assert_eq!(found, make_user_row(2, "bob"));
    assert_eq!(catalog.cold_segment_count(), 0);
}
10917
#[test]
fn lookup_by_pk_returns_none_when_key_missing() {
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();
    users.insert(make_user_row(1, "alice")).unwrap();
    users.add_index("by_id".into(), "id").unwrap();

    // Three ways to miss: absent key, unknown table, unknown index.
    let misses = [
        ("users", "by_id", 999),
        ("other_table", "by_id", 1),
        ("users", "no_such_index", 1),
    ];
    for (table, index, key) in misses {
        let key = IndexKey::Int(key);
        assert!(catalog.lookup_by_pk(table, index, &key).is_none());
    }
}
10939
#[test]
fn lookup_by_pk_resolves_cold_locator_via_loaded_segment() {
    // Encode four rows into a segment (dense row bodies keyed by
    // the id cast to u64), load it, and register each id in the
    // BTree index as a Cold locator. No rows are inserted into the
    // table itself, so every hit has to come from the segment.
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();
    users.add_index("by_id".into(), "id").unwrap();
    let schema = users.schema.clone();

    let cold_rows: Vec<(i64, &str)> =
        vec![(100, "ivy"), (200, "joe"), (300, "kim"), (400, "lin")];

    let mut seg_rows: Vec<(u64, Vec<u8>)> = Vec::with_capacity(cold_rows.len());
    for &(id, name) in &cold_rows {
        let body = encode_row_body_dense(&make_user_row(id, name), &schema);
        seg_rows.push((id.cast_unsigned(), body));
    }
    let (seg_bytes, _meta) =
        encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
    let seg_id = catalog.load_segment_bytes(seg_bytes).unwrap();
    assert_eq!(seg_id, 0);
    assert_eq!(catalog.cold_segment_count(), 1);

    let mut pairs: Vec<(IndexKey, RowLocator)> = Vec::with_capacity(cold_rows.len());
    for &(id, _) in &cold_rows {
        let locator = RowLocator::Cold {
            segment_id: seg_id,
            page_offset: 0,
        };
        pairs.push((IndexKey::Int(id), locator));
    }
    let users = catalog.get_mut("users").unwrap();
    let registered = users.register_cold_locators("by_id", pairs).unwrap();
    assert_eq!(registered, 4);

    for (id, name) in &cold_rows {
        let key = IndexKey::Int(*id);
        let got = catalog
            .lookup_by_pk("users", "by_id", &key)
            .unwrap_or_else(|| panic!("cold key {id} not found"));
        assert_eq!(got, make_user_row(*id, name));
    }
    // A key that was never registered (and isn't in the segment)
    // must return None.
    let unknown = IndexKey::Int(999);
    assert!(catalog.lookup_by_pk("users", "by_id", &unknown).is_none());
}
10997
#[test]
fn lookup_by_pk_mixes_hot_and_cold_tiers() {
    // Ids 1 and 2 are ordinary inserted rows (indexed by
    // `add_index`); ids 100 and 200 exist only in a loaded segment
    // and are wired into the same index as Cold locators. Every
    // lookup must be answered from the right tier.
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    let users = catalog.get_mut("users").unwrap();
    let hot_rows = [(1i64, "alice"), (2, "bob")];
    for (id, name) in hot_rows {
        users.insert(make_user_row(id, name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();
    let schema = users.schema.clone();

    let cold_rows: Vec<(i64, &str)> = vec![(100, "ivy"), (200, "joe")];
    let seg_rows: Vec<(u64, Vec<u8>)> = cold_rows
        .iter()
        .map(|&(id, name)| {
            let body = encode_row_body_dense(&make_user_row(id, name), &schema);
            (id.cast_unsigned(), body)
        })
        .collect();
    let (seg_bytes, _) =
        encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
    let seg_id = catalog.load_segment_bytes(seg_bytes).unwrap();

    let pairs: Vec<(IndexKey, RowLocator)> = cold_rows
        .iter()
        .map(|&(id, _)| {
            let locator = RowLocator::Cold {
                segment_id: seg_id,
                page_offset: 0,
            };
            (IndexKey::Int(id), locator)
        })
        .collect();
    let users = catalog.get_mut("users").unwrap();
    users.register_cold_locators("by_id", pairs).unwrap();

    // Hot-tier ids first, then cold-tier ids — same order as the
    // fixtures above.
    let expected_hits = [(1i64, "alice"), (2, "bob"), (100, "ivy"), (200, "joe")];
    for (id, name) in expected_hits {
        let key = IndexKey::Int(id);
        assert_eq!(
            catalog.lookup_by_pk("users", "by_id", &key).unwrap(),
            make_user_row(id, name)
        );
    }
    // Miss in both tiers.
    let absent = IndexKey::Int(50);
    assert!(catalog.lookup_by_pk("users", "by_id", &absent).is_none());
}
11068
#[test]
fn register_cold_locators_rejects_nsw_index() {
    // Cold locators can only be attached to a BTree index; pointing
    // the call at an NSW index must fail.
    let columns = vec![
        ColumnSchema::new("id", DataType::Int, false),
        ColumnSchema::new(
            "v",
            DataType::Vector {
                dim: 4,
                encoding: VecEncoding::F32,
            },
            false,
        ),
    ];
    let mut catalog = Catalog::new();
    catalog
        .create_table(TableSchema::new("vecs", columns))
        .unwrap();

    let vecs = catalog.get_mut("vecs").unwrap();
    let row = Row::new(vec![
        Value::Int(1),
        Value::Vector(vec![1.0, 0.0, 0.0, 0.0]),
    ]);
    vecs.insert(row).unwrap();
    vecs.add_nsw_index("by_v".into(), "v", NSW_DEFAULT_M).unwrap();

    let cold = RowLocator::Cold {
        segment_id: 0,
        page_offset: 0,
    };
    let pairs = vec![(IndexKey::Int(1), cold)];
    let failure = vecs.register_cold_locators("by_v", pairs).unwrap_err();
    // v6.7.1: message switched from "is NSW" to "is not BTree"
    // when the Brin variant was added.
    assert!(matches!(failure, StorageError::Corrupt(ref s) if s.contains("not BTree")));
}
11110
#[test]
fn load_segment_bytes_rejects_garbage() {
    // Ten zero bytes are not a segment: expect a corruption error
    // mentioning "segment", and no segment registered afterwards.
    let mut catalog = Catalog::new();
    let junk = vec![0u8; 10];
    let failure = catalog.load_segment_bytes(junk).unwrap_err();
    assert!(matches!(failure, StorageError::Corrupt(ref s) if s.contains("segment")));
    // A failed load must leave the segment registry untouched.
    assert_eq!(catalog.cold_segment_count(), 0);
}
11119
#[test]
fn load_segment_bytes_returns_sequential_ids() {
    // Load three four-row segments one after another; the returned
    // ids must be 0, 1, 2 in load order.
    let mut catalog = Catalog::new();
    catalog.create_table(bigint_pk_users_schema()).unwrap();
    let schema = catalog.get("users").unwrap().schema.clone();

    for batch in 0u32..3 {
        let first_id = u64::from(batch) * 100;
        let mut rows: Vec<(u64, Vec<u8>)> = Vec::with_capacity(4);
        for offset in 0u64..4 {
            let id = first_id + offset;
            let body = encode_row_body_dense(&make_user_row(id.cast_signed(), "x"), &schema);
            rows.push((id, body));
        }
        let (bytes, _) = encode_segment(rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
        let assigned = catalog.load_segment_bytes(bytes).unwrap();
        assert_eq!(assigned, batch);
    }
    assert_eq!(catalog.cold_segment_count(), 3);
}
11138
11139 // --- v5.2 catalog format v9 ----------------------------------
11140
/// Backward-compat guard for the read path: a catalog written in
/// the v8 layout must still load with the current reader, and
/// every `BTree` entry it yields must be a Hot locator. Keeps
/// pre-v5.2 snapshots loadable after the format bump.
#[test]
fn v8_catalog_decodes_as_all_hot_under_v9_reader() {
    // Fixture: the three-row `users` table with a BTree index on
    // `name`.
    let mut catalog = populated_users();
    let users = catalog.get_mut("users").unwrap();
    users.add_index("by_name".into(), "name").unwrap();

    // Re-encode the catalog with the test-only v8 writer: same
    // layout up to and including the per-index kind tag, but no
    // inline BTree entries, and version byte 8.
    let v8_bytes = encode_as_v8(&catalog);
    let version_byte = v8_bytes[FILE_MAGIC.len()];
    assert_eq!(version_byte, 8, "version byte must be 8");

    let restored = Catalog::deserialize(&v8_bytes).expect("v9 reader accepts v8 stream");
    let by_name = restored
        .get("users")
        .unwrap()
        .index_on(1)
        .expect("index_on(1) after restore");

    // With no entries in the stream the reader has to produce the
    // index itself; no cold tier existed pre-v5.2, so both "alice"
    // rows must come back as Hot positions.
    let alice = IndexKey::Text("alice".into());
    let hits = by_name.lookup_eq(&alice);
    assert_eq!(hits, &[RowLocator::Hot(0), RowLocator::Hot(2)]);
    // No accidental Cold leak.
    for entry in hits {
        assert!(entry.is_hot(), "v8 → v9 read must yield Hot only");
    }
}
11183
/// Encode `cat` using the v8 layout (no inline `BTree` entries,
/// version byte = 8). Pure test helper — duplicates just enough
/// of `Catalog::serialize` to produce a v8-shaped stream like the
/// ones v3.0.2 / v4.x deployments wrote.
///
/// Stream order, as written below: magic, version byte, table
/// count (u32), then per table: name, column count (u16), columns,
/// row count (u32), dense-encoded rows, index count (u16), indices.
///
/// # Panics
///
/// Panics if any table carries a BRIN, GIN, trigram-GIN or
/// fulltext-GIN index (the v8 layout has no encoding for them), or
/// if any count does not fit its fixed-width field.
fn encode_as_v8(cat: &Catalog) -> Vec<u8> {
    let mut out = Vec::with_capacity(64);
    // File header: magic followed by the hard-coded legacy version.
    out.extend_from_slice(FILE_MAGIC);
    out.push(8u8);
    write_u32(&mut out, u32::try_from(cat.tables.len()).unwrap());
    for t in &cat.tables {
        // Schema: table name, then each column's name, type,
        // nullable flag, optional default and auto-increment flag.
        write_str(&mut out, &t.schema.name);
        write_u16(&mut out, u16::try_from(t.schema.columns.len()).unwrap());
        for c in &t.schema.columns {
            write_str(&mut out, &c.name);
            write_data_type(&mut out, c.ty);
            out.push(u8::from(c.nullable));
            // Default value: presence byte (0 = none, 1 = some)
            // followed by the encoded value when present.
            match &c.default {
                None => out.push(0),
                Some(v) => {
                    out.push(1);
                    write_value(&mut out, v);
                }
            }
            out.push(u8::from(c.auto_increment));
        }
        // Rows: count, then each row body in dense encoding.
        write_u32(&mut out, u32::try_from(t.rows.len()).unwrap());
        for row in &t.rows {
            out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
        }
        // Indices: count, then name, column position and a kind
        // tag (0 = BTree, 1 = NSW) per index.
        write_u16(&mut out, u16::try_from(t.indices.len()).unwrap());
        for idx in &t.indices {
            write_str(&mut out, &idx.name);
            write_u16(&mut out, u16::try_from(idx.column_position).unwrap());
            match &idx.kind {
                // v8 BTree wrote only the kind tag; entries
                // rebuild from rows on read.
                IndexKind::BTree(_) => out.push(0),
                // NSW: kind tag, `m` as u16, then the graph itself.
                IndexKind::Nsw(g) => {
                    out.push(1);
                    write_u16(&mut out, u16::try_from(g.m).unwrap());
                    write_nsw_graph(&mut out, g);
                }
                // v8 had no BRIN / GIN; this test-only writer
                // can't serialise either into the legacy format.
                IndexKind::Brin { .. } => panic!(
                    "v8 catalog writer cannot serialise BRIN — \
                     tests with BRIN indices must use the current writer"
                ),
                IndexKind::Gin(_) => panic!(
                    "v8 catalog writer cannot serialise GIN — \
                     tests with GIN indices must use the current writer"
                ),
                IndexKind::GinTrgm(_) => panic!(
                    "v8 catalog writer cannot serialise trigram-GIN — \
                     tests with trgm indices must use the current writer"
                ),
                IndexKind::GinFulltext(_) => panic!(
                    "v8 catalog writer cannot serialise fulltext-GIN — \
                     tests with FULLTEXT KEY must use the current writer"
                ),
            }
        }
    }
    out
}
11249
/// Exercises the v9 catalog codec on a `BTree` index that holds both Hot
/// and Cold locators. The catalog is snapshotted with `serialize`,
/// restored with `deserialize`, and then checked three ways: the Hot
/// locators are unchanged, every Cold locator comes back with the same
/// segment id and page offset, and `lookup_by_pk` resolves rows from
/// both tiers once the segment bytes are loaded into the restored
/// catalog again.
#[test]
fn v9_catalog_round_trip_preserves_cold_locators() {
    // Cold rows: 100, 200, 300 — sit in a single segment.
    let cold_rows: [(i64, &str); 3] = [(100, "ivy"), (200, "joe"), (300, "kim")];

    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    // Hot rows: 1, 2
    users.insert(make_user_row(1, "alice")).unwrap();
    users.insert(make_user_row(2, "bob")).unwrap();
    users.add_index("by_id".into(), "id").unwrap();
    let schema = users.schema.clone();

    // Encode the cold rows into one segment keyed by the unsigned PK.
    let mut seg_rows: Vec<(u64, Vec<u8>)> = Vec::with_capacity(cold_rows.len());
    for (id, name) in &cold_rows {
        let body = encode_row_body_dense(&make_user_row(*id, name), &schema);
        seg_rows.push(((*id).cast_unsigned(), body));
    }
    let (seg_bytes, _) =
        encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES).unwrap();
    let seg_id = cat.load_segment_bytes(seg_bytes.clone()).unwrap();

    // Every cold key points at page 0 of the freshly loaded segment.
    let cold_locator = move || RowLocator::Cold {
        segment_id: seg_id,
        page_offset: 0,
    };
    let mut pairs: Vec<(IndexKey, RowLocator)> = Vec::with_capacity(cold_rows.len());
    for (id, _) in &cold_rows {
        pairs.push((IndexKey::Int(*id), cold_locator()));
    }
    let users = cat.get_mut("users").unwrap();
    users.register_cold_locators("by_id", pairs).unwrap();

    // Snapshot + restore via the v9 codec.
    let bytes = cat.serialize();
    assert_eq!(bytes[FILE_MAGIC.len()], FILE_VERSION);
    let mut restored = Catalog::deserialize(&bytes).expect("v9 round-trip parses");

    // `Catalog::serialize` does not yet emit the cold segment file bytes
    // (the v5.3 manifest is the future home for that), so the test
    // side-loads the same segment into the restored catalog. What is
    // being verified is that the locator metadata survived the catalog
    // round-trip.
    let restored_seg_id = restored.load_segment_bytes(seg_bytes).unwrap();
    assert_eq!(restored_seg_id, seg_id);

    let idx = restored.get("users").unwrap().index_on(0).unwrap();
    // Hot locators round-trip.
    for (key, position) in [(1i64, 0), (2, 1)] {
        assert_eq!(
            idx.lookup_eq(&IndexKey::Int(key)),
            &[RowLocator::Hot(position)]
        );
    }
    // Cold locators round-trip byte-identical.
    for (id, _) in &cold_rows {
        let found = idx.lookup_eq(&IndexKey::Int(*id));
        assert_eq!(found, &[cold_locator()]);
    }

    // End-to-end: lookup_by_pk resolves both tiers.
    let hot_hit = restored
        .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
        .unwrap();
    assert_eq!(hot_hit, make_user_row(2, "bob"));
    for (id, name) in &cold_rows {
        let cold_hit = restored
            .lookup_by_pk("users", "by_id", &IndexKey::Int(*id))
            .unwrap();
        assert_eq!(cold_hit, make_user_row(*id, name));
    }
}
11340
11341 // --- v5.2.1 hot tier byte tracking ---------------------------
11342
/// Pins the `row_body_encoded_len` fast path to the real encoder: for
/// each sample row the predicted length must equal
/// `encode_row_body_dense(...).len()`. The samples cover one column per
/// representative cell type so a change to the encoder cannot silently
/// desync the byte counter.
#[test]
fn row_body_encoded_len_matches_actual_encode_for_all_types() {
    // Column `a` is the only one constructed with `true` as its third
    // argument; it is also the column that receives `Value::Null` in
    // the second sample row.
    let columns = vec![
        ColumnSchema::new("a", DataType::SmallInt, true),
        ColumnSchema::new("b", DataType::Int, false),
        ColumnSchema::new("c", DataType::BigInt, false),
        ColumnSchema::new("d", DataType::Float, false),
        ColumnSchema::new("e", DataType::Bool, false),
        ColumnSchema::new("f", DataType::Text, false),
        ColumnSchema::new(
            "g",
            DataType::Vector {
                dim: 3,
                encoding: VecEncoding::F32,
            },
            false,
        ),
        ColumnSchema::new(
            "h",
            DataType::Numeric {
                precision: 18,
                scale: 2,
            },
            false,
        ),
        ColumnSchema::new("i", DataType::Date, false),
        ColumnSchema::new("j", DataType::Timestamp, false),
    ];
    let schema = TableSchema::new("wide", columns);

    // Ordinary positive values in every column.
    let typical = Row::new(vec![
        Value::SmallInt(7),
        Value::Int(42),
        Value::BigInt(1_000_000),
        Value::Float(1.5),
        Value::Bool(true),
        Value::Text("hello".into()),
        Value::Vector(vec![1.0, 2.0, 3.0]),
        Value::Numeric {
            scaled: 12345,
            scale: 2,
        },
        Value::Date(20_000),
        Value::Timestamp(1_700_000_000_000_000),
    ]);
    // NULL in the bitmap, varied text length.
    let with_null_and_empties = Row::new(vec![
        Value::Null,
        Value::Int(0),
        Value::BigInt(0),
        Value::Float(0.0),
        Value::Bool(false),
        Value::Text(String::new()),
        Value::Vector(vec![]),
        Value::Numeric {
            scaled: 0,
            scale: 2,
        },
        Value::Date(0),
        Value::Timestamp(0),
    ]);
    // Negative numbers and a longer text payload.
    let negatives = Row::new(vec![
        Value::SmallInt(-1),
        Value::Int(-1),
        Value::BigInt(-1),
        Value::Float(-0.5),
        Value::Bool(true),
        Value::Text("a much longer payload here".into()),
        Value::Vector(vec![0.1, 0.2, 0.3]),
        Value::Numeric {
            scaled: -999_999_999,
            scale: 2,
        },
        Value::Date(-1),
        Value::Timestamp(-1),
    ]);

    let cases = [typical, with_null_and_empties, negatives];
    for row in &cases {
        let fast = row_body_encoded_len(row, &schema);
        let actual = encode_row_body_dense(row, &schema).len();
        assert_eq!(actual, fast, "row {row:?}");
    }
}
11432
/// Starting from an empty table, the per-table `hot_bytes` counter and
/// the catalog-wide `hot_tier_bytes` total must both equal the sum of
/// the dense-encoded lengths of the inserted rows.
#[test]
fn hot_bytes_grows_on_insert_and_matches_encoded_sum() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    assert_eq!(users.hot_bytes(), 0);

    let mut expected = 0u64;
    for (id, name) in [(1i64, "alice"), (2, "bob"), (3, "carol")] {
        let row = make_user_row(id, name);
        // Measure before the insert consumes the row.
        let encoded_len = encode_row_body_dense(&row, &users.schema).len();
        users.insert(row).unwrap();
        expected += encoded_len as u64;
    }

    assert_eq!(users.hot_bytes(), expected);
    assert_eq!(cat.hot_tier_bytes(), expected);
}
11448
/// Deleting one row must lower `hot_bytes` by exactly that row's
/// dense-encoded length.
#[test]
fn hot_bytes_shrinks_on_delete() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();

    // Measure bob up front; he is the row removed below.
    let bob = make_user_row(2, "bob");
    let bob_bytes = encode_row_body_dense(&bob, &users.schema).len() as u64;
    for row in [make_user_row(1, "alice"), bob, make_user_row(3, "carol")] {
        users.insert(row).unwrap();
    }
    let before = users.hot_bytes();

    // Delete row at position 1 (bob).
    assert_eq!(users.delete_rows(&[1]), 1);
    assert_eq!(users.hot_bytes(), before - bob_bytes);
}
11465
/// Updating a row with a longer text value must move `hot_bytes` by the
/// difference between the old and the new dense-encoded lengths.
#[test]
fn hot_bytes_diffs_on_update_for_variable_width_columns() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();

    let original = make_user_row(1, "alice");
    let replacement = make_user_row(1, "alice-the-longer-name");
    let old_len = encode_row_body_dense(&original, &users.schema).len() as u64;
    let new_len = encode_row_body_dense(&replacement, &users.schema).len() as u64;

    users.insert(original).unwrap();
    let after_insert = users.hot_bytes();

    // Update with a longer text payload — bytes must grow exactly by
    // the text-length delta.
    users.update_row(0, replacement.values).unwrap();
    let after_update = users.hot_bytes();
    assert_eq!(after_update, after_insert - old_len + new_len);
    assert!(after_update > after_insert, "longer text grew the counter");
}
11482
/// A catalog restored from its own serialized bytes must report the
/// same hot-tier byte totals as the catalog it was taken from, both
/// catalog-wide and for the single `users` table.
#[test]
fn hot_bytes_round_trips_through_serialize_deserialize() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for i in 0..10 {
        let name = alloc::format!("name-{i}");
        users.insert(make_user_row(i, &name)).unwrap();
    }
    let pre = cat.hot_tier_bytes();

    let snapshot = cat.serialize();
    let restored = Catalog::deserialize(&snapshot).unwrap();
    assert_eq!(restored.hot_tier_bytes(), pre);
    let restored_users = restored.get("users").unwrap();
    assert_eq!(restored_users.hot_bytes(), pre);
}
11497
11498 // --- v5.2.2 freezer atomic swap -------------------------------
11499
/// Happy path for `freeze_oldest_to_cold`: freeze 6 of 10 hot rows and
/// check the report, the remaining hot row count, the cold segment
/// count, and the `hot_bytes` accounting. Every original PK must still
/// resolve through `lookup_by_pk` afterwards.
#[test]
fn freeze_oldest_to_cold_moves_rows_and_keeps_lookups_working() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..10i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();
    let total_bytes_before = users.hot_bytes();

    let report = cat
        .freeze_oldest_to_cold("users", "by_id", 6)
        .expect("freeze succeeds");
    assert_eq!(report.frozen_rows, 6);
    assert_eq!(report.segment_id, 0);
    assert!(report.bytes_freed > 0);
    assert!(!report.segment_bytes.is_empty());

    assert_eq!(cat.cold_segment_count(), 1);
    let users = cat.get("users").unwrap();
    assert_eq!(users.row_count(), 4, "4 hot rows remain (10 - 6 frozen)");
    // Hot bytes shrank by exactly the freed amount.
    let expected_hot_bytes = total_bytes_before - report.bytes_freed;
    assert_eq!(
        users.hot_bytes(),
        expected_hot_bytes,
        "hot_bytes accounting matches FreezeReport"
    );

    // Every original PK still resolves — frozen ones via the cold
    // segment, kept ones via the (renumbered) hot tier.
    for id in 0..10i64 {
        let expected_row = make_user_row(id, &alloc::format!("u-{id}"));
        let got = cat
            .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
            .unwrap_or_else(|| panic!("PK {id} disappeared after freeze"));
        assert_eq!(got, expected_row);
    }
}
11543
/// Runs two freezes back to back on the same index and checks that the
/// cold locators from the first batch are still present after the
/// second. Per the original note, this catches the `rebuild_indices`
/// wipe-Cold-on-delete bug that `collect_cold_locators` / re-register
/// guards against.
#[test]
fn freeze_twice_preserves_prior_cold_locators() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..12i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // Two batches of four rows each; each must succeed.
    for failure_note in ["first freeze ok", "second freeze ok"] {
        cat.freeze_oldest_to_cold("users", "by_id", 4)
            .expect(failure_note);
    }

    let hot_rows_left = cat.get("users").unwrap().row_count();
    assert_eq!(hot_rows_left, 4);
    assert_eq!(cat.cold_segment_count(), 2);

    // All 12 PKs still resolve — first 4 via segment 0,
    // next 4 via segment 1, last 4 still hot.
    for id in 0..12i64 {
        let expected_row = make_user_row(id, &alloc::format!("u-{id}"));
        let got = cat
            .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
            .unwrap_or_else(|| panic!("PK {id} not resolvable after two freezes"));
        assert_eq!(got, expected_row);
    }
}
11575
/// Validation guards for `freeze_oldest_to_cold`. Each bad call must
/// come back as `Err(StorageError::Corrupt(_))`, and afterwards the
/// catalog must be **unchanged** — the API is all-or-nothing.
#[test]
fn freeze_oldest_to_cold_rejects_invalid_input() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..3i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // max_rows == 0
    let zero_rows = cat.freeze_oldest_to_cold("users", "by_id", 0);
    assert!(matches!(zero_rows, Err(StorageError::Corrupt(_))));

    // table missing
    let missing_table = cat.freeze_oldest_to_cold("missing", "by_id", 1);
    assert!(matches!(missing_table, Err(StorageError::Corrupt(_))));

    // index missing
    let missing_index = cat.freeze_oldest_to_cold("users", "no_such_index", 1);
    assert!(matches!(missing_index, Err(StorageError::Corrupt(_))));

    // max_rows > row_count
    let too_many_rows = cat.freeze_oldest_to_cold("users", "by_id", 999);
    assert!(matches!(too_many_rows, Err(StorageError::Corrupt(_))));

    // Catalog still untouched.
    assert_eq!(cat.cold_segment_count(), 0);
    assert_eq!(cat.get("users").unwrap().row_count(), 3);
}
11613
/// Freezing through an index on a Text column must fail with a
/// `Corrupt` error whose message mentions "non-integer", and must leave
/// the catalog unchanged. (Per the original note, Text PKs land in
/// v5.5+.)
#[test]
fn freeze_oldest_to_cold_rejects_non_integer_pk() {
    let text_keyed_schema = TableSchema::new(
        "by_name",
        vec![
            ColumnSchema::new("name", DataType::Text, false),
            ColumnSchema::new("payload", DataType::BigInt, false),
        ],
    );

    let mut cat = Catalog::new();
    cat.create_table(text_keyed_schema).unwrap();
    let table = cat.get_mut("by_name").unwrap();
    let only_row = Row::new(vec![Value::Text("a".into()), Value::BigInt(1)]);
    table.insert(only_row).unwrap();
    table.add_index("by_n".into(), "name").unwrap();

    let err = cat
        .freeze_oldest_to_cold("by_name", "by_n", 1)
        .expect_err("non-integer PK rejected");
    match err {
        StorageError::Corrupt(s) => {
            assert!(
                s.contains("non-integer"),
                "error message names the constraint: {s}"
            );
        }
        other => panic!("expected Corrupt, got {other:?}"),
    }

    // Catalog untouched.
    assert_eq!(cat.cold_segment_count(), 0);
    assert_eq!(cat.get("by_name").unwrap().row_count(), 1);
}
11645
/// After a freeze, the rows that stay hot must remain reachable through
/// a secondary index. The test freezes the first three rows and then
/// checks that the `by_name` entry for a surviving row is a single Hot
/// locator pointing at that row's new, shifted position.
#[test]
fn freeze_keeps_remaining_hot_rows_addressable_via_secondary_index() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..6i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();
    users.add_index("by_name".into(), "name").unwrap();

    cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();

    // Remaining hot rows: id 3, 4, 5. They now occupy positions 0, 1, 2,
    // so the `by_name` index must resolve them via fresh Hot locators.
    let by_name = cat.get("users").unwrap().index_on(1).unwrap();
    let got = by_name.lookup_eq(&IndexKey::Text("u-4".into()));
    assert_eq!(got.len(), 1);
    assert!(got[0].is_hot(), "kept-hot rows still surface as Hot");

    // The row with id 4 was inserted at position 4; with the first
    // three positions frozen away it sits at position 1.
    let RowLocator::Hot(position) = got[0] else {
        unreachable!()
    };
    assert_eq!(position, 1);
}
11680
11681 // --- v5.2.3 promote-on-write primitives ----------------------
11682
/// Freezes the first four of six rows, promotes PK 2 back with
/// `promote_cold_row`, and checks the result: the returned hot index,
/// the hot row count, the `hot_bytes` growth (exactly one encoded row),
/// the index entry for PK 2 (one locator, Hot), and that `lookup_by_pk`
/// still returns both the promoted row and an untouched cold row.
#[test]
fn promote_cold_row_pulls_frozen_row_back_to_hot_tier() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..6i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // Freeze the first 4 rows (ids 0 through 3). Afterwards ids 4 and 5
    // are the only hot rows, at positions 0 and 1.
    cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
    let hot_bytes_before = cat.get("users").unwrap().hot_bytes();

    // Promote PK=2 — it lives in segment 0 as a cold row.
    let promoted_key = IndexKey::Int(2);
    let new_idx = cat
        .promote_cold_row("users", "by_id", &promoted_key)
        .expect("promote ok")
        .expect("PK 2 was cold");
    assert_eq!(
        new_idx, 2,
        "promoted row appended after the 2 surviving hot rows"
    );

    let users = cat.get("users").unwrap();
    assert_eq!(users.row_count(), 3, "hot tier grew from 2 to 3");

    // Hot-bytes climbed by exactly one row's encoded length.
    let row = make_user_row(2, "u-2");
    let row_len = encode_row_body_dense(&row, &users.schema).len() as u64;
    assert_eq!(users.hot_bytes(), hot_bytes_before + row_len);

    // The index now reports a Hot locator (the freshly inserted row) —
    // no Cold locator left for PK 2.
    let by_id = users.index_on(0).unwrap();
    let entries = by_id.lookup_eq(&promoted_key);
    assert_eq!(entries.len(), 1, "exactly one locator per key");
    assert!(entries[0].is_hot(), "promote retired the Cold locator");

    // End-to-end: lookup_by_pk still returns the row body.
    let promoted = cat.lookup_by_pk("users", "by_id", &promoted_key).unwrap();
    assert_eq!(promoted, row);

    // Other cold rows untouched — still resolvable through the segment.
    let still_cold = cat
        .lookup_by_pk("users", "by_id", &IndexKey::Int(0))
        .unwrap();
    assert_eq!(still_cold, make_user_row(0, "u-0"));
}
11739
/// `promote_cold_row` must answer `Ok(None)` — not an error — when the
/// key is already hot or does not exist, so the caller can fall back to
/// the hot-only update/delete path. Neither call may change the catalog.
#[test]
fn promote_cold_row_returns_none_when_key_is_not_cold() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    users.insert(make_user_row(7, "alice")).unwrap();
    users.add_index("by_id".into(), "id").unwrap();

    // Hot-only key.
    let hot_only = cat
        .promote_cold_row("users", "by_id", &IndexKey::Int(7))
        .unwrap();
    assert!(hot_only.is_none());

    // Absent key.
    let absent = cat
        .promote_cold_row("users", "by_id", &IndexKey::Int(99))
        .unwrap();
    assert!(absent.is_none());

    // Catalog untouched on both no-op paths.
    assert_eq!(cat.cold_segment_count(), 0);
    assert_eq!(cat.get("users").unwrap().row_count(), 1);
}
11767
/// `shadow_cold_row` retires the Cold locator for a key on a `BTree`
/// index. Once shadowed, `lookup_by_pk` for that key returns `None`,
/// while the other frozen keys keep resolving. (Per the original note,
/// the row bytes remain in the segment as garbage until compaction.)
#[test]
fn shadow_cold_row_removes_cold_locators_and_drops_lookup() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..5i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();
    cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();

    // Shadow PK=1 — pre-shadow lookup hits the cold tier.
    let shadowed_key = IndexKey::Int(1);
    let before_shadow = cat.lookup_by_pk("users", "by_id", &shadowed_key);
    assert!(before_shadow.is_some(), "frozen PK resolves before shadow");

    let removed = cat
        .shadow_cold_row("users", "by_id", &shadowed_key)
        .unwrap();
    assert_eq!(removed, 1, "exactly one cold locator retired");

    // Post-shadow: lookup misses, even though the row still exists in
    // segment 0.
    let after_shadow = cat.lookup_by_pk("users", "by_id", &shadowed_key);
    assert!(after_shadow.is_none(), "shadowed key no longer resolves");

    // Other cold keys still resolve.
    let first = cat
        .lookup_by_pk("users", "by_id", &IndexKey::Int(0))
        .unwrap();
    assert_eq!(first, make_user_row(0, "u-0"));
    let third = cat
        .lookup_by_pk("users", "by_id", &IndexKey::Int(2))
        .unwrap();
    assert_eq!(third, make_user_row(2, "u-2"));
}
11814
/// `shadow_cold_row` must report `0` rather than an error when the key
/// has only Hot entries or no entries at all. Per the original note, the
/// engine's DELETE path reads that count to decide whether the
/// cold-tier shadow path consumed the work.
#[test]
fn shadow_cold_row_returns_zero_when_key_is_not_cold() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    users.insert(make_user_row(1, "alice")).unwrap();
    users.add_index("by_id".into(), "id").unwrap();

    let dropped_for_hot_key = cat
        .shadow_cold_row("users", "by_id", &IndexKey::Int(1))
        .unwrap();
    assert_eq!(dropped_for_hot_key, 0, "hot-only key drops no cold locators");

    let dropped_for_absent_key = cat
        .shadow_cold_row("users", "by_id", &IndexKey::Int(999))
        .unwrap();
    assert_eq!(
        dropped_for_absent_key, 0,
        "absent key drops no cold locators"
    );

    let hot_rows = cat.get("users").unwrap().row_count();
    assert_eq!(hot_rows, 1);
}
11840
/// Both `promote_cold_row` and `shadow_cold_row` must return
/// `Err(StorageError::Corrupt(_))` for an unknown table and for an
/// unknown index.
#[test]
fn promote_and_shadow_reject_invalid_inputs() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    users.insert(make_user_row(1, "alice")).unwrap();
    users.add_index("by_id".into(), "id").unwrap();

    let key = IndexKey::Int(1);

    // Missing table.
    let promote_missing_table = cat.promote_cold_row("missing", "by_id", &key);
    assert!(matches!(
        promote_missing_table,
        Err(StorageError::Corrupt(_))
    ));
    let shadow_missing_table = cat.shadow_cold_row("missing", "by_id", &key);
    assert!(matches!(
        shadow_missing_table,
        Err(StorageError::Corrupt(_))
    ));

    // Missing index.
    let promote_missing_index = cat.promote_cold_row("users", "no_such_index", &key);
    assert!(matches!(
        promote_missing_index,
        Err(StorageError::Corrupt(_))
    ));
    let shadow_missing_index = cat.shadow_cold_row("users", "no_such_index", &key);
    assert!(matches!(
        shadow_missing_index,
        Err(StorageError::Corrupt(_))
    ));
}
11869
11870 // --- v6.7.4 parallel-freezer slice/commit API -----------------
11871
/// A single slice spanning the whole freeze range must leave the
/// catalog in the same state as the single-threaded
/// `freeze_oldest_to_cold`: same segment id, frozen row count, freed
/// bytes and segment bytes, and the same `lookup_by_pk` answer for
/// every PK.
#[test]
fn commit_freeze_slices_single_slice_matches_freeze_oldest() {
    // Seeds a catalog with ids 0 through 9 and the `by_id` index.
    let seed = |cat: &mut Catalog| {
        cat.create_table(bigint_pk_users_schema()).unwrap();
        let users = cat.get_mut("users").unwrap();
        for id in 0..10i64 {
            let name = alloc::format!("u-{id}");
            users.insert(make_user_row(id, &name)).unwrap();
        }
        users.add_index("by_id".into(), "id").unwrap();
    };
    let mut a = Catalog::new();
    seed(&mut a);
    let mut b = Catalog::new();
    seed(&mut b);

    // Catalog `a` takes the single-threaded path.
    let single = a.freeze_oldest_to_cold("users", "by_id", 6).unwrap();

    // Catalog `b` takes the prepare/commit path with one slice.
    let slice = b
        .prepare_freeze_slice("users", "by_id", 0..6)
        .expect("prepare");
    let parallel = b
        .commit_freeze_slices("users", "by_id", alloc::vec![slice])
        .expect("commit");

    assert_eq!(single.segment_id, parallel.segment_id);
    assert_eq!(single.frozen_rows, parallel.frozen_rows);
    assert_eq!(single.bytes_freed, parallel.bytes_freed);
    assert_eq!(single.segment_bytes, parallel.segment_bytes);

    // Same post-freeze lookup behaviour on both catalogs.
    for id in 0..10i64 {
        let key = IndexKey::Int(id);
        assert_eq!(
            a.lookup_by_pk("users", "by_id", &key),
            b.lookup_by_pk("users", "by_id", &key),
            "PK {id} differs after single vs slice freeze"
        );
    }
}
11909
/// Two slices covering disjoint halves of the freeze range must produce
/// the same merged segment as one slice covering the full range. The
/// rows are inserted with shuffled PKs so each slice carries keys that
/// interleave with the other's, which exercises the ordering of the
/// merge.
///
/// Catalog `a` commits one slice over positions `0..8`; catalog `b`
/// commits the slices `0..4` and `4..8`. The resulting segment bytes,
/// the frozen row count, and every `lookup_by_pk` answer must agree.
#[test]
fn commit_freeze_slices_two_slices_match_single_slice() {
    let mut a = Catalog::new();
    let mut b = Catalog::new();
    for cat in [&mut a, &mut b] {
        cat.create_table(bigint_pk_users_schema()).unwrap();
        let t = cat.get_mut("users").unwrap();
        // Random-ish PKs so the per-slice sort actually has work to do
        // (and slice halves carry interleaved keys). The literals are
        // typed `i64` and iterated by value, so no `as` cast is needed.
        for id in [3i64, 7, 1, 9, 5, 0, 8, 4, 2, 6] {
            t.insert(make_user_row(id, &alloc::format!("u-{id}")))
                .unwrap();
        }
        t.add_index("by_id".into(), "id").unwrap();
    }

    // One slice over the whole range on catalog `a`.
    let single = a
        .prepare_freeze_slice("users", "by_id", 0..8)
        .expect("prepare");
    let one = a
        .commit_freeze_slices("users", "by_id", alloc::vec![single])
        .expect("commit one");

    // The same range split into two adjacent slices on catalog `b`.
    let s1 = b
        .prepare_freeze_slice("users", "by_id", 0..4)
        .expect("prepare s1");
    let s2 = b
        .prepare_freeze_slice("users", "by_id", 4..8)
        .expect("prepare s2");
    let two = b
        .commit_freeze_slices("users", "by_id", alloc::vec![s1, s2])
        .expect("commit two");

    assert_eq!(one.segment_bytes, two.segment_bytes);
    assert_eq!(one.frozen_rows, two.frozen_rows);

    // Both catalogs must give the same answer for every PK, whether
    // the row ended up cold or stayed hot.
    for id in 0..10i64 {
        assert_eq!(
            a.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
            b.lookup_by_pk("users", "by_id", &IndexKey::Int(id)),
            "PK {id} differs after one-slice vs two-slice freeze"
        );
    }
}
11956
/// Slices that leave a gap between them (`0..2` then `3..5`) must be
/// rejected with `Corrupt`, and the catalog must be left unchanged.
#[test]
fn commit_freeze_slices_rejects_gap() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..6i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // Position 2 is covered by neither slice.
    let s1 = cat.prepare_freeze_slice("users", "by_id", 0..2).unwrap();
    let s2 = cat.prepare_freeze_slice("users", "by_id", 3..5).unwrap();
    let outcome = cat.commit_freeze_slices("users", "by_id", alloc::vec![s1, s2]);
    assert!(matches!(outcome, Err(StorageError::Corrupt(_))));

    // Catalog untouched.
    assert_eq!(cat.cold_segment_count(), 0);
    let hot_rows = cat.get("users").unwrap().row_count();
    assert_eq!(hot_rows, 6);
}
11978
/// Committing an empty slice list must succeed, report zero frozen
/// rows, and leave the catalog unchanged.
#[test]
fn commit_freeze_slices_empty_is_noop() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..3i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    let no_slices = Vec::new();
    let report = cat
        .commit_freeze_slices("users", "by_id", no_slices)
        .unwrap();
    assert_eq!(report.frozen_rows, 0);

    assert_eq!(cat.cold_segment_count(), 0);
    let hot_rows = cat.get("users").unwrap().row_count();
    assert_eq!(hot_rows, 3);
}
11997
11998 // --- v6.7.3 cold-segment compaction ---------------------------
11999
/// Two small cold segments are compacted into one. The report must name
/// both sources and six merged rows; afterwards only the merged segment
/// is active (with three slots in total), and every PK — frozen or
/// still hot — must resolve through `lookup_by_pk`.
#[test]
fn compact_merges_small_segments_storage_unit() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..8i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // Two freezes of 3 rows each → two small cold segments.
    cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
    cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
    assert_eq!(cat.cold_segment_count(), 2);
    assert_eq!(cat.cold_segment_slot_count(), 2);

    // Pick a threshold one byte above the largest segment so both
    // segments qualify.
    let active_ids = cat.cold_segment_ids_global();
    let max_seg_bytes = active_ids
        .iter()
        .map(|seg_id| {
            let segment = cat.cold_segment(*seg_id).unwrap();
            segment.bytes().len() as u64
        })
        .max()
        .unwrap();
    let target = max_seg_bytes + 1;

    let report = cat
        .compact_cold_segments("users", "by_id", target)
        .expect("compact succeeds");
    assert_eq!(report.sources.len(), 2);
    let merged_id = report.merged_segment_id.expect("merge happened");
    assert_eq!(report.merged_rows, 6);
    assert_eq!(report.deleted_rows_pruned, 0);
    assert!(!report.merged_segment_bytes.is_empty());

    // Active count drops back to 1; slot count grew to 3
    // (2 sources tombstoned + 1 merged appended).
    assert_eq!(cat.cold_segment_count(), 1);
    assert_eq!(cat.cold_segment_slot_count(), 3);
    assert_eq!(cat.cold_segment_ids_global(), alloc::vec![merged_id]);

    // Every PK that was frozen still resolves (via the merged segment);
    // the 2 hot rows still resolve too.
    for id in 0..8i64 {
        let expected_row = make_user_row(id, &alloc::format!("u-{id}"));
        let got = cat
            .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
            .unwrap_or_else(|| panic!("PK {id} lost after compaction"));
        assert_eq!(got, expected_row);
    }
}
12054
/// Rows that were shadowed while frozen must be pruned by the merge.
/// The test builds two small segments, shadows one row in each, and
/// compacts: the report must count four merged and two pruned rows, the
/// shadowed PKs must stay invisible, and the other frozen PKs must
/// still resolve.
#[test]
fn compact_drops_shadowed_cold_rows() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..6i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();
    cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
    cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();

    // Shadow PK 1 (in seg 0) + PK 4 (in seg 1).
    for key in [1i64, 4] {
        let retired = cat
            .shadow_cold_row("users", "by_id", &IndexKey::Int(key))
            .unwrap();
        assert_eq!(retired, 1);
    }

    // Threshold one byte above the largest segment, so both qualify.
    let active_ids = cat.cold_segment_ids_global();
    let max_seg_bytes = active_ids
        .iter()
        .map(|seg_id| {
            let segment = cat.cold_segment(*seg_id).unwrap();
            segment.bytes().len() as u64
        })
        .max()
        .unwrap();

    let report = cat
        .compact_cold_segments("users", "by_id", max_seg_bytes + 1)
        .expect("compact succeeds");
    assert_eq!(report.sources.len(), 2);
    assert_eq!(report.merged_rows, 4, "6 frozen − 2 shadowed = 4 live");
    assert_eq!(report.deleted_rows_pruned, 2);

    // PK 1 and 4 stay invisible after compact.
    for shadowed in [1i64, 4i64] {
        let hit = cat.lookup_by_pk("users", "by_id", &IndexKey::Int(shadowed));
        assert!(
            hit.is_none(),
            "shadowed PK {shadowed} must remain invisible after compact"
        );
    }

    // The other 4 frozen rows resolve.
    for live in [0i64, 2, 3, 5] {
        cat.lookup_by_pk("users", "by_id", &IndexKey::Int(live))
            .unwrap_or_else(|| panic!("live PK {live} lost after compact"));
    }
}
12109
/// Compaction must be a no-op when fewer than two segments qualify:
/// with no cold segments, with a single cold segment, and with a
/// threshold too small for that single segment. In each case no merged
/// segment id is reported.
#[test]
fn compact_is_noop_below_two_candidates() {
    let mut cat = Catalog::new();
    cat.create_table(bigint_pk_users_schema()).unwrap();
    let users = cat.get_mut("users").unwrap();
    for id in 0..6i64 {
        let name = alloc::format!("u-{id}");
        users.insert(make_user_row(id, &name)).unwrap();
    }
    users.add_index("by_id".into(), "id").unwrap();

    // 0 cold segments.
    let nothing_frozen = cat
        .compact_cold_segments("users", "by_id", 1 << 30)
        .expect("noop ok");
    assert!(nothing_frozen.merged_segment_id.is_none());
    assert!(nothing_frozen.sources.is_empty());

    // 1 cold segment — still a no-op (need ≥2 to merge).
    cat.freeze_oldest_to_cold("users", "by_id", 4).unwrap();
    let single_segment = cat
        .compact_cold_segments("users", "by_id", 1 << 30)
        .expect("noop ok");
    assert!(single_segment.merged_segment_id.is_none());
    assert_eq!(cat.cold_segment_count(), 1);

    // Threshold too small to cover the single segment → still no-op.
    let tiny_threshold = cat
        .compact_cold_segments("users", "by_id", 1)
        .expect("noop ok");
    assert!(tiny_threshold.merged_segment_id.is_none());
    assert_eq!(cat.cold_segment_count(), 1);
}
12145
12146 /// Manifest-style atomicity: a Catalog snapshot taken AFTER
12147 /// `compact_cold_segments` returns must round-trip with the
12148 /// post-compact BTree state, while the cold-tier registry is
12149 /// re-derived from the source-of-truth manifest (=
12150 /// `load_segment_bytes_at` with the merged id + the still-on-
12151 /// disk merged bytes). This mirrors the boot path: catalog
12152 /// snapshot + cold-segment files = full state.
12153 #[test]
12154 fn compact_swap_survives_catalog_roundtrip_via_load_at() {
12155 let mut cat = Catalog::new();
12156 cat.create_table(bigint_pk_users_schema()).unwrap();
12157 let t = cat.get_mut("users").unwrap();
12158 for id in 0..6i64 {
12159 t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12160 .unwrap();
12161 }
12162 t.add_index("by_id".into(), "id").unwrap();
12163 cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
12164 cat.freeze_oldest_to_cold("users", "by_id", 3).unwrap();
12165 let max_seg_bytes = cat
12166 .cold_segment_ids_global()
12167 .iter()
12168 .map(|id| cat.cold_segment(*id).unwrap().bytes().len() as u64)
12169 .max()
12170 .unwrap();
12171 let report = cat
12172 .compact_cold_segments("users", "by_id", max_seg_bytes + 1)
12173 .expect("compact ok");
12174 let merged_id = report.merged_segment_id.unwrap();
12175
12176 // Serialise the catalog (BTree index points at merged_id
12177 // now) and the merged segment bytes; pretend to crash; on
12178 // restart, re-hydrate the catalog and reload only the
12179 // merged segment at its baked-in id.
12180 let cat_bytes = cat.serialize();
12181 let merged_bytes = report.merged_segment_bytes.clone();
12182
12183 let mut restored = Catalog::deserialize(&cat_bytes).expect("deserialize ok");
12184 restored
12185 .load_segment_bytes_at(merged_id, merged_bytes)
12186 .expect("reload merged ok");
12187
12188 // All 6 PKs still resolve through the restored merged segment.
12189 for id in 0..6i64 {
12190 let got = restored
12191 .lookup_by_pk("users", "by_id", &IndexKey::Int(id))
12192 .unwrap_or_else(|| panic!("PK {id} lost across roundtrip"));
12193 assert_eq!(got, make_user_row(id, &alloc::format!("u-{id}")));
12194 }
12195 // No source slot ever rehydrates — confirmed by
12196 // `cold_segment_count` matching only the merged segment.
12197 assert_eq!(restored.cold_segment_count(), 1);
12198 }
12199
12200 /// `load_segment_bytes_at` refuses to stomp an occupied slot
12201 /// and pads with `None` when the target id is past the end.
12202 #[test]
12203 fn load_segment_bytes_at_pads_and_rejects_collision() {
12204 let mut cat = Catalog::new();
12205 cat.create_table(bigint_pk_users_schema()).unwrap();
12206 let t = cat.get_mut("users").unwrap();
12207 for id in 0..4i64 {
12208 t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12209 .unwrap();
12210 }
12211 t.add_index("by_id".into(), "id").unwrap();
12212 let report = cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
12213 let bytes_seg0 = report.segment_bytes.clone();
12214
12215 // Pad to id=5 (slots 1..5 are None, slot 5 holds the
12216 // segment loaded back). The slot count jumps, the active
12217 // count is now 2 (seg 0 + seg 5).
12218 cat.load_segment_bytes_at(5, bytes_seg0.clone())
12219 .expect("pad + load ok");
12220 assert_eq!(cat.cold_segment_slot_count(), 6);
12221 assert_eq!(cat.cold_segment_count(), 2);
12222
12223 // Re-loading at the same id collides.
12224 assert!(matches!(
12225 cat.load_segment_bytes_at(5, bytes_seg0.clone()),
12226 Err(StorageError::Corrupt(_))
12227 ));
12228 // Re-loading at id 0 (already occupied) also collides.
12229 assert!(matches!(
12230 cat.load_segment_bytes_at(0, bytes_seg0),
12231 Err(StorageError::Corrupt(_))
12232 ));
12233 }
12234
12235 /// Round trip: freeze → promote → re-freeze. The same PK can
12236 /// migrate hot ↔ cold multiple times. After two cycles only the
12237 /// final Hot locator should be live.
12238 #[test]
12239 fn promote_then_refreeze_does_not_leave_orphan_locators() {
12240 let mut cat = Catalog::new();
12241 cat.create_table(bigint_pk_users_schema()).unwrap();
12242 let t = cat.get_mut("users").unwrap();
12243 for id in 0..4i64 {
12244 t.insert(make_user_row(id, &alloc::format!("u-{id}")))
12245 .unwrap();
12246 }
12247 t.add_index("by_id".into(), "id").unwrap();
12248
12249 // Cycle 1: freeze first 2 rows, then promote PK 0.
12250 cat.freeze_oldest_to_cold("users", "by_id", 2).unwrap();
12251 let promoted = cat
12252 .promote_cold_row("users", "by_id", &IndexKey::Int(0))
12253 .unwrap();
12254 assert!(promoted.is_some());
12255 let entries_after_promote = cat
12256 .get("users")
12257 .unwrap()
12258 .index_on(0)
12259 .unwrap()
12260 .lookup_eq(&IndexKey::Int(0))
12261 .to_vec();
12262 assert_eq!(entries_after_promote.len(), 1);
12263 assert!(entries_after_promote[0].is_hot());
12264
12265 // Cycle 2: freeze the front rows again. PK 0 is now at
12266 // position 2 (after the survivors); it could still go cold
12267 // again on a future freeze depending on policy, but the
12268 // current "first N positions" policy leaves it alone here.
12269 // What matters: prior cold locators for PKs 0..1 are gone,
12270 // PKs 2..3 still resolve through their original segments.
12271 for id in [2i64, 3] {
12272 assert_eq!(
12273 cat.lookup_by_pk("users", "by_id", &IndexKey::Int(id))
12274 .unwrap(),
12275 make_user_row(id, &alloc::format!("u-{id}"))
12276 );
12277 }
12278 }
12279}