spg_storage/lib.rs
1//! In-memory storage primitives.
2//!
3//! v0.3 is intentionally simple: a flat catalog of tables, each holding rows
4//! as `Vec<Value>` (positional, matching the table's `TableSchema`). No MVCC,
5//! no on-disk format — those land in later milestones.
6#![no_std]
7// v3.3.2 NEON path for l2_distance_sq (aarch64 only). Scoped allow:
8// `unsafe_code = "deny"` at workspace level stays in force for every
9// other crate.
10#![cfg_attr(target_arch = "aarch64", allow(unsafe_code))]
11
12extern crate alloc;
13
14pub mod bloom;
15mod codec;
16pub mod fts_simple;
17pub mod halfvec;
18mod nsw;
19pub mod persistent;
20pub mod persistent_btree;
21pub mod quantize;
22pub mod row_locator;
23pub mod segment;
24mod table;
25pub mod trgm;
26
27pub use self::bloom::{BloomError, BloomFilter};
28// v7.31 monster tier-3 cut 3 — on-disk codec moved to `codec`; the
29// public dense-row surface keeps its `spg_storage::*` paths, and the
30// low-level write/read primitives stay crate-visible for the
31// `Catalog::serialize`/`deserialize` methods that remain in this file.
32pub(crate) use self::codec::*;
33pub use self::codec::{decode_row_body_dense, encode_row_body_dense, row_body_encoded_len};
34// v7.31 monster tier-3 cut 2 — HNSW algorithms moved to `nsw`; the
35// public vector-search surface keeps its `spg_storage::*` paths via
36// these re-exports, and `nsw_insert_at` stays crate-visible for the
37// `Table` insert paths in the `table` module.
38pub(crate) use self::nsw::nsw_insert_at;
39pub use self::nsw::{NswMetric, cosine_dot_norms_f32, inner_product_f32, nsw_index_on, nsw_query};
40pub use self::row_locator::{RowLocator, RowLocatorError};
41pub use self::segment::{
42 BRIN_SIDECAR_MAGIC, BrinSummary, OwnedSegment, SEGMENT_COMPRESS_ALGO_LZSS,
43 SEGMENT_COMPRESS_ALGO_NONE, SEGMENT_MAGIC, SEGMENT_MAGIC_V2, SEGMENT_PAGE_BYTES, SegmentError,
44 SegmentMeta, SegmentReader, derive_brin_summaries, encode_segment, wrap_v2_envelope,
45 wrap_v2_envelope_with_brin,
46};
47
48use alloc::boxed::Box;
49use alloc::collections::{BTreeMap, BTreeSet};
50use alloc::format;
51use alloc::string::{String, ToString};
52use alloc::sync::Arc;
53use alloc::vec::Vec;
54use core::fmt;
55
56use self::persistent::PersistentVec;
57use self::persistent_btree::PersistentBTreeMap;
58
/// Storage encoding used for the cells of a `DataType::Vector` column.
///
/// This mirrors `spg_sql::ast::VecEncoding`; the copy lives here so the
/// storage crate stays free of a dependency on `spg-sql`. The engine
/// translates between the two definitions when it executes DDL.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum VecEncoding {
    /// Pre-v6 default: each cell holds a raw `Vec<f32>`.
    #[default]
    F32,
    /// v6.0.1: each cell holds `Sq8Vector { min, max, bytes: Vec<u8> }` —
    /// 4× compression vs `F32` with recall@10 ≥ 0.95 on natural
    /// embeddings (Gaussian / unit-sphere corpora).
    Sq8,
    /// v6.0.3 (DDL keyword `HALF`): each element is stored as IEEE-754
    /// binary16 — 2× compression and bit-exact dequantise.
    F16,
}

impl fmt::Display for VecEncoding {
    /// Writes the encoding's keyword: `F32`, `SQ8`, or `HALF`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let keyword = match *self {
            Self::F32 => "F32",
            Self::Sq8 => "SQ8",
            Self::F16 => "HALF",
        };
        f.write_str(keyword)
    }
}
87
/// Runtime type tags. `Vector { dim, encoding }` / `Varchar(max)` /
/// `Char(size)` / `Numeric { precision, scale }` / `Range(kind)` are
/// parameterised; the parameter travels with both the column schema
/// and the on-wire serialised representation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataType {
    /// 16-bit signed. Backed by `Value::SmallInt(i16)`; arithmetic that
    /// would overflow surfaces as a type error at INSERT time.
    SmallInt,
    /// 32-bit signed.
    Int,
    /// 64-bit signed.
    BigInt,
    /// `f64` (PG double precision).
    Float,
    /// Unbounded `TEXT`.
    Text,
    /// `VARCHAR(n)` — same byte representation as `Text`, but INSERT
    /// rejects values longer than `n` Unicode characters.
    Varchar(u32),
    /// `CHAR(n)` — same representation as `Text`, but INSERT right-pads
    /// with U+0020 to exactly `n` Unicode characters (or rejects when
    /// the input is already longer).
    Char(u32),
    /// Boolean.
    Bool,
    /// pgvector-style fixed-dimension vector. `encoding` selects
    /// the in-cell representation (`F32` = pre-v6 raw f32 buffer;
    /// `Sq8` = v6.0.1 8-bit scalar-quantised; `F16` = v6.0.3
    /// IEEE-754 binary16, DDL keyword `HALF`). The DDL grammar
    /// surfaces encoding via the optional `USING <encoding>`
    /// clause: `VECTOR(128) USING SQ8`.
    Vector {
        dim: u32,
        encoding: VecEncoding,
    },
    /// `NUMERIC(precision, scale)` — exact fixed-point decimal stored as
    /// a scaled `i128`. `precision` caps total decimal digits, `scale`
    /// fixes digits after the decimal point. v1.12 supports up to
    /// precision 38 (the i128-safe ceiling). `NUMERIC` and `NUMERIC(p)`
    /// surface as `Numeric { precision: p, scale: 0 }`.
    Numeric {
        precision: u8,
        scale: u8,
    },
    /// `DATE` — calendar date with day precision, stored as `i32` days
    /// since the Unix epoch (1970-01-01).
    Date,
    /// `TIMESTAMP` (a.k.a. `MySQL` `DATETIME`) — instant with microsecond
    /// precision, stored as `i64` microseconds since the Unix epoch.
    Timestamp,
    /// v7.9.2 `TIMESTAMPTZ` — bit-identical to `Timestamp` on disk
    /// (i64 microseconds, UTC by convention). Carried as a distinct
    /// type tag so the PG-wire layer can advertise OID 1184 (PG's
    /// `timestamp with time zone`) and `sqlx`/`pgx`/JDBC clients
    /// decode into their TZ-aware datetime types. The internal
    /// semantics are unchanged: SPG never stored per-row offsets,
    /// and neither did PG — `TIMESTAMPTZ` in PG is also UTC i64.
    Timestamptz,
    /// `INTERVAL` — calendar-aware span (months + microseconds). v2.11
    /// supports INTERVAL only as a runtime intermediate (literals,
    /// arithmetic results); on-disk encoding is rejected so this variant
    /// can't appear in a `ColumnSchema`.
    Interval,
    /// v4.9: `JSON` — text-backed JSON document. We don't parse
    /// the content (no path operators or jsonb functions yet) —
    /// the column accepts any TEXT-compatible value and round-trips
    /// it verbatim. PG OID 114 on the wire.
    Json,
    /// v7.9.0: `JSONB` — semantically identical to `Json` on
    /// the storage side (same `Value::Json` cells, same
    /// row codec), but advertised as PG OID 3802 on the wire
    /// so `sqlx`-style clients that bind `jsonb` columns
    /// decode correctly. mailrs migration blocker #3.
    Jsonb,
    /// v7.10.4: `BYTES` / `BYTEA` — variable-length raw binary.
    /// Backed by `Value::Bytes(Vec<u8>)`. PG wire OID 17. Literal
    /// forms accepted by parser/engine: PG hex form `'\xDEADBEEF'`
    /// (case-insensitive hex pairs) and escape form
    /// `'foo\\000bar'` (the latter decoded at coercion time when
    /// the target column is BYTEA — TEXT columns leave the
    /// backslash sequence verbatim).
    Bytes,
    /// v7.10.9: `TEXT[]` — single-dimension TEXT array. Elements
    /// may be NULL (PG semantics). PG wire OID 1009. Literal
    /// forms: `ARRAY['a', 'b', NULL]` and the PG external form
    /// `'{a,b,NULL}'::TEXT[]`. Engine implements `= ANY(arr)`,
    /// `<> ALL(arr)`, and 1-based indexing `arr[i]`. Catalog
    /// FILE_VERSION 18+; older snapshots reject this DataType
    /// (forward-only by design — `TEXT[]` columns aren't readable
    /// on a pre-v7.10 binary).
    TextArray,
    /// v7.11.12: `INT[]` — single-dimension i32 array. PG wire
    /// OID 1007 (_int4). Same `ARRAY[...]` / `'{1,2,3}'::INT[]`
    /// literal surface as `TEXT[]`. Catalog FILE_VERSION 19+.
    IntArray,
    /// v7.11.12: `BIGINT[]` — single-dimension i64 array. PG
    /// wire OID 1016 (_int8). Catalog FILE_VERSION 19+.
    BigIntArray,
    /// v7.12.0: PG `tsvector` — ordered, deduplicated set of
    /// `(lexeme, positions, weight)` tuples. PG wire OID 3614.
    /// Catalog FILE_VERSION 20+. Storage shape is row-codec
    /// tag 22; the schema-agnostic `write_value` path emits tag
    /// 18. Literal: `'foo:1 bar:2,3'::tsvector` (PG external
    /// form). G-CRIT-3 entry — v7.12.0 only ships the type +
    /// codec; matching `@@` lands in v7.12.2.
    TsVector,
    /// v7.12.0: PG `tsquery` — parse tree of lexemes joined by
    /// `&` `|` `!` and phrase operators. PG wire OID 3615.
    /// Catalog FILE_VERSION 20+.
    TsQuery,
    /// v7.17.0: PG `uuid` — 128-bit identifier stored as
    /// `Value::Uuid([u8; 16])`. PG wire OID 2950. Canonical
    /// text form is lowercase 8-4-4-4-12 hyphenated; input
    /// also accepts uppercase, unhyphenated, and brace-wrapped
    /// forms (`{xxxx…}`). Catalog FILE_VERSION 36+; tag 24 on
    /// the dense type-tag side, tag 20 on the schema-agnostic
    /// value side. The drop-in PG/MySQL surface for the Django /
    /// Rails / Hibernate `id UUID PRIMARY KEY DEFAULT
    /// gen_random_uuid()` default-PK pattern.
    Uuid,
    /// v7.17.0 Phase 3.P0-32: PG `time` (without time zone) — i64
    /// microseconds since 00:00:00. PG wire OID 1083. Display:
    /// canonical zero-padded `HH:MM:SS` when fractional is zero,
    /// `HH:MM:SS.ffffff` otherwise. Catalog FILE_VERSION 37+;
    /// tag 25 on the dense type-tag side, tag 21 on the schema-
    /// agnostic value side. The wall-clock-of-day half of PG's
    /// date/time triplet (date / time / timestamp).
    Time,
    /// v7.17.0 Phase 3.P0-33: MySQL `YEAR` — u16 in range
    /// 1901..=2155 plus the special zero-year sentinel 0. No
    /// dedicated PG OID (advertised as INT4 / OID 23 on the wire
    /// — psql renders integers, MySQL CLI renders 4-digit
    /// zero-padded text). Display always 4 digits: `0000` for the
    /// zero-year, `1985` / `2007` / etc otherwise. Catalog
    /// FILE_VERSION 38+; tag 26 on the dense type-tag side, tag
    /// 22 on the schema-agnostic value side.
    Year,
    /// v7.17.0 Phase 3.P0-34: PG `time with time zone` (TIMETZ) —
    /// i64 microseconds since 00:00:00 in the local wall clock
    /// PLUS i32 offset-from-UTC in seconds. PG wire OID 1266.
    /// Display: `HH:MM:SS[.ffffff]±HH[:MM]` (PG `timetz_out`).
    /// Range: offset in ±50400 seconds (±14 hours). Catalog
    /// FILE_VERSION 39+; tag 27 on the dense type-tag side, tag
    /// 23 on the schema-agnostic value side.
    TimeTz,
    /// v7.17.0 Phase 3.P0-35: PG `money` — i64 cents (locale-
    /// independent storage). PG wire OID 790. Display: en_US
    /// locale (`$N,NNN.CC`, negative → `-$1.23`). Input accepts
    /// `$N.NN`, `$N,NNN.NN`, bare integer (treated as major
    /// units), optional leading `-`. Range: full i64. Catalog
    /// FILE_VERSION 40+; tag 28 on the dense type-tag side, tag
    /// 24 on the schema-agnostic value side.
    Money,
    /// v7.17.0 Phase 3.P0-38: PG range type. The same DataType
    /// variant covers all six builtin ranges (int4range,
    /// int8range, numrange, tsrange, tstzrange, daterange) —
    /// `RangeKind` pins the element type so encode / decode /
    /// display can route off one switch. Catalog FILE_VERSION
    /// 43+; tag 29 + a 1-byte RangeKind on the dense type-tag
    /// side, tag 25 on the schema-agnostic value side.
    Range(RangeKind),
    /// v7.17.0 Phase 3.P0-39: PG `hstore` extension type — flat
    /// `text => text` map with NULL value support. Catalog
    /// FILE_VERSION 44+; tag 30 on the dense type-tag side, tag
    /// 26 on the schema-agnostic value side. The contrib OID is
    /// installation-dependent in real PG; SPG advertises it via
    /// dynamic lookup, falling back to TEXT (OID 25) on the wire
    /// when the installed `hstore` extension hasn't claimed an
    /// OID yet.
    Hstore,
    /// v7.17.0 Phase 3.P0-40: PG `int[][]` — 2-dimensional INT
    /// matrix. Storage: row-major `Vec<Vec<Option<i32>>>`. All
    /// rows must share the same column count. Wire OID 1007
    /// (same as `INT[]`; the dimension count travels in the data
    /// header, not the OID). Catalog FILE_VERSION 45+; tag 31
    /// on the dense type-tag side, tag 27 on the schema-agnostic
    /// value side.
    IntArray2D,
    /// v7.17.0 Phase 3.P0-40: PG `bigint[][]` — 2-dimensional
    /// BIGINT matrix. Storage / OID / tags mirror `IntArray2D`.
    /// Tag 32 dense, tag 28 schema-agnostic.
    BigIntArray2D,
    /// v7.17.0 Phase 3.P0-40: PG `text[][]` — 2-dimensional TEXT
    /// matrix. Storage: row-major `Vec<Vec<Option<String>>>`.
    /// Tag 33 dense, tag 29 schema-agnostic.
    TextArray2D,
}
269
/// v7.17.0 Phase 3.P0-38 — element type of a range value or range
/// column.
///
/// Wire OIDs: Int4=3904, Int8=3926, Num=3906, Ts=3908, TsTz=3910,
/// Date=3912.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum RangeKind {
    /// `INT4RANGE`.
    Int4,
    /// `INT8RANGE`.
    Int8,
    /// `NUMRANGE`.
    Num,
    /// `TSRANGE`.
    Ts,
    /// `TSTZRANGE`.
    TsTz,
    /// `DATERANGE`.
    Date,
}

impl RangeKind {
    /// One-byte numeric tag for this kind; `from_tag` is the inverse.
    pub const fn tag(self) -> u8 {
        let code: u8 = match self {
            Self::Int4 => 0,
            Self::Int8 => 1,
            Self::Num => 2,
            Self::Ts => 3,
            Self::TsTz => 4,
            Self::Date => 5,
        };
        code
    }

    /// Maps a tag produced by `tag` back to its kind. Any value outside
    /// `0..=5` yields `None`.
    pub const fn from_tag(t: u8) -> Option<Self> {
        match t {
            0 => Some(Self::Int4),
            1 => Some(Self::Int8),
            2 => Some(Self::Num),
            3 => Some(Self::Ts),
            4 => Some(Self::TsTz),
            5 => Some(Self::Date),
            _ => None,
        }
    }

    /// Upper-case SQL keyword naming this range type.
    pub const fn keyword(self) -> &'static str {
        match self {
            Self::Date => "DATERANGE",
            Self::TsTz => "TSTZRANGE",
            Self::Ts => "TSRANGE",
            Self::Num => "NUMRANGE",
            Self::Int8 => "INT8RANGE",
            Self::Int4 => "INT4RANGE",
        }
    }
}
316
317impl fmt::Display for DataType {
318 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
319 match self {
320 Self::SmallInt => f.write_str("SMALLINT"),
321 Self::Int => f.write_str("INT"),
322 Self::BigInt => f.write_str("BIGINT"),
323 Self::Float => f.write_str("FLOAT"),
324 Self::Text => f.write_str("TEXT"),
325 Self::Varchar(n) => write!(f, "VARCHAR({n})"),
326 Self::Char(n) => write!(f, "CHAR({n})"),
327 Self::Bool => f.write_str("BOOL"),
328 Self::Vector { dim, encoding } => match encoding {
329 VecEncoding::F32 => write!(f, "VECTOR({dim})"),
330 VecEncoding::Sq8 => write!(f, "VECTOR({dim}) USING SQ8"),
331 VecEncoding::F16 => write!(f, "VECTOR({dim}) USING HALF"),
332 },
333 Self::Numeric { precision, scale } => {
334 if *scale == 0 {
335 write!(f, "NUMERIC({precision})")
336 } else {
337 write!(f, "NUMERIC({precision}, {scale})")
338 }
339 }
340 Self::Date => f.write_str("DATE"),
341 Self::Timestamp => f.write_str("TIMESTAMP"),
342 Self::Timestamptz => f.write_str("TIMESTAMPTZ"),
343 Self::Interval => f.write_str("INTERVAL"),
344 Self::Json => f.write_str("JSON"),
345 Self::Jsonb => f.write_str("JSONB"),
346 Self::Bytes => f.write_str("BYTEA"),
347 Self::TextArray => f.write_str("TEXT[]"),
348 Self::IntArray => f.write_str("INT[]"),
349 Self::BigIntArray => f.write_str("BIGINT[]"),
350 Self::TsVector => f.write_str("TSVECTOR"),
351 Self::TsQuery => f.write_str("TSQUERY"),
352 Self::Uuid => f.write_str("UUID"),
353 Self::Time => f.write_str("TIME"),
354 Self::Year => f.write_str("YEAR"),
355 Self::TimeTz => f.write_str("TIMETZ"),
356 Self::Money => f.write_str("MONEY"),
357 Self::Range(k) => f.write_str(k.keyword()),
358 Self::Hstore => f.write_str("HSTORE"),
359 Self::IntArray2D => f.write_str("INT[][]"),
360 Self::BigIntArray2D => f.write_str("BIGINT[][]"),
361 Self::TextArray2D => f.write_str("TEXT[][]"),
362 }
363 }
364}
365
/// v7.12.0 — one entry in a `Value::TsVector`: a lexeme together with
/// the positions it occupies and its ranking weight.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TsLexeme {
    /// The lexeme itself — the (already-tokenised + stemmed in
    /// v7.12.1+) word.
    pub word: String,
    /// Strictly-ascending list of 1-based positions.
    pub positions: Vec<u16>,
    /// PG weight letter encoded as a number (A=3, B=2, C=1, D=0).
    /// v7.12.0 defaults every lexeme to D; the v7.12.2 ranking path
    /// consumes the weight.
    pub weight: u8,
}
377
/// v7.12.0 — parse tree for a PG `tsquery`. v7.12.0 ships the
/// type + codec only; the `to_tsquery` / `plainto_tsquery` lexer
/// lands in v7.12.1 and the `@@` evaluator in v7.12.2.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TsQueryAst {
    /// Single lexeme term. The `weight_mask` is the PG-style
    /// bitmask of accepted weights (`A=1<<3`, `B=1<<2`, `C=1<<1`,
    /// `D=1<<0`); `0` = any weight. v7.12.0 always sets it to 0.
    Term {
        word: String,
        weight_mask: u8,
    },
    /// Conjunction of two sub-queries (the `&` operator).
    And(Box<TsQueryAst>, Box<TsQueryAst>),
    /// Disjunction of two sub-queries (the `|` operator).
    Or(Box<TsQueryAst>, Box<TsQueryAst>),
    /// Negation of a sub-query (the `!` operator).
    Not(Box<TsQueryAst>),
    /// `phrase <distance> phrase`. v7.12.0 only persists this; the
    /// match semantics arrive in v7.12.2 alongside `@@`.
    Phrase {
        left: Box<TsQueryAst>,
        right: Box<TsQueryAst>,
        distance: u16,
    },
}
401
/// A row-cell value, including SQL `NULL`.
///
/// `Float` uses `f64` and `PartialEq` is derived, so a NaN cell compares
/// non-equal to itself (IEEE-754 semantics). PostgreSQL, by contrast,
/// treats NaN as equal to NaN when comparing and sorting floats, so
/// callers that need PG-compatible or otherwise NaN-aware comparison
/// must opt into it on top of this type.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum Value {
    /// 16-bit signed integer cell.
    SmallInt(i16),
    /// 32-bit signed integer cell.
    Int(i32),
    /// 64-bit signed integer cell.
    BigInt(i64),
    /// `f64` cell; see the NaN note on the enum.
    Float(f64),
    /// Text cell. Also carries `VARCHAR` / `CHAR` values — the length
    /// constraint lives on the column schema, not the value.
    Text(String),
    /// Boolean cell.
    Bool(bool),
    /// Raw `f32` vector cell (the `F32` encoding).
    Vector(Vec<f32>),
    /// v6.0.1: 8-bit scalar-quantised vector cell. Lives in
    /// columns declared `VECTOR(N) USING SQ8`. Layout per cell:
    /// `Sq8Vector { min: f32, max: f32, bytes: Vec<u8> }` —
    /// 4× compression vs `Vector(Vec<f32>)`. The wire layer
    /// dequantises to `f32` on SELECT; INSERT path quantises
    /// incoming `Vector(Vec<f32>)` cells into this variant.
    Sq8Vector(crate::quantize::Sq8Vector),
    /// v6.0.3: IEEE-754 binary16 vector cell. Lives in columns
    /// declared `VECTOR(N) USING HALF`. Stores raw u16 LE bits
    /// (2× compression vs `Vector(Vec<f32>)`). Wire / display
    /// paths dequantise to f32 bit-exactly; INSERT path converts
    /// incoming f32 vectors at the engine boundary.
    HalfVector(crate::halfvec::HalfVector),
    /// Exact fixed-point decimal. `scaled` holds the value as
    /// `actual * 10^scale` so the storage type is always integral —
    /// arithmetic never falls back to floating-point.
    Numeric {
        scaled: i128,
        scale: u8,
    },
    /// Days since the Unix epoch (1970-01-01). Negative for earlier dates.
    Date(i32),
    /// Microseconds since the Unix epoch (1970-01-01T00:00:00Z).
    Timestamp(i64),
    /// Calendar span: `months` (variable-length) + `micros` (fixed-length).
    /// Runtime-only — cannot appear in a stored row in v2.11.
    Interval {
        months: i32,
        micros: i64,
    },
    /// v4.9 `JSON` — raw JSON text. No structural validation
    /// happens at the storage layer; whatever the parser hands us
    /// round-trips verbatim. Equality is byte-wise.
    Json(String),
    /// v7.10.4 `BYTEA` — raw binary blob. Equality is byte-wise.
    /// Layout matches `Text`'s length-prefixed shape (`[u32 LE
    /// len][bytes]`) under tag 18; the engine accepts PG hex
    /// literals (`'\xDEADBEEF'`) and escape literals at the
    /// coercion boundary.
    Bytes(Vec<u8>),
    /// v7.10.9 `TEXT[]` — single-dimension TEXT array with
    /// optional NULL elements. Equality is element-wise. PG's
    /// NULL-element comparison semantics: NULL ≠ NULL inside
    /// arrays under `=`, so `[NULL] != [NULL]` (the engine
    /// honours this).
    ///
    /// NOTE(review): the `PartialEq` derived on this enum compares
    /// two `None` elements as equal, so the NULL ≠ NULL rule above
    /// is not enforced by this type — presumably it lives in the
    /// engine's comparison path. Confirm, and double-check the rule
    /// against PostgreSQL's documented array-comparison behaviour.
    TextArray(Vec<Option<String>>),
    /// v7.11.12 `INT[]` — single-dimension i32 array with optional
    /// NULL elements. Codec mirrors TextArray with i32 LE per
    /// element instead of length-prefixed UTF-8.
    IntArray(Vec<Option<i32>>),
    /// v7.11.12 `BIGINT[]` — single-dimension i64 array with optional
    /// NULL elements.
    BigIntArray(Vec<Option<i64>>),
    /// v7.12.0 `tsvector` — sorted-by-word, deduped lexeme set with
    /// positions + weights. The engine enforces sort/dedup on
    /// construction; consumers can rely on `lexemes.windows(2)`
    /// being strictly ascending by `word`.
    TsVector(Vec<TsLexeme>),
    /// v7.12.0 `tsquery` — boolean / phrase parse tree over
    /// lexemes. Engine builds via `to_tsquery` family.
    TsQuery(TsQueryAst),
    /// v7.17.0 `uuid` — 128-bit identifier. Stored as 16 bytes
    /// (big-endian / network-byte order, same as RFC 4122).
    /// Display normalises to canonical lowercase 8-4-4-4-12
    /// hyphenated form. Equality is byte-wise.
    Uuid([u8; 16]),
    /// v7.17.0 Phase 3.P0-32 — PG `time` (without time zone) —
    /// i64 microseconds since 00:00:00. Range 0..86_400_000_000.
    /// Display: `HH:MM:SS` zero-padded, with optional `.ffffff`
    /// suffix when fractional is non-zero.
    Time(i64),
    /// v7.17.0 Phase 3.P0-33 — MySQL `YEAR` — u16 in range
    /// 1901..=2155 plus the special zero-year sentinel 0.
    /// Display always 4 digits zero-padded (`0000` for the
    /// sentinel; `1985`/`2007` otherwise).
    Year(u16),
    /// v7.17.0 Phase 3.P0-34 — PG `time with time zone` — i64
    /// microseconds since 00:00:00 in the LOCAL wall clock PLUS
    /// an i32 offset-from-UTC in seconds. PG preserves the
    /// offset on output, so the wall-clock value is NOT shifted
    /// to UTC at storage time. Offset range: ±50400 seconds
    /// (±14 hours).
    TimeTz {
        us: i64,
        offset_secs: i32,
    },
    /// v7.17.0 Phase 3.P0-35 — PG `money` — i64 cents
    /// (locale-independent storage; the en_US locale renders on
    /// display via `$N,NNN.CC`).
    Money(i64),
    /// v7.17.0 Phase 3.P0-39 — PG `hstore` value: flat
    /// `text => text` map with NULL value support. Insertion
    /// order preserved on input; duplicate keys take last-write-
    /// wins at parse time.
    Hstore(Vec<(String, Option<String>)>),
    /// v7.17.0 Phase 3.P0-40 — 2D INT matrix (row-major).
    IntArray2D(Vec<Vec<Option<i32>>>),
    /// v7.17.0 Phase 3.P0-40 — 2D BIGINT matrix (row-major).
    BigIntArray2D(Vec<Vec<Option<i64>>>),
    /// v7.17.0 Phase 3.P0-40 — 2D TEXT matrix (row-major).
    TextArray2D(Vec<Vec<Option<String>>>),
    /// v7.17.0 Phase 3.P0-38 — PG range value. One shape covers
    /// all six builtin range types; `kind` pins the element type
    /// (must match the column's `DataType::Range(kind)`).
    /// `lower` / `upper` are `None` for the unbounded sides;
    /// `lower_inc` / `upper_inc` mirror the canonical PG
    /// `[` / `(` / `]` / `)` bracket inclusivity. `empty=true`
    /// supersedes all other fields (the empty range has no
    /// bounds).
    Range {
        kind: RangeKind,
        lower: Option<alloc::boxed::Box<Value>>,
        upper: Option<alloc::boxed::Box<Value>>,
        lower_inc: bool,
        upper_inc: bool,
        empty: bool,
    },
    /// SQL `NULL`; carries no type information at the value level.
    Null,
}
534
535impl Value {
536 /// Type tag, or `None` for `NULL` (unknown at value level).
537 pub fn data_type(&self) -> Option<DataType> {
538 match self {
539 Self::SmallInt(_) => Some(DataType::SmallInt),
540 Self::Int(_) => Some(DataType::Int),
541 Self::BigInt(_) => Some(DataType::BigInt),
542 Self::Float(_) => Some(DataType::Float),
543 // `Text` covers both unbounded TEXT and bounded VARCHAR/CHAR
544 // — the constraint lives on the column schema, not the value.
545 Self::Text(_) => Some(DataType::Text),
546 Self::Bool(_) => Some(DataType::Bool),
547 Self::Vector(v) => Some(DataType::Vector {
548 dim: u32::try_from(v.len()).expect("vector dim ≤ u32"),
549 encoding: VecEncoding::F32,
550 }),
551 Self::Sq8Vector(q) => Some(DataType::Vector {
552 dim: u32::try_from(q.bytes.len()).expect("vector dim ≤ u32"),
553 encoding: VecEncoding::Sq8,
554 }),
555 Self::HalfVector(h) => Some(DataType::Vector {
556 dim: u32::try_from(h.dim()).expect("vector dim ≤ u32"),
557 encoding: VecEncoding::F16,
558 }),
559 // `Value::Numeric` doesn't carry its precision (the column
560 // schema does); we surface precision=0 as "unknown" and let
561 // the engine reconcile against the column type at coercion
562 // time.
563 Self::Numeric { scale, .. } => Some(DataType::Numeric {
564 precision: 0,
565 scale: *scale,
566 }),
567 Self::Date(_) => Some(DataType::Date),
568 Self::Timestamp(_) => Some(DataType::Timestamp),
569 Self::Interval { .. } => Some(DataType::Interval),
570 Self::Json(_) => Some(DataType::Json),
571 Self::Bytes(_) => Some(DataType::Bytes),
572 Self::TextArray(_) => Some(DataType::TextArray),
573 Self::IntArray(_) => Some(DataType::IntArray),
574 Self::BigIntArray(_) => Some(DataType::BigIntArray),
575 Self::TsVector(_) => Some(DataType::TsVector),
576 Self::TsQuery(_) => Some(DataType::TsQuery),
577 Self::Uuid(_) => Some(DataType::Uuid),
578 Self::Time(_) => Some(DataType::Time),
579 Self::Year(_) => Some(DataType::Year),
580 Self::TimeTz { .. } => Some(DataType::TimeTz),
581 Self::Money(_) => Some(DataType::Money),
582 Self::Range { kind, .. } => Some(DataType::Range(*kind)),
583 Self::Hstore(_) => Some(DataType::Hstore),
584 Self::IntArray2D(_) => Some(DataType::IntArray2D),
585 Self::BigIntArray2D(_) => Some(DataType::BigIntArray2D),
586 Self::TextArray2D(_) => Some(DataType::TextArray2D),
587 Self::Null => None,
588 }
589 }
590
591 pub const fn is_null(&self) -> bool {
592 matches!(self, Self::Null)
593 }
594}
595
596/// One table row — values are positional and must match
597/// `TableSchema.columns` in length and (modulo NULL) in `DataType`.
598#[derive(Debug, Clone, PartialEq)]
599pub struct Row {
600 pub values: Vec<Value>,
601}
602
603impl Row {
604 pub const fn new(values: Vec<Value>) -> Self {
605 Self { values }
606 }
607
608 pub fn len(&self) -> usize {
609 self.values.len()
610 }
611
612 pub fn is_empty(&self) -> bool {
613 self.values.is_empty()
614 }
615}
616
/// Schema of one table column: its name, type and nullability, plus
/// the optional attributes (defaults, auto-increment, user-defined
/// type bindings, collation, MySQL modifiers) documented per field.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnSchema {
    /// Column name.
    pub name: String,
    /// Declared type of the column's cells.
    pub ty: DataType,
    /// Whether the column accepts NULL.
    pub nullable: bool,
    /// Optional `DEFAULT` value, frozen at CREATE TABLE time. `None`
    /// means "no default" (so omitted columns become NULL, or error
    /// out when the column is NOT NULL). Literal defaults take this
    /// path.
    pub default: Option<Value>,
    /// v7.9.21 — for DEFAULT expressions that need INSERT-time
    /// evaluation (e.g. `DEFAULT now()`, `DEFAULT CURRENT_TIMESTAMP`),
    /// the Display form of the expression. The engine re-parses
    /// it on each INSERT default-fill, evaluates against an empty
    /// row context, and coerces to the column type. mailrs G4.
    /// Persisted in catalog FILE_VERSION 15+; older catalogs
    /// deserialise with None.
    pub runtime_default: Option<String>,
    /// MySQL-style `AUTO_INCREMENT`. When set, an INSERT that leaves
    /// this column unbound (or sets it to NULL) gets the next integer
    /// computed from the column's current max + 1.
    pub auto_increment: bool,
    /// v7.17.0 Phase 1.4 — when the column is bound to a user-
    /// defined ENUM type (the parser saw an unknown type ident
    /// and the engine resolved it against `catalog.enum_types`),
    /// this carries the enum name so INSERT/UPDATE can validate
    /// the cell value against the enum's labels. `ty` is
    /// `DataType::Text` in that case. Persisted in catalog
    /// FILE_VERSION 29+; older catalogs deserialise with None.
    pub user_enum_type: Option<String>,
    /// v7.17.0 Phase 1.5 — when the column is bound to a user-
    /// defined DOMAIN (the parser saw an unknown type ident and
    /// the engine resolved it against `catalog.domain_types`),
    /// this carries the domain name. `ty` is the domain's base
    /// type; INSERT/UPDATE re-evaluates the domain's CHECK list
    /// and NOT NULL against the cell value. Persisted in catalog
    /// FILE_VERSION 30+; older catalogs deserialise with None.
    pub user_domain_type: Option<String>,
    /// v7.17.0 Phase 2.1 — MySQL `ON UPDATE CURRENT_TIMESTAMP`
    /// column attribute. When `Some(expr_src)`, an UPDATE that
    /// does NOT bind this column overrides the new value with
    /// the engine-evaluated expression (always `now()` in
    /// v7.17.0). Stored as Display-form source so storage
    /// stays free of spg-sql; the engine re-parses at UPDATE
    /// time. Persisted in catalog FILE_VERSION 32+; older
    /// catalogs deserialise with None — preserves the existing
    /// "silent ignore" behaviour for snapshots written before
    /// the upgrade.
    pub on_update_runtime: Option<String>,
    /// v7.17.0 Phase 2.5 — text collation. Pre-2.5 SPG accepted
    /// `COLLATE <name>` clauses but discarded the name, so a
    /// column declared `COLLATE "case_insensitive"` (or any
    /// MySQL `_ci` collation) still compared byte-wise — a
    /// Tier-S silent failure where `WHERE name = 'foo'` never
    /// matched stored `'Foo'`. This carries the parser-derived
    /// classification so the engine's WHERE evaluator can route
    /// text equality through a case-aware compare. `Binary` (the
    /// default) preserves the prior byte-wise behaviour. Only
    /// CaseInsensitive lands in the catalog appendix — Binary
    /// columns stay implicit, keeping snapshots compact.
    /// Persisted in catalog FILE_VERSION 34+; older catalogs
    /// deserialise every column as `Binary`.
    pub collation: Collation,
    /// v7.17.0 Phase 4.4 — MySQL `UNSIGNED` modifier flag. Drives
    /// engine-side INSERT / UPDATE range enforcement (rejects
    /// negative values on UNSIGNED int columns). Pre-4.4 the
    /// parser consumed and discarded the keyword silently, so
    /// every UNSIGNED column quietly accepted negatives — a
    /// Tier-A correctness drift. Sparse: only UNSIGNED columns
    /// land in the catalog appendix; the default `false` keeps
    /// snapshots compact for the common signed-int path.
    /// Persisted in catalog FILE_VERSION 35+; older catalogs
    /// deserialise every column as `is_unsigned = false`.
    pub is_unsigned: bool,
    /// v7.17.0 Phase 3.P0-36 — MySQL inline `ENUM('a','b','c')`
    /// value list. Distinct from `user_enum_type` (which points
    /// to a separately CREATE TYPE'd PG enum); this carries the
    /// column-local list MySQL DDL declares inline. When `Some`,
    /// `ty` is `DataType::Text` and INSERT/UPDATE validates the
    /// cell value against this list. Variant ORDER is preserved
    /// (MySQL uses it for `ORDER BY col`). Sparse: only ENUM
    /// columns land in the catalog appendix.
    /// Persisted in catalog FILE_VERSION 41+; older catalogs
    /// deserialise with None — preserves silent-drop behaviour
    /// for snapshots written before P0-36.
    pub inline_enum_variants: Option<Vec<String>>,
    /// v7.17.0 Phase 3.P0-37 — MySQL inline `SET('a','b','c')`
    /// variant list. Storage is TEXT (canonical comma-joined in
    /// definition order, de-duplicated). INSERT/UPDATE validates
    /// every comma-separated token against this list. Sparse:
    /// only SET columns land in the catalog appendix.
    /// Persisted in catalog FILE_VERSION 42+; older catalogs
    /// deserialise with None.
    pub inline_set_variants: Option<Vec<String>>,
}
712
/// v7.17.0 Phase 2.5 — column-level text collation. Drives the
/// engine's WHERE / GROUP BY equality routing for `Value::Text`.
/// Only two variants are modelled in v7.17.
///
/// New variants append at the end — older catalogs read missing
/// columns as `Binary`, which is also the `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Collation {
    /// Byte-wise comparison (the SPG default; matches PG
    /// `COLLATE "C"` / `pg_catalog.default` and MySQL `*_bin`).
    #[default]
    Binary,
    /// ASCII case-folded comparison (matches PG
    /// `COLLATE "case_insensitive"` and MySQL `*_ci` collations).
    /// Non-ASCII bytes still compare byte-wise; full ICU folding is
    /// out of v7.17 scope.
    CaseInsensitive,
}

/// Wire tags persisted in the FILE_VERSION 34+ catalog appendix.
/// Stable: future variants append above the recognised range
/// and unknown tags read back as `Binary` for forward-compat
/// on rollback.
impl Collation {
    /// Wire tag for `Collation::Binary`.
    pub const TAG_BINARY: u8 = 0;
    /// Wire tag for `Collation::CaseInsensitive`.
    pub const TAG_CASE_INSENSITIVE: u8 = 1;
}
747
748#[derive(Debug, Clone, PartialEq)]
749pub struct TableSchema {
750 pub name: String,
751 pub columns: Vec<ColumnSchema>,
752 /// v6.7.2 — per-table hot-tier byte budget override. `None`
753 /// falls through to the global `SPG_HOT_TIER_BYTES` setting;
754 /// `Some(n)` overrides it for this specific table. Set via
755 /// `ALTER TABLE t SET hot_tier_bytes = X`. Persisted in
756 /// catalog FILE_VERSION 11+.
757 pub hot_tier_bytes: Option<u64>,
758 /// v7.6.1 — FOREIGN KEY constraints declared on this table.
759 /// Engine maintains this in lock-step with `spg-sql`'s parser
760 /// AST; the storage layer carries the on-disk shape so a
761 /// catalog snapshot round-trips without external mapping.
762 /// Persisted in catalog FILE_VERSION 13+. Older catalogs
763 /// deserialise with an empty vec.
764 pub foreign_keys: Vec<ForeignKeyConstraint>,
765 /// v7.9.19 — composite UNIQUE / PRIMARY KEY constraints
766 /// declared at the table level. Each entry's leading column
767 /// has a BTree index (created via the constraint), and INSERT
768 /// path enforces the full-tuple uniqueness via a scan keyed
769 /// by the leading column. Persisted in catalog FILE_VERSION
770 /// 15+. Older catalogs (≤ 14) deserialise with an empty vec.
771 pub uniqueness_constraints: Vec<UniquenessConstraint>,
772 /// v7.13.0 — `CHECK (<expr>)` predicates declared on this
773 /// table. Both column-level inline `CHECK (…)` and
774 /// table-level `CHECK (…)` fold into this list. Each entry
775 /// is the AST Expr's `Display` form, re-parsed on every
776 /// INSERT/UPDATE and evaluated against the candidate row.
777 /// A false / NULL result rejects the mutation (PG semantics).
778 /// Persisted in catalog FILE_VERSION 23+. Older catalogs
779 /// deserialise with an empty vec.
780 pub checks: Vec<String>,
781}
782
/// Composite UNIQUE / PRIMARY KEY constraint as persisted on the
/// table schema (v7.9.19).
///
/// The leading column always carries a BTree index, created at
/// CREATE TABLE time. INSERT enforcement scans that index looking
/// for collisions on the full column tuple.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UniquenessConstraint {
    /// `true` for a `PRIMARY KEY` declaration, `false` for `UNIQUE`.
    /// A primary key semantically implies NOT NULL on every
    /// referenced column; the engine enforces that at CREATE TABLE
    /// time.
    pub is_primary_key: bool,
    /// Column positions on the parent table (at least one element).
    /// A single-column UNIQUE has exactly one position and is
    /// enforced by the BTree index alone.
    pub columns: Vec<usize>,
    /// `UNIQUE NULLS NOT DISTINCT` modifier (v7.13.0; mailrs round-5
    /// G10; PG 15+ surface).
    ///
    /// When `true`, two rows whose constrained columns are all NULL
    /// collide on the constraint. The default (`false`) is the
    /// SQL-standard `NULLS DISTINCT` behaviour where any NULL
    /// passes. Persisted in catalog FILE_VERSION 23+.
    pub nulls_not_distinct: bool,
}
806
807/// v7.6.1 — Storage-layer mirror of `spg_sql::ast::ForeignKeyConstraint`.
808/// The engine's CREATE TABLE path translates between the two; keeping
809/// them separate preserves the no-deps boundary between
810/// `spg-storage` and `spg-sql`.
811#[derive(Debug, Clone, PartialEq, Eq)]
812pub struct ForeignKeyConstraint {
813 /// Optional user-supplied constraint name (`CONSTRAINT <name>`
814 /// prefix). Used by `ALTER TABLE DROP CONSTRAINT <name>` in
815 /// v7.6.8; ignored by enforcement.
816 pub name: Option<String>,
817 /// Positions of local columns in this table's column list.
818 /// Same arity as `parent_columns`.
819 pub local_columns: Vec<usize>,
820 /// Referenced parent table name.
821 pub parent_table: String,
822 /// Positions of parent columns in the parent's column list.
823 /// Engine resolves these at CREATE TABLE time (after the parent
824 /// schema is known) so enforcement paths can skip the name
825 /// lookup on every row.
826 pub parent_columns: Vec<usize>,
827 /// Referential action when a parent row is deleted.
828 pub on_delete: FkAction,
829 /// Referential action when a parent row's referenced columns
830 /// are updated.
831 pub on_update: FkAction,
832}
833
/// Referential action tag (v7.6.1). Mirrors `spg_sql::ast::FkAction`.
///
/// Each variant has a fixed on-disk tag byte; see [`FkAction::tag`]
/// and [`FkAction::from_tag`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FkAction {
    /// On-disk tag `0`.
    Restrict,
    /// On-disk tag `1`.
    Cascade,
    /// On-disk tag `2`.
    SetNull,
    /// On-disk tag `3`.
    SetDefault,
    /// On-disk tag `4`.
    NoAction,
}

impl FkAction {
    /// On-disk tag byte for this action (v13 catalog appendix).
    pub const fn tag(self) -> u8 {
        match self {
            FkAction::Restrict => 0,
            FkAction::Cascade => 1,
            FkAction::SetNull => 2,
            FkAction::SetDefault => 3,
            FkAction::NoAction => 4,
        }
    }

    /// Inverse of [`FkAction::tag`]: decode an on-disk tag byte.
    ///
    /// Returns `None` for any byte outside the recognised `0..=4`
    /// range.
    pub const fn from_tag(b: u8) -> Option<Self> {
        match b {
            0 => Some(FkAction::Restrict),
            1 => Some(FkAction::Cascade),
            2 => Some(FkAction::SetNull),
            3 => Some(FkAction::SetDefault),
            4 => Some(FkAction::NoAction),
            _ => None,
        }
    }
}
866
867impl TableSchema {
868 pub fn column_position(&self, name: &str) -> Option<usize> {
869 self.columns.iter().position(|c| c.name == name)
870 }
871}
872
/// Key type accepted by secondary B-tree indices.
///
/// Float, NULL and Vector values cannot act as keys: `f64` is only
/// `PartialOrd`, NULL has SQL three-valued semantics, and vectors
/// are served by the vector-search index path instead. Lookups on
/// such columns fall back to a full scan.
///
/// The derived ordering compares variants in declaration order
/// first, then the payload.
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub enum IndexKey {
    Int(i64),
    Text(String),
    Bool(bool),
    /// `Value::Uuid` index key (v7.17.0). Comparison is byte-wise
    /// (RFC 4122 byte order), which puts PRIMARY KEY UUID lookups on
    /// the same fast path as Int / Text.
    Uuid([u8; 16]),
}
887
888impl IndexKey {
889 pub fn from_value(v: &Value) -> Option<Self> {
890 match v {
891 Value::SmallInt(n) => Some(Self::Int(i64::from(*n))),
892 Value::Int(n) => Some(Self::Int(i64::from(*n))),
893 Value::BigInt(n) => Some(Self::Int(*n)),
894 Value::Text(s) => Some(Self::Text(s.clone())),
895 Value::Bool(b) => Some(Self::Bool(*b)),
896 // Date/Timestamp use their integer storage repr as the
897 // index key — same order semantics, same comparison.
898 Value::Date(d) => Some(Self::Int(i64::from(*d))),
899 Value::Timestamp(t) => Some(Self::Int(*t)),
900 // v7.17.0: UUID indexable via byte-wise ordering. Lookup
901 // on `id = '...'::uuid` resolves through the secondary
902 // index rather than full-scan.
903 Value::Uuid(b) => Some(Self::Uuid(*b)),
904 // v7.17.0 Phase 3.P0-32: TIME indexable via i64 — same
905 // order semantics as Date/Timestamp.
906 Value::Time(us) => Some(Self::Int(*us)),
907 // v7.17.0 Phase 3.P0-33: YEAR indexable as i64 — u16
908 // widens losslessly and gives the natural calendar
909 // ordering.
910 Value::Year(y) => Some(Self::Int(i64::from(*y))),
911 // v7.17.0 Phase 3.P0-34: TIMETZ indexable by its
912 // UTC-equivalent microseconds (local wall - offset).
913 // Without normalising, two values for the same
914 // physical instant in different zones would sort
915 // wrong. Matches PG's TIMETZ index behaviour.
916 Value::TimeTz { us, offset_secs } => {
917 Some(Self::Int(us - i64::from(*offset_secs) * 1_000_000))
918 }
919 // v7.17.0 Phase 3.P0-35: MONEY indexable as i64 cents
920 // (no scaling needed — natural numeric ordering).
921 Value::Money(c) => Some(Self::Int(*c)),
922 // v7.17.0 Phase 3.P0-38: ranges are NOT indexable in
923 // v7.17.0 — they'd need a custom comparator (PG uses
924 // SP-GiST for this). Skip.
925 Value::Range { .. } => None,
926 // v7.17.0 Phase 3.P0-39: hstore is NOT indexable in
927 // v7.17.0 — map columns need GIN with bespoke ops.
928 Value::Hstore(_) => None,
929 // v7.17.0 Phase 3.P0-40: 2D arrays aren't indexable.
930 Value::IntArray2D(_) | Value::BigIntArray2D(_) | Value::TextArray2D(_) => None,
931 // Numeric isn't (yet) indexable — exact-decimal index keys
932 // would need a stable scale-normalised representation.
933 // Interval isn't index-eligible either (and can't reach this
934 // path through column storage anyway).
935 Value::Null
936 | Value::Float(_)
937 | Value::Vector(_)
938 | Value::Sq8Vector(_)
939 | Value::HalfVector(_)
940 | Value::Numeric { .. }
941 | Value::Interval { .. }
942 | Value::Json(_)
943 | Value::Bytes(_)
944 | Value::TextArray(_)
945 | Value::IntArray(_)
946 | Value::BigIntArray(_)
947 | Value::TsVector(_)
948 | Value::TsQuery(_) => None,
949 }
950 }
951}
952
953/// A single-column secondary index. v2.0 carries either a B-tree map
954/// (the default — used for equality / range lookups on scalar columns)
955/// or a navigable-small-world graph (used for kNN over vector
956/// columns).
957#[derive(Debug, Clone)]
958pub struct Index {
959 pub name: String,
960 pub column_position: usize,
961 pub kind: IndexKind,
962 /// v6.8.0 — column positions of `INCLUDE (col1, col2, …)`
963 /// non-key columns. Carries the planner's "this query is
964 /// covered by the index" signal; lookup paths still resolve
965 /// via the `RowLocator` to fetch the row body, but EXPLAIN
966 /// surfaces the covered-scan annotation so operators can
967 /// confirm the planner sees the coverage.
968 ///
969 /// Empty `Vec` = no `INCLUDE` clause (the legacy shape). v12
970 /// catalog snapshots deserialise with an empty vec.
971 pub included_columns: Vec<usize>,
972 /// v6.8.1 — partial-index predicate stored as its canonical
973 /// Display form (the engine re-parses it on the maintenance
974 /// path). `None` = unconditional index (the legacy shape).
975 /// Persisted as `[u8 has_pred][u16 LE len][bytes]` on the
976 /// catalog snapshot (FILE_VERSION 12, appended after
977 /// `included_columns`).
978 pub partial_predicate: Option<String>,
979 /// v6.8.2 — expression-index key, stored as the expression's
980 /// canonical Display form. `None` = bare column-reference
981 /// index (the legacy shape). Persisted alongside
982 /// `partial_predicate` on the v12 catalog snapshot.
983 pub expression: Option<String>,
984 /// v7.9.29 — `CREATE UNIQUE INDEX …`. When true the engine
985 /// rejects INSERTs whose key already appears in this index
986 /// (combined with `partial_predicate` when present — only
987 /// rows matching the predicate enter the uniqueness check).
988 /// Catalog FILE_VERSION 16+; older snapshots deserialise
989 /// with `false`. mailrs K1.
990 pub is_unique: bool,
991 /// v7.9.29 — extra (non-leading) column positions for
992 /// multi-column indexes (`CREATE INDEX … (a, b, c)`). The
993 /// planner today still only uses the leading
994 /// `column_position` for index seeks, but UNIQUE INDEX
995 /// enforcement walks the full tuple so partial-unique
996 /// invariants like CalDAV `(calendar_id, uid,
997 /// recurrence_id)` are enforced correctly. Catalog
998 /// FILE_VERSION 16+; older snapshots deserialise empty.
999 pub extra_column_positions: Vec<usize>,
1000}
1001
/// Default neighbour degree (`M`) of the NSW graph. The value is
/// chosen when the index is constructed and is persisted with it.
pub const NSW_DEFAULT_M: usize = 16;
1005
/// Outcome of a successful [`Catalog::freeze_oldest_to_cold`] call
/// (v5.2.2).
///
/// By the time this value is returned the catalog state has already
/// been mutated: hot rows dropped, the segment registered, and Cold
/// locators flipped. The only thing left for the caller is
/// `segment_bytes` — persist them to disk under
/// `<db>.spg/segments/seg_<id>.spg` so a future restart can reload
/// them through the v5.1 `SPG_PRELOAD_COLD_SEGMENT` path. (v5.3's
/// manifest will subsume this manual step.)
#[derive(Clone, Debug)]
pub struct FreezeReport {
    /// Id that [`Catalog::load_segment_bytes`] allocated for the new
    /// cold-tier segment. Stable across the call's success path.
    pub segment_id: u32,
    /// How many rows moved hot → cold. Equals the `max_rows` the
    /// caller asked for (the API is strict on the count).
    pub frozen_rows: usize,
    /// Hot-tier bytes reclaimed by the freeze, i.e. the
    /// [`Table::hot_bytes`] delta before vs after. Useful to feed
    /// back into the freezer's budget check on the next tick.
    pub bytes_freed: u64,
    /// The encoded segment, byte-identical to what
    /// [`encode_segment`] produced. The catalog already owns a copy
    /// inside `cold_segments`; this hand-off lets the caller persist
    /// the bytes without re-encoding.
    pub segment_bytes: Vec<u8>,
}
1031
1032/// v6.7.4 — read-only output of [`Catalog::prepare_freeze_slice`].
1033/// Carries every row body + key in a contiguous hot-row range,
1034/// already encoded and sorted by PK so the coordinator's merge
1035/// step is a k-way merge over already-sorted streams.
1036///
1037/// `Vec<FreezeSlice>` from N independent workers feeds
1038/// [`Catalog::commit_freeze_slices`], which concats + encodes the
1039/// merged segment + atomically swaps the catalog state.
1040#[derive(Debug, Clone)]
1041pub struct FreezeSlice {
1042 /// Hot-row index range this slice covered (half-open, in the
1043 /// table's `rows: PersistentVec` ordering at call time). The
1044 /// commit step uses this to compute the union range that
1045 /// gets passed to [`Table::delete_rows`].
1046 pub row_range: core::ops::Range<usize>,
1047 /// `(pk_u64, encoded_row_body, IndexKey)` triples, sorted
1048 /// ascending by `pk_u64`. Per-slice sort happens inside
1049 /// `prepare_freeze_slice`; the coordinator does only a
1050 /// k-way merge to reach the global PK ordering
1051 /// [`encode_segment`] requires.
1052 pub rows: Vec<(u64, Vec<u8>, IndexKey)>,
1053}
1054
/// Outcome of a [`Catalog::compact_cold_segments`] call (v6.7.3).
///
/// The catalog has already been mutated when this is returned: the
/// merged segment is loaded into `cold_segments`, the source segment
/// slots are tombstoned (`None`), and every BTree-index
/// `RowLocator::Cold` that used to point at a source now points at
/// the merged segment. What remains for the caller is to persist
/// `merged_segment_bytes` under
/// `<db>.spg/segments/seg_<merged_segment_id>.spg` and to update the
/// in-memory `segment_id → path` map (remove the source ids, add the
/// merged id), so that the next CHECKPOINT writes a manifest that no
/// longer lists the retired sources.
///
/// On a no-op (fewer than 2 candidate segments under the threshold)
/// `merged_segment_id` is `None`, `sources` is empty, and the
/// catalog was not mutated.
#[derive(Clone, Debug)]
pub struct CompactReport {
    /// Ids of the source segments that were merged and tombstoned.
    pub sources: Vec<u32>,
    /// Id allocated for the merged segment; `None` on a no-op.
    pub merged_segment_id: Option<u32>,
    /// Encoded bytes of the merged segment (empty on a no-op).
    pub merged_segment_bytes: Vec<u8>,
    /// Number of rows that landed in the merged segment.
    pub merged_rows: usize,
    /// `Σ source.num_rows − merged_rows`: rows present in the source
    /// segment payloads but unreferenced by any live BTree `Cold`
    /// locator — DELETE'd-but-still-frozen rows that compaction
    /// GC'd during the merge.
    pub deleted_rows_pruned: usize,
    /// `Σ source.bytes() − merged.bytes()`: estimate of the on-disk
    /// space the merge will reclaim once the source segment files
    /// are GC'd. Saturating subtract — never negative.
    pub bytes_reclaimed_estimate: u64,
}
1090
1091#[derive(Debug, Clone)]
1092pub enum IndexKind {
1093 /// v4.40: structural-sharing B-tree over `IndexKey`. Replaces the v0.8
1094 /// `BTreeMap<IndexKey, Vec<usize>>` — `Index::clone` is now an `Arc`
1095 /// bump regardless of index size, so `Catalog::clone` inside the
1096 /// v4.34 auto-commit wrap stays O(1) even for tables with secondary
1097 /// indices (the case that bottlenecked v4.39 at 1M rows in the
1098 /// sweep).
1099 ///
1100 /// v5.1: value type widened from `Vec<usize>` to `Vec<RowLocator>` so
1101 /// a single key can point to a mix of hot-tier rows (`RowLocator::Hot`,
1102 /// equivalent to the pre-v5 `usize` row index) and cold-tier rows
1103 /// (`RowLocator::Cold { segment_id, page_offset }`) once the v5.2
1104 /// freezer starts producing them. Pre-v5.2 only `Hot` entries appear
1105 /// — the on-disk encoding stays at `FILE_VERSION` 8 (raw u64 row index)
1106 /// because every locator round-trips through `RowLocator::from_legacy_v8_u64`
1107 /// without information loss. `FILE_VERSION` 9 with tagged encoding lands
1108 /// alongside the first freezer commit (v5.1 step 2b / v5.2).
1109 BTree(PersistentBTreeMap<IndexKey, Vec<RowLocator>>),
1110 /// Navigable-small-world graph for vector kNN search.
1111 Nsw(NswGraph),
1112 /// v6.7.1 — BRIN (Block Range INdex). Pure metadata: BRIN
1113 /// indexes carry NO in-memory key→locator map. The (min,
1114 /// max) summaries live in each cold-tier segment's v2
1115 /// envelope sidecar; the BRIN entry in `Table.indices` only
1116 /// records THAT a BRIN index exists on this column so the
1117 /// segment encoder + planner can opt into the summary path.
1118 Brin {
1119 /// The cell type at `column_position` at CREATE INDEX time.
1120 /// Used by the planner to type-check WHERE-clause range
1121 /// predicates against the BRIN-indexed column.
1122 column_type: DataType,
1123 },
1124 /// v7.12.3 — GIN inverted index over a `tsvector` column.
1125 ///
1126 /// Storage shape: `lexeme word → Vec<RowLocator>`. The posting
1127 /// list per word is appended in row-order, so range scans are
1128 /// O(matching rows) once the per-word lookup is done. Multi-
1129 /// term queries intersect / union posting lists.
1130 ///
1131 /// `IndexKey::from_value(TsVector)` returns `None` — GIN doesn't
1132 /// participate in `try_index_seek` (which is BTree-equality-keyed).
1133 /// The engine consults this index through `try_gin_lookup` on
1134 /// `WHERE col @@ tsquery` predicates instead.
1135 ///
1136 /// Backed by a `PersistentBTreeMap` so `Catalog::clone` (the
1137 /// per-write snapshot) stays O(1) — same structural-sharing
1138 /// invariant as BTree.
1139 Gin(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1140 /// v7.15.0 — `USING gin (col gin_trgm_ops)` over a `TEXT`
1141 /// column. Posting lists map `trigram` (PG-compatible 3-byte
1142 /// shingle on the lower-cased + space-padded input) to row
1143 /// locators. The planner uses this index to accelerate
1144 /// `WHERE col LIKE '…'` / `ILIKE '…'` / `similarity(col, q) >
1145 /// t` — every literal run of length ≥ 1 in the pattern
1146 /// produces a trigram set, the engine intersects the posting
1147 /// lists, and the LIKE / similarity predicate is re-evaluated
1148 /// per candidate row to filter the over-approximation.
1149 /// Persisted via tag-4 index payload in `FILE_VERSION` 24+.
1150 GinTrgm(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1151 /// v7.17.0 Phase 2.2 — MySQL `FULLTEXT KEY (col)` over a
1152 /// `TEXT` / `VARCHAR` column. Posting lists map
1153 /// `tsvector('simple') lexeme` to row locators. At insert /
1154 /// build time the engine derives the lexemes from the cell
1155 /// via the same lower-case tokenisation rule as
1156 /// `to_tsvector('simple', ...)` — the column itself stays a
1157 /// plain text type on disk (mysqldump round-trips would be
1158 /// broken otherwise). The planner uses this index to
1159 /// accelerate MySQL-shape `MATCH(col) AGAINST('term')`
1160 /// queries by mapping them onto the existing tsquery `@@`
1161 /// walker. Persisted via tag-5 index payload in
1162 /// `FILE_VERSION` 33+.
1163 GinFulltext(PersistentBTreeMap<alloc::string::String, Vec<RowLocator>>),
1164}
1165
1166impl IndexKind {
1167 /// v7.31 (memory campaign, C2) — bytes this index variant holds
1168 /// resident in RAM, computed by walking its OWN structure rather
1169 /// than a parametric guess made by the engine. Replaces the old
1170 /// `spg_admin::memory_stats` inline match, which charged NSW with
1171 /// a stale `m_max_0 * 8` per node (neighbour slots are `u32` = 4 B
1172 /// since v6.1.x, and most nodes never fill `m_max_0`) and lumped
1173 /// every GIN family index into a flat 1 KiB token — a gross
1174 /// undercount for the text-heavy posting lists that dominate
1175 /// mailrs' footprint. Per-entry container overhead uses the
1176 /// 3-word (24 B on 64-bit) `Vec`/`String` header as the charge.
1177 ///
1178 /// O(index entries): operator/monitoring surface (`memory_stats` /
1179 /// `spg_memory_stats`), not a query path.
1180 #[must_use]
1181 pub fn approx_resident_bytes(&self) -> u64 {
1182 const HEADER: usize = 24; // Vec/String 3-word header on 64-bit.
1183 let loc = core::mem::size_of::<RowLocator>();
1184 match self {
1185 IndexKind::BTree(map) => {
1186 let key = core::mem::size_of::<IndexKey>();
1187 map.iter()
1188 .map(|(_, locs)| (key + HEADER + locs.len() * loc) as u64)
1189 .sum()
1190 }
1191 IndexKind::Nsw(g) => {
1192 // `levels` is one byte per node; each layer's adjacency
1193 // is a `Vec<u32>` per node whose actual length we walk
1194 // (the dense layer-0 list dominates, but upper layers
1195 // are sparse — the old estimate ignored that).
1196 let mut b = g.levels.len() as u64;
1197 for layer in &g.layers {
1198 for nbrs in layer.iter() {
1199 b += (HEADER + nbrs.len() * core::mem::size_of::<u32>()) as u64;
1200 }
1201 }
1202 b
1203 }
1204 // BRIN carries NO in-memory key→locator map (the (min,max)
1205 // summaries live in cold-segment sidecars on disk); the
1206 // resident footprint is just the column-type token.
1207 IndexKind::Brin { .. } => core::mem::size_of::<DataType>() as u64,
1208 IndexKind::Gin(map) | IndexKind::GinTrgm(map) | IndexKind::GinFulltext(map) => map
1209 .iter()
1210 .map(|(word, postings)| {
1211 (word.len() + HEADER + HEADER + postings.len() * loc) as u64
1212 })
1213 .sum(),
1214 }
1215 }
1216}
1217
1218/// Multi-layer HNSW graph (v2.13). Each node is assigned a `top_level`;
1219/// it appears in layers `0..=top_level`. Higher layers are sparser, so
1220/// search starts from the entry at the top layer, greedy-descends to
1221/// layer 0, and beam-searches there. Layer 0 keeps a larger neighbour
1222/// budget (`m_max_0 = 2 * m` per the HNSW paper); upper layers cap at
1223/// `m`. The struct name stays `NswGraph` so external users / on-disk
1224/// callers don't have to track a rename — the algorithm changed, the
1225/// data slot didn't.
1226#[derive(Debug, Clone)]
1227pub struct NswGraph {
1228 /// Max neighbours per node on layers ≥ 1.
1229 pub m: usize,
1230 /// Max neighbours on layer 0 (the dense bottom layer). HNSW
1231 /// convention: `m_max_0 = 2 * m`.
1232 pub m_max_0: usize,
1233 /// Entry point — the node that sits on the topmost layer. Search
1234 /// always starts here.
1235 pub entry: Option<usize>,
1236 /// Top layer of the entry node (== `layers.len() - 1` when populated).
1237 pub entry_level: u8,
1238 /// `levels[i]` = top layer of node `i`. Nodes whose vector cell is
1239 /// NULL / non-Vector have `levels[i] = 0` and no neighbour entries.
1240 ///
1241 /// v5.5.0: backed by `PersistentVec` so `NswGraph::clone` (and the
1242 /// `Catalog::clone` on every group-commit write that contains it) is O(1)
1243 /// structural-sharing instead of an O(N) element copy.
1244 pub levels: PersistentVec<u8>,
1245 /// `layers[l][i]` = neighbours of node `i` at layer `l`. Inner vec
1246 /// is empty when node `i` doesn't reach layer `l`.
1247 ///
1248 /// v5.5.0: the per-node middle dimension (the O(N) one) is a
1249 /// `PersistentVec`; the outer layer dimension stays a plain `Vec`
1250 /// (layer count ≤ 8, so its clone is O(1) in practice) and the inner
1251 /// neighbour list stays a `Vec` (bounded by `m_max_0`).
1252 ///
1253 /// v6.1.x: neighbour slot widened from `usize` (8 B on 64-bit) to
1254 /// `u32` (4 B). Row indices are catalog-bounded by `u32::MAX` (4G
1255 /// rows per table); the cast at the NSW boundary asserts this. At
1256 /// 1M dim-128 SQ8, layer 0 adjacency alone shrinks by ~128 MiB
1257 /// — the largest single contribution to the v6.0.5-measured
1258 /// 624 MiB ambition gap. On-disk format already used u32 LE, so
1259 /// this is a pure in-memory layout change; no `FILE_VERSION` bump.
1260 pub layers: Vec<PersistentVec<Vec<u32>>>,
1261}
1262
1263impl NswGraph {
1264 fn new(m: usize) -> Self {
1265 Self {
1266 m,
1267 m_max_0: m.saturating_mul(2),
1268 entry: None,
1269 entry_level: 0,
1270 levels: PersistentVec::new(),
1271 layers: alloc::vec![PersistentVec::new()],
1272 }
1273 }
1274
1275 /// Max-neighbour budget for layer `l`.
1276 pub const fn cap_for_layer(&self, layer: u8) -> usize {
1277 if layer == 0 { self.m_max_0 } else { self.m }
1278 }
1279}
1280
/// Deterministic HNSW level for the node at `row_idx`.
///
/// The level is a pure function of the row index, so replaying the
/// same insert order reproduces the same topology. The distribution
/// is roughly HNSW-flavoured with `mL ≈ 1/ln(M) ≈ 0.36` for M=16:
/// every low-end 4-bit chunk of the mixed hash that comes up zero
/// promotes the node one layer, so P(level ≥ L) ≈ (1/16)^L. The
/// result is capped at 7.
#[allow(clippy::verbose_bit_mask)] // clippy suggests trailing_zeros(); we need an explicit MAX cap and a stable distribution shape.
pub fn nsw_assign_level(row_idx: usize) -> u8 {
    // 7 ⇒ ~16^7 ≈ 2.7e8 expected nodes between promotions; ample.
    const MAX_LEVEL: u8 = 7;
    const NIBBLE_BITS: u32 = 4;

    // SplitMix-style mixer — cheap and seedable.
    let mut mixed = (row_idx as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15);
    mixed ^= mixed >> 30;
    mixed = mixed.wrapping_mul(0xBF58_476D_1CE4_E5B9);
    mixed ^= mixed >> 27;
    mixed = mixed.wrapping_mul(0x94D0_49BB_1331_11EB);
    mixed ^= mixed >> 31;
    let hash = mixed;

    // Count the contiguous zero nibbles at the low end, looking at
    // no more than MAX_LEVEL of them. Each zero nibble has
    // probability 1/16; bounding the walk keeps the explicit cap
    // (an all-zero hash yields MAX_LEVEL).
    (0..MAX_LEVEL)
        .take_while(|&nibble| (hash >> (u32::from(nibble) * NIBBLE_BITS)) & 0xF == 0)
        .count() as u8
}
1307
1308impl Index {
1309 fn new_btree(name: String, column_position: usize) -> Self {
1310 Self {
1311 name,
1312 column_position,
1313 kind: IndexKind::BTree(PersistentBTreeMap::new()),
1314 included_columns: Vec::new(),
1315 partial_predicate: None,
1316 expression: None,
1317 is_unique: false,
1318 extra_column_positions: Vec::new(),
1319 }
1320 }
1321
1322 fn new_nsw(name: String, column_position: usize, m: usize) -> Self {
1323 Self {
1324 name,
1325 column_position,
1326 kind: IndexKind::Nsw(NswGraph::new(m)),
1327 included_columns: Vec::new(),
1328 partial_predicate: None,
1329 expression: None,
1330 is_unique: false,
1331 extra_column_positions: Vec::new(),
1332 }
1333 }
1334
1335 /// v6.7.1 — BRIN index constructor. BRIN carries no in-memory
1336 /// data; the `column_type` snapshot is used by the segment
1337 /// encoder + planner for type-checking range predicates.
1338 fn new_brin(name: String, column_position: usize, column_type: DataType) -> Self {
1339 Self {
1340 name,
1341 column_position,
1342 kind: IndexKind::Brin { column_type },
1343 included_columns: Vec::new(),
1344 partial_predicate: None,
1345 expression: None,
1346 is_unique: false,
1347 extra_column_positions: Vec::new(),
1348 }
1349 }
1350
1351 /// v7.12.3 — GIN inverted-index constructor. Empty posting-list
1352 /// map; caller (typically [`Table::add_gin_index`] or
1353 /// [`Table::restore_gin_index`]) populates it from existing rows
1354 /// or from a deserialised snapshot.
1355 fn new_gin(name: String, column_position: usize) -> Self {
1356 Self {
1357 name,
1358 column_position,
1359 kind: IndexKind::Gin(PersistentBTreeMap::new()),
1360 included_columns: Vec::new(),
1361 partial_predicate: None,
1362 expression: None,
1363 is_unique: false,
1364 extra_column_positions: Vec::new(),
1365 }
1366 }
1367
1368 /// v7.15.0 — `gin_trgm_ops`-flavoured GIN constructor. Same
1369 /// shape as `new_gin` but the posting-list keys are 3-byte
1370 /// trigram shingles (`pg_trgm`-compatible) and the column
1371 /// type is `TEXT` / `VARCHAR` (not `TSVECTOR`).
1372 fn new_gin_trgm(name: String, column_position: usize) -> Self {
1373 Self {
1374 name,
1375 column_position,
1376 kind: IndexKind::GinTrgm(PersistentBTreeMap::new()),
1377 included_columns: Vec::new(),
1378 partial_predicate: None,
1379 expression: None,
1380 is_unique: false,
1381 extra_column_positions: Vec::new(),
1382 }
1383 }
1384
1385 /// v7.17.0 Phase 2.2 — MySQL `FULLTEXT KEY` GIN constructor.
1386 /// Same shape as `new_gin_trgm` but the posting-list keys
1387 /// are lower-cased word lexemes (`to_tsvector('simple', col)`
1388 /// equivalent) instead of trigrams, and the column type is
1389 /// `TEXT` / `VARCHAR` (not `TSVECTOR`).
1390 fn new_gin_fulltext(name: String, column_position: usize) -> Self {
1391 Self {
1392 name,
1393 column_position,
1394 kind: IndexKind::GinFulltext(PersistentBTreeMap::new()),
1395 included_columns: Vec::new(),
1396 partial_predicate: None,
1397 expression: None,
1398 is_unique: false,
1399 extra_column_positions: Vec::new(),
1400 }
1401 }
1402
1403 /// Look up the locators stored under `key` (B-tree only). Returns
1404 /// an empty slice when the key is absent or the index isn't a
1405 /// BTree — callers can treat both cases uniformly.
1406 ///
1407 /// v5.1: return type widened from `&[usize]` to `&[RowLocator]`.
1408 /// Pre-v5.2 callers can read the slice and `.as_hot().unwrap()`
1409 /// each entry (no `Cold` variants exist until the freezer lands);
1410 /// post-v5.2 callers dispatch hot vs. cold per locator.
1411 pub fn lookup_eq(&self, key: &IndexKey) -> &[RowLocator] {
1412 match &self.kind {
1413 IndexKind::BTree(m) => m.get(key).map_or(&[][..], Vec::as_slice),
1414 // BRIN / NSW / GIN / trigram-GIN / fulltext-GIN have
1415 // no IndexKey-keyed map; lookup is a no-op. GIN uses
1416 // [`Index::gin_lookup_word`] instead.
1417 IndexKind::Nsw(_)
1418 | IndexKind::Brin { .. }
1419 | IndexKind::Gin(_)
1420 | IndexKind::GinTrgm(_)
1421 | IndexKind::GinFulltext(_) => &[][..],
1422 }
1423 }
1424
1425 /// v7.12.3 — GIN posting-list lookup. Returns the row locators
1426 /// whose `tsvector` cell contains `word`. Empty when the word is
1427 /// absent from the index or this isn't a GIN index.
1428 pub fn gin_lookup_word(&self, word: &str) -> &[RowLocator] {
1429 match &self.kind {
1430 // v7.17.0 Phase 2.2 — fulltext-GIN shares the same
1431 // lexeme-keyed posting list shape as the
1432 // tsvector-typed GIN, so the same lookup applies.
1433 IndexKind::Gin(m) | IndexKind::GinFulltext(m) => {
1434 m.get(&String::from(word)).map_or(&[][..], Vec::as_slice)
1435 }
1436 IndexKind::BTree(_)
1437 | IndexKind::Nsw(_)
1438 | IndexKind::Brin { .. }
1439 | IndexKind::GinTrgm(_) => &[][..],
1440 }
1441 }
1442
1443 /// v7.15.0 — trigram-GIN posting-list lookup. Returns the row
1444 /// locators whose indexed `TEXT` cell contains the trigram
1445 /// `tri`. Empty when the trigram is absent or this isn't a
1446 /// trigram-GIN index.
1447 pub fn gin_trgm_lookup(&self, tri: &str) -> &[RowLocator] {
1448 match &self.kind {
1449 IndexKind::GinTrgm(m) => m.get(&String::from(tri)).map_or(&[][..], Vec::as_slice),
1450 IndexKind::BTree(_)
1451 | IndexKind::Nsw(_)
1452 | IndexKind::Brin { .. }
1453 | IndexKind::Gin(_)
1454 | IndexKind::GinFulltext(_) => &[][..],
1455 }
1456 }
1457
1458 /// Borrow the NSW graph (if this is an NSW index). Callers that need
1459 /// the graph for a kNN search go through here.
1460 pub const fn nsw(&self) -> Option<&NswGraph> {
1461 match &self.kind {
1462 IndexKind::Nsw(g) => Some(g),
1463 IndexKind::BTree(_)
1464 | IndexKind::Brin { .. }
1465 | IndexKind::Gin(_)
1466 | IndexKind::GinTrgm(_)
1467 | IndexKind::GinFulltext(_) => None,
1468 }
1469 }
1470
1471 /// v6.7.1 — true when this index is a BRIN (block range) index.
1472 /// Used by the segment encoder to opt into BRIN sidecar emission
1473 /// at freeze time, and by the planner to opt into page-skipping
1474 /// on range predicates.
1475 pub const fn is_brin(&self) -> bool {
1476 matches!(self.kind, IndexKind::Brin { .. })
1477 }
1478
1479 /// v7.15.0 — true when this index is a trigram GIN
1480 /// (`gin_trgm_ops`-flavoured). Used by the LIKE planner to
1481 /// opt into trigram acceleration.
1482 pub const fn is_gin_trgm(&self) -> bool {
1483 matches!(self.kind, IndexKind::GinTrgm(_))
1484 }
1485
1486 /// v7.12.3 — true when this index is a GIN inverted index.
1487 /// Used by the planner to opt into posting-list acceleration on
1488 /// `WHERE col @@ tsquery` predicates.
1489 pub const fn is_gin(&self) -> bool {
1490 matches!(self.kind, IndexKind::Gin(_))
1491 }
1492
1493 /// v7.17.0 Phase 2.2 — true when this index is a fulltext
1494 /// GIN over a TEXT / VARCHAR column (MySQL `FULLTEXT KEY`
1495 /// surface). Used by the planner to opt the FULLTEXT-indexed
1496 /// column into MATCH AGAINST acceleration.
1497 pub const fn is_gin_fulltext(&self) -> bool {
1498 matches!(self.kind, IndexKind::GinFulltext(_))
1499 }
1500}
1501
// NOTE: this paragraph documents `Table` (declared further down, after the
// redo-log codec). It is kept as a plain `//` comment so that rustdoc does
// not attach it to `RowChange`, the item that immediately follows.
//
// In-memory table: schema + a persistent row vector + secondary indices.
//
// v4.39: `rows` is a `PersistentVec` (Bitmapped Vector Trie, 32-way) so
// `Table::clone()` is `O(1)` — the whole reason for v4.39's existence is
// to make `Catalog::clone()` cheap inside the v4.34 auto-commit wrap.
//
// v5.2.1: `hot_bytes` tracks the encoded byte size of every row currently
// in `rows`, summed over rows. It is updated incrementally by `insert`
// (+= encoded row size), `delete_rows` (-= removed rows' encoded sizes),
// and `update_row` (-= old size, += new size). The value is what the
// v5.2 freezer reads to decide when to demote cold rows — when the
// catalog-wide sum crosses `SPG_HOT_TIER_BYTES` (default 4 GiB) the
// freezer thread wakes. v5.2.1 ships measurement only; the freezer
// itself lands in v5.2.2. Stored as `u64` so a single field clone in
// `Catalog::clone` stays at the O(1) invariant v4.39 built.
1517/// v7.34 (crash-recovery P0 #2) — one row-level physical redo record.
1518/// Row-level redo replaces statement-based WAL replay (which re-executes
1519/// each SQL through the full engine — O(records × catalog_rows), the
1520/// superlinear recovery hang root-caused on the mailrs crash-recovery
1521/// P0). A `RowChange` is the exact storage mutation the engine applied
1522/// (`Table::insert` / `update_row` / `delete_rows`); replaying it on a
1523/// catalog restored from the matching checkpoint reproduces the state
1524/// WITHOUT re-validating uniqueness/FK/parse/plan — O(changed rows).
1525///
1526/// Positions are physical, not key-based: `serialize`/`deserialize`
1527/// preserve row order exactly (rows written + read back in `self.rows`
1528/// order) and the mutation ops are deterministic, so the same op sequence
1529/// replayed from the same checkpoint reproduces the same positions. This
1530/// matches PostgreSQL's physical redo and supports tables with no primary
1531/// key. (Caveat handled at replay integration: a post-checkpoint cold-tier
1532/// freeze shifts hot positions and must itself be logged or fenced by a
1533/// checkpoint — see `row-level-redo-design`.)
1534#[derive(Debug, Clone, PartialEq)]
1535pub enum RowChange {
1536 /// Append `row` to `table`.
1537 Insert { table: String, row: Row },
1538 /// Replace the row at physical `pos` in `table` with `new_row`.
1539 Update {
1540 table: String,
1541 pos: usize,
1542 new_row: Vec<Value>,
1543 },
1544 /// Remove the rows at the given physical `positions` from `table`.
1545 Delete {
1546 table: String,
1547 positions: Vec<usize>,
1548 },
1549}
1550
1551/// v7.34 (crash-recovery P0 #2) — encode a row-level redo log to bytes for
1552/// a WAL record. Self-describing: the writer's `FILE_VERSION` leads so a
1553/// later spg can decode it via the version-gated value codec. Layout:
1554/// `[u8 version][u32 count]` then per change `[u8 op][str table]` and,
1555/// per op, `Insert [u32 n][value×n]`, `Update [u32 pos][u32 n][value×n]`,
1556/// `Delete [u32 n][u32 pos×n]`. Positions are physical (u32 ≤ 4 G rows).
1557#[must_use]
1558pub fn encode_redo_log(changes: &[RowChange]) -> Vec<u8> {
1559 let mut out = Vec::new();
1560 out.push(FILE_VERSION);
1561 codec::write_u32(&mut out, changes.len() as u32);
1562 let write_values = |out: &mut Vec<u8>, vals: &[Value]| {
1563 codec::write_u32(out, vals.len() as u32);
1564 for v in vals {
1565 codec::write_value(out, v);
1566 }
1567 };
1568 for change in changes {
1569 match change {
1570 RowChange::Insert { table, row } => {
1571 out.push(0);
1572 codec::write_str(&mut out, table);
1573 write_values(&mut out, &row.values);
1574 }
1575 RowChange::Update {
1576 table,
1577 pos,
1578 new_row,
1579 } => {
1580 out.push(1);
1581 codec::write_str(&mut out, table);
1582 codec::write_u32(&mut out, *pos as u32);
1583 write_values(&mut out, new_row);
1584 }
1585 RowChange::Delete { table, positions } => {
1586 out.push(2);
1587 codec::write_str(&mut out, table);
1588 codec::write_u32(&mut out, positions.len() as u32);
1589 for p in positions {
1590 codec::write_u32(&mut out, *p as u32);
1591 }
1592 }
1593 }
1594 }
1595 out
1596}
1597
1598/// v7.34 — decode a row-level redo log written by [`encode_redo_log`].
1599/// A truncated / corrupt buffer is a hard error (the embedding layer
1600/// frames each record with its own length + CRC; a frame that decodes
1601/// short is corruption, not a torn tail).
1602pub fn decode_redo_log(bytes: &[u8]) -> Result<Vec<RowChange>, StorageError> {
1603 let version = *bytes
1604 .first()
1605 .ok_or_else(|| StorageError::Corrupt("redo log: empty".into()))?;
1606 let mut cur = codec::Cursor::new(bytes).with_codec_version(version);
1607 let _version = cur.read_u8()?;
1608 let count = cur.read_u32()? as usize;
1609 let mut read_values = |cur: &mut codec::Cursor<'_>| -> Result<Vec<Value>, StorageError> {
1610 let n = cur.read_u32()? as usize;
1611 let mut vals = Vec::with_capacity(n);
1612 for _ in 0..n {
1613 vals.push(cur.read_value()?);
1614 }
1615 Ok(vals)
1616 };
1617 let mut changes = Vec::with_capacity(count);
1618 for _ in 0..count {
1619 let op = cur.read_u8()?;
1620 let table = cur.read_str()?;
1621 let change = match op {
1622 0 => RowChange::Insert {
1623 table,
1624 row: Row::new(read_values(&mut cur)?),
1625 },
1626 1 => {
1627 let pos = cur.read_u32()? as usize;
1628 RowChange::Update {
1629 table,
1630 pos,
1631 new_row: read_values(&mut cur)?,
1632 }
1633 }
1634 2 => {
1635 let n = cur.read_u32()? as usize;
1636 let mut positions = Vec::with_capacity(n);
1637 for _ in 0..n {
1638 positions.push(cur.read_u32()? as usize);
1639 }
1640 RowChange::Delete { table, positions }
1641 }
1642 other => {
1643 return Err(StorageError::Corrupt(alloc::format!(
1644 "redo log: unknown op {other}"
1645 )));
1646 }
1647 };
1648 changes.push(change);
1649 }
1650 Ok(changes)
1651}
1652
1653#[derive(Debug, Clone)]
1654pub struct Table {
1655 schema: TableSchema,
1656 rows: PersistentVec<Row>,
1657 indices: Vec<Index>,
1658 hot_bytes: u64,
1659 /// v6.7.0 — cached count of rows currently materialised in the
1660 /// cold tier via `RowLocator::Cold` entries across THIS table's
1661 /// indices. Populated by `ANALYZE` (walks every BTree index and
1662 /// counts Cold locators); the count survives until the next
1663 /// ANALYZE recomputes it. Surfaced via `spg_statistic.cold_row_count`
1664 /// and `spg_stat_segment.table_name`.
1665 ///
1666 /// Honest scope: this is a CACHED count, not a live one.
1667 /// Freezer / promote / DELETE don't currently update the cache
1668 /// incrementally — they invalidate it by setting the
1669 /// `cold_row_count_stale` flag, and the next ANALYZE re-walks.
1670 /// Incremental maintenance is a v6.7.x candidate if observation
1671 /// shows the ANALYZE walk cost dominates.
1672 cold_row_count: u64,
1673 /// v6.7.0 — set when the cached `cold_row_count` may be wrong
1674 /// because rows moved into / out of the cold tier since the last
1675 /// ANALYZE. The virtual-table surface reports the cached value
1676 /// regardless (operators run ANALYZE to refresh).
1677 cold_row_count_stale: bool,
1678 /// v7.34 (crash-recovery P0 #2) — row-level redo capture buffer.
1679 /// `None` (default, in-memory mode) captures nothing — zero overhead.
1680 /// `Some` (set by the engine when persistence is on, before a
1681 /// mutating call) makes `insert` / `update_row` / `delete_rows`
1682 /// record the physical [`RowChange`] they applied, which the engine
1683 /// drains after the statement and writes to the WAL in place of the
1684 /// SQL text. Transient: never serialized; a `Catalog::clone` between
1685 /// enable and drain copies it (cheap — empty in the steady state).
1686 redo_log: Option<Vec<RowChange>>,
1687}
1688
1689/// Catalog: insertion-ordered `Vec<Table>` for stable iter / serialize,
1690/// plus a `BTreeMap<String, usize>` sidecar index so `get` / `get_mut`
1691/// run in O(log n) instead of the old linear scan with per-element
1692/// string compares.
1693///
1694/// A pure `BTreeMap<String, Table>` was tried in an interim version
1695/// of v3.1.2 and regressed the single-table catalog benches by ~10%
1696/// (the per-element `BTreeMap` overhead outweighs the lookup win
1697/// when n is small). The sidecar shape preserves the insertion-order
1698/// iteration the on-disk encoding relies on and keeps `last_mut`
1699/// (used by the deserialize hot path) cheap.
1700#[derive(Debug, Clone, Default)]
1701pub struct Catalog {
1702 tables: Vec<Table>,
1703 /// `name → tables[index]`. Kept in lock-step with `tables`.
1704 /// `create_table` is the only write path.
1705 by_name: BTreeMap<String, usize>,
1706 /// v5.1: in-memory cold-tier segments. Side-loaded via
1707 /// [`Catalog::load_segment_bytes`] — they live outside the
1708 /// catalog snapshot (caller persists them as separate files
1709 /// and re-loads on boot, until v5.3's `CatalogManifest` makes
1710 /// that wiring automatic). `RowLocator::Cold { segment_id, .. }`
1711 /// indexes this `Vec`. Cleared on `Catalog::new` / fresh
1712 /// `deserialize`.
1713 ///
1714 /// `Arc` wrap keeps `Catalog::clone` at O(N segments) bumps
1715 /// (rather than O(total segment bytes) memcpy) so the v4.42
1716 /// group-commit pre-image rollback invariant — clone is
1717 /// effectively free — survives the cold-tier addition.
1718 ///
1719 /// v6.7.3 — slots became `Option<…>` so cold-segment compaction
1720 /// can tombstone merged sources without breaking the
1721 /// `segment_id = index_into_vec` contract that on-disk
1722 /// `RowLocator::Cold { segment_id }` already serialized.
1723 /// `None` slot = the segment was retired by compaction; the
1724 /// physical file may still be on disk (next CHECKPOINT writes
1725 /// a manifest that no longer lists it, and the file becomes
1726 /// an orphan eligible for offline cleanup).
1727 cold_segments: Vec<Option<Arc<OwnedSegment>>>,
1728 /// v7.12.4 — user-defined functions (PL/pgSQL + SQL).
1729 /// Keyed by function name (PG overloading is out of scope).
1730 /// Bodies are stored as the raw source text the parser saw
1731 /// between `$$ ... $$`; the engine re-parses on each
1732 /// invocation. This keeps `spg-storage` free of `spg-sql`
1733 /// dependency — same pattern as partial-index predicates.
1734 functions: BTreeMap<String, FunctionDef>,
1735 /// v7.12.4 — triggers in insertion order. Multiple triggers
1736 /// per table / event fire in this order (matching PG's
1737 /// alphabetical-by-default with insertion-stable tie-break
1738 /// behaviour — we just keep insertion order for now).
1739 triggers: Vec<TriggerDef>,
1740 /// v7.17.0 — catalogued SEQUENCE objects (Phase 1.1). Each
1741 /// `nextval(name)` reaches in here, atomically increments
1742 /// `last_value` / flips `is_called`, returns the new value.
1743 /// Persisted in catalog FILE_VERSION 26+; older catalogs
1744 /// deserialise with an empty map.
1745 sequences: BTreeMap<String, SequenceDef>,
1746 /// v7.17.0 — catalogued VIEW objects (Phase 1.2). Each
1747 /// `SELECT FROM v` at engine exec-time looks up `v` here and
1748 /// prepends the view body as a synthetic CTE. Persisted in
1749 /// catalog FILE_VERSION 27+; older catalogs deserialise with
1750 /// an empty map.
1751 views: BTreeMap<String, ViewDef>,
1752 /// v7.17.0 — catalogued MATERIALIZED VIEW source registry
1753 /// (Phase 1.3). Maps name → SELECT source. The materialised
1754 /// rows themselves live as a regular `Table` with the same
1755 /// name; REFRESH re-parses + re-executes the source against
1756 /// the table. Persisted in catalog FILE_VERSION 28+;
1757 /// older catalogs deserialise with an empty map.
1758 materialized_views: BTreeMap<String, String>,
1759 /// v7.17.0 — catalogued user-defined ENUM types (Phase 1.4).
1760 /// Maps name → label list. Columns reference these by name
1761 /// via `ColumnSchema.user_enum_type`. Persisted in catalog
1762 /// FILE_VERSION 29+; older catalogs deserialise with an empty
1763 /// map.
1764 enum_types: BTreeMap<String, EnumDef>,
1765 /// v7.17.0 — catalogued user-defined DOMAIN types (Phase 1.5).
1766 /// Maps name → base + CHECK constraints. Columns reference
1767 /// these by name via `ColumnSchema.user_domain_type`.
1768 /// Persisted in catalog FILE_VERSION 30+; older catalogs
1769 /// deserialise with an empty map.
1770 domain_types: BTreeMap<String, DomainDef>,
1771 /// v7.17.0 — schema-namespace registry (Phase 1.6). Tracks
1772 /// which schemas exist. `public`, `pg_catalog`, and
1773 /// `information_schema` are built-in and always present.
1774 /// Schema-qualified table references still strip the prefix
1775 /// at lookup time per v7.16-and-earlier — full
1776 /// schema-as-isolation is v7.18+ scope. Persisted in catalog
1777 /// FILE_VERSION 31+; older catalogs deserialise with just
1778 /// the built-ins.
1779 schemas: alloc::collections::BTreeSet<String>,
1780}
1781
/// v7.12.4 — a catalogued user-defined function.
///
/// `body` holds the raw source text found between `$$ ... $$`, and the
/// engine re-parses it on every invocation. Storing text rather than a
/// parsed form keeps the storage codec stable while the PL/pgSQL surface
/// grows (no breaking-change risk on the disk format).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FunctionDef {
    pub name: String,
    /// Display form of the argument list, e.g.
    /// `"(name TEXT, ts TIMESTAMP)"`; the empty list `"()"` for the
    /// trigger function shape. Canonicalised by the parser before storage.
    pub args_repr: String,
    /// Display form of the return type, e.g. `"TRIGGER"` / `"INT"` /
    /// `"SETOF text"`. The engine special-cases `"TRIGGER"`
    /// (case-insensitive) to gate trigger-only semantics (NEW/OLD).
    pub returns: String,
    /// The `LANGUAGE` clause, lowercased: `"plpgsql"` / `"sql"`.
    pub language: String,
    /// Source body of the function. For PL/pgSQL this includes the
    /// surrounding `BEGIN ... END;`; for SQL, the statement(s). Because
    /// the engine re-parses on invocation, a bad body surfaces as a parse
    /// error at CALL time, not at CREATE.
    pub body: String,
}
1807
/// v7.12.4 — a catalogued trigger.
///
/// The trigger names its function rather than embedding it; that
/// function must already exist when the trigger is created (forward
/// references are deferred to v7.12.5+).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TriggerDef {
    pub name: String,
    /// The watched table. The trigger is dropped when the table drops.
    pub table: String,
    /// `"BEFORE"` / `"AFTER"` / `"INSTEAD OF"`, stored as the uppercased
    /// keyword so deserialised catalogs round-trip without
    /// canonicalisation surprises.
    pub timing: String,
    /// Each entry is one of `"INSERT"` / `"UPDATE"` / `"DELETE"` /
    /// `"TRUNCATE"`. `INSERT OR UPDATE` parses to two entries.
    pub events: Vec<String>,
    /// `"ROW"` / `"STATEMENT"`. v7.12.4 ships `"ROW"` only; `"STATEMENT"`
    /// parses and persists, but the executor refuses it at trigger fire
    /// time.
    pub for_each: String,
    /// Name of the PL/pgSQL function to invoke.
    pub function: String,
    /// v7.13.0 — `UPDATE OF col, col, …` column-list filter (mailrs
    /// round-5 G7). When non-empty, the trigger fires only if at least one
    /// of these columns appears in the UPDATE's SET list; empty means no
    /// column filter. Stored in catalog FILE_VERSION 23+; older catalogs
    /// deserialise with an empty vec.
    pub update_columns: Vec<String>,
    /// v7.16.1 — whether the trigger fires when its watched event occurs.
    /// Toggled by `ALTER TABLE … { ENABLE | DISABLE } TRIGGER …`;
    /// pg_dump --disable-triggers wraps every data block with a
    /// DISABLE/ENABLE pair so the rows already-computed in prod don't get
    /// re-rewritten. Defaults to `true` at CREATE TRIGGER time. Stored in
    /// catalog FILE_VERSION 25+; older catalogs deserialise with
    /// `enabled = true`.
    pub enabled: bool,
}
1846
1847/// v7.17.0 — catalogued SEQUENCE. PG semantics: a counter object
1848/// returning monotonically increasing values via `nextval(name)`.
1849/// `last_value` is the most recent value handed out; `is_called`
1850/// is false until the first `nextval`/`setval`. Stored separately
1851/// from tables in the catalog.
1852#[derive(Debug, Clone, PartialEq, Eq)]
1853pub struct SequenceDef {
1854 pub name: String,
1855 /// Data type — narrows the i64 range. PG default BIGINT.
1856 pub data_type: SequenceDataType,
1857 pub start: i64,
1858 pub increment: i64,
1859 pub min_value: i64,
1860 pub max_value: i64,
1861 pub cache: i64,
1862 pub cycle: bool,
1863 /// `OWNED BY` target — `(table, column)` or NONE.
1864 pub owned_by: Option<(String, String)>,
1865 /// Most recently handed-out value. Meaningless when
1866 /// `is_called == false`; in that case the NEXT `nextval`
1867 /// will return `start`.
1868 pub last_value: i64,
1869 pub is_called: bool,
1870}
1871
/// v7.17.0 — integer width of a sequence, i.e. the type named by its `AS`
/// clause. It selects the default bounds (see `default_bounds`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SequenceDataType {
    /// 16-bit range.
    SmallInt,
    /// 32-bit range.
    Int,
    /// 64-bit range.
    BigInt,
}
1879
/// v7.17.0 Phase 1.6 — reports whether `name` is one of the schemas every
/// Catalog understands without an explicit CREATE SCHEMA: `public`,
/// `pg_catalog` or `information_schema`, compared ASCII
/// case-insensitively. Used by [`Catalog::schema_exists`] and the
/// engine's schema-qualified lookup path.
#[must_use]
pub fn is_builtin_schema(name: &str) -> bool {
    const BUILTIN_SCHEMAS: [&str; 3] = ["public", "pg_catalog", "information_schema"];
    BUILTIN_SCHEMAS
        .iter()
        .any(|builtin| name.eq_ignore_ascii_case(builtin))
}
1890
/// v7.17.0 — parse a PG-canonical UUID text representation into the
/// 16-byte network-order layout used by `Value::Uuid`.
///
/// Surrounding whitespace is trimmed first. Accepted shapes (hex digits
/// in either case):
///   * canonical hyphenated 8-4-4-4-12
///     (`550e8400-e29b-41d4-a716-446655440000`)
///   * unhyphenated 32-char hex (`550e8400e29b41d4a716446655440000`)
///   * either form wrapped in `{ ... }`
///
/// Returns `None` for any malformed input (wrong length, non-hex
/// characters, misplaced hyphens). The caller surfaces a SQL error at
/// coercion time — silently accepting garbage would mask application
/// bugs and is exactly the divergence from PG that breaks the 0-change
/// cutover promise.
#[must_use]
pub fn parse_uuid_str(input: &str) -> Option<[u8; 16]> {
    let trimmed = input.trim();
    // Braces only count as a wrapper when both are present.
    let core = trimmed
        .strip_prefix('{')
        .and_then(|rest| rest.strip_suffix('}'))
        .unwrap_or(trimmed);
    let raw = core.as_bytes();

    // Gather the 32 hex digits. Working on bytes is safe for arbitrary
    // UTF-8: any non-ASCII byte is rejected by `hex_nibble` below.
    let mut digits = [0u8; 32];
    match raw.len() {
        32 => digits.copy_from_slice(raw),
        36 => {
            // Hyphens must sit at exactly offsets 8, 13, 18 and 23.
            if [8usize, 13, 18, 23].iter().any(|&at| raw[at] != b'-') {
                return None;
            }
            digits[..8].copy_from_slice(&raw[..8]);
            digits[8..12].copy_from_slice(&raw[9..13]);
            digits[12..16].copy_from_slice(&raw[14..18]);
            digits[16..20].copy_from_slice(&raw[19..23]);
            digits[20..].copy_from_slice(&raw[24..]);
        }
        _ => return None,
    }

    let mut out = [0u8; 16];
    for (slot, pair) in out.iter_mut().zip(digits.chunks_exact(2)) {
        let hi = hex_nibble(pair[0])?;
        let lo = hex_nibble(pair[1])?;
        *slot = (hi << 4) | lo;
    }
    Some(out)
}

/// Value (0–15) of one ASCII hex digit in either case; `None` for any
/// other byte.
fn hex_nibble(b: u8) -> Option<u8> {
    let value = match b {
        b'0'..=b'9' => b - b'0',
        b'a'..=b'f' => b - b'a' + 10,
        b'A'..=b'F' => b - b'A' + 10,
        _ => return None,
    };
    Some(value)
}
1951
/// v7.17.0 — render a `Value::Uuid` payload in the canonical lowercase
/// 8-4-4-4-12 hyphenated form that a PG `text` cast surfaces.
#[must_use]
pub fn format_uuid(b: &[u8; 16]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    // Bytes per hyphen-separated group: 8-4-4-4-12 hex digits.
    const GROUP_BYTES: [usize; 5] = [4, 2, 2, 2, 6];

    let mut text = String::with_capacity(36);
    let mut rest: &[u8] = b;
    for (group_idx, group_len) in GROUP_BYTES.iter().enumerate() {
        if group_idx != 0 {
            text.push('-');
        }
        let (group, tail) = rest.split_at(*group_len);
        for byte in group {
            text.push(DIGITS[usize::from(byte >> 4)] as char);
            text.push(DIGITS[usize::from(byte & 0x0f)] as char);
        }
        rest = tail;
    }
    text
}
1967
1968/// v7.17.0 Phase 1.5 — catalogued user-defined DOMAIN. A domain
1969/// is a named CHECK-constrained alias over a built-in type;
1970/// columns bound to it inherit the base type plus the CHECK
1971/// predicates + NOT NULL + DEFAULT at INSERT/UPDATE time.
1972/// `default` / `checks` are stored as Display-form source so
1973/// `spg-storage` stays free of `spg-sql` dependency — same
1974/// pattern as FunctionDef / ViewDef.
1975#[derive(Debug, Clone, PartialEq, Eq)]
1976pub struct DomainDef {
1977 pub name: String,
1978 pub base_type: DataType,
1979 pub nullable: bool,
1980 pub default: Option<String>,
1981 pub checks: Vec<String>,
1982}
1983
/// v7.17.0 Phase 1.4 — a catalogued user-defined ENUM type.
///
/// `labels` preserves declaration order (PG enum ordering follows the
/// declared order). On INSERT/UPDATE into a column bound to this enum,
/// the engine looks the value up in `labels` and rejects non-members.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EnumDef {
    pub name: String,
    pub labels: Vec<String>,
}
1994
/// v7.17.0 Phase 1.2 — a catalogued VIEW.
///
/// The body is kept as source text taken from between `AS` and the
/// statement terminator, and the engine re-parses it on each invocation.
/// Same pattern as `FunctionDef` — it keeps `spg-storage` free of a
/// `spg-sql` dependency.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ViewDef {
    pub name: String,
    /// Optional `(col, col, …)` rename list; empty when the body's
    /// projected names are used directly.
    pub columns: Vec<String>,
    /// The SELECT source. It is Display-rendered at storage time so the
    /// catalog round-trips a deterministic form regardless of whitespace
    /// / comments in the original input, and re-parsed at
    /// SELECT-from-view time to materialise as a synthetic CTE.
    pub body: String,
}
2012
2013impl SequenceDataType {
2014 /// PG default min/max per AS clause.
2015 pub fn default_bounds(self, increment_positive: bool) -> (i64, i64) {
2016 match self {
2017 Self::SmallInt => {
2018 if increment_positive {
2019 (1, i64::from(i16::MAX))
2020 } else {
2021 (i64::from(i16::MIN), -1)
2022 }
2023 }
2024 Self::Int => {
2025 if increment_positive {
2026 (1, i64::from(i32::MAX))
2027 } else {
2028 (i64::from(i32::MIN), -1)
2029 }
2030 }
2031 Self::BigInt => {
2032 if increment_positive {
2033 (1, i64::MAX)
2034 } else {
2035 (i64::MIN, -1)
2036 }
2037 }
2038 }
2039 }
2040}
2041
2042impl Catalog {
2043 pub const fn new() -> Self {
2044 Self {
2045 tables: Vec::new(),
2046 by_name: BTreeMap::new(),
2047 cold_segments: Vec::new(),
2048 functions: BTreeMap::new(),
2049 triggers: Vec::new(),
2050 sequences: BTreeMap::new(),
2051 views: BTreeMap::new(),
2052 materialized_views: BTreeMap::new(),
2053 enum_types: BTreeMap::new(),
2054 domain_types: BTreeMap::new(),
2055 schemas: alloc::collections::BTreeSet::new(),
2056 }
2057 }
2058
2059 /// v7.12.4 — read-only view of catalogued user-defined
2060 /// functions. Engine callers go through here to look up the
2061 /// function body before re-parsing it for invocation.
2062 pub const fn functions(&self) -> &BTreeMap<String, FunctionDef> {
2063 &self.functions
2064 }
2065
2066 /// v7.12.4 — register a new user-defined function. With
2067 /// `or_replace = false`, errors if the name is taken. The
2068 /// engine validates the body before passing it here.
2069 pub fn create_function(
2070 &mut self,
2071 def: FunctionDef,
2072 or_replace: bool,
2073 ) -> Result<(), StorageError> {
2074 if !or_replace && self.functions.contains_key(&def.name) {
2075 return Err(StorageError::Corrupt(format!(
2076 "function {:?} already exists (drop or use CREATE OR REPLACE)",
2077 def.name
2078 )));
2079 }
2080 self.functions.insert(def.name.clone(), def);
2081 Ok(())
2082 }
2083
2084 /// v7.12.4 — remove a user-defined function by name. Returns
2085 /// `true` if a function was removed, `false` if none matched.
2086 /// Caller decides whether to surface `if_exists` semantics.
2087 pub fn drop_function(&mut self, name: &str) -> bool {
2088 self.functions.remove(name).is_some()
2089 }
2090
2091 /// v7.17.0 — read-only handle to catalogued sequences.
2092 pub const fn sequences(&self) -> &BTreeMap<String, SequenceDef> {
2093 &self.sequences
2094 }
2095
2096 /// v7.17.0 — register a new SEQUENCE. Errors if `name`
2097 /// collides with an existing sequence and `if_not_exists`
2098 /// is false.
2099 pub fn create_sequence(
2100 &mut self,
2101 def: SequenceDef,
2102 if_not_exists: bool,
2103 ) -> Result<(), StorageError> {
2104 if self.sequences.contains_key(&def.name) {
2105 if if_not_exists {
2106 return Ok(());
2107 }
2108 return Err(StorageError::Corrupt(format!(
2109 "sequence {:?} already exists",
2110 def.name
2111 )));
2112 }
2113 self.sequences.insert(def.name.clone(), def);
2114 Ok(())
2115 }
2116
2117 /// v7.17.0 — remove a SEQUENCE by name. Returns `true` if a
2118 /// sequence was removed, `false` if none matched. Caller
2119 /// surfaces IF EXISTS semantics.
2120 pub fn drop_sequence(&mut self, name: &str) -> bool {
2121 self.sequences.remove(name).is_some()
2122 }
2123
2124 /// v7.17.0 — atomic nextval. Increments `last_value` per
2125 /// `increment`, returns the new value, sets `is_called`.
2126 /// Returns an error on CYCLE-less overflow.
2127 pub fn sequence_next_value(&mut self, name: &str) -> Result<i64, StorageError> {
2128 let Some(seq) = self.sequences.get_mut(name) else {
2129 return Err(StorageError::Corrupt(format!(
2130 "sequence {name:?} does not exist"
2131 )));
2132 };
2133 // PG semantics: when !is_called (fresh sequence or
2134 // setval(_, false)), the next nextval returns the stored
2135 // `last_value`. When is_called, it advances by `increment`
2136 // and CYCLE-wraps on overflow.
2137 let candidate = if seq.is_called {
2138 let next = seq.last_value.checked_add(seq.increment).ok_or_else(|| {
2139 StorageError::Corrupt(format!("sequence {name:?} arithmetic overflow"))
2140 })?;
2141 if seq.increment > 0 {
2142 if next > seq.max_value {
2143 if seq.cycle {
2144 seq.min_value
2145 } else {
2146 return Err(StorageError::Corrupt(format!(
2147 "sequence {name:?} reached MAXVALUE ({})",
2148 seq.max_value
2149 )));
2150 }
2151 } else {
2152 next
2153 }
2154 } else if next < seq.min_value {
2155 if seq.cycle {
2156 seq.max_value
2157 } else {
2158 return Err(StorageError::Corrupt(format!(
2159 "sequence {name:?} reached MINVALUE ({})",
2160 seq.min_value
2161 )));
2162 }
2163 } else {
2164 next
2165 }
2166 } else {
2167 seq.last_value
2168 };
2169 seq.last_value = candidate;
2170 seq.is_called = true;
2171 Ok(candidate)
2172 }
2173
2174 /// v7.17.0 — currval. Errors if the session has never called
2175 /// nextval on this sequence (PG semantics). At the catalog
2176 /// level we approximate "session" with "is_called persisted";
2177 /// the engine session-tracking layer can wrap this for the
2178 /// strict per-session semantics later.
2179 pub fn sequence_current_value(&self, name: &str) -> Result<i64, StorageError> {
2180 let Some(seq) = self.sequences.get(name) else {
2181 return Err(StorageError::Corrupt(format!(
2182 "sequence {name:?} does not exist"
2183 )));
2184 };
2185 if !seq.is_called {
2186 return Err(StorageError::Corrupt(format!(
2187 "currval of sequence {name:?} is not yet defined in this session"
2188 )));
2189 }
2190 Ok(seq.last_value)
2191 }
2192
2193 /// v7.17.0 — setval(name, value [, is_called]). PG returns
2194 /// `value` regardless. `is_called=true` means the NEXT
2195 /// nextval will return `value + increment`; `is_called=false`
2196 /// means the next nextval will return `value`.
2197 pub fn sequence_set_value(
2198 &mut self,
2199 name: &str,
2200 value: i64,
2201 is_called: bool,
2202 ) -> Result<i64, StorageError> {
2203 let Some(seq) = self.sequences.get_mut(name) else {
2204 return Err(StorageError::Corrupt(format!(
2205 "sequence {name:?} does not exist"
2206 )));
2207 };
2208 seq.last_value = value;
2209 seq.is_called = is_called;
2210 Ok(value)
2211 }
2212
2213 /// v7.17.0 Phase 1.2 — read-only handle to catalogued views.
2214 pub const fn views(&self) -> &BTreeMap<String, ViewDef> {
2215 &self.views
2216 }
2217
2218 /// v7.17.0 Phase 1.2 — install a VIEW. `or_replace=true`
2219 /// overwrites an existing entry; `if_not_exists=true` is a
2220 /// silent no-op when the name is taken. Errors if both flags
2221 /// are off and the name collides.
2222 pub fn create_view(
2223 &mut self,
2224 def: ViewDef,
2225 or_replace: bool,
2226 if_not_exists: bool,
2227 ) -> Result<(), StorageError> {
2228 if self.views.contains_key(&def.name) {
2229 if or_replace {
2230 self.views.insert(def.name.clone(), def);
2231 return Ok(());
2232 }
2233 if if_not_exists {
2234 return Ok(());
2235 }
2236 return Err(StorageError::Corrupt(format!(
2237 "view {:?} already exists",
2238 def.name
2239 )));
2240 }
2241 // Reject name collision with tables / sequences — same
2242 // namespace per PG.
2243 if self.by_name.contains_key(&def.name) {
2244 return Err(StorageError::Corrupt(format!(
2245 "view {:?} would shadow an existing table",
2246 def.name
2247 )));
2248 }
2249 if self.sequences.contains_key(&def.name) {
2250 return Err(StorageError::Corrupt(format!(
2251 "view {:?} would shadow an existing sequence",
2252 def.name
2253 )));
2254 }
2255 self.views.insert(def.name.clone(), def);
2256 Ok(())
2257 }
2258
2259 /// v7.17.0 Phase 1.2 — remove a view by name. Returns true if
2260 /// a view was removed.
2261 pub fn drop_view(&mut self, name: &str) -> bool {
2262 self.views.remove(name).is_some()
2263 }
2264
2265 /// v7.17.0 Phase 1.3 — read-only handle to the materialised-
2266 /// view source registry. Each entry pairs with a regular
2267 /// table of the same name that holds the cached rows.
2268 pub const fn materialized_views(&self) -> &BTreeMap<String, String> {
2269 &self.materialized_views
2270 }
2271
2272 /// v7.17.0 Phase 1.3 — register a source for a materialised
2273 /// view. Caller has already created the backing table.
2274 pub fn register_materialized_view(&mut self, name: String, body: String) {
2275 self.materialized_views.insert(name, body);
2276 }
2277
2278 /// v7.17.0 Phase 1.3 — drop the source registry entry. Returns
2279 /// true if a source was unregistered. Caller separately drops
2280 /// the backing table.
2281 pub fn drop_materialized_view_source(&mut self, name: &str) -> bool {
2282 self.materialized_views.remove(name).is_some()
2283 }
2284
2285 /// v7.17.0 Phase 1.4 — read-only handle to user-defined ENUM
2286 /// catalog.
2287 pub const fn enum_types(&self) -> &BTreeMap<String, EnumDef> {
2288 &self.enum_types
2289 }
2290
2291 /// v7.17.0 Phase 1.4 — install a new ENUM type. Errors if
2292 /// `name` collides with an existing enum (no IF NOT EXISTS
2293 /// per PG semantics for CREATE TYPE).
2294 pub fn create_enum_type(&mut self, def: EnumDef) -> Result<(), StorageError> {
2295 if self.enum_types.contains_key(&def.name) {
2296 return Err(StorageError::Corrupt(format!(
2297 "type {:?} already exists",
2298 def.name
2299 )));
2300 }
2301 self.enum_types.insert(def.name.clone(), def);
2302 Ok(())
2303 }
2304
2305 /// v7.17.0 Phase 1.4 — drop an ENUM type by name. Returns
2306 /// true if a type was removed.
2307 pub fn drop_enum_type(&mut self, name: &str) -> bool {
2308 self.enum_types.remove(name).is_some()
2309 }
2310
2311 /// v7.17.0 Phase 1.5 — read-only handle to DOMAIN catalog.
2312 pub const fn domain_types(&self) -> &BTreeMap<String, DomainDef> {
2313 &self.domain_types
2314 }
2315
2316 /// v7.17.0 Phase 1.5 — install a DOMAIN. Errors on collision
2317 /// with an existing domain.
2318 pub fn create_domain_type(&mut self, def: DomainDef) -> Result<(), StorageError> {
2319 if self.domain_types.contains_key(&def.name) {
2320 return Err(StorageError::Corrupt(format!(
2321 "domain {:?} already exists",
2322 def.name
2323 )));
2324 }
2325 self.domain_types.insert(def.name.clone(), def);
2326 Ok(())
2327 }
2328
2329 /// v7.17.0 Phase 1.5 — drop a DOMAIN by name.
2330 pub fn drop_domain_type(&mut self, name: &str) -> bool {
2331 self.domain_types.remove(name).is_some()
2332 }
2333
2334 /// v7.17.0 Phase 1.6 — read-only handle to the user-created
2335 /// schema registry. Built-in schemas (`public`, `pg_catalog`,
2336 /// `information_schema`) are NOT included here; use
2337 /// [`schema_exists`](Self::schema_exists) for the full
2338 /// check.
2339 pub const fn user_schemas(&self) -> &alloc::collections::BTreeSet<String> {
2340 &self.schemas
2341 }
2342
2343 /// v7.17.0 Phase 1.6 — schema-name resolver. Returns true
2344 /// for built-in schemas + every user-CREATEd one. Used by
2345 /// CREATE SCHEMA collision checks and (future) by
2346 /// information_schema.schemata.
2347 pub fn schema_exists(&self, name: &str) -> bool {
2348 is_builtin_schema(name) || self.schemas.contains(name)
2349 }
2350
2351 /// v7.17.0 Phase 1.6 — register a new schema. Errors if the
2352 /// name already exists and `if_not_exists=false`. Built-in
2353 /// names cannot be redeclared.
2354 pub fn create_schema(&mut self, name: String, if_not_exists: bool) -> Result<(), StorageError> {
2355 if is_builtin_schema(&name) {
2356 if if_not_exists {
2357 return Ok(());
2358 }
2359 return Err(StorageError::Corrupt(format!(
2360 "schema {name:?} is built-in and cannot be redeclared"
2361 )));
2362 }
2363 if self.schemas.contains(&name) {
2364 if if_not_exists {
2365 return Ok(());
2366 }
2367 return Err(StorageError::Corrupt(format!(
2368 "schema {name:?} already exists"
2369 )));
2370 }
2371 self.schemas.insert(name);
2372 Ok(())
2373 }
2374
2375 /// v7.17.0 Phase 1.6 — drop a user-created schema. Returns
2376 /// true if a schema was removed. Built-in names always
2377 /// return false (cannot be dropped). Tables that previously
2378 /// used the schema as a prefix keep their bare name and stay
2379 /// queryable — this is the "prefix routing, not isolation"
2380 /// posture documented in v7.17 Phase 1.6.
2381 pub fn drop_schema(&mut self, name: &str) -> Result<bool, StorageError> {
2382 if is_builtin_schema(name) {
2383 return Err(StorageError::Corrupt(format!(
2384 "schema {name:?} is built-in and cannot be dropped"
2385 )));
2386 }
2387 Ok(self.schemas.remove(name))
2388 }
2389
2390 /// v7.17.0 — ALTER SEQUENCE option merge. Caller-provided
2391 /// updates overwrite the matching fields; unset fields keep
2392 /// their stored values. RESTART variants update last_value
2393 /// directly per PG: `RESTART` resets to current `start`;
2394 /// `RESTART WITH n` resets to `n`.
2395 #[allow(clippy::too_many_arguments)]
2396 pub fn alter_sequence(
2397 &mut self,
2398 name: &str,
2399 increment: Option<i64>,
2400 min_value: Option<i64>,
2401 max_value: Option<i64>,
2402 start: Option<i64>,
2403 restart: Option<Option<i64>>,
2404 cache: Option<i64>,
2405 cycle: Option<bool>,
2406 owned_by: Option<Option<(String, String)>>,
2407 ) -> Result<(), StorageError> {
2408 let Some(seq) = self.sequences.get_mut(name) else {
2409 return Err(StorageError::Corrupt(format!(
2410 "sequence {name:?} does not exist"
2411 )));
2412 };
2413 if let Some(v) = increment {
2414 seq.increment = v;
2415 }
2416 if let Some(v) = min_value {
2417 seq.min_value = v;
2418 }
2419 if let Some(v) = max_value {
2420 seq.max_value = v;
2421 }
2422 if let Some(v) = start {
2423 seq.start = v;
2424 }
2425 if let Some(restart_value) = restart {
2426 seq.last_value = restart_value.unwrap_or(seq.start);
2427 seq.is_called = false;
2428 }
2429 if let Some(v) = cache {
2430 seq.cache = v;
2431 }
2432 if let Some(v) = cycle {
2433 seq.cycle = v;
2434 }
2435 if let Some(v) = owned_by {
2436 seq.owned_by = v;
2437 }
2438 Ok(())
2439 }
2440
2441 /// v7.12.4 — read-only slice of all catalogued triggers.
2442 /// Engine row-write paths filter this by (table, event,
2443 /// timing) and fire matches in slice order.
2444 pub fn triggers(&self) -> &[TriggerDef] {
2445 &self.triggers
2446 }
2447
2448 /// v7.15.0 — mutable handle to the trigger slice for
2449 /// `ALTER TABLE … RENAME COLUMN`, which rewrites every
2450 /// `update_columns` entry that referenced the renamed
2451 /// column.
2452 pub fn triggers_mut(&mut self) -> &mut Vec<TriggerDef> {
2453 &mut self.triggers
2454 }
2455
2456 /// v7.12.4 — register a new trigger. With `or_replace = false`,
2457 /// errors when a trigger with the same name already exists on
2458 /// the same table (PG scoping rule — trigger names are
2459 /// per-table, not global). Trigger function must already
2460 /// exist in the catalog at registration time.
2461 pub fn create_trigger(
2462 &mut self,
2463 def: TriggerDef,
2464 or_replace: bool,
2465 ) -> Result<(), StorageError> {
2466 if !self.by_name.contains_key(&def.table) {
2467 return Err(StorageError::TableNotFound {
2468 name: def.table.clone(),
2469 });
2470 }
2471 if !self.functions.contains_key(&def.function) {
2472 return Err(StorageError::Corrupt(format!(
2473 "trigger {:?} references unknown function {:?}",
2474 def.name, def.function
2475 )));
2476 }
2477 let dup = self
2478 .triggers
2479 .iter()
2480 .position(|t| t.name == def.name && t.table == def.table);
2481 match (dup, or_replace) {
2482 (Some(_), false) => Err(StorageError::Corrupt(format!(
2483 "trigger {:?} already exists on table {:?}",
2484 def.name, def.table
2485 ))),
2486 (Some(i), true) => {
2487 self.triggers[i] = def;
2488 Ok(())
2489 }
2490 (None, _) => {
2491 self.triggers.push(def);
2492 Ok(())
2493 }
2494 }
2495 }
2496
2497 /// v7.12.4 — remove a trigger by `(name, table)`. Returns
2498 /// `true` if one was removed.
2499 pub fn drop_trigger(&mut self, name: &str, table: &str) -> bool {
2500 let before = self.triggers.len();
2501 self.triggers
2502 .retain(|t| !(t.name == name && t.table == table));
2503 before != self.triggers.len()
2504 }
2505
2506 pub fn create_table(&mut self, schema: TableSchema) -> Result<(), StorageError> {
2507 if self.by_name.contains_key(&schema.name) {
2508 return Err(StorageError::DuplicateTable {
2509 name: schema.name.clone(),
2510 });
2511 }
2512 let idx = self.tables.len();
2513 let name = schema.name.clone();
2514 self.tables.push(Table::new(schema));
2515 self.by_name.insert(name, idx);
2516 Ok(())
2517 }
2518
2519 pub fn get(&self, name: &str) -> Option<&Table> {
2520 let idx = *self.by_name.get(name)?;
2521 self.tables.get(idx)
2522 }
2523
2524 pub fn get_mut(&mut self, name: &str) -> Option<&mut Table> {
2525 let idx = *self.by_name.get(name)?;
2526 self.tables.get_mut(idx)
2527 }
2528
2529 /// v7.34 (crash-recovery P0 #2) — replay a row-level redo log onto
2530 /// this catalog (the [`RowChange`] physical-redo apply primitive that
2531 /// row-level WAL recovery will use in place of statement re-execution).
2532 /// Applies each change in order via the same `Table` mutators the
2533 /// engine used — no uniqueness/FK/parse/plan: the original execution
2534 /// already validated, replay trusts and applies. Positions are
2535 /// physical and only valid when replayed from the matching checkpoint
2536 /// baseline in original order (see [`RowChange`] docs).
2537 ///
2538 /// A change naming an absent table, or whose position is out of range,
2539 /// is a corrupt/misaligned log and surfaces as an error rather than a
2540 /// silent skip.
2541 pub fn apply_redo(&mut self, changes: &[RowChange]) -> Result<(), StorageError> {
2542 for change in changes {
2543 match change {
2544 RowChange::Insert { table, row } => {
2545 self.table_for_redo(table)?.insert(row.clone())?;
2546 }
2547 RowChange::Update {
2548 table,
2549 pos,
2550 new_row,
2551 } => {
2552 self.table_for_redo(table)?
2553 .update_row(*pos, new_row.clone())?;
2554 }
2555 RowChange::Delete { table, positions } => {
2556 self.table_for_redo(table)?.delete_rows(positions);
2557 }
2558 }
2559 }
2560 Ok(())
2561 }
2562
2563 fn table_for_redo(&mut self, name: &str) -> Result<&mut Table, StorageError> {
2564 self.get_mut(name)
2565 .ok_or_else(|| StorageError::Corrupt(alloc::format!("redo: unknown table {name:?}")))
2566 }
2567
2568 /// v7.34 (crash-recovery P0 #2) — enable row-level redo capture on
2569 /// every table (the engine calls this before a mutating statement
2570 /// when persistence is on; idempotent, keeps any in-flight capture).
2571 pub fn enable_redo_all(&mut self) {
2572 for t in &mut self.tables {
2573 t.enable_redo();
2574 }
2575 }
2576
2577 /// v7.34 — drain the row-level redo captured across all tables, in
2578 /// table order then per-table apply order, and stop capturing. The
2579 /// engine calls this after a successful mutating statement and writes
2580 /// the returned [`RowChange`]s to the WAL in place of the SQL text.
2581 pub fn drain_redo(&mut self) -> Vec<RowChange> {
2582 let mut all = Vec::new();
2583 for t in &mut self.tables {
2584 all.extend(t.take_redo());
2585 }
2586 all
2587 }
2588
2589 pub fn table_count(&self) -> usize {
2590 self.tables.len()
2591 }
2592
2593 /// v7.14.0 — remove a table by name. Returns `true` when the
2594 /// table existed (and is now gone), `false` when it didn't.
2595 /// Used by `DROP TABLE` from pg_dump / mysqldump preambles
2596 /// where the dump re-creates schema and starts with
2597 /// `DROP TABLE IF EXISTS`.
2598 pub fn drop_table(&mut self, name: &str) -> bool {
2599 let Some(idx) = self.by_name.remove(name) else {
2600 return false;
2601 };
2602 // swap_remove invalidates the trailing index → rebuild
2603 // by_name for affected entries.
2604 self.tables.swap_remove(idx);
2605 // Re-stamp moved table's index slot in by_name.
2606 if idx < self.tables.len() {
2607 let moved_name = self.tables[idx].schema.name.clone();
2608 self.by_name.insert(moved_name, idx);
2609 }
2610 true
2611 }
2612
2613 /// v7.16.2 — rename a table (mailrs round-10 A.5). Updates
2614 /// the schema name, the catalog name → index map, and
2615 /// rewrites every reference dangling at the table name:
2616 /// * every FK on every OTHER table whose `parent_table`
2617 /// pointed at the old name now points at the new
2618 /// name, so FK enforcement keeps working
2619 /// * every trigger watching the table updates its `table`
2620 /// field
2621 /// Returns `Ok` on success; `Err(StorageError::TableNotFound)`
2622 /// when the old name isn't in the catalog and
2623 /// `Err(StorageError::DuplicateTable)` when the new name is
2624 /// already taken.
2625 pub fn rename_table(&mut self, old: &str, new: &str) -> Result<(), StorageError> {
2626 if old == new {
2627 return Ok(());
2628 }
2629 if self.by_name.contains_key(new) {
2630 return Err(StorageError::Corrupt(format!(
2631 "rename_table: target name {new:?} already exists"
2632 )));
2633 }
2634 let idx = self
2635 .by_name
2636 .remove(old)
2637 .ok_or_else(|| StorageError::TableNotFound { name: old.into() })?;
2638 self.tables[idx].schema.name = new.to_string();
2639 self.by_name.insert(new.to_string(), idx);
2640 for t in &mut self.tables {
2641 for fk in &mut t.schema.foreign_keys {
2642 if fk.parent_table == old {
2643 fk.parent_table = new.to_string();
2644 }
2645 }
2646 }
2647 for trig in &mut self.triggers {
2648 if trig.table == old {
2649 trig.table = new.to_string();
2650 }
2651 }
2652 Ok(())
2653 }
2654
2655 /// v7.16.2 — rename an index by name. Walks every table
2656 /// since the index lives on its owning table; updates the
2657 /// name in place. Errors with `IndexNotFound` when no
2658 /// index matches. mailrs round-10 A.5.
2659 pub fn rename_index(&mut self, old: &str, new: &str) -> Result<(), StorageError> {
2660 if old == new {
2661 return Ok(());
2662 }
2663 // Reject the new name if it already exists anywhere.
2664 for t in &self.tables {
2665 if t.indices.iter().any(|i| i.name == new) {
2666 return Err(StorageError::Corrupt(format!(
2667 "rename_index: target name {new:?} already exists"
2668 )));
2669 }
2670 }
2671 for t in &mut self.tables {
2672 for i in &mut t.indices {
2673 if i.name == old {
2674 i.name = new.to_string();
2675 return Ok(());
2676 }
2677 }
2678 }
2679 Err(StorageError::IndexNotFound { name: old.into() })
2680 }
2681
2682 /// v7.14.0 — remove a named index across the catalog.
2683 /// Returns `true` when found + dropped.
2684 pub fn drop_named_index(&mut self, name: &str) -> bool {
2685 for t in &mut self.tables {
2686 let before = t.indices.len();
2687 t.indices.retain(|i| i.name != name);
2688 if t.indices.len() != before {
2689 return true;
2690 }
2691 }
2692 false
2693 }
2694
2695 /// Borrow-free copy of every table's name in catalog order
2696 /// (= insertion order, matching the on-disk encoding).
2697 pub fn table_names(&self) -> Vec<String> {
2698 self.tables.iter().map(|t| t.schema.name.clone()).collect()
2699 }
2700
2701 /// v5.1: register a cold-tier segment that already lives in
2702 /// memory (caller did the file read). Returns the
2703 /// `segment_id` that `RowLocator::Cold { segment_id, .. }`
2704 /// will reference — currently this is just the index into
2705 /// `cold_segments`, but treat it as an opaque token.
2706 ///
2707 /// Storage is `no_std`, so file I/O is the caller's
2708 /// responsibility — `spg-server` reads the file and forwards
2709 /// the bytes here. The bytes stay resident in the catalog
2710 /// for the life of the `Catalog`, parsed only once.
2711 pub fn load_segment_bytes(&mut self, bytes: Vec<u8>) -> Result<u32, StorageError> {
2712 let id = u32::try_from(self.cold_segments.len()).map_err(|_| {
2713 StorageError::Corrupt("cold segment count would exceed u32::MAX".into())
2714 })?;
2715 let seg = OwnedSegment::from_bytes(bytes)
2716 .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
2717 self.cold_segments.push(Some(Arc::new(seg)));
2718 Ok(id)
2719 }
2720
2721 /// v6.7.3 — register a cold-tier segment at a specific id. Used
2722 /// by the spg-server manifest-boot path so segments whose
2723 /// neighbouring ids were retired by compaction still get back
2724 /// the same `segment_id` they had pre-restart (the
2725 /// `RowLocator::Cold { segment_id }` baked into the BTree-index
2726 /// snapshot persists across restart and must continue to
2727 /// resolve).
2728 ///
2729 /// Pads the Vec with `None` slots up to `target_id` if needed.
2730 /// Errors when the target slot is already occupied (would
2731 /// stomp another segment), the parse fails, or `target_id`
2732 /// exceeds `u32::MAX`.
2733 pub fn load_segment_bytes_at(
2734 &mut self,
2735 target_id: u32,
2736 bytes: Vec<u8>,
2737 ) -> Result<(), StorageError> {
2738 let seg = OwnedSegment::from_bytes(bytes)
2739 .map_err(|e| StorageError::Corrupt(format!("cold segment parse failed: {e}")))?;
2740 let idx = target_id as usize;
2741 while self.cold_segments.len() <= idx {
2742 self.cold_segments.push(None);
2743 }
2744 if self.cold_segments[idx].is_some() {
2745 return Err(StorageError::Corrupt(format!(
2746 "load_segment_bytes_at: segment_id {target_id} already occupied"
2747 )));
2748 }
2749 self.cold_segments[idx] = Some(Arc::new(seg));
2750 Ok(())
2751 }
2752
2753 /// v6.7.3 — retire a cold-tier segment slot (compaction-driven).
2754 /// The physical file is the caller's concern (typically kept
2755 /// on disk until the next CHECKPOINT writes a manifest that
2756 /// no longer lists it); this just flips the in-memory slot
2757 /// to `None` so later cold lookups for `segment_id` resolve
2758 /// as "unknown" instead of returning a stale row.
2759 ///
2760 /// No-op when the slot is already `None`. Errors only when
2761 /// `segment_id` is out of bounds.
2762 pub fn tombstone_segment(&mut self, segment_id: u32) -> Result<(), StorageError> {
2763 let idx = segment_id as usize;
2764 if idx >= self.cold_segments.len() {
2765 return Err(StorageError::Corrupt(format!(
2766 "tombstone_segment: segment_id {segment_id} out of bounds (len={})",
2767 self.cold_segments.len()
2768 )));
2769 }
2770 self.cold_segments[idx] = None;
2771 Ok(())
2772 }
2773
2774 /// Number of *active* (non-tombstoned) cold segments.
2775 #[must_use]
2776 pub fn cold_segment_count(&self) -> usize {
2777 self.cold_segments.iter().filter(|s| s.is_some()).count()
2778 }
2779
2780 /// Slot count including tombstones (= the next id the
2781 /// no-arg `load_segment_bytes` would allocate).
2782 #[must_use]
2783 pub fn cold_segment_slot_count(&self) -> usize {
2784 self.cold_segments.len()
2785 }
2786
2787 /// v6.2.7 — list every *active* cold-tier segment id known to
2788 /// this catalog (skips compaction tombstones since v6.7.3).
2789 /// Used by EXPLAIN ANALYZE to annotate scan nodes with the
2790 /// segments they could have walked.
2791 #[must_use]
2792 pub fn cold_segment_ids_global(&self) -> Vec<u32> {
2793 self.cold_segments
2794 .iter()
2795 .enumerate()
2796 .filter_map(|(i, s)| s.as_ref().map(|_| i as u32))
2797 .collect()
2798 }
2799
2800 /// v5.2.1: sum of `Table::hot_bytes` across every table. The v5.2
2801 /// freezer compares this against `SPG_HOT_TIER_BYTES` (parsed at
2802 /// server startup; default 4 GiB) and wakes when the budget is
2803 /// crossed. Pre-freezer (v5.2.1) this is measurement-only — the
2804 /// counter exposes whether the budget is being approached without
2805 /// triggering any demotion.
2806 #[must_use]
2807 pub fn hot_tier_bytes(&self) -> u64 {
2808 self.tables
2809 .iter()
2810 .map(Table::hot_bytes)
2811 .fold(0u64, u64::saturating_add)
2812 }
2813
2814 /// v5.2.2: freeze the **first** `max_rows` rows of `table_name`'s
2815 /// hot tier into a brand-new cold-tier segment. The named `BTree`
2816 /// index supplies the per-row PK (its column must be an integer
2817 /// type — v5.2.2 only supports `IndexKey::Int` PKs, matching the
2818 /// `index_key_as_u64` constraint used by the cold-tier lookup
2819 /// path). On success returns a [`FreezeReport`] with the
2820 /// freshly-allocated segment id, the count of rows that moved,
2821 /// the encoded segment bytes (so the caller can persist them to
2822 /// disk for later reload via `SPG_PRELOAD_COLD_SEGMENT`), and the
2823 /// hot-tier byte delta that was reclaimed.
2824 ///
2825 /// **Semantics**:
2826 /// 1. The first `max_rows` rows (by hot-tier position — same as
2827 /// insertion order under v4.39 `PersistentVec`) are read.
2828 /// 2. Rows are sorted ascending by PK and serialised into a new
2829 /// segment via [`encode_segment`].
2830 /// 3. The hot rows are dropped via [`Table::delete_rows`]; the
2831 /// `rebuild_indices` it triggers regenerates `Hot` locators
2832 /// for every remaining row (their positions shift down by
2833 /// `max_rows`). Existing `Cold` locators in this index — from
2834 /// a previous freeze — are also rebuilt **but with empty
2835 /// payload** since rebuild reads only `self.rows`; this
2836 /// routine re-registers them at the end of the call so the
2837 /// user-visible state preserves all prior cold locators.
2838 /// 4. The new segment is loaded into `self.cold_segments` via
2839 /// [`Catalog::load_segment_bytes`] (allocating a fresh
2840 /// `segment_id`). New `Cold` locators are registered on the
2841 /// named index — one per frozen row.
2842 ///
2843 /// **v5.2.2 limits** (relaxed in later sub-versions):
2844 /// - INSERT-only flow: subsequent UPDATE/DELETE on a frozen row
2845 /// returns a stale-locator error (no promote-on-write until
2846 /// v5.2.3).
2847 /// - Single-table scope: callers iterate tables themselves.
2848 /// - All-or-nothing: returns `Err` and leaves catalog unchanged
2849 /// if any step fails before the atomic swap point.
2850 ///
2851 /// Errors:
2852 /// - [`StorageError::Corrupt`] for missing table/index, non-`BTree`
2853 /// index, non-integer PK column, `max_rows == 0`, or
2854 /// `max_rows > row_count`.
2855 /// - The encoder's [`SegmentError`] surfaces as `Corrupt` (the
2856 /// only realistic source is "a single row is larger than the
2857 /// page size"; SPG schemas don't hit it in practice).
2858 pub fn freeze_oldest_to_cold(
2859 &mut self,
2860 table_name: &str,
2861 index_name: &str,
2862 max_rows: usize,
2863 ) -> Result<FreezeReport, StorageError> {
2864 // --- validation phase: never mutates ---------------------
2865 if max_rows == 0 {
2866 return Err(StorageError::Corrupt(
2867 "freeze_oldest_to_cold: max_rows must be > 0".into(),
2868 ));
2869 }
2870 let table = self.get(table_name).ok_or_else(|| {
2871 StorageError::Corrupt(format!(
2872 "freeze_oldest_to_cold: table {table_name:?} not found"
2873 ))
2874 })?;
2875 if max_rows > table.rows.len() {
2876 return Err(StorageError::Corrupt(format!(
2877 "freeze_oldest_to_cold: max_rows {max_rows} > row_count {}",
2878 table.rows.len()
2879 )));
2880 }
2881 let idx = table
2882 .indices
2883 .iter()
2884 .find(|i| i.name == index_name)
2885 .ok_or_else(|| {
2886 StorageError::Corrupt(format!(
2887 "freeze_oldest_to_cold: index {index_name:?} not found on {table_name:?}"
2888 ))
2889 })?;
2890 if !matches!(idx.kind, IndexKind::BTree(_)) {
2891 return Err(StorageError::Corrupt(format!(
2892 "freeze_oldest_to_cold: index {index_name:?} is NSW; only BTree indices may freeze"
2893 )));
2894 }
2895 let column_position = idx.column_position;
2896
2897 // --- segment build phase: reads only --------------------
2898 let schema = table.schema.clone();
2899 let mut to_freeze: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(max_rows);
2900 for row_idx in 0..max_rows {
2901 let row = table.rows.get(row_idx).expect("bounds-checked above");
2902 let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
2903 StorageError::Corrupt(format!(
2904 "freeze_oldest_to_cold: row {row_idx} has NULL / non-key value in index column"
2905 ))
2906 })?;
2907 let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
2908 StorageError::Corrupt(format!(
2909 "freeze_oldest_to_cold: index {index_name:?} column type is non-integer; \
2910 v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
2911 ))
2912 })?;
2913 to_freeze.push((pk_u64, encode_row_body_dense(row, &schema), key));
2914 }
2915 // encode_segment requires ascending u64 keys. Sort by PK
2916 // before encoding; the caller's row-position order is not
2917 // necessarily PK order (e.g. workloads that insert random
2918 // PKs).
2919 to_freeze.sort_by_key(|(k, _, _)| *k);
2920 // Reject duplicate PKs — encode_segment also rejects them
2921 // (`SegmentError::UnsortedKey`), but the resulting error
2922 // message there is misleading. Surface a clearer one.
2923 for w in to_freeze.windows(2) {
2924 if w[0].0 == w[1].0 {
2925 return Err(StorageError::Corrupt(format!(
2926 "freeze_oldest_to_cold: duplicate PK {} in freeze batch",
2927 w[0].0
2928 )));
2929 }
2930 }
2931 // Snapshot the (key, locator) pairs that will be registered
2932 // post-swap. Cloning the IndexKey out before the move makes
2933 // the registration loop borrow-free.
2934 let post_swap_keys: Vec<IndexKey> = to_freeze.iter().map(|(_, _, k)| k.clone()).collect();
2935 // Segment encode is now infallible w.r.t. ordering. Map the
2936 // `SegmentError` into a `StorageError::Corrupt` so the
2937 // public surface stays one error type.
2938 let seg_rows: Vec<(u64, Vec<u8>)> = to_freeze
2939 .into_iter()
2940 .map(|(k, body, _)| (k, body))
2941 .collect();
2942 let frozen_rows = seg_rows.len();
2943 let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
2944 .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: encode: {e}")))?;
2945
2946 // --- atomic swap phase: mutations only past this point ---
2947 // v5.2.3 made `Table::rebuild_indices` preserve every Cold
2948 // locator across the per-table rebuild, so `delete_rows`
2949 // below no longer wipes prior-freeze cold entries. The pre-
2950 // v5.2.3 capture-then-re-register that used to live here
2951 // was removed in v5.3.1 — keeping it would double-count
2952 // every prior-frozen key's Cold locator on each subsequent
2953 // freeze.
2954 let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
2955 let positions: Vec<usize> = (0..max_rows).collect();
2956 let t_mut = self
2957 .get_mut(table_name)
2958 .expect("just validated; still present");
2959 let removed = t_mut.delete_rows(&positions);
2960 debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
2961 let bytes_after = t_mut.hot_bytes();
2962 let bytes_freed = bytes_before.saturating_sub(bytes_after);
2963
2964 let segment_id = self
2965 .load_segment_bytes(seg_bytes.clone())
2966 .map_err(|e| StorageError::Corrupt(format!("freeze_oldest_to_cold: load: {e}")))?;
2967 let new_cold = post_swap_keys.into_iter().map(|k| {
2968 (
2969 k,
2970 RowLocator::Cold {
2971 segment_id,
2972 page_offset: 0,
2973 },
2974 )
2975 });
2976 let t_mut = self.get_mut(table_name).expect("still present");
2977 t_mut.register_cold_locators(index_name, new_cold)?;
2978
2979 Ok(FreezeReport {
2980 segment_id,
2981 frozen_rows,
2982 bytes_freed,
2983 segment_bytes: seg_bytes,
2984 })
2985 }
2986
2987 /// v5.1: borrow the cold segment at `segment_id`. Used by the
2988 /// spg-server preload path to enumerate (key, locator) pairs
2989 /// after loading a segment, so it can call
2990 /// [`Table::register_cold_locators`] without re-parsing the
2991 /// bytes.
2992 #[must_use]
2993 pub fn cold_segment(&self, segment_id: u32) -> Option<&OwnedSegment> {
2994 self.cold_segments
2995 .get(segment_id as usize)
2996 .and_then(|s| s.as_deref())
2997 }
2998
2999 /// v5.1: resolve a single `RowLocator::Cold` to its underlying
3000 /// `Row`. Decoupled from [`Catalog::lookup_by_pk`] so callers
3001 /// iterating a multi-locator slice (e.g. the engine's index
3002 /// seek path) can dispatch per locator instead of getting back
3003 /// only the first row for a key. Returns `None` when the
3004 /// segment isn't registered, the key isn't `u64`-coercible, or
3005 /// the segment doesn't actually carry the key (bloom or page-
3006 /// index reject).
3007 pub fn resolve_cold_locator(
3008 &self,
3009 table_name: &str,
3010 segment_id: u32,
3011 key: &IndexKey,
3012 ) -> Option<Row> {
3013 let t = self.get(table_name)?;
3014 let u64_key = index_key_as_u64(key)?;
3015 let seg = self.cold_segments.get(segment_id as usize)?.as_ref()?;
3016 let payload = seg.lookup(u64_key)?;
3017 let (row, _) = decode_row_body_dense(&payload, &t.schema, seg.codec_version()).ok()?;
3018 Some(row)
3019 }
3020
3021 /// v5.1: indexed PK lookup that dispatches per locator,
3022 /// returning the first matching row from either the hot tier
3023 /// (`Table::rows`) or a registered cold segment.
3024 ///
3025 /// The cold path requires the index column to be coercible to
3026 /// a `u64` (the segment's PK type) and the segment payload to
3027 /// be a [`encode_row_body_dense`]-encoded row body for the
3028 /// same schema. v5.1 ships this for BIGINT / INT / SMALLINT
3029 /// PKs; other types fall through to hot-only behavior.
3030 ///
3031 /// Returns `None` if (a) the table or index doesn't exist,
3032 /// (b) the key isn't in the index at all, or (c) the key was
3033 /// resolved to a stale locator (Hot index out of range, Cold
3034 /// segment id unknown, segment lookup miss). Does not surface
3035 /// segment-decode errors — those would indicate corrupted
3036 /// cold-tier files and should be caught at
3037 /// [`Catalog::load_segment_bytes`] time.
3038 pub fn lookup_by_pk(&self, table: &str, index_name: &str, key: &IndexKey) -> Option<Row> {
3039 let t = self.get(table)?;
3040 let idx = t.indices.iter().find(|i| i.name == index_name)?;
3041 let locators = idx.lookup_eq(key);
3042 let cold_u64_key = index_key_as_u64(key);
3043 for loc in locators {
3044 match *loc {
3045 RowLocator::Hot(i) => {
3046 if let Some(row) = t.rows.get(i) {
3047 return Some(row.clone());
3048 }
3049 }
3050 RowLocator::Cold {
3051 segment_id,
3052 page_offset: _,
3053 } => {
3054 let Some(u64_key) = cold_u64_key else {
3055 // Key type not coercible to u64 — cold tier
3056 // only handles BIGINT/INT/SMALLINT in v5.1.
3057 continue;
3058 };
3059 let Some(seg) = self
3060 .cold_segments
3061 .get(segment_id as usize)
3062 .and_then(|s| s.as_deref())
3063 else {
3064 // v6.7.3 — `None` slot = compaction
3065 // retired this segment; the live locator
3066 // on a freshly-compacted index points to
3067 // the merged segment_id, so a Cold hit
3068 // here against a tombstone means the BTree
3069 // entry hasn't been swapped yet (mid-
3070 // compaction reader race) or the caller is
3071 // looking up a stale snapshot. Skip — the
3072 // next locator in the list, if any, is
3073 // typically the merged segment.
3074 continue;
3075 };
3076 let Some(payload) = seg.lookup(u64_key) else {
3077 continue;
3078 };
3079 let (row, _) =
3080 decode_row_body_dense(&payload, &t.schema, seg.codec_version()).ok()?;
3081 return Some(row);
3082 }
3083 }
3084 }
3085 None
3086 }
3087
3088 /// v5.2.3: promote a frozen row back to the hot tier so an
3089 /// UPDATE / DELETE can mutate it. Reads the cold-tier row body
3090 /// (decoded from its registered segment), pushes it into
3091 /// `table.rows` via [`Table::insert`] (which also adds a fresh
3092 /// `Hot(new_idx)` locator on `index_name`), then retires the
3093 /// shadowed `Cold` locator via
3094 /// [`Table::remove_cold_locators_for_key`]. The cold-tier row
3095 /// in the segment file becomes garbage — recoverable when a
3096 /// future cold-segment compaction job lands.
3097 ///
3098 /// Returns:
3099 /// - `Ok(Some(new_hot_idx))` when the key resolved through a
3100 /// cold locator and the promote completed. `new_hot_idx` is
3101 /// the position the row now occupies in `table.rows`.
3102 /// - `Ok(None)` when the key has no Cold locator on the index
3103 /// (already hot, or wasn't present at all). Callers treat this
3104 /// as "nothing to do here, fall back to the hot-only path".
3105 ///
3106 /// Errors when the table / index doesn't exist, the index isn't
3107 /// `BTree`, the cold segment is missing / can't decode the row,
3108 /// or the inferred row body fails `Table::insert` validation.
3109 pub fn promote_cold_row(
3110 &mut self,
3111 table_name: &str,
3112 index_name: &str,
3113 key: &IndexKey,
3114 ) -> Result<Option<usize>, StorageError> {
3115 let cold_loc = self.find_cold_locator(table_name, index_name, key)?;
3116 let Some((segment_id, _page_offset)) = cold_loc else {
3117 return Ok(None);
3118 };
3119 let u64_key = index_key_as_u64(key).ok_or_else(|| {
3120 StorageError::Corrupt(
3121 "promote_cold_row: key type not coercible to u64 (cold tier requires integer PK)"
3122 .into(),
3123 )
3124 })?;
3125 // Read the row body from the segment. Borrow the segment +
3126 // schema short-term so we can then take `&mut self` for the
3127 // hot-side insert.
3128 let schema = self
3129 .get(table_name)
3130 .ok_or_else(|| {
3131 StorageError::Corrupt(format!("promote_cold_row: table {table_name:?} not found"))
3132 })?
3133 .schema
3134 .clone();
3135 let seg = self
3136 .cold_segments
3137 .get(segment_id as usize)
3138 .and_then(|s| s.as_ref())
3139 .ok_or_else(|| {
3140 StorageError::Corrupt(format!(
3141 "promote_cold_row: segment {segment_id} not registered on catalog"
3142 ))
3143 })?;
3144 let payload = seg.lookup(u64_key).ok_or_else(|| {
3145 StorageError::Corrupt(format!(
3146 "promote_cold_row: key {u64_key} resolves to segment {segment_id} \
3147 but the segment's bloom/page lookup didn't return a row"
3148 ))
3149 })?;
3150 let (row, _consumed) = decode_row_body_dense(&payload, &schema, seg.codec_version())?;
3151 // Insert the promoted row into the hot tier. `Table::insert`
3152 // appends to `self.rows`, adds a `Hot(new_idx)` locator to
3153 // every BTree index covering the row's keyed columns, and
3154 // increments `hot_bytes`.
3155 let t = self
3156 .get_mut(table_name)
3157 .expect("table existed at lookup time");
3158 t.insert(row)?;
3159 let new_hot_idx =
3160 t.rows.len().checked_sub(1).ok_or_else(|| {
3161 StorageError::Corrupt("promote_cold_row: empty after insert".into())
3162 })?;
3163 // The hot insert added Hot(new_idx) alongside the still-
3164 // present Cold locator. Drop the Cold entry so future
3165 // lookups return only the fresh hot row.
3166 t.remove_cold_locators_for_key(index_name, key)?;
3167 Ok(Some(new_hot_idx))
3168 }
3169
3170 /// v5.2.3: shadow a frozen row's index entry. Used by DELETE
3171 /// when the row to remove lives in a cold-tier segment — the
3172 /// row body stays in the segment file (becoming garbage) but
3173 /// every `Cold` locator for `key` on `index_name` is removed
3174 /// so PK lookups stop returning it.
3175 ///
3176 /// Returns the number of cold locators retired (0 when the key
3177 /// has no cold entries — the DELETE fell on a hot row or a
3178 /// key that was already absent). Errors when the table /
3179 /// index doesn't exist or the index isn't `BTree`.
3180 ///
3181 /// Cold-segment compaction (which merges shadowed-heavy
3182 /// segments and reclaims their disk footprint) lands in a
3183 /// later v5.x sub-version; until then, repeated UPDATE/DELETE
3184 /// of cold rows can amplify cold-segment disk usage by up to
3185 /// 1-2× — still well under typical LSM-tree shadowing because
3186 /// SPG segments are bulk-baked, not write-merged.
3187 pub fn shadow_cold_row(
3188 &mut self,
3189 table_name: &str,
3190 index_name: &str,
3191 key: &IndexKey,
3192 ) -> Result<usize, StorageError> {
3193 let t = self.get_mut(table_name).ok_or_else(|| {
3194 StorageError::Corrupt(format!("shadow_cold_row: table {table_name:?} not found"))
3195 })?;
3196 t.remove_cold_locators_for_key(index_name, key)
3197 }
3198
3199 /// v6.7.4 — read-only slice preparation for the parallel
3200 /// freezer. Walks rows in `row_range`, builds the
3201 /// `(pk_u64, encoded_body, IndexKey)` triples that the
3202 /// coordinator's k-way merge consumes, sorts the slice by
3203 /// `pk_u64`, and returns a [`FreezeSlice`].
3204 ///
3205 /// Caller invariants:
3206 /// - `row_range.end <= table.rows.len()` (caller's job to
3207 /// compute the partition).
3208 /// - All slices passed to `commit_freeze_slices` must cover a
3209 /// contiguous half-open range `[0, total_max_rows)` with no
3210 /// gaps and no overlaps. The coordinator validates this
3211 /// invariant before committing.
3212 ///
3213 /// `&self`-only — multiple workers can run this concurrently
3214 /// against the same `Catalog` reference under the engine's
3215 /// write lock (workers don't mutate; the coordinator does).
3216 pub fn prepare_freeze_slice(
3217 &self,
3218 table_name: &str,
3219 index_name: &str,
3220 row_range: core::ops::Range<usize>,
3221 ) -> Result<FreezeSlice, StorageError> {
3222 let table = self.get(table_name).ok_or_else(|| {
3223 StorageError::Corrupt(format!(
3224 "prepare_freeze_slice: table {table_name:?} not found"
3225 ))
3226 })?;
3227 let idx = table
3228 .indices
3229 .iter()
3230 .find(|i| i.name == index_name)
3231 .ok_or_else(|| {
3232 StorageError::Corrupt(format!(
3233 "prepare_freeze_slice: index {index_name:?} not found on {table_name:?}"
3234 ))
3235 })?;
3236 if !matches!(idx.kind, IndexKind::BTree(_)) {
3237 return Err(StorageError::Corrupt(format!(
3238 "prepare_freeze_slice: index {index_name:?} is NSW; only BTree indices may freeze"
3239 )));
3240 }
3241 if row_range.end > table.rows.len() {
3242 return Err(StorageError::Corrupt(format!(
3243 "prepare_freeze_slice: row_range end {} > row_count {}",
3244 row_range.end,
3245 table.rows.len()
3246 )));
3247 }
3248 let column_position = idx.column_position;
3249 let schema = table.schema.clone();
3250 let mut rows: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(row_range.len());
3251 for row_idx in row_range.clone() {
3252 let row = table.rows.get(row_idx).expect("bounds-checked above");
3253 let key = IndexKey::from_value(&row.values[column_position]).ok_or_else(|| {
3254 StorageError::Corrupt(format!(
3255 "prepare_freeze_slice: row {row_idx} has NULL / non-key value in index column"
3256 ))
3257 })?;
3258 let pk_u64 = index_key_as_u64(&key).ok_or_else(|| {
3259 StorageError::Corrupt(format!(
3260 "prepare_freeze_slice: index {index_name:?} column type is non-integer; \
3261 v5.2.2 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
3262 ))
3263 })?;
3264 rows.push((pk_u64, encode_row_body_dense(row, &schema), key));
3265 }
3266 rows.sort_by_key(|(k, _, _)| *k);
3267 Ok(FreezeSlice { row_range, rows })
3268 }
3269
3270 /// v6.7.4 — coordinator commit step. Merges N
3271 /// [`FreezeSlice`]s into one segment via the standard
3272 /// [`encode_segment`] path, atomically swaps the catalog
3273 /// state (delete the union row range + register Cold
3274 /// locators + load the segment).
3275 ///
3276 /// Validates that the slices cover a contiguous, gap-free,
3277 /// overlap-free half-open range starting at index 0 (the
3278 /// freezer always freezes "oldest first" — same semantics as
3279 /// the single-threaded [`Catalog::freeze_oldest_to_cold`]).
3280 ///
3281 /// Empty `slices` → no-op success (returns a zero-row report
3282 /// without mutating). Total row count = `Σ slice.rows.len()`.
3283 pub fn commit_freeze_slices(
3284 &mut self,
3285 table_name: &str,
3286 index_name: &str,
3287 slices: Vec<FreezeSlice>,
3288 ) -> Result<FreezeReport, StorageError> {
3289 // --- validation phase: never mutates ---------------------
3290 let table = self.get(table_name).ok_or_else(|| {
3291 StorageError::Corrupt(format!(
3292 "commit_freeze_slices: table {table_name:?} not found"
3293 ))
3294 })?;
3295 let idx = table
3296 .indices
3297 .iter()
3298 .find(|i| i.name == index_name)
3299 .ok_or_else(|| {
3300 StorageError::Corrupt(format!(
3301 "commit_freeze_slices: index {index_name:?} not found on {table_name:?}"
3302 ))
3303 })?;
3304 if !matches!(idx.kind, IndexKind::BTree(_)) {
3305 return Err(StorageError::Corrupt(format!(
3306 "commit_freeze_slices: index {index_name:?} is NSW; only BTree indices may freeze"
3307 )));
3308 }
3309 // Validate slice coverage: contiguous from 0, no gaps, no
3310 // overlaps. Allow the caller to pass slices in any order —
3311 // sort by row_range.start first.
3312 let mut ordered = slices;
3313 ordered.sort_by_key(|s| s.row_range.start);
3314 // Drop fully-empty slices that fell out of an uneven
3315 // partition; they carry no data but contribute to the
3316 // contiguity check, so keep them in line.
3317 let mut expected_start = 0usize;
3318 for s in &ordered {
3319 if s.row_range.start != expected_start {
3320 return Err(StorageError::Corrupt(format!(
3321 "commit_freeze_slices: gap/overlap at row {}; expected start {}",
3322 s.row_range.start, expected_start
3323 )));
3324 }
3325 expected_start = s.row_range.end;
3326 }
3327 let max_rows = expected_start;
3328 if max_rows > table.rows.len() {
3329 return Err(StorageError::Corrupt(format!(
3330 "commit_freeze_slices: total row range {} exceeds row_count {}",
3331 max_rows,
3332 table.rows.len()
3333 )));
3334 }
3335 if max_rows == 0 {
3336 return Ok(FreezeReport {
3337 segment_id: u32::MAX,
3338 frozen_rows: 0,
3339 bytes_freed: 0,
3340 segment_bytes: Vec::new(),
3341 });
3342 }
3343
3344 // --- segment build phase: reads only --------------------
3345 // K-way merge of already-sorted slices. Each slice's rows
3346 // are ascending by pk_u64; we keep a per-slice cursor and
3347 // pull the next-smallest head until every cursor drains.
3348 let total_rows: usize = ordered.iter().map(|s| s.rows.len()).sum();
3349 if total_rows != max_rows {
3350 return Err(StorageError::Corrupt(format!(
3351 "commit_freeze_slices: total slice rows {total_rows} ≠ row_range coverage {max_rows}"
3352 )));
3353 }
3354 let mut cursors: Vec<usize> = alloc::vec![0; ordered.len()];
3355 let mut merged: Vec<(u64, Vec<u8>, IndexKey)> = Vec::with_capacity(total_rows);
3356 loop {
3357 // Pick the slice whose head row has the smallest key
3358 // and isn't yet exhausted.
3359 let mut pick: Option<usize> = None;
3360 for (i, c) in cursors.iter().enumerate() {
3361 let slice = &ordered[i];
3362 if *c >= slice.rows.len() {
3363 continue;
3364 }
3365 match pick {
3366 None => pick = Some(i),
3367 Some(j) => {
3368 if slice.rows[*c].0 < ordered[j].rows[cursors[j]].0 {
3369 pick = Some(i);
3370 }
3371 }
3372 }
3373 }
3374 let Some(i) = pick else { break };
3375 let row = ordered[i].rows[cursors[i]].clone();
3376 cursors[i] += 1;
3377 merged.push(row);
3378 }
3379 // Reject duplicate PKs — same error as the single-threaded
3380 // path so callers get a uniform surface.
3381 for w in merged.windows(2) {
3382 if w[0].0 == w[1].0 {
3383 return Err(StorageError::Corrupt(format!(
3384 "commit_freeze_slices: duplicate PK {} across slices",
3385 w[0].0
3386 )));
3387 }
3388 }
3389 let post_swap_keys: Vec<IndexKey> = merged.iter().map(|(_, _, k)| k.clone()).collect();
3390 let seg_rows: Vec<(u64, Vec<u8>)> =
3391 merged.into_iter().map(|(k, body, _)| (k, body)).collect();
3392 let frozen_rows = seg_rows.len();
3393 let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
3394 .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: encode: {e}")))?;
3395
3396 // --- atomic swap phase: mutations only past this point ---
3397 let bytes_before = self.get(table_name).expect("just validated").hot_bytes();
3398 let positions: Vec<usize> = (0..max_rows).collect();
3399 let t_mut = self
3400 .get_mut(table_name)
3401 .expect("just validated; still present");
3402 let removed = t_mut.delete_rows(&positions);
3403 debug_assert_eq!(removed, max_rows, "delete_rows count matches request");
3404 let bytes_after = t_mut.hot_bytes();
3405 let bytes_freed = bytes_before.saturating_sub(bytes_after);
3406
3407 let segment_id = self
3408 .load_segment_bytes(seg_bytes.clone())
3409 .map_err(|e| StorageError::Corrupt(format!("commit_freeze_slices: load: {e}")))?;
3410 let new_cold = post_swap_keys.into_iter().map(|k| {
3411 (
3412 k,
3413 RowLocator::Cold {
3414 segment_id,
3415 page_offset: 0,
3416 },
3417 )
3418 });
3419 let t_mut = self.get_mut(table_name).expect("still present");
3420 t_mut.register_cold_locators(index_name, new_cold)?;
3421
3422 Ok(FreezeReport {
3423 segment_id,
3424 frozen_rows,
3425 bytes_freed,
3426 segment_bytes: seg_bytes,
3427 })
3428 }
3429
3430 /// v6.7.3 — compact every cold segment on `(table, index)` whose
3431 /// `OwnedSegment::bytes().len()` is below `target_segment_bytes`
3432 /// into a single larger merged segment. Rows present in source
3433 /// segment payloads but no longer referenced by any
3434 /// `RowLocator::Cold` on the index (DELETE'd + frozen rows
3435 /// retired via [`Catalog::shadow_cold_row`]) are GC'd in the
3436 /// merge.
3437 ///
3438 /// **Semantics**:
3439 /// 1. Walk the BTree index to collect every Cold locator that
3440 /// targets a small (< threshold) segment. Each such
3441 /// `(key, segment_id)` becomes a row in the merged segment;
3442 /// payload is looked up from the source segment in-place.
3443 /// 2. Encode the collected rows into one new segment via
3444 /// [`encode_segment`]; register it via
3445 /// [`Catalog::load_segment_bytes`] (allocating a fresh
3446 /// `merged_segment_id` at the end of `cold_segments`).
3447 /// 3. Rewrite the BTree index in one pass: every
3448 /// `RowLocator::Cold { segment_id ∈ sources }` becomes
3449 /// `RowLocator::Cold { segment_id = merged_id, page_offset = 0 }`.
3450 /// Hot locators are untouched.
3451 /// 4. Tombstone every source slot via
3452 /// [`Catalog::tombstone_segment`]. Source segment payloads
3453 /// are no longer reachable through the catalog; the on-disk
3454 /// files are the caller's concern.
3455 ///
3456 /// On fewer than 2 candidate segments the catalog is **not**
3457 /// mutated and a no-op report (`merged_segment_id: None`,
3458 /// `sources: []`) is returned. This is the routine case — a
3459 /// freshly-frozen table has at most 1 small segment, no merge
3460 /// possible.
3461 ///
3462 /// Atomicity: every mutating step runs after the read-only
3463 /// gather phase, so a panic before the merge encode leaves the
3464 /// catalog unchanged. The mutation block itself (load + rewrite +
3465 /// tombstone) takes only `&mut self` — callers serialise the
3466 /// engine write lock outside this function.
3467 ///
3468 /// Errors when the table / index doesn't exist, the index isn't
3469 /// `BTree`, the index column type isn't u64-coercible (cold-tier
3470 /// pre-condition), or a source segment fails its in-place
3471 /// row-body lookup (would indicate prior catalog corruption).
3472 pub fn compact_cold_segments(
3473 &mut self,
3474 table_name: &str,
3475 index_name: &str,
3476 target_segment_bytes: u64,
3477 ) -> Result<CompactReport, StorageError> {
3478 // --- validation phase ----------------------------------
3479 let t = self.get(table_name).ok_or_else(|| {
3480 StorageError::Corrupt(format!(
3481 "compact_cold_segments: table {table_name:?} not found"
3482 ))
3483 })?;
3484 let idx = t
3485 .indices
3486 .iter()
3487 .find(|i| i.name == index_name)
3488 .ok_or_else(|| {
3489 StorageError::Corrupt(format!(
3490 "compact_cold_segments: index {index_name:?} not found on {table_name:?}"
3491 ))
3492 })?;
3493 let map = match &idx.kind {
3494 IndexKind::BTree(m) => m,
3495 IndexKind::Nsw(_)
3496 | IndexKind::Brin { .. }
3497 | IndexKind::Gin(_)
3498 | IndexKind::GinTrgm(_)
3499 | IndexKind::GinFulltext(_) => {
3500 return Err(StorageError::Corrupt(format!(
3501 "compact_cold_segments: index {index_name:?} is not BTree; \
3502 compaction applies only to BTree cold-tier indices"
3503 )));
3504 }
3505 };
3506
3507 // --- gather phase --------------------------------------
3508 // Step A: every segment_id this BTree index Cold-references.
3509 let mut referenced_ids: BTreeSet<u32> = BTreeSet::new();
3510 for (_key, locators) in map.iter() {
3511 for loc in locators {
3512 if let RowLocator::Cold { segment_id, .. } = loc {
3513 referenced_ids.insert(*segment_id);
3514 }
3515 }
3516 }
3517 // Step B: keep only the small + still-active ones.
3518 let candidate_set: BTreeSet<u32> = referenced_ids
3519 .into_iter()
3520 .filter(|id| {
3521 self.cold_segments
3522 .get(*id as usize)
3523 .and_then(|s| s.as_deref())
3524 .is_some_and(|s| (s.bytes().len() as u64) < target_segment_bytes)
3525 })
3526 .collect();
3527 if candidate_set.len() < 2 {
3528 return Ok(CompactReport {
3529 sources: Vec::new(),
3530 merged_segment_id: None,
3531 merged_segment_bytes: Vec::new(),
3532 merged_rows: 0,
3533 deleted_rows_pruned: 0,
3534 bytes_reclaimed_estimate: 0,
3535 });
3536 }
3537 // Step C: pre-count source rows for the deleted-pruned metric.
3538 let mut source_row_count: usize = 0;
3539 let mut source_byte_total: u64 = 0;
3540 for &id in &candidate_set {
3541 let seg = self.cold_segments[id as usize]
3542 .as_ref()
3543 .expect("candidate selected only when slot is Some");
3544 source_row_count = source_row_count.saturating_add(seg.meta().num_rows as usize);
3545 source_byte_total = source_byte_total.saturating_add(seg.bytes().len() as u64);
3546 }
3547 // Step D: collect (key, body) pairs from every live Cold
3548 // locator pointing at a candidate. dedupe by key — one
3549 // BTree key resolves to at most one cold payload (the
3550 // freezer + promote/shadow flow keeps Cold locators
3551 // unique per key).
3552 let mut collected: BTreeMap<u64, (Vec<u8>, IndexKey)> = BTreeMap::new();
3553 for (key, locators) in map.iter() {
3554 for loc in locators {
3555 let RowLocator::Cold { segment_id, .. } = loc else {
3556 continue;
3557 };
3558 if !candidate_set.contains(segment_id) {
3559 continue;
3560 }
3561 let u64_key = index_key_as_u64(key).ok_or_else(|| {
3562 StorageError::Corrupt(format!(
3563 "compact_cold_segments: index {index_name:?} has non-integer Cold key; \
3564 cold tier requires IndexKey::Int (Text PK lands in v5.5+)"
3565 ))
3566 })?;
3567 let seg = self.cold_segments[*segment_id as usize]
3568 .as_ref()
3569 .expect("candidate slot guaranteed Some above");
3570 let payload = seg.lookup(u64_key).ok_or_else(|| {
3571 StorageError::Corrupt(format!(
3572 "compact_cold_segments: BTree {index_name:?} points key={u64_key} \
3573 at segment {segment_id} but the segment lookup missed"
3574 ))
3575 })?;
3576 collected.insert(u64_key, (payload, key.clone()));
3577 break;
3578 }
3579 }
3580 let merged_rows = collected.len();
3581 let deleted_rows_pruned = source_row_count.saturating_sub(merged_rows);
3582
3583 // Step E: encode the merged segment. `BTreeMap<u64, _>`
3584 // iteration is ascending by key, which is what
3585 // `encode_segment` requires.
3586 let seg_rows: Vec<(u64, Vec<u8>)> = collected
3587 .iter()
3588 .map(|(k, (body, _))| (*k, body.clone()))
3589 .collect();
3590 let (seg_bytes, _meta) = encode_segment(seg_rows.into_iter(), 0.01, SEGMENT_PAGE_BYTES)
3591 .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: encode: {e}")))?;
3592 let merged_bytes_len = seg_bytes.len() as u64;
3593
3594 // --- atomic mutation phase ------------------------------
3595 let merged_segment_id = self
3596 .load_segment_bytes(seg_bytes.clone())
3597 .map_err(|e| StorageError::Corrupt(format!("compact_cold_segments: load: {e}")))?;
3598
3599 // Rewrite the BTree index: every Cold locator pointing at
3600 // a candidate source becomes a Cold locator pointing at
3601 // the merged segment. Use a flat collect-then-replace
3602 // pattern so we never hold a `&self` borrow across the
3603 // `&mut self` write.
3604 let entries: Vec<(IndexKey, Vec<RowLocator>)> = {
3605 let t = self
3606 .get(table_name)
3607 .expect("table existed at the start of this fn");
3608 let idx = t
3609 .indices
3610 .iter()
3611 .find(|i| i.name == index_name)
3612 .expect("index existed at the start of this fn");
3613 let IndexKind::BTree(map) = &idx.kind else {
3614 unreachable!("validated above");
3615 };
3616 map.iter().map(|(k, v)| (k.clone(), v.clone())).collect()
3617 };
3618 let t_mut = self
3619 .get_mut(table_name)
3620 .expect("table existed at the start of this fn");
3621 let idx_mut = t_mut
3622 .indices
3623 .iter_mut()
3624 .find(|i| i.name == index_name)
3625 .expect("index existed at the start of this fn");
3626 let IndexKind::BTree(map_mut) = &mut idx_mut.kind else {
3627 unreachable!("validated above");
3628 };
3629 for (key, locators) in entries {
3630 let mut new_locs: Vec<RowLocator> = Vec::with_capacity(locators.len());
3631 let mut changed = false;
3632 for loc in &locators {
3633 match *loc {
3634 RowLocator::Cold {
3635 segment_id,
3636 page_offset: _,
3637 } if candidate_set.contains(&segment_id) => {
3638 let replacement = RowLocator::Cold {
3639 segment_id: merged_segment_id,
3640 page_offset: 0,
3641 };
3642 if !new_locs.contains(&replacement) {
3643 new_locs.push(replacement);
3644 }
3645 changed = true;
3646 }
3647 other => new_locs.push(other),
3648 }
3649 }
3650 if changed {
3651 map_mut.insert_mut(key, new_locs);
3652 }
3653 }
3654
3655 // Tombstone every source slot. Last step — failures here
3656 // would leave the segment double-referenced in both
3657 // memory + manifest, but `tombstone_segment` only errors
3658 // on out-of-bounds, which we've already validated.
3659 for &id in &candidate_set {
3660 self.tombstone_segment(id)?;
3661 }
3662
3663 let bytes_reclaimed_estimate = source_byte_total.saturating_sub(merged_bytes_len);
3664 Ok(CompactReport {
3665 sources: candidate_set.into_iter().collect(),
3666 merged_segment_id: Some(merged_segment_id),
3667 merged_segment_bytes: seg_bytes,
3668 merged_rows,
3669 deleted_rows_pruned,
3670 bytes_reclaimed_estimate,
3671 })
3672 }
3673
3674 /// Internal helper: scan `(table, index)` for a `Cold` locator
3675 /// keyed by `key`. Returns `Ok(Some((segment_id, page_offset)))`
3676 /// when found, `Ok(None)` when the key has only hot entries
3677 /// or no entries at all, `Err` on the same input-validation
3678 /// errors as the public `promote_cold_row` / `shadow_cold_row`.
3679 fn find_cold_locator(
3680 &self,
3681 table_name: &str,
3682 index_name: &str,
3683 key: &IndexKey,
3684 ) -> Result<Option<(u32, u32)>, StorageError> {
3685 let t = self.get(table_name).ok_or_else(|| {
3686 StorageError::Corrupt(format!("find_cold_locator: table {table_name:?} not found"))
3687 })?;
3688 let idx = t
3689 .indices
3690 .iter()
3691 .find(|i| i.name == index_name)
3692 .ok_or_else(|| {
3693 StorageError::Corrupt(format!(
3694 "find_cold_locator: index {index_name:?} not found on {table_name:?}"
3695 ))
3696 })?;
3697 if !matches!(idx.kind, IndexKind::BTree(_)) {
3698 return Err(StorageError::Corrupt(format!(
3699 "find_cold_locator: index {index_name:?} is NSW; promote-on-write only applies to BTree indices"
3700 )));
3701 }
3702 for loc in idx.lookup_eq(key) {
3703 if let RowLocator::Cold {
3704 segment_id,
3705 page_offset,
3706 } = *loc
3707 {
3708 return Ok(Some((segment_id, page_offset)));
3709 }
3710 }
3711 Ok(None)
3712 }
3713}
3714
3715/// Coerce an [`IndexKey`] to the `u64` that v5.1 cold-tier
3716/// segments use as their on-disk PK. Returns `None` for keys that
3717/// aren't representable as `u64` — Text PKs need a hash mapping
3718/// the segment writer baked in (deferred to v5.2+), Bool PKs are
3719/// almost never wide enough to be sharded into a cold tier.
3720fn index_key_as_u64(key: &IndexKey) -> Option<u64> {
3721 match key {
3722 // Reinterpret the i64 bit pattern as u64. Cold-tier segments
3723 // are sorted by this u64 view, so the chosen interpretation
3724 // only has to match between insert (bake_segment / freezer)
3725 // and lookup — using cast_unsigned keeps both sides honest
3726 // and silences clippy::cast_sign_loss.
3727 IndexKey::Int(n) => Some(n.cast_unsigned()),
3728 // Text / Bool / Uuid PKs aren't representable as u64 and so
3729 // can't participate in the u64-sorted cold-tier segment
3730 // PK layout. Same deferral story as Text — lookup falls
3731 // through the in-memory btree.
3732 IndexKey::Text(_) | IndexKey::Bool(_) | IndexKey::Uuid(_) => None,
3733 }
3734}
3735
/// Error type returned by the storage layer's fallible operations.
///
/// Marked `#[non_exhaustive]`: downstream `match`es need a wildcard
/// arm so new variants can be added without a breaking change.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum StorageError {
    /// A table with this name already exists.
    DuplicateTable {
        name: String,
    },
    /// No table with this name was found.
    TableNotFound {
        name: String,
    },
    /// A row carried `actual` values where `expected` columns were
    /// required.
    ArityMismatch {
        expected: usize,
        actual: usize,
    },
    /// The value supplied for `column` (at `position`) had type
    /// `actual` where `expected` was required.
    TypeMismatch {
        column: String,
        expected: DataType,
        actual: DataType,
        position: usize,
    },
    /// A NULL value was supplied for a NOT NULL column.
    NullInNotNull {
        column: String,
    },
    /// Index with this name already exists on the table.
    DuplicateIndex {
        name: String,
    },
    /// Column referenced by an index doesn't exist on the table.
    ColumnNotFound {
        column: String,
    },
    /// On-disk format failed to parse — corrupted file, wrong magic, truncated
    /// payload, or unknown tag bytes.
    ///
    /// The cold-tier helpers in this file (`promote_cold_row`,
    /// `commit_freeze_slices`, `compact_cold_segments`, …) also use
    /// this variant for their precondition failures, e.g. a missing
    /// table / index or a non-`BTree` index.
    Corrupt(String),
    /// v6.0.4 — ALTER INDEX targeted an index name that doesn't
    /// exist on any table in this catalog.
    IndexNotFound {
        name: String,
    },
    /// v6.0.4 — operation requested isn't supported on this index
    /// kind / column type (e.g. ALTER INDEX REBUILD on a `BTree`
    /// index, or REBUILD WITH (encoding=…) on a non-vector column).
    Unsupported(String),
}
3779
3780impl fmt::Display for StorageError {
3781 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
3782 match self {
3783 Self::DuplicateTable { name } => write!(f, "table already exists: {name}"),
3784 Self::TableNotFound { name } => write!(f, "table not found: {name}"),
3785 Self::ArityMismatch { expected, actual } => write!(
3786 f,
3787 "row arity mismatch: expected {expected} columns, got {actual}"
3788 ),
3789 Self::TypeMismatch {
3790 column,
3791 expected,
3792 actual,
3793 position,
3794 } => write!(
3795 f,
3796 "type mismatch in column {column:?} (position {position}): expected {expected}, got {actual}"
3797 ),
3798 Self::NullInNotNull { column } => {
3799 write!(f, "NULL value in NOT NULL column {column:?}")
3800 }
3801 Self::DuplicateIndex { name } => write!(f, "index already exists: {name}"),
3802 Self::ColumnNotFound { column } => write!(f, "column not found: {column}"),
3803 Self::Corrupt(detail) => write!(f, "corrupt on-disk format: {detail}"),
3804 Self::IndexNotFound { name } => write!(f, "index not found: {name}"),
3805 Self::Unsupported(detail) => write!(f, "unsupported: {detail}"),
3806 }
3807 }
3808}
3809
3810impl ColumnSchema {
3811 pub fn new(name: impl Into<String>, ty: DataType, nullable: bool) -> Self {
3812 Self {
3813 name: name.into(),
3814 ty,
3815 nullable,
3816 default: None,
3817 runtime_default: None,
3818 auto_increment: false,
3819 user_enum_type: None,
3820 user_domain_type: None,
3821 on_update_runtime: None,
3822 collation: Collation::Binary,
3823 is_unsigned: false,
3824 inline_enum_variants: None,
3825 inline_set_variants: None,
3826 }
3827 }
3828
3829 /// Builder-style helper to attach a default value to an otherwise
3830 /// plain column schema. Used by the engine when CREATE TABLE
3831 /// specifies `column TYPE DEFAULT <expr>`.
3832 #[must_use]
3833 pub fn with_default(mut self, default: Value) -> Self {
3834 self.default = Some(default);
3835 self
3836 }
3837
3838 /// v7.9.21 — builder for runtime-evaluated defaults
3839 /// (`DEFAULT now()`, `DEFAULT CURRENT_TIMESTAMP`, …).
3840 /// `expr` is the Expr's `Display` form, re-parsed by the
3841 /// engine at each INSERT.
3842 #[must_use]
3843 pub fn with_runtime_default(mut self, expr: impl Into<String>) -> Self {
3844 self.runtime_default = Some(expr.into());
3845 self
3846 }
3847
3848 /// Builder-style helper to mark a column as `AUTO_INCREMENT`.
3849 #[must_use]
3850 pub const fn with_auto_increment(mut self) -> Self {
3851 self.auto_increment = true;
3852 self
3853 }
3854}
3855
3856impl TableSchema {
3857 pub fn new(name: impl Into<String>, columns: Vec<ColumnSchema>) -> Self {
3858 Self {
3859 name: name.into(),
3860 columns,
3861 hot_tier_bytes: None,
3862 foreign_keys: Vec::new(),
3863 uniqueness_constraints: Vec::new(),
3864 checks: Vec::new(),
3865 }
3866 }
3867}
3868
3869// =========================================================================
3870// Persistent binary format for the catalog.
3871//
3872// Layout (little-endian throughout):
3873//
3874// [magic "SPGDB001" 8 bytes][version u8]
3875// [table_count u32]
3876// for each table:
3877// [name_len u16][name bytes]
3878// [col_count u16]
3879// for each col:
3880// [name_len u16][name bytes]
3881// [type_tag u8 + optional payload]
3882// 1=Int 2=BigInt 3=Float 4=Text 5=Bool
3883// 6=Vector(u32 dim)
3884// 7=SmallInt
3885// 8=Varchar(u32 max)
3886// 9=Char(u32 size)
3887// 10=Numeric(u8 precision, u8 scale)
3888// 11=Date
3889// 12=Timestamp
3890// [nullable u8] 0/1
3891// [default_tag u8] 0=none 1=value (followed by [value_tag u8] + bytes)
3892// [row_count u32]
3893// for each row, for each col, one [value_tag u8] + value bytes:
3894// tag 0 (Null) → no body
3895// tag 1 (Int) → i32 LE
3896// tag 2 (BigInt) → i64 LE
3897// tag 3 (Float) → f64 LE
3898// tag 4 (Text) → u16 LE len + UTF-8 bytes
3899// tag 5 (Bool) → u8 0/1
3900// tag 6 (Vector) → u32 LE dim + dim×f32 LE
3901// tag 7 (SmallInt) → i16 LE
3902// tag 8 (Numeric) → i128 LE (16 bytes) + u8 scale
3903// tag 9 (Date) → i32 LE (days since Unix epoch)
3904// tag 10 (Timestamp) → i64 LE (microseconds since Unix epoch)
3905//
3906// Bumped to version 3 when NUMERIC was added; to version 4 when
3907// AUTO_INCREMENT (per-column flag) + NSW index `kind` byte landed;
3908// to version 5 when DATE / TIMESTAMP were added; to version 6 when
3909// NSW graph topology started travelling on disk (v2.7); to version 7
3910// when the NSW topology became multi-layer HNSW (v2.13); to version 8
3911// when row encoding switched to schema-driven dense layout (v3.0.2 —
3912// per-row NULL bitmap + per-column fixed-width body, no per-cell type
3913// tag).
3914// =========================================================================
3915
/// 8-byte magic tag `SPGDB001`; the layout comment above places it
/// first in the persistent catalog format, followed by the version
/// byte.
const FILE_MAGIC: &[u8; 8] = b"SPGDB001";
3917/// Current catalog snapshot format version emitted by [`Catalog::serialize`].
3918///
3919/// v9 (v5.2) extends v8 by serialising `BTree` index entries directly — every
3920/// `(IndexKey, Vec<RowLocator>)` pair travels on disk with the v5.1
3921/// `RowLocator::write_le` tag-prefixed codec. v8 `BTree` indices stored no
3922/// entries at all (the map was rebuilt from `Table::rows` on load); v9
3923/// preserves on-disk Cold locators so freezer-produced cold-tier index
/// entries survive a catalog snapshot round-trip. v8 snapshots are still
/// accepted by version dispatch in [`Catalog::deserialize`] — every entry
/// decodes as `RowLocator::Hot(_)` via `add_index` rebuild, identical to
/// v5.1 behaviour.
///
/// v6.7.2 — bumped from 10 to 11 to append per-table
3929/// `hot_tier_bytes: Option<u64>` after the per-table indices
3930/// section. v10 catalogs (v6.7.1) load with `hot_tier_bytes =
3931/// None` for every table (the deserialiser short-circuits when
/// version < 11). v11 snapshots read by a pre-v6.7.2 binary
/// fail loudly at the version check, matching the v6.1.2 /
/// v6.1.4 / v6.2.0 / v6.7.1 envelope-bump upgrade fences.
///
3936/// v6.8.0 — bumped from 11 to 12: per-index
3937/// `included_columns: Vec<u16>` appended at the tail of each
3938/// index payload. v11 (= v6.7.2) catalogs load with
3939/// `included_columns = Vec::new()` for every index — same
3940/// "older readers, append-only extension" pattern as the v6.7.2
3941/// hot_tier_bytes byte.
3942/// v7.13.0 — bumped from 22 to 23. mailrs round-5 G3 / G10.
3943/// Per-table appendix gains two new sections:
3944/// * `checks: Vec<String>` — CHECK predicate sources (Display
3945/// form of the AST Expr); re-parsed on INSERT/UPDATE to
3946/// enforce against candidate rows. Same persistence pattern
3947/// as `Index::partial_predicate`.
3948/// * Per `UniquenessConstraint`: trailing `nulls_not_distinct:
3949/// u8` flag for PG 15+ `UNIQUE NULLS NOT DISTINCT (cols)`
3950/// semantics.
3951/// v22 catalogs deserialise with empty `checks` and every UC
3952/// at `nulls_not_distinct = false`.
3953/// v24 introduces:
3954/// * Index kind tag 4 = trigram-GIN (`gin_trgm_ops`-flavoured
3955/// `USING gin` over a TEXT/VARCHAR column). Payload shape is
3956/// identical to tag-3 GIN (String → Vec<RowLocator>); the
3957/// keys are PG-compatible 3-byte trigram shingles instead of
3958/// tsvector lexemes. v23 catalogs deserialise unchanged — no
3959/// v23 writer ever emitted tag 4.
3960/// v25 introduces:
3961/// * Per `TriggerDef`: trailing `enabled: u8` flag (mailrs
3962/// round-9 A.2.b — `ALTER TABLE … { ENABLE | DISABLE }
3963/// TRIGGER …`). v24 catalogs deserialise with every trigger
3964/// `enabled = true`, matching pre-v7.16.1 behaviour.
3965/// v26 introduces (v7.17.0 Phase 1.1):
3966/// * Trailing SEQUENCE catalog block after triggers. Encoded
3967/// as `u32 count` followed by per-sequence:
3968/// `name`, `data_type: u8` (0=SmallInt,1=Int,2=BigInt),
3969/// `start i64`, `increment i64`, `min_value i64`,
3970/// `max_value i64`, `cache i64`, `cycle u8`,
3971/// `owned_by_tag u8` (0=NONE, 1=Column → `table`,`column`),
3972/// `last_value i64`, `is_called u8`. v25-and-below catalogs
3973/// deserialise with an empty sequences map.
3974/// v27 introduces (v7.17.0 Phase 1.2):
3975/// * Trailing VIEW catalog block after sequences. Encoded as
3976/// `u32 count` followed by per-view:
3977/// `name`, `column_count u16`, then column names, then
3978/// `body` long-string. v26-and-below catalogs deserialise
3979/// with an empty views map.
3980/// v28 introduces (v7.17.0 Phase 1.3):
3981/// * Trailing MATERIALIZED VIEW source registry block after
3982/// views. Encoded as `u32 count` followed by per-entry:
3983/// `name`, `body` long-string. The materialised rows live
3984/// as a regular Table of the same name (already covered by
3985/// the pre-existing tables block). v27-and-below catalogs
3986/// deserialise with an empty map.
3987/// v29 introduces (v7.17.0 Phase 1.4):
3988/// * Per-table user_enum_type appendix (after the CHECK
3989/// appendix). Layout: `u16 count` followed by per-binding
3990/// `[u16 col_pos][str enum_name]`. Only columns whose
3991/// `user_enum_type` is Some land here; the catalog stays
3992/// compact for the common no-enum case.
3993/// * Trailing ENUM types catalog block after materialized
3994/// views. Encoded as `u32 count` followed by per-entry:
3995/// `name`, `u16 label_count`, then `label_count` short
3996/// strings. v28-and-below catalogs deserialise with an
3997/// empty enum_types map and every column's
3998/// `user_enum_type = None`.
3999/// v30 introduces (v7.17.0 Phase 1.5):
4000/// * Per-table user_domain_type appendix (after the
4001/// user_enum_type appendix). Same shape as the enum one.
4002/// * Trailing DOMAIN types catalog block after the enum
4003/// block. Encoded as `u32 count` followed by per-entry:
4004/// `name`, `data_type` byte, `nullable u8`,
4005/// `default_present u8` + optional default string,
4006/// `u16 check_count` then `check_count` Display-form
4007/// CHECK strings. v29-and-below catalogs deserialise with
4008/// an empty domain_types map and `user_domain_type = None`.
4009/// v31 introduces (v7.17.0 Phase 1.6):
4010/// * Trailing user-schemas block after the DOMAIN block.
4011/// Encoded as `u32 count` followed by `count` schema-name
4012/// short strings. Built-in schemas (`public`, `pg_catalog`,
4013/// `information_schema`) are NOT serialised — they're
4014/// hardcoded in `is_builtin_schema`. v30-and-below catalogs
4015/// deserialise with an empty user-schemas set.
4016/// v32 introduces (v7.17.0 Phase 2.1):
4017/// * Per-table on_update_runtime appendix (after the
4018/// user_domain_type appendix). Layout: `u16 count` followed
4019/// by per-binding `[u16 col_pos][str expr_src]`. Only
4020/// columns whose `on_update_runtime` is Some land here;
4021/// the catalog stays compact when no MySQL-shaped table
4022/// uses the attribute. v31-and-below catalogs deserialise
4023/// with every column's `on_update_runtime = None`.
4024/// v33 introduces (v7.17.0 Phase 2.2):
4025/// * Index kind tag 5 = fulltext-GIN (MySQL `FULLTEXT KEY`
4026/// surface over a TEXT / VARCHAR column). Payload shape is
4027/// identical to tag-3 / tag-4 GIN (`String → Vec<RowLocator>`);
4028/// the keys are lower-cased word lexemes (same rule as
4029/// `to_tsvector('simple', text)`). v32 catalogs deserialise
4030/// unchanged — no v32 writer ever emitted tag 5, and FULLTEXT
4031/// KEY was silently dropped pre-v7.17 so no rebuild shim is
4032/// needed for round-tripped catalogs.
4033/// v34 introduces (v7.17.0 Phase 2.5):
4034/// * Per-table collation appendix (after the on_update_runtime
4035/// appendix). Sparse layout: only columns whose `collation`
4036/// is non-Binary land here. `u16 count` then per-binding
4037/// `[u16 col_pos][u8 collation_tag]` where the tag matches
4038/// `Collation::TAG_*`. Snapshots written by v33-and-below
///     writers deserialise every column with `collation =
4040/// Binary`, preserving the prior byte-wise compare
4041/// semantics. Unknown tags read back as Binary too — keeps
4042/// a forward-compat path if a future v35 adds variants
4043/// and someone rolls back to a v34 reader.
4044/// v35 introduces (v7.17.0 Phase 4.4):
4045/// * Per-table is_unsigned appendix (after the collation
4046/// appendix). Sparse layout: only `is_unsigned = true`
4047/// columns land. `u16 count` then per-binding `[u16 col_pos]`.
4048/// v34-and-below catalogs deserialise every column as
4049/// `is_unsigned = false`, preserving the prior silent-
4050/// accept behaviour for negative inserts on UNSIGNED columns.
4051/// v46 introduces (v7.23, mailrs round-14):
4052/// * Escaped short-string codec — `write_str` lengths >= 0xFFFF
4053/// emit `[u16 0xFFFF][u32 real_len]` so TEXT cells (mail bodies,
4054/// document text) above 64 KiB encode instead of panicking.
4055/// One-way upgrade: v45-and-below readers reject v46 catalogs
4056/// loudly via the version gate; v46 readers decode v45 catalogs
4057/// with the plain-u16 rules (0xFFFF is a legitimate length
4058/// there).
4059/// v47 introduces (v7.27, mailrs round-21):
4060/// * Escaped lengths for the REMAINING u16-length cell payloads —
4061/// BYTEA cells, TEXT[] elements, tsvector lexemes and tsquery
4062/// terms — the same `[u16 0xFFFF][u32 real_len]` escape v46
4063/// gave short strings. Round-14 fixed TEXT and missed these;
4064/// round-21 fired the BYTEA twin during a production migration.
4065/// One-way upgrade, same posture as v46.
4066const FILE_VERSION: u8 = 47;
/// Oldest format version [`Catalog::deserialize`] still accepts. v8 is the
/// v3.0.2 dense-row layout; pre-v8 catalogs require an offline migration.
///
/// Together with `FILE_VERSION` this forms the inclusive range of version
/// bytes that `deserialize` loads; a version outside
/// `MIN_SUPPORTED_FILE_VERSION..=FILE_VERSION` is rejected with
/// `StorageError::Corrupt`.
const MIN_SUPPORTED_FILE_VERSION: u8 = 8;
4069const MIN_SUPPORTED_FILE_VERSION: u8 = 8;
4070
// IndexKey wire format (v9). Every key is a tag followed by a tag-specific
// body:
//   tag 0 = Int  → [i64 LE]
//   tag 1 = Text → [u16 LE len + UTF-8 bytes] (via write_str / read_str;
//                  per the FILE_VERSION notes, v46+ escapes lengths
//                  >= 0xFFFF as `[u16 0xFFFF][u32 real_len]`)
//   tag 2 = Bool → [u8 0/1]
//   tag 3 = Uuid → [16 raw bytes] (FILE_VERSION 36+ only, see below)
/// Tag for integer index keys.
const INDEX_KEY_TAG_INT: u8 = 0;
/// Tag for text index keys.
const INDEX_KEY_TAG_TEXT: u8 = 1;
/// Tag for boolean index keys.
const INDEX_KEY_TAG_BOOL: u8 = 2;
/// v7.17.0 — `IndexKey::Uuid([u8; 16])`. Body = raw 16 bytes
/// (RFC 4122 byte order). Persisted only in FILE_VERSION 36+
/// catalogs.
const INDEX_KEY_TAG_UUID: u8 = 3;
4082
4083impl Catalog {
4084 /// Serialize the whole catalog (schema + every row) into a self-contained
4085 /// byte buffer. Format is documented above the impl block.
4086 pub fn serialize(&self) -> Vec<u8> {
4087 let mut out = Vec::with_capacity(64);
4088 out.extend_from_slice(FILE_MAGIC);
4089 out.push(FILE_VERSION);
4090 write_u32(
4091 &mut out,
4092 u32::try_from(self.tables.len()).expect("≤ 4G tables"),
4093 );
4094 for t in &self.tables {
4095 write_str(&mut out, &t.schema.name);
4096 write_u16(
4097 &mut out,
4098 u16::try_from(t.schema.columns.len()).expect("≤ 65k columns/table"),
4099 );
4100 for c in &t.schema.columns {
4101 write_str(&mut out, &c.name);
4102 write_data_type(&mut out, c.ty);
4103 out.push(u8::from(c.nullable));
4104 match &c.default {
4105 None => out.push(0),
4106 Some(v) => {
4107 out.push(1);
4108 write_value(&mut out, v);
4109 }
4110 }
4111 out.push(u8::from(c.auto_increment));
4112 }
4113 write_u32(
4114 &mut out,
4115 u32::try_from(t.rows.len()).expect("≤ 4G rows/table"),
4116 );
4117 // v3.0.2 dense row encoding (FILE_VERSION 8): per-row NULL
4118 // bitmap, then tightly-packed bodies. Identical wire format
4119 // as before — extracted into `encode_row_body_dense` so cold-
4120 // tier segments (v5.1+) can share the encoding.
4121 for row in &t.rows {
4122 out.extend_from_slice(&encode_row_body_dense(row, &t.schema));
4123 }
4124 // Index definitions. Per-index payload:
4125 // [name][col_pos u16][kind u8]
4126 // kind 0 = B-tree (no params — rebuilt on load)
4127 // kind 1 = NSW graph (u16 M + serialized graph)
4128 // For NSW the graph topology travels on disk so startup
4129 // doesn't re-run the O(n²M) rebuild — see v2.7 notes.
4130 write_u16(
4131 &mut out,
4132 u16::try_from(t.indices.len()).expect("≤ 65k indices/table"),
4133 );
4134 for idx in &t.indices {
4135 write_str(&mut out, &idx.name);
4136 write_u16(
4137 &mut out,
4138 u16::try_from(idx.column_position).expect("≤ 65k columns/table"),
4139 );
4140 match &idx.kind {
4141 IndexKind::BTree(map) => {
4142 out.push(0);
4143 // v9: serialise the full PB map. Each entry's
4144 // RowLocator list travels with the tag-prefixed
4145 // codec from `row_locator::write_le`, so freezer-
4146 // produced Cold locators survive a snapshot
4147 // round-trip. v8 BTree wrote nothing here and
4148 // rebuilt from rows — v9 readers tolerate v8 by
4149 // version dispatch in `Catalog::deserialize`.
4150 write_u32(
4151 &mut out,
4152 u32::try_from(map.len()).expect("≤ 4G index entries/index"),
4153 );
4154 for (key, locators) in map {
4155 write_index_key(&mut out, key);
4156 write_u32(
4157 &mut out,
4158 u32::try_from(locators.len()).expect("≤ 4G locators/key"),
4159 );
4160 for loc in locators {
4161 loc.write_le(&mut out);
4162 }
4163 }
4164 }
4165 IndexKind::Nsw(g) => {
4166 out.push(1);
4167 write_u16(&mut out, u16::try_from(g.m).expect("≤ 65k NSW neighbours"));
4168 write_nsw_graph(&mut out, g);
4169 }
4170 IndexKind::Brin { column_type } => {
4171 // v6.7.1 — tag byte 2 = BRIN. Payload is the
4172 // column type code (1 byte mapping to the
4173 // shared DataType numeric encoding); no
4174 // further data — BRIN summaries live in
4175 // cold segments, not the catalog.
4176 out.push(2);
4177 write_data_type(&mut out, *column_type);
4178 }
4179 IndexKind::Gin(map) => {
4180 // v7.12.3 — tag byte 3 = GIN. Payload mirrors
4181 // the BTree encoding but with String (lexeme
4182 // word) keys instead of IndexKey. Tag-prefixed
4183 // RowLocator codec so freezer-produced Cold
4184 // locators survive snapshot round-trip.
4185 // FILE_VERSION 21+; v20 catalogs never wrote a
4186 // GIN index (the AM degraded to BTree fallback
4187 // pre-v7.12.3), so no migration shim is needed.
4188 out.push(3);
4189 write_u32(
4190 &mut out,
4191 u32::try_from(map.len()).expect("≤ 4G GIN posting lists"),
4192 );
4193 for (word, locators) in map {
4194 write_str(&mut out, word);
4195 write_u32(
4196 &mut out,
4197 u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
4198 );
4199 for loc in locators {
4200 loc.write_le(&mut out);
4201 }
4202 }
4203 }
4204 IndexKind::GinTrgm(map) => {
4205 // v7.15.0 — tag byte 4 = GinTrgm
4206 // (`gin_trgm_ops` GIN over a TEXT column).
4207 // Payload shape is identical to tag-3 GIN —
4208 // `String → Vec<RowLocator>` posting lists.
4209 // The String keys are 3-byte trigrams instead
4210 // of tsvector lexemes; the deserializer
4211 // dispatches on the tag, not the key shape.
4212 // FILE_VERSION 24+; v23 catalogs never wrote
4213 // a trigram-GIN.
4214 out.push(4);
4215 write_u32(
4216 &mut out,
4217 u32::try_from(map.len()).expect("≤ 4G trigram-GIN posting lists"),
4218 );
4219 for (tri, locators) in map {
4220 write_str(&mut out, tri);
4221 write_u32(
4222 &mut out,
4223 u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
4224 );
4225 for loc in locators {
4226 loc.write_le(&mut out);
4227 }
4228 }
4229 }
4230 IndexKind::GinFulltext(map) => {
4231 // v7.17.0 Phase 2.2 — tag byte 5 =
4232 // GinFulltext (MySQL `FULLTEXT KEY` GIN
4233 // over a TEXT/VARCHAR column). Payload
4234 // shape mirrors tag-3 / tag-4 GIN —
4235 // `String → Vec<RowLocator>` posting
4236 // lists keyed by lower-cased word
4237 // lexemes. FILE_VERSION 33+; v32 catalogs
4238 // never wrote a fulltext-GIN (FULLTEXT
4239 // KEY was silently dropped pre-v7.17).
4240 out.push(5);
4241 write_u32(
4242 &mut out,
4243 u32::try_from(map.len()).expect("≤ 4G fulltext-GIN posting lists"),
4244 );
4245 for (lex, locators) in map {
4246 write_str(&mut out, lex);
4247 write_u32(
4248 &mut out,
4249 u32::try_from(locators.len()).expect("≤ 4G locators/posting list"),
4250 );
4251 for loc in locators {
4252 loc.write_le(&mut out);
4253 }
4254 }
4255 }
4256 }
4257 // v6.8.0 — included_columns appendix per index.
4258 // Layout: [u16 num_included][num × u16 column_position].
4259 // v11 readers stop before this u16 (deserialise loop
4260 // gated on version >= 12); v12+ readers always
4261 // consume it. Empty Vec serialises as a bare 0u16.
4262 write_u16(
4263 &mut out,
4264 u16::try_from(idx.included_columns.len()).expect("≤ 65k INCLUDE columns/index"),
4265 );
4266 for col_pos in &idx.included_columns {
4267 write_u16(
4268 &mut out,
4269 u16::try_from(*col_pos).expect("≤ 65k columns/table"),
4270 );
4271 }
4272 // v6.8.1 — partial_predicate appendix per index.
4273 // Layout: [u8 has_pred][u16 LE len][bytes (if has_pred)].
4274 // Same v12 gate as included_columns.
4275 match &idx.partial_predicate {
4276 None => out.push(0),
4277 Some(pred) => {
4278 out.push(1);
4279 write_str(&mut out, pred);
4280 }
4281 }
4282 // v6.8.2 — expression appendix. Same shape as
4283 // partial_predicate.
4284 match &idx.expression {
4285 None => out.push(0),
4286 Some(expr) => {
4287 out.push(1);
4288 write_str(&mut out, expr);
4289 }
4290 }
4291 // v7.9.29 — is_unique appendix (FILE_VERSION 16+).
4292 // Single byte 0/1. v15-and-below readers stop before
4293 // this byte; v16 readers always consume it. mailrs K1.
4294 out.push(u8::from(idx.is_unique));
4295 // v7.9.29 — extra_column_positions appendix.
4296 // Layout: [u16 count][count × u16 column_position].
4297 write_u16(
4298 &mut out,
4299 u16::try_from(idx.extra_column_positions.len())
4300 .expect("≤ 65k extra cols / index"),
4301 );
4302 for cp in &idx.extra_column_positions {
4303 write_u16(&mut out, u16::try_from(*cp).expect("≤ 65k columns/table"));
4304 }
4305 }
4306 // v6.7.2 — per-table hot_tier_bytes Option<u64>.
4307 // Layout: [u8 has_value][u64 LE value (if has_value)].
4308 // v10 readers stop before this byte (deserialise loop
4309 // gated on version >= 11); v11+ readers always
4310 // consume it.
4311 match t.schema.hot_tier_bytes {
4312 None => out.push(0),
4313 Some(n) => {
4314 out.push(1);
4315 out.extend_from_slice(&n.to_le_bytes());
4316 }
4317 }
4318 // v7.6.1 — FOREIGN KEY appendix (catalog FILE_VERSION 13+).
4319 // Layout: [u16 LE fk_count]
4320 // per fk:
4321 // [u8 has_name] [str name (if has_name)]
4322 // [u16 LE local_arity] [u16 LE local_pos]*arity
4323 // [str parent_table]
4324 // [u16 LE parent_arity] [u16 LE parent_pos]*arity
4325 // [u8 on_delete_tag] [u8 on_update_tag]
4326 // Older catalogs (v12 and below) skip this block entirely;
4327 // their reader stops before this byte.
4328 write_u16(
4329 &mut out,
4330 u16::try_from(t.schema.foreign_keys.len()).expect("≤ 65k FKs/table"),
4331 );
4332 for fk in &t.schema.foreign_keys {
4333 match &fk.name {
4334 None => out.push(0),
4335 Some(n) => {
4336 out.push(1);
4337 write_str(&mut out, n);
4338 }
4339 }
4340 write_u16(
4341 &mut out,
4342 u16::try_from(fk.local_columns.len()).expect("≤ 65k FK columns"),
4343 );
4344 for &p in &fk.local_columns {
4345 write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
4346 }
4347 write_str(&mut out, &fk.parent_table);
4348 write_u16(
4349 &mut out,
4350 u16::try_from(fk.parent_columns.len()).expect("≤ 65k FK parent columns"),
4351 );
4352 for &p in &fk.parent_columns {
4353 write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
4354 }
4355 out.push(fk.on_delete.tag());
4356 out.push(fk.on_update.tag());
4357 }
4358 // v7.9.19 — UniquenessConstraint appendix (catalog
4359 // FILE_VERSION 15+). Layout per table after the FK
4360 // block:
4361 // [u16 count]
4362 // per constraint:
4363 // [u8 is_primary_key]
4364 // [u16 arity][u16 col_pos]*arity
4365 // Older catalogs (v14 and below) skip this block.
4366 write_u16(
4367 &mut out,
4368 u16::try_from(t.schema.uniqueness_constraints.len())
4369 .expect("≤ 65k uniqueness constraints/table"),
4370 );
4371 for uc in &t.schema.uniqueness_constraints {
4372 out.push(u8::from(uc.is_primary_key));
4373 write_u16(
4374 &mut out,
4375 u16::try_from(uc.columns.len()).expect("≤ 65k cols in uniqueness constraint"),
4376 );
4377 for &p in &uc.columns {
4378 write_u16(&mut out, u16::try_from(p).expect("≤ 65k columns/table"));
4379 }
4380 // v7.13.0 — `nulls_not_distinct` flag
4381 // (FILE_VERSION 23+). Always written by writers at
4382 // version 23+; deserialise gates on `version >= 23`
4383 // so v22-and-below catalogs round-trip cleanly.
4384 out.push(u8::from(uc.nulls_not_distinct));
4385 }
4386 // v7.9.21 — runtime_default appendix per table.
4387 // Layout: [u16 count] then for each:
4388 // [u16 col_pos][str expr]
4389 // Only columns whose runtime_default is Some land here;
4390 // catalog stays compact for the common literal-default
4391 // case.
4392 let mut rt_defaults: Vec<(usize, &str)> = Vec::new();
4393 for (i, c) in t.schema.columns.iter().enumerate() {
4394 if let Some(e) = &c.runtime_default {
4395 rt_defaults.push((i, e.as_str()));
4396 }
4397 }
4398 write_u16(
4399 &mut out,
4400 u16::try_from(rt_defaults.len()).expect("≤ 65k runtime defaults/table"),
4401 );
4402 for (pos, expr) in rt_defaults {
4403 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
4404 write_str(&mut out, expr);
4405 }
4406 // v7.13.0 — CHECK constraint appendix per table.
4407 // Layout: [u16 count] then `count` Display-form
4408 // expression strings. Re-parsed on every INSERT/UPDATE
4409 // by the engine. FILE_VERSION 23+ only; v22 readers
4410 // never reach this block because the writer also moves
4411 // to v23 in lock-step.
4412 write_u16(
4413 &mut out,
4414 u16::try_from(t.schema.checks.len()).expect("≤ 65k CHECK constraints/table"),
4415 );
4416 for c in &t.schema.checks {
4417 write_str(&mut out, c.as_str());
4418 }
4419 // v7.17.0 Phase 1.4 — per-table user_enum_type
4420 // appendix. Layout: [u16 count] then
4421 // [u16 col_pos][str enum_name] per binding. Only
4422 // columns whose user_enum_type is Some land here.
4423 let mut enum_bindings: Vec<(usize, &str)> = Vec::new();
4424 for (i, c) in t.schema.columns.iter().enumerate() {
4425 if let Some(e) = &c.user_enum_type {
4426 enum_bindings.push((i, e.as_str()));
4427 }
4428 }
4429 write_u16(
4430 &mut out,
4431 u16::try_from(enum_bindings.len()).expect("≤ 65k enum-typed columns/table"),
4432 );
4433 for (pos, ename) in enum_bindings {
4434 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
4435 write_str(&mut out, ename);
4436 }
4437 // v7.17.0 Phase 1.5 — per-table user_domain_type
4438 // appendix. Same layout as the enum one. v29-and-
4439 // below readers stop after the enum appendix.
4440 let mut domain_bindings: Vec<(usize, &str)> = Vec::new();
4441 for (i, c) in t.schema.columns.iter().enumerate() {
4442 if let Some(d) = &c.user_domain_type {
4443 domain_bindings.push((i, d.as_str()));
4444 }
4445 }
4446 write_u16(
4447 &mut out,
4448 u16::try_from(domain_bindings.len()).expect("≤ 65k domain-typed columns/table"),
4449 );
4450 for (pos, dname) in domain_bindings {
4451 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
4452 write_str(&mut out, dname);
4453 }
4454 // v7.17.0 Phase 2.1 — per-table on_update_runtime
4455 // appendix. Sparse: only ON UPDATE-bound columns.
4456 let mut on_update_bindings: Vec<(usize, &str)> = Vec::new();
4457 for (i, c) in t.schema.columns.iter().enumerate() {
4458 if let Some(e) = &c.on_update_runtime {
4459 on_update_bindings.push((i, e.as_str()));
4460 }
4461 }
4462 write_u16(
4463 &mut out,
4464 u16::try_from(on_update_bindings.len()).expect("≤ 65k ON UPDATE columns/table"),
4465 );
4466 for (pos, expr_src) in on_update_bindings {
4467 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
4468 write_str(&mut out, expr_src);
4469 }
4470 // v7.17.0 Phase 2.5 — per-table collation appendix.
4471 // Sparse: only non-Binary columns land. Layout:
4472 // `[u16 count][u16 col_pos][u8 tag] × count`.
4473 let mut coll_bindings: Vec<(usize, u8)> = Vec::new();
4474 for (i, c) in t.schema.columns.iter().enumerate() {
4475 let tag = match c.collation {
4476 Collation::Binary => continue,
4477 Collation::CaseInsensitive => Collation::TAG_CASE_INSENSITIVE,
4478 };
4479 coll_bindings.push((i, tag));
4480 }
4481 write_u16(
4482 &mut out,
4483 u16::try_from(coll_bindings.len()).expect("≤ 65k collation bindings/table"),
4484 );
4485 for (pos, tag) in coll_bindings {
4486 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
4487 out.push(tag);
4488 }
4489 // v7.17.0 Phase 4.4 — per-table is_unsigned appendix.
4490 // Sparse: only UNSIGNED columns land. Layout:
4491 // `[u16 count][u16 col_pos] × count`.
4492 let mut unsigned_bindings: Vec<usize> = Vec::new();
4493 for (i, c) in t.schema.columns.iter().enumerate() {
4494 if c.is_unsigned {
4495 unsigned_bindings.push(i);
4496 }
4497 }
4498 write_u16(
4499 &mut out,
4500 u16::try_from(unsigned_bindings.len()).expect("≤ 65k UNSIGNED columns/table"),
4501 );
4502 for pos in unsigned_bindings {
4503 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
4504 }
4505 // v7.17.0 Phase 3.P0-36 — per-table inline_enum_variants
4506 // appendix. Sparse: only ENUM columns land. Layout:
4507 // `[u16 count] then per binding [u16 col_pos]
4508 // [u16 variant_count] then variant strings`.
4509 // FILE_VERSION 41+; v40 readers never reach this block.
4510 let mut enum_inline_bindings: Vec<(usize, &[String])> = Vec::new();
4511 for (i, c) in t.schema.columns.iter().enumerate() {
4512 if let Some(vs) = &c.inline_enum_variants {
4513 enum_inline_bindings.push((i, vs.as_slice()));
4514 }
4515 }
4516 write_u16(
4517 &mut out,
4518 u16::try_from(enum_inline_bindings.len()).expect("≤ 65k inline-ENUM columns/table"),
4519 );
4520 for (pos, variants) in enum_inline_bindings {
4521 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
4522 write_u16(
4523 &mut out,
4524 u16::try_from(variants.len()).expect("≤ 65k variants/ENUM"),
4525 );
4526 for v in variants {
4527 write_str(&mut out, v.as_str());
4528 }
4529 }
4530 // v7.17.0 Phase 3.P0-37 — per-table inline_set_variants
4531 // appendix. Same layout as the inline ENUM block.
4532 // FILE_VERSION 42+; v41 readers never reach this block.
4533 let mut set_inline_bindings: Vec<(usize, &[String])> = Vec::new();
4534 for (i, c) in t.schema.columns.iter().enumerate() {
4535 if let Some(vs) = &c.inline_set_variants {
4536 set_inline_bindings.push((i, vs.as_slice()));
4537 }
4538 }
4539 write_u16(
4540 &mut out,
4541 u16::try_from(set_inline_bindings.len()).expect("≤ 65k inline-SET columns/table"),
4542 );
4543 for (pos, variants) in set_inline_bindings {
4544 write_u16(&mut out, u16::try_from(pos).expect("≤ 65k columns/table"));
4545 write_u16(
4546 &mut out,
4547 u16::try_from(variants.len()).expect("≤ 65k variants/SET"),
4548 );
4549 for v in variants {
4550 write_str(&mut out, v.as_str());
4551 }
4552 }
4553 }
4554 // v7.12.4 — catalog-wide appendix: user-defined functions
4555 // then triggers. FILE_VERSION 22+ only. v21 and earlier
4556 // readers stop after the last table; v22 readers always
4557 // consume two `u32` counts (possibly zero).
4558 //
4559 // Function entry layout:
4560 // [str name] [str args_repr] [str returns]
4561 // [str language] [str body]
4562 // Trigger entry layout:
4563 // [str name] [str table] [str timing]
4564 // [u16 event_count] (event_count × str)
4565 // [str for_each] [str function]
4566 write_u32(
4567 &mut out,
4568 u32::try_from(self.functions.len()).expect("≤ 4G functions"),
4569 );
4570 for fd in self.functions.values() {
4571 write_str(&mut out, &fd.name);
4572 write_str(&mut out, &fd.args_repr);
4573 write_str(&mut out, &fd.returns);
4574 write_str(&mut out, &fd.language);
4575 write_str_long(&mut out, &fd.body);
4576 }
4577 write_u32(
4578 &mut out,
4579 u32::try_from(self.triggers.len()).expect("≤ 4G triggers"),
4580 );
4581 for td in &self.triggers {
4582 write_str(&mut out, &td.name);
4583 write_str(&mut out, &td.table);
4584 write_str(&mut out, &td.timing);
4585 write_u16(
4586 &mut out,
4587 u16::try_from(td.events.len()).expect("≤ 65k events / trigger"),
4588 );
4589 for ev in &td.events {
4590 write_str(&mut out, ev);
4591 }
4592 write_str(&mut out, &td.for_each);
4593 write_str(&mut out, &td.function);
4594 // v7.13.0 — `UPDATE OF cols` filter
4595 // (FILE_VERSION 23+). v22 readers omit; v23 writers
4596 // always emit (possibly zero).
4597 write_u16(
4598 &mut out,
4599 u16::try_from(td.update_columns.len()).expect("≤ 65k cols / trigger"),
4600 );
4601 for c in &td.update_columns {
4602 write_str(&mut out, c);
4603 }
4604 // v7.16.1 — TriggerDef.enabled (FILE_VERSION 25+).
4605 out.push(u8::from(td.enabled));
4606 }
4607 // v7.17.0 Phase 1.1 — SEQUENCE catalog block (FILE_VERSION 26+).
4608 write_u32(
4609 &mut out,
4610 u32::try_from(self.sequences.len()).expect("≤ 4G sequences"),
4611 );
4612 for seq in self.sequences.values() {
4613 write_str(&mut out, &seq.name);
4614 out.push(match seq.data_type {
4615 SequenceDataType::SmallInt => 0,
4616 SequenceDataType::Int => 1,
4617 SequenceDataType::BigInt => 2,
4618 });
4619 out.extend_from_slice(&seq.start.to_le_bytes());
4620 out.extend_from_slice(&seq.increment.to_le_bytes());
4621 out.extend_from_slice(&seq.min_value.to_le_bytes());
4622 out.extend_from_slice(&seq.max_value.to_le_bytes());
4623 out.extend_from_slice(&seq.cache.to_le_bytes());
4624 out.push(u8::from(seq.cycle));
4625 match &seq.owned_by {
4626 None => out.push(0),
4627 Some((table, column)) => {
4628 out.push(1);
4629 write_str(&mut out, table);
4630 write_str(&mut out, column);
4631 }
4632 }
4633 out.extend_from_slice(&seq.last_value.to_le_bytes());
4634 out.push(u8::from(seq.is_called));
4635 }
4636 // v7.17.0 Phase 1.2 — VIEW catalog block (FILE_VERSION 27+).
4637 write_u32(
4638 &mut out,
4639 u32::try_from(self.views.len()).expect("≤ 4G views"),
4640 );
4641 for view in self.views.values() {
4642 write_str(&mut out, &view.name);
4643 write_u16(
4644 &mut out,
4645 u16::try_from(view.columns.len()).expect("≤ 65k cols / view"),
4646 );
4647 for c in &view.columns {
4648 write_str(&mut out, c);
4649 }
4650 write_str_long(&mut out, &view.body);
4651 }
4652 // v7.17.0 Phase 1.3 — MATERIALIZED VIEW source registry
4653 // (FILE_VERSION 28+). The backing rows live as a regular
4654 // table of the same name already in the tables block.
4655 write_u32(
4656 &mut out,
4657 u32::try_from(self.materialized_views.len()).expect("≤ 4G materialized views"),
4658 );
4659 for (name, body) in &self.materialized_views {
4660 write_str(&mut out, name);
4661 write_str_long(&mut out, body);
4662 }
4663 // v7.17.0 Phase 1.4 — ENUM types catalog block
4664 // (FILE_VERSION 29+).
4665 write_u32(
4666 &mut out,
4667 u32::try_from(self.enum_types.len()).expect("≤ 4G enum types"),
4668 );
4669 for e in self.enum_types.values() {
4670 write_str(&mut out, &e.name);
4671 write_u16(
4672 &mut out,
4673 u16::try_from(e.labels.len()).expect("≤ 65k labels / enum"),
4674 );
4675 for l in &e.labels {
4676 write_str(&mut out, l);
4677 }
4678 }
4679 // v7.17.0 Phase 1.5 — DOMAIN types catalog block
4680 // (FILE_VERSION 30+).
4681 write_u32(
4682 &mut out,
4683 u32::try_from(self.domain_types.len()).expect("≤ 4G domain types"),
4684 );
4685 for d in self.domain_types.values() {
4686 write_str(&mut out, &d.name);
4687 write_data_type(&mut out, d.base_type);
4688 out.push(u8::from(d.nullable));
4689 match &d.default {
4690 None => out.push(0),
4691 Some(s) => {
4692 out.push(1);
4693 write_str(&mut out, s);
4694 }
4695 }
4696 write_u16(
4697 &mut out,
4698 u16::try_from(d.checks.len()).expect("≤ 65k CHECKs / domain"),
4699 );
4700 for c in &d.checks {
4701 write_str(&mut out, c);
4702 }
4703 }
4704 // v7.17.0 Phase 1.6 — user-schemas registry
4705 // (FILE_VERSION 31+). Built-ins are hardcoded in
4706 // `is_builtin_schema` and not persisted.
4707 write_u32(
4708 &mut out,
4709 u32::try_from(self.schemas.len()).expect("≤ 4G schemas"),
4710 );
4711 for name in &self.schemas {
4712 write_str(&mut out, name);
4713 }
4714 out
4715 }
4716
4717 /// Deserialize a previously-serialized catalog. Rejects bad magic, version
4718 /// mismatch, unknown tags, truncation, and trailing bytes.
4719 pub fn deserialize(buf: &[u8]) -> Result<Self, StorageError> {
4720 let mut cur = Cursor::new(buf);
4721 let magic = cur.take(8)?;
4722 if magic != FILE_MAGIC {
4723 return Err(StorageError::Corrupt(format!(
4724 "bad magic: expected SPGDB001, got {magic:?}"
4725 )));
4726 }
4727 let version = cur.read_u8()?;
4728 if !(MIN_SUPPORTED_FILE_VERSION..=FILE_VERSION).contains(&version) {
4729 return Err(StorageError::Corrupt(format!(
4730 "unsupported file version: {version} (supported: {MIN_SUPPORTED_FILE_VERSION}..={FILE_VERSION})"
4731 )));
4732 }
4733 // v7.23/v7.27 — escape decoding is version-gated (see
4734 // STR_LEN_ESCAPE / Cursor::codec_version).
4735 cur.codec_version = version;
4736 let table_count = cur.read_u32()? as usize;
4737 let mut cat = Self::new();
4738 for _ in 0..table_count {
4739 deserialize_table(&mut cur, &mut cat, version)?;
4740 }
4741 // v7.12.4 — catalog-wide function + trigger appendix.
4742 // FILE_VERSION 22+ only; v21 and earlier catalogs stop
4743 // after the last table.
4744 if version >= 22 {
4745 let fn_count = cur.read_u32()? as usize;
4746 for _ in 0..fn_count {
4747 let name = cur.read_str()?;
4748 let args_repr = cur.read_str()?;
4749 let returns = cur.read_str()?;
4750 let language = cur.read_str()?;
4751 let body = cur.read_str_long()?;
4752 cat.functions.insert(
4753 name.clone(),
4754 FunctionDef {
4755 name,
4756 args_repr,
4757 returns,
4758 language,
4759 body,
4760 },
4761 );
4762 }
4763 let trg_count = cur.read_u32()? as usize;
4764 for _ in 0..trg_count {
4765 let name = cur.read_str()?;
4766 let table = cur.read_str()?;
4767 let timing = cur.read_str()?;
4768 let ev_count = cur.read_u16()? as usize;
4769 let mut events = Vec::with_capacity(ev_count);
4770 for _ in 0..ev_count {
4771 events.push(cur.read_str()?);
4772 }
4773 let for_each = cur.read_str()?;
4774 let function = cur.read_str()?;
4775 // v7.13.0 — trailing `UPDATE OF cols` filter
4776 // (FILE_VERSION 23+ only; v22 catalogs omit and
4777 // deserialise with an empty vec).
4778 let update_columns = if version >= 23 {
4779 let n = cur.read_u16()? as usize;
4780 let mut cols = Vec::with_capacity(n);
4781 for _ in 0..n {
4782 cols.push(cur.read_str()?);
4783 }
4784 cols
4785 } else {
4786 Vec::new()
4787 };
4788 // v7.16.1 — TriggerDef.enabled (FILE_VERSION 25+).
4789 // v24-and-below catalogs deserialise with `true`
4790 // — pre-v7.16.1 every trigger always fired.
4791 let enabled = if version >= 25 {
4792 cur.read_u8()? != 0
4793 } else {
4794 true
4795 };
4796 cat.triggers.push(TriggerDef {
4797 name,
4798 table,
4799 timing,
4800 events,
4801 for_each,
4802 function,
4803 update_columns,
4804 enabled,
4805 });
4806 }
4807 }
4808 // v7.17.0 Phase 1.1 — SEQUENCE block (FILE_VERSION 26+).
4809 // v25-and-below catalogs omit; we leave the map empty.
4810 if version >= 26 {
4811 let seq_count = cur.read_u32()? as usize;
4812 for _ in 0..seq_count {
4813 let name = cur.read_str()?;
4814 let data_type = match cur.read_u8()? {
4815 0 => SequenceDataType::SmallInt,
4816 1 => SequenceDataType::Int,
4817 2 => SequenceDataType::BigInt,
4818 other => {
4819 return Err(StorageError::Corrupt(format!(
4820 "unknown SEQUENCE data-type tag {other}"
4821 )));
4822 }
4823 };
4824 let start = cur.read_i64()?;
4825 let increment = cur.read_i64()?;
4826 let min_value = cur.read_i64()?;
4827 let max_value = cur.read_i64()?;
4828 let cache = cur.read_i64()?;
4829 let cycle = cur.read_u8()? != 0;
4830 let owned_by = match cur.read_u8()? {
4831 0 => None,
4832 1 => {
4833 let t = cur.read_str()?;
4834 let c = cur.read_str()?;
4835 Some((t, c))
4836 }
4837 other => {
4838 return Err(StorageError::Corrupt(format!(
4839 "unknown SEQUENCE owned-by tag {other}"
4840 )));
4841 }
4842 };
4843 let last_value = cur.read_i64()?;
4844 let is_called = cur.read_u8()? != 0;
4845 cat.sequences.insert(
4846 name.clone(),
4847 SequenceDef {
4848 name,
4849 data_type,
4850 start,
4851 increment,
4852 min_value,
4853 max_value,
4854 cache,
4855 cycle,
4856 owned_by,
4857 last_value,
4858 is_called,
4859 },
4860 );
4861 }
4862 }
4863 // v7.17.0 Phase 1.2 — VIEW block (FILE_VERSION 27+).
4864 // v26-and-below catalogs omit; we leave the map empty.
4865 if version >= 27 {
4866 let view_count = cur.read_u32()? as usize;
4867 for _ in 0..view_count {
4868 let name = cur.read_str()?;
4869 let col_count = cur.read_u16()? as usize;
4870 let mut columns = Vec::with_capacity(col_count);
4871 for _ in 0..col_count {
4872 columns.push(cur.read_str()?);
4873 }
4874 let body = cur.read_str_long()?;
4875 cat.views.insert(
4876 name.clone(),
4877 ViewDef {
4878 name,
4879 columns,
4880 body,
4881 },
4882 );
4883 }
4884 }
4885 // v7.17.0 Phase 1.3 — MATERIALIZED VIEW source registry
4886 // (FILE_VERSION 28+). v27-and-below catalogs omit.
4887 if version >= 28 {
4888 let mv_count = cur.read_u32()? as usize;
4889 for _ in 0..mv_count {
4890 let name = cur.read_str()?;
4891 let body = cur.read_str_long()?;
4892 cat.materialized_views.insert(name, body);
4893 }
4894 }
4895 // v7.17.0 Phase 1.4 — ENUM types catalog block
4896 // (FILE_VERSION 29+).
4897 if version >= 29 {
4898 let etype_count = cur.read_u32()? as usize;
4899 for _ in 0..etype_count {
4900 let name = cur.read_str()?;
4901 let label_count = cur.read_u16()? as usize;
4902 let mut labels = Vec::with_capacity(label_count);
4903 for _ in 0..label_count {
4904 labels.push(cur.read_str()?);
4905 }
4906 cat.enum_types
4907 .insert(name.clone(), EnumDef { name, labels });
4908 }
4909 }
4910 // v7.17.0 Phase 1.5 — DOMAIN types catalog block
4911 // (FILE_VERSION 30+).
4912 if version >= 30 {
4913 let dtype_count = cur.read_u32()? as usize;
4914 for _ in 0..dtype_count {
4915 let name = cur.read_str()?;
4916 let base_type = cur.read_data_type()?;
4917 let nullable = cur.read_u8()? != 0;
4918 let default = match cur.read_u8()? {
4919 0 => None,
4920 1 => Some(cur.read_str()?),
4921 other => {
4922 return Err(StorageError::Corrupt(format!(
4923 "unknown DOMAIN default tag {other}"
4924 )));
4925 }
4926 };
4927 let check_count = cur.read_u16()? as usize;
4928 let mut checks = Vec::with_capacity(check_count);
4929 for _ in 0..check_count {
4930 checks.push(cur.read_str()?);
4931 }
4932 cat.domain_types.insert(
4933 name.clone(),
4934 DomainDef {
4935 name,
4936 base_type,
4937 nullable,
4938 default,
4939 checks,
4940 },
4941 );
4942 }
4943 }
4944 // v7.17.0 Phase 1.6 — user-schemas registry
4945 // (FILE_VERSION 31+).
4946 if version >= 31 {
4947 let sch_count = cur.read_u32()? as usize;
4948 for _ in 0..sch_count {
4949 let name = cur.read_str()?;
4950 cat.schemas.insert(name);
4951 }
4952 }
4953 if cur.pos < buf.len() {
4954 return Err(StorageError::Corrupt(format!(
4955 "trailing bytes: {} unread",
4956 buf.len() - cur.pos
4957 )));
4958 }
4959 Ok(cat)
4960 }
4961}
4962
4963#[cfg(test)]
4964mod tests;