Skip to main content

spg_engine/
lib.rs

1//! SPG execution engine — v0.3 wires the SQL front-end to the in-memory
2//! storage layer. Implements `CREATE TABLE`, single-row `INSERT VALUES`, and
3//! `SELECT * FROM <table>` (no WHERE yet — that lands in v0.4 alongside
4//! expression evaluation against rows).
5#![no_std]
6
7extern crate alloc;
8
9pub mod aggregate;
10pub mod describe;
11pub mod eval;
12pub mod json;
13pub mod memoize;
14pub mod plan_cache;
15pub mod publications;
16pub mod query_stats;
17pub mod reorder;
18pub mod selectivity;
19pub mod statistics;
20pub mod subscriptions;
21pub mod users;
22
23pub use crate::users::{Role, ScramSecrets, UserError, UserStore};
24
25use alloc::borrow::Cow;
26use alloc::boxed::Box;
27use alloc::collections::BTreeMap;
28use alloc::string::{String, ToString};
29use alloc::vec::Vec;
30use core::fmt;
31
32use spg_sql::ast::{
33    BinOp, ColumnDef, ColumnName, ColumnTypeName, CreateIndexStatement,
34    CreatePublicationStatement, CreateSubscriptionStatement, CreateTableStatement,
35    CreateUserStatement, Expr, FrameBound, FrameKind, FromClause, IndexMethod, InsertStatement,
36    JoinKind, Literal, OrderBy, SelectItem, SelectStatement, Statement, UnOp, UnionKind,
37    VecEncoding as SqlVecEncoding, WindowFrame,
38};
39use spg_sql::parser::{self, ParseError};
40use spg_storage::{
41    Catalog, ColumnSchema, CompactReport, DataType, IndexKey, IndexKind, Row, StorageError, Table,
42    TableSchema, Value, VecEncoding,
43};
44
45use crate::eval::{EvalContext, EvalError};
46
47/// Result of executing one statement.
48#[derive(Debug, Clone, PartialEq)]
49#[non_exhaustive]
50pub enum QueryResult {
51    /// DDL or DML succeeded.
52    ///
53    /// `affected` is the row count for `INSERT` and 0 elsewhere.
54    /// `modified_catalog` tells the server whether this statement
55    /// caused the *committed* catalog to change — it's the signal to
56    /// snapshot/audit. False for `BEGIN`/`ROLLBACK`, false for writeful
57    /// statements executed inside a transaction (those only touch the
58    /// shadow), and true for `COMMIT` and for writes outside a TX.
59    CommandOk {
60        affected: usize,
61        modified_catalog: bool,
62    },
63    /// `SELECT` returned a (possibly empty) row set.
64    Rows {
65        columns: Vec<ColumnSchema>,
66        rows: Vec<Row>,
67    },
68}
69
70/// All errors the engine can return.
71///
72/// Marked `#[non_exhaustive]` from v7.5.0 onward: external `match`
73/// must include a `_` arm so new variants in subsequent v7.x releases
74/// are not breaking changes.
75#[derive(Debug, Clone, PartialEq)]
76#[non_exhaustive]
77pub enum EngineError {
78    Parse(ParseError),
79    Storage(StorageError),
80    Eval(EvalError),
81    /// Front-end accepted a construct that the v0.x executor doesn't support.
82    Unsupported(String),
83    /// `BEGIN` while another transaction is already open.
84    TransactionAlreadyOpen,
85    /// `COMMIT` / `ROLLBACK` with no active transaction.
86    NoActiveTransaction,
87    /// v4.0 sentinel: `execute_readonly` got a statement that
88    /// mutates engine state (INSERT / CREATE / BEGIN / COMMIT / …).
89    /// The caller should retake the write lock and dispatch through
90    /// `execute(&mut self)` instead.
91    WriteRequired,
92    /// v4.2: a SELECT would have returned more rows than the
93    /// configured `max_query_rows` cap. Carries the cap.
94    RowLimitExceeded(usize),
95    /// v4.5: cooperative cancellation — the host (server's
96    /// per-query watchdog) set the cancel flag while a long-running
97    /// SELECT / UPDATE / DELETE was scanning rows. The partial work
98    /// is discarded; the caller should surface this as a timeout
99    /// to the client.
100    Cancelled,
101}
102
103impl fmt::Display for EngineError {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        match self {
106            Self::Parse(e) => write!(f, "parse: {e}"),
107            Self::Storage(e) => write!(f, "storage: {e}"),
108            Self::Eval(e) => write!(f, "eval: {e}"),
109            Self::Unsupported(s) => write!(f, "unsupported: {s}"),
110            Self::TransactionAlreadyOpen => f.write_str("a transaction is already open"),
111            Self::NoActiveTransaction => f.write_str("no active transaction"),
112            Self::WriteRequired => {
113                f.write_str("statement requires a write lock (use execute, not execute_readonly)")
114            }
115            Self::RowLimitExceeded(n) => {
116                write!(f, "query exceeded max_query_rows={n}")
117            }
118            Self::Cancelled => f.write_str("query cancelled (timeout or client request)"),
119        }
120    }
121}
122
123impl From<ParseError> for EngineError {
124    fn from(e: ParseError) -> Self {
125        Self::Parse(e)
126    }
127}
128impl From<StorageError> for EngineError {
129    fn from(e: StorageError) -> Self {
130        Self::Storage(e)
131    }
132}
133impl From<EvalError> for EngineError {
134    fn from(e: EvalError) -> Self {
135        Self::Eval(e)
136    }
137}
138
// NOTE(review): the summary below describes `Engine`, not `ClockFn`. It
// used to be the first paragraph of this alias's doc comment, so rustdoc
// rendered it as `ClockFn`'s summary. It is kept here as a plain comment.
//
//   The execution engine. Holds the catalog and (later) other server-scope
//   state. `Engine::new()` is intentionally cheap so callers can construct
//   one per database, per test.

/// Function pointer that returns "now" as microseconds since the Unix
/// epoch. The engine is `no_std`, so it can't reach for `std::time`
/// itself — callers (`spg-server`, the sqllogictest runner) inject a
/// concrete implementation. When no clock is installed (the engine
/// stores an `Option<ClockFn>`), `NOW()` / `CURRENT_*` raise
/// `Unsupported`.
pub type ClockFn = fn() -> i64;
148
/// Function pointer that produces 16 cryptographically random bytes.
/// Like `ClockFn`, it exists because the engine is `no_std` and can't
/// read /dev/urandom itself — the host (`spg-server`) injects an
/// OS-backed source. When none is installed (the engine stores an
/// `Option<SaltFn>`), SQL-driven `CREATE USER` falls back to a
/// deterministic salt derived from the username. That is acceptable in
/// tests; the server always installs a real RNG, so production paths
/// never see the fallback.
pub type SaltFn = fn() -> [u8; 16];
156
/// v4.5 cooperative cancellation token.
///
/// Long-running SELECT / UPDATE / DELETE loops poll `is_cancelled` at
/// row-loop checkpoints and bail out with `EngineError::Cancelled`.
/// The host (`spg-server`) creates one `AtomicBool` per query, spawns a
/// watchdog thread that sets it after `SPG_QUERY_TIMEOUT_MS`, and hands
/// it over via `execute_with_cancel` / `execute_readonly_with_cancel`.
///
/// `CancelToken::none()` never fires. The legacy `execute` and
/// `execute_readonly` entry points use it so existing callers don't
/// have to change.
#[derive(Clone, Copy, Debug)]
pub struct CancelToken<'a> {
    /// Borrowed cancel flag; `None` means "this token can never trip".
    flag: Option<&'a core::sync::atomic::AtomicBool>,
}
171
172impl<'a> CancelToken<'a> {
173    #[must_use]
174    pub const fn none() -> Self {
175        Self { flag: None }
176    }
177
178    #[must_use]
179    pub const fn from_flag(f: &'a core::sync::atomic::AtomicBool) -> Self {
180        Self { flag: Some(f) }
181    }
182
183    #[must_use]
184    pub fn is_cancelled(self) -> bool {
185        self.flag
186            .is_some_and(|f| f.load(core::sync::atomic::Ordering::Relaxed))
187    }
188
189    /// Returns `Err(Cancelled)` if the token has been tripped.
190    /// Used at row-loop checkpoints to bail cooperatively without
191    /// scattering raw `is_cancelled` checks across the executor.
192    #[inline]
193    pub fn check(self) -> Result<(), EngineError> {
194        if self.is_cancelled() {
195            Err(EngineError::Cancelled)
196        } else {
197            Ok(())
198        }
199    }
200}
201
// ---- snapshot envelope (v4.1; CRC32 added in v4.37; publications ----
// ----   in v6.1.2 = v3; subscriptions in v6.1.4 = v4; statistics  ----
// ----   in v6.2.0 = v5)                                           ----
204//
205// Wraps a catalog blob + a user blob behind a small header so the
206// server can persist both atomically without inventing a new file.
207// Bare catalog blobs (v3.x) still load via `restore_envelope` since
208// the magic check fails fast and the function falls back to
209// `Catalog::deserialize`.
210//
211// Layout — v1 (v4.1, no CRC):
212//   [8 bytes magic "SPGENV01"]
213//   [u8 version = 1]
214//   [u32 catalog_len][catalog bytes]
215//   [u32 users_len][users bytes]
216//
217// Layout — v2 (v4.37, CRC32 of body):
218//   [8 bytes magic "SPGENV01"]
219//   [u8 version = 2]
220//   [u32 catalog_len][catalog bytes]
221//   [u32 users_len][users bytes]
222//   [u32 crc32]                      ← CRC32 of every byte before it.
223//
224// Layout — v3 (v6.1.2, publications trailer):
225//   [8 bytes magic "SPGENV01"]
226//   [u8 version = 3]
227//   [u32 catalog_len][catalog bytes]
228//   [u32 users_len][users bytes]
229//   [u32 pubs_len][publications bytes]
230//   [u32 crc32]
231//
232// Layout — v4 (v6.1.4, subscriptions trailer):
233//   [8 bytes magic "SPGENV01"]
234//   [u8 version = 4]
235//   [u32 catalog_len][catalog bytes]
236//   [u32 users_len][users bytes]
237//   [u32 pubs_len][publications bytes]
238//   [u32 subs_len][subscriptions bytes]
239//   [u32 crc32]
240//
241// Layout — v5 (v6.2.0, statistics trailer):
242//   [8 bytes magic "SPGENV01"]
243//   [u8 version = 5]
244//   [u32 catalog_len][catalog bytes]
245//   [u32 users_len][users bytes]
246//   [u32 pubs_len][publications bytes]
247//   [u32 subs_len][subscriptions bytes]
248//   [u32 stats_len][statistics bytes]      ← NEW
249//   [u32 crc32]
250//
251// Writers emit v5 from v6.2.0 on. Readers accept all of {v1, v2,
252// v3, v4, v5}: v1/v2 load with empty publications / subscriptions /
253// statistics; v3 loads with empty subscriptions + statistics; v4
254// loads with empty statistics; v5 deserialises all three. Older
255// SPG versions reading a v5 envelope fall through the version
256// match to `EnvelopeParse::Bare` — pre-v6.2.0 binaries cannot
257// open v6.2.0+ snapshots (matches the v6.1.2 / v6.1.4 breaks).
258
/// Eight magic bytes that open every snapshot envelope. A buffer that
/// does not start with them is treated as a bare catalog blob.
const ENVELOPE_MAGIC: &[u8; 8] = b"SPGENV01";
// Envelope format revisions, stored in the single byte that follows the
// magic. Writers emit V5; readers accept V1 through V5.
const ENVELOPE_VERSION_V1: u8 = 1;
const ENVELOPE_VERSION_V2: u8 = 2;
const ENVELOPE_VERSION_V3: u8 = 3;
const ENVELOPE_VERSION_V4: u8 = 4;
const ENVELOPE_VERSION_V5: u8 = 5;
265
266fn build_envelope(
267    catalog: &[u8],
268    users: &[u8],
269    pubs: &[u8],
270    subs: &[u8],
271    stats: &[u8],
272) -> Vec<u8> {
273    let mut out = Vec::with_capacity(
274        8 + 1
275            + 4
276            + catalog.len()
277            + 4
278            + users.len()
279            + 4
280            + pubs.len()
281            + 4
282            + subs.len()
283            + 4
284            + stats.len()
285            + 4,
286    );
287    out.extend_from_slice(ENVELOPE_MAGIC);
288    out.push(ENVELOPE_VERSION_V5);
289    out.extend_from_slice(
290        &u32::try_from(catalog.len())
291            .expect("≤ 4G catalog")
292            .to_le_bytes(),
293    );
294    out.extend_from_slice(catalog);
295    out.extend_from_slice(
296        &u32::try_from(users.len())
297            .expect("≤ 4G users")
298            .to_le_bytes(),
299    );
300    out.extend_from_slice(users);
301    out.extend_from_slice(
302        &u32::try_from(pubs.len())
303            .expect("≤ 4G publications")
304            .to_le_bytes(),
305    );
306    out.extend_from_slice(pubs);
307    out.extend_from_slice(
308        &u32::try_from(subs.len())
309            .expect("≤ 4G subscriptions")
310            .to_le_bytes(),
311    );
312    out.extend_from_slice(subs);
313    out.extend_from_slice(
314        &u32::try_from(stats.len())
315            .expect("≤ 4G statistics")
316            .to_le_bytes(),
317    );
318    out.extend_from_slice(stats);
319    let crc = spg_crypto::crc32::crc32(&out);
320    out.extend_from_slice(&crc.to_le_bytes());
321    out
322}
323
/// Outcome of envelope parsing.
///
/// * `Bare` — the buffer is not a well-formed envelope; the caller
///   falls back to treating it as a bare catalog blob, which keeps v3.x
///   snapshots readable.
/// * `Pair` — a v1–v5 envelope was split into its sections. `catalog`
///   and `users` are always present. The trailers are `None` when the
///   envelope version predates them: `publications` is set from v3 on,
///   `subscriptions` from v4 on, `statistics` only in v5.
/// * `CrcMismatch` — a v2+ envelope whose trailing CRC32 (`expected`)
///   differs from the checksum of the body (`computed`).
enum EnvelopeParse<'a> {
    Bare,
    Pair {
        catalog: &'a [u8],
        users: &'a [u8],
        publications: Option<&'a [u8]>,
        subscriptions: Option<&'a [u8]>,
        statistics: Option<&'a [u8]>,
    },
    CrcMismatch {
        expected: u32,
        computed: u32,
    },
}
344
345/// Returns `EnvelopeParse::Pair` for a valid v1 / v2 / v3 envelope,
346/// `Bare` for a buffer that doesn't look like an envelope (v3.x
347/// bare catalog fallback), and `CrcMismatch` for a v2/v3 envelope
348/// whose trailing CRC32 doesn't match the body.
349fn split_envelope(buf: &[u8]) -> EnvelopeParse<'_> {
350    if buf.len() < 8 + 1 + 4 || &buf[..8] != ENVELOPE_MAGIC {
351        return EnvelopeParse::Bare;
352    }
353    let version = buf[8];
354    if !matches!(
355        version,
356        ENVELOPE_VERSION_V1
357            | ENVELOPE_VERSION_V2
358            | ENVELOPE_VERSION_V3
359            | ENVELOPE_VERSION_V4
360            | ENVELOPE_VERSION_V5
361    ) {
362        return EnvelopeParse::Bare;
363    }
364    let mut p = 9usize;
365    let Some(cat_len_bytes) = buf.get(p..p + 4) else {
366        return EnvelopeParse::Bare;
367    };
368    let Ok(cat_len_arr) = cat_len_bytes.try_into() else {
369        return EnvelopeParse::Bare;
370    };
371    let cat_len = u32::from_le_bytes(cat_len_arr) as usize;
372    p += 4;
373    if p + cat_len + 4 > buf.len() {
374        return EnvelopeParse::Bare;
375    }
376    let catalog = &buf[p..p + cat_len];
377    p += cat_len;
378    let Some(user_len_bytes) = buf.get(p..p + 4) else {
379        return EnvelopeParse::Bare;
380    };
381    let Ok(user_len_arr) = user_len_bytes.try_into() else {
382        return EnvelopeParse::Bare;
383    };
384    let user_len = u32::from_le_bytes(user_len_arr) as usize;
385    p += 4;
386    if p + user_len > buf.len() {
387        return EnvelopeParse::Bare;
388    }
389    let users = &buf[p..p + user_len];
390    p += user_len;
391    let publications = if matches!(
392        version,
393        ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
394    ) {
395        // [u32 pubs_len][publications bytes]
396        let Some(pubs_len_bytes) = buf.get(p..p + 4) else {
397            return EnvelopeParse::Bare;
398        };
399        let Ok(pubs_len_arr) = pubs_len_bytes.try_into() else {
400            return EnvelopeParse::Bare;
401        };
402        let pubs_len = u32::from_le_bytes(pubs_len_arr) as usize;
403        p += 4;
404        if p + pubs_len > buf.len() {
405            return EnvelopeParse::Bare;
406        }
407        let pubs_slice = &buf[p..p + pubs_len];
408        p += pubs_len;
409        Some(pubs_slice)
410    } else {
411        None
412    };
413    let subscriptions = if matches!(version, ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5) {
414        // [u32 subs_len][subscriptions bytes]
415        let Some(subs_len_bytes) = buf.get(p..p + 4) else {
416            return EnvelopeParse::Bare;
417        };
418        let Ok(subs_len_arr) = subs_len_bytes.try_into() else {
419            return EnvelopeParse::Bare;
420        };
421        let subs_len = u32::from_le_bytes(subs_len_arr) as usize;
422        p += 4;
423        if p + subs_len > buf.len() {
424            return EnvelopeParse::Bare;
425        }
426        let subs_slice = &buf[p..p + subs_len];
427        p += subs_len;
428        Some(subs_slice)
429    } else {
430        None
431    };
432    let statistics = if version == ENVELOPE_VERSION_V5 {
433        // [u32 stats_len][statistics bytes]
434        let Some(stats_len_bytes) = buf.get(p..p + 4) else {
435            return EnvelopeParse::Bare;
436        };
437        let Ok(stats_len_arr) = stats_len_bytes.try_into() else {
438            return EnvelopeParse::Bare;
439        };
440        let stats_len = u32::from_le_bytes(stats_len_arr) as usize;
441        p += 4;
442        if p + stats_len > buf.len() {
443            return EnvelopeParse::Bare;
444        }
445        let stats_slice = &buf[p..p + stats_len];
446        p += stats_len;
447        Some(stats_slice)
448    } else {
449        None
450    };
451    if matches!(
452        version,
453        ENVELOPE_VERSION_V2 | ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
454    ) {
455        if p + 4 != buf.len() {
456            return EnvelopeParse::Bare;
457        }
458        let Ok(crc_arr) = buf[p..p + 4].try_into() else {
459            return EnvelopeParse::Bare;
460        };
461        let expected = u32::from_le_bytes(crc_arr);
462        let computed = spg_crypto::crc32::crc32(&buf[..p]);
463        if expected != computed {
464            return EnvelopeParse::CrcMismatch { expected, computed };
465        }
466    } else if p != buf.len() {
467        // v1: must end exactly at the users section.
468        return EnvelopeParse::Bare;
469    }
470    EnvelopeParse::Pair {
471        catalog,
472        users,
473        publications,
474        subscriptions,
475        statistics,
476    }
477}
478
/// v4.41.1 opaque transaction handle.
///
/// Handed out by `Engine::alloc_tx_id` and threaded through
/// `Engine::execute_in`, so dispatch can tell which in-flight TX a
/// statement belongs to. `IMPLICIT_TX` is the reserved slot that every
/// legacy caller — engine self-tests, spg-cli, spg-embedded, startup
/// replay — uses implicitly through the unchanged `Engine::execute(sql)`
/// API. v4.41.1 keeps at most one active slot at runtime (dispatch holds
/// `engine.write()` across the wrap, same as v4.34); the map shape is
/// there so v4.42 can turn on N in-flight implicit TXs without
/// reshuffling the engine internals.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TxId(pub u64);

/// Slot reserved for `Engine::execute(sql)` — the legacy
/// single-global-shadow path. Handles from `alloc_tx_id` start at 1.
pub const IMPLICIT_TX: TxId = TxId(0);
494
/// v6.7.3 — default segment-size threshold (4 MiB) for `COMPACT COLD
/// SEGMENTS` when no explicit target is given. A segment is eligible to
/// merge when its `OwnedSegment::bytes().len()` is **strictly** below
/// this value. spg-server overrides it from
/// `SPG_COMPACTION_TARGET_SEGMENT_BYTES`.
pub const COMPACTION_TARGET_DEFAULT_BYTES: u64 = {
    const MIB: u64 = 1 << 20;
    4 * MIB
};
501
502/// Per-slot transaction state. Held inside `tx_catalogs[tx_id]` for the
503/// lifetime of a BEGIN..COMMIT (or BEGIN..ROLLBACK) window. Drops when
504/// the TX commits (its `catalog` is moved over `Engine.catalog`) or
505/// rolls back (slot removed, catalog discarded).
506#[derive(Debug, Default, Clone)]
507struct TxState {
508    /// The TX's shadow copy of the catalog. Started as a clone of
509    /// `Engine.catalog` at BEGIN time; writes flow into it; COMMIT
510    /// installs it over `Engine.catalog`. `Catalog::clone()` is O(1)
511    /// since v4.40 (`PersistentVec` rows + `PersistentBTreeMap` indices).
512    catalog: Catalog,
513    /// Per-TX savepoint stack. Each entry pairs the savepoint name with
514    /// a clone of `catalog` at the moment `SAVEPOINT <name>` fired.
515    /// `ROLLBACK TO <name>` restores from the entry and pops everything
516    /// after it; `RELEASE <name>` discards the entry and everything
517    /// after; COMMIT/ROLLBACK clears the whole stack.
518    savepoints: Vec<(String, Catalog)>,
519}
520
521#[derive(Debug, Default)]
522pub struct Engine {
523    /// Committed catalog — what survives `Engine::snapshot()` and what
524    /// outside-TX `SELECT`s read.
525    catalog: Catalog,
526    /// Active TX slots, keyed by `TxId`. Empty when no TX is in flight.
527    /// v4.41.1 runtime invariant: at most one entry (single-writer
528    /// model unchanged). v4.42 will let dispatch hold multiple entries
529    /// concurrently for group commit + engine MVCC.
530    tx_catalogs: BTreeMap<TxId, TxState>,
531    /// Which slot the next exec_* call should mutate. Set by
532    /// `execute_in(sql, tx_id)` at the entry point; legacy `execute(sql)`
533    /// sets it to `IMPLICIT_TX`. None when no TX is in flight (read /
534    /// write goes straight against `catalog`).
535    current_tx: Option<TxId>,
536    /// Monotonic counter for `alloc_tx_id`. Starts at 1 — slot 0 is
537    /// reserved for `IMPLICIT_TX`.
538    next_tx_id: u64,
539    /// Optional wall clock used to satisfy `NOW()` / `CURRENT_TIMESTAMP`
540    /// / `CURRENT_DATE`. Set by the host environment.
541    clock: Option<ClockFn>,
542    /// v4.1 cryptographic RNG for per-user password salt. Set by the
543    /// host. `None` means SQL-driven `CREATE USER` uses a
544    /// deterministic fallback — see `SaltFn`.
545    salt_fn: Option<SaltFn>,
546    /// v4.2 per-query row cap. `None` = unlimited. When set, a
547    /// SELECT that materialises more than `n` rows returns
548    /// `EngineError::RowLimitExceeded`. Enforced before the result
549    /// is shaped into wire frames so a runaway scan can't blow the
550    /// server's heap.
551    max_query_rows: Option<usize>,
552    /// v4.1 RBAC user table. Empty means "no RBAC configured yet" —
553    /// the server decides what that means at the auth boundary
554    /// (open mode vs legacy single-password mode). User CRUD goes
555    /// through `create_user`/`drop_user`/`verify_user`; persistence
556    /// rides the snapshot envelope alongside the catalog.
557    users: UserStore,
558    /// v6.1.2 logical-replication publication catalog. Empty until
559    /// `CREATE PUBLICATION` runs. Persistence rides the v3 envelope
560    /// trailer (see `build_envelope`).
561    publications: publications::Publications,
562    /// v6.1.4 logical-replication subscription catalog. Empty until
563    /// `CREATE SUBSCRIPTION` runs. Persistence rides the v4 envelope
564    /// trailer.
565    subscriptions: subscriptions::Subscriptions,
566    /// v6.2.0 — per-column statistics for the cost-based optimizer.
567    /// Populated by `ANALYZE`; queried via `spg_statistic` virtual
568    /// table. Persistence rides the v5 envelope trailer.
569    statistics: statistics::Statistics,
570    /// v6.3.0 — engine-level plan cache. Caches the post-`prepare()`
571    /// `Statement` keyed on SQL text. In-memory only — does NOT ride
572    /// the snapshot envelope (rebuilt on demand after restart).
573    plan_cache: plan_cache::PlanCache,
574    /// v6.5.1 — per-distinct-SQL execution stats. In-memory only,
575    /// surfaced via `spg_stat_query` virtual table. Updated by the
576    /// `execute_*` paths after a successful execute.
577    query_stats: query_stats::QueryStats,
578    /// v6.5.2 — connection-state provider callback. spg-server
579    /// registers a function at startup that snapshots its
580    /// per-pgwire-connection registry into `ActivityRow`s; engine
581    /// reads through it on every `SELECT * FROM spg_stat_activity`.
582    /// `None` ⇒ no-data (returns empty rows; matches the no_std
583    /// embedded callers that don't run pgwire).
584    activity_provider: Option<ActivityProvider>,
585    /// v6.5.3 — audit-chain provider + verifier. Same pattern as
586    /// activity_provider: spg-server registers both at startup;
587    /// engine reads through on `SELECT * FROM spg_audit_chain` and
588    /// `SELECT * FROM spg_audit_verify`. `None` ⇒ no-data.
589    audit_chain_provider: Option<AuditChainProvider>,
590    audit_verifier: Option<AuditVerifier>,
591    /// v6.5.6 — slow-query log threshold in microseconds. When set,
592    /// every successful execute whose elapsed exceeds the threshold
593    /// gets fed to the registered slow-query log callback (so
594    /// spg-server can emit a structured log line). Default `None`
595    /// = no slow-query logging.
596    slow_query_threshold_us: Option<u64>,
597    slow_query_logger: Option<SlowQueryLogger>,
598}
599
/// v6.5.6 — callback signature for slow-query log emission. Invoked as
/// `(sql, elapsed_us)`, once per successful execute whose elapsed time
/// crosses the configured threshold.
pub type SlowQueryLogger = fn(&str, u64);
604
605/// v6.5.4 — synthesise a `CREATE TABLE` statement from catalog
606/// state. Round-trips through `Engine::execute` to recreate the
607/// same schema (sans data + indexes — indexes are emitted as a
608/// separate `CREATE INDEX` chain in `spg_database_ddl`).
609fn render_create_table(name: &str, columns: &[ColumnSchema]) -> String {
610    let mut out = alloc::format!("CREATE TABLE {name} (");
611    for (i, col) in columns.iter().enumerate() {
612        if i > 0 {
613            out.push_str(", ");
614        }
615        out.push_str(&col.name);
616        out.push(' ');
617        out.push_str(&render_data_type(col.ty));
618        if !col.nullable {
619            out.push_str(" NOT NULL");
620        }
621        if col.auto_increment {
622            out.push_str(" AUTO_INCREMENT");
623        }
624    }
625    out.push(')');
626    out
627}
628
629fn render_data_type(ty: DataType) -> String {
630    match ty {
631        DataType::SmallInt => "SMALLINT".into(),
632        DataType::Int => "INT".into(),
633        DataType::BigInt => "BIGINT".into(),
634        DataType::Float => "FLOAT".into(),
635        DataType::Text => "TEXT".into(),
636        DataType::Varchar(n) => alloc::format!("VARCHAR({n})"),
637        DataType::Char(n) => alloc::format!("CHAR({n})"),
638        DataType::Bool => "BOOL".into(),
639        DataType::Vector { dim, encoding } => match encoding {
640            spg_storage::VecEncoding::F32 => alloc::format!("VECTOR({dim})"),
641            spg_storage::VecEncoding::Sq8 => alloc::format!("VECTOR({dim}) USING SQ8"),
642            spg_storage::VecEncoding::F16 => alloc::format!("VECTOR({dim}) USING HALF"),
643        },
644        DataType::Numeric { precision, scale } => {
645            alloc::format!("NUMERIC({precision},{scale})")
646        }
647        DataType::Date => "DATE".into(),
648        DataType::Timestamp => "TIMESTAMP".into(),
649        DataType::Interval => "INTERVAL".into(),
650        DataType::Json => "JSON".into(),
651        DataType::Jsonb => "JSONB".into(),
652        DataType::Timestamptz => "TIMESTAMPTZ".into(),
653    }
654}
655
/// v6.5.2 — a single row of `spg_stat_activity`. Public so spg-server
/// can build rows itself without the engine re-exporting internal
/// dispatch types.
///
/// NOTE(review): field meanings are set by the provider in spg-server;
/// the `_us` suffix suggests microseconds — confirm against the
/// provider.
#[derive(Clone, Debug)]
pub struct ActivityRow {
    pub pid: u32,
    pub user: String,
    pub started_at_us: i64,
    pub current_sql: String,
    pub wait_event: String,
    pub elapsed_us: i64,
    pub in_transaction: bool,
}
669
/// v6.5.2 — provider callback type for `spg_stat_activity`. Each call
/// returns a fresh snapshot as an owned `Vec`; the engine does not
/// cache the result between calls.
pub type ActivityProvider = fn() -> Vec<ActivityRow>;
673
/// v6.5.3 — a single row of `spg_audit_chain`. Public so spg-server can
/// build rows directly from its `AuditEntry`.
///
/// NOTE(review): the `_hex` / `_ms` suffixes suggest hex-encoded hashes
/// and a millisecond timestamp — confirm against the provider.
#[derive(Clone, Debug)]
pub struct AuditRow {
    pub seq: i64,
    pub ts_ms: i64,
    pub prev_hash_hex: String,
    pub entry_hash_hex: String,
    pub sql: String,
}
684
/// v6.5.3 — chain-table provider. spg-server registers a fn pointer
/// that snapshots the audit log into `AuditRow`s.
pub type AuditChainProvider = fn() -> Vec<AuditRow>;
/// v6.5.3 — chain verifier. spg-server registers a fn pointer that
/// verifies the audit log and returns `(verified_count, broken_at_seq)`;
/// `broken_at_seq` is `-1` on a clean chain.
pub type AuditVerifier = fn() -> (i64, i64);
691
692impl Engine {
    /// Creates an empty engine: a fresh catalog, no open transactions,
    /// empty user / publication / subscription / statistics tables,
    /// empty plan cache and query stats, and no host callbacks (clock,
    /// salt RNG, activity / audit providers, slow-query logger)
    /// installed.
    pub fn new() -> Self {
        Self {
            catalog: Catalog::new(),
            tx_catalogs: BTreeMap::new(),
            current_tx: None,
            // Slot 0 is reserved for `IMPLICIT_TX`, so the counter
            // starts at 1.
            next_tx_id: 1,
            clock: None,
            salt_fn: None,
            max_query_rows: None,
            users: UserStore::new(),
            publications: publications::Publications::new(),
            subscriptions: subscriptions::Subscriptions::new(),
            statistics: statistics::Statistics::new(),
            plan_cache: plan_cache::PlanCache::new(),
            query_stats: query_stats::QueryStats::new(),
            activity_provider: None,
            audit_chain_provider: None,
            audit_verifier: None,
            slow_query_threshold_us: None,
            slow_query_logger: None,
        }
    }
715
716    /// Construct an engine restored from a previously-snapshotted catalog
717    /// (see `snapshot()`).
718    pub fn restore(catalog: Catalog) -> Self {
719        Self {
720            catalog,
721            tx_catalogs: BTreeMap::new(),
722            current_tx: None,
723            next_tx_id: 1,
724            clock: None,
725            salt_fn: None,
726            max_query_rows: None,
727            users: UserStore::new(),
728            publications: publications::Publications::new(),
729            subscriptions: subscriptions::Subscriptions::new(),
730            statistics: statistics::Statistics::new(),
731            plan_cache: plan_cache::PlanCache::new(),
732            query_stats: query_stats::QueryStats::new(),
733            activity_provider: None,
734            audit_chain_provider: None,
735            audit_verifier: None,
736            slow_query_threshold_us: None,
737            slow_query_logger: None,
738        }
739    }
740
741    /// Restore an engine + user table from a v4.1 envelope produced
742    /// by `snapshot_with_users()`. Falls back to plain catalog-only
743    /// restore if the envelope magic isn't present (so v3.x snapshot
744    /// files still load). v6.1.2 adds the optional publications
745    /// trailer (envelope v3); a v1/v2 envelope deserialises to an
746    /// empty publication table.
747    pub fn restore_envelope(buf: &[u8]) -> Result<Self, EngineError> {
748        match split_envelope(buf) {
749            EnvelopeParse::Pair {
750                catalog: catalog_bytes,
751                users: user_bytes,
752                publications: pub_bytes,
753                subscriptions: sub_bytes,
754                statistics: stats_bytes,
755            } => {
756                let catalog = Catalog::deserialize(catalog_bytes).map_err(EngineError::Storage)?;
757                let users = users::deserialize_users(user_bytes)
758                    .map_err(|e| EngineError::Unsupported(alloc::format!("users restore: {e}")))?;
759                let publications = match pub_bytes {
760                    Some(b) => publications::Publications::deserialize(b).map_err(|e| {
761                        EngineError::Unsupported(alloc::format!("publications restore: {e:?}"))
762                    })?,
763                    None => publications::Publications::new(),
764                };
765                let subscriptions = match sub_bytes {
766                    Some(b) => subscriptions::Subscriptions::deserialize(b).map_err(|e| {
767                        EngineError::Unsupported(alloc::format!("subscriptions restore: {e:?}"))
768                    })?,
769                    None => subscriptions::Subscriptions::new(),
770                };
771                let statistics = match stats_bytes {
772                    Some(b) => statistics::Statistics::deserialize(b).map_err(|e| {
773                        EngineError::Unsupported(alloc::format!("statistics restore: {e:?}"))
774                    })?,
775                    None => statistics::Statistics::new(),
776                };
777                Ok(Self {
778                    catalog,
779                    tx_catalogs: BTreeMap::new(),
780                    current_tx: None,
781                    next_tx_id: 1,
782                    clock: None,
783                    salt_fn: None,
784                    max_query_rows: None,
785                    users,
786                    publications,
787                    subscriptions,
788                    statistics,
789                    plan_cache: plan_cache::PlanCache::new(),
790                    query_stats: query_stats::QueryStats::new(),
791                    activity_provider: None,
792                    audit_chain_provider: None,
793                    audit_verifier: None,
794                    slow_query_threshold_us: None,
795                    slow_query_logger: None,
796                })
797            }
798            EnvelopeParse::CrcMismatch { expected, computed } => {
799                Err(EngineError::Storage(StorageError::Corrupt(alloc::format!(
800                    "snapshot envelope CRC32 mismatch (expected={expected:#010x}, computed={computed:#010x})"
801                ))))
802            }
803            EnvelopeParse::Bare => {
804                let catalog = Catalog::deserialize(buf).map_err(EngineError::Storage)?;
805                Ok(Self::restore(catalog))
806            }
807        }
808    }
809
810    pub const fn users(&self) -> &UserStore {
811        &self.users
812    }
813
814    /// `salt` is supplied by the caller (the host has a random
815    /// source; the engine is `no_std`). Caller should pass a fresh
816    /// 16-byte random value per user.
817    pub fn create_user(
818        &mut self,
819        name: &str,
820        password: &str,
821        role: Role,
822        salt: [u8; 16],
823    ) -> Result<(), UserError> {
824        self.users.create(name, password, role, salt)?;
825        // v4.8: also derive SCRAM-SHA-256 secrets so PG-wire SASL
826        // auth can verify without re-running PBKDF2 per attempt.
827        // Uses a fresh salt from the host RNG (falls back to a
828        // deterministic per-username salt when no RNG is wired, same
829        // as the legacy hash path).
830        let scram_salt = self.salt_fn.map_or_else(
831            || {
832                let mut s = [0u8; users::SCRAM_SALT_LEN];
833                let digest = spg_crypto::hash(name.as_bytes());
834                // Use bytes 16..32 of BLAKE3 so we don't reuse the
835                // exact same fallback salt as the BLAKE3 hash path.
836                s.copy_from_slice(&digest[16..32]);
837                s
838            },
839            |f| f(),
840        );
841        self.users
842            .enable_scram(name, password, scram_salt, users::SCRAM_DEFAULT_ITERS)?;
843        Ok(())
844    }
845
846    pub fn drop_user(&mut self, name: &str) -> Result<(), UserError> {
847        self.users.drop(name)
848    }
849
850    pub fn verify_user(&self, name: &str, password: &str) -> Option<Role> {
851        self.users.verify(name, password)
852    }
853
854    /// Builder: attach a wall clock so `NOW()` / `CURRENT_TIMESTAMP` /
855    /// `CURRENT_DATE` evaluate to a real value instead of erroring out.
856    #[must_use]
857    pub const fn with_clock(mut self, clock: ClockFn) -> Self {
858        self.clock = Some(clock);
859        self
860    }
861
862    /// Builder: attach an OS-backed RNG for per-user password salts.
863    /// The host (`spg-server`) typically wires this to `/dev/urandom`.
864    #[must_use]
865    pub const fn with_salt_fn(mut self, f: SaltFn) -> Self {
866        self.salt_fn = Some(f);
867        self
868    }
869
870    /// Builder: cap the number of rows a single SELECT may return.
871    /// Exceeding the cap raises `EngineError::RowLimitExceeded` —
872    /// the bound is checked inside the executor so a runaway
873    /// catalog scan can't allocate millions of rows before the
874    /// server gets a chance to reject the result.
875    #[must_use]
876    pub const fn with_max_query_rows(mut self, n: usize) -> Self {
877        self.max_query_rows = Some(n);
878        self
879    }
880
881    /// The *committed* catalog. Note: during a transaction this returns the
882    /// pre-TX state — `SELECT` inside a TX goes through `execute()` and reads
883    /// the shadow. Tests that inspect outside-TX state should use this.
884    pub const fn catalog(&self) -> &Catalog {
885        &self.catalog
886    }
887
888    /// Serialize the *committed* catalog to bytes. v0.6 was full-snapshot; v0.9
889    /// adds the rule that an open TX's shadow is never snapshotted — only the
890    /// post-COMMIT state is persisted. v4.1 wraps the catalog in an envelope
891    /// when there are users to persist; an empty user table snapshots as the
892    /// bare catalog format (backwards-compat with v3.x readers). v6.1.2
893    /// adds publications to the envelope condition: either non-empty
894    /// users OR non-empty publications now triggers the envelope path.
895    pub fn snapshot(&self) -> Vec<u8> {
896        if self.users.is_empty()
897            && self.publications.is_empty()
898            && self.subscriptions.is_empty()
899            && self.statistics.is_empty()
900        {
901            self.catalog.serialize()
902        } else {
903            build_envelope(
904                &self.catalog.serialize(),
905                &users::serialize_users(&self.users),
906                &self.publications.serialize(),
907                &self.subscriptions.serialize(),
908                &self.statistics.serialize(),
909            )
910        }
911    }
912
913    /// True when at least one TX slot is in flight. v4.41.1 runtime
914    /// invariant: at most one slot active at a time (dispatch holds
915    /// `engine.write()` across the entire wrap). v4.42 will let this
916    /// return true with multiple slots concurrently.
917    pub fn in_transaction(&self) -> bool {
918        !self.tx_catalogs.is_empty()
919    }
920
921    /// v4.41.1 allocate a fresh TX handle. Used by spg-server dispatch
922    /// to scope each implicit-wrap BEGIN..stmt..COMMIT to its own slot
923    /// in `tx_catalogs`. v4.42 — the commit-barrier leader allocates
924    /// one of these per task in its group, runs `BEGIN`+sql+`COMMIT`
925    /// sequentially under a single `engine.write()` so each task's
926    /// mutations accumulate into shared state, then either keeps the
927    /// accumulated state (fsync OK) or restores the pre-image via
928    /// `replace_catalog` (fsync err).
929    pub fn alloc_tx_id(&mut self) -> TxId {
930        let id = TxId(self.next_tx_id);
931        self.next_tx_id = self.next_tx_id.saturating_add(1);
932        id
933    }
934
935    /// v4.42 — atomically replace the live catalog. Used by the
936    /// commit-barrier leader to roll back a group whose batched
937    /// fsync failed: the leader snapshots `engine.catalog().clone()`
938    /// (O(1) Arc bump after the v4.39/v4.40 persistent migration)
939    /// at group start, sequentially applies each task's BEGIN+sql+
940    /// COMMIT under the same write lock to accumulate mutations
941    /// into shared state, batches the WAL bytes, fsyncs once, and
942    /// on failure calls this with the pre-image to undo every
943    /// task in the group at once.
944    ///
945    /// **Does NOT touch `tx_catalogs` / `current_tx`.** Any
946    /// explicit-TX slot from a concurrent client (created via the
947    /// legacy `IMPLICIT_TX`-less dispatch path or via the future
948    /// MVCC-readers v5+ work) has its own snapshot baked into the
949    /// slot — restoring `self.catalog` to the pre-image leaves
950    /// those slots untouched, exactly as they were when the leader
951    /// took the lock. The leader's own implicit-TX slots are all
952    /// already discarded (`exec_commit` removed them as each
953    /// task's COMMIT ran) by the time this is reached.
954    pub fn replace_catalog(&mut self, catalog: Catalog) {
955        self.catalog = catalog;
956    }
957
958    /// v6.7.0 — public shim around `Catalog::freeze_oldest_to_cold`
959    /// so tests + the spg-server freezer can drive a freeze without
960    /// reaching into the private `active_catalog_mut`. v6.7.4
961    /// parallel freezer will build on this surface.
962    ///
963    /// Marks the table's cached `cold_row_count` stale because the
964    /// freeze added cold locators that ANALYZE hasn't yet refreshed.
965    pub fn freeze_oldest_to_cold(
966        &mut self,
967        table_name: &str,
968        index_name: &str,
969        max_rows: usize,
970    ) -> Result<spg_storage::FreezeReport, EngineError> {
971        let report = self
972            .active_catalog_mut()
973            .freeze_oldest_to_cold(table_name, index_name, max_rows)
974            .map_err(EngineError::Storage)?;
975        if let Some(t) = self.active_catalog_mut().get_mut(table_name) {
976            t.mark_cold_row_count_stale();
977        }
978        Ok(report)
979    }
980
981    /// v6.7.5 — public shim used by the spg-server follower's
982    /// segment-forwarding receiver. Registers a cold-tier segment
983    /// at a specific id (the master's id, as transmitted on the
984    /// wire) so the follower's BTree-Cold locators stay byte-
985    /// identical with the master's. Wraps
986    /// `Catalog::load_segment_bytes_at` under the standard
987    /// clone-mutate-replace pattern.
988    ///
989    /// Returns `Ok(())` on success **and** on the "slot already
990    /// occupied" case — a follower mid-reconnect may receive a
991    /// segment chunk for a segment_id it already has on disk
992    /// (forwarded last session); the caller should treat that
993    /// path as a no-op rather than a fatal error.
994    pub fn receive_cold_segment(
995        &mut self,
996        segment_id: u32,
997        bytes: Vec<u8>,
998    ) -> Result<(), EngineError> {
999        let mut new_cat = self.catalog.clone();
1000        match new_cat.load_segment_bytes_at(segment_id, bytes) {
1001            Ok(()) => {
1002                self.replace_catalog(new_cat);
1003                Ok(())
1004            }
1005            Err(StorageError::Corrupt(msg)) if msg.contains("already occupied") => Ok(()),
1006            Err(e) => Err(EngineError::Storage(e)),
1007        }
1008    }
1009
1010    /// v6.7.3 — public shim around `Catalog::compact_cold_segments`
1011    /// driving every BTree index on every user table. Returns one
1012    /// `(table, index, report)` triple for each merge that
1013    /// actually happened (no-op (table, index) pairs are filtered
1014    /// out so callers can size persist-side work to the live
1015    /// merges). Caller is responsible for persisting each
1016    /// `report.merged_segment_bytes` and updating the on-disk
1017    /// segment registry; engine layer is no_std and never
1018    /// touches disk.
1019    ///
1020    /// Marks every touched table's cached `cold_row_count` stale
1021    /// — compaction GC'd some shadowed rows, so the count must be
1022    /// re-derived on the next ANALYZE.
1023    pub fn compact_cold_segments_with_target(
1024        &mut self,
1025        target_segment_bytes: u64,
1026    ) -> Result<Vec<(String, String, CompactReport)>, EngineError> {
1027        let table_names = self.active_catalog().table_names();
1028        let mut reports: Vec<(String, String, CompactReport)> = Vec::new();
1029        for tname in table_names {
1030            if is_internal_table_name(&tname) {
1031                continue;
1032            }
1033            let idx_names: Vec<String> = {
1034                let Some(t) = self.active_catalog().get(&tname) else {
1035                    continue;
1036                };
1037                t.indices()
1038                    .iter()
1039                    .filter(|i| matches!(i.kind, IndexKind::BTree(_)))
1040                    .map(|i| i.name.clone())
1041                    .collect()
1042            };
1043            for iname in idx_names {
1044                let report = self
1045                    .active_catalog_mut()
1046                    .compact_cold_segments(&tname, &iname, target_segment_bytes)
1047                    .map_err(EngineError::Storage)?;
1048                if report.merged_segment_id.is_some() {
1049                    if let Some(t) = self.active_catalog_mut().get_mut(&tname) {
1050                        t.mark_cold_row_count_stale();
1051                    }
1052                    reports.push((tname.clone(), iname, report));
1053                }
1054            }
1055        }
1056        Ok(reports)
1057    }
1058
1059    fn active_catalog(&self) -> &Catalog {
1060        match self.current_tx {
1061            Some(t) => self
1062                .tx_catalogs
1063                .get(&t)
1064                .map_or(&self.catalog, |s| &s.catalog),
1065            None => &self.catalog,
1066        }
1067    }
1068
1069    fn active_catalog_mut(&mut self) -> &mut Catalog {
1070        let tx = self.current_tx;
1071        match tx {
1072            Some(t) => match self.tx_catalogs.get_mut(&t) {
1073                Some(s) => &mut s.catalog,
1074                None => &mut self.catalog,
1075            },
1076            None => &mut self.catalog,
1077        }
1078    }
1079
1080    /// Read-only execute path. Succeeds for `SELECT` / `SHOW TABLES`
1081    /// / `SHOW COLUMNS`; returns `EngineError::WriteRequired` for
1082    /// every other statement, so the caller can fall through to the
1083    /// `&mut self` `execute` path under a write lock. Engine state is
1084    /// not mutated even on the success path (`rewrite_clock_calls`
1085    /// and `resolve_order_by_position` both mutate the locally-owned
1086    /// AST, not `self`).
1087    ///
1088    /// **v4.0 concurrency**: this is the entry point the server takes
1089    /// under an `RwLock::read()` so multiple `SELECT` clients run in
1090    /// parallel without serialising on a single mutex.
1091    pub fn execute_readonly(&self, sql: &str) -> Result<QueryResult, EngineError> {
1092        self.execute_readonly_with_cancel(sql, CancelToken::none())
1093    }
1094
1095    /// v4.5 — read path with cooperative cancellation. Token's
1096    /// `is_cancelled` is checked at the start (so a watchdog that
1097    /// already fired returns Cancelled immediately) and at row-loop
1098    /// checkpoints inside `exec_select`. SHOW paths are O(small) and
1099    /// don't bother checking.
1100    pub fn execute_readonly_with_cancel(
1101        &self,
1102        sql: &str,
1103        cancel: CancelToken<'_>,
1104    ) -> Result<QueryResult, EngineError> {
1105        cancel.check()?;
1106        let mut stmt = parser::parse_statement(sql)?;
1107        let now_micros = self.clock.map(|f| f());
1108        rewrite_clock_calls(&mut stmt, now_micros);
1109        if let Statement::Select(s) = &mut stmt {
1110            resolve_order_by_position(s);
1111            // v6.2.3 — cost-based JOIN reorder (read path).
1112            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1113        }
1114        let result = match stmt {
1115            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1116            Statement::ShowTables => Ok(self.exec_show_tables()),
1117            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1118            Statement::ShowUsers => Ok(self.exec_show_users()),
1119            Statement::ShowPublications => Ok(self.exec_show_publications()),
1120            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1121            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1122                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1123            )),
1124            Statement::Explain(e) => self.exec_explain(&e, cancel),
1125            _ => Err(EngineError::WriteRequired),
1126        };
1127        self.enforce_row_limit(result)
1128    }
1129
1130    /// v4.2: cap result-set size. Applied after the executor
1131    /// materialises rows but before they leave the engine — wrapping
1132    /// every Rows-returning exec_* function would scatter the check.
1133    fn enforce_row_limit(
1134        &self,
1135        result: Result<QueryResult, EngineError>,
1136    ) -> Result<QueryResult, EngineError> {
1137        if let (Ok(QueryResult::Rows { rows, .. }), Some(cap)) = (&result, self.max_query_rows)
1138            && rows.len() > cap
1139        {
1140            return Err(EngineError::RowLimitExceeded(cap));
1141        }
1142        result
1143    }
1144
1145    pub fn execute(&mut self, sql: &str) -> Result<QueryResult, EngineError> {
1146        self.execute_in_with_cancel(sql, IMPLICIT_TX, CancelToken::none())
1147    }
1148
1149    /// v4.5 — write path with cooperative cancellation. Same dispatch
1150    /// as `execute_in_with_cancel(sql, IMPLICIT_TX, cancel)`. Kept as
1151    /// a separate entry point for backward-compat with the v4.5
1152    /// public API.
1153    pub fn execute_with_cancel(
1154        &mut self,
1155        sql: &str,
1156        cancel: CancelToken<'_>,
1157    ) -> Result<QueryResult, EngineError> {
1158        self.execute_in_with_cancel(sql, IMPLICIT_TX, cancel)
1159    }
1160
1161    /// v4.41.1 multi-slot write entry. Routes `sql` through the TX
1162    /// slot identified by `tx_id` so spg-server dispatch can scope
1163    /// each implicit-wrap BEGIN..stmt..COMMIT to its own slot in
1164    /// `tx_catalogs`. `IMPLICIT_TX` is the legacy single-slot path
1165    /// every other caller (engine self-tests, replay, spg-embedded)
1166    /// implicitly takes via `execute()` / `execute_with_cancel()`.
1167    pub fn execute_in(&mut self, sql: &str, tx_id: TxId) -> Result<QueryResult, EngineError> {
1168        self.execute_in_with_cancel(sql, tx_id, CancelToken::none())
1169    }
1170
1171    /// v4.41.1 write path with cooperative cancellation + explicit TX
1172    /// scope. Sets `self.current_tx` for the duration of the call so
1173    /// every `exec_*` helper transparently sees its TX's shadow
1174    /// catalog and savepoint stack; restores on exit so the field is
1175    /// only valid mid-call (no leakage across calls).
1176    pub fn execute_in_with_cancel(
1177        &mut self,
1178        sql: &str,
1179        tx_id: TxId,
1180        cancel: CancelToken<'_>,
1181    ) -> Result<QueryResult, EngineError> {
1182        let saved = self.current_tx;
1183        self.current_tx = Some(tx_id);
1184        let result = self.execute_inner_with_cancel(sql, cancel);
1185        self.current_tx = saved;
1186        result
1187    }
1188
1189    /// v6.1.1 — parse and pre-process a SQL string ONCE so the
1190    /// resulting [`Statement`] can be cached and re-executed via
1191    /// [`Engine::execute_prepared`]. Returns the same `Statement`
1192    /// the simple-query path would synthesise internally (clock
1193    /// rewrites + ORDER BY position-ref resolution applied at
1194    /// prepare time, since both are session-independent). The
1195    /// `$N` placeholders in the SQL stay as `Expr::Placeholder(n)`
1196    /// nodes; they're resolved to concrete values per-call by
1197    /// `execute_prepared`'s substitution walk.
1198    ///
1199    /// Pgwire's `Parse` (P) message lands here.
1200    pub fn prepare(&self, sql: &str) -> Result<Statement, ParseError> {
1201        let mut stmt = parser::parse_statement(sql)?;
1202        let now_micros = self.clock.map(|f| f());
1203        rewrite_clock_calls(&mut stmt, now_micros);
1204        if let Statement::Select(s) = &mut stmt {
1205            // v6.4.1 — expand `GROUP BY ALL` to every non-aggregate
1206            // SELECT-list item BEFORE position / alias resolution so
1207            // downstream passes see the explicit list.
1208            expand_group_by_all(s);
1209            resolve_order_by_position(s);
1210            // v6.2.3 — cost-based JOIN reorder. No-op for
1211            // single-table FROMs or any non-INNER join shape.
1212            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1213        }
1214        Ok(stmt)
1215    }
1216
1217    /// v6.3.0 — cached prepare. Returns a cloned `Statement` from
1218    /// the plan cache on hit, runs the full `prepare()` path on miss
1219    /// and inserts the resulting plan before returning. Skipping the
1220    /// parse + JOIN-reorder pipeline on hit is the dominant win for
1221    /// JDBC / sqlx / pgx clients that reuse the same SQL string.
1222    ///
1223    /// Returns a cloned `Statement` (not a borrow) because the
1224    /// pgwire layer owns its `PreparedStmt` map per-session and the
1225    /// engine-level cache must stay available for other sessions.
1226    /// Clone cost on a 5-table JOIN AST is well under the parse cost
1227    /// it replaces.
1228    pub fn prepare_cached(&mut self, sql: &str) -> Result<Statement, ParseError> {
1229        // v6.3.1 — version-aware lookup. If the cached plan was
1230        // prepared before the most recent ANALYZE, evict and replan.
1231        let current_version = self.statistics.version();
1232        if let Some(plan) = self.plan_cache.get(sql) {
1233            if plan.statistics_version == current_version {
1234                return Ok(plan.stmt.clone());
1235            }
1236            // Stale entry — fall through to evict + re-prepare.
1237        }
1238        self.plan_cache.evict(sql);
1239        let stmt = self.prepare(sql)?;
1240        let source_tables = plan_cache::collect_source_tables(&stmt);
1241        let plan = plan_cache::PreparedPlan {
1242            stmt: stmt.clone(),
1243            statistics_version: current_version,
1244            source_tables,
1245            describe_columns: alloc::vec::Vec::new(),
1246        };
1247        self.plan_cache.insert(String::from(sql), plan);
1248        Ok(stmt)
1249    }
1250
1251    /// v6.3.0 — read-only accessor for tests and v6.3.1 invalidation.
1252    pub fn plan_cache(&self) -> &plan_cache::PlanCache {
1253        &self.plan_cache
1254    }
1255
1256    /// v6.3.0 — mutable accessor for v6.3.1 invalidation hooks.
1257    pub fn plan_cache_mut(&mut self) -> &mut plan_cache::PlanCache {
1258        &mut self.plan_cache
1259    }
1260
1261    /// v6.3.3 — Describe a prepared `Statement` without executing.
1262    /// Returns `(parameter_oids, output_columns)`. Empty
1263    /// `output_columns` means the statement has no row-producing
1264    /// shape we could resolve here (JOIN, subquery, non-SELECT, …)
1265    /// — pgwire layer maps that to a `NoData` reply.
1266    pub fn describe_prepared(
1267        &self,
1268        stmt: &Statement,
1269    ) -> (Vec<u32>, Vec<ColumnSchema>) {
1270        describe::describe_prepared(stmt, self.active_catalog())
1271    }
1272
1273    /// v6.1.1 — execute a [`Statement`] previously returned by
1274    /// [`Engine::prepare`], substituting `Expr::Placeholder(n)`
1275    /// nodes for the corresponding [`Value`] in `params` (1-based
1276    /// per PG: `$1` → `params[0]`). Bind-time string parameters
1277    /// are decoded into typed `Value`s by the pgwire layer before
1278    /// this call so the resulting AST hits the same execution
1279    /// path as a simple query — no SQL re-parse.
1280    ///
1281    /// Pgwire's `Execute` (E) message after a `Bind` (B) lands here.
1282    pub fn execute_prepared(
1283        &mut self,
1284        mut stmt: Statement,
1285        params: &[Value],
1286    ) -> Result<QueryResult, EngineError> {
1287        substitute_placeholders(&mut stmt, params)?;
1288        self.execute_stmt_with_cancel(stmt, CancelToken::none())
1289    }
1290
1291    fn execute_inner_with_cancel(
1292        &mut self,
1293        sql: &str,
1294        cancel: CancelToken<'_>,
1295    ) -> Result<QueryResult, EngineError> {
1296        cancel.check()?;
1297        let stmt = self.prepare(sql)?;
1298        // v6.5.1 — wrap the executor with a wall-clock window so we
1299        // can record into spg_stat_query. Skip when the engine has
1300        // no clock attached (no_std embedded callers).
1301        let start_us = self.clock.map(|f| f());
1302        let result = self.execute_stmt_with_cancel(stmt, cancel);
1303        if let (Some(t0), Ok(_)) = (start_us, &result) {
1304            let now = self.clock.map_or(t0, |f| f());
1305            let elapsed = now.saturating_sub(t0).max(0) as u64;
1306            self.query_stats.record(sql, elapsed, now as u64);
1307            // v6.5.6 — slow-query log: fire callback when elapsed
1308            // exceeds the configured floor.
1309            if let (Some(threshold), Some(logger)) =
1310                (self.slow_query_threshold_us, self.slow_query_logger)
1311                && elapsed >= threshold
1312            {
1313                logger(sql, elapsed);
1314            }
1315        }
1316        result
1317    }
1318
1319    fn execute_stmt_with_cancel(
1320        &mut self,
1321        stmt: Statement,
1322        cancel: CancelToken<'_>,
1323    ) -> Result<QueryResult, EngineError> {
1324        cancel.check()?;
1325        let result = match stmt {
1326            Statement::CreateTable(s) => self.exec_create_table(s),
1327            // v7.9.15 — CREATE EXTENSION is a no-op on SPG. Returns
1328            // CommandOk with affected=0; modified_catalog=false so
1329            // the WAL doesn't grow a useless entry. mailrs F3.
1330            Statement::CreateExtension(_) => Ok(QueryResult::CommandOk {
1331                affected: 0,
1332                modified_catalog: false,
1333            }),
1334            // v7.9.27 — DO $$ ... $$ is also a no-op (SPG has no
1335            // PL/pgSQL). mailrs H1 + pg_dump compat.
1336            Statement::DoBlock => Ok(QueryResult::CommandOk {
1337                affected: 0,
1338                modified_catalog: false,
1339            }),
1340            Statement::CreateIndex(s) => self.exec_create_index(s),
1341            Statement::Insert(s) => self.exec_insert(s),
1342            Statement::Update(s) => self.exec_update_cancel(&s, cancel),
1343            Statement::Delete(s) => self.exec_delete_cancel(&s, cancel),
1344            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1345            Statement::Begin => self.exec_begin(),
1346            Statement::Commit => self.exec_commit(),
1347            Statement::Rollback => self.exec_rollback(),
1348            Statement::Savepoint(name) => self.exec_savepoint(name),
1349            Statement::RollbackToSavepoint(name) => self.exec_rollback_to_savepoint(&name),
1350            Statement::ReleaseSavepoint(name) => self.exec_release_savepoint(&name),
1351            Statement::ShowTables => Ok(self.exec_show_tables()),
1352            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1353            Statement::ShowUsers => Ok(self.exec_show_users()),
1354            Statement::ShowPublications => Ok(self.exec_show_publications()),
1355            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1356            Statement::CreateUser(s) => self.exec_create_user(&s),
1357            Statement::DropUser(name) => self.exec_drop_user(&name),
1358            Statement::Explain(e) => self.exec_explain(&e, cancel),
1359            Statement::AlterIndex(s) => self.exec_alter_index(s),
1360            Statement::AlterTable(s) => self.exec_alter_table(s),
1361            Statement::CreatePublication(s) => self.exec_create_publication(s),
1362            Statement::DropPublication(name) => self.exec_drop_publication(&name),
1363            Statement::CreateSubscription(s) => self.exec_create_subscription(s),
1364            Statement::DropSubscription(name) => self.exec_drop_subscription(&name),
1365            // v6.1.7 — WAIT FOR WAL POSITION needs `lag_state`,
1366            // which lives in spg-server's ServerState. The engine
1367            // surfaces a clear error; the server-layer dispatch
1368            // intercepts the SQL before it reaches the engine on
1369            // a server build, so this arm only fires for
1370            // engine-only callers (spg-embedded, lib tests).
1371            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1372                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1373            )),
1374            // v6.2.0 — ANALYZE recomputes per-column histograms.
1375            Statement::Analyze(target) => self.exec_analyze(target.as_deref()),
1376            // v6.7.3 — COMPACT COLD SEGMENTS.
1377            Statement::CompactColdSegments => self.exec_compact_cold_segments(),
1378        };
1379        self.enforce_row_limit(result)
1380    }
1381
1382    /// v6.1.2 — `CREATE PUBLICATION` runtime path. Duplicate names
1383    /// surface as `EngineError::Unsupported` so the existing PG-wire
1384    /// error mapping stays uniform; the message carries the name so
1385    /// operators can grep replication-log noise. Inside-transaction
1386    /// invocation is rejected (matches `CREATE USER` / `DROP USER`
1387    /// stance) — replication-catalog mutation is a connection-level
1388    /// administrative op, not a transactional one.
1389    fn exec_create_publication(
1390        &mut self,
1391        s: CreatePublicationStatement,
1392    ) -> Result<QueryResult, EngineError> {
1393        // v6.1.4 — the v6.1.2 "no DDL inside a transaction" guard
1394        // was over-cautious: it also blocked the auto-commit wrap
1395        // path (which begins an internal TX around every WAL-
1396        // logged statement). PG itself allows CREATE PUBLICATION
1397        // inside a transaction (it rolls back with the TX).
1398        self.publications
1399            .create(s.name, s.scope)
1400            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE PUBLICATION: {e:?}")))?;
1401        Ok(QueryResult::CommandOk {
1402            affected: 1,
1403            modified_catalog: true,
1404        })
1405    }
1406
1407    /// v6.1.2 — `DROP PUBLICATION` runtime path. PG-compatible silent
1408    /// no-op when the publication doesn't exist (returns `affected=0`
1409    /// in that case so the wire-level command tag distinguishes
1410    /// "dropped" from "no-op", though both succeed).
1411    fn exec_drop_publication(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1412        let removed = self.publications.drop(name);
1413        Ok(QueryResult::CommandOk {
1414            affected: usize::from(removed),
1415            modified_catalog: removed,
1416        })
1417    }
1418
1419    /// v6.1.2 — read access to the publication catalog. Used by
1420    /// the v6.1.5 publisher-side WAL filter, by `SHOW PUBLICATIONS`
1421    /// (v6.1.3+), and by e2e tests that need to assert state without
1422    /// going through the wire.
1423    pub const fn publications(&self) -> &publications::Publications {
1424        &self.publications
1425    }
1426
1427    /// v6.1.4 — `CREATE SUBSCRIPTION` runtime path. Defaults
1428    /// `enabled = true` and `last_received_pos = 0` for a freshly-
1429    /// created subscription. The actual worker thread is spawned
1430    /// by spg-server once the engine returns success.
1431    fn exec_create_subscription(
1432        &mut self,
1433        s: CreateSubscriptionStatement,
1434    ) -> Result<QueryResult, EngineError> {
1435        // See exec_create_publication — the in_transaction gate
1436        // was over-cautious; the auto-commit wrap path holds an
1437        // internal TX that this check was incorrectly blocking.
1438        let sub = subscriptions::Subscription {
1439            conn_str: s.conn_str,
1440            publications: s.publications,
1441            enabled: true,
1442            last_received_pos: 0,
1443        };
1444        self.subscriptions
1445            .create(s.name, sub)
1446            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE SUBSCRIPTION: {e:?}")))?;
1447        Ok(QueryResult::CommandOk {
1448            affected: 1,
1449            modified_catalog: true,
1450        })
1451    }
1452
1453    /// v6.1.4 — `DROP SUBSCRIPTION`. Silent no-op when the name
1454    /// doesn't exist (PG-compatible). The associated worker is
1455    /// torn down by spg-server when it observes the catalog
1456    /// change at the next snapshot or via the engine's
1457    /// subscriptions accessor (the worker polls the catalog on
1458    /// reconnect; v6.1.5's filter-side will tighten this to an
1459    /// explicit signal).
1460    fn exec_drop_subscription(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1461        let removed = self.subscriptions.drop(name);
1462        Ok(QueryResult::CommandOk {
1463            affected: usize::from(removed),
1464            modified_catalog: removed,
1465        })
1466    }
1467
1468    /// v6.1.4 — read access to the subscription catalog. Used by
1469    /// the subscription worker (read its own row to find its
1470    /// publications + last applied position), by SHOW SUBSCRIPTIONS,
1471    /// and by e2e tests asserting state directly.
1472    pub const fn subscriptions(&self) -> &subscriptions::Subscriptions {
1473        &self.subscriptions
1474    }
1475
1476    /// v6.1.4 — write access to `last_received_pos`. Worker
1477    /// calls this after each apply batch (under the engine's
1478    /// write-lock). Returns `false` when the subscription was
1479    /// dropped between when the worker received the record and
1480    /// when this call landed.
1481    pub fn subscription_advance(&mut self, name: &str, pos: u64) -> bool {
1482        self.subscriptions.update_last_received_pos(name, pos)
1483    }
1484
1485    /// v6.1.4 — `SHOW SUBSCRIPTIONS` row materialisation. Returns
1486    /// `(name, conn_str, publications, enabled, last_received_pos)`
1487    /// ordered by subscription name. The `publications` column is
1488    /// the comma-joined list ("p1, p2") for ergonomic SHOW output;
1489    /// callers wanting structured access read `Engine::subscriptions`.
1490    fn exec_show_subscriptions(&self) -> QueryResult {
1491        let columns = alloc::vec![
1492            ColumnSchema::new("name", DataType::Text, false),
1493            ColumnSchema::new("conn_str", DataType::Text, false),
1494            ColumnSchema::new("publications", DataType::Text, false),
1495            ColumnSchema::new("enabled", DataType::Bool, false),
1496            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1497        ];
1498        let rows: Vec<Row> = self
1499            .subscriptions
1500            .iter()
1501            .map(|(name, sub)| {
1502                Row::new(alloc::vec![
1503                    Value::Text(name.clone()),
1504                    Value::Text(sub.conn_str.clone()),
1505                    Value::Text(sub.publications.join(", ")),
1506                    Value::Bool(sub.enabled),
1507                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1508                ])
1509            })
1510            .collect();
1511        QueryResult::Rows { columns, rows }
1512    }
1513
1514    /// v6.2.0 — materialise `spg_statistic` rows. One row per
1515    /// `(table, column)` pair tracked in `Statistics`, with
1516    /// `histogram_bounds` rendered as a `[v0, v1, ...]` string —
1517    /// the same canonical form vector literals use for round-trip.
1518    fn exec_spg_statistic(&self) -> QueryResult {
1519        let columns = alloc::vec![
1520            ColumnSchema::new("table_name", DataType::Text, false),
1521            ColumnSchema::new("column_name", DataType::Text, false),
1522            ColumnSchema::new("null_frac", DataType::Float, false),
1523            ColumnSchema::new("n_distinct", DataType::BigInt, false),
1524            ColumnSchema::new("histogram_bounds", DataType::Text, false),
1525            // v6.7.0 — appended column (v6.2.0 stability contract
1526            // allows APPEND to spg_statistic, not reorder/rename).
1527            // Reports the cached per-table cold-row count; same
1528            // value across every column row of the same table.
1529            ColumnSchema::new("cold_row_count", DataType::BigInt, false),
1530        ];
1531        let rows: Vec<Row> = self
1532            .statistics
1533            .iter()
1534            .map(|((t, c), s)| {
1535                let cold = self
1536                    .catalog
1537                    .get(t)
1538                    .map_or(0, |table| table.cold_row_count());
1539                Row::new(alloc::vec![
1540                    Value::Text(t.clone()),
1541                    Value::Text(c.clone()),
1542                    Value::Float(f64::from(s.null_frac)),
1543                    Value::BigInt(i64::try_from(s.n_distinct).unwrap_or(i64::MAX)),
1544                    Value::Text(render_histogram_bounds(&s.histogram_bounds)),
1545                    Value::BigInt(i64::try_from(cold).unwrap_or(i64::MAX)),
1546                ])
1547            })
1548            .collect();
1549        QueryResult::Rows { columns, rows }
1550    }
1551
1552    /// v6.5.0 — materialise `spg_stat_replication` rows. One row
1553    /// per subscription with `(name, conn_str, publications,
1554    /// last_received_pos, enabled)`. Surface mirrors
1555    /// `SHOW SUBSCRIPTIONS` but follows the virtual-table dispatch
1556    /// shape so it composes with SELECT clauses (WHERE, projection
1557    /// onto specific columns, etc).
1558    fn exec_spg_stat_replication(&self) -> QueryResult {
1559        let columns = alloc::vec![
1560            ColumnSchema::new("name", DataType::Text, false),
1561            ColumnSchema::new("conn_str", DataType::Text, false),
1562            ColumnSchema::new("publications", DataType::Text, false),
1563            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1564            ColumnSchema::new("enabled", DataType::Bool, false),
1565        ];
1566        let rows: Vec<Row> = self
1567            .subscriptions
1568            .iter()
1569            .map(|(name, sub)| {
1570                Row::new(alloc::vec![
1571                    Value::Text(name.clone()),
1572                    Value::Text(sub.conn_str.clone()),
1573                    Value::Text(sub.publications.join(",")),
1574                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1575                    Value::Bool(sub.enabled),
1576                ])
1577            })
1578            .collect();
1579        QueryResult::Rows { columns, rows }
1580    }
1581
1582    /// v6.5.0 — materialise `spg_stat_segment` rows. One row per
1583    /// cold-tier segment with `(segment_id, num_rows, num_pages,
1584    /// total_bytes)`.
1585    ///
1586    /// v6.7.0 — appended `table_name` column resolves the v6.5.0
1587    /// carve-out. Walks every user table's BTree indices to find
1588    /// which table's Cold locators point at each segment. Empty
1589    /// string for orphan segments (loaded via SPG_PRELOAD_COLD_SEGMENT
1590    /// before any index registered a locator). The walk is
1591    /// O(tables × indices × keys); cached per call, not across
1592    /// calls — re-walked on every `SELECT * FROM spg_stat_segment`.
1593    fn exec_spg_stat_segment(&self) -> QueryResult {
1594        let columns = alloc::vec![
1595            ColumnSchema::new("segment_id", DataType::BigInt, false),
1596            ColumnSchema::new("table_name", DataType::Text, false),
1597            ColumnSchema::new("num_rows", DataType::BigInt, false),
1598            ColumnSchema::new("num_pages", DataType::BigInt, false),
1599            ColumnSchema::new("total_bytes", DataType::BigInt, false),
1600        ];
1601        // v6.7.0 — build a segment_id → table_name map by walking
1602        // every user table's BTree indices once. O(tables × indices
1603        // × keys) for the v6.5.0 carve-out resolution; acceptable
1604        // because spg_stat_segment is operator-facing (not on a
1605        // hot-loop path).
1606        let mut segment_owners: alloc::collections::BTreeMap<u32, String> = BTreeMap::new();
1607        for tname in self.catalog.table_names() {
1608            if is_internal_table_name(&tname) {
1609                continue;
1610            }
1611            let Some(t) = self.catalog.get(&tname) else {
1612                continue;
1613            };
1614            for idx in t.indices() {
1615                if let spg_storage::IndexKind::BTree(map) = &idx.kind {
1616                    for (_, locs) in map.iter() {
1617                        for loc in locs {
1618                            if let spg_storage::RowLocator::Cold { segment_id, .. } = loc {
1619                                segment_owners.entry(*segment_id).or_insert_with(|| tname.clone());
1620                            }
1621                        }
1622                    }
1623                }
1624            }
1625        }
1626        let rows: Vec<Row> = self
1627            .catalog
1628            .cold_segment_ids_global()
1629            .iter()
1630            .filter_map(|&id| {
1631                let seg = self.catalog.cold_segment(id)?;
1632                let meta = seg.meta();
1633                let owner = segment_owners
1634                    .get(&id)
1635                    .cloned()
1636                    .unwrap_or_default();
1637                Some(Row::new(alloc::vec![
1638                    Value::BigInt(i64::from(id)),
1639                    Value::Text(owner),
1640                    Value::BigInt(i64::try_from(meta.num_rows).unwrap_or(i64::MAX)),
1641                    Value::BigInt(i64::from(meta.num_pages)),
1642                    Value::BigInt(i64::try_from(meta.total_bytes).unwrap_or(i64::MAX)),
1643                ]))
1644            })
1645            .collect();
1646        QueryResult::Rows { columns, rows }
1647    }
1648
1649    /// v6.5.1 — materialise `spg_stat_query` rows. One row per
1650    /// distinct SQL text recorded since the engine booted, capped
1651    /// at `QUERY_STATS_MAX` (1024). Columns:
1652    ///   sql, exec_count, total_us, mean_us, max_us, last_seen_us
1653    /// mean_us = total_us / exec_count (saturating).
1654    fn exec_spg_stat_query(&self) -> QueryResult {
1655        let columns = alloc::vec![
1656            ColumnSchema::new("sql", DataType::Text, false),
1657            ColumnSchema::new("exec_count", DataType::BigInt, false),
1658            ColumnSchema::new("total_us", DataType::BigInt, false),
1659            ColumnSchema::new("mean_us", DataType::BigInt, false),
1660            ColumnSchema::new("max_us", DataType::BigInt, false),
1661            ColumnSchema::new("last_seen_us", DataType::BigInt, false),
1662        ];
1663        let rows: Vec<Row> = self
1664            .query_stats
1665            .snapshot()
1666            .into_iter()
1667            .map(|(sql, s)| {
1668                let mean = if s.exec_count == 0 {
1669                    0
1670                } else {
1671                    s.total_us / s.exec_count
1672                };
1673                Row::new(alloc::vec![
1674                    Value::Text(sql),
1675                    Value::BigInt(i64::try_from(s.exec_count).unwrap_or(i64::MAX)),
1676                    Value::BigInt(i64::try_from(s.total_us).unwrap_or(i64::MAX)),
1677                    Value::BigInt(i64::try_from(mean).unwrap_or(i64::MAX)),
1678                    Value::BigInt(i64::try_from(s.max_us).unwrap_or(i64::MAX)),
1679                    Value::BigInt(i64::try_from(s.last_seen_us).unwrap_or(i64::MAX)),
1680                ])
1681            })
1682            .collect();
1683        QueryResult::Rows { columns, rows }
1684    }
1685
1686    /// v6.5.2 — register a connection-state provider. spg-server
1687    /// calls this at startup with a function that snapshots its
1688    /// per-pgwire-connection registry. Engine reads through the
1689    /// callback on `SELECT * FROM spg_stat_activity`.
1690    #[must_use]
1691    pub const fn with_activity_provider(mut self, f: ActivityProvider) -> Self {
1692        self.activity_provider = Some(f);
1693        self
1694    }
1695
1696    /// v6.5.3 — register audit chain provider + verifier.
1697    #[must_use]
1698    pub const fn with_audit_providers(
1699        mut self,
1700        chain: AuditChainProvider,
1701        verify: AuditVerifier,
1702    ) -> Self {
1703        self.audit_chain_provider = Some(chain);
1704        self.audit_verifier = Some(verify);
1705        self
1706    }
1707
1708    /// v6.5.6 — register a slow-query log callback. `threshold_us`
1709    /// is the floor (in microseconds); only executes above the floor
1710    /// fire the callback. spg-server wires this from
1711    /// `SPG_SLOW_QUERY_THRESHOLD_MS` (default 100 ms).
1712    #[must_use]
1713    pub const fn with_slow_query_log(
1714        mut self,
1715        threshold_us: u64,
1716        logger: SlowQueryLogger,
1717    ) -> Self {
1718        self.slow_query_threshold_us = Some(threshold_us);
1719        self.slow_query_logger = Some(logger);
1720        self
1721    }
1722
1723    /// v6.5.6 — operator knob for plan cache cap. spg-server reads
1724    /// `SPG_PLAN_CACHE_MAX` env at startup; uses this to override
1725    /// the compile-time default of 256.
1726    pub fn set_plan_cache_max(&mut self, n: usize) {
1727        self.plan_cache.set_max_entries(n);
1728    }
1729
1730    /// v6.5.2 — materialise `spg_stat_activity` rows. Pulls a fresh
1731    /// snapshot from the registered `ActivityProvider`. Returns an
1732    /// empty result set when no provider is registered (the no_std
1733    /// embedded path with no pgwire layer).
1734    fn exec_spg_stat_activity(&self) -> QueryResult {
1735        let columns = alloc::vec![
1736            ColumnSchema::new("pid", DataType::Int, false),
1737            ColumnSchema::new("user", DataType::Text, false),
1738            ColumnSchema::new("started_at_us", DataType::BigInt, false),
1739            ColumnSchema::new("current_sql", DataType::Text, false),
1740            ColumnSchema::new("wait_event", DataType::Text, false),
1741            ColumnSchema::new("elapsed_us", DataType::BigInt, false),
1742            ColumnSchema::new("in_transaction", DataType::Bool, false),
1743        ];
1744        let rows: Vec<Row> = self
1745            .activity_provider
1746            .map(|f| f())
1747            .unwrap_or_default()
1748            .into_iter()
1749            .map(|r| {
1750                Row::new(alloc::vec![
1751                    Value::Int(i32::try_from(r.pid).unwrap_or(i32::MAX)),
1752                    Value::Text(r.user),
1753                    Value::BigInt(r.started_at_us),
1754                    Value::Text(r.current_sql),
1755                    Value::Text(r.wait_event),
1756                    Value::BigInt(r.elapsed_us),
1757                    Value::Bool(r.in_transaction),
1758                ])
1759            })
1760            .collect();
1761        QueryResult::Rows { columns, rows }
1762    }
1763
1764    /// v6.5.4 — materialise `spg_table_ddl` rows. One row per user
1765    /// table with `(table_name, ddl)`. Reconstructed from catalog
1766    /// state on demand.
1767    fn exec_spg_table_ddl(&self) -> QueryResult {
1768        let columns = alloc::vec![
1769            ColumnSchema::new("table_name", DataType::Text, false),
1770            ColumnSchema::new("ddl", DataType::Text, false),
1771        ];
1772        let rows: Vec<Row> = self
1773            .catalog
1774            .table_names()
1775            .into_iter()
1776            .filter(|n| !is_internal_table_name(n))
1777            .filter_map(|name| {
1778                let table = self.catalog.get(&name)?;
1779                let ddl = render_create_table(&name, &table.schema().columns);
1780                Some(Row::new(alloc::vec![
1781                    Value::Text(name),
1782                    Value::Text(ddl),
1783                ]))
1784            })
1785            .collect();
1786        QueryResult::Rows { columns, rows }
1787    }
1788
1789    /// v6.5.4 — materialise `spg_role_ddl` rows. One row per user
1790    /// with `(role_name, ddl)`. Password is redacted (matches the
1791    /// `Statement::CreateUser` Display which prints `'<redacted>'`).
1792    fn exec_spg_role_ddl(&self) -> QueryResult {
1793        let columns = alloc::vec![
1794            ColumnSchema::new("role_name", DataType::Text, false),
1795            ColumnSchema::new("ddl", DataType::Text, false),
1796        ];
1797        let rows: Vec<Row> = self
1798            .users
1799            .iter()
1800            .map(|(name, rec)| {
1801                let ddl = alloc::format!(
1802                    "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}'",
1803                    rec.role.as_str(),
1804                );
1805                Row::new(alloc::vec![Value::Text(String::from(name)), Value::Text(ddl)])
1806            })
1807            .collect();
1808        QueryResult::Rows { columns, rows }
1809    }
1810
1811    /// v6.5.4 — materialise `spg_database_ddl`: single row whose
1812    /// `ddl` column concatenates every user table's CREATE +
1813    /// every role's CREATE in deterministic catalog order. Suitable
1814    /// for piping back through `Engine::execute` to recreate a
1815    /// schema-equivalent database.
1816    fn exec_spg_database_ddl(&self) -> QueryResult {
1817        let columns = alloc::vec![ColumnSchema::new("ddl", DataType::Text, false)];
1818        let mut out = String::new();
1819        for (name, rec) in self.users.iter() {
1820            out.push_str(&alloc::format!(
1821                "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}';\n",
1822                rec.role.as_str(),
1823            ));
1824        }
1825        for name in self.catalog.table_names() {
1826            if is_internal_table_name(&name) {
1827                continue;
1828            }
1829            if let Some(table) = self.catalog.get(&name) {
1830                out.push_str(&render_create_table(&name, &table.schema().columns));
1831                out.push_str(";\n");
1832            }
1833        }
1834        QueryResult::Rows {
1835            columns,
1836            rows: alloc::vec![Row::new(alloc::vec![Value::Text(out)])],
1837        }
1838    }
1839
1840    /// v6.5.3 — materialise `spg_audit_chain` rows. Pulls a fresh
1841    /// snapshot from the registered provider; empty when no
1842    /// provider is set.
1843    fn exec_spg_audit_chain(&self) -> QueryResult {
1844        let columns = alloc::vec![
1845            ColumnSchema::new("seq", DataType::BigInt, false),
1846            ColumnSchema::new("ts_ms", DataType::BigInt, false),
1847            ColumnSchema::new("prev_hash", DataType::Text, false),
1848            ColumnSchema::new("entry_hash", DataType::Text, false),
1849            ColumnSchema::new("sql", DataType::Text, false),
1850        ];
1851        let rows: Vec<Row> = self
1852            .audit_chain_provider
1853            .map(|f| f())
1854            .unwrap_or_default()
1855            .into_iter()
1856            .map(|r| {
1857                Row::new(alloc::vec![
1858                    Value::BigInt(r.seq),
1859                    Value::BigInt(r.ts_ms),
1860                    Value::Text(r.prev_hash_hex),
1861                    Value::Text(r.entry_hash_hex),
1862                    Value::Text(r.sql),
1863                ])
1864            })
1865            .collect();
1866        QueryResult::Rows { columns, rows }
1867    }
1868
1869    /// v6.5.3 — materialise `spg_audit_verify` single-row result.
1870    /// `(verified_count, broken_at_seq)` — broken_at_seq is `-1`
1871    /// on a clean chain. Returns one row with both values 0 when
1872    /// no verifier is registered (no-data fallback for embedded
1873    /// callers).
1874    fn exec_spg_audit_verify(&self) -> QueryResult {
1875        let columns = alloc::vec![
1876            ColumnSchema::new("verified_count", DataType::BigInt, false),
1877            ColumnSchema::new("broken_at_seq", DataType::BigInt, false),
1878        ];
1879        let (verified, broken) = self.audit_verifier.map(|f| f()).unwrap_or((0, -1));
1880        let row = Row::new(alloc::vec![
1881            Value::BigInt(verified),
1882            Value::BigInt(broken),
1883        ]);
1884        QueryResult::Rows {
1885            columns,
1886            rows: alloc::vec![row],
1887        }
1888    }
1889
1890    /// v6.5.1 — read-only accessor for tests + v6.5.6 ops resets.
1891    pub fn query_stats(&self) -> &query_stats::QueryStats {
1892        &self.query_stats
1893    }
1894
1895    /// v6.5.1 — mutable accessor (clear, etc).
1896    pub fn query_stats_mut(&mut self) -> &mut query_stats::QueryStats {
1897        &mut self.query_stats
1898    }
1899
1900    /// v6.2.0 — read access to the per-column statistics table.
1901    /// Used by the planner (v6.2.2 selectivity functions read this),
1902    /// by `SELECT * FROM spg_statistic`, and by e2e tests.
1903    pub const fn statistics(&self) -> &statistics::Statistics {
1904        &self.statistics
1905    }
1906
1907    /// v6.2.1 — return tables whose modified-row count crossed the
1908    /// auto-analyze threshold since the last ANALYZE on that table.
1909    /// The threshold is `0.1 × max(row_count, MIN_ROWS_FOR_AUTO_
1910    /// ANALYZE)` — combines PG-style fractional + absolute lower
1911    /// bound so a fresh / tiny table doesn't get hammered on every
1912    /// INSERT.
1913    ///
1914    /// Designed to be cheap: walks every user table's
1915    /// `Catalog::table_names()` + reads `statistics::modified_
1916    /// since_last_analyze()` (BTreeMap lookup). The background
1917    /// worker calls this under `engine.read()` then drops the lock
1918    /// before re-acquiring `engine.write()` for the actual ANALYZE.
1919    pub fn tables_needing_analyze(&self) -> Vec<String> {
1920        const MIN_ROWS: u64 = 100;
1921        let mut out = Vec::new();
1922        for name in self.catalog.table_names() {
1923            if is_internal_table_name(&name) {
1924                continue;
1925            }
1926            let Some(table) = self.catalog.get(&name) else {
1927                continue;
1928            };
1929            let row_count = table.rows().len() as u64;
1930            let modified = self.statistics.modified_since_last_analyze(&name);
1931            // Threshold: ceil(0.1 × max(row_count, MIN_ROWS)),
1932            // computed in integer arithmetic so spg-engine stays
1933            // no_std without pulling in libm. `(n + 9) / 10` is
1934            // `ceil(n / 10)` for non-negative `n`.
1935            let base = row_count.max(MIN_ROWS);
1936            let threshold = base.saturating_add(9) / 10;
1937            if modified >= threshold {
1938                out.push(name);
1939            }
1940        }
1941        out
1942    }
1943
1944    /// v6.2.0 — `ANALYZE [<table>]` runtime. Bare `ANALYZE` walks
1945    /// every user table; `ANALYZE <name>` re-stats one. For each
1946    /// target table, single-pass scan + per-column histogram +
1947    /// `null_frac` + `n_distinct`. Replaces the table's prior
1948    /// stats; resets the modified-row counter.
1949    ///
1950    /// v6.2.0 doesn't sample — it scans the full table. v6.2.x
1951    /// can add reservoir sampling at the > 100 K-row mark; not a
1952    /// scope blocker for the current commit since rows ≤ 100 K
1953    /// analyse in milliseconds.
1954    fn exec_analyze(&mut self, target: Option<&str>) -> Result<QueryResult, EngineError> {
1955        let names: Vec<String> = if let Some(name) = target {
1956            // Verify the table exists; surface a clear error if not.
1957            if self.catalog.get(name).is_none() {
1958                return Err(EngineError::Storage(StorageError::TableNotFound {
1959                    name: name.to_string(),
1960                }));
1961            }
1962            alloc::vec![name.to_string()]
1963        } else {
1964            self.catalog
1965                .table_names()
1966                .into_iter()
1967                .filter(|n| !is_internal_table_name(n))
1968                .collect()
1969        };
1970        let mut analysed = 0usize;
1971        for table_name in &names {
1972            self.analyze_one_table(table_name)?;
1973            analysed += 1;
1974        }
1975        // v6.3.1 — plan cache invalidation. Bump stats version so
1976        // future lookups see the new generation, and selectively
1977        // evict every plan whose `source_tables` overlap with the
1978        // ANALYZE target set. Bare ANALYZE (all tables) clears the
1979        // whole cache.
1980        if analysed > 0 {
1981            self.statistics.bump_version();
1982            if target.is_some() {
1983                for t in &names {
1984                    self.plan_cache.evict_referencing(t);
1985                }
1986            } else {
1987                self.plan_cache.clear();
1988            }
1989        }
1990        Ok(QueryResult::CommandOk {
1991            affected: analysed,
1992            modified_catalog: true,
1993        })
1994    }
1995
1996    /// v6.7.3 — `COMPACT COLD SEGMENTS` runtime path. Drives the
1997    /// engine-layer compaction shim with the default
1998    /// 4 MiB segment-size threshold. spg-server intercepts the
1999    /// SQL before it reaches the engine on a server build —
2000    /// it reads `SPG_COMPACTION_TARGET_SEGMENT_BYTES`, calls
2001    /// `Engine::compact_cold_segments_with_target` directly with
2002    /// the env value, and persists every merged segment to
2003    /// `<db>.spg/segments/`. This arm only fires for engine-only
2004    /// callers (spg-embedded, lib tests); in that mode merged
2005    /// segments live in memory and are dropped at process exit.
2006    fn exec_compact_cold_segments(&mut self) -> Result<QueryResult, EngineError> {
2007        let target = COMPACTION_TARGET_DEFAULT_BYTES;
2008        let reports = self.compact_cold_segments_with_target(target)?;
2009        let columns = alloc::vec![
2010            ColumnSchema::new("table_name", DataType::Text, false),
2011            ColumnSchema::new("index_name", DataType::Text, false),
2012            ColumnSchema::new("sources_merged", DataType::BigInt, false),
2013            ColumnSchema::new("merged_segment_id", DataType::BigInt, false),
2014            ColumnSchema::new("merged_rows", DataType::BigInt, false),
2015            ColumnSchema::new("deleted_rows_pruned", DataType::BigInt, false),
2016            ColumnSchema::new("bytes_reclaimed_estimate", DataType::BigInt, false),
2017        ];
2018        let rows: Vec<Row> = reports
2019            .into_iter()
2020            .map(|(tname, iname, report)| {
2021                Row::new(alloc::vec![
2022                    Value::Text(tname),
2023                    Value::Text(iname),
2024                    Value::BigInt(i64::try_from(report.sources.len()).unwrap_or(i64::MAX)),
2025                    Value::BigInt(i64::from(report.merged_segment_id.unwrap_or(0))),
2026                    Value::BigInt(i64::try_from(report.merged_rows).unwrap_or(i64::MAX)),
2027                    Value::BigInt(
2028                        i64::try_from(report.deleted_rows_pruned).unwrap_or(i64::MAX),
2029                    ),
2030                    Value::BigInt(
2031                        i64::try_from(report.bytes_reclaimed_estimate).unwrap_or(i64::MAX),
2032                    ),
2033                ])
2034            })
2035            .collect();
2036        Ok(QueryResult::Rows { columns, rows })
2037    }
2038
2039    /// Walk a single table's rows once and (re-)populate per-column
2040    /// stats. Drops the existing stats for `table` first so columns
2041    /// that have been DROP-ed between ANALYZEs don't leave stale
2042    /// rows.
2043    fn analyze_one_table(&mut self, table_name: &str) -> Result<(), EngineError> {
2044        let table = self.catalog.get(table_name).ok_or_else(|| {
2045            EngineError::Storage(StorageError::TableNotFound {
2046                name: table_name.to_string(),
2047            })
2048        })?;
2049        let schema = table.schema().clone();
2050        let row_count = table.rows().len();
2051        // For each column, collect (sorted) non-NULL textual values
2052        // + count NULLs; then ask `statistics::build_histogram` to
2053        // produce the 101 bounds and `estimate_n_distinct` the
2054        // distinct count.
2055        self.statistics.clear_table(table_name);
2056        for (col_pos, col_schema) in schema.columns.iter().enumerate() {
2057            // v6.2.0 skip: vector columns have their own stats
2058            // shape (HNSW graph topology). v6.2 deliberation #1.
2059            if matches!(col_schema.ty, DataType::Vector { .. }) {
2060                continue;
2061            }
2062            let mut non_null_values: Vec<Value> = Vec::with_capacity(row_count);
2063            let mut nulls: u64 = 0;
2064            for row in table.rows() {
2065                match row.values.get(col_pos) {
2066                    Some(Value::Null) | None => nulls += 1,
2067                    Some(v) => non_null_values.push(v.clone()),
2068                }
2069            }
2070            // Sort by type-aware ordering (Int as int, Text as
2071            // lex, etc.) so histogram bounds reflect the column's
2072            // natural order — not lexicographic on the string
2073            // representation, which would put "9" after "49".
2074            non_null_values.sort_by(|a, b| sort_values_for_histogram(a, b));
2075            let non_null: Vec<String> = non_null_values
2076                .iter()
2077                .map(canonical_value_repr)
2078                .collect();
2079            let null_frac = if row_count == 0 {
2080                0.0
2081            } else {
2082                #[allow(clippy::cast_precision_loss)]
2083                let f = nulls as f32 / row_count as f32;
2084                f
2085            };
2086            let n_distinct = statistics::estimate_n_distinct(&non_null);
2087            let histogram_bounds = statistics::build_histogram(&non_null);
2088            self.statistics.set(
2089                table_name.to_string(),
2090                col_schema.name.clone(),
2091                statistics::ColumnStats {
2092                    null_frac,
2093                    n_distinct,
2094                    histogram_bounds,
2095                },
2096            );
2097        }
2098        self.statistics.reset_modified(table_name);
2099        // v6.7.0 — refresh the per-table cold_rows cache. Walk the
2100        // BTree indices and count Cold locators (MAX across
2101        // indices); store the result on the table. Surfaced via
2102        // `spg_statistic.cold_row_count` (new column) and
2103        // `spg_stat_segment.table_name` (new column).
2104        let cold_count = {
2105            let table = self
2106                .active_catalog()
2107                .get(table_name)
2108                .expect("table still present");
2109            table.count_cold_locators()
2110        };
2111        let table_mut = self
2112            .active_catalog_mut()
2113            .get_mut(table_name)
2114            .expect("table still present");
2115        table_mut.set_cold_row_count(cold_count);
2116        Ok(())
2117    }
2118
2119    /// v6.1.3 — `SHOW PUBLICATIONS` row materialisation. Returns
2120    /// `(name, scope, table_count)` ordered by publication name.
2121    ///   - `scope` is the human-readable string:
2122    ///       `"FOR ALL TABLES"` /
2123    ///       `"FOR TABLE t1, t2"` /
2124    ///       `"FOR ALL TABLES EXCEPT t1, t2"`.
2125    ///   - `table_count` is NULL for `AllTables`, the list length
2126    ///     otherwise. NULLability lets clients distinguish "publish
2127    ///     everything" from "publish exactly 0 tables" (the v6.1.3
2128    ///     parser forbids the empty list, but the column shape is
2129    ///     ready for the v6.1.5 publisher-side semantics).
2130    fn exec_show_publications(&self) -> QueryResult {
2131        let columns = alloc::vec![
2132            ColumnSchema::new("name", DataType::Text, false),
2133            ColumnSchema::new("scope", DataType::Text, false),
2134            ColumnSchema::new("table_count", DataType::Int, true),
2135        ];
2136        let rows: Vec<Row> = self
2137            .publications
2138            .iter()
2139            .map(|(name, scope)| {
2140                let (scope_str, count_val) = match scope {
2141                    spg_sql::ast::PublicationScope::AllTables => {
2142                        ("FOR ALL TABLES".to_string(), Value::Null)
2143                    }
2144                    spg_sql::ast::PublicationScope::ForTables(ts) => (
2145                        alloc::format!("FOR TABLE {}", ts.join(", ")),
2146                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2147                    ),
2148                    spg_sql::ast::PublicationScope::AllTablesExcept(ts) => (
2149                        alloc::format!("FOR ALL TABLES EXCEPT {}", ts.join(", ")),
2150                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2151                    ),
2152                };
2153                Row::new(alloc::vec![
2154                    Value::Text(name.clone()),
2155                    Value::Text(scope_str),
2156                    count_val,
2157                ])
2158            })
2159            .collect();
2160        QueryResult::Rows { columns, rows }
2161    }
2162
2163    /// v4.1 `SHOW USERS` — `(name, role)` per row, ordered by name.
2164    fn exec_show_users(&self) -> QueryResult {
2165        let columns = alloc::vec![
2166            ColumnSchema::new("name", DataType::Text, false),
2167            ColumnSchema::new("role", DataType::Text, false),
2168        ];
2169        let rows: Vec<Row> = self
2170            .users
2171            .iter()
2172            .map(|(name, rec)| {
2173                Row::new(alloc::vec![
2174                    Value::Text(name.to_string()),
2175                    Value::Text(rec.role.as_str().to_string()),
2176                ])
2177            })
2178            .collect();
2179        QueryResult::Rows { columns, rows }
2180    }
2181
2182    fn exec_create_user(&mut self, s: &CreateUserStatement) -> Result<QueryResult, EngineError> {
2183        if self.in_transaction() {
2184            return Err(EngineError::Unsupported(
2185                "CREATE USER is not allowed inside a transaction".into(),
2186            ));
2187        }
2188        let role = users::Role::parse(&s.role).ok_or_else(|| {
2189            EngineError::Unsupported(alloc::format!("invalid role: {:?}", s.role))
2190        })?;
2191        // Prefer the host-injected RNG. Falls back to a deterministic
2192        // salt derived from the username only when no RNG is wired —
2193        // acceptable for tests; the server always installs one.
2194        let salt = self.salt_fn.map_or_else(
2195            || {
2196                let mut s_bytes = [0u8; 16];
2197                let digest = spg_crypto::hash(s.name.as_bytes());
2198                s_bytes.copy_from_slice(&digest[..16]);
2199                s_bytes
2200            },
2201            |f| f(),
2202        );
2203        self.users
2204            .create(&s.name, &s.password, role, salt)
2205            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE USER: {e}")))?;
2206        Ok(QueryResult::CommandOk {
2207            affected: 1,
2208            modified_catalog: true,
2209        })
2210    }
2211
2212    fn exec_drop_user(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2213        if self.in_transaction() {
2214            return Err(EngineError::Unsupported(
2215                "DROP USER is not allowed inside a transaction".into(),
2216            ));
2217        }
2218        self.users
2219            .drop(name)
2220            .map_err(|e| EngineError::Unsupported(alloc::format!("DROP USER: {e}")))?;
2221        Ok(QueryResult::CommandOk {
2222            affected: 1,
2223            modified_catalog: true,
2224        })
2225    }
2226
2227    /// v4.4 `UPDATE <table> SET col = expr [, ...] [WHERE cond]`.
2228    /// Filter pass uses the same WHERE eval as `exec_select`. Per
2229    /// matched row, evaluate each RHS expression against the *old*
2230    /// row, then call `Table::update_row` which rebuilds indices.
2231    /// Indexed columns are correctly reflected because rebuild
2232    /// happens after the cell rewrite.
2233    fn exec_update_cancel(
2234        &mut self,
2235        stmt: &spg_sql::ast::UpdateStatement,
2236        cancel: CancelToken<'_>,
2237    ) -> Result<QueryResult, EngineError> {
2238        // v5.2.3: if the WHERE is a PK equality and matches a cold-
2239        // tier row, promote it back to the hot tier *before* the
2240        // hot-row walk. The promote pushes the row to the end of
2241        // `table.rows`, where the upcoming SET-evaluation loop will
2242        // pick it up and apply the assignments. Lookups for the key
2243        // never observe a gap because `promote_cold_row` inserts the
2244        // hot row before retiring the cold locator.
2245        if let Some(w) = &stmt.where_ {
2246            let schema_cols = self
2247                .active_catalog()
2248                .get(&stmt.table)
2249                .ok_or_else(|| {
2250                    EngineError::Storage(StorageError::TableNotFound {
2251                        name: stmt.table.clone(),
2252                    })
2253                })?
2254                .schema()
2255                .columns
2256                .clone();
2257            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
2258                && let Some(idx_name) = self
2259                    .active_catalog()
2260                    .get(&stmt.table)
2261                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
2262            {
2263                // Promote may be a no-op (key is hot-only or absent);
2264                // we don't care about the return value here — the
2265                // subsequent hot walk will either match or not.
2266                let _ = self
2267                    .active_catalog_mut()
2268                    .promote_cold_row(&stmt.table, &idx_name, &key);
2269            }
2270        }
2271
2272        let table = self
2273            .active_catalog_mut()
2274            .get_mut(&stmt.table)
2275            .ok_or_else(|| {
2276                EngineError::Storage(StorageError::TableNotFound {
2277                    name: stmt.table.clone(),
2278                })
2279            })?;
2280        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
2281        // Resolve each SET target to a column position once, validate
2282        // up front so a typo'd column doesn't leave a partial mutation
2283        // behind.
2284        let mut targets: Vec<(usize, &Expr)> = Vec::with_capacity(stmt.assignments.len());
2285        for (col, expr) in &stmt.assignments {
2286            let pos = schema_cols
2287                .iter()
2288                .position(|c| c.name == *col)
2289                .ok_or_else(|| {
2290                    EngineError::Eval(EvalError::ColumnNotFound { name: col.clone() })
2291                })?;
2292            targets.push((pos, expr));
2293        }
2294        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
2295        // Walk every row, evaluate WHERE then SET expressions. We
2296        // gather (position, new_values) tuples first and apply them
2297        // afterwards so the WHERE/RHS evaluation reads the original
2298        // row state — matches PG semantics (UPDATE doesn't see its
2299        // own writes).
2300        let mut planned: Vec<(usize, Vec<Value>)> = Vec::new();
2301        for (i, row) in table.rows().iter().enumerate() {
2302            // v4.5: cooperative cancel checkpoint every 256 rows so
2303            // a runaway UPDATE without WHERE doesn't drag past the
2304            // server's query-timeout watchdog.
2305            if i.is_multiple_of(256) {
2306                cancel.check()?;
2307            }
2308            if let Some(w) = &stmt.where_ {
2309                let cond = eval::eval_expr(w, row, &ctx)?;
2310                if !matches!(cond, Value::Bool(true)) {
2311                    continue;
2312                }
2313            }
2314            let mut new_vals = row.values.clone();
2315            for (pos, expr) in &targets {
2316                let v = eval::eval_expr(expr, row, &ctx)?;
2317                new_vals[*pos] =
2318                    coerce_value(v, schema_cols[*pos].ty, &schema_cols[*pos].name, *pos)?;
2319            }
2320            planned.push((i, new_vals));
2321        }
2322        // v7.6.6 — capture pre-update row values for the FK
2323        // enforcement passes below. `planned` carries new values
2324        // only; pair them with the old row.
2325        let plan_with_old: Vec<(usize, Vec<Value>, Vec<Value>)> = planned
2326            .iter()
2327            .map(|(pos, new_vals)| (*pos, table.rows()[*pos].values.clone(), new_vals.clone()))
2328            .collect();
2329        let self_fks = table.schema().foreign_keys.clone();
2330        let affected = planned.len();
2331        // Release mutable borrow on `table` for the FK passes.
2332        let _ = table;
2333        // v7.6.6 — Stage 2a: outbound FK check. For every row whose
2334        // local FK columns changed, the new value must exist in the
2335        // parent.
2336        if !self_fks.is_empty() {
2337            let new_rows: Vec<Vec<Value>> = planned
2338                .iter()
2339                .map(|(_pos, new_vals)| new_vals.clone())
2340                .collect();
2341            enforce_fk_inserts(self.active_catalog(), &stmt.table, &self_fks, &new_rows)?;
2342        }
2343        // v7.6.6 — Stage 2b: inbound FK check. For every row that
2344        // changed value in a column that *some other table* uses as
2345        // a FK parent column, react per `on_update` action.
2346        let child_plan = plan_fk_parent_updates(self.active_catalog(), &stmt.table, &plan_with_old)?;
2347        // Stage 3a — apply each child-side action.
2348        for step in &child_plan {
2349            apply_fk_child_step(self.active_catalog_mut(), step)?;
2350        }
2351        // Stage 3b — apply the original UPDATE.
2352        let table = self
2353            .active_catalog_mut()
2354            .get_mut(&stmt.table)
2355            .ok_or_else(|| {
2356                EngineError::Storage(StorageError::TableNotFound {
2357                    name: stmt.table.clone(),
2358                })
2359            })?;
2360        // v7.9.4 — snapshot post-update values for RETURNING.
2361        let updated_for_returning: Vec<Vec<Value>> =
2362            if stmt.returning.is_some() {
2363                planned.iter().map(|(_pos, vals)| vals.clone()).collect()
2364            } else {
2365                Vec::new()
2366            };
2367        for (pos, vals) in planned {
2368            table.update_row(pos, vals)?;
2369        }
2370        let _ = table;
2371        // v6.2.1 — auto-analyze modified-row tracking for UPDATE.
2372        if !self.in_transaction() && affected > 0 {
2373            self.statistics
2374                .record_modifications(&stmt.table, affected as u64);
2375        }
2376        // v7.9.4 — RETURNING projection.
2377        if let Some(items) = &stmt.returning {
2378            return self.build_returning_rows(
2379                &stmt.table,
2380                items,
2381                updated_for_returning,
2382            );
2383        }
2384        Ok(QueryResult::CommandOk {
2385            affected,
2386            modified_catalog: !self.in_transaction(),
2387        })
2388    }
2389
2390    /// v4.4 `DELETE FROM <table> [WHERE cond]`. Collects matching
2391    /// positions then delegates to `Table::delete_rows` (single index
2392    /// rebuild for the batch).
2393    fn exec_delete_cancel(
2394        &mut self,
2395        stmt: &spg_sql::ast::DeleteStatement,
2396        cancel: CancelToken<'_>,
2397    ) -> Result<QueryResult, EngineError> {
2398        // v5.2.3: PK-targeted DELETE → first retire any cold-tier
2399        // locator for the key. The cold row body stays in the
2400        // segment (becoming shadowed garbage that a future
2401        // compaction pass reclaims) but the index no longer
2402        // resolves it. The shadow count contributes to the
2403        // affected total; the subsequent hot walk handles any hot
2404        // rows for the same key.
2405        let mut cold_shadow_count: usize = 0;
2406        if let Some(w) = &stmt.where_ {
2407            let schema_cols = self
2408                .active_catalog()
2409                .get(&stmt.table)
2410                .ok_or_else(|| {
2411                    EngineError::Storage(StorageError::TableNotFound {
2412                        name: stmt.table.clone(),
2413                    })
2414                })?
2415                .schema()
2416                .columns
2417                .clone();
2418            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
2419                && let Some(idx_name) = self
2420                    .active_catalog()
2421                    .get(&stmt.table)
2422                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
2423            {
2424                cold_shadow_count = self
2425                    .active_catalog_mut()
2426                    .shadow_cold_row(&stmt.table, &idx_name, &key)
2427                    .unwrap_or(0);
2428            }
2429        }
2430
2431        let table = self
2432            .active_catalog_mut()
2433            .get_mut(&stmt.table)
2434            .ok_or_else(|| {
2435                EngineError::Storage(StorageError::TableNotFound {
2436                    name: stmt.table.clone(),
2437                })
2438            })?;
2439        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
2440        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
2441        let mut positions: Vec<usize> = Vec::new();
2442        // v7.6.3 — collect every to-delete row's full Value tuple
2443        // alongside its position, so the FK enforcement pass can
2444        // run after the mut borrow drops.
2445        let mut to_delete_rows: Vec<Vec<Value>> = Vec::new();
2446        for (i, row) in table.rows().iter().enumerate() {
2447            if i.is_multiple_of(256) {
2448                cancel.check()?;
2449            }
2450            let keep = if let Some(w) = &stmt.where_ {
2451                let cond = eval::eval_expr(w, row, &ctx)?;
2452                !matches!(cond, Value::Bool(true))
2453            } else {
2454                false
2455            };
2456            if !keep {
2457                positions.push(i);
2458                to_delete_rows.push(row.values.clone());
2459            }
2460        }
2461        // v7.6.3 / v7.6.4 — Stage 2: FK enforcement on the immutable
2462        // catalog. Release the mut borrow and run reverse-scan
2463        // against every child table whose FK targets this table.
2464        // RESTRICT / NoAction raise an error; CASCADE returns a
2465        // cascade plan that stage 3 applies after the primary delete.
2466        // SET NULL / SET DEFAULT remain Unsupported until v7.6.5.
2467        let _ = table;
2468        let cascade_plan = plan_fk_parent_deletions(
2469            self.active_catalog(),
2470            &stmt.table,
2471            &positions,
2472            &to_delete_rows,
2473        )?;
2474        // Stage 3a — apply each FK child step (SET NULL / SET
2475        // DEFAULT / CASCADE delete) before deleting the parent.
2476        // The plan is already ordered: nulls/defaults first, then
2477        // cascade deletes (so a row mutated and later deleted
2478        // surfaces as deleted — though v7.6.5 doesn't produce
2479        // that overlap today).
2480        for step in &cascade_plan {
2481            apply_fk_child_step(self.active_catalog_mut(), step)?;
2482        }
2483        // Stage 3b — actually delete the original target rows.
2484        let table = self
2485            .active_catalog_mut()
2486            .get_mut(&stmt.table)
2487            .ok_or_else(|| {
2488                EngineError::Storage(StorageError::TableNotFound {
2489                    name: stmt.table.clone(),
2490                })
2491            })?;
2492        let affected = table.delete_rows(&positions) + cold_shadow_count;
2493        let _ = table;
2494        // v6.2.1 — auto-analyze modified-row tracking for DELETE.
2495        if !self.in_transaction() && affected > 0 {
2496            self.statistics
2497                .record_modifications(&stmt.table, affected as u64);
2498        }
2499        // v7.9.4 — RETURNING projection over the soon-to-be-gone
2500        // rows. `to_delete_rows` was snapshotted in stage 1 before
2501        // mutation, so the projection sees the pre-delete state
2502        // (matches PG semantics: DELETE RETURNING returns the row
2503        // as it was just before removal).
2504        if let Some(items) = &stmt.returning {
2505            return self.build_returning_rows(
2506                &stmt.table,
2507                items,
2508                to_delete_rows,
2509            );
2510        }
2511        Ok(QueryResult::CommandOk {
2512            affected,
2513            modified_catalog: !self.in_transaction(),
2514        })
2515    }
2516
2517    /// `SHOW TABLES` — one row per table in the active catalog.
2518    /// Column name is `name` so result-set consumers can downstream
2519    /// `SELECT name FROM ...` style logic if needed.
2520    /// v4.26: `EXPLAIN [ANALYZE] <select>`. Returns a single-column
2521    /// `QUERY PLAN` text table — first line names the top operator
2522    /// (Scan / Aggregate / Window / etc.), indented children list
2523    /// FROM joins, WHERE filters, ORDER BY / LIMIT, projection
2524    /// shape, and any active index hits. `ANALYZE` execs the inner
2525    /// SELECT and appends actual-row + elapsed-micros annotations.
2526    #[allow(clippy::format_push_string)]
2527    fn exec_explain(
2528        &self,
2529        e: &spg_sql::ast::ExplainStatement,
2530        cancel: CancelToken<'_>,
2531    ) -> Result<QueryResult, EngineError> {
2532        let mut lines = Vec::<String>::new();
2533        explain_select(&e.inner, self, 0, &mut lines);
2534        if e.suggest {
2535            // v6.8.3 — index advisor. Walks the SELECT's FROM
2536            // tables + WHERE column refs; for each (table, column)
2537            // pair that lacks an index, append a SUGGEST line with
2538            // a copy-pastable `CREATE INDEX` statement. This is a
2539            // pure-syntax heuristic — no cardinality estimation —
2540            // matching the v6.8.3 design intent of "tell the
2541            // operator where indexes are missing", not "give the
2542            // mathematically optimal index set".
2543            let suggestions = build_index_suggestions(&e.inner, self);
2544            for s in suggestions {
2545                lines.push(s);
2546            }
2547        } else if e.analyze {
2548            // v6.2.4 — EXPLAIN ANALYZE annotates each operator line
2549            // with `(rows=N)` where the row count is computable
2550            // without re-executing the full query:
2551            //   - Top-level operator (first non-indented line):
2552            //     rows = final result.len()
2553            //   - "From: <table> [full scan]" lines: rows =
2554            //     table.rows().len() (catalog read; no execution)
2555            //   - "From: <table> [index seek]": indeterminate —
2556            //     the index step would need re-execution; v6.2.5
2557            //     adds per-operator wall-clock + hot/cold rows
2558            //     instrumentation that makes this concrete.
2559            //   - Everything else: marked `(—)` so the surface
2560            //     stays well-defined without silently dropping
2561            //     stats. v6.2.5 fills in via inline executor
2562            //     instrumentation.
2563            // Total elapsed lands on a trailing `Total: …` line.
2564            let started = self.clock.map(|f| f());
2565            let exec = self.exec_select_cancel(&e.inner, cancel)?;
2566            let elapsed_micros = match (self.clock, started) {
2567                (Some(f), Some(s)) => Some(f().saturating_sub(s)),
2568                _ => None,
2569            };
2570            let row_count = if let QueryResult::Rows { rows, .. } = &exec {
2571                rows.len()
2572            } else {
2573                0
2574            };
2575            annotate_explain_lines(&mut lines, row_count, self);
2576            let mut total = alloc::format!("Total: rows={row_count}");
2577            if let Some(us) = elapsed_micros {
2578                total.push_str(&alloc::format!(" elapsed={us}us"));
2579            }
2580            lines.push(total);
2581        }
2582        let columns = alloc::vec![ColumnSchema::new("QUERY PLAN", DataType::Text, false)];
2583        let rows: Vec<Row> = lines
2584            .into_iter()
2585            .map(|l| Row::new(alloc::vec![Value::Text(l)]))
2586            .collect();
2587        Ok(QueryResult::Rows { columns, rows })
2588    }
2589
2590    fn exec_show_tables(&self) -> QueryResult {
2591        let columns = alloc::vec![ColumnSchema::new("name", DataType::Text, false)];
2592        let rows: Vec<Row> = self
2593            .active_catalog()
2594            .table_names()
2595            .into_iter()
2596            .map(|n| Row::new(alloc::vec![Value::Text(n)]))
2597            .collect();
2598        QueryResult::Rows { columns, rows }
2599    }
2600
2601    /// `SHOW COLUMNS FROM <table>` — one row per column with the
2602    /// declared name, SQL type rendering, and nullability flag.
2603    fn exec_show_columns(&self, table_name: &str) -> Result<QueryResult, EngineError> {
2604        let table =
2605            self.active_catalog()
2606                .get(table_name)
2607                .ok_or_else(|| StorageError::TableNotFound {
2608                    name: table_name.into(),
2609                })?;
2610        let columns = alloc::vec![
2611            ColumnSchema::new("name", DataType::Text, false),
2612            ColumnSchema::new("type", DataType::Text, false),
2613            ColumnSchema::new("nullable", DataType::Bool, false),
2614        ];
2615        let rows: Vec<Row> = table
2616            .schema()
2617            .columns
2618            .iter()
2619            .map(|c| {
2620                Row::new(alloc::vec![
2621                    Value::Text(c.name.clone()),
2622                    Value::Text(alloc::format!("{}", c.ty)),
2623                    Value::Bool(c.nullable),
2624                ])
2625            })
2626            .collect();
2627        Ok(QueryResult::Rows { columns, rows })
2628    }
2629
2630    fn exec_begin(&mut self) -> Result<QueryResult, EngineError> {
2631        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2632        if self.tx_catalogs.contains_key(&tx_id) {
2633            return Err(EngineError::TransactionAlreadyOpen);
2634        }
2635        self.tx_catalogs.insert(
2636            tx_id,
2637            TxState {
2638                catalog: self.catalog.clone(),
2639                savepoints: Vec::new(),
2640            },
2641        );
2642        Ok(QueryResult::CommandOk {
2643            affected: 0,
2644            modified_catalog: false,
2645        })
2646    }
2647
2648    fn exec_commit(&mut self) -> Result<QueryResult, EngineError> {
2649        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2650        let state = self
2651            .tx_catalogs
2652            .remove(&tx_id)
2653            .ok_or(EngineError::NoActiveTransaction)?;
2654        self.catalog = state.catalog;
2655        // All savepoints become permanent at COMMIT and the stack
2656        // resets for the next TX (`state.savepoints` is discarded with
2657        // `state`).
2658        Ok(QueryResult::CommandOk {
2659            affected: 0,
2660            modified_catalog: true,
2661        })
2662    }
2663
2664    fn exec_rollback(&mut self) -> Result<QueryResult, EngineError> {
2665        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2666        if self.tx_catalogs.remove(&tx_id).is_none() {
2667            return Err(EngineError::NoActiveTransaction);
2668        }
2669        // savepoints discarded with the TxState
2670        Ok(QueryResult::CommandOk {
2671            affected: 0,
2672            modified_catalog: false,
2673        })
2674    }
2675
2676    fn exec_savepoint(&mut self, name: String) -> Result<QueryResult, EngineError> {
2677        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2678        let state = self
2679            .tx_catalogs
2680            .get_mut(&tx_id)
2681            .ok_or(EngineError::NoActiveTransaction)?;
2682        // PG re-uses an existing savepoint name by dropping the older
2683        // entry and pushing a fresh one — match that behaviour so
2684        // application code can `SAVEPOINT sp; ...; SAVEPOINT sp` freely.
2685        state.savepoints.retain(|(n, _)| n != &name);
2686        let snapshot = state.catalog.clone();
2687        state.savepoints.push((name, snapshot));
2688        Ok(QueryResult::CommandOk {
2689            affected: 0,
2690            modified_catalog: false,
2691        })
2692    }
2693
2694    fn exec_rollback_to_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2695        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2696        let state = self
2697            .tx_catalogs
2698            .get_mut(&tx_id)
2699            .ok_or(EngineError::NoActiveTransaction)?;
2700        let pos = state
2701            .savepoints
2702            .iter()
2703            .rposition(|(n, _)| n == name)
2704            .ok_or_else(|| {
2705                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2706            })?;
2707        // The savepoint stays on the stack (PG semantics): a later
2708        // `RELEASE` or further `ROLLBACK TO` is still allowed. Everything
2709        // after it is discarded.
2710        let snapshot = state.savepoints[pos].1.clone();
2711        state.savepoints.truncate(pos + 1);
2712        state.catalog = snapshot;
2713        Ok(QueryResult::CommandOk {
2714            affected: 0,
2715            modified_catalog: false,
2716        })
2717    }
2718
2719    fn exec_release_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2720        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2721        let state = self
2722            .tx_catalogs
2723            .get_mut(&tx_id)
2724            .ok_or(EngineError::NoActiveTransaction)?;
2725        let pos = state
2726            .savepoints
2727            .iter()
2728            .rposition(|(n, _)| n == name)
2729            .ok_or_else(|| {
2730                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2731            })?;
2732        // RELEASE keeps the work since the savepoint, just discards the
2733        // bookmark plus everything nested under it.
2734        state.savepoints.truncate(pos);
2735        Ok(QueryResult::CommandOk {
2736            affected: 0,
2737            modified_catalog: false,
2738        })
2739    }
2740
    /// NOTE: the following paragraph describes `exec_alter_index`
    /// (defined right after `exec_alter_table`); it sits here only
    /// because the two doc comments ended up merged.
    ///
    /// v6.0.4 — synchronous `ALTER INDEX <name> REBUILD [WITH
    /// (encoding = …)]`. Walks every table in the active catalog
    /// looking for an index matching `stmt.name`, then delegates the
    /// rebuild (including any encoding switch) to
    /// `Table::rebuild_nsw_index`. The "live" non-blocking
    /// optimisation is v6.0.4.1 / v6.1.x territory.
    ///
2747    /// v6.7.2 — `ALTER TABLE t SET hot_tier_bytes = X`. Dispatch
2748    /// arm. Currently the only setting is `hot_tier_bytes`; later
2749    /// v6.7.x can extend `AlterTableTarget` without touching this
2750    /// arm structure.
2751    fn exec_alter_table(
2752        &mut self,
2753        s: spg_sql::ast::AlterTableStatement,
2754    ) -> Result<QueryResult, EngineError> {
2755        match s.target {
2756            spg_sql::ast::AlterTableTarget::SetHotTierBytes(n) => {
2757                let table = self
2758                    .active_catalog_mut()
2759                    .get_mut(&s.name)
2760                    .ok_or_else(|| {
2761                        EngineError::Storage(StorageError::TableNotFound {
2762                            name: s.name.clone(),
2763                        })
2764                    })?;
2765                table.schema_mut().hot_tier_bytes = Some(n);
2766            }
2767            spg_sql::ast::AlterTableTarget::AddForeignKey(fk) => {
2768                // v7.6.8 — resolve FK against the live catalog first
2769                // (validates parent table, columns, indices). Then
2770                // verify every existing row in the child table
2771                // satisfies the new constraint. Then install it.
2772                let cols_snapshot = self
2773                    .active_catalog()
2774                    .get(&s.name)
2775                    .ok_or_else(|| {
2776                        EngineError::Storage(StorageError::TableNotFound {
2777                            name: s.name.clone(),
2778                        })
2779                    })?
2780                    .schema()
2781                    .columns
2782                    .clone();
2783                let storage_fk = resolve_foreign_key(
2784                    &s.name,
2785                    &cols_snapshot,
2786                    fk,
2787                    self.active_catalog(),
2788                )?;
2789                // Verify existing rows. Treat them as a virtual
2790                // INSERT batch — reusing the v7.6.2 enforce helper.
2791                let existing_rows: Vec<Vec<Value>> = self
2792                    .active_catalog()
2793                    .get(&s.name)
2794                    .expect("checked above")
2795                    .rows()
2796                    .iter()
2797                    .map(|r| r.values.clone())
2798                    .collect();
2799                enforce_fk_inserts(
2800                    self.active_catalog(),
2801                    &s.name,
2802                    core::slice::from_ref(&storage_fk),
2803                    &existing_rows,
2804                )?;
2805                // Reject duplicate constraint name.
2806                let table = self
2807                    .active_catalog_mut()
2808                    .get_mut(&s.name)
2809                    .expect("checked above");
2810                if let Some(name) = &storage_fk.name
2811                    && table
2812                        .schema()
2813                        .foreign_keys
2814                        .iter()
2815                        .any(|f| f.name.as_ref() == Some(name))
2816                {
2817                    return Err(EngineError::Unsupported(alloc::format!(
2818                        "ALTER TABLE ADD CONSTRAINT: a constraint named {name:?} already exists"
2819                    )));
2820                }
2821                table.schema_mut().foreign_keys.push(storage_fk);
2822            }
2823            spg_sql::ast::AlterTableTarget::DropForeignKey(name) => {
2824                let table = self
2825                    .active_catalog_mut()
2826                    .get_mut(&s.name)
2827                    .ok_or_else(|| {
2828                        EngineError::Storage(StorageError::TableNotFound {
2829                            name: s.name.clone(),
2830                        })
2831                    })?;
2832                let fks = &mut table.schema_mut().foreign_keys;
2833                let before = fks.len();
2834                fks.retain(|f| f.name.as_ref() != Some(&name));
2835                if fks.len() == before {
2836                    return Err(EngineError::Unsupported(alloc::format!(
2837                        "ALTER TABLE DROP CONSTRAINT: no FK named {name:?} on {:?}",
2838                        s.name
2839                    )));
2840                }
2841            }
2842        }
2843        Ok(QueryResult::CommandOk {
2844            affected: 0,
2845            modified_catalog: !self.in_transaction(),
2846        })
2847    }
2848
2849    fn exec_alter_index(
2850        &mut self,
2851        stmt: spg_sql::ast::AlterIndexStatement,
2852    ) -> Result<QueryResult, EngineError> {
2853        // Translate the optional SQL-side encoding choice into the
2854        // storage-side enum; the same SqlVecEncoding -> VecEncoding
2855        // bridge `column_type_to_data_type` uses.
2856        let spg_sql::ast::AlterIndexStatement {
2857            name: idx_name,
2858            target,
2859        } = stmt;
2860        let spg_sql::ast::AlterIndexTarget::Rebuild { encoding } = target;
2861        let target = encoding.map(|e| match e {
2862            SqlVecEncoding::F32 => VecEncoding::F32,
2863            SqlVecEncoding::Sq8 => VecEncoding::Sq8,
2864            SqlVecEncoding::F16 => VecEncoding::F16,
2865        });
2866        // Linear scan: index names are globally unique within a
2867        // catalog (enforced by add_nsw_index_inner) so the first
2868        // match is the only one. Save the table name to avoid
2869        // borrowing while we then take a mut borrow.
2870        let table_name = {
2871            let cat = self.active_catalog();
2872            let mut found: Option<String> = None;
2873            for tname in cat.table_names() {
2874                if let Some(t) = cat.get(&tname)
2875                    && t.indices().iter().any(|i| i.name == idx_name)
2876                {
2877                    found = Some(tname);
2878                    break;
2879                }
2880            }
2881            found.ok_or_else(|| {
2882                EngineError::Storage(StorageError::IndexNotFound {
2883                    name: idx_name.clone(),
2884                })
2885            })?
2886        };
2887        let table = self
2888            .active_catalog_mut()
2889            .get_mut(&table_name)
2890            .expect("table found above");
2891        table.rebuild_nsw_index(&idx_name, target)?;
2892        // v6.3.1 — ALTER INDEX REBUILD potentially with new encoding
2893        // changes cost characteristics; evict any cached plans.
2894        self.plan_cache.evict_referencing(&table_name);
2895        Ok(QueryResult::CommandOk {
2896            affected: 0,
2897            modified_catalog: !self.in_transaction(),
2898        })
2899    }
2900
2901    fn exec_create_index(
2902        &mut self,
2903        stmt: CreateIndexStatement,
2904    ) -> Result<QueryResult, EngineError> {
2905        let table = self
2906            .active_catalog_mut()
2907            .get_mut(&stmt.table)
2908            .ok_or_else(|| {
2909                EngineError::Storage(StorageError::TableNotFound {
2910                    name: stmt.table.clone(),
2911                })
2912            })?;
2913        // `IF NOT EXISTS` reduces DuplicateIndex to a no-op CommandOk.
2914        if stmt.if_not_exists && table.indices().iter().any(|i| i.name == stmt.name) {
2915            return Ok(QueryResult::CommandOk {
2916                affected: 0,
2917                modified_catalog: false,
2918            });
2919        }
2920        // v7.9.14 — multi-column index parses through; engine
2921        // builds a single-column BTree on the leading column only.
2922        // The extras live on the AST so spg-server's dispatcher
2923        // can emit a PG-wire NoticeResponse / log line. Composite
2924        // BTree keys land in v7.10.
2925        let _ = &stmt.extra_columns; // intentional drop on engine side
2926        let table_name = stmt.table.clone();
2927        // v6.8.0 — resolve INCLUDE column names to positions. Done
2928        // before `add_index` so a typo error surfaces before any
2929        // catalog mutation lands.
2930        let included_positions: Vec<usize> = if stmt.included_columns.is_empty() {
2931            Vec::new()
2932        } else {
2933            let schema = table.schema();
2934            stmt.included_columns
2935                .iter()
2936                .map(|c| {
2937                    schema.column_position(c).ok_or_else(|| {
2938                        EngineError::Storage(StorageError::ColumnNotFound {
2939                            column: c.clone(),
2940                        })
2941                    })
2942                })
2943                .collect::<Result<Vec<_>, _>>()?
2944        };
2945        match stmt.method {
2946            IndexMethod::BTree => table.add_index(stmt.name.clone(), &stmt.column)?,
2947            IndexMethod::Hnsw => {
2948                if !included_positions.is_empty() {
2949                    return Err(EngineError::Unsupported(
2950                        "INCLUDE columns are not supported on HNSW indexes".into(),
2951                    ));
2952                }
2953                table.add_nsw_index(stmt.name.clone(), &stmt.column, spg_storage::NSW_DEFAULT_M)?;
2954            }
2955            // v6.7.1 — BRIN. Pure metadata; no in-memory data.
2956            IndexMethod::Brin => {
2957                if !included_positions.is_empty() {
2958                    return Err(EngineError::Unsupported(
2959                        "INCLUDE columns are not supported on BRIN indexes".into(),
2960                    ));
2961                }
2962                table.add_brin_index(stmt.name.clone(), &stmt.column)?;
2963            }
2964        }
2965        if !included_positions.is_empty()
2966            && let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name)
2967        {
2968            idx.included_columns = included_positions;
2969        }
2970        // v6.8.1 — persist partial-index predicate. Stored as the
2971        // expression's Display form so the catalog snapshot stays
2972        // pure (storage has no spg-sql dependency). The runtime
2973        // maintenance path treats partial indexes identically to
2974        // full indexes for v6.8.1 (over-maintenance is safe; the
2975        // planner-side "use partial when query WHERE implies the
2976        // predicate" pass is STABILITY carve-out).
2977        if let Some(pred_expr) = &stmt.partial_predicate {
2978            let canonical = pred_expr.to_string();
2979            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2980                return Err(EngineError::Unsupported(
2981                    "WHERE predicates are not supported on HNSW or BRIN indexes".into(),
2982                ));
2983            }
2984            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
2985                idx.partial_predicate = Some(canonical);
2986            }
2987        }
2988        // v6.8.2 — persist expression index key. Same Display-form
2989        // storage; the runtime maintenance pass evaluates each
2990        // row's expression to derive the index key, but for v6.8.2
2991        // the engine falls through to the bare-column-reference
2992        // path and the expression is preserved for format-layer
2993        // round-trip + future planner work. Carved-out in
2994        // STABILITY § "Out of v6.8".
2995        if let Some(key_expr) = &stmt.expression {
2996            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2997                return Err(EngineError::Unsupported(
2998                    "Expression keys are not supported on HNSW or BRIN indexes".into(),
2999                ));
3000            }
3001            let canonical = key_expr.to_string();
3002            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3003                idx.expression = Some(canonical);
3004            }
3005        }
3006        // v7.9.29 — persist `is_unique` flag on the storage Index.
3007        // Combined with `partial_predicate`, INSERT enforcement
3008        // checks that no other row whose predicate evaluates true
3009        // shares the same indexed key. Parser already rejected
3010        // `UNIQUE` on HNSW / BRIN, so plain BTree here.
3011        // For multi-column UNIQUE INDEX the extras matter (the
3012        // full tuple is the uniqueness key), so resolve them to
3013        // column positions and persist on the index too.
3014        if stmt.is_unique {
3015            let mut extra_positions: alloc::vec::Vec<usize> = alloc::vec::Vec::new();
3016            for col_name in &stmt.extra_columns {
3017                let pos = table
3018                    .schema()
3019                    .columns
3020                    .iter()
3021                    .position(|c| c.name.eq_ignore_ascii_case(col_name))
3022                    .ok_or_else(|| {
3023                        EngineError::Unsupported(alloc::format!(
3024                            "UNIQUE INDEX {:?}: extra column {col_name:?} not in table {:?}",
3025                            stmt.name, stmt.table
3026                        ))
3027                    })?;
3028                extra_positions.push(pos);
3029            }
3030            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3031                idx.is_unique = true;
3032                idx.extra_column_positions = extra_positions;
3033            }
3034            // At index-creation time, check the existing rows for
3035            // pre-existing duplicates that would have violated the
3036            // new constraint — otherwise CREATE UNIQUE INDEX would
3037            // silently leave duplicates in place.
3038            let snapshot_indices = table.indices().to_vec();
3039            let snapshot_rows: alloc::vec::Vec<spg_storage::Row> =
3040                table.rows().iter().cloned().collect();
3041            let snapshot_schema = table.schema().clone();
3042            let idx_ref = snapshot_indices
3043                .iter()
3044                .find(|i| i.name == stmt.name)
3045                .expect("just-added index");
3046            check_existing_unique_violation(idx_ref, &snapshot_schema, &snapshot_rows)?;
3047        }
3048        // v6.3.1 — adding an index can change the optimal plan for
3049        // any cached query that references this table.
3050        self.plan_cache.evict_referencing(&table_name);
3051        Ok(QueryResult::CommandOk {
3052            affected: 0,
3053            modified_catalog: !self.in_transaction(),
3054        })
3055    }
3056
3057    fn exec_create_table(
3058        &mut self,
3059        stmt: CreateTableStatement,
3060    ) -> Result<QueryResult, EngineError> {
3061        if stmt.if_not_exists && self.active_catalog().get(&stmt.name).is_some() {
3062            return Ok(QueryResult::CommandOk {
3063                affected: 0,
3064                modified_catalog: false,
3065            });
3066        }
3067        let table_name = stmt.name.clone();
3068        // v7.9.13 — pluck the names of any columns marked
3069        // `PRIMARY KEY` inline so the post-create-table pass can
3070        // build an implicit BTree index. mailrs F1.
3071        let inline_pk_columns: Vec<String> = stmt
3072            .columns
3073            .iter()
3074            .filter(|c| c.is_primary_key)
3075            .map(|c| c.name.clone())
3076            .collect();
3077        // v7.9.19 — table-level constraints: PRIMARY KEY (a, b, ...)
3078        // and UNIQUE (a, b, ...). Each builds a BTree index on the
3079        // leading column (the existing single-column storage tier)
3080        // and registers a UniquenessConstraint on the schema for
3081        // INSERT-time enforcement of the full tuple. mailrs G1/G6.
3082        let cols = stmt
3083            .columns
3084            .into_iter()
3085            .map(column_def_to_schema)
3086            .collect::<Result<Vec<_>, _>>()?;
3087        // Composite NOT-NULL implication for PRIMARY KEY columns.
3088        let mut cols = cols;
3089        for tc in &stmt.table_constraints {
3090            if let spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } = tc {
3091                for col_name in columns {
3092                    if let Some(col) = cols.iter_mut().find(|c| c.name == *col_name) {
3093                        col.nullable = false;
3094                    }
3095                }
3096            }
3097        }
3098        // v7.6.1 — resolve every FK in the statement against the
3099        // already-known catalog. Validates: parent table exists,
3100        // parent column names exist, arity matches, parent columns
3101        // have a PK / UNIQUE index. Self-referencing FKs (parent
3102        // table == this table) resolve against the column list we
3103        // just built — they don't need the catalog yet.
3104        let mut fks: Vec<spg_storage::ForeignKeyConstraint> =
3105            Vec::with_capacity(stmt.foreign_keys.len());
3106        for fk in stmt.foreign_keys {
3107            fks.push(resolve_foreign_key(
3108                &table_name,
3109                &cols,
3110                fk,
3111                self.active_catalog(),
3112            )?);
3113        }
3114        let mut schema = TableSchema::new(table_name.clone(), cols);
3115        schema.foreign_keys = fks;
3116        // v7.9.19 — translate AST table_constraints to storage
3117        // UniquenessConstraints (column name → position) so the
3118        // INSERT enforcement helper sees positions directly.
3119        let mut uc_storage: Vec<spg_storage::UniquenessConstraint> = Vec::new();
3120        for tc in &stmt.table_constraints {
3121            let (is_pk, names) = match tc {
3122                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3123                    (true, columns.clone())
3124                }
3125                spg_sql::ast::TableConstraint::Unique { columns, .. } => {
3126                    (false, columns.clone())
3127                }
3128            };
3129            let mut positions = Vec::with_capacity(names.len());
3130            for n in &names {
3131                let pos = schema
3132                    .columns
3133                    .iter()
3134                    .position(|c| c.name == *n)
3135                    .ok_or_else(|| {
3136                        EngineError::Unsupported(alloc::format!(
3137                            "table constraint references unknown column {n:?}"
3138                        ))
3139                    })?;
3140                positions.push(pos);
3141            }
3142            uc_storage.push(spg_storage::UniquenessConstraint {
3143                is_primary_key: is_pk,
3144                columns: positions,
3145            });
3146        }
3147        schema.uniqueness_constraints = uc_storage.clone();
3148        self.active_catalog_mut().create_table(schema)?;
3149        // v7.9.13 — implicit BTree per inline PK column +
3150        // v7.9.19 — implicit BTree on the leading column of every
3151        // table-level PRIMARY KEY / UNIQUE constraint.
3152        let table = self
3153            .active_catalog_mut()
3154            .get_mut(&table_name)
3155            .expect("just created");
3156        for (i, col_name) in inline_pk_columns.iter().enumerate() {
3157            let idx_name = if inline_pk_columns.len() == 1 {
3158                alloc::format!("{table_name}_pkey")
3159            } else {
3160                alloc::format!("{table_name}_pkey_{i}")
3161            };
3162            if let Err(e) = table.add_index(idx_name, col_name) {
3163                return Err(EngineError::Storage(e));
3164            }
3165        }
3166        for (i, tc) in stmt.table_constraints.iter().enumerate() {
3167            let (is_pk, names) = match tc {
3168                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3169                    (true, columns)
3170                }
3171                spg_sql::ast::TableConstraint::Unique { columns, .. } => {
3172                    (false, columns)
3173                }
3174            };
3175            let leading = &names[0];
3176            // Skip if a same-column BTree already exists (e.g.
3177            // inline PK on the leading column).
3178            let already = table
3179                .indices()
3180                .iter()
3181                .any(|idx| {
3182                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
3183                        && table.schema().columns[idx.column_position].name == *leading
3184                });
3185            if already {
3186                continue;
3187            }
3188            let suffix = if is_pk { "pkey" } else { "key" };
3189            let idx_name = if names.len() == 1 {
3190                alloc::format!("{table_name}_{leading}_{suffix}")
3191            } else {
3192                alloc::format!("{table_name}_{leading}_{suffix}_{i}")
3193            };
3194            if let Err(e) = table.add_index(idx_name, leading) {
3195                return Err(EngineError::Storage(e));
3196            }
3197        }
3198        Ok(QueryResult::CommandOk {
3199            affected: 0,
3200            modified_catalog: !self.in_transaction(),
3201        })
3202    }
3203
3204    fn exec_insert(&mut self, stmt: InsertStatement) -> Result<QueryResult, EngineError> {
3205        // v7.9.21 — snapshot the clock fn pointer before the mut
3206        // borrow on the catalog opens; runtime DEFAULT eval needs
3207        // it inside the row hot loop.
3208        let clock = self.clock;
3209        let table = self
3210            .active_catalog_mut()
3211            .get_mut(&stmt.table)
3212            .ok_or_else(|| {
3213                EngineError::Storage(StorageError::TableNotFound {
3214                    name: stmt.table.clone(),
3215                })
3216            })?;
3217        // v3.1.5: clone the columns vector only (not the whole
3218        // TableSchema — saves one String alloc for the table name).
3219        // We need an owned snapshot because we'll call `table.insert`
3220        // (mutable borrow on `table`) inside the row loop while
3221        // reading schema fields.
3222        let column_meta: Vec<ColumnSchema> = table.schema().columns.clone();
3223        let schema_cols_len = column_meta.len();
3224        // Build a permutation `tuple_pos[c] = Some(j)` meaning schema
3225        // column `c` is filled from the `j`-th tuple slot; `None` means
3226        // "fill with NULL". Validated once and reused for every row.
3227        let tuple_pos: Option<Vec<Option<usize>>> = match &stmt.columns {
3228            None => None, // 1-1 mapping, fast path
3229            Some(cols) => {
3230                let mut map = alloc::vec![None; schema_cols_len];
3231                for (j, name) in cols.iter().enumerate() {
3232                    let idx = column_meta
3233                        .iter()
3234                        .position(|c| c.name == *name)
3235                        .ok_or_else(|| {
3236                            EngineError::Eval(EvalError::ColumnNotFound { name: name.clone() })
3237                        })?;
3238                    if map[idx].is_some() {
3239                        return Err(EngineError::Storage(StorageError::ArityMismatch {
3240                            expected: schema_cols_len,
3241                            actual: cols.len(),
3242                        }));
3243                    }
3244                    map[idx] = Some(j);
3245                }
3246                // Omitted columns must either be nullable, carry a
3247                // DEFAULT, or be AUTO_INCREMENT. Catch NOT NULL
3248                // omissions up front so the WAL stays clean.
3249                for (i, col) in column_meta.iter().enumerate() {
3250                    if map[i].is_none()
3251                        && !col.nullable
3252                        && col.default.is_none()
3253                        && col.runtime_default.is_none()
3254                        && !col.auto_increment
3255                    {
3256                        return Err(EngineError::Storage(StorageError::NullInNotNull {
3257                            column: col.name.clone(),
3258                        }));
3259                    }
3260                }
3261                Some(map)
3262            }
3263        };
3264        let expected_tuple_len = stmt.columns.as_ref().map_or(schema_cols_len, Vec::len);
3265        // v7.6.2 — snapshot this table's FK list before the
3266        // mutable-borrow window so we can run parent lookups
3267        // against the immutable catalog after parsing. Empty vec is
3268        // the no-FK fast path; clone cost is O(fks * arity) which
3269        // is < 100 ns for typical schemas.
3270        let fks = table.schema().foreign_keys.clone();
3271        let mut affected = 0usize;
3272        // Stage 1 — parse + AUTO_INC + coerce all rows under the
3273        // single mutable borrow.
3274        let mut all_values: Vec<Vec<Value>> = Vec::with_capacity(stmt.rows.len());
3275        for tuple in stmt.rows {
3276            if tuple.len() != expected_tuple_len {
3277                return Err(EngineError::Storage(StorageError::ArityMismatch {
3278                    expected: expected_tuple_len,
3279                    actual: tuple.len(),
3280                }));
3281            }
3282            // Fast path: no column-list permutation → tuple slot j
3283            // maps to schema column j. We can zip schema with tuple
3284            // and skip the `raw_tuple` staging allocation entirely.
3285            let values: Vec<Value> = if let Some(map) = &tuple_pos {
3286                // Permuted path: still need raw_tuple to index by `map[i]`.
3287                let raw_tuple: Vec<Value> = tuple
3288                    .into_iter()
3289                    .map(literal_expr_to_value)
3290                    .collect::<Result<_, _>>()?;
3291                let mut out = Vec::with_capacity(schema_cols_len);
3292                for (i, col) in column_meta.iter().enumerate() {
3293                    let mut raw = match map[i] {
3294                        Some(j) => raw_tuple[j].clone(),
3295                        None => resolve_column_default_free(col, clock)?,
3296                    };
3297                    if col.auto_increment && raw.is_null() {
3298                        let next = table.next_auto_value(i).ok_or_else(|| {
3299                            EngineError::Unsupported(alloc::format!(
3300                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3301                                col.name
3302                            ))
3303                        })?;
3304                        raw = Value::BigInt(next);
3305                    }
3306                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3307                }
3308                out
3309            } else {
3310                // 1-1 mapping fast path: single Vec alloc, no raw_tuple.
3311                let mut out = Vec::with_capacity(schema_cols_len);
3312                for (i, (col, expr)) in column_meta.iter().zip(tuple).enumerate() {
3313                    let mut raw = literal_expr_to_value(expr)?;
3314                    if col.auto_increment && raw.is_null() {
3315                        let next = table.next_auto_value(i).ok_or_else(|| {
3316                            EngineError::Unsupported(alloc::format!(
3317                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3318                                col.name
3319                            ))
3320                        })?;
3321                        raw = Value::BigInt(next);
3322                    }
3323                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3324                }
3325                out
3326            };
3327            all_values.push(values);
3328        }
3329        // Stage 2 — FK enforcement on the immutable catalog.
3330        // Non-lexical lifetimes release the mutable borrow on
3331        // `table` here since stage 1 was the last use. The
3332        // parent-table lookup runs before any row is committed.
3333        let uniqueness = table.schema().uniqueness_constraints.clone();
3334        let _ = table;
3335        if !fks.is_empty() {
3336            enforce_fk_inserts(self.active_catalog(), &stmt.table, &fks, &all_values)?;
3337        }
3338        // v7.9.19 — composite UNIQUE / PRIMARY KEY enforcement.
3339        enforce_uniqueness_inserts(
3340            self.active_catalog(),
3341            &stmt.table,
3342            &uniqueness,
3343            &all_values,
3344        )?;
3345        // v7.9.29 — CREATE UNIQUE INDEX [WHERE pred] enforcement.
3346        // Independent of table-level UniquenessConstraint (which
3347        // can't carry a predicate). Walks the table's indexes;
3348        // for each `is_unique` index, only rows whose
3349        // partial_predicate evaluates truthy are checked for
3350        // collision. mailrs K1.
3351        enforce_unique_index_inserts(
3352            self.active_catalog(),
3353            &stmt.table,
3354            &all_values,
3355        )?;
3356        // v7.9.8 / v7.9.9 — ON CONFLICT handling.
3357        //   - `DO NOTHING` filters `all_values` to non-conflicting
3358        //     rows + drops within-batch duplicates.
3359        //   - `DO UPDATE SET …` ALSO filters, but for each
3360        //     conflicting row it queues an UPDATE on the existing
3361        //     row using the incoming row's values as `EXCLUDED.*`.
3362        let mut pending_updates: Vec<(usize, Vec<Value>)> = Vec::new();
3363        let mut skipped_count = 0usize;
3364        if let Some(clause) = &stmt.on_conflict {
3365            let conflict_cols = resolve_on_conflict_columns(
3366                self.active_catalog(),
3367                &stmt.table,
3368                clause.target_columns.as_slice(),
3369            )?;
3370            let mut kept: Vec<Vec<Value>> = Vec::with_capacity(all_values.len());
3371            let mut seen_keys: Vec<Vec<Value>> = Vec::new();
3372            for values in all_values {
3373                let key_tuple: Vec<&Value> =
3374                    conflict_cols.iter().map(|&c| &values[c]).collect();
3375                // SQL spec: NULL in any conflict column means "no
3376                // conflict possible" (NULL ≠ NULL for uniqueness).
3377                let has_null_key = key_tuple.iter().any(|v| matches!(v, Value::Null));
3378                let collides_with_table = !has_null_key
3379                    && on_conflict_keys_exist(
3380                        self.active_catalog(),
3381                        &stmt.table,
3382                        &conflict_cols,
3383                        &key_tuple,
3384                    );
3385                let key_tuple_owned: Vec<Value> =
3386                    key_tuple.iter().map(|v| (*v).clone()).collect();
3387                let collides_with_batch = !has_null_key
3388                    && seen_keys.iter().any(|k| k == &key_tuple_owned);
3389                let collides = collides_with_table || collides_with_batch;
3390                match (&clause.action, collides) {
3391                    (_, false) => {
3392                        seen_keys.push(key_tuple_owned);
3393                        kept.push(values);
3394                    }
3395                    (spg_sql::ast::OnConflictAction::Nothing, true) => {
3396                        skipped_count += 1;
3397                    }
3398                    (
3399                        spg_sql::ast::OnConflictAction::Update {
3400                            assignments,
3401                            where_,
3402                        },
3403                        true,
3404                    ) => {
3405                        if !collides_with_table {
3406                            skipped_count += 1;
3407                            continue;
3408                        }
3409                        let target_pos = lookup_row_position_by_keys(
3410                            self.active_catalog(),
3411                            &stmt.table,
3412                            &conflict_cols,
3413                            &key_tuple,
3414                        )
3415                        .ok_or_else(|| {
3416                            EngineError::Unsupported(
3417                                "ON CONFLICT DO UPDATE: conflict detected but row \
3418                                 position could not be resolved (cold-tier row?)"
3419                                    .into(),
3420                            )
3421                        })?;
3422                        let updated = apply_on_conflict_assignments(
3423                            self.active_catalog(),
3424                            &stmt.table,
3425                            target_pos,
3426                            &values,
3427                            assignments,
3428                            where_.as_ref(),
3429                        )?;
3430                        if let Some(new_row) = updated {
3431                            pending_updates.push((target_pos, new_row));
3432                        } else {
3433                            skipped_count += 1;
3434                        }
3435                    }
3436                }
3437            }
3438            all_values = kept;
3439        }
3440        // Stage 3 — insert all rows under a fresh mutable borrow.
3441        let table = self
3442            .active_catalog_mut()
3443            .get_mut(&stmt.table)
3444            .ok_or_else(|| {
3445                EngineError::Storage(StorageError::TableNotFound {
3446                    name: stmt.table.clone(),
3447                })
3448            })?;
3449        // v7.9.4 — keep RETURNING projection rows separate per
3450        // INSERT and per UPDATE branch so DO UPDATE pushes the new
3451        // post-update state, not the incoming-only values.
3452        let mut returning_rows: Vec<Vec<Value>> = Vec::new();
3453        for values in all_values {
3454            if stmt.returning.is_some() {
3455                returning_rows.push(values.clone());
3456            }
3457            table.insert(Row::new(values))?;
3458            affected += 1;
3459        }
3460        // v7.9.9 — apply ON CONFLICT DO UPDATE rewrites collected
3461        // in the conflict-resolution pass. update_row handles
3462        // index maintenance + body re-encoding.
3463        for (pos, new_row) in pending_updates {
3464            if stmt.returning.is_some() {
3465                returning_rows.push(new_row.clone());
3466            }
3467            table.update_row(pos, new_row)?;
3468            affected += 1;
3469        }
3470        let _ = skipped_count;
3471        // v7.9.4/v7.9.9 — RETURNING streams the rows that ended
3472        // up in the table after this statement (insert or
3473        // post-update on conflict).
3474        if let Some(items) = &stmt.returning {
3475            let _ = table;
3476            return self.build_returning_rows(
3477                &stmt.table,
3478                items,
3479                returning_rows,
3480            );
3481        }
3482        // v6.2.1 — auto-analyze: track per-table modified-row
3483        // counter so the background sweep can decide when to
3484        // re-ANALYZE. Cheap path on the autocommit-wrap hot loop
3485        // — one BTreeMap entry update per INSERT batch.
3486        if !self.in_transaction() && affected > 0 {
3487            self.statistics
3488                .record_modifications(&stmt.table, affected as u64);
3489        }
3490        Ok(QueryResult::CommandOk {
3491            affected,
3492            modified_catalog: !self.in_transaction(),
3493        })
3494    }
3495
3496    /// v4.5: SELECT with cooperative cancellation. The token is
3497    /// honoured between UNION peers and inside the bare-SELECT row
3498    /// loop; HNSW kNN graph walks and the aggregate executor don't
3499    /// honour it yet (deferred — those paths bound their work
3500    /// internally by `LIMIT k` and `GROUP BY` cardinality).
3501    /// v6.10.2 — cold-tier time-travel scan. Resolves the segment
3502    /// by id, decodes each row body against the table's current
3503    /// schema, applies the SELECT's projection + optional WHERE +
3504    /// optional LIMIT, returns a `Rows` result. JOINs / aggregates
3505    /// / ORDER BY are unsupported on this path (STABILITY carve-
3506    /// out); operators wanting them should restore the segment
3507    /// into a regular table first.
3508    fn exec_select_as_of_segment(
3509        &self,
3510        stmt: &SelectStatement,
3511        from: &spg_sql::ast::FromClause,
3512        segment_id: u32,
3513    ) -> Result<QueryResult, EngineError> {
3514        // v6.10.2 scope: no joins, no aggregates, no ORDER BY,
3515        // no GROUP BY / HAVING / UNION / OFFSET / DISTINCT.
3516        if !from.joins.is_empty()
3517            || stmt.group_by.is_some()
3518            || stmt.having.is_some()
3519            || !stmt.unions.is_empty()
3520            || !stmt.order_by.is_empty()
3521            || stmt.offset.is_some()
3522            || stmt.distinct
3523            || aggregate::uses_aggregate(stmt)
3524        {
3525            return Err(EngineError::Unsupported(
3526                "AS OF SEGMENT supports SELECT projection + WHERE + LIMIT only \
3527                 (joins / aggregates / ORDER BY are STABILITY § \"Out of v6.10\")"
3528                    .into(),
3529            ));
3530        }
3531        let table = self
3532            .active_catalog()
3533            .get(&from.primary.name)
3534            .ok_or_else(|| StorageError::TableNotFound {
3535                name: from.primary.name.clone(),
3536            })?;
3537        let schema = table.schema().clone();
3538        let schema_cols = &schema.columns;
3539        let alias = from
3540            .primary
3541            .alias
3542            .as_deref()
3543            .unwrap_or(from.primary.name.as_str());
3544        let ctx = EvalContext::new(schema_cols, Some(alias));
3545        let seg = self
3546            .active_catalog()
3547            .cold_segment(segment_id)
3548            .ok_or_else(|| {
3549                EngineError::Unsupported(alloc::format!(
3550                    "AS OF SEGMENT: cold segment {segment_id} not registered"
3551                ))
3552            })?;
3553        let mut out_rows: Vec<Row> = Vec::new();
3554        let mut limit_remaining: Option<usize> =
3555            stmt.limit_literal().and_then(|n| usize::try_from(n).ok());
3556        for (_key, body) in seg.scan() {
3557            let (row, _consumed) = spg_storage::decode_row_body_dense(&body, &schema)
3558                .map_err(EngineError::Storage)?;
3559            if let Some(where_expr) = &stmt.where_ {
3560                let cond = self.eval_expr_simple(where_expr, &row, &ctx)?;
3561                if !matches!(cond, Value::Bool(true)) {
3562                    continue;
3563                }
3564            }
3565            // Projection.
3566            let projected = self.project_row_simple(&row, &stmt.items, schema_cols, alias)?;
3567            out_rows.push(projected);
3568            if let Some(rem) = limit_remaining.as_mut() {
3569                if *rem == 0 {
3570                    out_rows.pop();
3571                    break;
3572                }
3573                *rem -= 1;
3574            }
3575        }
3576        // Output column schema: derive from SELECT items.
3577        let columns = self.derive_output_columns(&stmt.items, schema_cols, alias);
3578        Ok(QueryResult::Rows {
3579            columns,
3580            rows: out_rows,
3581        })
3582    }
3583
3584    /// v6.10.2 — simple-path WHERE eval that doesn't go through
3585    /// the correlated-subquery / Memoize machinery. AS OF SEGMENT
3586    /// scan paths predicate against a snapshot frozen segment, no
3587    /// cross-row state.
3588    fn eval_expr_simple(
3589        &self,
3590        expr: &Expr,
3591        row: &Row,
3592        ctx: &EvalContext,
3593    ) -> Result<Value, EngineError> {
3594        let cancel = CancelToken::none();
3595        self.eval_expr_with_correlated(expr, row, ctx, cancel, None)
3596    }
3597
3598    /// v7.9.4 — INSERT / UPDATE / DELETE RETURNING projector.
3599    /// Given the table name, the user-supplied projection items,
3600    /// and the mutated rows (post-insert / post-update values, or
3601    /// pre-delete snapshot), build a `QueryResult::Rows` whose
3602    /// schema describes the projected columns. Mailrs migration
3603    /// blocker #1.
3604    fn build_returning_rows(
3605        &self,
3606        table_name: &str,
3607        items: &[SelectItem],
3608        mutated_rows: Vec<Vec<Value>>,
3609    ) -> Result<QueryResult, EngineError> {
3610        let table = self.active_catalog().get(table_name).ok_or_else(|| {
3611            EngineError::Storage(StorageError::TableNotFound {
3612                name: table_name.into(),
3613            })
3614        })?;
3615        let schema_cols = table.schema().columns.clone();
3616        let columns = self.derive_output_columns(items, &schema_cols, table_name);
3617        let mut out_rows: Vec<Row> = Vec::with_capacity(mutated_rows.len());
3618        for values in mutated_rows {
3619            let row = Row::new(values);
3620            let projected = self.project_row_simple(&row, items, &schema_cols, table_name)?;
3621            out_rows.push(projected);
3622        }
3623        Ok(QueryResult::Rows {
3624            columns,
3625            rows: out_rows,
3626        })
3627    }
3628
3629    /// v6.10.2 — projection for AS OF SEGMENT. Resolves
3630    /// `SelectItem::Wildcard` to all schema columns and
3631    /// `SelectItem::Expr` via the regular eval path.
3632    fn project_row_simple(
3633        &self,
3634        row: &Row,
3635        items: &[SelectItem],
3636        schema_cols: &[ColumnSchema],
3637        alias: &str,
3638    ) -> Result<Row, EngineError> {
3639        let ctx = EvalContext::new(schema_cols, Some(alias));
3640        let cancel = CancelToken::none();
3641        let mut out_vals = Vec::new();
3642        for item in items {
3643            match item {
3644                SelectItem::Wildcard => {
3645                    out_vals.extend(row.values.iter().cloned());
3646                }
3647                SelectItem::Expr { expr, .. } => {
3648                    let v = self.eval_expr_with_correlated(expr, row, &ctx, cancel, None)?;
3649                    out_vals.push(v);
3650                }
3651            }
3652        }
3653        Ok(Row::new(out_vals))
3654    }
3655
3656    /// v6.10.2 — derive the output `ColumnSchema` list for an
3657    /// AS OF SEGMENT projection. Wildcards take the full schema;
3658    /// expressions take the alias if present or a synthetic
3659    /// `?column?` (PG convention) otherwise.
3660    fn derive_output_columns(
3661        &self,
3662        items: &[SelectItem],
3663        schema_cols: &[ColumnSchema],
3664        _alias: &str,
3665    ) -> Vec<ColumnSchema> {
3666        let mut out = Vec::new();
3667        for item in items {
3668            match item {
3669                SelectItem::Wildcard => {
3670                    out.extend(schema_cols.iter().cloned());
3671                }
3672                SelectItem::Expr { alias, .. } => {
3673                    let name = alias
3674                        .clone()
3675                        .unwrap_or_else(|| "?column?".to_string());
3676                    // Default to Text; the caller's row values
3677                    // carry the actual type. v6.10.2 scope.
3678                    out.push(ColumnSchema::new(name, DataType::Text, true));
3679                }
3680            }
3681        }
3682        out
3683    }
3684
3685    fn exec_select_cancel(
3686        &self,
3687        stmt: &SelectStatement,
3688        cancel: CancelToken<'_>,
3689    ) -> Result<QueryResult, EngineError> {
3690        cancel.check()?;
3691        // v6.10.2 — cold-tier time-travel short-circuit. When the
3692        // primary TableRef carries `AS OF SEGMENT '<id>'`, run a
3693        // dedicated cold-segment scan instead of the regular
3694        // hot+index path. The scope is intentionally narrow for
3695        // v6.10.2 — bare `SELECT * FROM <t> AS OF SEGMENT 'id'`,
3696        // optionally with a single-column-equality WHERE. JOINs /
3697        // aggregates / ORDER BY / subqueries on top of a time-
3698        // travelled scan are STABILITY § "Out of v6.10".
3699        if let Some(from) = &stmt.from
3700            && let Some(seg_id) = from.primary.as_of_segment
3701        {
3702            return self.exec_select_as_of_segment(stmt, from, seg_id);
3703        }
3704        // v6.2.0 / v6.5.0 — virtual-table short-circuits. Detected
3705        // pre-CTE because they don't read from the catalog and
3706        // shouldn't participate in regular FROM resolution.
3707        if let Some(from) = &stmt.from
3708            && from.joins.is_empty()
3709            && stmt.where_.is_none()
3710            && stmt.group_by.is_none()
3711            && stmt.having.is_none()
3712            && stmt.unions.is_empty()
3713            && stmt.order_by.is_empty()
3714            && stmt.limit.is_none()
3715            && stmt.offset.is_none()
3716            && !stmt.distinct
3717            && stmt.items.iter().all(|i| matches!(i, SelectItem::Wildcard))
3718        {
3719            let lower = from.primary.name.to_ascii_lowercase();
3720            match lower.as_str() {
3721                "spg_statistic" => return Ok(self.exec_spg_statistic()),
3722                // v6.5.0 — observability v2 virtual tables.
3723                "spg_stat_replication" => return Ok(self.exec_spg_stat_replication()),
3724                "spg_stat_segment" => return Ok(self.exec_spg_stat_segment()),
3725                "spg_stat_query" => return Ok(self.exec_spg_stat_query()),
3726                "spg_stat_activity" => return Ok(self.exec_spg_stat_activity()),
3727                "spg_audit_chain" => return Ok(self.exec_spg_audit_chain()),
3728                "spg_audit_verify" => return Ok(self.exec_spg_audit_verify()),
3729                "spg_table_ddl" => return Ok(self.exec_spg_table_ddl()),
3730                "spg_role_ddl" => return Ok(self.exec_spg_role_ddl()),
3731                "spg_database_ddl" => return Ok(self.exec_spg_database_ddl()),
3732                _ => {}
3733            }
3734        }
3735        // v4.11: CTEs materialise into a temporary enriched catalog
3736        // *before* anything else — the body SELECT can then refer
3737        // to CTE names via the regular FROM-clause resolution.
3738        // Uncorrelated only: each CTE body runs once against the
3739        // current catalog, not against later CTEs' results (left-
3740        // to-right materialisation would relax this, but we keep
3741        // it simple for v4.11 MVP).
3742        if !stmt.ctes.is_empty() {
3743            return self.exec_with_ctes(stmt, cancel);
3744        }
3745        // v4.10: subqueries (uncorrelated) are resolved here, before
3746        // the executor sees the row loop. We clone the statement so
3747        // we can mutate without disturbing the caller's AST — most
3748        // queries pass through with no subquery nodes and the clone
3749        // is cheap; with subqueries the materialisation cost
3750        // dominates anyway.
3751        let mut stmt_owned;
3752        let stmt_ref: &SelectStatement = if expr_tree_has_subquery(stmt) {
3753            stmt_owned = stmt.clone();
3754            self.resolve_select_subqueries(&mut stmt_owned, cancel)?;
3755            &stmt_owned
3756        } else {
3757            stmt
3758        };
3759        if stmt_ref.unions.is_empty() {
3760            return self.exec_bare_select_cancel(stmt_ref, cancel);
3761        }
3762        // UNION path: clone-strip the head into a bare block (its own
3763        // DISTINCT and any inner ORDER BY are dropped by parser rule —
3764        // the wrapper SelectStatement carries them), execute, then chain
3765        // peers with left-associative dedup semantics.
3766        let mut head = stmt_ref.clone();
3767        head.unions = Vec::new();
3768        head.order_by = Vec::new();
3769        head.limit = None;
3770        let QueryResult::Rows { columns, mut rows } =
3771            self.exec_bare_select_cancel(&head, cancel)?
3772        else {
3773            unreachable!("bare SELECT cannot return CommandOk")
3774        };
3775        for (kind, peer) in &stmt_ref.unions {
3776            let QueryResult::Rows {
3777                columns: peer_cols,
3778                rows: peer_rows,
3779            } = self.exec_bare_select_cancel(peer, cancel)?
3780            else {
3781                unreachable!("bare SELECT cannot return CommandOk")
3782            };
3783            if peer_cols.len() != columns.len() {
3784                return Err(EngineError::Unsupported(alloc::format!(
3785                    "UNION arity mismatch: head has {} columns, peer has {}",
3786                    columns.len(),
3787                    peer_cols.len()
3788                )));
3789            }
3790            rows.extend(peer_rows);
3791            if matches!(kind, UnionKind::Distinct) {
3792                rows = dedup_rows(rows);
3793            }
3794        }
3795        // ORDER BY at the top of a UNION applies to the combined result.
3796        // Eval against the projected schema (NOT the source table).
3797        if !stmt.order_by.is_empty() {
3798            let synth_ctx = EvalContext::new(&columns, None);
3799            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3800            let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(rows.len());
3801            for r in rows {
3802                let keys = build_order_keys(&stmt.order_by, &r, &synth_ctx)?;
3803                tagged.push((keys, r));
3804            }
3805            sort_by_keys(&mut tagged, &descs);
3806            rows = tagged.into_iter().map(|(_, r)| r).collect();
3807        }
3808        apply_offset_and_limit(&mut rows, stmt.offset_literal(), stmt.limit_literal());
3809        Ok(QueryResult::Rows { columns, rows })
3810    }
3811
3812    #[allow(clippy::too_many_lines)]
3813    #[allow(clippy::too_many_lines)] // huge match — splitting fragments the planner
3814    fn exec_bare_select_cancel(
3815        &self,
3816        stmt: &SelectStatement,
3817        cancel: CancelToken<'_>,
3818    ) -> Result<QueryResult, EngineError> {
3819        // v4.12: window-function path. When the projection contains
3820        // any `name(args) OVER (...)` we route to the dedicated
3821        // executor — partition + sort + per-row window value before
3822        // the regular projection.
3823        if select_has_window(stmt) {
3824            return self.exec_select_with_window(stmt, cancel);
3825        }
3826        // Constant SELECT (no FROM) — evaluate each item once against an
3827        // empty dummy row. Useful for `SELECT 1`, `SELECT coalesce(...)`,
3828        // `SELECT '7'::INT`. Column references will surface as
3829        // ColumnNotFound on eval since the schema is empty.
3830        let Some(from) = &stmt.from else {
3831            let empty_schema: Vec<ColumnSchema> = Vec::new();
3832            let ctx = EvalContext::new(&empty_schema, None);
3833            let projection = build_projection(&stmt.items, &empty_schema, "")?;
3834            let dummy_row = Row::new(Vec::new());
3835            let mut values = Vec::with_capacity(projection.len());
3836            for p in &projection {
3837                values.push(eval::eval_expr(&p.expr, &dummy_row, &ctx)?);
3838            }
3839            let columns: Vec<ColumnSchema> = projection
3840                .into_iter()
3841                .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3842                .collect();
3843            return Ok(QueryResult::Rows {
3844                columns,
3845                rows: alloc::vec![Row::new(values)],
3846            });
3847        };
3848        // Multi-table FROM (one or more joined peers) goes through the
3849        // nested-loop join executor. Single-table FROM stays on the
3850        // existing scan + index-seek path.
3851        if !from.joins.is_empty() {
3852            return self.exec_joined_select(stmt, from);
3853        }
3854        let primary = &from.primary;
3855        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
3856            StorageError::TableNotFound {
3857                name: primary.name.clone(),
3858            }
3859        })?;
3860        let schema_cols = &table.schema().columns;
3861        // The qualifier accepted on column refs is the alias (if any) else the
3862        // bare table name.
3863        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
3864        let ctx = EvalContext::new(schema_cols, Some(alias));
3865
3866        // NSW kNN planner: `ORDER BY col <-> literal LIMIT k` with no
3867        // WHERE and an NSW index on `col` skips the full scan. The
3868        // walk returns rows already in ascending-distance order, so
3869        // ORDER BY / LIMIT are honoured implicitly.
3870        if let Some(nsw_rows) = try_nsw_knn(stmt, table, schema_cols, alias) {
3871            return materialise_in_order(stmt, table, schema_cols, alias, &nsw_rows);
3872        }
3873
3874        // Index seek: if WHERE is `col = literal` (or commuted) and the
3875        // referenced column has an index, dispatch each locator through
3876        // the catalog (hot tier → borrow, cold tier → page-read +
3877        // decode) and iterate just those rows. Otherwise fall back to a
3878        // full scan over the hot tier (cold-tier rows are only reached
3879        // via index seek in v5.1 — full table scans against cold-tier
3880        // data ship in v5.2 with the freezer's per-segment scan API).
3881        let indexed_rows: Option<Vec<Cow<'_, Row>>> = stmt
3882            .where_
3883            .as_ref()
3884            .and_then(|w| try_index_seek(w, schema_cols, self.active_catalog(), table, alias));
3885
3886        // Aggregate path: filter rows first, then hand off to the
3887        // aggregate executor which does its own projection + ORDER BY.
3888        if aggregate::uses_aggregate(stmt) {
3889            let mut filtered: Vec<&Row> = Vec::new();
3890            // v6.2.6 — Memoize: per-query LRU cache for correlated
3891            // scalar subqueries. Fresh per row-loop entry so each
3892            // SELECT execution gets an isolated cache.
3893            let mut memo = memoize::MemoizeCache::new();
3894            if let Some(rows) = &indexed_rows {
3895                for cow in rows {
3896                    let row = cow.as_ref();
3897                    if let Some(where_expr) = &stmt.where_ {
3898                        let cond = self.eval_expr_with_correlated(
3899                            where_expr,
3900                            row,
3901                            &ctx,
3902                            cancel,
3903                            Some(&mut memo),
3904                        )?;
3905                        if !matches!(cond, Value::Bool(true)) {
3906                            continue;
3907                        }
3908                    }
3909                    filtered.push(row);
3910                }
3911            } else {
3912                for i in 0..table.row_count() {
3913                    let row = &table.rows()[i];
3914                    if let Some(where_expr) = &stmt.where_ {
3915                        let cond = self.eval_expr_with_correlated(
3916                            where_expr,
3917                            row,
3918                            &ctx,
3919                            cancel,
3920                            Some(&mut memo),
3921                        )?;
3922                        if !matches!(cond, Value::Bool(true)) {
3923                            continue;
3924                        }
3925                    }
3926                    filtered.push(row);
3927                }
3928            }
3929            let mut agg = aggregate::run(stmt, &filtered, schema_cols, Some(alias))?;
3930            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
3931            return Ok(QueryResult::Rows {
3932                columns: agg.columns,
3933                rows: agg.rows,
3934            });
3935        }
3936
3937        let projection = build_projection(&stmt.items, schema_cols, alias)?;
3938
3939        // Materialise the filter pass into `(order_key, projected_row)`
3940        // tuples. The order key is `None` when there's no ORDER BY clause.
3941        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
3942        // v6.2.6 — Memoize per-row WHERE eval shares one cache.
3943        let mut memo = memoize::MemoizeCache::new();
3944        // Inline the per-row work in a closure so the indexed and full-
3945        // scan branches share the body.
3946        let mut process_row = |row: &Row, loop_idx: usize| -> Result<(), EngineError> {
3947            if loop_idx.is_multiple_of(256) {
3948                cancel.check()?;
3949            }
3950            if let Some(where_expr) = &stmt.where_ {
3951                let cond = self.eval_expr_with_correlated(
3952                    where_expr,
3953                    row,
3954                    &ctx,
3955                    cancel,
3956                    Some(&mut memo),
3957                )?;
3958                if !matches!(cond, Value::Bool(true)) {
3959                    return Ok(());
3960                }
3961            }
3962            let mut values = Vec::with_capacity(projection.len());
3963            for p in &projection {
3964                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
3965            }
3966            let order_keys = if stmt.order_by.is_empty() {
3967                Vec::new()
3968            } else {
3969                build_order_keys(&stmt.order_by, row, &ctx)?
3970            };
3971            tagged.push((order_keys, Row::new(values)));
3972            Ok(())
3973        };
3974        if let Some(rows) = &indexed_rows {
3975            for (loop_idx, cow) in rows.iter().enumerate() {
3976                process_row(cow.as_ref(), loop_idx)?;
3977            }
3978        } else {
3979            for i in 0..table.row_count() {
3980                process_row(&table.rows()[i], i)?;
3981            }
3982        }
3983
3984        if !stmt.order_by.is_empty() {
3985            // Partial-sort fast path: when LIMIT is small relative to
3986            // the row count, select_nth_unstable + sort just the
3987            // prefix is O(n + k log k) instead of O(n log n). DISTINCT
3988            // requires the full sort because de-dup happens after.
3989            let keep = if stmt.distinct {
3990                None
3991            } else {
3992                stmt.limit_literal()
3993                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
3994            };
3995            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3996            partial_sort_tagged(&mut tagged, keep, &descs);
3997        }
3998
3999        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4000        if stmt.distinct {
4001            output_rows = dedup_rows(output_rows);
4002        }
4003        apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4004
4005        let columns: Vec<ColumnSchema> = projection
4006            .into_iter()
4007            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4008            .collect();
4009
4010        Ok(QueryResult::Rows {
4011            columns,
4012            rows: output_rows,
4013        })
4014    }
4015
4016    /// Multi-table SELECT executor (one or more JOIN peers).
4017    ///
4018    /// v1.10 builds the joined row set up-front via nested-loop joins,
4019    /// then runs WHERE + projection + ORDER BY against the combined
4020    /// rows. No index seek. Aggregates and DISTINCT still work because
4021    /// the executor delegates projection through the same shared paths.
4022    #[allow(clippy::too_many_lines)]
4023    fn exec_joined_select(
4024        &self,
4025        stmt: &SelectStatement,
4026        from: &FromClause,
4027    ) -> Result<QueryResult, EngineError> {
4028        // Resolve every table reference up front so we surface
4029        // TableNotFound before we start the cartesian work.
4030        let primary_table = self
4031            .active_catalog()
4032            .get(&from.primary.name)
4033            .ok_or_else(|| StorageError::TableNotFound {
4034                name: from.primary.name.clone(),
4035            })?;
4036        let primary_alias = from
4037            .primary
4038            .alias
4039            .as_deref()
4040            .unwrap_or(from.primary.name.as_str())
4041            .to_string();
4042        let mut joined_tables: Vec<(&Table, String, JoinKind, Option<&Expr>)> = Vec::new();
4043        for j in &from.joins {
4044            let t = self.active_catalog().get(&j.table.name).ok_or_else(|| {
4045                StorageError::TableNotFound {
4046                    name: j.table.name.clone(),
4047                }
4048            })?;
4049            let a = j
4050                .table
4051                .alias
4052                .as_deref()
4053                .unwrap_or(j.table.name.as_str())
4054                .to_string();
4055            joined_tables.push((t, a, j.kind, j.on.as_ref()));
4056        }
4057
4058        // Build the combined schema: composite "alias.col" names so the
4059        // qualified-column resolver can find anything by exact match.
4060        let mut combined_schema: Vec<ColumnSchema> = Vec::new();
4061        for col in &primary_table.schema().columns {
4062            combined_schema.push(ColumnSchema::new(
4063                alloc::format!("{primary_alias}.{}", col.name),
4064                col.ty,
4065                col.nullable,
4066            ));
4067        }
4068        for (t, a, _, _) in &joined_tables {
4069            for col in &t.schema().columns {
4070                combined_schema.push(ColumnSchema::new(
4071                    alloc::format!("{a}.{}", col.name),
4072                    col.ty,
4073                    col.nullable,
4074                ));
4075            }
4076        }
4077        let ctx = EvalContext::new(&combined_schema, None);
4078
4079        // Nested-loop join. Starting set: every primary row, padded with
4080        // (no joined columns yet).
4081        let mut working: Vec<Row> = primary_table.rows().iter().cloned().collect();
4082        let mut produced_len = primary_table.schema().columns.len();
4083        for (t, _, kind, on) in &joined_tables {
4084            let right_arity = t.schema().columns.len();
4085            let mut next: Vec<Row> = Vec::new();
4086            for left in &working {
4087                let mut left_matched = false;
4088                for right in t.rows() {
4089                    let mut combined_vals = left.values.clone();
4090                    combined_vals.extend(right.values.iter().cloned());
4091                    // Pad combined to the eventual full width so the
4092                    // partial schema still matches positions used by ON.
4093                    let combined = Row::new(combined_vals);
4094                    let keep = if let Some(on_expr) = on {
4095                        let cond = eval::eval_expr(on_expr, &combined, &ctx)?;
4096                        matches!(cond, Value::Bool(true))
4097                    } else {
4098                        // CROSS / comma-list: every pair survives.
4099                        true
4100                    };
4101                    if keep {
4102                        next.push(combined);
4103                        left_matched = true;
4104                    }
4105                }
4106                if !left_matched && matches!(kind, JoinKind::Left) {
4107                    // LEFT OUTER JOIN: emit the left row with NULLs on
4108                    // the right side when no peer matched.
4109                    let mut combined_vals = left.values.clone();
4110                    for _ in 0..right_arity {
4111                        combined_vals.push(Value::Null);
4112                    }
4113                    next.push(Row::new(combined_vals));
4114                }
4115            }
4116            working = next;
4117            produced_len += right_arity;
4118            debug_assert!(produced_len <= combined_schema.len());
4119        }
4120
4121        // WHERE filter against combined rows.
4122        let mut filtered: Vec<Row> = Vec::new();
4123        for row in working {
4124            if let Some(where_expr) = &stmt.where_ {
4125                let cond = eval::eval_expr(where_expr, &row, &ctx)?;
4126                if !matches!(cond, Value::Bool(true)) {
4127                    continue;
4128                }
4129            }
4130            filtered.push(row);
4131        }
4132
4133        // Aggregate path: handle GROUP BY / aggregate calls over the
4134        // joined+filtered rows.
4135        if aggregate::uses_aggregate(stmt) {
4136            let refs: Vec<&Row> = filtered.iter().collect();
4137            let mut agg = aggregate::run(stmt, &refs, &combined_schema, None)?;
4138            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
4139            return Ok(QueryResult::Rows {
4140                columns: agg.columns,
4141                rows: agg.rows,
4142            });
4143        }
4144
4145        let projection = build_projection(&stmt.items, &combined_schema, "")?;
4146        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
4147        for row in &filtered {
4148            let mut values = Vec::with_capacity(projection.len());
4149            for p in &projection {
4150                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4151            }
4152            let order_keys = if stmt.order_by.is_empty() {
4153                Vec::new()
4154            } else {
4155                build_order_keys(&stmt.order_by, row, &ctx)?
4156            };
4157            tagged.push((order_keys, Row::new(values)));
4158        }
4159        if !stmt.order_by.is_empty() {
4160            let keep = if stmt.distinct {
4161                None
4162            } else {
4163                stmt.limit_literal()
4164                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
4165            };
4166            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4167            partial_sort_tagged(&mut tagged, keep, &descs);
4168        }
4169        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4170        if stmt.distinct {
4171            output_rows = dedup_rows(output_rows);
4172        }
4173        apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4174        let columns: Vec<ColumnSchema> = projection
4175            .into_iter()
4176            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4177            .collect();
4178        Ok(QueryResult::Rows {
4179            columns,
4180            rows: output_rows,
4181        })
4182    }
4183}
4184
/// One row-producing projection: an expression to evaluate, the resulting
/// column's user-visible name, its inferred type, and nullability.
///
/// Instances are produced by `build_projection`. Executors evaluate
/// `expr` against each input row and feed the remaining three fields to
/// `ColumnSchema::new` to describe the output column.
#[derive(Debug, Clone)]
struct ProjectedItem {
    /// Expression evaluated once per input row to produce the output cell.
    expr: Expr,
    /// Name reported for the output column (the alias when one was given).
    output_name: String,
    /// Type reported for the output column. `build_projection` copies the
    /// schema type for plain column references and uses `DataType::Text`
    /// for every other expression.
    ty: DataType,
    /// Whether the output column is reported as nullable.
    nullable: bool,
}
4194
4195/// Dedupe a row set, preserving first-seen order. `Row`'s `PartialEq` is
4196/// structural (`Vec<Value>` ⇒ pairwise `Value` equality), which gives SQL
4197/// `NULL = NULL → TRUE` and `NaN = NaN → FALSE`. The first agrees with
4198/// the spec's "two NULLs are not distinct"; the second is a tolerated
4199/// quirk for v1 (no NaN literals are reachable from the SQL surface).
4200fn dedup_rows(rows: Vec<Row>) -> Vec<Row> {
4201    let mut out: Vec<Row> = Vec::with_capacity(rows.len());
4202    for r in rows {
4203        if !out.iter().any(|seen| seen == &r) {
4204            out.push(r);
4205        }
4206    }
4207    out
4208}
4209
4210/// Coerce a `Value` to an `f64` sort key for ORDER BY. Numbers map directly;
4211/// NULL sorts last (treated as `+∞`); booleans are 0.0 / 1.0; text uses lex
4212/// order via the byte values; vectors are not sortable.
4213fn value_to_order_key(v: &Value) -> Result<f64, EngineError> {
4214    match v {
4215        Value::Null => Ok(f64::INFINITY),
4216        Value::SmallInt(n) => Ok(f64::from(*n)),
4217        Value::Int(n) => Ok(f64::from(*n)),
4218        Value::Date(d) => Ok(f64::from(*d)),
4219        #[allow(clippy::cast_precision_loss)]
4220        Value::Timestamp(t) => Ok(*t as f64),
4221        #[allow(clippy::cast_precision_loss)]
4222        Value::Numeric { scaled, scale } => {
4223            // Scaled integer / 10^scale, computed via f64 for sort
4224            // ordering only. Precision losses here only matter for
4225            // ORDER BY tie-breaks well past 15 significant digits.
4226            // `f64::powi` lives in std; we hand-roll the loop so the
4227            // no_std engine crate doesn't need it.
4228            let mut divisor = 1.0_f64;
4229            for _ in 0..*scale {
4230                divisor *= 10.0;
4231            }
4232            Ok((*scaled as f64) / divisor)
4233        }
4234        #[allow(clippy::cast_precision_loss)]
4235        Value::BigInt(n) => Ok(*n as f64),
4236        Value::Float(x) => Ok(*x),
4237        Value::Bool(b) => Ok(if *b { 1.0 } else { 0.0 }),
4238        Value::Text(s) => {
4239            // Lex order by codepoints — good enough for ORDER BY name.
4240            // Map first 8 bytes packed into u64 as a coarse key; ties fall to
4241            // partial_cmp Equal. v1.x can swap in a real string comparator.
4242            let mut key: u64 = 0;
4243            for &b in s.as_bytes().iter().take(8) {
4244                key = (key << 8) | u64::from(b);
4245            }
4246            #[allow(clippy::cast_precision_loss)]
4247            Ok(key as f64)
4248        }
4249        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
4250            Err(EngineError::Unsupported(
4251                "ORDER BY of a raw vector column is not meaningful — use `<->`".into(),
4252            ))
4253        }
4254        Value::Interval { .. } => Err(EngineError::Unsupported(
4255            "ORDER BY of an INTERVAL is not supported in v2.11 \
4256             (months vs micros has no single canonical ordering)"
4257                .into(),
4258        )),
4259        Value::Json(_) => Err(EngineError::Unsupported(
4260            "ORDER BY of a JSON value is not supported — cast the document to text first".into(),
4261        )),
4262        // v7.5.0 — Value is #[non_exhaustive]; future variants need
4263        // an explicit ORDER BY mapping. Surface as Unsupported until
4264        // engine support is added.
4265        _ => Err(EngineError::Unsupported(
4266            "ORDER BY of this value type is not supported".into(),
4267        )),
4268    }
4269}
4270
4271/// Try to plan a WHERE clause as an equality lookup against an existing
4272/// index. Returns the candidate row indices on success; `None` means the
4273/// caller should fall back to a full scan.
4274///
4275/// v0.8 recognises a single top-level `col = literal` (in either operand
4276/// order). AND chains and range scans land in later milestones.
4277/// Look for `ORDER BY col <dist-op> literal LIMIT k` against an
4278/// NSW-indexed vector column. Recognised distance ops: `<->` (L2),
4279/// `<#>` (inner product), `<=>` (cosine). When a WHERE clause is
4280/// present, the planner does an "over-fetch and filter" pass — it
4281/// asks the graph for `k * over_fetch` candidates, evaluates WHERE
4282/// against each, and trims back to `k`. Returns the row indices in
4283/// ascending-distance order when the plan applies.
4284fn try_nsw_knn(
4285    stmt: &SelectStatement,
4286    table: &Table,
4287    schema_cols: &[ColumnSchema],
4288    table_alias: &str,
4289) -> Option<Vec<usize>> {
4290    if stmt.distinct {
4291        return None;
4292    }
4293    let limit = usize::try_from(stmt.limit_literal()?).ok()?;
4294    if limit == 0 {
4295        return None;
4296    }
4297    // v6.4.0 — NSW kNN dispatch needs a single ORDER BY key on the
4298    // distance metric. Multi-key ORDER BY falls through to the
4299    // generic sort path.
4300    if stmt.order_by.len() != 1 {
4301        return None;
4302    }
4303    let order = &stmt.order_by[0];
4304    // NSW kNN returns rows ascending by distance — DESC inverts the
4305    // natural order, so the planner can't handle it without a sort
4306    // pass. Fall back to the generic ORDER BY path.
4307    if order.desc {
4308        return None;
4309    }
4310    let Expr::Binary { lhs, op, rhs } = &order.expr else {
4311        return None;
4312    };
4313    let metric = match op {
4314        BinOp::L2Distance => spg_storage::NswMetric::L2,
4315        BinOp::InnerProduct => spg_storage::NswMetric::InnerProduct,
4316        BinOp::CosineDistance => spg_storage::NswMetric::Cosine,
4317        _ => return None,
4318    };
4319    // Accept both `col <op> literal` and `literal <op> col`.
4320    let ((Expr::Column(col), literal) | (literal, Expr::Column(col))) =
4321        (lhs.as_ref(), rhs.as_ref())
4322    else {
4323        return None;
4324    };
4325    if let Some(q) = &col.qualifier
4326        && q != table_alias
4327    {
4328        return None;
4329    }
4330    let col_pos = schema_cols.iter().position(|s| s.name == col.name)?;
4331    let query = literal_to_vector(literal)?;
4332    let idx = spg_storage::nsw_index_on(table, col_pos)?;
4333    if let Some(where_expr) = &stmt.where_ {
4334        // Over-fetch and filter. The factor (10×) is a heuristic that
4335        // covers typical selectivity for the corpus tests; v2.x will
4336        // make it configurable.
4337        let over_fetch = limit.saturating_mul(10).max(NSW_OVER_FETCH_FLOOR);
4338        let candidates = spg_storage::nsw_query(table, &idx.name, &query, over_fetch, metric);
4339        let ctx = EvalContext::new(schema_cols, Some(table_alias));
4340        let mut kept: Vec<usize> = Vec::with_capacity(limit);
4341        for i in candidates {
4342            let row = &table.rows()[i];
4343            let cond = eval::eval_expr(where_expr, row, &ctx).ok()?;
4344            if matches!(cond, Value::Bool(true)) {
4345                kept.push(i);
4346                if kept.len() >= limit {
4347                    break;
4348                }
4349            }
4350        }
4351        Some(kept)
4352    } else {
4353        Some(spg_storage::nsw_query(
4354            table, &idx.name, &query, limit, metric,
4355        ))
4356    }
4357}
4358
/// Lower bound on the over-fetch pool when WHERE is present — even
/// for tiny `LIMIT 1` queries we keep enough candidates to absorb a
/// few WHERE rejections. `try_nsw_knn` raises its 10× over-fetch count
/// to at least this value before querying the NSW index.
const NSW_OVER_FETCH_FLOOR: usize = 32;
4363
4364/// Pull a `Vec<f32>` out of a literal-or-cast expression. Returns
4365/// `None` for anything we can't fold at plan time.
4366fn literal_to_vector(e: &Expr) -> Option<Vec<f32>> {
4367    match e {
4368        Expr::Literal(Literal::Vector(v)) => Some(v.clone()),
4369        Expr::Cast { expr, .. } => literal_to_vector(expr),
4370        _ => None,
4371    }
4372}
4373
4374/// Materialise rows in a planner-supplied order (used by the NSW path)
4375/// without re-running ORDER BY. The projection + LIMIT slot mirror the
4376/// equivalent block in `exec_bare_select`.
4377fn materialise_in_order(
4378    stmt: &SelectStatement,
4379    table: &Table,
4380    schema_cols: &[ColumnSchema],
4381    table_alias: &str,
4382    ordered_rows: &[usize],
4383) -> Result<QueryResult, EngineError> {
4384    let ctx = EvalContext::new(schema_cols, Some(table_alias));
4385    let projection = build_projection(&stmt.items, schema_cols, table_alias)?;
4386    let mut output_rows: Vec<Row> = Vec::with_capacity(ordered_rows.len());
4387    for &i in ordered_rows {
4388        let row = &table.rows()[i];
4389        let mut values = Vec::with_capacity(projection.len());
4390        for p in &projection {
4391            values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4392        }
4393        output_rows.push(Row::new(values));
4394    }
4395    apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4396    let columns: Vec<ColumnSchema> = projection
4397        .into_iter()
4398        .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4399        .collect();
4400    Ok(QueryResult::Rows {
4401        columns,
4402        rows: output_rows,
4403    })
4404}
4405
4406fn try_index_seek<'a>(
4407    where_expr: &Expr,
4408    schema_cols: &[ColumnSchema],
4409    catalog: &'a Catalog,
4410    table: &'a Table,
4411    table_alias: &str,
4412) -> Option<Vec<Cow<'a, Row>>> {
4413    let Expr::Binary {
4414        lhs,
4415        op: BinOp::Eq,
4416        rhs,
4417    } = where_expr
4418    else {
4419        return None;
4420    };
4421    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4422        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4423    let idx = table.index_on(col_pos)?;
4424    let key = IndexKey::from_value(&value)?;
4425    let locators = idx.lookup_eq(&key);
4426    let table_name = table.schema().name.as_str();
4427    // v5.1: each locator dispatches to either the hot tier (zero-
4428    // copy borrow of `table.rows()[i]`) or a cold-tier segment
4429    // (one page read + dense row decode, ~µs scale). Cold rows are
4430    // returned as `Cow::Owned` so the caller's `&Row` iteration
4431    // doesn't see a tier distinction; pre-freezer (no cold
4432    // segments loaded) every locator is `Hot` and every entry is
4433    // `Cow::Borrowed` — identical cost to the pre-v5.1 path.
4434    let mut out: Vec<Cow<'a, Row>> = Vec::with_capacity(locators.len());
4435    for loc in locators {
4436        match *loc {
4437            spg_storage::RowLocator::Hot(i) => {
4438                if let Some(row) = table.rows().get(i) {
4439                    out.push(Cow::Borrowed(row));
4440                }
4441            }
4442            spg_storage::RowLocator::Cold { segment_id, .. } => {
4443                if let Some(row) = catalog.resolve_cold_locator(table_name, segment_id, &key) {
4444                    out.push(Cow::Owned(row));
4445                }
4446            }
4447        }
4448    }
4449    Some(out)
4450}
4451
4452/// v5.2.3: extract `(column_position, IndexKey)` when `where_expr`
4453/// is a simple `col = literal` predicate suitable for a `BTree` index
4454/// seek. Used by `exec_update_cancel` / `exec_delete_cancel` to
4455/// decide whether a write touches a cold-tier row (which requires
4456/// promote-on-write / shadow-on-delete) before falling through to
4457/// the hot-tier row walk.
4458///
4459/// Returns `None` for any predicate shape the planner can't push
4460/// down to an index seek — complex WHERE clauses always take the
4461/// hot-only path (cold rows are immutable to non-indexed writes
4462/// until a future scan-fanout sub-version).
4463fn try_pk_predicate(
4464    where_expr: &Expr,
4465    schema_cols: &[ColumnSchema],
4466    table_alias: &str,
4467) -> Option<(usize, IndexKey)> {
4468    let Expr::Binary {
4469        lhs,
4470        op: BinOp::Eq,
4471        rhs,
4472    } = where_expr
4473    else {
4474        return None;
4475    };
4476    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4477        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4478    let key = IndexKey::from_value(&value)?;
4479    Some((col_pos, key))
4480}
4481
4482fn resolve_col_literal_pair(
4483    col_side: &Expr,
4484    lit_side: &Expr,
4485    schema_cols: &[ColumnSchema],
4486    table_alias: &str,
4487) -> Option<(usize, Value)> {
4488    let Expr::Column(c) = col_side else {
4489        return None;
4490    };
4491    if let Some(q) = &c.qualifier
4492        && q != table_alias
4493    {
4494        return None;
4495    }
4496    let pos = schema_cols.iter().position(|s| s.name == c.name)?;
4497    let Expr::Literal(l) = lit_side else {
4498        return None;
4499    };
4500    let v = match l {
4501        Literal::Integer(n) => {
4502            if let Ok(small) = i32::try_from(*n) {
4503                Value::Int(small)
4504            } else {
4505                Value::BigInt(*n)
4506            }
4507        }
4508        Literal::Float(x) => Value::Float(*x),
4509        Literal::String(s) => Value::Text(s.clone()),
4510        Literal::Bool(b) => Value::Bool(*b),
4511        Literal::Null => Value::Null,
4512        // Vector and Interval literals can't be used as B-tree index keys.
4513        // Tell the planner to fall back to full-scan.
4514        Literal::Vector(_) | Literal::Interval { .. } => return None,
4515    };
4516    Some((pos, v))
4517}
4518
4519/// Find the schema entry that a SELECT-list `Expr::Column` refers to.
4520/// Mirrors `resolve_column` in `eval.rs`, but returns a proper
4521/// `EngineError` so the projection-build path keeps `UnknownQualifier`
4522/// vs `ColumnNotFound` distinct.
4523fn resolve_projection_column<'a>(
4524    c: &ColumnName,
4525    schema_cols: &'a [ColumnSchema],
4526    table_alias: &str,
4527) -> Result<&'a ColumnSchema, EngineError> {
4528    if let Some(q) = &c.qualifier {
4529        let composite = alloc::format!("{q}.{name}", name = c.name);
4530        if let Some(s) = schema_cols.iter().find(|s| s.name == composite) {
4531            return Ok(s);
4532        }
4533        // Single-table case: the qualifier may equal the active alias —
4534        // then look for the bare column name.
4535        if q == table_alias
4536            && let Some(s) = schema_cols.iter().find(|s| s.name == c.name)
4537        {
4538            return Ok(s);
4539        }
4540        // For multi-table schemas the qualifier is unknown only if no
4541        // column bears the "<q>." prefix. For single-table, the alias
4542        // mismatch alone is enough.
4543        let prefix = alloc::format!("{q}.");
4544        let qualifier_known =
4545            q == table_alias || schema_cols.iter().any(|s| s.name.starts_with(&prefix));
4546        if !qualifier_known {
4547            return Err(EngineError::Eval(EvalError::UnknownQualifier {
4548                qualifier: q.clone(),
4549            }));
4550        }
4551        return Err(EngineError::Eval(EvalError::ColumnNotFound {
4552            name: c.name.clone(),
4553        }));
4554    }
4555    if let Some(s) = schema_cols.iter().find(|s| s.name == c.name) {
4556        return Ok(s);
4557    }
4558    let suffix = alloc::format!(".{name}", name = c.name);
4559    let mut matches = schema_cols.iter().filter(|s| s.name.ends_with(&suffix));
4560    let first = matches.next();
4561    let extra = matches.next();
4562    match (first, extra) {
4563        (Some(s), None) => Ok(s),
4564        (Some(_), Some(_)) => Err(EngineError::Eval(EvalError::TypeMismatch {
4565            detail: alloc::format!("ambiguous column reference: {}", c.name),
4566        })),
4567        _ => Err(EngineError::Eval(EvalError::ColumnNotFound {
4568            name: c.name.clone(),
4569        })),
4570    }
4571}
4572
4573fn build_projection(
4574    items: &[SelectItem],
4575    schema_cols: &[ColumnSchema],
4576    table_alias: &str,
4577) -> Result<Vec<ProjectedItem>, EngineError> {
4578    let mut out = Vec::new();
4579    for item in items {
4580        match item {
4581            SelectItem::Wildcard => {
4582                for col in schema_cols {
4583                    out.push(ProjectedItem {
4584                        expr: Expr::Column(ColumnName {
4585                            qualifier: None,
4586                            name: col.name.clone(),
4587                        }),
4588                        output_name: col.name.clone(),
4589                        ty: col.ty,
4590                        nullable: col.nullable,
4591                    });
4592                }
4593            }
4594            SelectItem::Expr { expr, alias } => {
4595                // Plain column ref keeps full schema info (real type +
4596                // nullability). Compound expressions evaluate fine but have
4597                // no static type — surface them as nullable TEXT, which is
4598                // what most clients render anyway.
4599                if let Expr::Column(c) = expr {
4600                    let sch = resolve_projection_column(c, schema_cols, table_alias)?;
4601                    let output_name = alias.clone().unwrap_or_else(|| c.name.clone());
4602                    out.push(ProjectedItem {
4603                        expr: expr.clone(),
4604                        output_name,
4605                        ty: sch.ty,
4606                        nullable: sch.nullable,
4607                    });
4608                } else {
4609                    let output_name = alias.clone().unwrap_or_else(|| expr.to_string());
4610                    out.push(ProjectedItem {
4611                        expr: expr.clone(),
4612                        output_name,
4613                        ty: DataType::Text,
4614                        nullable: true,
4615                    });
4616                }
4617            }
4618        }
4619    }
4620    Ok(out)
4621}
4622
4623/// Promote an integer to a NUMERIC value at the requested scale.
4624/// Rejects values that, after scaling, would overflow the column's
4625/// precision budget.
4626fn numeric_from_integer(
4627    n: i128,
4628    precision: u8,
4629    scale: u8,
4630    col_name: &str,
4631) -> Result<Value, EngineError> {
4632    let factor = pow10_i128(scale);
4633    let scaled = n.checked_mul(factor).ok_or_else(|| {
4634        EngineError::Unsupported(alloc::format!(
4635            "integer overflow scaling value for column `{col_name}` to scale {scale}"
4636        ))
4637    })?;
4638    check_precision(scaled, precision, col_name)?;
4639    Ok(Value::Numeric { scaled, scale })
4640}
4641
4642/// Float → NUMERIC. Uses round-half-away-from-zero on `x * 10^scale`,
4643/// then verifies the result fits the column's precision.
4644#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
4645fn numeric_from_float(
4646    x: f64,
4647    precision: u8,
4648    scale: u8,
4649    col_name: &str,
4650) -> Result<Value, EngineError> {
4651    if !x.is_finite() {
4652        return Err(EngineError::Unsupported(alloc::format!(
4653            "cannot store non-finite float in NUMERIC column `{col_name}`"
4654        )));
4655    }
4656    let mut factor = 1.0_f64;
4657    for _ in 0..scale {
4658        factor *= 10.0;
4659    }
4660    // Round half-away-from-zero by biasing then casting (`as i128`
4661    // truncates toward zero, so the bias + truncation gives the
4662    // desired rounding). `f64::floor` / `ceil` live in std; we don't
4663    // need them — the cast handles the truncation step.
4664    let shifted = x * factor;
4665    let biased = if shifted >= 0.0 {
4666        shifted + 0.5
4667    } else {
4668        shifted - 0.5
4669    };
4670    // Range-check before casting back to i128 — the cast itself is
4671    // saturating in Rust, which would silently truncate huge inputs.
4672    if !(-1e38..=1e38).contains(&biased) {
4673        return Err(EngineError::Unsupported(alloc::format!(
4674            "value {x} overflows NUMERIC range for column `{col_name}`"
4675        )));
4676    }
4677    let scaled = biased as i128;
4678    check_precision(scaled, precision, col_name)?;
4679    Ok(Value::Numeric { scaled, scale })
4680}
4681
4682/// Move a Numeric value from `src_scale` to `dst_scale`. Going up
4683/// multiplies by 10; going down rounds half-away-from-zero.
4684fn numeric_rescale(
4685    scaled: i128,
4686    src_scale: u8,
4687    precision: u8,
4688    dst_scale: u8,
4689    col_name: &str,
4690) -> Result<Value, EngineError> {
4691    let new_scaled = if dst_scale >= src_scale {
4692        let bump = pow10_i128(dst_scale - src_scale);
4693        scaled.checked_mul(bump).ok_or_else(|| {
4694            EngineError::Unsupported(alloc::format!(
4695                "overflow rescaling NUMERIC for column `{col_name}`"
4696            ))
4697        })?
4698    } else {
4699        let drop = pow10_i128(src_scale - dst_scale);
4700        let half = drop / 2;
4701        if scaled >= 0 {
4702            (scaled + half) / drop
4703        } else {
4704            (scaled - half) / drop
4705        }
4706    };
4707    check_precision(new_scaled, precision, col_name)?;
4708    Ok(Value::Numeric {
4709        scaled: new_scaled,
4710        scale: dst_scale,
4711    })
4712}
4713
4714/// Drop the fractional part of a scaled integer, returning the integer
4715/// portion (toward zero). Used for NUMERIC → INT casts.
4716const fn numeric_truncate_to_integer(scaled: i128, scale: u8) -> i128 {
4717    if scale == 0 {
4718        return scaled;
4719    }
4720    let factor = pow10_i128_const(scale);
4721    scaled / factor
4722}
4723
4724/// Verify a scaled NUMERIC value fits the column's declared precision.
4725/// `precision == 0` is the "unconstrained" form (bare `NUMERIC`); we
4726/// skip the check there.
4727fn check_precision(scaled: i128, precision: u8, col_name: &str) -> Result<(), EngineError> {
4728    if precision == 0 {
4729        return Ok(());
4730    }
4731    let limit = pow10_i128(precision);
4732    if scaled.unsigned_abs() >= limit.unsigned_abs() {
4733        return Err(EngineError::Unsupported(alloc::format!(
4734            "NUMERIC value exceeds precision {precision} for column `{col_name}`"
4735        )));
4736    }
4737    Ok(())
4738}
4739
/// `10^p` as an `i128`, usable in const contexts.
///
/// The largest power of ten an `i128` can hold is `10^38`. For larger
/// `p` the result saturates at `i128::MAX` instead of overflowing: a
/// plain multiply would panic in debug builds and wrap in release
/// builds, and from `p = 128` on the wrapped value is 0, which callers
/// that divide by the result would then trip over.
const fn pow10_i128_const(p: u8) -> i128 {
    let mut acc: i128 = 1;
    let mut i = 0;
    while i < p {
        acc = acc.saturating_mul(10);
        i += 1;
    }
    acc
}

/// Non-const entry point for [`pow10_i128_const`]; same result, same
/// saturation at `i128::MAX` for `p > 38`.
fn pow10_i128(p: u8) -> i128 {
    pow10_i128_const(p)
}
4753
4754/// Walk a parsed `Statement`, swapping any `NOW()` /
4755/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()` function calls for a
4756/// literal cast that wraps the engine's per-statement clock reading.
4757/// When `now_micros` is `None`, calls stay as-is and surface as
4758/// `unknown function` at eval time — keeps the error path explicit.
4759/// v4.10: pre-walk the WHERE / projection / etc. of a SELECT and
4760/// replace every subquery node with a materialised literal. SPG
4761/// only supports uncorrelated subqueries — the inner SELECT does
4762/// not see outer-row columns, so the result is the same for every
4763/// outer row and can be evaluated once.
4764///
4765/// Returns the rewritten statement; the caller passes this to the
4766/// regular row-loop executor which no longer sees Subquery nodes
4767/// in its tree.
4768impl Engine {
4769    /// v4.12 window executor. Implements `ROW_NUMBER` / `RANK` /
4770    /// `DENSE_RANK` and the partition-aware aggregates `SUM` /
4771    /// `AVG` / `COUNT` / `MIN` / `MAX`. The plan is:
4772    /// 1. Apply the WHERE filter.
4773    /// 2. For each unique `WindowFunction` node in the projection,
4774    ///    partition + sort, compute the per-row value.
4775    /// 3. Append the window values as synthetic columns (`__win_N`)
4776    ///    to the row schema.
4777    /// 4. Rewrite the projection to read those columns.
4778    /// 5. Hand off to the regular project / ORDER BY / LIMIT pipe.
4779    #[allow(
4780        clippy::too_many_lines,
4781        clippy::type_complexity,
4782        clippy::needless_range_loop
4783    )] // window-eval is one cohesive pipe; splitting fragments
4784    fn exec_select_with_window(
4785        &self,
4786        stmt: &SelectStatement,
4787        cancel: CancelToken<'_>,
4788    ) -> Result<QueryResult, EngineError> {
4789        let from = stmt.from.as_ref().ok_or_else(|| {
4790            EngineError::Unsupported("window functions require a FROM clause".into())
4791        })?;
4792        // For v4.12 we only support a single-table FROM. Joins +
4793        // windows is queued for v5.x.
4794        if !from.joins.is_empty() {
4795            return Err(EngineError::Unsupported(
4796                "JOIN with window functions not yet supported".into(),
4797            ));
4798        }
4799        let primary = &from.primary;
4800        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
4801            StorageError::TableNotFound {
4802                name: primary.name.clone(),
4803            }
4804        })?;
4805        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
4806        let schema_cols = &table.schema().columns;
4807        let ctx = EvalContext::new(schema_cols, Some(alias));
4808
4809        // 1) Filter pass.
4810        let mut filtered: Vec<&Row> = Vec::new();
4811        for (i, row) in table.rows().iter().enumerate() {
4812            if i.is_multiple_of(256) {
4813                cancel.check()?;
4814            }
4815            if let Some(w) = &stmt.where_ {
4816                let cond = eval::eval_expr(w, row, &ctx)?;
4817                if !matches!(cond, Value::Bool(true)) {
4818                    continue;
4819                }
4820            }
4821            filtered.push(row);
4822        }
4823        let n_rows = filtered.len();
4824
4825        // 2) Collect unique window function nodes from projection.
4826        let mut window_nodes: Vec<Expr> = Vec::new();
4827        for item in &stmt.items {
4828            if let SelectItem::Expr { expr, .. } = item {
4829                collect_window_nodes(expr, &mut window_nodes);
4830            }
4831        }
4832
4833        // 3) For each window, compute per-row value.
4834        // Index: same order as window_nodes; for row i, win_vals[w][i].
4835        let mut win_vals: Vec<Vec<Value>> = Vec::with_capacity(window_nodes.len());
4836        for wnode in &window_nodes {
4837            let Expr::WindowFunction {
4838                name,
4839                args,
4840                partition_by,
4841                order_by,
4842                frame,
4843                null_treatment,
4844            } = wnode
4845            else {
4846                unreachable!("collect_window_nodes pushes only WindowFunction");
4847            };
4848            // Compute (partition_key, order_key, original_index) for each row.
4849            let mut indexed: Vec<(Vec<Value>, Vec<(Value, bool)>, usize)> =
4850                Vec::with_capacity(n_rows);
4851            for (i, row) in filtered.iter().enumerate() {
4852                let pkey: Vec<Value> = partition_by
4853                    .iter()
4854                    .map(|p| eval::eval_expr(p, row, &ctx))
4855                    .collect::<Result<_, _>>()?;
4856                let okey: Vec<(Value, bool)> = order_by
4857                    .iter()
4858                    .map(|(e, desc)| eval::eval_expr(e, row, &ctx).map(|v| (v, *desc)))
4859                    .collect::<Result<_, _>>()?;
4860                indexed.push((pkey, okey, i));
4861            }
4862            // Sort by (partition_key, order_key). Partition key uses
4863            // a stable encoded form; order key respects ASC/DESC.
4864            indexed.sort_by(|a, b| {
4865                let p_cmp = partition_key_cmp(&a.0, &b.0);
4866                if p_cmp != core::cmp::Ordering::Equal {
4867                    return p_cmp;
4868                }
4869                order_key_cmp(&a.1, &b.1)
4870            });
4871            // Per-partition compute.
4872            let mut out_vals: Vec<Value> = alloc::vec![Value::Null; n_rows];
4873            let mut p_start = 0;
4874            while p_start < indexed.len() {
4875                let mut p_end = p_start + 1;
4876                while p_end < indexed.len()
4877                    && partition_key_cmp(&indexed[p_start].0, &indexed[p_end].0)
4878                        == core::cmp::Ordering::Equal
4879                {
4880                    p_end += 1;
4881                }
4882                // Compute the function within this partition slice.
4883                compute_window_partition(
4884                    name,
4885                    args,
4886                    !order_by.is_empty(),
4887                    frame.as_ref(),
4888                    *null_treatment,
4889                    &indexed[p_start..p_end],
4890                    &filtered,
4891                    &ctx,
4892                    &mut out_vals,
4893                )?;
4894                p_start = p_end;
4895            }
4896            win_vals.push(out_vals);
4897        }
4898
4899        // 4) Build extended schema: original columns + synthetic.
4900        let mut ext_cols = schema_cols.clone();
4901        for i in 0..window_nodes.len() {
4902            ext_cols.push(ColumnSchema::new(
4903                alloc::format!("__win_{i}"),
4904                DataType::Text, // type doesn't matter for projection eval
4905                true,
4906            ));
4907        }
4908        // 5) Build extended rows: each row gets its window values appended.
4909        let mut ext_rows: Vec<Row> = Vec::with_capacity(n_rows);
4910        for i in 0..n_rows {
4911            let mut values = filtered[i].values.clone();
4912            for w in 0..window_nodes.len() {
4913                values.push(win_vals[w][i].clone());
4914            }
4915            ext_rows.push(Row::new(values));
4916        }
4917        // 6) Rewrite the projection: WindowFunction nodes → Column(__win_N).
4918        let mut rewritten_items: Vec<SelectItem> = Vec::with_capacity(stmt.items.len());
4919        for item in &stmt.items {
4920            let new_item = match item {
4921                SelectItem::Wildcard => SelectItem::Wildcard,
4922                SelectItem::Expr { expr, alias } => {
4923                    let mut e = expr.clone();
4924                    rewrite_window_to_columns(&mut e, &window_nodes);
4925                    SelectItem::Expr {
4926                        expr: e,
4927                        alias: alias.clone(),
4928                    }
4929                }
4930            };
4931            rewritten_items.push(new_item);
4932        }
4933
4934        // 7) Project into final rows.
4935        let ext_ctx = EvalContext::new(&ext_cols, Some(alias));
4936        let projection = build_projection(&rewritten_items, &ext_cols, alias)?;
4937        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(n_rows);
4938        for (i, row) in ext_rows.iter().enumerate() {
4939            if i.is_multiple_of(256) {
4940                cancel.check()?;
4941            }
4942            let mut values = Vec::with_capacity(projection.len());
4943            for p in &projection {
4944                values.push(eval::eval_expr(&p.expr, row, &ext_ctx)?);
4945            }
4946            let order_keys = if stmt.order_by.is_empty() {
4947                Vec::new()
4948            } else {
4949                let mut keys = Vec::with_capacity(stmt.order_by.len());
4950                for o in &stmt.order_by {
4951                    let mut e = o.expr.clone();
4952                    rewrite_window_to_columns(&mut e, &window_nodes);
4953                    let key = eval::eval_expr(&e, row, &ext_ctx)?;
4954                    keys.push(value_to_order_key(&key)?);
4955                }
4956                keys
4957            };
4958            tagged.push((order_keys, Row::new(values)));
4959        }
4960        // ORDER BY + LIMIT/OFFSET on the projected rows.
4961        if !stmt.order_by.is_empty() {
4962            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4963            sort_by_keys(&mut tagged, &descs);
4964        }
4965        let mut out_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4966        apply_offset_and_limit(&mut out_rows, stmt.offset_literal(), stmt.limit_literal());
4967        let final_cols: Vec<ColumnSchema> = projection
4968            .into_iter()
4969            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4970            .collect();
4971        Ok(QueryResult::Rows {
4972            columns: final_cols,
4973            rows: out_rows,
4974        })
4975    }
4976
4977    /// v4.11: materialise each CTE into a temp table inside a
4978    /// cloned catalog, then run the body SELECT against a fresh
4979    /// engine instance that owns the enriched catalog. The clone
4980    /// is moderately expensive — only paid by CTE-bearing queries.
4981    /// Subqueries inside CTE bodies / the main body resolve as
4982    /// usual; `clock_fn` is propagated so `NOW()` lines up.
4983    fn exec_with_ctes(
4984        &self,
4985        stmt: &SelectStatement,
4986        cancel: CancelToken<'_>,
4987    ) -> Result<QueryResult, EngineError> {
4988        cancel.check()?;
4989        let mut catalog = self.active_catalog().clone();
4990        for cte in &stmt.ctes {
4991            if catalog.get(&cte.name).is_some() {
4992                return Err(EngineError::Unsupported(alloc::format!(
4993                    "CTE name {:?} shadows an existing table; rename the CTE",
4994                    cte.name
4995                )));
4996            }
4997            let (columns, rows) = if cte.recursive {
4998                self.materialise_recursive_cte(cte, &catalog, cancel)?
4999            } else {
5000                let body_result = self.exec_select_cancel(&cte.body, cancel)?;
5001                let QueryResult::Rows { columns, rows } = body_result else {
5002                    return Err(EngineError::Unsupported(alloc::format!(
5003                        "CTE {:?} body did not return rows",
5004                        cte.name
5005                    )));
5006                };
5007                (columns, rows)
5008            };
5009            // v4.22: the projection builder labels any non-column
5010            // expression as Text — including literal SELECT 1.
5011            // Promote each column's type to whatever the rows
5012            // actually carry so the CTE storage table accepts them.
5013            let inferred = infer_column_types(&columns, &rows);
5014            let mut columns = inferred;
5015            // v4.22: apply optional `WITH name(a, b, c)` overrides.
5016            if !cte.column_overrides.is_empty() {
5017                if cte.column_overrides.len() != columns.len() {
5018                    return Err(EngineError::Unsupported(alloc::format!(
5019                        "CTE {:?} column list has {} names but body returns {} columns",
5020                        cte.name,
5021                        cte.column_overrides.len(),
5022                        columns.len()
5023                    )));
5024                }
5025                for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
5026                    col.name.clone_from(name);
5027                }
5028            }
5029            let schema = TableSchema::new(cte.name.clone(), columns);
5030            catalog.create_table(schema).map_err(EngineError::Storage)?;
5031            let table = catalog
5032                .get_mut(&cte.name)
5033                .expect("just-created CTE table must exist");
5034            for row in rows {
5035                table.insert(row).map_err(EngineError::Storage)?;
5036            }
5037        }
5038        // Strip CTEs from the body before running on the temp engine
5039        // so we don't recurse forever.
5040        let mut body = stmt.clone();
5041        body.ctes = Vec::new();
5042        let mut temp = Engine::restore(catalog);
5043        if let Some(c) = self.clock {
5044            temp = temp.with_clock(c);
5045        }
5046        if let Some(f) = self.salt_fn {
5047            temp = temp.with_salt_fn(f);
5048        }
5049        temp.exec_select_cancel(&body, cancel)
5050    }
5051
5052    /// v4.22: materialise a WITH RECURSIVE CTE. The body must be a
5053    /// UNION (or UNION ALL) of an anchor that does not reference
5054    /// the CTE name, and one or more recursive terms that do. The
5055    /// anchor runs first; each subsequent iteration runs the
5056    /// recursive term against a temp catalog where the CTE name is
5057    /// bound to the *previous* iteration's output. Iteration stops
5058    /// when the recursive term yields no rows; UNION (DISTINCT)
5059    /// deduplicates against the accumulated result, UNION ALL does
5060    /// not. A hard cap on total rows prevents runaway queries.
5061    #[allow(clippy::too_many_lines)]
5062    fn materialise_recursive_cte(
5063        &self,
5064        cte: &spg_sql::ast::Cte,
5065        base_catalog: &Catalog,
5066        cancel: CancelToken<'_>,
5067    ) -> Result<(Vec<ColumnSchema>, Vec<Row>), EngineError> {
5068        const MAX_TOTAL_ROWS: usize = 1_000_000;
5069        const MAX_ITERATIONS: usize = 100_000;
5070        cancel.check()?;
5071        if cte.body.unions.is_empty() {
5072            return Err(EngineError::Unsupported(alloc::format!(
5073                "WITH RECURSIVE {:?} body must be a UNION of an anchor and a recursive term",
5074                cte.name
5075            )));
5076        }
5077        // Anchor: the body's leading SELECT, with unions stripped.
5078        let mut anchor = cte.body.clone();
5079        let union_terms = core::mem::take(&mut anchor.unions);
5080        anchor.ctes = Vec::new();
5081        // Anchor must not reference the CTE name.
5082        if select_refers_to(&anchor, &cte.name) {
5083            return Err(EngineError::Unsupported(alloc::format!(
5084                "WITH RECURSIVE {:?}: the anchor must not reference the CTE itself",
5085                cte.name
5086            )));
5087        }
5088        let anchor_result = self.exec_select_cancel(&anchor, cancel)?;
5089        let QueryResult::Rows {
5090            columns: anchor_cols,
5091            rows: anchor_rows,
5092        } = anchor_result
5093        else {
5094            return Err(EngineError::Unsupported(alloc::format!(
5095                "WITH RECURSIVE {:?}: anchor did not return rows",
5096                cte.name
5097            )));
5098        };
5099        // The projection builder labels non-column expressions Text;
5100        // refine column types from the anchor's actual values so the
5101        // intermediate iter-catalog tables accept them.
5102        let mut columns = infer_column_types(&anchor_cols, &anchor_rows);
5103        if !cte.column_overrides.is_empty() {
5104            if cte.column_overrides.len() != columns.len() {
5105                return Err(EngineError::Unsupported(alloc::format!(
5106                    "CTE {:?} column list has {} names but anchor returns {} columns",
5107                    cte.name,
5108                    cte.column_overrides.len(),
5109                    columns.len()
5110                )));
5111            }
5112            for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
5113                col.name.clone_from(name);
5114            }
5115        }
5116        let mut all_rows: Vec<Row> = anchor_rows.clone();
5117        let mut working_set: Vec<Row> = anchor_rows;
5118        let mut seen: alloc::collections::BTreeSet<Vec<u8>> = alloc::collections::BTreeSet::new();
5119        // Track at least one "all UNION ALL" flag — if every union
5120        // kind is ALL we skip the dedup step (faster + matches PG).
5121        let all_union_all = union_terms.iter().all(|(k, _)| matches!(k, UnionKind::All));
5122        if !all_union_all {
5123            for r in &all_rows {
5124                seen.insert(encode_row_key(r));
5125            }
5126        }
5127        for iter in 0..MAX_ITERATIONS {
5128            cancel.check()?;
5129            if working_set.is_empty() {
5130                break;
5131            }
5132            // Build a fresh catalog: base + CTE bound to working_set.
5133            let mut iter_catalog = base_catalog.clone();
5134            let schema = TableSchema::new(cte.name.clone(), columns.clone());
5135            iter_catalog
5136                .create_table(schema)
5137                .map_err(EngineError::Storage)?;
5138            {
5139                let table = iter_catalog.get_mut(&cte.name).expect("just-created");
5140                for row in &working_set {
5141                    table.insert(row.clone()).map_err(EngineError::Storage)?;
5142                }
5143            }
5144            let mut iter_engine = Engine::restore(iter_catalog);
5145            if let Some(c) = self.clock {
5146                iter_engine = iter_engine.with_clock(c);
5147            }
5148            if let Some(f) = self.salt_fn {
5149                iter_engine = iter_engine.with_salt_fn(f);
5150            }
5151            // Run each recursive term in sequence and collect new rows.
5152            let mut next_set: Vec<Row> = Vec::new();
5153            for (_, term) in &union_terms {
5154                let mut term = term.clone();
5155                term.ctes = Vec::new();
5156                let r = iter_engine.exec_select_cancel(&term, cancel)?;
5157                let QueryResult::Rows {
5158                    columns: rc,
5159                    rows: rs,
5160                } = r
5161                else {
5162                    return Err(EngineError::Unsupported(alloc::format!(
5163                        "WITH RECURSIVE {:?}: recursive term did not return rows",
5164                        cte.name
5165                    )));
5166                };
5167                if rc.len() != columns.len() {
5168                    return Err(EngineError::Unsupported(alloc::format!(
5169                        "WITH RECURSIVE {:?}: column count of recursive term ({}) does not match anchor ({})",
5170                        cte.name,
5171                        rc.len(),
5172                        columns.len()
5173                    )));
5174                }
5175                for row in rs {
5176                    if !all_union_all {
5177                        let key = encode_row_key(&row);
5178                        if !seen.insert(key) {
5179                            continue;
5180                        }
5181                    }
5182                    next_set.push(row);
5183                }
5184            }
5185            if next_set.is_empty() {
5186                break;
5187            }
5188            all_rows.extend(next_set.iter().cloned());
5189            working_set = next_set;
5190            if all_rows.len() > MAX_TOTAL_ROWS {
5191                return Err(EngineError::Unsupported(alloc::format!(
5192                    "WITH RECURSIVE {:?}: produced more than {MAX_TOTAL_ROWS} rows — likely runaway recursion",
5193                    cte.name
5194                )));
5195            }
5196            if iter + 1 == MAX_ITERATIONS {
5197                return Err(EngineError::Unsupported(alloc::format!(
5198                    "WITH RECURSIVE {:?}: exceeded {MAX_ITERATIONS} iterations",
5199                    cte.name
5200                )));
5201            }
5202        }
5203        Ok((columns, all_rows))
5204    }
5205
5206    fn resolve_select_subqueries(
5207        &self,
5208        stmt: &mut SelectStatement,
5209        cancel: CancelToken<'_>,
5210    ) -> Result<(), EngineError> {
5211        for item in &mut stmt.items {
5212            if let SelectItem::Expr { expr, .. } = item {
5213                self.resolve_expr_subqueries(expr, cancel)?;
5214            }
5215        }
5216        if let Some(w) = &mut stmt.where_ {
5217            self.resolve_expr_subqueries(w, cancel)?;
5218        }
5219        if let Some(gs) = &mut stmt.group_by {
5220            for g in gs {
5221                self.resolve_expr_subqueries(g, cancel)?;
5222            }
5223        }
5224        if let Some(h) = &mut stmt.having {
5225            self.resolve_expr_subqueries(h, cancel)?;
5226        }
5227        for o in &mut stmt.order_by {
5228            self.resolve_expr_subqueries(&mut o.expr, cancel)?;
5229        }
5230        for (_, peer) in &mut stmt.unions {
5231            self.resolve_select_subqueries(peer, cancel)?;
5232        }
5233        Ok(())
5234    }
5235
5236    #[allow(clippy::only_used_in_recursion)] // engine handle reads aren't really pure
5237    fn resolve_expr_subqueries(
5238        &self,
5239        e: &mut Expr,
5240        cancel: CancelToken<'_>,
5241    ) -> Result<(), EngineError> {
5242        // Replace-on-this-node cases first.
5243        if let Some(replacement) = self.subquery_replacement(e, cancel)? {
5244            *e = replacement;
5245            return Ok(());
5246        }
5247        match e {
5248            Expr::Binary { lhs, rhs, .. } => {
5249                self.resolve_expr_subqueries(lhs, cancel)?;
5250                self.resolve_expr_subqueries(rhs, cancel)?;
5251            }
5252            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5253                self.resolve_expr_subqueries(expr, cancel)?;
5254            }
5255            Expr::FunctionCall { args, .. } => {
5256                for a in args {
5257                    self.resolve_expr_subqueries(a, cancel)?;
5258                }
5259            }
5260            Expr::Like { expr, pattern, .. } => {
5261                self.resolve_expr_subqueries(expr, cancel)?;
5262                self.resolve_expr_subqueries(pattern, cancel)?;
5263            }
5264            Expr::Extract { source, .. } => self.resolve_expr_subqueries(source, cancel)?,
5265            // v4.12 window functions — recurse into args + ORDER BY
5266            // + PARTITION BY in case they carry inner subqueries.
5267            Expr::WindowFunction {
5268                args,
5269                partition_by,
5270                order_by,
5271                ..
5272            } => {
5273                for a in args {
5274                    self.resolve_expr_subqueries(a, cancel)?;
5275                }
5276                for p in partition_by {
5277                    self.resolve_expr_subqueries(p, cancel)?;
5278                }
5279                for (e, _) in order_by {
5280                    self.resolve_expr_subqueries(e, cancel)?;
5281                }
5282            }
5283            // Subquery nodes are handled in subquery_replacement
5284            // (which returned None — defensive no-op); Literal /
5285            // Column are leaves.
5286            Expr::ScalarSubquery(_)
5287            | Expr::Exists { .. }
5288            | Expr::InSubquery { .. }
5289            | Expr::Literal(_)
5290            | Expr::Placeholder(_)
5291            | Expr::Column(_) => {}
5292        }
5293        Ok(())
5294    }
5295
5296    /// v4.23: per-row eval that handles correlated subqueries.
5297    /// Equivalent to `eval::eval_expr` when the expression has no
5298    /// subqueries; otherwise clones the expression, substitutes
5299    /// outer-row columns into each surviving subquery node, runs
5300    /// the inner SELECT, and replaces the node with the literal
5301    /// result. Only the WHERE-filter call sites use this path so
5302    /// the uncorrelated fast path is preserved everywhere else.
5303    fn eval_expr_with_correlated(
5304        &self,
5305        expr: &Expr,
5306        row: &Row,
5307        ctx: &EvalContext<'_>,
5308        cancel: CancelToken<'_>,
5309        memo: Option<&mut memoize::MemoizeCache>,
5310    ) -> Result<Value, EngineError> {
5311        if !expr_has_subquery(expr) {
5312            return eval::eval_expr(expr, row, ctx).map_err(EngineError::Eval);
5313        }
5314        let mut e = expr.clone();
5315        self.resolve_correlated_in_expr(&mut e, row, ctx, cancel, memo)?;
5316        eval::eval_expr(&e, row, ctx).map_err(EngineError::Eval)
5317    }
5318
5319    fn resolve_correlated_in_expr(
5320        &self,
5321        e: &mut Expr,
5322        row: &Row,
5323        ctx: &EvalContext<'_>,
5324        cancel: CancelToken<'_>,
5325        mut memo: Option<&mut memoize::MemoizeCache>,
5326    ) -> Result<(), EngineError> {
5327        match e {
5328            Expr::ScalarSubquery(inner) => {
5329                // v6.2.6 — Memoize: build the cache key from the
5330                // pre-substitution subquery repr + the outer row's
5331                // values. Two outer rows with identical correlated
5332                // values hit the same entry.
5333                let cache_key = memo.as_ref().map(|_| memoize::CacheKey {
5334                    subquery_repr: alloc::format!("{}", **inner),
5335                    outer_values: row.values.clone(),
5336                });
5337                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key.as_ref())
5338                    && let Some(cached) = cache.get(k)
5339                {
5340                    *e = value_to_literal_expr(cached)?;
5341                    return Ok(());
5342                }
5343                let mut s = (**inner).clone();
5344                substitute_outer_columns(&mut s, row, ctx);
5345                let r = self.exec_select_cancel(&s, cancel)?;
5346                let QueryResult::Rows { rows, .. } = r else {
5347                    return Err(EngineError::Unsupported(
5348                        "scalar subquery: inner did not return rows".into(),
5349                    ));
5350                };
5351                let value = match rows.as_slice() {
5352                    [] => Value::Null,
5353                    [r0] => r0.values.first().cloned().unwrap_or(Value::Null),
5354                    _ => {
5355                        return Err(EngineError::Unsupported(alloc::format!(
5356                            "scalar subquery returned {} rows; expected 0 or 1",
5357                            rows.len()
5358                        )));
5359                    }
5360                };
5361                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key) {
5362                    cache.insert(k, value.clone());
5363                }
5364                *e = value_to_literal_expr(value)?;
5365            }
5366            Expr::Exists { subquery, negated } => {
5367                let mut s = (**subquery).clone();
5368                substitute_outer_columns(&mut s, row, ctx);
5369                let r = self.exec_select_cancel(&s, cancel)?;
5370                let exists = matches!(r, QueryResult::Rows { rows, .. } if !rows.is_empty());
5371                let bit = if *negated { !exists } else { exists };
5372                *e = Expr::Literal(Literal::Bool(bit));
5373            }
5374            Expr::InSubquery {
5375                expr: lhs,
5376                subquery,
5377                negated,
5378            } => {
5379                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5380                let lhs_val = eval::eval_expr(lhs, row, ctx).map_err(EngineError::Eval)?;
5381                let mut s = (**subquery).clone();
5382                substitute_outer_columns(&mut s, row, ctx);
5383                let r = self.exec_select_cancel(&s, cancel)?;
5384                let QueryResult::Rows { columns, rows, .. } = r else {
5385                    return Err(EngineError::Unsupported(
5386                        "IN-subquery: inner did not return rows".into(),
5387                    ));
5388                };
5389                if columns.len() != 1 {
5390                    return Err(EngineError::Unsupported(alloc::format!(
5391                        "IN-subquery must project exactly one column; got {}",
5392                        columns.len()
5393                    )));
5394                }
5395                let mut found = false;
5396                let mut any_null = false;
5397                for r0 in rows {
5398                    let v = r0.values.into_iter().next().unwrap_or(Value::Null);
5399                    if v.is_null() {
5400                        any_null = true;
5401                        continue;
5402                    }
5403                    if value_cmp(&v, &lhs_val) == core::cmp::Ordering::Equal {
5404                        found = true;
5405                        break;
5406                    }
5407                }
5408                let bit = if found {
5409                    !*negated
5410                } else if any_null {
5411                    return Err(EngineError::Unsupported(
5412                        "IN-subquery with NULL in result and no match: NULL semantics not yet implemented".into(),
5413                    ));
5414                } else {
5415                    *negated
5416                };
5417                *e = Expr::Literal(Literal::Bool(bit));
5418            }
5419            Expr::Binary { lhs, rhs, .. } => {
5420                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5421                self.resolve_correlated_in_expr(rhs, row, ctx, cancel, memo.as_deref_mut())?;
5422            }
5423            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5424                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5425            }
5426            Expr::Like { expr, pattern, .. } => {
5427                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5428                self.resolve_correlated_in_expr(pattern, row, ctx, cancel, memo.as_deref_mut())?;
5429            }
5430            Expr::FunctionCall { args, .. } => {
5431                for a in args {
5432                    self.resolve_correlated_in_expr(a, row, ctx, cancel, memo.as_deref_mut())?;
5433                }
5434            }
5435            Expr::Extract { source, .. } => {
5436                self.resolve_correlated_in_expr(source, row, ctx, cancel, memo.as_deref_mut())?;
5437            }
5438            Expr::WindowFunction { .. } | Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
5439        }
5440        Ok(())
5441    }
5442
5443    fn subquery_replacement(
5444        &self,
5445        e: &Expr,
5446        cancel: CancelToken<'_>,
5447    ) -> Result<Option<Expr>, EngineError> {
5448        match e {
5449            Expr::ScalarSubquery(inner) => {
5450                let mut s = (**inner).clone();
5451                // Recurse into the inner SELECT first so nested
5452                // subqueries materialise bottom-up.
5453                self.resolve_select_subqueries(&mut s, cancel)?;
5454                let r = match self.exec_bare_select_cancel(&s, cancel) {
5455                    Ok(r) => r,
5456                    Err(e) if is_correlation_error(&e) => return Ok(None),
5457                    Err(e) => return Err(e),
5458                };
5459                let QueryResult::Rows { rows, .. } = r else {
5460                    return Err(EngineError::Unsupported(
5461                        "scalar subquery: inner statement did not return rows".into(),
5462                    ));
5463                };
5464                let value = match rows.as_slice() {
5465                    [] => Value::Null,
5466                    [row] => row.values.first().cloned().unwrap_or(Value::Null),
5467                    _ => {
5468                        return Err(EngineError::Unsupported(alloc::format!(
5469                            "scalar subquery returned {} rows; expected 0 or 1",
5470                            rows.len()
5471                        )));
5472                    }
5473                };
5474                Ok(Some(value_to_literal_expr(value)?))
5475            }
5476            Expr::Exists { subquery, negated } => {
5477                let mut s = (**subquery).clone();
5478                self.resolve_select_subqueries(&mut s, cancel)?;
5479                let r = match self.exec_bare_select_cancel(&s, cancel) {
5480                    Ok(r) => r,
5481                    Err(e) if is_correlation_error(&e) => return Ok(None),
5482                    Err(e) => return Err(e),
5483                };
5484                let exists = match r {
5485                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
5486                    QueryResult::CommandOk { .. } => false,
5487                };
5488                let bit = if *negated { !exists } else { exists };
5489                Ok(Some(Expr::Literal(Literal::Bool(bit))))
5490            }
5491            Expr::InSubquery {
5492                expr,
5493                subquery,
5494                negated,
5495            } => {
5496                let mut s = (**subquery).clone();
5497                self.resolve_select_subqueries(&mut s, cancel)?;
5498                let r = match self.exec_bare_select_cancel(&s, cancel) {
5499                    Ok(r) => r,
5500                    Err(e) if is_correlation_error(&e) => return Ok(None),
5501                    Err(e) => return Err(e),
5502                };
5503                let QueryResult::Rows { columns, rows, .. } = r else {
5504                    return Err(EngineError::Unsupported(
5505                        "IN-subquery: inner statement did not return rows".into(),
5506                    ));
5507                };
5508                if columns.len() != 1 {
5509                    return Err(EngineError::Unsupported(alloc::format!(
5510                        "IN-subquery must project exactly one column; got {}",
5511                        columns.len()
5512                    )));
5513                }
5514                // Build the same OR-Eq chain the parse-time literal-list
5515                // path constructs, with each value lifted into a Literal.
5516                let mut acc: Option<Expr> = None;
5517                for row in rows {
5518                    let v = row.values.into_iter().next().unwrap_or(Value::Null);
5519                    let lit = value_to_literal_expr(v)?;
5520                    let cmp = Expr::Binary {
5521                        lhs: expr.clone(),
5522                        op: BinOp::Eq,
5523                        rhs: Box::new(lit),
5524                    };
5525                    acc = Some(match acc {
5526                        None => cmp,
5527                        Some(prev) => Expr::Binary {
5528                            lhs: Box::new(prev),
5529                            op: BinOp::Or,
5530                            rhs: Box::new(cmp),
5531                        },
5532                    });
5533                }
5534                let combined = acc.unwrap_or(Expr::Literal(Literal::Bool(false)));
5535                let final_expr = if *negated {
5536                    Expr::Unary {
5537                        op: UnOp::Not,
5538                        expr: Box::new(combined),
5539                    }
5540                } else {
5541                    combined
5542                };
5543                Ok(Some(final_expr))
5544            }
5545            _ => Ok(None),
5546        }
5547    }
5548}
5549
5550// ---- v4.12 window-function helpers ----
5551// The (partition-key, order-key, original-index) tuple shape used
5552// across these helpers is intrinsic to the planner. Factoring it
5553// into a typedef adds indirection without making the code clearer,
5554// so several lints are allowed inline on the affected functions
5555// rather than module-wide.
5556
5557/// v4.22: cheap structural scan for `FROM <name>` (qualified or
5558/// not) inside a SELECT — used to verify the anchor of a WITH
5559/// RECURSIVE CTE doesn't recurse into itself. Conservative: walks
5560/// FROM joins, subqueries, and unions.
5561fn select_refers_to(stmt: &SelectStatement, target: &str) -> bool {
5562    if let Some(from) = &stmt.from
5563        && from_refers_to(from, target)
5564    {
5565        return true;
5566    }
5567    for (_, peer) in &stmt.unions {
5568        if select_refers_to(peer, target) {
5569            return true;
5570        }
5571    }
5572    for item in &stmt.items {
5573        if let SelectItem::Expr { expr, .. } = item
5574            && expr_refers_to(expr, target)
5575        {
5576            return true;
5577        }
5578    }
5579    if let Some(w) = &stmt.where_
5580        && expr_refers_to(w, target)
5581    {
5582        return true;
5583    }
5584    false
5585}
5586
5587fn from_refers_to(from: &FromClause, target: &str) -> bool {
5588    if from.primary.name.eq_ignore_ascii_case(target) {
5589        return true;
5590    }
5591    from.joins
5592        .iter()
5593        .any(|j| j.table.name.eq_ignore_ascii_case(target))
5594}
5595
5596fn expr_refers_to(e: &Expr, target: &str) -> bool {
5597    match e {
5598        Expr::ScalarSubquery(s) => select_refers_to(s, target),
5599        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
5600            select_refers_to(subquery, target)
5601        }
5602        Expr::Binary { lhs, rhs, .. } => expr_refers_to(lhs, target) || expr_refers_to(rhs, target),
5603        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5604            expr_refers_to(expr, target)
5605        }
5606        Expr::Like { expr, pattern, .. } => {
5607            expr_refers_to(expr, target) || expr_refers_to(pattern, target)
5608        }
5609        Expr::FunctionCall { args, .. } => args.iter().any(|a| expr_refers_to(a, target)),
5610        Expr::Extract { source, .. } => expr_refers_to(source, target),
5611        Expr::WindowFunction {
5612            args,
5613            partition_by,
5614            order_by,
5615            ..
5616        } => {
5617            args.iter().any(|a| expr_refers_to(a, target))
5618                || partition_by.iter().any(|p| expr_refers_to(p, target))
5619                || order_by.iter().any(|(o, _)| expr_refers_to(o, target))
5620        }
5621        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
5622    }
5623}
5624
5625/// v4.22: pick more specific column types from observed rows when
5626/// the projection builder defaulted to Text (the v1.x behavior for
5627/// non-column expressions). Lets `WITH t(n) AS (SELECT 1 ...)`
5628/// land an Int column in the CTE storage table rather than failing
5629/// the insert with "expected TEXT, got INT".
5630fn infer_column_types(columns: &[ColumnSchema], rows: &[Row]) -> Vec<ColumnSchema> {
5631    let mut out = columns.to_vec();
5632    for (col_idx, col) in out.iter_mut().enumerate() {
5633        if col.ty != DataType::Text {
5634            continue;
5635        }
5636        let mut inferred: Option<DataType> = None;
5637        let mut all_null = true;
5638        for row in rows {
5639            let Some(v) = row.values.get(col_idx) else {
5640                continue;
5641            };
5642            let ty = match v {
5643                Value::Null => continue,
5644                Value::SmallInt(_) => DataType::SmallInt,
5645                Value::Int(_) => DataType::Int,
5646                Value::BigInt(_) => DataType::BigInt,
5647                Value::Float(_) => DataType::Float,
5648                Value::Bool(_) => DataType::Bool,
5649                Value::Vector(_) => DataType::Vector {
5650                    dim: 0,
5651                    encoding: VecEncoding::F32,
5652                },
5653                _ => DataType::Text,
5654            };
5655            all_null = false;
5656            inferred = Some(match inferred {
5657                None => ty,
5658                Some(prev) if prev == ty => prev,
5659                Some(_) => DataType::Text,
5660            });
5661        }
5662        if let Some(t) = inferred {
5663            col.ty = t;
5664            col.nullable = true;
5665        } else if all_null {
5666            col.nullable = true;
5667        }
5668    }
5669    out
5670}
5671
5672/// v4.26: render a human-readable plan tree for `EXPLAIN <select>`.
5673/// Lines are pushed into `out`; `depth` controls indentation. We
5674/// describe the rewritten SELECT — what the executor *would* do —
5675/// using the engine handle to spot indexed lookups and table shapes.
5676#[allow(clippy::too_many_lines, clippy::format_push_string)]
5677/// v6.2.4 — Walk every line of the rendered plan tree and append
5678/// per-operator stats. Lines that name a known operator get
5679/// `(rows=N)` (`actual_rows` of the top-level operator equals the
5680/// final result row count; scans report their catalog row count
5681/// as the rows-considered metric). Other lines — Filter / Join /
5682/// GroupBy / OrderBy etc. — are marked `(—)` so the surface is
5683/// complete-by-construction; v6.2.5 fills these in via inline
5684/// executor counters.
5685/// v6.8.3 — surface "CREATE INDEX …" suggestions for every
5686/// `(table, column)` pair the query touches via WHERE / JOIN
5687/// that doesn't already have an index on the owning table.
5688/// Walks the SELECT's FROM clauses + WHERE expression tree;
5689/// returns one line per missing index. Deterministic order:
5690/// FROM-clause iteration order, then column-reference walk
5691/// order inside each WHERE. Each suggestion is a copy-pastable
5692/// DDL string.
5693fn build_index_suggestions(stmt: &SelectStatement, engine: &Engine) -> Vec<String> {
5694    use alloc::collections::BTreeSet;
5695    let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
5696    let mut out: Vec<String> = Vec::new();
5697    let cat = engine.active_catalog();
5698    // Build a (table, qualifier-or-alias) list from the FROM clause
5699    // so unqualified column refs in WHERE resolve to the correct
5700    // table.
5701    let Some(from) = &stmt.from else {
5702        return out;
5703    };
5704    let mut tables: Vec<String> = Vec::new();
5705    tables.push(from.primary.name.clone());
5706    for j in &from.joins {
5707        tables.push(j.table.name.clone());
5708    }
5709    // Collect column refs from the WHERE expression. JOIN ON
5710    // predicates also feed in.
5711    let mut col_refs: Vec<spg_sql::ast::ColumnName> = Vec::new();
5712    if let Some(w) = &stmt.where_ {
5713        collect_column_refs(w, &mut col_refs);
5714    }
5715    for j in &from.joins {
5716        if let Some(on) = &j.on {
5717            collect_column_refs(on, &mut col_refs);
5718        }
5719    }
5720    for cn in &col_refs {
5721        // Resolve owner table: explicit qualifier first, else
5722        // first table in FROM that has a column of this name.
5723        let owner: Option<String> = if let Some(q) = &cn.qualifier {
5724            tables.iter().find(|t| t == &q).cloned()
5725        } else {
5726            tables.iter().find_map(|t| {
5727                cat.get(t).and_then(|tbl| {
5728                    if tbl.schema().column_position(&cn.name).is_some() {
5729                        Some(t.clone())
5730                    } else {
5731                        None
5732                    }
5733                })
5734            })
5735        };
5736        let Some(owner) = owner else {
5737            continue;
5738        };
5739        let Some(tbl) = cat.get(&owner) else {
5740            continue;
5741        };
5742        let Some(col_pos) = tbl.schema().column_position(&cn.name) else {
5743            continue;
5744        };
5745        // Skip if any BTree index already covers this column as
5746        // its key.
5747        let already_indexed = tbl.indices().iter().any(|i| {
5748            matches!(i.kind, spg_storage::IndexKind::BTree(_))
5749                && i.column_position == col_pos
5750                && i.expression.is_none()
5751                && i.partial_predicate.is_none()
5752        });
5753        if already_indexed {
5754            continue;
5755        }
5756        if seen.insert((owner.clone(), cn.name.clone())) {
5757            out.push(alloc::format!(
5758                "SUGGEST: CREATE INDEX ix_{}_{} ON {} ({})",
5759                owner,
5760                cn.name,
5761                owner,
5762                cn.name
5763            ));
5764        }
5765    }
5766    out
5767}
5768
5769/// Walks an `Expr` and pushes every `ColumnName` it references.
5770/// Order is depth-first, left-to-right.
5771fn collect_column_refs(expr: &Expr, out: &mut Vec<spg_sql::ast::ColumnName>) {
5772    match expr {
5773        Expr::Column(cn) => out.push(cn.clone()),
5774        Expr::FunctionCall { args, .. } => {
5775            for a in args {
5776                collect_column_refs(a, out);
5777            }
5778        }
5779        Expr::Binary { lhs, rhs, .. } => {
5780            collect_column_refs(lhs, out);
5781            collect_column_refs(rhs, out);
5782        }
5783        Expr::Unary { expr: e, .. } => collect_column_refs(e, out),
5784        _ => {}
5785    }
5786}
5787
5788fn annotate_explain_lines(lines: &mut [String], total_rows: usize, engine: &Engine) {
5789    let catalog = engine.active_catalog();
5790    let cold_ids = catalog.cold_segment_ids_global();
5791    let any_cold = !cold_ids.is_empty();
5792    let cold_ids_repr = if any_cold {
5793        let mut s = alloc::string::String::from("[");
5794        for (i, id) in cold_ids.iter().enumerate() {
5795            if i > 0 {
5796                s.push(',');
5797            }
5798            s.push_str(&alloc::format!("{id}"));
5799        }
5800        s.push(']');
5801        s
5802    } else {
5803        alloc::string::String::new()
5804    };
5805    for (idx, line) in lines.iter_mut().enumerate() {
5806        let trimmed = line.trim_start();
5807        let is_top_level = idx == 0;
5808        if is_top_level {
5809            line.push_str(&alloc::format!(" (rows={total_rows})"));
5810            continue;
5811        }
5812        if let Some(rest) = trimmed.strip_prefix("From: ") {
5813            let (name, scan_kind) = match rest.split_once(" [") {
5814                Some((n, k)) => (n.trim(), k.trim_end_matches(']')),
5815                None => (rest.trim(), ""),
5816            };
5817            let bare = name.split_whitespace().next().unwrap_or(name);
5818            let hot = catalog.get(bare).map(|t| t.rows().len());
5819            // v6.2.7 — `cold_segments=[id0,id1,…]` enumerates every
5820            // cold-tier segment the scan COULD have walked. v6.2.x
5821            // can tighten to per-table by walking the table's
5822            // BTree-index cold locators.
5823            let annot = match (hot, scan_kind) {
5824                (Some(h), "full scan") => {
5825                    let mut s = alloc::format!(" (hot_rows={h}");
5826                    if any_cold {
5827                        s.push_str(&alloc::format!(
5828                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5829                        ));
5830                    }
5831                    s.push(')');
5832                    s
5833                }
5834                (Some(h), "index seek") => {
5835                    let mut s = alloc::format!(" (hot_rows≤{h}");
5836                    if any_cold {
5837                        s.push_str(&alloc::format!(
5838                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5839                        ));
5840                    }
5841                    s.push(')');
5842                    s
5843                }
5844                _ => " (rows=—)".to_string(),
5845            };
5846            line.push_str(&annot);
5847            continue;
5848        }
5849        // Filter / GroupBy / Having / OrderBy / Limit / Join etc.
5850        line.push_str(" (rows=—)");
5851    }
5852}
5853
5854fn explain_select(stmt: &SelectStatement, engine: &Engine, depth: usize, out: &mut Vec<String>) {
5855    let pad = "  ".repeat(depth);
5856    // 1) Top-level operator label.
5857    let top = if !stmt.ctes.is_empty() {
5858        if stmt.ctes.iter().any(|c| c.recursive) {
5859            "CTEScan (WITH RECURSIVE)"
5860        } else {
5861            "CTEScan (WITH)"
5862        }
5863    } else if !stmt.unions.is_empty() {
5864        "UnionScan"
5865    } else if select_has_window(stmt) {
5866        "WindowAgg"
5867    } else if aggregate::uses_aggregate(stmt) {
5868        "Aggregate"
5869    } else if stmt.distinct {
5870        "Distinct"
5871    } else if stmt.from.is_some() {
5872        "TableScan"
5873    } else {
5874        "Result"
5875    };
5876    out.push(alloc::format!("{pad}{top}"));
5877    let child = "  ".repeat(depth + 1);
5878    // 2) CTE bodies.
5879    for cte in &stmt.ctes {
5880        let head = if cte.recursive {
5881            alloc::format!("{child}CTE (recursive): {}", cte.name)
5882        } else {
5883            alloc::format!("{child}CTE: {}", cte.name)
5884        };
5885        out.push(head);
5886        explain_select(&cte.body, engine, depth + 2, out);
5887    }
5888    // 3) FROM details — primary table + joins, index hits.
5889    if let Some(from) = &stmt.from {
5890        let mut tag = alloc::format!("{child}From: {}", from.primary.name);
5891        if let Some(alias) = &from.primary.alias {
5892            tag.push_str(&alloc::format!(" AS {alias}"));
5893        }
5894        // Try to detect an index-seek opportunity on WHERE against
5895        // the primary table — same heuristic the executor uses.
5896        if let Some(w) = &stmt.where_
5897            && let Some(table) = engine.active_catalog().get(&from.primary.name)
5898        {
5899            let alias = from.primary.alias.as_deref().unwrap_or(&from.primary.name);
5900            let cols = &table.schema().columns;
5901            if try_index_seek(w, cols, engine.active_catalog(), table, alias).is_some() {
5902                tag.push_str(" [index seek]");
5903            } else {
5904                tag.push_str(" [full scan]");
5905            }
5906        } else {
5907            tag.push_str(" [full scan]");
5908        }
5909        out.push(tag);
5910        for j in &from.joins {
5911            let kind = match j.kind {
5912                spg_sql::ast::JoinKind::Inner => "INNER JOIN",
5913                spg_sql::ast::JoinKind::Left => "LEFT JOIN",
5914                spg_sql::ast::JoinKind::Cross => "CROSS JOIN",
5915            };
5916            let mut s = alloc::format!("{child}{kind}: {}", j.table.name);
5917            if let Some(alias) = &j.table.alias {
5918                s.push_str(&alloc::format!(" AS {alias}"));
5919            }
5920            if j.on.is_some() {
5921                s.push_str(" (ON …)");
5922            }
5923            out.push(s);
5924        }
5925    }
5926    // 4) WHERE / GROUP BY / HAVING / ORDER BY / LIMIT / OFFSET.
5927    if let Some(w) = &stmt.where_ {
5928        let mut s = alloc::format!("{child}Filter: {w}");
5929        if expr_has_subquery(w) {
5930            s.push_str(" [subquery]");
5931        }
5932        out.push(s);
5933    }
5934    if let Some(gs) = &stmt.group_by {
5935        let mut parts = Vec::new();
5936        for g in gs {
5937            parts.push(alloc::format!("{g}"));
5938        }
5939        out.push(alloc::format!("{child}GroupBy: {}", parts.join(", ")));
5940    }
5941    if let Some(h) = &stmt.having {
5942        out.push(alloc::format!("{child}Having: {h}"));
5943    }
5944    for o in &stmt.order_by {
5945        let dir = if o.desc { "DESC" } else { "ASC" };
5946        out.push(alloc::format!("{child}OrderBy: {} {dir}", o.expr));
5947    }
5948    if let Some(lim) = stmt.limit {
5949        out.push(alloc::format!("{child}Limit: {lim}"));
5950    }
5951    if let Some(off) = stmt.offset {
5952        out.push(alloc::format!("{child}Offset: {off}"));
5953    }
5954    // 5) Projection — collapse Wildcard or render N items.
5955    if stmt
5956        .items
5957        .iter()
5958        .any(|it| matches!(it, SelectItem::Wildcard))
5959    {
5960        out.push(alloc::format!("{child}Project: *"));
5961    } else {
5962        out.push(alloc::format!(
5963            "{child}Project: {} item(s)",
5964            stmt.items.len()
5965        ));
5966    }
5967    // 6) Recurse into UNION peers.
5968    for (kind, peer) in &stmt.unions {
5969        let label = match kind {
5970            UnionKind::All => "UNION ALL",
5971            UnionKind::Distinct => "UNION",
5972        };
5973        out.push(alloc::format!("{child}{label}"));
5974        explain_select(peer, engine, depth + 2, out);
5975    }
5976}
5977
5978/// v4.23: recognise the engine errors that indicate the inner
5979/// SELECT couldn't be evaluated in isolation because it references
5980/// an outer column — used by `subquery_replacement` to skip
5981/// materialisation and let row-eval handle it instead.
5982fn is_correlation_error(e: &EngineError) -> bool {
5983    matches!(
5984        e,
5985        EngineError::Eval(
5986            eval::EvalError::ColumnNotFound { .. } | eval::EvalError::UnknownQualifier { .. }
5987        )
5988    )
5989}
5990
5991/// v4.23: walk every Expr in `stmt` and replace each Column ref
5992/// that targets the outer scope (qualifier matches the outer
5993/// table alias) with a Literal carrying the outer row's value.
5994/// Conservative: only qualified refs are substituted, so the user
5995/// must write `outer_alias.col` to reference an outer column. This
5996/// matches PG's lexical scoping for correlated subqueries and
5997/// avoids accidentally rebinding inner columns of the same name.
5998fn substitute_outer_columns(stmt: &mut SelectStatement, row: &Row, ctx: &EvalContext<'_>) {
5999    let Some(outer_alias) = ctx.table_alias else {
6000        return;
6001    };
6002    substitute_in_select(stmt, row, ctx, outer_alias);
6003}
6004
6005fn substitute_in_select(
6006    stmt: &mut SelectStatement,
6007    row: &Row,
6008    ctx: &EvalContext<'_>,
6009    outer_alias: &str,
6010) {
6011    for item in &mut stmt.items {
6012        if let SelectItem::Expr { expr, .. } = item {
6013            substitute_in_expr(expr, row, ctx, outer_alias);
6014        }
6015    }
6016    if let Some(w) = &mut stmt.where_ {
6017        substitute_in_expr(w, row, ctx, outer_alias);
6018    }
6019    if let Some(gs) = &mut stmt.group_by {
6020        for g in gs {
6021            substitute_in_expr(g, row, ctx, outer_alias);
6022        }
6023    }
6024    if let Some(h) = &mut stmt.having {
6025        substitute_in_expr(h, row, ctx, outer_alias);
6026    }
6027    for o in &mut stmt.order_by {
6028        substitute_in_expr(&mut o.expr, row, ctx, outer_alias);
6029    }
6030    for (_, peer) in &mut stmt.unions {
6031        substitute_in_select(peer, row, ctx, outer_alias);
6032    }
6033}
6034
6035fn substitute_in_expr(e: &mut Expr, row: &Row, ctx: &EvalContext<'_>, outer_alias: &str) {
6036    if let Expr::Column(c) = e
6037        && let Some(qual) = &c.qualifier
6038        && qual.eq_ignore_ascii_case(outer_alias)
6039    {
6040        // Look up the column's index in the outer schema.
6041        if let Some(idx) = ctx
6042            .columns
6043            .iter()
6044            .position(|sc| sc.name.eq_ignore_ascii_case(&c.name))
6045        {
6046            let v = row.values.get(idx).cloned().unwrap_or(Value::Null);
6047            if let Ok(lit) = value_to_literal_expr(v) {
6048                *e = lit;
6049                return;
6050            }
6051        }
6052    }
6053    match e {
6054        Expr::Binary { lhs, rhs, .. } => {
6055            substitute_in_expr(lhs, row, ctx, outer_alias);
6056            substitute_in_expr(rhs, row, ctx, outer_alias);
6057        }
6058        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6059            substitute_in_expr(expr, row, ctx, outer_alias);
6060        }
6061        Expr::Like { expr, pattern, .. } => {
6062            substitute_in_expr(expr, row, ctx, outer_alias);
6063            substitute_in_expr(pattern, row, ctx, outer_alias);
6064        }
6065        Expr::FunctionCall { args, .. } => {
6066            for a in args {
6067                substitute_in_expr(a, row, ctx, outer_alias);
6068            }
6069        }
6070        Expr::Extract { source, .. } => substitute_in_expr(source, row, ctx, outer_alias),
6071        Expr::WindowFunction {
6072            args,
6073            partition_by,
6074            order_by,
6075            ..
6076        } => {
6077            for a in args {
6078                substitute_in_expr(a, row, ctx, outer_alias);
6079            }
6080            for p in partition_by {
6081                substitute_in_expr(p, row, ctx, outer_alias);
6082            }
6083            for (o, _) in order_by {
6084                substitute_in_expr(o, row, ctx, outer_alias);
6085            }
6086        }
6087        Expr::ScalarSubquery(s) => substitute_in_select(s, row, ctx, outer_alias),
6088        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
6089            substitute_in_select(subquery, row, ctx, outer_alias);
6090        }
6091        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
6092    }
6093}
6094
6095/// v4.22: encode a Row to a comparable byte key for UNION-DISTINCT
6096/// dedup inside the recursive iteration. Crude but deterministic
6097/// — Debug prints embed type discriminants so NULL ≠ "" ≠ 0.
6098fn encode_row_key(row: &Row) -> Vec<u8> {
6099    let mut out = Vec::new();
6100    for v in &row.values {
6101        let s = alloc::format!("{v:?}|");
6102        out.extend_from_slice(s.as_bytes());
6103    }
6104    out
6105}
6106
6107fn select_has_window(stmt: &SelectStatement) -> bool {
6108    for item in &stmt.items {
6109        if let SelectItem::Expr { expr, .. } = item
6110            && expr_has_window(expr)
6111        {
6112            return true;
6113        }
6114    }
6115    false
6116}
6117
6118fn expr_has_window(e: &Expr) -> bool {
6119    match e {
6120        Expr::WindowFunction { .. } => true,
6121        Expr::Binary { lhs, rhs, .. } => expr_has_window(lhs) || expr_has_window(rhs),
6122        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6123            expr_has_window(expr)
6124        }
6125        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_window),
6126        Expr::Like { expr, pattern, .. } => expr_has_window(expr) || expr_has_window(pattern),
6127        Expr::Extract { source, .. } => expr_has_window(source),
6128        Expr::ScalarSubquery(_)
6129        | Expr::Exists { .. }
6130        | Expr::InSubquery { .. }
6131        | Expr::Literal(_)
6132        | Expr::Placeholder(_)
6133        | Expr::Column(_) => false,
6134    }
6135}
6136
6137fn collect_window_nodes(e: &Expr, out: &mut Vec<Expr>) {
6138    if let Expr::WindowFunction { .. } = e {
6139        // Deduplicate by structural equality on the expression
6140        // (cheap because window args + partition + order are
6141        // small). Without dedup we'd recompute identical windows
6142        // once per occurrence in the projection.
6143        if !out.iter().any(|x| x == e) {
6144            out.push(e.clone());
6145        }
6146        return;
6147    }
6148    match e {
6149        // Already handled by the early-return at the top.
6150        Expr::WindowFunction { .. } => unreachable!(),
6151        Expr::Binary { lhs, rhs, .. } => {
6152            collect_window_nodes(lhs, out);
6153            collect_window_nodes(rhs, out);
6154        }
6155        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6156            collect_window_nodes(expr, out);
6157        }
6158        Expr::FunctionCall { args, .. } => {
6159            for a in args {
6160                collect_window_nodes(a, out);
6161            }
6162        }
6163        Expr::Like { expr, pattern, .. } => {
6164            collect_window_nodes(expr, out);
6165            collect_window_nodes(pattern, out);
6166        }
6167        Expr::Extract { source, .. } => collect_window_nodes(source, out),
6168        _ => {}
6169    }
6170}
6171
6172fn rewrite_window_to_columns(e: &mut Expr, window_nodes: &[Expr]) {
6173    if let Expr::WindowFunction { .. } = e
6174        && let Some(idx) = window_nodes.iter().position(|w| w == e)
6175    {
6176        *e = Expr::Column(spg_sql::ast::ColumnName {
6177            qualifier: None,
6178            name: alloc::format!("__win_{idx}"),
6179        });
6180        return;
6181    }
6182    match e {
6183        Expr::Binary { lhs, rhs, .. } => {
6184            rewrite_window_to_columns(lhs, window_nodes);
6185            rewrite_window_to_columns(rhs, window_nodes);
6186        }
6187        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6188            rewrite_window_to_columns(expr, window_nodes);
6189        }
6190        Expr::FunctionCall { args, .. } => {
6191            for a in args {
6192                rewrite_window_to_columns(a, window_nodes);
6193            }
6194        }
6195        Expr::Like { expr, pattern, .. } => {
6196            rewrite_window_to_columns(expr, window_nodes);
6197            rewrite_window_to_columns(pattern, window_nodes);
6198        }
6199        Expr::Extract { source, .. } => rewrite_window_to_columns(source, window_nodes),
6200        _ => {}
6201    }
6202}
6203
6204/// Total order over partition-key tuples. NULL sorts as the
6205/// lowest value (matches the `<` partial order's NULL-last
6206/// behaviour with `INFINITY` flipped).
6207fn partition_key_cmp(a: &[Value], b: &[Value]) -> core::cmp::Ordering {
6208    for (x, y) in a.iter().zip(b.iter()) {
6209        let c = value_cmp(x, y);
6210        if c != core::cmp::Ordering::Equal {
6211            return c;
6212        }
6213    }
6214    a.len().cmp(&b.len())
6215}
6216
6217fn order_key_cmp(a: &[(Value, bool)], b: &[(Value, bool)]) -> core::cmp::Ordering {
6218    for ((va, desc), (vb, _)) in a.iter().zip(b.iter()) {
6219        let c = value_cmp(va, vb);
6220        let c = if *desc { c.reverse() } else { c };
6221        if c != core::cmp::Ordering::Equal {
6222            return c;
6223        }
6224    }
6225    a.len().cmp(&b.len())
6226}
6227
6228#[allow(clippy::match_same_arms)] // explicit arms per type document the supported pairs
6229fn value_cmp(a: &Value, b: &Value) -> core::cmp::Ordering {
6230    use core::cmp::Ordering;
6231    match (a, b) {
6232        (Value::Null, Value::Null) => Ordering::Equal,
6233        (Value::Null, _) => Ordering::Less,
6234        (_, Value::Null) => Ordering::Greater,
6235        (Value::Int(x), Value::Int(y)) => x.cmp(y),
6236        (Value::BigInt(x), Value::BigInt(y)) => x.cmp(y),
6237        (Value::SmallInt(x), Value::SmallInt(y)) => x.cmp(y),
6238        (Value::Text(x), Value::Text(y)) => x.cmp(y),
6239        (Value::Bool(x), Value::Bool(y)) => x.cmp(y),
6240        (Value::Float(x), Value::Float(y)) => x.partial_cmp(y).unwrap_or(Ordering::Equal),
6241        (Value::Date(x), Value::Date(y)) => x.cmp(y),
6242        (Value::Timestamp(x), Value::Timestamp(y)) => x.cmp(y),
6243        // Cross-type compare: fall back to the debug rendering —
6244        // same-partition is the goal, exact order is irrelevant.
6245        _ => alloc::format!("{a:?}").cmp(&alloc::format!("{b:?}")),
6246    }
6247}
6248
6249/// Compute the window function's per-row output for one partition.
6250/// `slice` has (partition key, order key, original-row-index)
6251/// tuples already sorted by order key. `filtered_rows` is the
6252/// full row list indexed by original-row-index. `out_vals` is
6253/// the destination, also indexed by original-row-index.
6254#[allow(
6255    clippy::too_many_arguments,
6256    clippy::cast_possible_truncation,
6257    clippy::cast_possible_wrap,
6258    clippy::cast_precision_loss,
6259    clippy::cast_sign_loss,
6260    clippy::doc_markdown,
6261    clippy::too_many_lines,
6262    clippy::type_complexity,
6263    clippy::match_same_arms
6264)]
6265fn compute_window_partition(
6266    name: &str,
6267    args: &[Expr],
6268    ordered: bool,
6269    frame: Option<&WindowFrame>,
6270    null_treatment: spg_sql::ast::NullTreatment,
6271    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6272    filtered_rows: &[&Row],
6273    ctx: &EvalContext<'_>,
6274    out_vals: &mut [Value],
6275) -> Result<(), EngineError> {
6276    let ignore_nulls = matches!(null_treatment, spg_sql::ast::NullTreatment::Ignore);
6277    let lower = name.to_ascii_lowercase();
6278    match lower.as_str() {
6279        "row_number" => {
6280            for (rank, (_, _, idx)) in slice.iter().enumerate() {
6281                out_vals[*idx] = Value::BigInt((rank + 1) as i64);
6282            }
6283            Ok(())
6284        }
6285        "rank" => {
6286            let mut prev_key: Option<&[(Value, bool)]> = None;
6287            let mut current_rank: i64 = 1;
6288            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6289                if let Some(p) = prev_key
6290                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6291                {
6292                    current_rank = (i + 1) as i64;
6293                }
6294                if prev_key.is_none() {
6295                    current_rank = 1;
6296                }
6297                out_vals[*idx] = Value::BigInt(current_rank);
6298                prev_key = Some(okey.as_slice());
6299            }
6300            Ok(())
6301        }
6302        "dense_rank" => {
6303            let mut prev_key: Option<&[(Value, bool)]> = None;
6304            let mut current_rank: i64 = 0;
6305            for (_, okey, idx) in slice {
6306                if prev_key.is_none_or(|p| order_key_cmp(p, okey) != core::cmp::Ordering::Equal) {
6307                    current_rank += 1;
6308                }
6309                out_vals[*idx] = Value::BigInt(current_rank);
6310                prev_key = Some(okey.as_slice());
6311            }
6312            Ok(())
6313        }
6314        "sum" | "avg" | "min" | "max" | "count" | "count_star" => {
6315            // Pre-evaluate the function arg per row in the slice
6316            // (count_star has no arg).
6317            let arg_values: Vec<Value> = if lower == "count_star" || args.is_empty() {
6318                slice.iter().map(|_| Value::Null).collect()
6319            } else {
6320                slice
6321                    .iter()
6322                    .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6323                    .collect::<Result<_, _>>()
6324                    .map_err(EngineError::Eval)?
6325            };
6326            // v4.20: pick the effective frame. Explicit frame
6327            // overrides the implicit default (running for ordered,
6328            // whole-partition for unordered).
6329            let eff = effective_frame(frame, ordered)?;
6330            #[allow(clippy::needless_range_loop)]
6331            for i in 0..slice.len() {
6332                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6333                let mut sum: f64 = 0.0;
6334                let mut count: i64 = 0;
6335                let mut min_v: Option<f64> = None;
6336                let mut max_v: Option<f64> = None;
6337                let mut row_count: i64 = 0;
6338                if lo <= hi {
6339                    for j in lo..=hi {
6340                        let v = &arg_values[j];
6341                        match lower.as_str() {
6342                            "count_star" => row_count += 1,
6343                            "count" => {
6344                                if !v.is_null() {
6345                                    count += 1;
6346                                }
6347                            }
6348                            _ => {
6349                                if let Some(x) = value_to_f64(v) {
6350                                    sum += x;
6351                                    count += 1;
6352                                    min_v = Some(min_v.map_or(x, |m| m.min(x)));
6353                                    max_v = Some(max_v.map_or(x, |m| m.max(x)));
6354                                }
6355                            }
6356                        }
6357                    }
6358                }
6359                let value = match lower.as_str() {
6360                    "count_star" => Value::BigInt(row_count),
6361                    "count" => Value::BigInt(count),
6362                    "sum" => Value::Float(sum),
6363                    "avg" => {
6364                        if count == 0 {
6365                            Value::Null
6366                        } else {
6367                            Value::Float(sum / count as f64)
6368                        }
6369                    }
6370                    "min" => min_v.map_or(Value::Null, Value::Float),
6371                    "max" => max_v.map_or(Value::Null, Value::Float),
6372                    _ => unreachable!(),
6373                };
6374                let (_, _, idx) = &slice[i];
6375                out_vals[*idx] = value;
6376            }
6377            Ok(())
6378        }
6379        "lag" | "lead" => {
6380            // lag(expr [, offset [, default]])
6381            // lead(expr [, offset [, default]])
6382            if args.is_empty() {
6383                return Err(EngineError::Unsupported(alloc::format!(
6384                    "{lower}() requires at least one argument"
6385                )));
6386            }
6387            let offset: i64 = if args.len() >= 2 {
6388                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6389                    .map_err(EngineError::Eval)?;
6390                match v {
6391                    Value::SmallInt(n) => i64::from(n),
6392                    Value::Int(n) => i64::from(n),
6393                    Value::BigInt(n) => n,
6394                    _ => {
6395                        return Err(EngineError::Unsupported(alloc::format!(
6396                            "{lower}() offset must be integer"
6397                        )));
6398                    }
6399                }
6400            } else {
6401                1
6402            };
6403            let default: Value = if args.len() >= 3 {
6404                eval::eval_expr(&args[2], filtered_rows[slice[0].2], ctx)
6405                    .map_err(EngineError::Eval)?
6406            } else {
6407                Value::Null
6408            };
6409            let values: Vec<Value> = slice
6410                .iter()
6411                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6412                .collect::<Result<_, _>>()
6413                .map_err(EngineError::Eval)?;
6414            let n = slice.len();
6415            for (i, (_, _, idx)) in slice.iter().enumerate() {
6416                let signed_offset = if lower == "lag" { -offset } else { offset };
6417                let v = if ignore_nulls {
6418                    // v6.4.2 — IGNORE NULLS: walk in the offset direction
6419                    // skipping NULL values; the `offset`-th non-NULL
6420                    // encountered is the result.
6421                    let step: i64 = if signed_offset >= 0 { 1 } else { -1 };
6422                    let needed: i64 = signed_offset.abs();
6423                    if needed == 0 {
6424                        values[i].clone()
6425                    } else {
6426                        let mut j: i64 = i as i64;
6427                        let mut hits: i64 = 0;
6428                        let mut found: Option<Value> = None;
6429                        loop {
6430                            j += step;
6431                            if j < 0 || j >= n as i64 {
6432                                break;
6433                            }
6434                            #[allow(clippy::cast_sign_loss)]
6435                            let v = &values[j as usize];
6436                            if !v.is_null() {
6437                                hits += 1;
6438                                if hits == needed {
6439                                    found = Some(v.clone());
6440                                    break;
6441                                }
6442                            }
6443                        }
6444                        found.unwrap_or_else(|| default.clone())
6445                    }
6446                } else {
6447                    let target_signed = i64::try_from(i).unwrap_or(i64::MAX) + signed_offset;
6448                    if target_signed < 0
6449                        || target_signed >= i64::try_from(n).unwrap_or(i64::MAX)
6450                    {
6451                        default.clone()
6452                    } else {
6453                        #[allow(clippy::cast_sign_loss)]
6454                        {
6455                            values[target_signed as usize].clone()
6456                        }
6457                    }
6458                };
6459                out_vals[*idx] = v;
6460            }
6461            Ok(())
6462        }
6463        "first_value" | "last_value" | "nth_value" => {
6464            if args.is_empty() {
6465                return Err(EngineError::Unsupported(alloc::format!(
6466                    "{lower}() requires at least one argument"
6467                )));
6468            }
6469            let values: Vec<Value> = slice
6470                .iter()
6471                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6472                .collect::<Result<_, _>>()
6473                .map_err(EngineError::Eval)?;
6474            let nth: usize = if lower == "nth_value" {
6475                if args.len() < 2 {
6476                    return Err(EngineError::Unsupported(
6477                        "nth_value() requires (expr, n)".into(),
6478                    ));
6479                }
6480                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6481                    .map_err(EngineError::Eval)?;
6482                let raw = match v {
6483                    Value::SmallInt(n) => i64::from(n),
6484                    Value::Int(n) => i64::from(n),
6485                    Value::BigInt(n) => n,
6486                    _ => {
6487                        return Err(EngineError::Unsupported(
6488                            "nth_value() n must be integer".into(),
6489                        ));
6490                    }
6491                };
6492                if raw < 1 {
6493                    return Err(EngineError::Unsupported(
6494                        "nth_value() n must be >= 1".into(),
6495                    ));
6496                }
6497                #[allow(clippy::cast_sign_loss)]
6498                {
6499                    raw as usize
6500                }
6501            } else {
6502                0
6503            };
6504            let eff = effective_frame(frame, ordered)?;
6505            for i in 0..slice.len() {
6506                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6507                let (_, _, idx) = &slice[i];
6508                let v = if lo > hi {
6509                    Value::Null
6510                } else if ignore_nulls && matches!(lower.as_str(), "first_value" | "last_value") {
6511                    // v6.4.2 — IGNORE NULLS: skip NULL cells when
6512                    // selecting the boundary value within the frame.
6513                    if lower == "first_value" {
6514                        (lo..=hi)
6515                            .find_map(|j| {
6516                                let v = &values[j];
6517                                (!v.is_null()).then(|| v.clone())
6518                            })
6519                            .unwrap_or(Value::Null)
6520                    } else {
6521                        (lo..=hi)
6522                            .rev()
6523                            .find_map(|j| {
6524                                let v = &values[j];
6525                                (!v.is_null()).then(|| v.clone())
6526                            })
6527                            .unwrap_or(Value::Null)
6528                    }
6529                } else {
6530                    match lower.as_str() {
6531                        "first_value" => values[lo].clone(),
6532                        "last_value" => values[hi].clone(),
6533                        "nth_value" => {
6534                            let pos = lo + nth - 1;
6535                            if pos > hi {
6536                                Value::Null
6537                            } else {
6538                                values[pos].clone()
6539                            }
6540                        }
6541                        _ => unreachable!(),
6542                    }
6543                };
6544                out_vals[*idx] = v;
6545            }
6546            Ok(())
6547        }
6548        "ntile" => {
6549            if args.is_empty() {
6550                return Err(EngineError::Unsupported(
6551                    "ntile(n) requires an integer argument".into(),
6552                ));
6553            }
6554            let v = eval::eval_expr(&args[0], filtered_rows[slice[0].2], ctx)
6555                .map_err(EngineError::Eval)?;
6556            let bucket_count: i64 = match v {
6557                Value::SmallInt(n) => i64::from(n),
6558                Value::Int(n) => i64::from(n),
6559                Value::BigInt(n) => n,
6560                _ => {
6561                    return Err(EngineError::Unsupported(
6562                        "ntile() argument must be integer".into(),
6563                    ));
6564                }
6565            };
6566            if bucket_count < 1 {
6567                return Err(EngineError::Unsupported(
6568                    "ntile() argument must be >= 1".into(),
6569                ));
6570            }
6571            #[allow(clippy::cast_sign_loss)]
6572            let buckets = bucket_count as usize;
6573            let n = slice.len();
6574            // Each bucket gets `base` rows; the first `extras` buckets
6575            // get one extra. PG semantics.
6576            let base = n / buckets;
6577            let extras = n % buckets;
6578            let mut bucket: usize = 1;
6579            let mut remaining_in_bucket = if extras > 0 { base + 1 } else { base };
6580            let mut buckets_with_extra_remaining = extras;
6581            for (_, _, idx) in slice {
6582                if remaining_in_bucket == 0 {
6583                    bucket += 1;
6584                    buckets_with_extra_remaining = buckets_with_extra_remaining.saturating_sub(1);
6585                    remaining_in_bucket = if buckets_with_extra_remaining > 0 {
6586                        base + 1
6587                    } else {
6588                        base
6589                    };
6590                    // Edge: if base==0 and extras==0, all rows fit;
6591                    // shouldn't reach here, but guard anyway.
6592                    if remaining_in_bucket == 0 {
6593                        remaining_in_bucket = 1;
6594                    }
6595                }
6596                out_vals[*idx] = Value::BigInt(i64::try_from(bucket).unwrap_or(i64::MAX));
6597                remaining_in_bucket -= 1;
6598            }
6599            Ok(())
6600        }
6601        "percent_rank" => {
6602            // (rank - 1) / (n - 1) where rank is the standard RANK().
6603            // Single-row partitions get 0.
6604            let n = slice.len();
6605            let mut prev_key: Option<&[(Value, bool)]> = None;
6606            let mut current_rank: i64 = 1;
6607            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6608                if let Some(p) = prev_key
6609                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6610                {
6611                    current_rank = i64::try_from(i + 1).unwrap_or(i64::MAX);
6612                }
6613                if prev_key.is_none() {
6614                    current_rank = 1;
6615                }
6616                #[allow(clippy::cast_precision_loss)]
6617                let pr = if n <= 1 {
6618                    0.0
6619                } else {
6620                    (current_rank - 1) as f64 / (n - 1) as f64
6621                };
6622                out_vals[*idx] = Value::Float(pr);
6623                prev_key = Some(okey.as_slice());
6624            }
6625            Ok(())
6626        }
6627        "cume_dist" => {
6628            // # rows up to and including this row's peer group / n.
6629            let n = slice.len();
6630            // First pass: find peer-group-end rank for each row.
6631            for i in 0..slice.len() {
6632                let peer_end = peer_group_end(slice, i);
6633                #[allow(clippy::cast_precision_loss)]
6634                let cd = (peer_end + 1) as f64 / n as f64;
6635                let (_, _, idx) = &slice[i];
6636                out_vals[*idx] = Value::Float(cd);
6637            }
6638            Ok(())
6639        }
6640        other => Err(EngineError::Unsupported(alloc::format!(
6641            "window function {other:?} not supported (v4.21: row_number/rank/dense_rank/sum/avg/count/min/max/lag/lead/first_value/last_value/nth_value/ntile/percent_rank/cume_dist)"
6642        ))),
6643    }
6644}
6645
6646/// v4.20: resolve the user-provided frame down to a normalised
6647/// `(kind, start, end)`. `None` means default — derive from
6648/// `ordered`: ordered ⇒ RANGE UNBOUNDED PRECEDING AND CURRENT ROW,
6649/// unordered ⇒ ROWS UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
6650/// Single-bound shorthand (e.g. `ROWS 5 PRECEDING`) normalises
6651/// end → CURRENT ROW per the PG spec.
6652fn effective_frame(
6653    frame: Option<&WindowFrame>,
6654    ordered: bool,
6655) -> Result<(FrameKind, FrameBound, FrameBound), EngineError> {
6656    match frame {
6657        None => {
6658            if ordered {
6659                Ok((
6660                    FrameKind::Range,
6661                    FrameBound::UnboundedPreceding,
6662                    FrameBound::CurrentRow,
6663                ))
6664            } else {
6665                Ok((
6666                    FrameKind::Rows,
6667                    FrameBound::UnboundedPreceding,
6668                    FrameBound::UnboundedFollowing,
6669                ))
6670            }
6671        }
6672        Some(fr) => {
6673            let end = fr.end.clone().unwrap_or(FrameBound::CurrentRow);
6674            // Reject start > end (a few impossible combinations).
6675            if matches!(fr.start, FrameBound::UnboundedFollowing)
6676                || matches!(end, FrameBound::UnboundedPreceding)
6677            {
6678                return Err(EngineError::Unsupported(alloc::format!(
6679                    "invalid frame: start={:?} end={:?}",
6680                    fr.start,
6681                    end
6682                )));
6683            }
6684            // RANGE OFFSET PRECEDING / FOLLOWING needs value-typed
6685            // arithmetic on the ORDER BY key (e.g. `RANGE BETWEEN
6686            // INTERVAL '1 day' PRECEDING AND CURRENT ROW`). Not
6687            // implemented in v4.20.
6688            if fr.kind == FrameKind::Range
6689                && (matches!(
6690                    fr.start,
6691                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6692                ) || matches!(
6693                    end,
6694                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6695                ))
6696            {
6697                return Err(EngineError::Unsupported(
6698                    "RANGE with explicit offset bounds is not supported (v4.20: only UNBOUNDED / CURRENT ROW for RANGE)".into(),
6699                ));
6700            }
6701            Ok((fr.kind, fr.start.clone(), end))
6702        }
6703    }
6704}
6705
6706/// Compute `(lo, hi)` row-index bounds inside the partition slice
6707/// for the row at position `i`. Inclusive, clamped to
6708/// `[0, slice.len()-1]`. Empty result if `lo > hi`.
6709#[allow(clippy::type_complexity)]
6710fn frame_bounds_for_row(
6711    eff: &(FrameKind, FrameBound, FrameBound),
6712    i: usize,
6713    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6714) -> (usize, usize) {
6715    let (kind, start, end) = eff;
6716    let n = slice.len();
6717    let last = n.saturating_sub(1);
6718    let (mut lo, mut hi) = match kind {
6719        FrameKind::Rows => {
6720            let lo = match start {
6721                FrameBound::UnboundedPreceding => 0,
6722                FrameBound::OffsetPreceding(k) => {
6723                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6724                    i.saturating_sub(k)
6725                }
6726                FrameBound::CurrentRow => i,
6727                FrameBound::OffsetFollowing(k) => {
6728                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6729                    i.saturating_add(k).min(last)
6730                }
6731                FrameBound::UnboundedFollowing => last,
6732            };
6733            let hi = match end {
6734                FrameBound::UnboundedPreceding => 0,
6735                FrameBound::OffsetPreceding(k) => {
6736                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6737                    i.saturating_sub(k)
6738                }
6739                FrameBound::CurrentRow => i,
6740                FrameBound::OffsetFollowing(k) => {
6741                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6742                    i.saturating_add(k).min(last)
6743                }
6744                FrameBound::UnboundedFollowing => last,
6745            };
6746            (lo, hi)
6747        }
6748        FrameKind::Range => {
6749            // RANGE bounds are peer-aware. With only UNBOUNDED and
6750            // CURRENT ROW supported (rejected at effective_frame for
6751            // explicit offsets), the start/end map to the
6752            // partition's full extent at the same-order-key peer
6753            // group boundary.
6754            let lo = match start {
6755                FrameBound::UnboundedPreceding => 0,
6756                FrameBound::CurrentRow => peer_group_start(slice, i),
6757                FrameBound::UnboundedFollowing => last,
6758                _ => unreachable!("offset bounds rejected for RANGE"),
6759            };
6760            let hi = match end {
6761                FrameBound::UnboundedPreceding => 0,
6762                FrameBound::CurrentRow => peer_group_end(slice, i),
6763                FrameBound::UnboundedFollowing => last,
6764                _ => unreachable!("offset bounds rejected for RANGE"),
6765            };
6766            (lo, hi)
6767        }
6768    };
6769    if hi >= n {
6770        hi = last;
6771    }
6772    if lo >= n {
6773        lo = last;
6774    }
6775    (lo, hi)
6776}
6777
6778/// Find the inclusive index of the first row with the same ORDER
6779/// BY key as `slice[i]`. Slice is already sorted by partition then
6780/// order, so peers are contiguous.
6781#[allow(clippy::type_complexity)]
6782fn peer_group_start(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6783    let key = &slice[i].1;
6784    let mut j = i;
6785    while j > 0 && order_key_cmp(&slice[j - 1].1, key) == core::cmp::Ordering::Equal {
6786        j -= 1;
6787    }
6788    j
6789}
6790
6791/// Find the inclusive index of the last row with the same ORDER
6792/// BY key as `slice[i]`.
6793#[allow(clippy::type_complexity)]
6794fn peer_group_end(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6795    let key = &slice[i].1;
6796    let mut j = i;
6797    while j + 1 < slice.len() && order_key_cmp(&slice[j + 1].1, key) == core::cmp::Ordering::Equal {
6798        j += 1;
6799    }
6800    j
6801}
6802
6803fn value_to_f64(v: &Value) -> Option<f64> {
6804    match v {
6805        Value::SmallInt(n) => Some(f64::from(*n)),
6806        Value::Int(n) => Some(f64::from(*n)),
6807        #[allow(clippy::cast_precision_loss)]
6808        Value::BigInt(n) => Some(*n as f64),
6809        Value::Float(x) => Some(*x),
6810        _ => None,
6811    }
6812}
6813
6814/// Quick scan for any subquery-bearing node in a SELECT's WHERE /
6815/// projection / `order_by` — saves cloning the AST when there are
6816/// none (the common case).
6817fn expr_tree_has_subquery(stmt: &SelectStatement) -> bool {
6818    let mut any = false;
6819    for item in &stmt.items {
6820        if let SelectItem::Expr { expr, .. } = item {
6821            any = any || expr_has_subquery(expr);
6822        }
6823    }
6824    if let Some(w) = &stmt.where_ {
6825        any = any || expr_has_subquery(w);
6826    }
6827    if let Some(h) = &stmt.having {
6828        any = any || expr_has_subquery(h);
6829    }
6830    for o in &stmt.order_by {
6831        any = any || expr_has_subquery(&o.expr);
6832    }
6833    for (_, peer) in &stmt.unions {
6834        any = any || expr_tree_has_subquery(peer);
6835    }
6836    any
6837}
6838
6839fn expr_has_subquery(e: &Expr) -> bool {
6840    match e {
6841        Expr::ScalarSubquery(_) | Expr::Exists { .. } | Expr::InSubquery { .. } => true,
6842        Expr::Binary { lhs, rhs, .. } => expr_has_subquery(lhs) || expr_has_subquery(rhs),
6843        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6844            expr_has_subquery(expr)
6845        }
6846        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_subquery),
6847        Expr::Like { expr, pattern, .. } => expr_has_subquery(expr) || expr_has_subquery(pattern),
6848        Expr::Extract { source, .. } => expr_has_subquery(source),
6849        Expr::WindowFunction {
6850            args,
6851            partition_by,
6852            order_by,
6853            ..
6854        } => {
6855            args.iter().any(expr_has_subquery)
6856                || partition_by.iter().any(expr_has_subquery)
6857                || order_by.iter().any(|(e, _)| expr_has_subquery(e))
6858        }
6859        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
6860    }
6861}
6862
6863/// v4.10 helper: materialise a runtime `Value` back into an AST
6864/// `Expr::Literal` for the subquery-rewrite path. Supports the
6865/// types `Literal` can represent (Integer / Float / Text / Bool /
6866/// Null). Date / Timestamp / Numeric / Vector / Interval / JSON
6867/// would lose precision through Literal and aren't supported in
6868/// uncorrelated-subquery results; they error with a clear hint.
6869fn value_to_literal_expr(v: Value) -> Result<Expr, EngineError> {
6870    let lit = match v {
6871        Value::Null => Literal::Null,
6872        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
6873        Value::Int(n) => Literal::Integer(i64::from(n)),
6874        Value::BigInt(n) => Literal::Integer(n),
6875        Value::Float(x) => Literal::Float(x),
6876        Value::Text(s) | Value::Json(s) => Literal::String(s),
6877        Value::Bool(b) => Literal::Bool(b),
6878        other => {
6879            return Err(EngineError::Unsupported(alloc::format!(
6880                "subquery result type {:?} not yet materialisable; cast to text or integer in the inner SELECT",
6881                other.data_type()
6882            )));
6883        }
6884    };
6885    Ok(Expr::Literal(lit))
6886}
6887
6888/// v6.1.1 — walk the prepared `Statement` AST and replace every
6889/// `Expr::Placeholder(n)` with `Expr::Literal(value_to_literal(
6890/// params[n-1]))`. The dispatch downstream sees a `Statement`
6891/// indistinguishable from a simple-query parse, so the exec path
6892/// stays unchanged.
6893///
6894/// Errors fall into one shape: a `$N` references past the bound
6895/// `params.len()`. Out-of-range happens when the Bind didn't
6896/// supply enough values; pgwire surfaces this as a protocol error
6897/// to the client.
6898fn substitute_placeholders(stmt: &mut Statement, params: &[Value]) -> Result<(), EngineError> {
6899    match stmt {
6900        Statement::Select(s) => substitute_select(s, params)?,
6901        Statement::Insert(ins) => {
6902            for row in &mut ins.rows {
6903                for e in row {
6904                    substitute_expr(e, params)?;
6905                }
6906            }
6907        }
6908        Statement::Update(u) => {
6909            for (_, e) in &mut u.assignments {
6910                substitute_expr(e, params)?;
6911            }
6912            if let Some(w) = &mut u.where_ {
6913                substitute_expr(w, params)?;
6914            }
6915        }
6916        Statement::Delete(d) => {
6917            if let Some(w) = &mut d.where_ {
6918                substitute_expr(w, params)?;
6919            }
6920        }
6921        Statement::Explain(e) => substitute_select(&mut e.inner, params)?,
6922        // Other statements (CREATE / BEGIN / SHOW / …) have no
6923        // expression slots; no walk needed.
6924        _ => {}
6925    }
6926    Ok(())
6927}
6928
6929fn substitute_select(
6930    s: &mut SelectStatement,
6931    params: &[Value],
6932) -> Result<(), EngineError> {
6933    for item in &mut s.items {
6934        if let SelectItem::Expr { expr, .. } = item {
6935            substitute_expr(expr, params)?;
6936        }
6937    }
6938    if let Some(w) = &mut s.where_ {
6939        substitute_expr(w, params)?;
6940    }
6941    if let Some(gs) = &mut s.group_by {
6942        for g in gs {
6943            substitute_expr(g, params)?;
6944        }
6945    }
6946    if let Some(h) = &mut s.having {
6947        substitute_expr(h, params)?;
6948    }
6949    for o in &mut s.order_by {
6950        substitute_expr(&mut o.expr, params)?;
6951    }
6952    for (_, peer) in &mut s.unions {
6953        substitute_select(peer, params)?;
6954    }
6955    // v7.9.24 — LIMIT $N / OFFSET $N placeholder resolution.
6956    // mailrs H2. After this pass each LIMIT/OFFSET that was a
6957    // Placeholder is rewritten to Literal so the existing
6958    // `LimitExpr::as_literal` path consumes a concrete u32.
6959    if let Some(le) = s.limit {
6960        s.limit = Some(resolve_limit_placeholder(le, params)?);
6961    }
6962    if let Some(le) = s.offset {
6963        s.offset = Some(resolve_limit_placeholder(le, params)?);
6964    }
6965    Ok(())
6966}
6967
6968fn resolve_limit_placeholder(
6969    le: spg_sql::ast::LimitExpr,
6970    params: &[Value],
6971) -> Result<spg_sql::ast::LimitExpr, EngineError> {
6972    use spg_sql::ast::LimitExpr;
6973    match le {
6974        LimitExpr::Literal(_) => Ok(le),
6975        LimitExpr::Placeholder(n) => {
6976            let idx = usize::from(n).saturating_sub(1);
6977            let v = params.get(idx).ok_or_else(|| {
6978                EngineError::Eval(EvalError::PlaceholderOutOfRange {
6979                    n,
6980                    bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
6981                })
6982            })?;
6983            let int = match v {
6984                Value::SmallInt(x) => Some(i64::from(*x)),
6985                Value::Int(x) => Some(i64::from(*x)),
6986                Value::BigInt(x) => Some(*x),
6987                _ => None,
6988            }
6989            .ok_or_else(|| {
6990                EngineError::Unsupported(alloc::format!(
6991                    "LIMIT/OFFSET ${n} bound to non-integer {v:?}"
6992                ))
6993            })?;
6994            if int < 0 {
6995                return Err(EngineError::Unsupported(alloc::format!(
6996                    "LIMIT/OFFSET ${n} bound to negative value {int}"
6997                )));
6998            }
6999            let bounded = u32::try_from(int).map_err(|_| {
7000                EngineError::Unsupported(alloc::format!(
7001                    "LIMIT/OFFSET ${n} value {int} exceeds u32 range"
7002                ))
7003            })?;
7004            Ok(LimitExpr::Literal(bounded))
7005        }
7006    }
7007}
7008
7009fn substitute_expr(e: &mut Expr, params: &[Value]) -> Result<(), EngineError> {
7010    if let Expr::Placeholder(n) = e {
7011        let idx = usize::from(*n).saturating_sub(1);
7012        let v = params.get(idx).ok_or_else(|| {
7013            EngineError::Eval(EvalError::PlaceholderOutOfRange {
7014                n: *n,
7015                bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
7016            })
7017        })?;
7018        *e = Expr::Literal(value_to_literal(v.clone()));
7019        return Ok(());
7020    }
7021    match e {
7022        Expr::Binary { lhs, rhs, .. } => {
7023            substitute_expr(lhs, params)?;
7024            substitute_expr(rhs, params)?;
7025        }
7026        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7027            substitute_expr(expr, params)?;
7028        }
7029        Expr::FunctionCall { args, .. } => {
7030            for a in args {
7031                substitute_expr(a, params)?;
7032            }
7033        }
7034        Expr::Like { expr, pattern, .. } => {
7035            substitute_expr(expr, params)?;
7036            substitute_expr(pattern, params)?;
7037        }
7038        Expr::Extract { source, .. } => substitute_expr(source, params)?,
7039        Expr::ScalarSubquery(s) => substitute_select(s, params)?,
7040        Expr::Exists { subquery, .. } => substitute_select(subquery, params)?,
7041        Expr::InSubquery { expr, subquery, .. } => {
7042            substitute_expr(expr, params)?;
7043            substitute_select(subquery, params)?;
7044        }
7045        Expr::WindowFunction {
7046            args,
7047            partition_by,
7048            order_by,
7049            ..
7050        } => {
7051            for a in args {
7052                substitute_expr(a, params)?;
7053            }
7054            for p in partition_by {
7055                substitute_expr(p, params)?;
7056            }
7057            for (e, _) in order_by {
7058                substitute_expr(e, params)?;
7059            }
7060        }
7061        Expr::Literal(_) | Expr::Column(_) => {}
7062        // Already handled above.
7063        Expr::Placeholder(_) => unreachable!("Placeholder handled at top of fn"),
7064    }
7065    Ok(())
7066}
7067
// NOTE(review): orphaned documentation. The paragraph below describes
// the `value_to_literal` helper (called from `substitute_expr`), which
// is not defined at this position. It is kept as a plain comment so it
// does not get merged into the rustdoc of the next item.
//
// v6.1.1 — convert a runtime `Value` into the closest matching
// `Literal` for the substitute walker. Lossless for the simple
// scalars (Int / Float / Text / Bool); Numeric / Date / Timestamp
// / Json / Interval render as their canonical text form so the
// downstream `coerce_value` can re-parse against the target column
// type. SQ8 / HalfVector cells are NOT expected as bind params;
// pgwire's Bind decodes vector params to the f32 representation
// before they reach this helper.
7076/// v6.2.0 — total ordering on `Value`s used by ANALYZE to sort a
7077/// column's non-NULL sample before histogram building. Cross-type
7078/// pairs (Int vs Float, Date vs Timestamp, …) compare via the
7079/// same widening the eval-side `compare` operator uses; everything
7080/// else (the genuinely-incompatible pairs) falls back to ordering
7081/// by canonical string form so the sort is still total + stable.
7082/// Vector / SQ8 / Half / Json / Numeric / Interval values reach
7083/// here only via the string-fallback path because vector columns
7084/// are filtered out upstream.
7085fn sort_values_for_histogram(a: &Value, b: &Value) -> core::cmp::Ordering {
7086    use core::cmp::Ordering;
7087    match (a, b) {
7088        (Value::SmallInt(a), Value::SmallInt(b)) => a.cmp(b),
7089        (Value::Int(a), Value::Int(b)) => a.cmp(b),
7090        (Value::BigInt(a), Value::BigInt(b)) => a.cmp(b),
7091        (Value::SmallInt(a), Value::Int(b)) => i32::from(*a).cmp(b),
7092        (Value::Int(a), Value::SmallInt(b)) => a.cmp(&i32::from(*b)),
7093        (Value::Int(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
7094        (Value::BigInt(a), Value::Int(b)) => a.cmp(&i64::from(*b)),
7095        (Value::SmallInt(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
7096        (Value::BigInt(a), Value::SmallInt(b)) => a.cmp(&i64::from(*b)),
7097        (Value::Float(a), Value::Float(b)) => a.partial_cmp(b).unwrap_or(Ordering::Equal),
7098        (Value::Text(a), Value::Text(b)) | (Value::Json(a), Value::Json(b)) => a.cmp(b),
7099        (Value::Bool(a), Value::Bool(b)) => a.cmp(b),
7100        (Value::Date(a), Value::Date(b)) => a.cmp(b),
7101        (Value::Timestamp(a), Value::Timestamp(b)) => a.cmp(b),
7102        // Mixed numeric/float — widen to f64 and compare.
7103        (Value::SmallInt(n), Value::Float(x)) => {
7104            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
7105        }
7106        (Value::Float(x), Value::SmallInt(n)) => {
7107            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
7108        }
7109        (Value::Int(n), Value::Float(x)) => {
7110            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
7111        }
7112        (Value::Float(x), Value::Int(n)) => {
7113            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
7114        }
7115        (Value::BigInt(n), Value::Float(x)) => {
7116            #[allow(clippy::cast_precision_loss)]
7117            let nf = *n as f64;
7118            nf.partial_cmp(x).unwrap_or(Ordering::Equal)
7119        }
7120        (Value::Float(x), Value::BigInt(n)) => {
7121            #[allow(clippy::cast_precision_loss)]
7122            let nf = *n as f64;
7123            x.partial_cmp(&nf).unwrap_or(Ordering::Equal)
7124        }
7125        // Cross-type fallback: lexicographic on canonical form.
7126        // Total + stable so the sort is well-defined.
7127        _ => canonical_value_repr(a).cmp(&canonical_value_repr(b)),
7128    }
7129}
7130
/// v6.2.0 — format histogram bounds as a bracketed, comma-separated
/// list (`[v0, v1, ...]`).
///
/// A bound is emitted verbatim unless it is empty or contains one of
/// `,`, `[`, `]`, `"`. Those bounds are wrapped in double quotes, with
/// every `"` and `\` inside preceded by a backslash. Bounds that need no
/// quoting are copied unchanged, including any backslashes they contain.
fn render_histogram_bounds(bounds: &[alloc::string::String]) -> alloc::string::String {
    // Characters that would make an unquoted bound ambiguous in the list.
    const QUOTE_TRIGGERS: [char; 4] = [',', '[', ']', '"'];

    let mut rendered = alloc::string::String::with_capacity(bounds.len() * 8 + 2);
    rendered.push('[');
    let mut is_first = true;
    for bound in bounds {
        if !is_first {
            rendered.push_str(", ");
        }
        is_first = false;

        // Plain bounds go out as-is.
        if !bound.is_empty() && !bound.contains(QUOTE_TRIGGERS) {
            rendered.push_str(bound);
            continue;
        }

        // Quoted form: escape the quote and the escape character itself.
        rendered.push('"');
        for ch in bound.chars() {
            if matches!(ch, '"' | '\\') {
                rendered.push('\\');
            }
            rendered.push(ch);
        }
        rendered.push('"');
    }
    rendered.push(']');
    rendered
}
7161
7162/// v6.2.0 — canonical textual form of a `Value` for histogram
7163/// bound storage. Strings used by ANALYZE for sort + bound output.
7164/// INT / BIGINT → decimal; FLOAT → shortest-round-trip via
7165/// `{:?}`; TEXT pass-through; BOOL → `t` / `f`; DATE / TIMESTAMP →
7166/// the same form `format_date` / `format_timestamp` produce for
7167/// SQL Display. Vector / SQ8 / Half / Json / Numeric / Interval
7168/// reach this only via a non-Vector column (vector columns are
7169/// skipped upstream); they fall back to a Debug-derived form so
7170/// stats still serialise without crashing.
7171pub(crate) fn canonical_value_repr(v: &Value) -> alloc::string::String {
7172    match v {
7173        Value::Null => "NULL".to_string(),
7174        Value::SmallInt(n) => alloc::format!("{n}"),
7175        Value::Int(n) => alloc::format!("{n}"),
7176        Value::BigInt(n) => alloc::format!("{n}"),
7177        Value::Float(x) => alloc::format!("{x:?}"),
7178        Value::Text(s) | Value::Json(s) => s.clone(),
7179        Value::Bool(b) => if *b { "t" } else { "f" }.to_string(),
7180        Value::Date(d) => eval::format_date(*d),
7181        Value::Timestamp(t) => eval::format_timestamp(*t),
7182        Value::Interval { months, micros } => eval::format_interval(*months, *micros),
7183        Value::Numeric { scaled, scale } => eval::format_numeric(*scaled, *scale),
7184        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
7185            // Unreachable in practice (vector columns are filtered
7186            // out before this). Defensive fallback so a future
7187            // vector-stats path doesn't crash.
7188            alloc::format!("{v:?}")
7189        }
7190        // v7.5.0 — Value is #[non_exhaustive] for downstream
7191        // forward-compat. Future variants fall through to Debug
7192        // form here (same shape as the vector fallback above).
7193        _ => alloc::format!("{v:?}"),
7194    }
7195}
7196
/// v6.2.0 — reports whether `_name` is an engine-managed table that a
/// bare `ANALYZE` (no target) should skip.
///
/// Currently a placeholder: the argument is ignored and the answer is
/// always `false`, so no table is ever classified as internal.
const fn is_internal_table_name(_name: &str) -> bool {
    false
}
7206
7207fn value_to_literal(v: Value) -> Literal {
7208    match v {
7209        Value::Null => Literal::Null,
7210        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
7211        Value::Int(n) => Literal::Integer(i64::from(n)),
7212        Value::BigInt(n) => Literal::Integer(n),
7213        Value::Float(x) => Literal::Float(x),
7214        Value::Text(s) | Value::Json(s) => Literal::String(s),
7215        Value::Bool(b) => Literal::Bool(b),
7216        Value::Vector(v) => Literal::Vector(v),
7217        Value::Numeric { scaled, scale } => {
7218            Literal::String(eval::format_numeric(scaled, scale))
7219        }
7220        Value::Date(d) => Literal::String(eval::format_date(d)),
7221        Value::Timestamp(t) => Literal::String(eval::format_timestamp(t)),
7222        Value::Interval { months, micros } => Literal::Interval {
7223            months,
7224            micros,
7225            text: eval::format_interval(months, micros),
7226        },
7227        // SQ8 / halfvec cells dequantise to f32 before reaching the
7228        // substitute walker; pgwire's Bind path handles that.
7229        Value::Sq8Vector(q) => Literal::Vector(spg_storage::quantize::dequantize(&q)),
7230        Value::HalfVector(h) => Literal::Vector(h.to_f32_vec()),
7231        // v7.5.0 — Value is #[non_exhaustive]; future variants
7232        // render as Debug-form String literal until explicit
7233        // mapping is added.
7234        v => Literal::String(alloc::format!("{v:?}")),
7235    }
7236}
7237
7238fn rewrite_clock_calls(stmt: &mut Statement, now_micros: Option<i64>) {
7239    let Some(now) = now_micros else {
7240        return;
7241    };
7242    match stmt {
7243        Statement::Select(s) => rewrite_select_clock(s, now),
7244        Statement::Insert(ins) => {
7245            for row in &mut ins.rows {
7246                for e in row {
7247                    rewrite_expr_clock(e, now);
7248                }
7249            }
7250        }
7251        _ => {}
7252    }
7253}
7254
7255fn rewrite_select_clock(s: &mut SelectStatement, now: i64) {
7256    for item in &mut s.items {
7257        if let SelectItem::Expr { expr, .. } = item {
7258            rewrite_expr_clock(expr, now);
7259        }
7260    }
7261    if let Some(w) = &mut s.where_ {
7262        rewrite_expr_clock(w, now);
7263    }
7264    if let Some(gs) = &mut s.group_by {
7265        for g in gs {
7266            rewrite_expr_clock(g, now);
7267        }
7268    }
7269    if let Some(h) = &mut s.having {
7270        rewrite_expr_clock(h, now);
7271    }
7272    for o in &mut s.order_by {
7273        rewrite_expr_clock(&mut o.expr, now);
7274    }
7275    for (_, peer) in &mut s.unions {
7276        rewrite_select_clock(peer, now);
7277    }
7278}
7279
7280/// v3.0.3 hot path: every recursion lands in exactly one `match` arm.
7281/// Literal / Column-with-qualifier (the dominant cases on a typical
7282/// AST) take a single pattern dispatch and exit. The clock-rewrite
7283/// targets (zero-arg `NOW` / `CURRENT_TIMESTAMP` / `CURRENT_DATE`
7284/// functions, and bare `CURRENT_TIMESTAMP` / `CURRENT_DATE` column
7285/// refs) sit on their own arms with match guards so the fall-through
7286/// to the recursive arms is unambiguous.
7287fn rewrite_expr_clock(e: &mut Expr, now: i64) {
7288    // Fast-path test on the no-recursion shapes first. We can't fold
7289    // them into the big match below because they need to *replace* `e`
7290    // outright; the recursive arms below match on its sub-fields.
7291    if let Some(replacement) = clock_replacement_for(e, now) {
7292        *e = replacement;
7293        return;
7294    }
7295    match e {
7296        Expr::Binary { lhs, rhs, .. } => {
7297            rewrite_expr_clock(lhs, now);
7298            rewrite_expr_clock(rhs, now);
7299        }
7300        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7301            rewrite_expr_clock(expr, now);
7302        }
7303        Expr::FunctionCall { args, .. } => {
7304            for a in args {
7305                rewrite_expr_clock(a, now);
7306            }
7307        }
7308        Expr::Like { expr, pattern, .. } => {
7309            rewrite_expr_clock(expr, now);
7310            rewrite_expr_clock(pattern, now);
7311        }
7312        Expr::Extract { source, .. } => rewrite_expr_clock(source, now),
7313        // v4.10 subquery nodes — recurse into the inner SELECT's
7314        // expression slots so e.g. SELECT NOW() in a scalar
7315        // subquery picks up the same instant as the outer query.
7316        Expr::ScalarSubquery(s) => rewrite_select_clock(s, now),
7317        Expr::Exists { subquery, .. } => rewrite_select_clock(subquery, now),
7318        Expr::InSubquery { expr, subquery, .. } => {
7319            rewrite_expr_clock(expr, now);
7320            rewrite_select_clock(subquery, now);
7321        }
7322        // v4.12 window functions — args + PARTITION BY + ORDER BY
7323        // may all reference clock literals.
7324        Expr::WindowFunction {
7325            args,
7326            partition_by,
7327            order_by,
7328            ..
7329        } => {
7330            for a in args {
7331                rewrite_expr_clock(a, now);
7332            }
7333            for p in partition_by {
7334                rewrite_expr_clock(p, now);
7335            }
7336            for (e, _) in order_by {
7337                rewrite_expr_clock(e, now);
7338            }
7339        }
7340        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
7341    }
7342}
7343
7344/// Returns `Some(Expr)` when `e` is one of the clock-call shapes that
7345/// must be rewritten; otherwise `None` so the caller falls through to
7346/// the recursive walk. Identifies both function-call forms (`NOW()` /
7347/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()`) and bare-identifier forms
7348/// (`CURRENT_TIMESTAMP` / `CURRENT_DATE` as unqualified column refs,
7349/// which is how PG accepts them without parens).
7350fn clock_replacement_for(e: &Expr, now: i64) -> Option<Expr> {
7351    let (kind, name) = match e {
7352        Expr::FunctionCall { name, args } if args.is_empty() => (ClockSite::Fn, name.as_str()),
7353        Expr::Column(c) if c.qualifier.is_none() => (ClockSite::BareIdent, c.name.as_str()),
7354        _ => return None,
7355    };
7356    // ASCII case-insensitive name match. Limited to the three keywords
7357    // that actually need rewriting.
7358    let matched = match name.len() {
7359        3 if kind == ClockSite::Fn && name.eq_ignore_ascii_case("now") => Some(true),
7360        12 if name.eq_ignore_ascii_case("current_date") => Some(false),
7361        17 if name.eq_ignore_ascii_case("current_timestamp") => Some(true),
7362        _ => None,
7363    };
7364    let is_timestamp = matched?;
7365    let payload = if is_timestamp {
7366        now
7367    } else {
7368        now.div_euclid(86_400_000_000)
7369    };
7370    let target = if is_timestamp {
7371        spg_sql::ast::CastTarget::Timestamp
7372    } else {
7373        spg_sql::ast::CastTarget::Date
7374    };
7375    Some(Expr::Cast {
7376        expr: alloc::boxed::Box::new(Expr::Literal(spg_sql::ast::Literal::Integer(payload))),
7377        target,
7378    })
7379}
7380
/// Syntactic position in which a candidate clock keyword was found.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ClockSite {
    /// A function call with an empty argument list, e.g. `NOW()`.
    Fn,
    /// An unqualified column reference, e.g. bare `CURRENT_DATE`.
    BareIdent,
}
7386
7387/// `ORDER BY <integer>` references the N-th SELECT item (1-based).
7388/// Swap the integer literal for the matching item's expression so the
7389/// executor doesn't need a special-case branch. Recurses into UNION
7390/// peers because each peer keeps its own SELECT list.
7391/// v6.4.1 — expand `GROUP BY ALL` to every non-aggregate SELECT-list
7392/// item. Mirrors DuckDB / PG 19 semantics. Wildcards (`SELECT * …`)
7393/// are NOT expanded by GROUP BY ALL (PG 19 leaves the wildcard intact
7394/// and groups by whatever explicit non-aggregates remain — none in
7395/// the wildcard-only case, which still works for non-aggregate
7396/// queries).
7397fn expand_group_by_all(s: &mut SelectStatement) {
7398    if !s.group_by_all {
7399        for (_, peer) in &mut s.unions {
7400            expand_group_by_all(peer);
7401        }
7402        return;
7403    }
7404    let mut groups: Vec<Expr> = Vec::new();
7405    for item in &s.items {
7406        if let SelectItem::Expr { expr, .. } = item
7407            && !aggregate::contains_aggregate(expr)
7408        {
7409            groups.push(expr.clone());
7410        }
7411    }
7412    s.group_by = Some(groups);
7413    s.group_by_all = false;
7414    for (_, peer) in &mut s.unions {
7415        expand_group_by_all(peer);
7416    }
7417}
7418
7419fn resolve_order_by_position(s: &mut SelectStatement) {
7420    // v6.4.0 — iterate every ORDER BY key. Position references
7421    // (`ORDER BY 2`) bind to the 1-based projection index;
7422    // identifier references that match a SELECT-list alias bind to
7423    // the projected expression (Step 4 of L3a).
7424    for order in &mut s.order_by {
7425        match &order.expr {
7426            Expr::Literal(Literal::Integer(n)) if *n >= 1 => {
7427                if let Ok(idx_one_based) = usize::try_from(*n) {
7428                    let idx = idx_one_based - 1;
7429                    if idx < s.items.len()
7430                        && let SelectItem::Expr { expr, .. } = &s.items[idx]
7431                    {
7432                        order.expr = expr.clone();
7433                    }
7434                }
7435            }
7436            Expr::Column(c) if c.qualifier.is_none() => {
7437                // Alias-in-ORDER-BY lookup.
7438                for item in &s.items {
7439                    if let SelectItem::Expr {
7440                        expr,
7441                        alias: Some(a),
7442                    } = item
7443                        && a == &c.name
7444                    {
7445                        order.expr = expr.clone();
7446                        break;
7447                    }
7448                }
7449            }
7450            _ => {}
7451        }
7452    }
7453    for (_, peer) in &mut s.unions {
7454        resolve_order_by_position(peer);
7455    }
7456}
7457
7458/// Sort `tagged` by `f64` key, reversing the comparator under DESC.
7459/// Used by the UNION ORDER BY path; per-block paths inline the same
7460/// comparator because they already hold `&OrderBy` directly.
7461/// v3.1.1: partial-sort helper. When `keep` (= offset + limit) is
7462/// strictly less than `tagged.len()`, run `select_nth_unstable_by` to
7463/// partition the prefix in O(n), then sort just that prefix in O(k
7464/// log k). Total O(n + k log k), vs O(n log n) for a full sort. The
7465/// caller decides what `keep` is; passing `None` (no LIMIT) keeps the
7466/// full-sort behaviour.
7467///
7468/// `tagged` holds `(Option<f64>, Row)` (the SELECT path) — `None` keys
7469/// sort last in ascending order, mirroring NULL-sorts-last in SQL.
7470fn partial_sort_tagged(
7471    tagged: &mut Vec<(Vec<f64>, Row)>,
7472    keep: Option<usize>,
7473    descs: &[bool],
7474) {
7475    let cmp = |a: &(Vec<f64>, Row), b: &(Vec<f64>, Row)| cmp_multi_key(&a.0, &b.0, descs);
7476    match keep {
7477        Some(k) if k < tagged.len() && k > 0 => {
7478            let pivot = k - 1;
7479            tagged.select_nth_unstable_by(pivot, cmp);
7480            tagged[..k].sort_by(cmp);
7481            tagged.truncate(k);
7482        }
7483        _ => {
7484            tagged.sort_by(cmp);
7485        }
7486    }
7487}
7488
7489fn sort_by_keys(tagged: &mut [(Vec<f64>, Row)], descs: &[bool]) {
7490    tagged.sort_by(|a, b| cmp_multi_key(&a.0, &b.0, descs));
7491}
7492
/// v6.4.0 — multi-key ORDER BY comparator. Each key's per-key DESC
/// flag is honored independently. NULL is encoded as `f64::INFINITY`
/// so it sorts last in ASC and first in DESC.
///
/// Keys are compared pairwise, left to right; the first non-equal pair
/// decides. A missing entry in `descs` is treated as ascending. If one
/// key slice is shorter, only the common prefix is compared.
///
/// NaN keys are ordered consistently: NaN equals NaN and ranks above
/// every other value, including the `INFINITY` used for NULL. Before
/// that, a NaN compared `Equal` to everything, which is not a valid
/// order and can make the standard sort routines panic or return
/// arbitrarily ordered output.
fn cmp_multi_key(a: &[f64], b: &[f64], descs: &[bool]) -> core::cmp::Ordering {
    use core::cmp::Ordering;
    for (i, (ka, kb)) in a.iter().zip(b.iter()).enumerate() {
        // `partial_cmp` is `None` only when a NaN is involved; fall back
        // to "NaN is the largest value" for those pairs.
        let ascending = ka
            .partial_cmp(kb)
            .unwrap_or_else(|| ka.is_nan().cmp(&kb.is_nan()));
        let ord = if descs.get(i).copied().unwrap_or(false) {
            ascending.reverse()
        } else {
            ascending
        };
        if ord != Ordering::Equal {
            return ord;
        }
    }
    Ordering::Equal
}
7511
7512/// v6.4.0 — eval every ORDER BY expression for a row and pack the
7513/// resulting keys into a `Vec<f64>`. NULL → `f64::INFINITY`.
7514fn build_order_keys(
7515    order_by: &[OrderBy],
7516    row: &Row,
7517    ctx: &EvalContext,
7518) -> Result<Vec<f64>, EngineError> {
7519    let mut keys = Vec::with_capacity(order_by.len());
7520    for o in order_by {
7521        let v = eval::eval_expr(&o.expr, row, ctx)?;
7522        keys.push(value_to_order_key(&v)?);
7523    }
7524    Ok(keys)
7525}
7526
7527/// Drop the first `offset` rows then truncate to `limit`. PG / `MySQL`
7528/// agree: OFFSET applies *after* ORDER BY but *before* LIMIT (so
7529/// `LIMIT 10 OFFSET 5` keeps rows 6..=15).
7530fn apply_offset_and_limit(rows: &mut Vec<Row>, offset: Option<u32>, limit: Option<u32>) {
7531    if let Some(off) = offset {
7532        let off = off as usize;
7533        if off >= rows.len() {
7534            rows.clear();
7535        } else {
7536            rows.drain(..off);
7537        }
7538    }
7539    if let Some(n) = limit {
7540        rows.truncate(n as usize);
7541    }
7542}
7543
7544/// v7.6.1 — resolve a parser-level `ForeignKeyConstraint` (column
7545/// names + parent table name) into the storage-layer shape (column
7546/// indices + same parent table). Validates everything the engine
7547/// needs to know about the FK at CREATE TABLE time:
7548///
7549///   - parent table exists (catalog lookup, unless self-referencing)
7550///   - parent columns exist on the parent table
7551///   - parent column list matches the local arity (defaults to the
7552///     parent's primary index column when omitted)
7553///   - parent columns are covered by a `BTree` UNIQUE-class index
7554///     (SPG's stand-in for `PRIMARY KEY`/`UNIQUE`) — required so
7555///     the v7.6.2 INSERT path can do an O(log n) parent lookup
7556///   - local columns exist on the table being created
7557fn resolve_foreign_key(
7558    local_table_name: &str,
7559    local_cols: &[ColumnSchema],
7560    fk: spg_sql::ast::ForeignKeyConstraint,
7561    catalog: &Catalog,
7562) -> Result<spg_storage::ForeignKeyConstraint, EngineError> {
7563    // Resolve local columns.
7564    let mut local_columns = Vec::with_capacity(fk.columns.len());
7565    for name in &fk.columns {
7566        let pos = local_cols
7567            .iter()
7568            .position(|c| c.name == *name)
7569            .ok_or_else(|| {
7570                EngineError::Unsupported(alloc::format!(
7571                    "FOREIGN KEY references unknown local column {name:?}"
7572                ))
7573            })?;
7574        local_columns.push(pos);
7575    }
7576    // Self-referencing FK: parent table is the one we're creating.
7577    // The parent column resolution uses the local column list since
7578    // the catalog doesn't have this table yet.
7579    let is_self_ref = fk.parent_table == local_table_name;
7580    let (parent_cols_for_lookup, parent_table_str): (&[ColumnSchema], &str) = if is_self_ref {
7581        (local_cols, local_table_name)
7582    } else {
7583        let parent_table = catalog.get(&fk.parent_table).ok_or_else(|| {
7584            EngineError::Storage(StorageError::TableNotFound {
7585                name: fk.parent_table.clone(),
7586            })
7587        })?;
7588        (parent_table.schema().columns.as_slice(), fk.parent_table.as_str())
7589    };
7590    // Resolve parent column names → positions. If the FK omitted the
7591    // parent column list, fall back to the parent's primary index
7592    // column (single-column only — composite default is rejected
7593    // because there's no unambiguous "PK" in SPG's index list).
7594    let parent_columns: Vec<usize> = if fk.parent_columns.is_empty() {
7595        if fk.columns.len() != 1 {
7596            return Err(EngineError::Unsupported(
7597                "composite FOREIGN KEY without explicit parent column list is not supported \
7598                 — list the parent columns explicitly"
7599                    .into(),
7600            ));
7601        }
7602        // Find a single BTree index on the parent and use its column.
7603        let pos = pick_pk_index_column(catalog, parent_table_str, is_self_ref, local_cols)
7604            .ok_or_else(|| {
7605                EngineError::Unsupported(alloc::format!(
7606                    "parent table {parent_table_str:?} has no PRIMARY-key / UNIQUE BTree index \
7607                     to default the FOREIGN KEY against"
7608                ))
7609            })?;
7610        alloc::vec![pos]
7611    } else {
7612        let mut out = Vec::with_capacity(fk.parent_columns.len());
7613        for name in &fk.parent_columns {
7614            let pos = parent_cols_for_lookup
7615                .iter()
7616                .position(|c| c.name == *name)
7617                .ok_or_else(|| {
7618                    EngineError::Unsupported(alloc::format!(
7619                        "FOREIGN KEY references unknown parent column \
7620                         {name:?} on table {parent_table_str:?}"
7621                    ))
7622                })?;
7623            out.push(pos);
7624        }
7625        out
7626    };
7627    if parent_columns.len() != local_columns.len() {
7628        return Err(EngineError::Unsupported(alloc::format!(
7629            "FOREIGN KEY arity mismatch: {} local columns vs {} parent columns",
7630            local_columns.len(),
7631            parent_columns.len()
7632        )));
7633    }
7634    // For non-self-referencing FKs, verify the parent column set is
7635    // covered by a BTree index. SPG doesn't have a `PRIMARY KEY`
7636    // declaration; the convention is "the parent column for FK
7637    // purposes must have a BTree index" — which the user creates via
7638    // `CREATE INDEX ... USING btree (col)` (the default). We accept
7639    // any single-column BTree index that covers a parent column;
7640    // composite parent column lists require an index whose `column_position`
7641    // matches the first parent column (multi-column BTree indices
7642    // are not in the v7.x roadmap).
7643    if !is_self_ref {
7644        let parent_table = catalog
7645            .get(&fk.parent_table)
7646            .expect("checked above");
7647        let primary_parent_col = parent_columns[0];
7648        let has_btree = parent_table.schema().columns.get(primary_parent_col).is_some()
7649            && parent_table
7650                .indices()
7651                .iter()
7652                .any(|idx| {
7653                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7654                        && idx.column_position == primary_parent_col
7655                        && idx.partial_predicate.is_none()
7656                });
7657        if !has_btree {
7658            return Err(EngineError::Unsupported(alloc::format!(
7659                "FOREIGN KEY parent column on {:?} is not covered by an unconditional BTree \
7660                 index — create one with `CREATE INDEX ... ON {} ({})` first",
7661                parent_table_str,
7662                parent_table_str,
7663                parent_table.schema().columns[primary_parent_col].name,
7664            )));
7665        }
7666    }
7667    let on_delete = fk_action_sql_to_storage(fk.on_delete);
7668    let on_update = fk_action_sql_to_storage(fk.on_update);
7669    Ok(spg_storage::ForeignKeyConstraint {
7670        name: fk.name,
7671        local_columns,
7672        parent_table: fk.parent_table,
7673        parent_columns,
7674        on_delete,
7675        on_update,
7676    })
7677}
7678
7679/// v7.6.1 — pick a sentinel "primary key" column from the parent
7680/// table when the FK didn't name parent columns. Picks the first
7681/// single-column unconditional BTree index — that's the closest
7682/// thing SPG has to a PRIMARY KEY today. Self-referencing FKs use
7683/// `local_cols` as the column source.
7684fn pick_pk_index_column(
7685    catalog: &Catalog,
7686    parent_name: &str,
7687    is_self_ref: bool,
7688    local_cols: &[ColumnSchema],
7689) -> Option<usize> {
7690    if is_self_ref {
7691        // Self-ref FK omitted parent columns: pick column 0 by
7692        // convention (no catalog entry yet). Engine will widen this
7693        // when v7.6.7 lands; v7.6.1 only handles the explicit form.
7694        let _ = local_cols;
7695        return Some(0);
7696    }
7697    let parent = catalog.get(parent_name)?;
7698    parent.indices().iter().find_map(|idx| {
7699        if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7700            && idx.partial_predicate.is_none()
7701            && idx.included_columns.is_empty()
7702            && idx.expression.is_none()
7703        {
7704            Some(idx.column_position)
7705        } else {
7706            None
7707        }
7708    })
7709}
7710
7711/// v7.9.8 / v7.9.10 — resolve the column positions that
7712/// identify a conflict for ON CONFLICT. Returns a Vec of
7713/// column positions (1 element for single-column form, N for
7714/// composite). When the user wrote bare `ON CONFLICT DO …`,
7715/// falls back to the table's first unconditional BTree index
7716/// (always single-column today).
7717fn resolve_on_conflict_columns(
7718    catalog: &Catalog,
7719    table_name: &str,
7720    target: &[String],
7721) -> Result<Vec<usize>, EngineError> {
7722    let table = catalog.get(table_name).ok_or_else(|| {
7723        EngineError::Storage(StorageError::TableNotFound {
7724            name: table_name.into(),
7725        })
7726    })?;
7727    if target.is_empty() {
7728        let pos = table
7729            .indices()
7730            .iter()
7731            .find_map(|idx| {
7732                if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7733                    && idx.partial_predicate.is_none()
7734                    && idx.included_columns.is_empty()
7735                    && idx.expression.is_none()
7736                {
7737                    Some(idx.column_position)
7738                } else {
7739                    None
7740                }
7741            })
7742            .ok_or_else(|| {
7743                EngineError::Unsupported(alloc::format!(
7744                    "ON CONFLICT without target requires a UNIQUE BTree index on {table_name:?}"
7745                ))
7746            })?;
7747        return Ok(alloc::vec![pos]);
7748    }
7749    let mut out = Vec::with_capacity(target.len());
7750    for name in target {
7751        let pos = table
7752            .schema()
7753            .columns
7754            .iter()
7755            .position(|c| c.name == *name)
7756            .ok_or_else(|| {
7757                EngineError::Unsupported(alloc::format!(
7758                    "ON CONFLICT target column {name:?} not found on {table_name:?}"
7759                ))
7760            })?;
7761        out.push(pos);
7762    }
7763    Ok(out)
7764}
7765
7766/// v7.9.8 — check whether the BTree index on `column_pos` of
7767/// `table_name` already has a row with this key.
7768fn on_conflict_key_exists(
7769    catalog: &Catalog,
7770    table_name: &str,
7771    column_pos: usize,
7772    key: &Value,
7773) -> bool {
7774    let Some(table) = catalog.get(table_name) else {
7775        return false;
7776    };
7777    let Some(idx_key) = spg_storage::IndexKey::from_value(key) else {
7778        return false;
7779    };
7780    table.indices().iter().any(|idx| {
7781        matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7782            && idx.column_position == column_pos
7783            && idx.partial_predicate.is_none()
7784            && !idx.lookup_eq(&idx_key).is_empty()
7785    })
7786}
7787
7788/// v7.9.9 / v7.9.10 — look up an existing row's position by
7789/// matching all `column_positions` against the incoming `key`
7790/// tuple. Single-column shape (one column) reduces to the
7791/// canonical PK lookup; composite shapes scan linearly until
7792/// every position matches.
7793fn lookup_row_position_by_keys(
7794    catalog: &Catalog,
7795    table_name: &str,
7796    column_positions: &[usize],
7797    key: &[&Value],
7798) -> Option<usize> {
7799    let table = catalog.get(table_name)?;
7800    table.rows().iter().position(|r| {
7801        column_positions
7802            .iter()
7803            .enumerate()
7804            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7805    })
7806}
7807
7808/// v7.9.10 — does the table already contain a row whose
7809/// `column_positions` tuple equals `key`? Single-column shape
7810/// uses the existing BTree fast path; composite shapes fall
7811/// back to a row scan.
7812fn on_conflict_keys_exist(
7813    catalog: &Catalog,
7814    table_name: &str,
7815    column_positions: &[usize],
7816    key: &[&Value],
7817) -> bool {
7818    if column_positions.len() == 1 {
7819        return on_conflict_key_exists(
7820            catalog,
7821            table_name,
7822            column_positions[0],
7823            key[0],
7824        );
7825    }
7826    let Some(table) = catalog.get(table_name) else {
7827        return false;
7828    };
7829    table.rows().iter().any(|r| {
7830        column_positions
7831            .iter()
7832            .enumerate()
7833            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7834    })
7835}
7836
7837/// v7.9.9 — apply ON CONFLICT DO UPDATE SET assignments to an
7838/// existing row.
7839///
7840/// `incoming` is the rejected INSERT row (used to resolve
7841/// `EXCLUDED.col` references in the assignment exprs);
7842/// `target_pos` is the position of the existing row in the table.
7843/// Each assignment substitutes `EXCLUDED.col` with the matching
7844/// incoming value, evaluates the resulting expression against
7845/// the existing row, and writes the new value into the
7846/// corresponding column of the returned `Vec<Value>`. If
7847/// `where_` evaluates falsy, returns Ok(None) — PG behaviour:
7848/// the conflicting row is silently kept unchanged.
7849fn apply_on_conflict_assignments(
7850    catalog: &Catalog,
7851    table_name: &str,
7852    target_pos: usize,
7853    incoming: &[Value],
7854    assignments: &[(String, Expr)],
7855    where_: Option<&Expr>,
7856) -> Result<Option<Vec<Value>>, EngineError> {
7857    let table = catalog.get(table_name).ok_or_else(|| {
7858        EngineError::Storage(StorageError::TableNotFound {
7859            name: table_name.into(),
7860        })
7861    })?;
7862    let schema_cols = table.schema().columns.clone();
7863    let existing = table
7864        .rows()
7865        .get(target_pos)
7866        .ok_or_else(|| {
7867            EngineError::Unsupported(alloc::format!(
7868                "ON CONFLICT DO UPDATE: row position {target_pos} out of bounds on {table_name:?}"
7869            ))
7870        })?
7871        .clone();
7872    let ctx = eval::EvalContext::new(&schema_cols, Some(table_name));
7873    // Optional WHERE filter on the conflict row.
7874    if let Some(w) = where_ {
7875        let pred = w.clone();
7876        let pred = substitute_excluded_refs(pred, &schema_cols, incoming);
7877        let v = eval::eval_expr(&pred, &existing, &ctx)?;
7878        if !matches!(v, Value::Bool(true)) {
7879            return Ok(None);
7880        }
7881    }
7882    let mut new_values = existing.values.clone();
7883    for (col_name, expr) in assignments {
7884        let target_idx = schema_cols
7885            .iter()
7886            .position(|c| c.name == *col_name)
7887            .ok_or_else(|| {
7888                EngineError::Eval(EvalError::ColumnNotFound {
7889                    name: col_name.clone(),
7890                })
7891            })?;
7892        let sub = substitute_excluded_refs(expr.clone(), &schema_cols, incoming);
7893        let v = eval::eval_expr(&sub, &existing, &ctx)?;
7894        new_values[target_idx] =
7895            coerce_value(v, schema_cols[target_idx].ty, col_name, target_idx)?;
7896    }
7897    Ok(Some(new_values))
7898}
7899
7900/// v7.9.9 — walk an `Expr` tree replacing any `Column { qualifier:
7901/// "EXCLUDED", name }` reference with a `Literal` of the matching
7902/// value from the incoming-row vec. Resolution against the
7903/// child-table column list (by name).
7904fn substitute_excluded_refs(
7905    expr: Expr,
7906    schema_cols: &[ColumnSchema],
7907    incoming: &[Value],
7908) -> Expr {
7909    use spg_sql::ast::ColumnName;
7910    match expr {
7911        Expr::Column(ColumnName { qualifier, name })
7912            if qualifier
7913                .as_deref()
7914                .is_some_and(|q| q.eq_ignore_ascii_case("excluded")) =>
7915        {
7916            let pos = schema_cols.iter().position(|c| c.name == name);
7917            match pos {
7918                Some(p) => {
7919                    let v = incoming.get(p).cloned().unwrap_or(Value::Null);
7920                    value_to_literal_expr(v).unwrap_or_else(|_| {
7921                        Expr::Literal(spg_sql::ast::Literal::Null)
7922                    })
7923                }
7924                None => Expr::Column(ColumnName { qualifier, name }),
7925            }
7926        }
7927        Expr::Binary { op, lhs, rhs } => Expr::Binary {
7928            op,
7929            lhs: Box::new(substitute_excluded_refs(*lhs, schema_cols, incoming)),
7930            rhs: Box::new(substitute_excluded_refs(*rhs, schema_cols, incoming)),
7931        },
7932        Expr::Unary { op, expr } => Expr::Unary {
7933            op,
7934            expr: Box::new(substitute_excluded_refs(*expr, schema_cols, incoming)),
7935        },
7936        Expr::FunctionCall { name, args } => Expr::FunctionCall {
7937            name,
7938            args: args
7939                .into_iter()
7940                .map(|a| substitute_excluded_refs(a, schema_cols, incoming))
7941                .collect(),
7942        },
7943        other => other,
7944    }
7945}
7946
// NOTE: the text below describes `enforce_fk_inserts` (defined further
// down in this file), not the function that follows it. It is kept as
// a plain `//` comment so rustdoc does not attach it to
// `enforce_uniqueness_inserts`.
//
// v7.6.2 / v7.6.7 — INSERT-side FK enforcement. For every row
// about to be inserted into `child_table`, every FK declared on
// that table is checked: the row's FK columns must either be
// NULL (SQL spec skip) or match an existing parent row via the
// parent's BTree PK / UNIQUE index.
//
// Returns `EngineError::Unsupported` with a `FOREIGN KEY violation`
// payload on first failure.
//
// **Self-referencing FKs (v7.6.7 widening):** when `fk.parent_table
// == child_table`, the parent rows visible to this check are
//  (a) rows already committed to the table, plus
//  (b) earlier rows from the *same* `rows` batch.
// This makes `INSERT INTO tree VALUES (1, NULL), (2, 1), (3, 2)`
// work in a single statement — a common pattern for bulk-loading
// hierarchies.

7963/// v7.9.19 — enforce table-level UNIQUE / PRIMARY KEY tuple
7964/// constraints at INSERT time. For each constraint declared on
7965/// the target table, check that no existing row + no earlier row
7966/// in the same batch has the same full-column tuple. NULL in
7967/// any column lifts the row out of the check (SQL spec: NULL
7968/// ≠ NULL for uniqueness). mailrs G1 + G6.
7969fn enforce_uniqueness_inserts(
7970    catalog: &Catalog,
7971    child_table: &str,
7972    constraints: &[spg_storage::UniquenessConstraint],
7973    rows: &[Vec<Value>],
7974) -> Result<(), EngineError> {
7975    if constraints.is_empty() {
7976        return Ok(());
7977    }
7978    let table = catalog.get(child_table).ok_or_else(|| {
7979        EngineError::Storage(StorageError::TableNotFound {
7980            name: child_table.into(),
7981        })
7982    })?;
7983    for uc in constraints {
7984        for (batch_idx, row_values) in rows.iter().enumerate() {
7985            let key: Vec<&Value> = uc.columns.iter().map(|&i| &row_values[i]).collect();
7986            let has_null = key.iter().any(|v| matches!(v, Value::Null));
7987            if has_null {
7988                continue;
7989            }
7990            // Table-side collision: scan existing rows.
7991            let collides_in_table = table.rows().iter().any(|prow| {
7992                uc.columns
7993                    .iter()
7994                    .enumerate()
7995                    .all(|(i, &p)| prow.values.get(p) == Some(key[i]))
7996            });
7997            // Batch-side collision: earlier rows in the same INSERT.
7998            let collides_in_batch = rows[..batch_idx].iter().any(|earlier| {
7999                uc.columns
8000                    .iter()
8001                    .enumerate()
8002                    .all(|(i, &p)| earlier.get(p) == Some(key[i]))
8003            });
8004            if collides_in_table || collides_in_batch {
8005                let kind = if uc.is_primary_key { "PRIMARY KEY" } else { "UNIQUE" };
8006                let col_names: Vec<String> = uc
8007                    .columns
8008                    .iter()
8009                    .map(|&i| table.schema().columns[i].name.clone())
8010                    .collect();
8011                return Err(EngineError::Unsupported(alloc::format!(
8012                    "{kind} violation on {child_table:?} columns {col_names:?}: \
8013                     row #{batch_idx} duplicates an existing key"
8014                )));
8015            }
8016        }
8017    }
8018    Ok(())
8019}
8020
8021/// v7.9.29 — `true` iff `v` counts as a truthy SQL value for a
8022/// WHERE-style predicate. NULL → false (three-valued logic
8023/// collapses to "skip this row" for index inclusion). Numeric
8024/// non-zero, BIGINT non-zero, TINYINT non-zero, BOOLEAN true → true.
8025/// Everything else (strings, vectors, JSON, …) is not a valid
8026/// predicate result and surfaces as `false` so a malformed
8027/// predicate degrades to "row not in index" rather than panicking.
8028fn predicate_truthy(v: &spg_storage::Value) -> bool {
8029    use spg_storage::Value as V;
8030    match v {
8031        V::Bool(b) => *b,
8032        V::Int(n) => *n != 0,
8033        V::BigInt(n) => *n != 0,
8034        V::SmallInt(n) => *n != 0,
8035        _ => false,
8036    }
8037}
8038
8039/// v7.9.29 — at CREATE UNIQUE INDEX time, scan the table's
8040/// committed rows for pre-existing duplicates. If any pair of rows
8041/// matches the predicate AND has the same index key, refuse to
8042/// create the index so the user fixes the data before retrying.
8043fn check_existing_unique_violation(
8044    idx: &spg_storage::Index,
8045    schema: &spg_storage::TableSchema,
8046    rows: &[spg_storage::Row],
8047) -> Result<(), EngineError> {
8048    let predicate_expr = match idx.partial_predicate.as_deref() {
8049        Some(s) => Some(spg_sql::parser::parse_expression(s).map_err(|e| {
8050            EngineError::Unsupported(alloc::format!(
8051                "stored partial predicate {s:?} failed to re-parse: {e:?}"
8052            ))
8053        })?),
8054        None => None,
8055    };
8056    let ctx = eval::EvalContext::new(&schema.columns, None);
8057    let key_positions = unique_key_positions(idx);
8058    let mut seen: alloc::vec::Vec<alloc::vec::Vec<spg_storage::Value>> = alloc::vec::Vec::new();
8059    for row in rows {
8060        if let Some(expr) = &predicate_expr {
8061            let v = eval::eval_expr(expr, row, &ctx).map_err(|e| {
8062                EngineError::Unsupported(alloc::format!(
8063                    "evaluating UNIQUE INDEX predicate against existing row: {e:?}"
8064                ))
8065            })?;
8066            if !predicate_truthy(&v) {
8067                continue;
8068            }
8069        }
8070        let key: alloc::vec::Vec<spg_storage::Value> = key_positions
8071            .iter()
8072            .map(|&p| {
8073                row.values
8074                    .get(p)
8075                    .cloned()
8076                    .unwrap_or(spg_storage::Value::Null)
8077            })
8078            .collect();
8079        if key.iter().any(|v| matches!(v, spg_storage::Value::Null)) {
8080            continue;
8081        }
8082        if seen.iter().any(|other| *other == key) {
8083            return Err(EngineError::Unsupported(alloc::format!(
8084                "CREATE UNIQUE INDEX {:?}: existing rows already violate the constraint",
8085                idx.name
8086            )));
8087        }
8088        seen.push(key);
8089    }
8090    Ok(())
8091}
8092
8093/// v7.9.29 — full key tuple for a UNIQUE INDEX (leading +
8094/// extra positions). For single-column indexes this is just
8095/// `[column_position]`.
8096fn unique_key_positions(idx: &spg_storage::Index) -> alloc::vec::Vec<usize> {
8097    let mut out = alloc::vec::Vec::with_capacity(1 + idx.extra_column_positions.len());
8098    out.push(idx.column_position);
8099    out.extend_from_slice(&idx.extra_column_positions);
8100    out
8101}
8102
8103/// v7.9.29 — at INSERT time, walk every `is_unique` index on the
8104/// target table. For each, eval the index's optional predicate
8105/// against (a) the candidate row and (b) every committed row plus
8106/// earlier batch rows; only rows where the predicate is truthy
8107/// participate. A duplicate key among predicate-matching rows is a
8108/// uniqueness violation. NULL keys lift the row out of the check
8109/// (matching PG's "UNIQUE allows multiple NULLs" semantics).
8110fn enforce_unique_index_inserts(
8111    catalog: &Catalog,
8112    table_name: &str,
8113    rows: &[alloc::vec::Vec<spg_storage::Value>],
8114) -> Result<(), EngineError> {
8115    let table = catalog.get(table_name).ok_or_else(|| {
8116        EngineError::Storage(StorageError::TableNotFound {
8117            name: table_name.into(),
8118        })
8119    })?;
8120    let schema = table.schema();
8121    let ctx = eval::EvalContext::new(&schema.columns, None);
8122    for idx in table.indices() {
8123        if !idx.is_unique {
8124            continue;
8125        }
8126        // Re-parse the predicate once per index per batch.
8127        let predicate_expr = match idx.partial_predicate.as_deref() {
8128            Some(s) => Some(spg_sql::parser::parse_expression(s).map_err(|e| {
8129                EngineError::Unsupported(alloc::format!(
8130                    "UNIQUE INDEX {:?} predicate {s:?} failed to re-parse: {e:?}",
8131                    idx.name
8132                ))
8133            })?),
8134            None => None,
8135        };
8136        let key_positions = unique_key_positions(idx);
8137        let key_of = |values: &[spg_storage::Value]| -> alloc::vec::Vec<spg_storage::Value> {
8138            key_positions
8139                .iter()
8140                .map(|&p| {
8141                    values
8142                        .get(p)
8143                        .cloned()
8144                        .unwrap_or(spg_storage::Value::Null)
8145                })
8146                .collect()
8147        };
8148        // Helper: does `values` participate in this index? (predicate
8149        // truthy when present.) Wraps `values` into a transient Row
8150        // because eval_expr requires &Row.
8151        let participates = |values: &[spg_storage::Value]| -> Result<bool, EngineError> {
8152            let Some(expr) = &predicate_expr else {
8153                return Ok(true);
8154            };
8155            let tmp_row = spg_storage::Row {
8156                values: values.to_vec(),
8157            };
8158            let v = eval::eval_expr(expr, &tmp_row, &ctx).map_err(|e| {
8159                EngineError::Unsupported(alloc::format!(
8160                    "UNIQUE INDEX {:?} predicate eval: {e:?}",
8161                    idx.name
8162                ))
8163            })?;
8164            Ok(predicate_truthy(&v))
8165        };
8166        for (batch_idx, row_values) in rows.iter().enumerate() {
8167            if !participates(row_values)? {
8168                continue;
8169            }
8170            let key = key_of(row_values);
8171            if key.iter().any(|v| matches!(v, spg_storage::Value::Null)) {
8172                continue;
8173            }
8174            // Committed-table collision.
8175            for prow in table.rows() {
8176                if !participates(&prow.values)? {
8177                    continue;
8178                }
8179                if key_of(&prow.values) == key {
8180                    return Err(EngineError::Unsupported(alloc::format!(
8181                        "UNIQUE INDEX {:?} violation on {table_name:?}: \
8182                         row #{batch_idx} duplicates an existing key",
8183                        idx.name
8184                    )));
8185                }
8186            }
8187            // Within-batch collision: earlier rows in the same INSERT.
8188            for earlier in &rows[..batch_idx] {
8189                if !participates(earlier)? {
8190                    continue;
8191                }
8192                if key_of(earlier) == key {
8193                    return Err(EngineError::Unsupported(alloc::format!(
8194                        "UNIQUE INDEX {:?} violation on {table_name:?}: \
8195                         row #{batch_idx} duplicates an earlier row in the same batch",
8196                        idx.name
8197                    )));
8198                }
8199            }
8200        }
8201    }
8202    Ok(())
8203}
8204
8205fn enforce_fk_inserts(
8206    catalog: &Catalog,
8207    child_table: &str,
8208    fks: &[spg_storage::ForeignKeyConstraint],
8209    rows: &[Vec<Value>],
8210) -> Result<(), EngineError> {
8211    for fk in fks {
8212        let parent_is_self = fk.parent_table == child_table;
8213        let parent = if parent_is_self {
8214            // Self-ref: read the current state of the same table.
8215            // The mut borrow on child has been dropped by the caller.
8216            catalog.get(child_table).ok_or_else(|| {
8217                EngineError::Storage(StorageError::TableNotFound {
8218                    name: child_table.into(),
8219                })
8220            })?
8221        } else {
8222            catalog.get(&fk.parent_table).ok_or_else(|| {
8223                EngineError::Storage(StorageError::TableNotFound {
8224                    name: fk.parent_table.clone(),
8225                })
8226            })?
8227        };
8228        for (batch_idx, row_values) in rows.iter().enumerate() {
8229            // Single-column FK fast path: try the parent's BTree
8230            // index for an O(log n) lookup. Composite FKs fall back
8231            // to a parent-row scan.
8232            if fk.local_columns.len() == 1 {
8233                let v = &row_values[fk.local_columns[0]];
8234                if matches!(v, Value::Null) {
8235                    continue;
8236                }
8237                let parent_col = fk.parent_columns[0];
8238                let key = spg_storage::IndexKey::from_value(v).ok_or_else(|| {
8239                    EngineError::Unsupported(alloc::format!(
8240                        "FOREIGN KEY column value of type {:?} is not index-eligible",
8241                        v.data_type()
8242                    ))
8243                })?;
8244                let present_committed = parent.indices().iter().any(|idx| {
8245                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
8246                        && idx.column_position == parent_col
8247                        && idx.partial_predicate.is_none()
8248                        && !idx.lookup_eq(&key).is_empty()
8249                });
8250                // v7.6.7 self-ref widening: also accept a match
8251                // against earlier rows in this same batch when the
8252                // FK points at the table being inserted into.
8253                let present_in_batch = parent_is_self
8254                    && rows[..batch_idx].iter().any(|earlier| {
8255                        earlier.get(parent_col) == Some(v)
8256                    });
8257                if !(present_committed || present_in_batch) {
8258                    return Err(EngineError::Unsupported(alloc::format!(
8259                        "FOREIGN KEY violation: no parent row in {:?} where {} = {:?}",
8260                        fk.parent_table,
8261                        parent
8262                            .schema()
8263                            .columns
8264                            .get(parent_col)
8265                            .map_or("?", |c| c.name.as_str()),
8266                        v,
8267                    )));
8268                }
8269            } else {
8270                // Composite FK: scan parent rows. v7.6.7 also
8271                // accepts a match against earlier rows in the same
8272                // batch (self-ref bulk-loading of hierarchies).
8273                if fk.local_columns
8274                    .iter()
8275                    .all(|&i| matches!(row_values.get(i), Some(Value::Null)))
8276                {
8277                    continue;
8278                }
8279                let local: Vec<&Value> = fk.local_columns.iter().map(|&i| &row_values[i]).collect();
8280                let parent_match_committed = parent.rows().iter().any(|prow| {
8281                    fk.parent_columns
8282                        .iter()
8283                        .enumerate()
8284                        .all(|(i, &pi)| prow.values.get(pi) == Some(local[i]))
8285                });
8286                let parent_match_in_batch = parent_is_self
8287                    && rows[..batch_idx].iter().any(|earlier| {
8288                        fk.parent_columns
8289                            .iter()
8290                            .enumerate()
8291                            .all(|(i, &pi)| earlier.get(pi) == Some(local[i]))
8292                    });
8293                if !(parent_match_committed || parent_match_in_batch) {
8294                    return Err(EngineError::Unsupported(alloc::format!(
8295                        "FOREIGN KEY violation: no parent row in {:?} matching composite key",
8296                        fk.parent_table,
8297                    )));
8298                }
8299            }
8300        }
8301    }
8302    Ok(())
8303}
8304
8305/// v7.6.4 / v7.6.5 — one step of the FK action plan computed for a
8306/// DELETE on a parent. The plan is a list of these steps, stacked
8307/// across the FK graph by `plan_fk_parent_deletions`.
8308#[derive(Debug, Clone)]
8309struct FkChildStep {
8310    child_table: String,
8311    action: FkChildAction,
8312}
8313
8314#[derive(Debug, Clone)]
8315enum FkChildAction {
8316    /// CASCADE — remove these rows. Sorted, deduplicated positions.
8317    Delete { positions: Vec<usize> },
8318    /// SET NULL — for each (row, column) in the flat list, write
8319    /// NULL into that child cell. Multiple FKs on the same row may
8320    /// produce overlapping entries (deduped at plan time).
8321    SetNull {
8322        positions: Vec<usize>,
8323        columns: Vec<usize>,
8324    },
8325    /// SET DEFAULT — same shape as SetNull but writes the column's
8326    /// declared DEFAULT value (resolved at plan time). Columns
8327    /// without a DEFAULT raise an error during planning.
8328    SetDefault {
8329        positions: Vec<usize>,
8330        columns: Vec<usize>,
8331        defaults: Vec<Value>,
8332    },
8333}
8334
8335/// v7.6.3 → v7.6.5 — plan FK fallout for a DELETE on a parent table.
8336///
8337/// Walks every table in the catalog looking for FKs whose
8338/// `parent_table` is `parent_table_name`. For each such FK + each
8339/// to-be-deleted parent row:
8340///
8341///   - RESTRICT / NoAction → error, no plan returned
8342///   - CASCADE → child rows get scheduled for deletion; recursive
8343///   - SetNull → child FK column(s) scheduled to be NULL-ed.
8344///     Verified NULL-able at plan time.
8345///   - SetDefault → child FK column(s) scheduled to be reset to
8346///     their declared DEFAULT. Columns without a DEFAULT raise.
8347///
8348/// SET NULL / SET DEFAULT do NOT cascade further — the child row
8349/// stays; only one of its columns mutates.
8350fn plan_fk_parent_deletions(
8351    catalog: &Catalog,
8352    parent_table_name: &str,
8353    to_delete_positions: &[usize],
8354    to_delete_rows: &[Vec<Value>],
8355) -> Result<Vec<FkChildStep>, EngineError> {
8356    use alloc::collections::{BTreeMap, BTreeSet};
8357    if to_delete_rows.is_empty() {
8358        return Ok(Vec::new());
8359    }
8360    let mut delete_plan: BTreeMap<String, BTreeSet<usize>> = BTreeMap::new();
8361    // setnull / setdefault keyed by child_table → (row_idx, col_idx) → optional default
8362    let mut setnull_plan: BTreeMap<String, BTreeSet<(usize, usize)>> = BTreeMap::new();
8363    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8364        BTreeMap::new();
8365    let mut visited: BTreeSet<(String, usize)> = BTreeSet::new();
8366    for &p in to_delete_positions {
8367        visited.insert((parent_table_name.to_string(), p));
8368    }
8369    let mut work: Vec<(String, Vec<Value>)> = to_delete_rows
8370        .iter()
8371        .map(|r| (parent_table_name.to_string(), r.clone()))
8372        .collect();
8373    while let Some((cur_parent, parent_row)) = work.pop() {
8374        for child_name in catalog.table_names() {
8375            let child = catalog
8376                .get(&child_name)
8377                .expect("table_names → catalog.get round-trip is total");
8378            for fk in &child.schema().foreign_keys {
8379                if fk.parent_table != cur_parent {
8380                    continue;
8381                }
8382                let parent_key: Vec<&Value> = fk
8383                    .parent_columns
8384                    .iter()
8385                    .map(|&pi| &parent_row[pi])
8386                    .collect();
8387                if parent_key.iter().any(|v| matches!(v, Value::Null)) {
8388                    continue;
8389                }
8390                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8391                    if child_name == cur_parent
8392                        && visited.contains(&(child_name.clone(), child_row_idx))
8393                    {
8394                        continue;
8395                    }
8396                    let matches_key = fk
8397                        .local_columns
8398                        .iter()
8399                        .enumerate()
8400                        .all(|(i, &li)| child_row.values.get(li) == Some(parent_key[i]));
8401                    if !matches_key {
8402                        continue;
8403                    }
8404                    match fk.on_delete {
8405                        spg_storage::FkAction::Restrict
8406                        | spg_storage::FkAction::NoAction => {
8407                            return Err(EngineError::Unsupported(alloc::format!(
8408                                "FOREIGN KEY violation: DELETE on {cur_parent:?} is \
8409                                 restricted by FK from {child_name:?}.{:?}",
8410                                fk.local_columns,
8411                            )));
8412                        }
8413                        spg_storage::FkAction::Cascade => {
8414                            if visited.insert((child_name.clone(), child_row_idx)) {
8415                                delete_plan
8416                                    .entry(child_name.clone())
8417                                    .or_default()
8418                                    .insert(child_row_idx);
8419                                work.push((child_name.clone(), child_row.values.clone()));
8420                            }
8421                        }
8422                        spg_storage::FkAction::SetNull => {
8423                            // Verify every local FK column is NULL-able.
8424                            for &li in &fk.local_columns {
8425                                let col = child.schema().columns.get(li).ok_or_else(|| {
8426                                    EngineError::Unsupported(alloc::format!(
8427                                        "FK local column {li} missing in {child_name:?}"
8428                                    ))
8429                                })?;
8430                                if !col.nullable {
8431                                    return Err(EngineError::Unsupported(alloc::format!(
8432                                        "FOREIGN KEY ON DELETE SET NULL: column \
8433                                         {child_name:?}.{:?} is NOT NULL — cannot SET NULL",
8434                                        col.name,
8435                                    )));
8436                                }
8437                            }
8438                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8439                            for &li in &fk.local_columns {
8440                                entry.insert((child_row_idx, li));
8441                            }
8442                        }
8443                        spg_storage::FkAction::SetDefault => {
8444                            // Resolve the DEFAULT for every local FK col.
8445                            let entry =
8446                                setdefault_plan.entry(child_name.clone()).or_default();
8447                            for &li in &fk.local_columns {
8448                                let col = child.schema().columns.get(li).ok_or_else(|| {
8449                                    EngineError::Unsupported(alloc::format!(
8450                                        "FK local column {li} missing in {child_name:?}"
8451                                    ))
8452                                })?;
8453                                let default = col.default.clone().ok_or_else(|| {
8454                                    EngineError::Unsupported(alloc::format!(
8455                                        "FOREIGN KEY ON DELETE SET DEFAULT: column \
8456                                         {child_name:?}.{:?} has no DEFAULT declared",
8457                                        col.name,
8458                                    ))
8459                                })?;
8460                                entry.insert((child_row_idx, li), default);
8461                            }
8462                        }
8463                    }
8464                }
8465            }
8466        }
8467    }
8468    // Flatten the three plans into the ordered `FkChildStep` list.
8469    // Deletes are applied last per child (after any null/default
8470    // re-writes on the same child) so a child row that's both
8471    // re-written and then cascade-deleted only ends up deleted —
8472    // but in v7.6.5 SetNull/Cascade never overlap on the same row
8473    // (a single FK chooses exactly one action), so the order is
8474    // mostly a precaution.
8475    let mut steps: Vec<FkChildStep> = Vec::new();
8476    for (child_table, entries) in setnull_plan {
8477        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8478        steps.push(FkChildStep {
8479            child_table,
8480            action: FkChildAction::SetNull { positions, columns },
8481        });
8482    }
8483    for (child_table, entries) in setdefault_plan {
8484        let mut positions = Vec::with_capacity(entries.len());
8485        let mut columns = Vec::with_capacity(entries.len());
8486        let mut defaults = Vec::with_capacity(entries.len());
8487        for ((p, c), v) in entries {
8488            positions.push(p);
8489            columns.push(c);
8490            defaults.push(v);
8491        }
8492        steps.push(FkChildStep {
8493            child_table,
8494            action: FkChildAction::SetDefault {
8495                positions,
8496                columns,
8497                defaults,
8498            },
8499        });
8500    }
8501    for (child_table, positions) in delete_plan {
8502        steps.push(FkChildStep {
8503            child_table,
8504            action: FkChildAction::Delete {
8505                positions: positions.into_iter().collect(),
8506            },
8507        });
8508    }
8509    Ok(steps)
8510}
8511
8512/// v7.6.6 — plan FK fallout for an UPDATE that mutates parent-side
8513/// PK/UNIQUE columns. Walks every other table whose FK references
8514/// `parent_table_name`; for each FK whose parent_columns overlap a
8515/// mutated column, decides the action by `fk.on_update`.
8516///
8517///   - RESTRICT / NoAction → error if any child references the OLD
8518///     value
8519///   - CASCADE → child FK columns get rewritten to the NEW parent
8520///     value (a SetNull-style update step with the new value)
8521///   - SetNull → child FK columns set to NULL
8522///   - SetDefault → child FK columns set to declared default
8523///
8524/// `plan_with_old` is `(row_position, old_values, new_values)` so
8525/// the planner can detect "did this row's parent key actually
8526/// change?" — only rows where at least one referenced parent
8527/// column moved trigger inbound work.
8528fn plan_fk_parent_updates(
8529    catalog: &Catalog,
8530    parent_table_name: &str,
8531    plan_with_old: &[(usize, Vec<Value>, Vec<Value>)],
8532) -> Result<Vec<FkChildStep>, EngineError> {
8533    use alloc::collections::BTreeMap;
8534    if plan_with_old.is_empty() {
8535        return Ok(Vec::new());
8536    }
8537    // For each child table we may touch, build per-child step
8538    // lists. UPDATE never deletes children — `delete_plan` stays
8539    // empty here but is kept structurally aligned with
8540    // `plan_fk_parent_deletions` for future use.
8541    let delete_plan: BTreeMap<String, alloc::collections::BTreeSet<usize>> = BTreeMap::new();
8542    let mut setnull_plan: BTreeMap<
8543        String,
8544        alloc::collections::BTreeSet<(usize, usize)>,
8545    > = BTreeMap::new();
8546    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8547        BTreeMap::new();
8548    // Cascade-update plan: child_table → row_idx → col_idx → new_value
8549    let mut cascade_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
8550
8551    for child_name in catalog.table_names() {
8552        let child = catalog
8553            .get(&child_name)
8554            .expect("table_names → catalog.get total");
8555        for fk in &child.schema().foreign_keys {
8556            if fk.parent_table != parent_table_name {
8557                continue;
8558            }
8559            for (_pos, old_row, new_row) in plan_with_old {
8560                // Did any parent FK column change?
8561                let key_changed = fk
8562                    .parent_columns
8563                    .iter()
8564                    .any(|&pi| old_row.get(pi) != new_row.get(pi));
8565                if !key_changed {
8566                    continue;
8567                }
8568                // The OLD parent key — used to find referring children.
8569                let old_key: Vec<&Value> = fk
8570                    .parent_columns
8571                    .iter()
8572                    .map(|&pi| &old_row[pi])
8573                    .collect();
8574                if old_key.iter().any(|v| matches!(v, Value::Null)) {
8575                    // NULL parent has no children — skip.
8576                    continue;
8577                }
8578                let new_key: Vec<&Value> = fk
8579                    .parent_columns
8580                    .iter()
8581                    .map(|&pi| &new_row[pi])
8582                    .collect();
8583                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8584                    // Self-ref same-row updates: a row updating its
8585                    // own PK doesn't restrict itself.
8586                    if child_name == parent_table_name
8587                        && plan_with_old
8588                            .iter()
8589                            .any(|(p, _, _)| *p == child_row_idx)
8590                    {
8591                        continue;
8592                    }
8593                    let matches_key = fk
8594                        .local_columns
8595                        .iter()
8596                        .enumerate()
8597                        .all(|(i, &li)| child_row.values.get(li) == Some(old_key[i]));
8598                    if !matches_key {
8599                        continue;
8600                    }
8601                    match fk.on_update {
8602                        spg_storage::FkAction::Restrict
8603                        | spg_storage::FkAction::NoAction => {
8604                            return Err(EngineError::Unsupported(alloc::format!(
8605                                "FOREIGN KEY violation: UPDATE on {parent_table_name:?} PK is \
8606                                 restricted by FK from {child_name:?}.{:?}",
8607                                fk.local_columns,
8608                            )));
8609                        }
8610                        spg_storage::FkAction::Cascade => {
8611                            // Rewrite child FK columns to new key.
8612                            let entry = cascade_plan.entry(child_name.clone()).or_default();
8613                            for (i, &li) in fk.local_columns.iter().enumerate() {
8614                                entry.insert((child_row_idx, li), new_key[i].clone());
8615                            }
8616                        }
8617                        spg_storage::FkAction::SetNull => {
8618                            for &li in &fk.local_columns {
8619                                let col = child.schema().columns.get(li).ok_or_else(|| {
8620                                    EngineError::Unsupported(alloc::format!(
8621                                        "FK local column {li} missing in {child_name:?}"
8622                                    ))
8623                                })?;
8624                                if !col.nullable {
8625                                    return Err(EngineError::Unsupported(alloc::format!(
8626                                        "FOREIGN KEY ON UPDATE SET NULL: column \
8627                                         {child_name:?}.{:?} is NOT NULL",
8628                                        col.name,
8629                                    )));
8630                                }
8631                            }
8632                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8633                            for &li in &fk.local_columns {
8634                                entry.insert((child_row_idx, li));
8635                            }
8636                        }
8637                        spg_storage::FkAction::SetDefault => {
8638                            let entry =
8639                                setdefault_plan.entry(child_name.clone()).or_default();
8640                            for &li in &fk.local_columns {
8641                                let col = child.schema().columns.get(li).ok_or_else(|| {
8642                                    EngineError::Unsupported(alloc::format!(
8643                                        "FK local column {li} missing in {child_name:?}"
8644                                    ))
8645                                })?;
8646                                let default = col.default.clone().ok_or_else(|| {
8647                                    EngineError::Unsupported(alloc::format!(
8648                                        "FOREIGN KEY ON UPDATE SET DEFAULT: column \
8649                                         {child_name:?}.{:?} has no DEFAULT",
8650                                        col.name,
8651                                    ))
8652                                })?;
8653                                entry.insert((child_row_idx, li), default);
8654                            }
8655                        }
8656                    }
8657                }
8658            }
8659        }
8660    }
8661    // Flatten into FkChildStep list. UPDATE doesn't produce
8662    // DeleteSteps (CASCADE on UPDATE just rewrites FK values).
8663    let mut steps: Vec<FkChildStep> = Vec::new();
8664    for (child_table, entries) in cascade_plan {
8665        let mut positions = Vec::with_capacity(entries.len());
8666        let mut columns = Vec::with_capacity(entries.len());
8667        let mut defaults = Vec::with_capacity(entries.len());
8668        for ((p, c), v) in entries {
8669            positions.push(p);
8670            columns.push(c);
8671            defaults.push(v);
8672        }
8673        // We reuse `FkChildAction::SetDefault` for cascade-update:
8674        // both shapes are "write a known value into specific cells"
8675        // — `apply_per_cell_writes` doesn't care whether the value
8676        // came from a DEFAULT declaration or a new parent key.
8677        steps.push(FkChildStep {
8678            child_table,
8679            action: FkChildAction::SetDefault {
8680                positions,
8681                columns,
8682                defaults,
8683            },
8684        });
8685    }
8686    for (child_table, entries) in setnull_plan {
8687        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8688        steps.push(FkChildStep {
8689            child_table,
8690            action: FkChildAction::SetNull { positions, columns },
8691        });
8692    }
8693    for (child_table, entries) in setdefault_plan {
8694        let mut positions = Vec::with_capacity(entries.len());
8695        let mut columns = Vec::with_capacity(entries.len());
8696        let mut defaults = Vec::with_capacity(entries.len());
8697        for ((p, c), v) in entries {
8698            positions.push(p);
8699            columns.push(c);
8700            defaults.push(v);
8701        }
8702        steps.push(FkChildStep {
8703            child_table,
8704            action: FkChildAction::SetDefault {
8705                positions,
8706                columns,
8707                defaults,
8708            },
8709        });
8710    }
8711    let _ = delete_plan; // UPDATE never deletes children.
8712    Ok(steps)
8713}
8714
8715/// v7.6.5 — apply one FK child step to the catalog. Encapsulates
8716/// the three action variants so the DELETE executor stays a
8717/// simple loop over the planned steps.
8718fn apply_fk_child_step(
8719    catalog: &mut Catalog,
8720    step: &FkChildStep,
8721) -> Result<(), EngineError> {
8722    let child = catalog.get_mut(&step.child_table).ok_or_else(|| {
8723        EngineError::Storage(StorageError::TableNotFound {
8724            name: step.child_table.clone(),
8725        })
8726    })?;
8727    match &step.action {
8728        FkChildAction::Delete { positions } => {
8729            let _ = child.delete_rows(positions);
8730        }
8731        FkChildAction::SetNull { positions, columns } => {
8732            apply_per_cell_writes(child, positions, columns, |_| Value::Null)?;
8733        }
8734        FkChildAction::SetDefault {
8735            positions,
8736            columns,
8737            defaults,
8738        } => {
8739            apply_per_cell_writes(child, positions, columns, |i| defaults[i].clone())?;
8740        }
8741    }
8742    Ok(())
8743}
8744
8745/// v7.6.5 — write new values into selected child cells via
8746/// `Table::update_row` (the catalog's existing UPDATE entry).
8747/// Groups writes by row position so multi-column updates on the
8748/// same row only call `update_row` once. `value_for(i)` produces
8749/// the new value for the i-th (position, column) entry.
8750fn apply_per_cell_writes(
8751    child: &mut spg_storage::Table,
8752    positions: &[usize],
8753    columns: &[usize],
8754    mut value_for: impl FnMut(usize) -> Value,
8755) -> Result<(), EngineError> {
8756    use alloc::collections::BTreeMap;
8757    let mut by_row: BTreeMap<usize, Vec<(usize, Value)>> = BTreeMap::new();
8758    for i in 0..positions.len() {
8759        by_row
8760            .entry(positions[i])
8761            .or_default()
8762            .push((columns[i], value_for(i)));
8763    }
8764    for (pos, mutations) in by_row {
8765        let mut new_values = child.rows()[pos].values.clone();
8766        for (col, v) in mutations {
8767            if let Some(slot) = new_values.get_mut(col) {
8768                *slot = v;
8769            }
8770        }
8771        child
8772            .update_row(pos, new_values)
8773            .map_err(EngineError::Storage)?;
8774    }
8775    Ok(())
8776}
8777
8778fn fk_action_sql_to_storage(a: spg_sql::ast::FkAction) -> spg_storage::FkAction {
8779    match a {
8780        spg_sql::ast::FkAction::Restrict => spg_storage::FkAction::Restrict,
8781        spg_sql::ast::FkAction::Cascade => spg_storage::FkAction::Cascade,
8782        spg_sql::ast::FkAction::SetNull => spg_storage::FkAction::SetNull,
8783        spg_sql::ast::FkAction::SetDefault => spg_storage::FkAction::SetDefault,
8784        spg_sql::ast::FkAction::NoAction => spg_storage::FkAction::NoAction,
8785    }
8786}
8787
8788/// v7.9.21 — resolve a column's DEFAULT for INSERT-time
8789/// default-fill. Free fn (rather than `&self`) so callers
8790/// with an active `&mut Table` borrow can still use it.
8791/// Literal defaults take the cached path (`col.default`);
8792/// runtime defaults hit `clock_fn` at each call. mailrs G4.
8793fn resolve_column_default_free(
8794    col: &ColumnSchema,
8795    clock_fn: Option<ClockFn>,
8796) -> Result<Value, EngineError> {
8797    if let Some(rt) = &col.runtime_default {
8798        return eval_runtime_default_free(rt, col.ty, clock_fn);
8799    }
8800    Ok(col.default.clone().unwrap_or(Value::Null))
8801}
8802
8803fn eval_runtime_default_free(
8804    rt: &str,
8805    ty: DataType,
8806    clock_fn: Option<ClockFn>,
8807) -> Result<Value, EngineError> {
8808    let s = rt.trim().to_ascii_lowercase();
8809    let canonical = s.trim_end_matches("()");
8810    let now_us = match clock_fn {
8811        Some(f) => f(),
8812        None => 0,
8813    };
8814    let v = match canonical {
8815        "now" | "current_timestamp" | "localtimestamp" => {
8816            Value::Timestamp(now_us)
8817        }
8818        "current_date" => Value::Date((now_us / 86_400_000_000) as i32),
8819        "current_time" | "localtime" => Value::Timestamp(now_us),
8820        other => {
8821            return Err(EngineError::Unsupported(alloc::format!(
8822                "runtime DEFAULT expression {other:?} not supported \
8823                 (v7.9.21 whitelist: now() / current_timestamp / \
8824                 current_date / current_time / localtimestamp / \
8825                 localtime)"
8826            )));
8827        }
8828    };
8829    coerce_value(v, ty, "DEFAULT", 0)
8830}
8831
8832/// v7.9.21 — true when a DEFAULT expression needs INSERT-time
8833/// evaluation rather than being cacheable as a literal Value.
8834/// FunctionCall is the immediate case (`now()`,
8835/// `current_timestamp`). Literal expressions and simple sign-
8836/// flipped numerics still take the static-cache path.
8837fn is_runtime_default_expr(expr: &Expr) -> bool {
8838    match expr {
8839        Expr::FunctionCall { .. } => true,
8840        Expr::Unary { expr, .. } => is_runtime_default_expr(expr),
8841        _ => false,
8842    }
8843}
8844
8845fn column_def_to_schema(c: ColumnDef) -> Result<ColumnSchema, EngineError> {
8846    let ty = column_type_to_data_type(c.ty);
8847    let mut schema = ColumnSchema::new(c.name.clone(), ty, c.nullable);
8848    if let Some(default_expr) = c.default {
8849        // v7.9.21 — distinguish literal defaults (evaluated once
8850        // at CREATE TABLE) from expression defaults (deferred to
8851        // INSERT). Function calls (`now()`, `current_timestamp`
8852        // — see v7.9.20 keyword promotion) take the runtime path.
8853        // Literals continue to cache. mailrs G4.
8854        if is_runtime_default_expr(&default_expr) {
8855            let display = alloc::format!("{default_expr}");
8856            schema = schema.with_runtime_default(display);
8857        } else {
8858            let raw = literal_expr_to_value(default_expr)?;
8859            let coerced = coerce_value(raw, ty, &c.name, 0)?;
8860            schema = schema.with_default(coerced);
8861        }
8862    }
8863    if c.auto_increment {
8864        // AUTO_INCREMENT only makes sense on integer-shaped columns.
8865        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
8866            return Err(EngineError::Unsupported(alloc::format!(
8867                "AUTO_INCREMENT requires an integer column type, got {ty:?}"
8868            )));
8869        }
8870        schema = schema.with_auto_increment();
8871    }
8872    Ok(schema)
8873}
8874
8875const fn column_type_to_data_type(t: ColumnTypeName) -> DataType {
8876    match t {
8877        ColumnTypeName::SmallInt => DataType::SmallInt,
8878        ColumnTypeName::Int => DataType::Int,
8879        ColumnTypeName::BigInt => DataType::BigInt,
8880        ColumnTypeName::Float => DataType::Float,
8881        ColumnTypeName::Text => DataType::Text,
8882        ColumnTypeName::Varchar(n) => DataType::Varchar(n),
8883        ColumnTypeName::Char(n) => DataType::Char(n),
8884        ColumnTypeName::Bool => DataType::Bool,
8885        ColumnTypeName::Vector { dim, encoding } => DataType::Vector {
8886            dim,
8887            encoding: match encoding {
8888                SqlVecEncoding::F32 => VecEncoding::F32,
8889                SqlVecEncoding::Sq8 => VecEncoding::Sq8,
8890                SqlVecEncoding::F16 => VecEncoding::F16,
8891            },
8892        },
8893        ColumnTypeName::Numeric(precision, scale) => DataType::Numeric { precision, scale },
8894        ColumnTypeName::Date => DataType::Date,
8895        ColumnTypeName::Timestamp => DataType::Timestamp,
8896        ColumnTypeName::Timestamptz => DataType::Timestamptz,
8897        ColumnTypeName::Json => DataType::Json,
8898        ColumnTypeName::Jsonb => DataType::Jsonb,
8899    }
8900}
8901
8902/// Convert an INSERT VALUES expression to a storage Value. Supports literal
8903/// expressions, unary-minus over numeric literals, and pgvector-style
8904/// `'[..]'::vector` cast (v1.2). Anything more complex returns `Unsupported`.
8905fn literal_expr_to_value(expr: Expr) -> Result<Value, EngineError> {
8906    match expr {
8907        Expr::Literal(l) => Ok(literal_to_value(l)),
8908        Expr::Cast { expr, target } => {
8909            let inner_value = literal_expr_to_value(*expr)?;
8910            crate::eval::cast_value(inner_value, target).map_err(EngineError::Eval)
8911        }
8912        Expr::Unary {
8913            op: UnOp::Neg,
8914            expr,
8915        } => match *expr {
8916            Expr::Literal(Literal::Integer(n)) => {
8917                // Fold to i32 if it fits, else BigInt. Parser emits Integer(i64)
8918                // — overflow on negate of i64::MIN is the one edge case.
8919                let neg = n.checked_neg().ok_or_else(|| {
8920                    EngineError::Unsupported("integer literal overflow on negation".into())
8921                })?;
8922                Ok(int_value_for(neg))
8923            }
8924            Expr::Literal(Literal::Float(x)) => Ok(Value::Float(-x)),
8925            other => Err(EngineError::Unsupported(alloc::format!(
8926                "unary minus over non-literal expression: {other:?}"
8927            ))),
8928        },
8929        other => Err(EngineError::Unsupported(alloc::format!(
8930            "non-literal INSERT value expression: {other:?}"
8931        ))),
8932    }
8933}
8934
8935fn literal_to_value(l: Literal) -> Value {
8936    match l {
8937        Literal::Integer(n) => int_value_for(n),
8938        Literal::Float(x) => Value::Float(x),
8939        Literal::String(s) => Value::Text(s),
8940        Literal::Bool(b) => Value::Bool(b),
8941        Literal::Null => Value::Null,
8942        Literal::Vector(v) => Value::Vector(v),
8943        Literal::Interval { months, micros, .. } => Value::Interval { months, micros },
8944    }
8945}
8946
8947/// Pick `Int` (`i32`) when the literal fits, else `BigInt`. `INT` vs `BIGINT`
8948/// columns will still enforce the right tag downstream — this is just the
8949/// default we synthesise from an unannotated integer literal.
8950fn int_value_for(n: i64) -> Value {
8951    if let Ok(small) = i32::try_from(n) {
8952        Value::Int(small)
8953    } else {
8954        Value::BigInt(n)
8955    }
8956}
8957
8958/// Widen / narrow `v` to fit `expected`. Numerics permit safe widening
8959/// (`Int → BigInt`, `Int/BigInt → Float`) and best-effort narrowing
8960/// (`BigInt → Int` succeeds only when the value fits in `i32`). Everything
8961/// else returns `TypeMismatch` carrying the column name for caller diagnostics.
8962/// `NULL` is always permitted; the nullability check happens later in storage.
8963#[allow(clippy::too_many_lines)]
8964fn coerce_value(
8965    v: Value,
8966    expected: DataType,
8967    col_name: &str,
8968    position: usize,
8969) -> Result<Value, EngineError> {
8970    if v.is_null() {
8971        return Ok(Value::Null);
8972    }
8973    let actual = v.data_type().expect("non-null");
8974    if actual == expected {
8975        return Ok(v);
8976    }
8977    let coerced =
8978        match (v, expected) {
8979            (Value::Int(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
8980            (Value::Int(n), DataType::Float) => Some(Value::Float(f64::from(n))),
8981            (Value::Int(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
8982            (Value::Int(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
8983                i128::from(n),
8984                precision,
8985                scale,
8986                col_name,
8987            )?),
8988            (Value::SmallInt(n), DataType::Int) => Some(Value::Int(i32::from(n))),
8989            (Value::SmallInt(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
8990            (Value::SmallInt(n), DataType::Float) => Some(Value::Float(f64::from(n))),
8991            (Value::SmallInt(n), DataType::Numeric { precision, scale }) => Some(
8992                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
8993            ),
8994            (Value::BigInt(n), DataType::Int) => i32::try_from(n).ok().map(Value::Int),
8995            (Value::BigInt(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
8996            #[allow(clippy::cast_precision_loss)]
8997            (Value::BigInt(n), DataType::Float) => Some(Value::Float(n as f64)),
8998            (Value::BigInt(n), DataType::Numeric { precision, scale }) => Some(
8999                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
9000            ),
9001            (Value::Float(x), DataType::Numeric { precision, scale }) => {
9002                Some(numeric_from_float(x, precision, scale, col_name)?)
9003            }
9004            // Text → DATE / TIMESTAMP: parse canonical text forms.
9005            (Value::Text(s), DataType::Date) => {
9006                let d = eval::parse_date_literal(&s).ok_or_else(|| {
9007                    EngineError::Eval(EvalError::TypeMismatch {
9008                        detail: alloc::format!(
9009                            "cannot parse {s:?} as DATE for column `{col_name}`"
9010                        ),
9011                    })
9012                })?;
9013                Some(Value::Date(d))
9014            }
9015            // v4.9: Text ↔ JSON coercion. No structural validation —
9016            // any text literal is accepted; the responsibility for
9017            // valid JSON lies with the producer.
9018            (Value::Text(s), DataType::Json | DataType::Jsonb) => Some(Value::Json(s)),
9019            (Value::Json(s), DataType::Text) => Some(Value::Text(s)),
9020            (Value::Text(s), DataType::Timestamp | DataType::Timestamptz) => {
9021                let t = eval::parse_timestamp_literal(&s).ok_or_else(|| {
9022                    EngineError::Eval(EvalError::TypeMismatch {
9023                        detail: alloc::format!(
9024                            "cannot parse {s:?} as TIMESTAMP for column `{col_name}`"
9025                        ),
9026                    })
9027                })?;
9028                Some(Value::Timestamp(t))
9029            }
9030            // DATE ↔ TIMESTAMP convertibility (DATE → midnight,
9031            // TIMESTAMP → day truncation).
9032            (Value::Date(d), DataType::Timestamp | DataType::Timestamptz) => {
9033                Some(Value::Timestamp(i64::from(d) * 86_400_000_000))
9034            }
9035            // v7.9.21 — Value::Timestamp lands in either Timestamp
9036            // or Timestamptz columns; the on-disk layout is the
9037            // same i64 microseconds UTC.
9038            (Value::Timestamp(t), DataType::Timestamptz) => Some(Value::Timestamp(t)),
9039            (Value::Timestamp(t), DataType::Date) => {
9040                let days = t.div_euclid(86_400_000_000);
9041                i32::try_from(days).ok().map(Value::Date)
9042            }
9043            (
9044                Value::Numeric {
9045                    scaled,
9046                    scale: src_scale,
9047                },
9048                DataType::Numeric { precision, scale },
9049            ) => Some(numeric_rescale(
9050                scaled, src_scale, precision, scale, col_name,
9051            )?),
9052            #[allow(clippy::cast_precision_loss)]
9053            (Value::Numeric { scaled, scale }, DataType::Float) => {
9054                let mut div = 1.0_f64;
9055                for _ in 0..scale {
9056                    div *= 10.0;
9057                }
9058                Some(Value::Float((scaled as f64) / div))
9059            }
9060            (Value::Numeric { scaled, scale }, DataType::Int) => {
9061                let truncated = numeric_truncate_to_integer(scaled, scale);
9062                i32::try_from(truncated).ok().map(Value::Int)
9063            }
9064            (Value::Numeric { scaled, scale }, DataType::BigInt) => {
9065                let truncated = numeric_truncate_to_integer(scaled, scale);
9066                i64::try_from(truncated).ok().map(Value::BigInt)
9067            }
9068            (Value::Numeric { scaled, scale }, DataType::SmallInt) => {
9069                let truncated = numeric_truncate_to_integer(scaled, scale);
9070                i16::try_from(truncated).ok().map(Value::SmallInt)
9071            }
9072            // VARCHAR(n) enforces an upper bound on character count.
9073            (Value::Text(s), DataType::Varchar(max)) => {
9074                if u32::try_from(s.chars().count()).unwrap_or(u32::MAX) <= max {
9075                    Some(Value::Text(s))
9076                } else {
9077                    return Err(EngineError::Unsupported(alloc::format!(
9078                        "value for VARCHAR({max}) column `{col_name}` exceeds length: \
9079                     {} chars",
9080                        s.chars().count()
9081                    )));
9082                }
9083            }
9084            // v6.0.1: f32 → SQ8 INSERT-time quantisation. Triggered
9085            // when the column declares `VECTOR(N) USING SQ8` and
9086            // the INSERT VALUES expression yields a raw f32 vector
9087            // (the normal pgvector-shape literal). Dim mismatch
9088            // falls through the `_ => None` arm and surfaces as
9089            // `TypeMismatch` with the expected SQ8 column type —
9090            // matching the F32 path's existing error.
9091            (
9092                Value::Vector(v),
9093                DataType::Vector {
9094                    dim,
9095                    encoding: VecEncoding::Sq8,
9096                },
9097            ) if v.len() == dim as usize => {
9098                Some(Value::Sq8Vector(spg_storage::quantize::quantize(&v)))
9099            }
9100            // v6.0.3: f32 → f16 INSERT-time conversion for HALF
9101            // columns. Bit-exact at the storage layer (modulo
9102            // half-precision rounding); no rerank pass needed at
9103            // search time.
9104            (
9105                Value::Vector(v),
9106                DataType::Vector {
9107                    dim,
9108                    encoding: VecEncoding::F16,
9109                },
9110            ) if v.len() == dim as usize => Some(Value::HalfVector(
9111                spg_storage::halfvec::HalfVector::from_f32_slice(&v),
9112            )),
9113            // CHAR(n) right-pads with U+0020 to exactly n chars; if the input
9114            // is already longer we reject (PG truncates trailing-space-only;
9115            // staying strict for v1).
9116            (Value::Text(s), DataType::Char(size)) => {
9117                let len = u32::try_from(s.chars().count()).unwrap_or(u32::MAX);
9118                if len > size {
9119                    return Err(EngineError::Unsupported(alloc::format!(
9120                        "value for CHAR({size}) column `{col_name}` exceeds length: \
9121                     {len} chars"
9122                    )));
9123                }
9124                let need = (size - len) as usize;
9125                let mut padded = s;
9126                padded.reserve(need);
9127                for _ in 0..need {
9128                    padded.push(' ');
9129                }
9130                Some(Value::Text(padded))
9131            }
9132            _ => None,
9133        };
9134    coerced.ok_or(EngineError::Storage(StorageError::TypeMismatch {
9135        column: col_name.into(),
9136        expected,
9137        actual,
9138        position,
9139    }))
9140}
9141
9142#[cfg(test)]
9143mod tests {
9144    use super::*;
9145    use alloc::vec;
9146
9147    fn unwrap_command_ok(r: &QueryResult) -> usize {
9148        match r {
9149            QueryResult::CommandOk { affected, .. } => *affected,
9150            QueryResult::Rows { .. } => panic!("expected CommandOk, got Rows"),
9151        }
9152    }
9153
9154    #[test]
9155    fn create_table_registers_schema() {
9156        let mut e = Engine::new();
9157        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT)")
9158            .unwrap();
9159        assert_eq!(e.catalog().table_count(), 1);
9160        let t = e.catalog().get("foo").unwrap();
9161        assert_eq!(t.schema().columns.len(), 2);
9162        assert_eq!(t.schema().columns[0].ty, DataType::Int);
9163        assert!(!t.schema().columns[0].nullable);
9164        assert_eq!(t.schema().columns[1].ty, DataType::Text);
9165    }
9166
9167    #[test]
9168    fn create_table_vector_default_is_f32_encoded() {
9169        let mut e = Engine::new();
9170        e.execute("CREATE TABLE t (v VECTOR(8))").unwrap();
9171        let t = e.catalog().get("t").unwrap();
9172        assert_eq!(
9173            t.schema().columns[0].ty,
9174            DataType::Vector {
9175                dim: 8,
9176                encoding: VecEncoding::F32,
9177            },
9178        );
9179    }
9180
9181    #[test]
9182    fn create_table_vector_using_sq8_succeeds() {
9183        // v6.0.1 step 3: the step-1 fence in `column_def_to_schema`
9184        // is lifted. CREATE TABLE persists an SQ8 column type in
9185        // the catalog; INSERT (next test) quantises raw f32 input.
9186        let mut e = Engine::new();
9187        e.execute("CREATE TABLE t (v VECTOR(8) USING SQ8)").unwrap();
9188        let t = e.catalog().get("t").unwrap();
9189        assert_eq!(
9190            t.schema().columns[0].ty,
9191            DataType::Vector {
9192                dim: 8,
9193                encoding: VecEncoding::Sq8,
9194            },
9195        );
9196    }
9197
9198    #[test]
9199    fn insert_into_sq8_column_quantises_f32_payload() {
9200        // v6.0.1 step 3: INSERT-time `coerce_value` rewrites a raw
9201        // `Value::Vector(Vec<f32>)` literal into the column's
9202        // quantised representation. The row that lands in the
9203        // catalog must therefore hold a `Value::Sq8Vector`, not the
9204        // original f32 buffer — that's the bit that delivers the
9205        // 4× compression target.
9206        let mut e = Engine::new();
9207        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
9208        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
9209            .unwrap();
9210        let t = e.catalog().get("t").unwrap();
9211        assert_eq!(t.rows().len(), 1);
9212        match &t.rows()[0].values[0] {
9213            Value::Sq8Vector(q) => {
9214                assert_eq!(q.bytes.len(), 4);
9215                // min/max are derived from the payload: min=0.0, max=1.0.
9216                assert!((q.min - 0.0).abs() < 1e-6);
9217                assert!((q.max - 1.0).abs() < 1e-6);
9218            }
9219            other => panic!("expected Sq8Vector cell, got {other:?}"),
9220        }
9221    }
9222
9223    #[test]
9224    fn create_table_vector_using_half_succeeds_and_insert_converts_to_f16() {
9225        // v6.0.3: CREATE TABLE accepts USING HALF; INSERT path
9226        // converts the incoming `Value::Vector(Vec<f32>)` cell
9227        // into `Value::HalfVector(HalfVector)` via the new
9228        // `coerce_value` arm. The dequantised round-trip is
9229        // bit-exact for f16-representable values, so 0.0 / 0.25
9230        // / 0.5 / 1.0 hit their grid points exactly.
9231        let mut e = Engine::new();
9232        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
9233            .unwrap();
9234        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
9235            .unwrap();
9236        let t = e.catalog().get("t").unwrap();
9237        assert_eq!(t.rows().len(), 1);
9238        match &t.rows()[0].values[0] {
9239            Value::HalfVector(h) => {
9240                assert_eq!(h.dim(), 4);
9241                let back = h.to_f32_vec();
9242                let expected = alloc::vec![0.0_f32, 0.25, 0.5, 1.0];
9243                for (g, e) in back.iter().zip(expected.iter()) {
9244                    assert!(
9245                        (g - e).abs() < 1e-6,
9246                        "{g} vs {e} should be exact on f16 grid"
9247                    );
9248                }
9249            }
9250            other => panic!("expected HalfVector cell, got {other:?}"),
9251        }
9252    }
9253
9254    #[test]
9255    fn alter_index_rebuild_in_place_succeeds() {
9256        // v6.0.4: bare REBUILD (no encoding switch) walks every
9257        // row again to rebuild the NSW graph. Verifies the engine
9258        // dispatch + storage helper plumbing without changing any
9259        // cell encoding.
9260        let mut e = Engine::new();
9261        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
9262            .unwrap();
9263        for i in 0..8_i32 {
9264            #[allow(clippy::cast_precision_loss)]
9265            let base = (i as f32) * 0.1;
9266            e.execute(&alloc::format!(
9267                "INSERT INTO t VALUES ({i}, [{base}, {b1}, {b2}])",
9268                b1 = base + 0.01,
9269                b2 = base + 0.02,
9270            ))
9271            .unwrap();
9272        }
9273        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
9274        e.execute("ALTER INDEX t_idx REBUILD").unwrap();
9275        // Schema encoding stays F32 (no encoding clause).
9276        assert_eq!(
9277            e.catalog().get("t").unwrap().schema().columns[1].ty,
9278            DataType::Vector {
9279                dim: 3,
9280                encoding: VecEncoding::F32,
9281            },
9282        );
9283    }
9284
9285    #[test]
9286    fn alter_index_rebuild_with_encoding_switches_cell_type() {
9287        // v6.0.4: REBUILD WITH (encoding = SQ8) recodes every
9288        // stored cell from F32 → SQ8 + rebuilds the graph atop the
9289        // new encoding. Post-rebuild, cells must be Sq8Vector and
9290        // the schema must report encoding = Sq8.
9291        let mut e = Engine::new();
9292        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(4) NOT NULL)")
9293            .unwrap();
9294        e.execute("INSERT INTO t VALUES (1, [0.0, 0.25, 0.5, 1.0])")
9295            .unwrap();
9296        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
9297        e.execute("ALTER INDEX t_idx REBUILD WITH (encoding = SQ8)")
9298            .unwrap();
9299        let t = e.catalog().get("t").unwrap();
9300        assert_eq!(
9301            t.schema().columns[1].ty,
9302            DataType::Vector {
9303                dim: 4,
9304                encoding: VecEncoding::Sq8,
9305            },
9306        );
9307        assert!(matches!(t.rows()[0].values[1], Value::Sq8Vector(_)));
9308    }
9309
9310    #[test]
9311    fn alter_index_rebuild_unknown_index_errors() {
9312        let mut e = Engine::new();
9313        let err = e.execute("ALTER INDEX nope REBUILD").unwrap_err();
9314        assert!(
9315            matches!(
9316                &err,
9317                EngineError::Storage(StorageError::IndexNotFound { name }) if name == "nope"
9318            ),
9319            "got: {err}"
9320        );
9321    }
9322
9323    #[test]
9324    fn alter_index_rebuild_on_btree_index_errors() {
9325        // REBUILD on a B-tree index has no semantic meaning in
9326        // v6.0.4 — rejected at the storage layer with `Unsupported`.
9327        let mut e = Engine::new();
9328        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9329        e.execute("INSERT INTO t VALUES (1)").unwrap();
9330        e.execute("CREATE INDEX t_idx ON t (id)").unwrap();
9331        let err = e.execute("ALTER INDEX t_idx REBUILD").unwrap_err();
9332        assert!(
9333            matches!(&err, EngineError::Storage(StorageError::Unsupported(_))),
9334            "got: {err}"
9335        );
9336    }
9337
9338    #[test]
9339    fn prepared_insert_substitutes_placeholders() {
9340        // v6.1.1: prepare() parses once; execute_prepared() walks the
9341        // AST and replaces $1/$2 with the param Values BEFORE the
9342        // dispatch sees them. Same logical result as a simple-query
9343        // INSERT, but parse happens once per *statement*, not per
9344        // execution.
9345        let mut e = Engine::new();
9346        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT NOT NULL)")
9347            .unwrap();
9348        let stmt = e.prepare("INSERT INTO t VALUES ($1, $2)").unwrap();
9349        for (id, name) in [(1, "alice"), (2, "bob"), (3, "carol")] {
9350            e.execute_prepared(
9351                stmt.clone(),
9352                &[Value::Int(id), Value::Text(name.into())],
9353            )
9354            .unwrap();
9355        }
9356        // Read back via simple-query SELECT.
9357        let rows_result = e.execute("SELECT id, name FROM t").unwrap();
9358        let QueryResult::Rows { rows, .. } = rows_result else {
9359            panic!("expected Rows")
9360        };
9361        assert_eq!(rows.len(), 3);
9362    }
9363
9364    #[test]
9365    fn prepared_select_with_placeholder_filters_rows() {
9366        let mut e = Engine::new();
9367        e.execute("CREATE TABLE t (id INT NOT NULL, v INT NOT NULL)")
9368            .unwrap();
9369        for i in 0..10_i32 {
9370            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, {})", i * 7))
9371                .unwrap();
9372        }
9373        let stmt = e
9374            .prepare("SELECT id FROM t WHERE v = $1")
9375            .unwrap();
9376        let QueryResult::Rows { rows, .. } = e
9377            .execute_prepared(stmt, &[Value::Int(35)])
9378            .unwrap()
9379        else {
9380            panic!("expected Rows")
9381        };
9382        // v = 35 means i*7 = 35 → i = 5.
9383        assert_eq!(rows.len(), 1);
9384        assert_eq!(rows[0].values[0], Value::Int(5));
9385    }
9386
9387    #[test]
9388    fn prepared_too_few_params_errors() {
9389        let mut e = Engine::new();
9390        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9391        let stmt = e.prepare("INSERT INTO t VALUES ($1)").unwrap();
9392        let err = e.execute_prepared(stmt, &[]).unwrap_err();
9393        assert!(
9394            matches!(
9395                &err,
9396                EngineError::Eval(EvalError::PlaceholderOutOfRange { n: 1, bound: 0 })
9397            ),
9398            "got: {err}"
9399        );
9400    }
9401
9402    #[test]
9403    fn insert_into_half_column_dim_mismatch_errors() {
9404        let mut e = Engine::new();
9405        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
9406            .unwrap();
9407        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
9408        assert!(matches!(
9409            &err,
9410            EngineError::Storage(StorageError::TypeMismatch { .. })
9411        ));
9412    }
9413
9414    #[test]
9415    fn insert_into_sq8_column_dim_mismatch_errors() {
9416        // Dim mismatch falls through the `coerce_value` Vector→Sq8
9417        // arm's guard and surfaces as `TypeMismatch` — the same
9418        // error the F32 path produces today, so client error
9419        // handling stays uniform across encodings.
9420        let mut e = Engine::new();
9421        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
9422        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
9423        assert!(
9424            matches!(
9425                &err,
9426                EngineError::Storage(StorageError::TypeMismatch { .. })
9427            ),
9428            "got: {err}",
9429        );
9430    }
9431
9432    #[test]
9433    fn create_table_duplicate_errors() {
9434        let mut e = Engine::new();
9435        e.execute("CREATE TABLE foo (a INT)").unwrap();
9436        let err = e.execute("CREATE TABLE foo (a INT)").unwrap_err();
9437        assert!(matches!(
9438            err,
9439            EngineError::Storage(StorageError::DuplicateTable { ref name }) if name == "foo"
9440        ));
9441    }
9442
9443    #[test]
9444    fn insert_into_unknown_table_errors() {
9445        let mut e = Engine::new();
9446        let err = e.execute("INSERT INTO ghost VALUES (1)").unwrap_err();
9447        assert!(matches!(
9448            err,
9449            EngineError::Storage(StorageError::TableNotFound { ref name }) if name == "ghost"
9450        ));
9451    }
9452
9453    #[test]
9454    fn insert_happy_path_reports_one_affected() {
9455        let mut e = Engine::new();
9456        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9457        let r = e.execute("INSERT INTO foo VALUES (42)").unwrap();
9458        assert_eq!(unwrap_command_ok(&r), 1);
9459        assert_eq!(e.catalog().get("foo").unwrap().row_count(), 1);
9460    }
9461
9462    #[test]
9463    fn insert_arity_mismatch_propagates() {
9464        let mut e = Engine::new();
9465        e.execute("CREATE TABLE foo (a INT, b TEXT)").unwrap();
9466        let err = e.execute("INSERT INTO foo VALUES (1)").unwrap_err();
9467        assert!(matches!(
9468            err,
9469            EngineError::Storage(StorageError::ArityMismatch { .. })
9470        ));
9471    }
9472
9473    #[test]
9474    fn insert_negative_integer_via_unary_minus() {
9475        let mut e = Engine::new();
9476        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9477        e.execute("INSERT INTO foo VALUES (-7)").unwrap();
9478        let rows = e.catalog().get("foo").unwrap().rows();
9479        assert_eq!(rows[0].values[0], Value::Int(-7));
9480    }
9481
9482    #[test]
9483    fn insert_non_literal_expr_unsupported() {
9484        let mut e = Engine::new();
9485        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9486        let err = e.execute("INSERT INTO foo VALUES (1 + 2)").unwrap_err();
9487        assert!(matches!(err, EngineError::Unsupported(_)));
9488    }
9489
9490    #[test]
9491    fn select_star_returns_all_rows_in_insertion_order() {
9492        let mut e = Engine::new();
9493        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT NOT NULL)")
9494            .unwrap();
9495        e.execute("INSERT INTO foo VALUES (1, 'one')").unwrap();
9496        e.execute("INSERT INTO foo VALUES (2, 'two')").unwrap();
9497        e.execute("INSERT INTO foo VALUES (3, 'three')").unwrap();
9498
9499        let r = e.execute("SELECT * FROM foo").unwrap();
9500        let QueryResult::Rows { columns, rows } = r else {
9501            panic!("expected Rows")
9502        };
9503        assert_eq!(columns.len(), 2);
9504        assert_eq!(columns[0].name, "a");
9505        assert_eq!(rows.len(), 3);
9506        assert_eq!(
9507            rows[1].values,
9508            vec![Value::Int(2), Value::Text("two".into())]
9509        );
9510    }
9511
9512    #[test]
9513    fn select_star_on_empty_table_returns_zero_rows() {
9514        let mut e = Engine::new();
9515        e.execute("CREATE TABLE foo (a INT)").unwrap();
9516        let r = e.execute("SELECT * FROM foo").unwrap();
9517        match r {
9518            QueryResult::Rows { rows, .. } => assert!(rows.is_empty()),
9519            QueryResult::CommandOk { .. } => panic!("expected Rows"),
9520        }
9521    }
9522
9523    // --- v0.4: WHERE + projection ------------------------------------------
9524
9525    fn make_three_row_users(e: &mut Engine) {
9526        e.execute("CREATE TABLE users (id INT NOT NULL, name TEXT NOT NULL, score INT)")
9527            .unwrap();
9528        e.execute("INSERT INTO users VALUES (1, 'alice', 90)")
9529            .unwrap();
9530        e.execute("INSERT INTO users VALUES (2, 'bob', NULL)")
9531            .unwrap();
9532        e.execute("INSERT INTO users VALUES (3, 'cara', 70)")
9533            .unwrap();
9534    }
9535
9536    fn unwrap_rows(r: QueryResult) -> (Vec<ColumnSchema>, Vec<Row>) {
9537        match r {
9538            QueryResult::Rows { columns, rows } => (columns, rows),
9539            QueryResult::CommandOk { .. } => panic!("expected Rows"),
9540        }
9541    }
9542
9543    #[test]
9544    fn where_filter_passes_only_true_rows() {
9545        let mut e = Engine::new();
9546        make_three_row_users(&mut e);
9547        let r = e.execute("SELECT * FROM users WHERE id > 1").unwrap();
9548        let (_, rows) = unwrap_rows(r);
9549        assert_eq!(rows.len(), 2);
9550        assert_eq!(rows[0].values[0], Value::Int(2));
9551        assert_eq!(rows[1].values[0], Value::Int(3));
9552    }
9553
9554    #[test]
9555    fn where_with_null_result_filters_out_row() {
9556        let mut e = Engine::new();
9557        make_three_row_users(&mut e);
9558        // score is NULL for bob → score > 80 is NULL → row excluded
9559        let r = e.execute("SELECT * FROM users WHERE score > 80").unwrap();
9560        let (_, rows) = unwrap_rows(r);
9561        assert_eq!(rows.len(), 1);
9562        assert_eq!(rows[0].values[1], Value::Text("alice".into()));
9563    }
9564
9565    #[test]
9566    fn projection_named_columns() {
9567        let mut e = Engine::new();
9568        make_three_row_users(&mut e);
9569        let r = e.execute("SELECT name, score FROM users").unwrap();
9570        let (cols, rows) = unwrap_rows(r);
9571        assert_eq!(cols.len(), 2);
9572        assert_eq!(cols[0].name, "name");
9573        assert_eq!(cols[1].name, "score");
9574        assert_eq!(rows.len(), 3);
9575        assert_eq!(
9576            rows[0].values,
9577            vec![Value::Text("alice".into()), Value::Int(90)]
9578        );
9579    }
9580
9581    #[test]
9582    fn projection_with_column_alias() {
9583        let mut e = Engine::new();
9584        make_three_row_users(&mut e);
9585        let r = e
9586            .execute("SELECT name AS who FROM users WHERE id = 1")
9587            .unwrap();
9588        let (cols, rows) = unwrap_rows(r);
9589        assert_eq!(cols[0].name, "who");
9590        assert_eq!(rows.len(), 1);
9591        assert_eq!(rows[0].values[0], Value::Text("alice".into()));
9592    }
9593
9594    #[test]
9595    fn qualified_column_with_table_alias_resolves() {
9596        let mut e = Engine::new();
9597        make_three_row_users(&mut e);
9598        let r = e
9599            .execute("SELECT u.id, u.name FROM users AS u WHERE u.id < 3")
9600            .unwrap();
9601        let (cols, rows) = unwrap_rows(r);
9602        assert_eq!(cols.len(), 2);
9603        assert_eq!(rows.len(), 2);
9604    }
9605
9606    #[test]
9607    fn qualified_column_with_wrong_alias_errors() {
9608        let mut e = Engine::new();
9609        make_three_row_users(&mut e);
9610        let err = e.execute("SELECT x.id FROM users AS u").unwrap_err();
9611        assert!(matches!(
9612            err,
9613            EngineError::Eval(EvalError::UnknownQualifier { ref qualifier }) if qualifier == "x"
9614        ));
9615    }
9616
9617    #[test]
9618    fn select_unknown_column_errors_in_projection() {
9619        let mut e = Engine::new();
9620        make_three_row_users(&mut e);
9621        let err = e.execute("SELECT ghost FROM users").unwrap_err();
9622        assert!(matches!(
9623            err,
9624            EngineError::Eval(EvalError::ColumnNotFound { ref name }) if name == "ghost"
9625        ));
9626    }
9627
9628    #[test]
9629    fn where_unknown_column_errors() {
9630        let mut e = Engine::new();
9631        make_three_row_users(&mut e);
9632        let err = e
9633            .execute("SELECT * FROM users WHERE ghost = 1")
9634            .unwrap_err();
9635        assert!(matches!(
9636            err,
9637            EngineError::Eval(EvalError::ColumnNotFound { .. })
9638        ));
9639    }
9640
9641    #[test]
9642    fn expression_projection_evaluates_and_renders() {
9643        // Compound expressions in the SELECT list are evaluated per row;
9644        // the output column is typed TEXT, name defaults to the expression.
9645        let mut e = Engine::new();
9646        e.execute("CREATE TABLE t (a INT NOT NULL)").unwrap();
9647        e.execute("INSERT INTO t VALUES (3)").unwrap();
9648        let (_, rows) = unwrap_rows(e.execute("SELECT 1 + 2 FROM t").unwrap());
9649        assert_eq!(rows.len(), 1);
9650        // The expression evaluates to integer 3; rendered as the cell value
9651        // (storage::Value::Int(3) since arithmetic kept ints).
9652        assert_eq!(rows[0].values[0], Value::Int(3));
9653    }
9654
9655    #[test]
9656    fn select_unknown_table_errors() {
9657        let mut e = Engine::new();
9658        let err = e.execute("SELECT * FROM ghost").unwrap_err();
9659        assert!(matches!(
9660            err,
9661            EngineError::Storage(StorageError::TableNotFound { .. })
9662        ));
9663    }
9664
9665    #[test]
9666    fn invalid_sql_returns_parse_error() {
9667        // v4.4: UPDATE is now real SQL, so use a true syntactic
9668        // garbage payload for the parse-error path.
9669        let mut e = Engine::new();
9670        let err = e.execute("THIS_IS_NOT_A_KEYWORD foo bar baz").unwrap_err();
9671        assert!(matches!(err, EngineError::Parse(_)));
9672    }
9673
9674    // --- v0.8 CREATE INDEX + index seek ------------------------------------
9675
9676    #[test]
9677    fn create_index_registers_on_table() {
9678        let mut e = Engine::new();
9679        make_three_row_users(&mut e);
9680        e.execute("CREATE INDEX by_name ON users (name)").unwrap();
9681        let t = e.catalog().get("users").unwrap();
9682        assert_eq!(t.indices().len(), 1);
9683        assert_eq!(t.indices()[0].name, "by_name");
9684    }
9685
9686    #[test]
9687    fn create_index_on_unknown_table_errors() {
9688        let mut e = Engine::new();
9689        let err = e.execute("CREATE INDEX i ON ghost (a)").unwrap_err();
9690        assert!(matches!(
9691            err,
9692            EngineError::Storage(StorageError::TableNotFound { .. })
9693        ));
9694    }
9695
9696    #[test]
9697    fn create_index_on_unknown_column_errors() {
9698        let mut e = Engine::new();
9699        make_three_row_users(&mut e);
9700        let err = e.execute("CREATE INDEX i ON users (ghost)").unwrap_err();
9701        assert!(matches!(
9702            err,
9703            EngineError::Storage(StorageError::ColumnNotFound { .. })
9704        ));
9705    }
9706
9707    #[test]
9708    fn select_eq_uses_index_returns_same_rows_as_scan() {
9709        // Build two engines: one with an index, one without. Same query →
9710        // same row set (index is a planner optimisation, not a semantic
9711        // change).
9712        let mut without = Engine::new();
9713        make_three_row_users(&mut without);
9714        let mut with = Engine::new();
9715        make_three_row_users(&mut with);
9716        with.execute("CREATE INDEX by_id ON users (id)").unwrap();
9717
9718        let q = "SELECT * FROM users WHERE id = 2";
9719        let (_, no_idx_rows) = unwrap_rows(without.execute(q).unwrap());
9720        let (_, idx_rows) = unwrap_rows(with.execute(q).unwrap());
9721        assert_eq!(no_idx_rows, idx_rows);
9722        assert_eq!(idx_rows.len(), 1);
9723    }
9724
9725    #[test]
9726    fn select_eq_with_no_matching_index_value_returns_empty() {
9727        let mut e = Engine::new();
9728        make_three_row_users(&mut e);
9729        e.execute("CREATE INDEX by_id ON users (id)").unwrap();
9730        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM users WHERE id = 999").unwrap());
9731        assert_eq!(rows.len(), 0);
9732    }
9733
9734    // --- v0.9 transactions -------------------------------------------------
9735
9736    #[test]
9737    fn begin_sets_in_transaction_flag() {
9738        let mut e = Engine::new();
9739        assert!(!e.in_transaction());
9740        e.execute("BEGIN").unwrap();
9741        assert!(e.in_transaction());
9742    }
9743
9744    #[test]
9745    fn double_begin_errors() {
9746        let mut e = Engine::new();
9747        e.execute("BEGIN").unwrap();
9748        let err = e.execute("BEGIN").unwrap_err();
9749        assert_eq!(err, EngineError::TransactionAlreadyOpen);
9750    }
9751
9752    #[test]
9753    fn commit_without_begin_errors() {
9754        let mut e = Engine::new();
9755        let err = e.execute("COMMIT").unwrap_err();
9756        assert_eq!(err, EngineError::NoActiveTransaction);
9757    }
9758
9759    #[test]
9760    fn rollback_without_begin_errors() {
9761        let mut e = Engine::new();
9762        let err = e.execute("ROLLBACK").unwrap_err();
9763        assert_eq!(err, EngineError::NoActiveTransaction);
9764    }
9765
9766    #[test]
9767    fn commit_applies_shadow_to_committed_catalog() {
9768        let mut e = Engine::new();
9769        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9770        e.execute("BEGIN").unwrap();
9771        e.execute("INSERT INTO t VALUES (1)").unwrap();
9772        e.execute("INSERT INTO t VALUES (2)").unwrap();
9773        e.execute("COMMIT").unwrap();
9774        assert!(!e.in_transaction());
9775        assert_eq!(e.catalog().get("t").unwrap().row_count(), 2);
9776    }
9777
9778    #[test]
9779    fn rollback_discards_shadow() {
9780        let mut e = Engine::new();
9781        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9782        e.execute("BEGIN").unwrap();
9783        e.execute("INSERT INTO t VALUES (1)").unwrap();
9784        e.execute("INSERT INTO t VALUES (2)").unwrap();
9785        e.execute("ROLLBACK").unwrap();
9786        assert!(!e.in_transaction());
9787        assert_eq!(e.catalog().get("t").unwrap().row_count(), 0);
9788    }
9789
9790    #[test]
9791    fn select_during_tx_sees_uncommitted_writes_own_session() {
9792        // The shadow catalog is read by SELECTs while a TX is open — the
9793        // session can see its own pending writes.
9794        let mut e = Engine::new();
9795        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9796        e.execute("BEGIN").unwrap();
9797        e.execute("INSERT INTO t VALUES (42)").unwrap();
9798        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM t").unwrap());
9799        assert_eq!(rows.len(), 1);
9800        assert_eq!(rows[0].values[0], Value::Int(42));
9801    }
9802
9803    #[test]
9804    fn snapshot_with_no_users_is_bare_catalog_format() {
9805        let mut e = Engine::new();
9806        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9807        let bytes = e.snapshot();
9808        assert_eq!(
9809            &bytes[..8],
9810            b"SPGDB001",
9811            "must be the bare v3.x catalog magic"
9812        );
9813        let e2 = Engine::restore_envelope(&bytes).unwrap();
9814        assert!(e2.users().is_empty());
9815        assert_eq!(e2.catalog().table_count(), 1);
9816    }
9817
9818    #[test]
9819    fn snapshot_with_users_round_trips_both_via_envelope() {
9820        let mut e = Engine::new();
9821        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9822        e.create_user("alice", "pw1", Role::Admin, [9; 16]).unwrap();
9823        e.create_user("bob", "pw2", Role::ReadOnly, [5; 16])
9824            .unwrap();
9825        let bytes = e.snapshot();
9826        assert_eq!(&bytes[..8], b"SPGENV01", "must be the v4.1 envelope magic");
9827        let e2 = Engine::restore_envelope(&bytes).unwrap();
9828        assert_eq!(e2.users().len(), 2);
9829        assert_eq!(e2.verify_user("alice", "pw1"), Some(Role::Admin));
9830        assert_eq!(e2.verify_user("bob", "pw2"), Some(Role::ReadOnly));
9831        assert_eq!(e2.verify_user("alice", "wrong"), None);
9832        assert_eq!(e2.catalog().table_count(), 1);
9833    }
9834
9835    #[test]
9836    fn ddl_inside_tx_also_rolled_back() {
9837        let mut e = Engine::new();
9838        e.execute("BEGIN").unwrap();
9839        e.execute("CREATE TABLE t (v INT)").unwrap();
9840        // Visible inside the TX.
9841        e.execute("SELECT * FROM t").unwrap();
9842        e.execute("ROLLBACK").unwrap();
9843        // Gone after rollback.
9844        let err = e.execute("SELECT * FROM t").unwrap_err();
9845        assert!(matches!(
9846            err,
9847            EngineError::Storage(StorageError::TableNotFound { .. })
9848        ));
9849    }
9850
9851    // ── v6.1.2: CREATE / DROP PUBLICATION (engine-side) ──────
9852
9853    #[test]
9854    fn create_publication_lands_in_catalog() {
9855        let mut e = Engine::new();
9856        assert!(e.publications().is_empty());
9857        e.execute("CREATE PUBLICATION pub_a").unwrap();
9858        assert_eq!(e.publications().len(), 1);
9859        assert!(e.publications().contains("pub_a"));
9860    }
9861
9862    #[test]
9863    fn create_publication_duplicate_errors() {
9864        let mut e = Engine::new();
9865        e.execute("CREATE PUBLICATION pub_a").unwrap();
9866        let err = e.execute("CREATE PUBLICATION pub_a").unwrap_err();
9867        assert!(
9868            alloc::format!("{err:?}").contains("DuplicateName"),
9869            "got {err:?}"
9870        );
9871    }
9872
9873    #[test]
9874    fn drop_publication_silent_when_absent() {
9875        let mut e = Engine::new();
9876        // PG-compatible: DROP a publication that doesn't exist
9877        // succeeds (no-op) but reports zero affected.
9878        let r = e.execute("DROP PUBLICATION nope").unwrap();
9879        match r {
9880            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
9881            other => panic!("expected CommandOk, got {other:?}"),
9882        }
9883    }
9884
9885    #[test]
9886    fn drop_publication_present_reports_one_affected() {
9887        let mut e = Engine::new();
9888        e.execute("CREATE PUBLICATION pub_a").unwrap();
9889        let r = e.execute("DROP PUBLICATION pub_a").unwrap();
9890        match r {
9891            QueryResult::CommandOk {
9892                affected,
9893                modified_catalog,
9894            } => {
9895                assert_eq!(affected, 1);
9896                assert!(modified_catalog);
9897            }
9898            other => panic!("expected CommandOk, got {other:?}"),
9899        }
9900        assert!(e.publications().is_empty());
9901    }
9902
9903    #[test]
9904    fn publications_persist_across_snapshot_restore() {
9905        // The persist-across-restart ship-gate at the engine layer —
9906        // snapshot → restore_envelope round trip must preserve the
9907        // publication catalog. The spg-server e2e covers the
9908        // process-restart variant.
9909        let mut e = Engine::new();
9910        e.execute("CREATE PUBLICATION pub_a").unwrap();
9911        e.execute("CREATE PUBLICATION pub_b FOR ALL TABLES").unwrap();
9912        let snap = e.snapshot();
9913        let e2 = Engine::restore_envelope(&snap).unwrap();
9914        assert_eq!(e2.publications().len(), 2);
9915        assert!(e2.publications().contains("pub_a"));
9916        assert!(e2.publications().contains("pub_b"));
9917    }
9918
9919    #[test]
9920    fn create_publication_allowed_inside_transaction() {
9921        // v6.1.4 dropped the v6.1.2 in-TX guard — PG allows
9922        // CREATE PUBLICATION inside a TX and the auto-commit
9923        // wrap path needs the same allowance.
9924        let mut e = Engine::new();
9925        e.execute("BEGIN").unwrap();
9926        e.execute("CREATE PUBLICATION pub_a").unwrap();
9927        e.execute("COMMIT").unwrap();
9928        assert!(e.publications().contains("pub_a"));
9929    }
9930
9931    // ── v6.1.3: SHOW PUBLICATIONS + FOR-list variants ───────
9932
9933    #[test]
9934    fn create_publication_for_table_list_lands_with_scope() {
9935        let mut e = Engine::new();
9936        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
9937        e.execute("CREATE TABLE t2 (id INT NOT NULL)").unwrap();
9938        e.execute("CREATE PUBLICATION pub_a FOR TABLE t1, t2")
9939            .unwrap();
9940        let scope = e.publications().get("pub_a").cloned();
9941        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = scope else {
9942            panic!("expected ForTables scope, got {scope:?}")
9943        };
9944        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
9945    }
9946
9947    #[test]
9948    fn create_publication_all_tables_except_lands_with_scope() {
9949        let mut e = Engine::new();
9950        e.execute("CREATE PUBLICATION pub_a FOR ALL TABLES EXCEPT t3")
9951            .unwrap();
9952        let scope = e.publications().get("pub_a").cloned();
9953        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = scope else {
9954            panic!("expected AllTablesExcept scope, got {scope:?}")
9955        };
9956        assert_eq!(ts, alloc::vec!["t3".to_string()]);
9957    }
9958
9959    #[test]
9960    fn show_publications_empty_returns_zero_rows() {
9961        let e = Engine::new();
9962        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
9963        let QueryResult::Rows { rows, columns } = r else {
9964            panic!()
9965        };
9966        assert!(rows.is_empty());
9967        assert_eq!(columns.len(), 3);
9968        assert_eq!(columns[0].name, "name");
9969        assert_eq!(columns[1].name, "scope");
9970        assert_eq!(columns[2].name, "table_count");
9971    }
9972
9973    #[test]
9974    fn show_publications_returns_one_row_per_publication_ordered_by_name() {
9975        let mut e = Engine::new();
9976        e.execute("CREATE PUBLICATION z_pub").unwrap();
9977        e.execute("CREATE PUBLICATION a_pub FOR TABLE t1, t2")
9978            .unwrap();
9979        e.execute("CREATE PUBLICATION m_pub FOR ALL TABLES EXCEPT bad")
9980            .unwrap();
9981        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
9982        let QueryResult::Rows { rows, .. } = r else {
9983            panic!()
9984        };
9985        assert_eq!(rows.len(), 3);
9986        // Alphabetical order: a_pub, m_pub, z_pub.
9987        let names: Vec<&str> = rows
9988            .iter()
9989            .map(|r| {
9990                if let Value::Text(s) = &r.values[0] {
9991                    s.as_str()
9992                } else {
9993                    panic!()
9994                }
9995            })
9996            .collect();
9997        assert_eq!(names, alloc::vec!["a_pub", "m_pub", "z_pub"]);
9998        // Row 0 — a_pub scope summary + table_count = 2.
9999        match &rows[0].values[1] {
10000            Value::Text(s) => assert_eq!(s, "FOR TABLE t1, t2"),
10001            other => panic!("expected Text, got {other:?}"),
10002        }
10003        assert_eq!(rows[0].values[2], Value::Int(2));
10004        // Row 1 — m_pub.
10005        match &rows[1].values[1] {
10006            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES EXCEPT bad"),
10007            other => panic!("expected Text, got {other:?}"),
10008        }
10009        assert_eq!(rows[1].values[2], Value::Int(1));
10010        // Row 2 — z_pub (AllTables → NULL count).
10011        match &rows[2].values[1] {
10012            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES"),
10013            other => panic!("expected Text, got {other:?}"),
10014        }
10015        assert_eq!(rows[2].values[2], Value::Null);
10016    }
10017
10018    #[test]
10019    fn for_list_scopes_persist_across_snapshot() {
10020        // The v6.1.2 envelope-v3 round-trip exercised AllTables;
10021        // v6.1.3 needs the scope-1 / scope-2 tags to survive too.
10022        let mut e = Engine::new();
10023        e.execute("CREATE PUBLICATION p1 FOR TABLE t1, t2").unwrap();
10024        e.execute("CREATE PUBLICATION p2 FOR ALL TABLES EXCEPT bad, worse")
10025            .unwrap();
10026        let snap = e.snapshot();
10027        let e2 = Engine::restore_envelope(&snap).unwrap();
10028        assert_eq!(e2.publications().len(), 2);
10029        let p1 = e2.publications().get("p1").cloned();
10030        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = p1 else {
10031            panic!("p1 scope lost: {p1:?}")
10032        };
10033        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
10034        let p2 = e2.publications().get("p2").cloned();
10035        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = p2 else {
10036            panic!("p2 scope lost: {p2:?}")
10037        };
10038        assert_eq!(ts, alloc::vec!["bad".to_string(), "worse".to_string()]);
10039    }
10040
10041    // ── v6.1.4: CREATE / DROP SUBSCRIPTION + SHOW + envelope v4 ─
10042
#[test]
fn create_subscription_lands_in_catalog_with_defaults() {
    // A freshly created subscription is stored under its name with the
    // connection string and publication list as written, `enabled` set,
    // and `last_received_pos` at 0.
    let ddl =
        "CREATE SUBSCRIPTION sub_a CONNECTION 'host=127.0.0.1 port=20002' PUBLICATION pub_a";
    let mut engine = Engine::new();
    engine.execute(ddl).unwrap();

    let sub = engine
        .subscriptions()
        .get("sub_a")
        .cloned()
        .expect("present");
    assert_eq!(sub.conn_str, "host=127.0.0.1 port=20002");
    assert_eq!(sub.publications, alloc::vec!["pub_a".to_string()]);
    assert!(sub.enabled);
    assert_eq!(sub.last_received_pos, 0);
}
10056
#[test]
fn create_subscription_duplicate_name_errors() {
    // Re-using a subscription name must fail even when the connection
    // string differs; the error's Debug form names `DuplicateName`.
    let original = "CREATE SUBSCRIPTION s CONNECTION 'host=x' PUBLICATION p";
    let clash = "CREATE SUBSCRIPTION s CONNECTION 'host=y' PUBLICATION p";

    let mut engine = Engine::new();
    engine.execute(original).unwrap();
    let err = engine.execute(clash).unwrap_err();

    let rendered = alloc::format!("{err:?}");
    assert!(rendered.contains("DuplicateName"), "got {err:?}");
}
10070
#[test]
fn drop_subscription_silent_when_absent() {
    // Dropping a subscription that was never created is not an error:
    // the statement succeeds and reports zero affected rows.
    let mut engine = Engine::new();
    let outcome = engine.execute("DROP SUBSCRIPTION never").unwrap();
    let QueryResult::CommandOk { affected, .. } = outcome else {
        panic!("expected CommandOk, got {outcome:?}")
    };
    assert_eq!(affected, 0);
}
10080
#[test]
fn subscription_advance_updates_last_pos_monotone() {
    let mut engine = Engine::new();
    engine
        .execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
        .unwrap();

    // (position reported, position expected afterwards). The middle
    // step is stale: the call still returns true, but the stored
    // position must not move backwards.
    for (reported, expected) in [(100, 100), (50, 100), (200, 200)] {
        assert!(engine.subscription_advance("s", reported));
        let stored = engine.subscriptions().get("s").unwrap().last_received_pos;
        assert_eq!(stored, expected);
    }

    // Advancing an unknown subscription reports failure.
    assert!(!engine.subscription_advance("missing", 1));
}
10094
#[test]
fn show_subscriptions_returns_rows_ordered_by_name() {
    // SHOW SUBSCRIPTIONS yields one five-column row per subscription,
    // sorted by name regardless of creation order.
    let mut e = Engine::new();
    // Created out of alphabetical order on purpose.
    e.execute("CREATE SUBSCRIPTION z_sub CONNECTION 'h=x' PUBLICATION p1, p2")
        .unwrap();
    e.execute("CREATE SUBSCRIPTION a_sub CONNECTION 'h=y' PUBLICATION p3")
        .unwrap();
    let r = e.execute_readonly("SHOW SUBSCRIPTIONS").unwrap();
    // Report the unexpected result instead of a bare "explicit panic",
    // matching the diagnostics the sibling tests emit.
    let QueryResult::Rows { rows, columns } = r else {
        panic!("expected Rows, got {r:?}")
    };
    assert_eq!(rows.len(), 2);
    assert_eq!(columns.len(), 5);
    assert_eq!(columns[0].name, "name");
    assert_eq!(columns[4].name, "last_received_pos");
    // Alphabetical: a_sub, z_sub.
    let names: Vec<&str> = rows
        .iter()
        .map(|row| match &row.values[0] {
            Value::Text(s) => s.as_str(),
            other => panic!("expected Text in name column, got {other:?}"),
        })
        .collect();
    assert_eq!(names, alloc::vec!["a_sub", "z_sub"]);
    // Row 0: a_sub
    assert_eq!(rows[0].values[1], Value::Text("h=y".to_string()));
    assert_eq!(rows[0].values[2], Value::Text("p3".to_string()));
    assert_eq!(rows[0].values[3], Value::Bool(true));
    assert_eq!(rows[0].values[4], Value::BigInt(0));
    // Row 1: z_sub — publications join with ", "
    assert_eq!(rows[1].values[2], Value::Text("p1, p2".to_string()));
}
10130
#[test]
fn subscriptions_persist_across_snapshot_envelope_v4() {
    // Two subscriptions, one of which has advanced to position 42,
    // must both survive snapshot → restore with their fields intact.
    let mut origin = Engine::new();
    for ddl in [
        "CREATE SUBSCRIPTION s1 CONNECTION 'h=A' PUBLICATION p1, p2",
        "CREATE SUBSCRIPTION s2 CONNECTION 'h=B' PUBLICATION p3",
    ] {
        origin.execute(ddl).unwrap();
    }
    origin.subscription_advance("s2", 42);

    let envelope = origin.snapshot();
    let restored = Engine::restore_envelope(&envelope).unwrap();
    assert_eq!(restored.subscriptions().len(), 2);

    // s1 was never advanced, so its position stays at the default.
    let untouched = restored.subscriptions().get("s1").unwrap();
    assert_eq!(untouched.conn_str, "h=A");
    assert_eq!(
        untouched.publications,
        alloc::vec!["p1".to_string(), "p2".to_string()]
    );
    assert_eq!(untouched.last_received_pos, 0);

    // s2 keeps the position recorded before the snapshot.
    let advanced = restored.subscriptions().get("s2").unwrap();
    assert_eq!(advanced.last_received_pos, 42);
}
10149
#[test]
fn v3_envelope_loads_with_empty_subscriptions() {
    // Back-compat: a v3 snapshot carries publications but no
    // subscription trailer. Forge one by hand and check that the
    // current reader accepts it, reporting no subscriptions while the
    // publication is still present.
    let mut origin = Engine::new();
    origin.execute("CREATE PUBLICATION pub_legacy").unwrap();

    let catalog = origin.catalog.serialize();
    let users = crate::users::serialize_users(&origin.users);
    let pubs = origin.publications.serialize();
    let sections: [&[u8]; 3] = [&catalog, &users, &pubs];

    // Layout written here: magic, version byte, then each section as a
    // little-endian u32 length followed by its bytes, and finally a
    // CRC32 over everything written so far.
    let mut buf = Vec::new();
    buf.extend_from_slice(b"SPGENV01");
    buf.push(3u8); // v3
    for section in sections {
        let len = u32::try_from(section.len()).unwrap();
        buf.extend_from_slice(&len.to_le_bytes());
        buf.extend_from_slice(section);
    }
    let crc = spg_crypto::crc32::crc32(&buf);
    buf.extend_from_slice(&crc.to_le_bytes());

    let restored =
        Engine::restore_envelope(&buf).expect("v3 envelope restores under v4 reader");
    assert!(restored.subscriptions().is_empty());
    assert!(restored.publications().contains("pub_legacy"));
}
10176
#[test]
fn create_subscription_allowed_inside_transaction() {
    // CREATE SUBSCRIPTION between BEGIN and COMMIT must succeed and be
    // visible once the transaction commits.
    let mut engine = Engine::new();
    for stmt in [
        "BEGIN",
        "CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p",
        "COMMIT",
    ] {
        engine.execute(stmt).unwrap();
    }
    assert!(engine.subscriptions().contains("s"));
}
10186
// ── v6.2.0: ANALYZE + spg_statistic + envelope v5 ──────────

#[test]
fn analyze_populates_histogram_bounds() {
    // ANALYZE over 50 distinct, non-null ids must record a histogram
    // spanning the smallest and largest value, a zero null fraction,
    // and one distinct value per row.
    let mut e = Engine::new();
    e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT)").unwrap();
    for i in 0..50 {
        e.execute(&alloc::format!(
            "INSERT INTO t VALUES ({i}, 'name{i}')"
        ))
        .unwrap();
    }
    e.execute("ANALYZE t").unwrap();
    let stats = e.statistics();
    let id_stats = stats.get("t", "id").unwrap();
    // Bounds are compared as text: first is the minimum id, last the maximum.
    assert!(id_stats.histogram_bounds.len() >= 2);
    assert_eq!(id_stats.histogram_bounds.first().unwrap(), "0");
    assert_eq!(id_stats.histogram_bounds.last().unwrap(), "49");
    // Float comparison with a tolerance rather than exact equality.
    assert!((id_stats.null_frac - 0.0).abs() < 1e-6);
    assert_eq!(id_stats.n_distinct, 50);
}
10209
#[test]
fn reanalyze_overwrites_prior_stats() {
    // A second ANALYZE must replace the first run's numbers rather
    // than keep them: n_distinct follows the table as it grows.
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();

    // (ids inserted in this round, n_distinct expected after ANALYZE)
    for (batch, expected) in [(0..10, 10), (10..30, 30)] {
        for id in batch {
            let insert = alloc::format!("INSERT INTO t VALUES ({id})");
            engine.execute(&insert).unwrap();
        }
        engine.execute("ANALYZE t").unwrap();
        let seen = engine.statistics().get("t", "id").unwrap().n_distinct;
        assert_eq!(seen, expected);
    }
}
10227
#[test]
fn analyze_unknown_table_errors() {
    // ANALYZE on a table that was never created must fail with the
    // storage layer's TableNotFound error.
    let mut engine = Engine::new();
    let outcome = engine.execute("ANALYZE nonexistent");
    let err = outcome.unwrap_err();
    assert!(matches!(
        err,
        EngineError::Storage(StorageError::TableNotFound { .. })
    ));
}
10234
#[test]
fn bare_analyze_covers_all_user_tables() {
    // ANALYZE without a table name must process every user table,
    // report the number of tables as `affected`, and flag the catalog
    // as modified.
    let mut engine = Engine::new();
    for stmt in [
        "CREATE TABLE t1 (id INT NOT NULL)",
        "CREATE TABLE t2 (name TEXT NOT NULL)",
        "INSERT INTO t1 VALUES (1)",
        "INSERT INTO t2 VALUES ('alice')",
    ] {
        engine.execute(stmt).unwrap();
    }

    let outcome = engine.execute("ANALYZE").unwrap();
    let QueryResult::CommandOk { affected, modified_catalog } = outcome else {
        panic!("expected CommandOk, got {outcome:?}")
    };
    assert_eq!(affected, 2);
    assert!(modified_catalog);

    // Each table's single column now has statistics.
    for (table, column) in [("t1", "id"), ("t2", "name")] {
        assert!(engine.statistics().get(table, column).is_some());
    }
}
10253
#[test]
fn select_from_spg_statistic_returns_rows_per_column() {
    // After ANALYZE, the spg_statistic virtual table exposes one row
    // per analysed column, ordered by (table_name, column_name).
    let mut e = Engine::new();
    e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
        .unwrap();
    e.execute("INSERT INTO t VALUES (1, 'a')").unwrap();
    e.execute("INSERT INTO t VALUES (2, 'b')").unwrap();
    e.execute("ANALYZE t").unwrap();
    let r = e.execute_readonly("SELECT * FROM spg_statistic").unwrap();
    // Report the unexpected result instead of a bare "explicit panic",
    // matching the diagnostics the sibling tests emit.
    let QueryResult::Rows { rows, columns } = r else {
        panic!("expected Rows, got {r:?}")
    };
    // v6.7.0 — spg_statistic gained a `cold_row_count` column.
    assert_eq!(columns.len(), 6);
    assert_eq!(columns[0].name, "table_name");
    assert_eq!(columns[4].name, "histogram_bounds");
    assert_eq!(columns[5].name, "cold_row_count");
    assert_eq!(rows.len(), 2, "one row per column of t");
    // Sorted by (table_name, column_name).
    match (&rows[0].values[0], &rows[0].values[1]) {
        (Value::Text(t), Value::Text(c)) => {
            assert_eq!(t, "t");
            // BTreeMap orders (table, column); columns "id" < "label".
            assert_eq!(c, "id");
        }
        other => panic!("expected (Text, Text) in the first two columns, got {other:?}"),
    }
}
10282
#[test]
fn analyze_skips_vector_columns() {
    // Vector columns have their own stats shape (HNSW graph), so
    // ANALYZE leaves them out of spg_statistic while still covering
    // the scalar columns of the same table.
    let mut engine = Engine::new();
    for stmt in [
        "CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)",
        "INSERT INTO t VALUES (1, [1, 2, 3])",
        "ANALYZE t",
    ] {
        engine.execute(stmt).unwrap();
    }

    let scalar = engine.statistics().get("t", "id");
    assert!(scalar.is_some());
    let vector = engine.statistics().get("t", "v");
    assert!(vector.is_none());
}
10295
#[test]
fn statistics_persist_across_envelope_v5_round_trip() {
    // Statistics gathered by ANALYZE must be part of the snapshot:
    // the restored engine reports the same n_distinct without a
    // fresh ANALYZE.
    let mut origin = Engine::new();
    origin.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    (0..20).for_each(|id| {
        let insert = alloc::format!("INSERT INTO t VALUES ({id})");
        origin.execute(&insert).unwrap();
    });
    origin.execute("ANALYZE").unwrap();

    let envelope = origin.snapshot();
    let restored = Engine::restore_envelope(&envelope).unwrap();
    let id_stats = restored.statistics().get("t", "id").unwrap();
    assert_eq!(id_stats.n_distinct, 20);
}
10309
10310    // ── v6.2.1 auto-analyze threshold ───────────────────────────
10311
#[test]
fn auto_analyze_threshold_fires_after_10pct_of_min_rows_on_small_table() {
    // Starting from an empty table, ten inserts give modified=10 and
    // row_count=10. The threshold is 0.1 × max(10, 100) = 10, so the
    // table must not be flagged after nine inserts and must be flagged
    // after the tenth.
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();

    for id in 0..10 {
        if id == 9 {
            // Nine rows are in; check just before the tenth goes in.
            assert!(engine.tables_needing_analyze().is_empty(), "9 < threshold");
        }
        let insert = alloc::format!("INSERT INTO t VALUES ({id})");
        engine.execute(&insert).unwrap();
    }

    let needs = engine.tables_needing_analyze();
    assert_eq!(needs, alloc::vec!["t".to_string()]);
}
10327
#[test]
fn auto_analyze_threshold_uses_10pct_of_row_count_for_large_tables() {
    // Inserts one row per id in `ids` into table `t`.
    fn insert_ids(engine: &mut Engine, ids: core::ops::Range<i32>) {
        for id in ids {
            let insert = alloc::format!("INSERT INTO t VALUES ({id})");
            engine.execute(&insert).unwrap();
        }
    }

    // After ANALYZE on 1000 rows the threshold is 0.1 × row_count.
    // Every INSERT bumps both the modification counter and row_count,
    // so starting from N=1000 the trigger needs M ≥ 0.1 × (1000 + M),
    // i.e. M ≥ 112. Fifty inserts stay below that; 150 more bring the
    // total to 200 modifications against a threshold of 120.
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    insert_ids(&mut engine, 0..1000);
    engine.execute("ANALYZE t").unwrap();
    assert!(engine.tables_needing_analyze().is_empty(), "fresh ANALYZE");

    insert_ids(&mut engine, 1000..1050);
    assert!(
        engine.tables_needing_analyze().is_empty(),
        "50 inserts < threshold of ~105"
    );

    insert_ids(&mut engine, 1050..1200);
    assert_eq!(
        engine.tables_needing_analyze(),
        alloc::vec!["t".to_string()],
        "200 inserts > 0.1 × 1200 threshold"
    );
}
10358
#[test]
fn auto_analyze_threshold_resets_after_analyze() {
    // 200 inserts into a fresh table flag it for analysis; running
    // ANALYZE must clear that flag again.
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    (0..200).for_each(|id| {
        let insert = alloc::format!("INSERT INTO t VALUES ({id})");
        engine.execute(&insert).unwrap();
    });

    let pending_before = engine.tables_needing_analyze();
    assert!(!pending_before.is_empty());

    engine.execute("ANALYZE").unwrap();
    assert!(
        engine.tables_needing_analyze().is_empty(),
        "ANALYZE must reset the counter"
    );
}
10373
#[test]
fn auto_analyze_threshold_tracks_updates_and_deletes() {
    // INSERT is not the only statement that counts as a modification.
    // After a fresh ANALYZE on 50 rows, an UPDATE of 20 rows plus a
    // DELETE of 5 gives 25 modifications, which exceeds the small-table
    // threshold of 10 (0.1 × the 100-row floor), so the table must be
    // flagged.
    let mut engine = Engine::new();
    engine
        .execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
        .unwrap();
    for id in 0..50 {
        let insert = alloc::format!("INSERT INTO t VALUES ({id}, 'x')");
        engine.execute(&insert).unwrap();
    }

    for stmt in [
        "ANALYZE t",
        "UPDATE t SET label = 'y' WHERE id < 20",
        "DELETE FROM t WHERE id >= 45",
    ] {
        engine.execute(stmt).unwrap();
    }

    let pending = engine.tables_needing_analyze();
    assert_eq!(pending, alloc::vec!["t".to_string()]);
}
10392
#[test]
fn v4_envelope_loads_with_empty_statistics() {
    // Back-compat: a v4 envelope ends with the subscriptions trailer
    // and carries no statistics. Forge one by hand; a v6.2.0 reader
    // must accept it and surface an empty Statistics.
    let mut origin = Engine::new();
    origin
        .create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
        .unwrap();

    let catalog = origin.catalog.serialize();
    let users = crate::users::serialize_users(&origin.users);
    let pubs = origin.publications.serialize();
    let subs = origin.subscriptions.serialize();
    let sections: [&[u8]; 4] = [&catalog, &users, &pubs, &subs];

    // Layout written here: magic, version byte, then each section as a
    // little-endian u32 length followed by its bytes, and finally a
    // CRC32 over everything written so far.
    let mut buf = Vec::new();
    buf.extend_from_slice(b"SPGENV01");
    buf.push(4u8);
    for section in sections {
        let len = u32::try_from(section.len()).unwrap();
        buf.extend_from_slice(&len.to_le_bytes());
        buf.extend_from_slice(section);
    }
    let crc = spg_crypto::crc32::crc32(&buf);
    buf.extend_from_slice(&crc.to_le_bytes());

    let restored = Engine::restore_envelope(&buf).expect("v4 envelope restores");
    assert!(restored.statistics().is_empty());
}
10421
#[test]
fn v1_v2_envelope_loads_with_empty_publications() {
    // A snapshot taken before v6.1.2 (envelope v2, no publication
    // trailer) must still deserialise, and the resulting engine must
    // report zero publications. The v2 envelope is forged by hand to
    // lock the back-compat path.
    let mut origin = Engine::new();
    // Force users to be non-empty so the snapshot takes the envelope
    // path rather than the bare-catalog fallback.
    origin
        .create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
        .unwrap();

    // Envelope v2: same shape as v3 but without the pubs trailer.
    let catalog = origin.catalog.serialize();
    let users = crate::users::serialize_users(&origin.users);
    let sections: [&[u8]; 2] = [&catalog, &users];

    let mut buf = Vec::new();
    buf.extend_from_slice(b"SPGENV01");
    buf.push(2u8); // v2
    for section in sections {
        let len = u32::try_from(section.len()).unwrap();
        buf.extend_from_slice(&len.to_le_bytes());
        buf.extend_from_slice(section);
    }
    let crc = spg_crypto::crc32::crc32(&buf);
    buf.extend_from_slice(&crc.to_le_bytes());

    let restored = Engine::restore_envelope(&buf).expect("v2 envelope restores");
    assert!(restored.publications().is_empty());
}
10461}