Skip to main content

spg_engine/
lib.rs

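//! SPG execution engine — wires the SQL front-end to the in-memory
//! storage layer. The original v0.3 scope was `CREATE TABLE`, single-row
//! `INSERT VALUES`, and `SELECT * FROM <table>` (no WHERE yet — that was
//! planned for v0.4 alongside expression evaluation against rows). Items
//! below carry their own `vN.N` notes for later additions (transactions,
//! users, publications, subscriptions, statistics, triggers, …).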
5#![no_std]
6
7extern crate alloc;
8
9pub mod aggregate;
10pub mod describe;
11pub mod eval;
12pub mod fts;
13pub mod json;
14pub mod memoize;
15pub mod plan_cache;
16pub mod publications;
17pub mod query_stats;
18pub mod reorder;
19pub mod selectivity;
20pub mod statistics;
21pub mod subscriptions;
22pub mod triggers;
23pub mod users;
24
25pub use crate::users::{Role, ScramSecrets, UserError, UserStore};
26
27use alloc::borrow::Cow;
28use alloc::boxed::Box;
29use alloc::collections::BTreeMap;
30use alloc::string::{String, ToString};
31use alloc::vec::Vec;
32use core::fmt;
33
34use spg_sql::ast::{
35    BinOp, ColumnDef, ColumnName, ColumnTypeName, CreateIndexStatement, CreatePublicationStatement,
36    CreateSubscriptionStatement, CreateTableStatement, CreateUserStatement, Expr, FrameBound,
37    FrameKind, FromClause, IndexMethod, InsertStatement, JoinKind, Literal, OrderBy, SelectItem,
38    SelectStatement, Statement, TableRef, UnOp, UnionKind, VecEncoding as SqlVecEncoding,
39    WindowFrame,
40};
41use spg_sql::parser::{self, ParseError};
42use spg_storage::{
43    Catalog, ColumnSchema, CompactReport, DataType, IndexKey, IndexKind, Row, StorageError, Table,
44    TableSchema, Value, VecEncoding,
45};
46
47use crate::eval::{EvalContext, EvalError};
48
/// Result of executing one statement.
///
/// `#[non_exhaustive]`: matches outside this crate need a wildcard arm.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum QueryResult {
    /// DDL or DML succeeded.
    ///
    /// `affected` is the row count for `INSERT` and 0 elsewhere.
    /// `modified_catalog` tells the server whether this statement
    /// caused the *committed* catalog to change — it's the signal to
    /// snapshot/audit. False for `BEGIN`/`ROLLBACK`, false for writeful
    /// statements executed inside a transaction (those only touch the
    /// shadow), and true for `COMMIT` and for writes outside a TX.
    CommandOk {
        /// Number of rows affected (see the variant docs).
        affected: usize,
        /// Whether the committed catalog changed (see the variant docs).
        modified_catalog: bool,
    },
    /// `SELECT` returned a (possibly empty) row set.
    Rows {
        /// Schema of each result column.
        columns: Vec<ColumnSchema>,
        /// The result rows; may be empty.
        rows: Vec<Row>,
    },
}
71
/// All errors the engine can return.
///
/// Marked `#[non_exhaustive]` from v7.5.0 onward: external `match`
/// must include a `_` arm so new variants in subsequent v7.x releases
/// are not breaking changes.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum EngineError {
    /// Wraps a [`ParseError`] from `spg_sql::parser` (see the `From` impl).
    Parse(ParseError),
    /// Wraps a [`StorageError`] from `spg_storage` (see the `From` impl).
    Storage(StorageError),
    /// Wraps an [`EvalError`] from `crate::eval` (see the `From` impl).
    Eval(EvalError),
    /// Front-end accepted a construct that the v0.x executor doesn't support.
    Unsupported(String),
    /// `BEGIN` while another transaction is already open.
    TransactionAlreadyOpen,
    /// `COMMIT` / `ROLLBACK` with no active transaction.
    NoActiveTransaction,
    /// v4.0 sentinel: `execute_readonly` got a statement that
    /// mutates engine state (INSERT / CREATE / BEGIN / COMMIT / …).
    /// The caller should retake the write lock and dispatch through
    /// `execute(&mut self)` instead.
    WriteRequired,
    /// v4.2: a SELECT would have returned more rows than the
    /// configured `max_query_rows` cap. Carries the cap.
    RowLimitExceeded(usize),
    /// v4.5: cooperative cancellation — the host (server's
    /// per-query watchdog) set the cancel flag while a long-running
    /// SELECT / UPDATE / DELETE was scanning rows. The partial work
    /// is discarded; the caller should surface this as a timeout
    /// to the client.
    Cancelled,
}
104
105impl fmt::Display for EngineError {
106    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
107        match self {
108            Self::Parse(e) => write!(f, "parse: {e}"),
109            Self::Storage(e) => write!(f, "storage: {e}"),
110            Self::Eval(e) => write!(f, "eval: {e}"),
111            Self::Unsupported(s) => write!(f, "unsupported: {s}"),
112            Self::TransactionAlreadyOpen => f.write_str("a transaction is already open"),
113            Self::NoActiveTransaction => f.write_str("no active transaction"),
114            Self::WriteRequired => {
115                f.write_str("statement requires a write lock (use execute, not execute_readonly)")
116            }
117            Self::RowLimitExceeded(n) => {
118                write!(f, "query exceeded max_query_rows={n}")
119            }
120            Self::Cancelled => f.write_str("query cancelled (timeout or client request)"),
121        }
122    }
123}
124
125impl From<ParseError> for EngineError {
126    fn from(e: ParseError) -> Self {
127        Self::Parse(e)
128    }
129}
130impl From<StorageError> for EngineError {
131    fn from(e: StorageError) -> Self {
132        Self::Storage(e)
133    }
134}
135impl From<EvalError> for EngineError {
136    fn from(e: EvalError) -> Self {
137        Self::Eval(e)
138    }
139}
140
// NOTE: the paragraph below describes `Engine` (the struct declared further
// down in this file), not `ClockFn`. It is kept as a plain comment so it no
// longer renders as part of the `ClockFn` rustdoc.
//
// The execution engine. Holds the catalog and (later) other server-scope
// state. `Engine::new()` is intentionally cheap so callers can construct one
// per database, per test.

/// Function pointer that returns "now" as microseconds since Unix
/// epoch. The engine is `no_std`, so it can't reach for `std::time`
/// itself — callers (`spg-server`, the sqllogictest runner) inject a
/// concrete implementation. `None` (no clock installed in the engine's
/// `Option<ClockFn>` slot) means `NOW()` / `CURRENT_*` raise
/// `Unsupported`.
pub type ClockFn = fn() -> i64;
150
/// Function pointer that produces 16 cryptographically random bytes.
/// Like `ClockFn`, the engine is `no_std` and can't reach for /dev/urandom
/// itself — host (`spg-server`) injects an OS-backed source. `None`
/// (no function installed in the engine's `Option<SaltFn>` slot)
/// means SQL-driven `CREATE USER` falls back to a deterministic salt
/// derived from the username (acceptable in tests; the server always
/// installs a real RNG so production paths never see this).
pub type SaltFn = fn() -> [u8; 16];
158
159/// v4.5 cooperative cancellation token. A long-running SELECT /
160/// UPDATE / DELETE checks `is_cancelled` at row-loop checkpoints
161/// and bails with `EngineError::Cancelled`. The host
162/// (`spg-server`) creates an `AtomicBool` per query, spawns a
163/// watchdog thread that sets it after `SPG_QUERY_TIMEOUT_MS`,
164/// and passes it via `execute_with_cancel` / `execute_readonly_with_cancel`.
165///
166/// `CancelToken::none()` is a no-op — used by the legacy `execute`
167/// and `execute_readonly` entry points so existing callers don't
168/// change.
169#[derive(Debug, Clone, Copy)]
170pub struct CancelToken<'a> {
171    flag: Option<&'a core::sync::atomic::AtomicBool>,
172}
173
174impl<'a> CancelToken<'a> {
175    #[must_use]
176    pub const fn none() -> Self {
177        Self { flag: None }
178    }
179
180    #[must_use]
181    pub const fn from_flag(f: &'a core::sync::atomic::AtomicBool) -> Self {
182        Self { flag: Some(f) }
183    }
184
185    #[must_use]
186    pub fn is_cancelled(self) -> bool {
187        self.flag
188            .is_some_and(|f| f.load(core::sync::atomic::Ordering::Relaxed))
189    }
190
191    /// Returns `Err(Cancelled)` if the token has been tripped.
192    /// Used at row-loop checkpoints to bail cooperatively without
193    /// scattering raw `is_cancelled` checks across the executor.
194    #[inline]
195    pub fn check(self) -> Result<(), EngineError> {
196        if self.is_cancelled() {
197            Err(EngineError::Cancelled)
198        } else {
199            Ok(())
200        }
201    }
202}
203
// ---- snapshot envelope (v4.1; extended with CRC32 in v4.37 [v2],   ----
// ----   publications in v6.1.2 [v3], subscriptions in v6.1.4 [v4],  ----
// ----   statistics in v6.2.0 [v5])                                  ----
206//
207// Wraps a catalog blob + a user blob behind a small header so the
208// server can persist both atomically without inventing a new file.
209// Bare catalog blobs (v3.x) still load via `restore_envelope` since
210// the magic check fails fast and the function falls back to
211// `Catalog::deserialize`.
212//
213// Layout — v1 (v4.1, no CRC):
214//   [8 bytes magic "SPGENV01"]
215//   [u8 version = 1]
216//   [u32 catalog_len][catalog bytes]
217//   [u32 users_len][users bytes]
218//
219// Layout — v2 (v4.37, CRC32 of body):
220//   [8 bytes magic "SPGENV01"]
221//   [u8 version = 2]
222//   [u32 catalog_len][catalog bytes]
223//   [u32 users_len][users bytes]
224//   [u32 crc32]                      ← CRC32 of every byte before it.
225//
226// Layout — v3 (v6.1.2, publications trailer):
227//   [8 bytes magic "SPGENV01"]
228//   [u8 version = 3]
229//   [u32 catalog_len][catalog bytes]
230//   [u32 users_len][users bytes]
231//   [u32 pubs_len][publications bytes]
232//   [u32 crc32]
233//
234// Layout — v4 (v6.1.4, subscriptions trailer):
235//   [8 bytes magic "SPGENV01"]
236//   [u8 version = 4]
237//   [u32 catalog_len][catalog bytes]
238//   [u32 users_len][users bytes]
239//   [u32 pubs_len][publications bytes]
240//   [u32 subs_len][subscriptions bytes]
241//   [u32 crc32]
242//
243// Layout — v5 (v6.2.0, statistics trailer):
244//   [8 bytes magic "SPGENV01"]
245//   [u8 version = 5]
246//   [u32 catalog_len][catalog bytes]
247//   [u32 users_len][users bytes]
248//   [u32 pubs_len][publications bytes]
249//   [u32 subs_len][subscriptions bytes]
250//   [u32 stats_len][statistics bytes]      ← NEW
251//   [u32 crc32]
252//
253// Writers emit v5 from v6.2.0 on. Readers accept all of {v1, v2,
254// v3, v4, v5}: v1/v2 load with empty publications / subscriptions /
255// statistics; v3 loads with empty subscriptions + statistics; v4
256// loads with empty statistics; v5 deserialises all three. Older
257// SPG versions reading a v5 envelope fall through the version
258// match to `EnvelopeParse::Bare` — pre-v6.2.0 binaries cannot
259// open v6.2.0+ snapshots (matches the v6.1.2 / v6.1.4 breaks).
260
// Eight-byte tag at offset 0 of every envelope; `split_envelope` treats a
// buffer without it as a bare catalog.
const ENVELOPE_MAGIC: &[u8; 8] = b"SPGENV01";
// Version byte stored right after the magic (offset 8).
// v1: catalog + users, no CRC.
const ENVELOPE_VERSION_V1: u8 = 1;
// v2: v1 plus a trailing CRC32.
const ENVELOPE_VERSION_V2: u8 = 2;
// v3: adds the publications section.
const ENVELOPE_VERSION_V3: u8 = 3;
// v4: adds the subscriptions section.
const ENVELOPE_VERSION_V4: u8 = 4;
// v5: adds the statistics section; the version `build_envelope` writes.
const ENVELOPE_VERSION_V5: u8 = 5;
267
268fn build_envelope(catalog: &[u8], users: &[u8], pubs: &[u8], subs: &[u8], stats: &[u8]) -> Vec<u8> {
269    let mut out = Vec::with_capacity(
270        8 + 1
271            + 4
272            + catalog.len()
273            + 4
274            + users.len()
275            + 4
276            + pubs.len()
277            + 4
278            + subs.len()
279            + 4
280            + stats.len()
281            + 4,
282    );
283    out.extend_from_slice(ENVELOPE_MAGIC);
284    out.push(ENVELOPE_VERSION_V5);
285    out.extend_from_slice(
286        &u32::try_from(catalog.len())
287            .expect("≤ 4G catalog")
288            .to_le_bytes(),
289    );
290    out.extend_from_slice(catalog);
291    out.extend_from_slice(
292        &u32::try_from(users.len())
293            .expect("≤ 4G users")
294            .to_le_bytes(),
295    );
296    out.extend_from_slice(users);
297    out.extend_from_slice(
298        &u32::try_from(pubs.len())
299            .expect("≤ 4G publications")
300            .to_le_bytes(),
301    );
302    out.extend_from_slice(pubs);
303    out.extend_from_slice(
304        &u32::try_from(subs.len())
305            .expect("≤ 4G subscriptions")
306            .to_le_bytes(),
307    );
308    out.extend_from_slice(subs);
309    out.extend_from_slice(
310        &u32::try_from(stats.len())
311            .expect("≤ 4G statistics")
312            .to_le_bytes(),
313    );
314    out.extend_from_slice(stats);
315    let crc = spg_crypto::crc32::crc32(&out);
316    out.extend_from_slice(&crc.to_le_bytes());
317    out
318}
319
/// Outcome of envelope parsing: either bare-catalog fallback, the
/// successfully split sections of a v1–v5 envelope, or an explicit
/// corruption error from a CRC mismatch (v2 and later carry a CRC).
/// `Bare` (catalog-only fallback) preserves v3.x readability.
///
/// Optional sections are `None` when the envelope version predates
/// them: `publications` is `Some` from v3, `subscriptions` from v4,
/// and `statistics` only for v5.
enum EnvelopeParse<'a> {
    /// The buffer is not a well-formed envelope.
    Bare,
    /// A well-formed envelope, split into borrowed section slices.
    Pair {
        catalog: &'a [u8],
        users: &'a [u8],
        publications: Option<&'a [u8]>,
        subscriptions: Option<&'a [u8]>,
        statistics: Option<&'a [u8]>,
    },
    /// The trailing CRC32 (`expected`) differs from the CRC32 computed
    /// over the preceding bytes (`computed`).
    CrcMismatch {
        expected: u32,
        computed: u32,
    },
}
340
341/// Returns `EnvelopeParse::Pair` for a valid v1 / v2 / v3 envelope,
342/// `Bare` for a buffer that doesn't look like an envelope (v3.x
343/// bare catalog fallback), and `CrcMismatch` for a v2/v3 envelope
344/// whose trailing CRC32 doesn't match the body.
345fn split_envelope(buf: &[u8]) -> EnvelopeParse<'_> {
346    if buf.len() < 8 + 1 + 4 || &buf[..8] != ENVELOPE_MAGIC {
347        return EnvelopeParse::Bare;
348    }
349    let version = buf[8];
350    if !matches!(
351        version,
352        ENVELOPE_VERSION_V1
353            | ENVELOPE_VERSION_V2
354            | ENVELOPE_VERSION_V3
355            | ENVELOPE_VERSION_V4
356            | ENVELOPE_VERSION_V5
357    ) {
358        return EnvelopeParse::Bare;
359    }
360    let mut p = 9usize;
361    let Some(cat_len_bytes) = buf.get(p..p + 4) else {
362        return EnvelopeParse::Bare;
363    };
364    let Ok(cat_len_arr) = cat_len_bytes.try_into() else {
365        return EnvelopeParse::Bare;
366    };
367    let cat_len = u32::from_le_bytes(cat_len_arr) as usize;
368    p += 4;
369    if p + cat_len + 4 > buf.len() {
370        return EnvelopeParse::Bare;
371    }
372    let catalog = &buf[p..p + cat_len];
373    p += cat_len;
374    let Some(user_len_bytes) = buf.get(p..p + 4) else {
375        return EnvelopeParse::Bare;
376    };
377    let Ok(user_len_arr) = user_len_bytes.try_into() else {
378        return EnvelopeParse::Bare;
379    };
380    let user_len = u32::from_le_bytes(user_len_arr) as usize;
381    p += 4;
382    if p + user_len > buf.len() {
383        return EnvelopeParse::Bare;
384    }
385    let users = &buf[p..p + user_len];
386    p += user_len;
387    let publications = if matches!(
388        version,
389        ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
390    ) {
391        // [u32 pubs_len][publications bytes]
392        let Some(pubs_len_bytes) = buf.get(p..p + 4) else {
393            return EnvelopeParse::Bare;
394        };
395        let Ok(pubs_len_arr) = pubs_len_bytes.try_into() else {
396            return EnvelopeParse::Bare;
397        };
398        let pubs_len = u32::from_le_bytes(pubs_len_arr) as usize;
399        p += 4;
400        if p + pubs_len > buf.len() {
401            return EnvelopeParse::Bare;
402        }
403        let pubs_slice = &buf[p..p + pubs_len];
404        p += pubs_len;
405        Some(pubs_slice)
406    } else {
407        None
408    };
409    let subscriptions = if matches!(version, ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5) {
410        // [u32 subs_len][subscriptions bytes]
411        let Some(subs_len_bytes) = buf.get(p..p + 4) else {
412            return EnvelopeParse::Bare;
413        };
414        let Ok(subs_len_arr) = subs_len_bytes.try_into() else {
415            return EnvelopeParse::Bare;
416        };
417        let subs_len = u32::from_le_bytes(subs_len_arr) as usize;
418        p += 4;
419        if p + subs_len > buf.len() {
420            return EnvelopeParse::Bare;
421        }
422        let subs_slice = &buf[p..p + subs_len];
423        p += subs_len;
424        Some(subs_slice)
425    } else {
426        None
427    };
428    let statistics = if version == ENVELOPE_VERSION_V5 {
429        // [u32 stats_len][statistics bytes]
430        let Some(stats_len_bytes) = buf.get(p..p + 4) else {
431            return EnvelopeParse::Bare;
432        };
433        let Ok(stats_len_arr) = stats_len_bytes.try_into() else {
434            return EnvelopeParse::Bare;
435        };
436        let stats_len = u32::from_le_bytes(stats_len_arr) as usize;
437        p += 4;
438        if p + stats_len > buf.len() {
439            return EnvelopeParse::Bare;
440        }
441        let stats_slice = &buf[p..p + stats_len];
442        p += stats_len;
443        Some(stats_slice)
444    } else {
445        None
446    };
447    if matches!(
448        version,
449        ENVELOPE_VERSION_V2 | ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
450    ) {
451        if p + 4 != buf.len() {
452            return EnvelopeParse::Bare;
453        }
454        let Ok(crc_arr) = buf[p..p + 4].try_into() else {
455            return EnvelopeParse::Bare;
456        };
457        let expected = u32::from_le_bytes(crc_arr);
458        let computed = spg_crypto::crc32::crc32(&buf[..p]);
459        if expected != computed {
460            return EnvelopeParse::CrcMismatch { expected, computed };
461        }
462    } else if p != buf.len() {
463        // v1: must end exactly at the users section.
464        return EnvelopeParse::Bare;
465    }
466    EnvelopeParse::Pair {
467        catalog,
468        users,
469        publications,
470        subscriptions,
471        statistics,
472    }
473}
474
/// v4.41.1 opaque transaction handle. Returned by `Engine::alloc_tx_id`,
/// threaded through `Engine::execute_in` so dispatch can identify which
/// in-flight TX a statement belongs to. `IMPLICIT_TX` is the reserved
/// slot every legacy caller — engine self-tests, spg-cli, spg-embedded,
/// startup replay — implicitly uses through the unchanged
/// `Engine::execute(sql)` API. v4.41.1 keeps at most one active slot at
/// runtime (dispatch holds `engine.write()` across the wrap, same as
/// v4.34); the map shape is here to let v4.42 turn on N in-flight
/// implicit TXs without reshuffling the engine internals.
///
/// Derives `Ord` so it can key the engine's `BTreeMap` of TX slots.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TxId(pub u64);
486
/// Reserved slot (id 0) used by `Engine::execute(sql)` — the legacy
/// single-global-shadow path. New `alloc_tx_id` handles start at 1.
pub const IMPLICIT_TX: TxId = TxId(0);

/// v6.7.3 — default segment-size threshold (4 MiB) used by `COMPACT
/// COLD SEGMENTS` when no explicit target is supplied. Segments whose
/// `OwnedSegment::bytes().len()` is **strictly** less than this
/// value are eligible to merge. spg-server reads
/// `SPG_COMPACTION_TARGET_SEGMENT_BYTES` to override.
pub const COMPACTION_TARGET_DEFAULT_BYTES: u64 = 4 * 1024 * 1024;
497
/// Per-slot transaction state. Held inside `tx_catalogs[tx_id]` for the
/// lifetime of a BEGIN..COMMIT (or BEGIN..ROLLBACK) window. The slot is
/// dropped when the TX commits (its `catalog` is moved over
/// `Engine.catalog`) or rolls back (slot removed, catalog discarded).
#[derive(Debug, Default, Clone)]
struct TxState {
    /// The TX's shadow copy of the catalog. Started as a clone of
    /// `Engine.catalog` at BEGIN time; writes flow into it; COMMIT
    /// installs it over `Engine.catalog`. `Catalog::clone()` is O(1)
    /// since v4.40 (`PersistentVec` rows + `PersistentBTreeMap` indices).
    catalog: Catalog,
    /// Per-TX savepoint stack. Each entry pairs the savepoint name with
    /// a clone of `catalog` at the moment `SAVEPOINT <name>` fired.
    /// `ROLLBACK TO <name>` restores from the entry and pops everything
    /// after it; `RELEASE <name>` discards the entry and everything
    /// after; COMMIT/ROLLBACK clears the whole stack.
    savepoints: Vec<(String, Catalog)>,
}
516
/// v7.11.0 — frozen read-only view of the engine's committed state.
/// Constructed via [`Engine::clone_snapshot`]. Holds clones of the
/// catalog, statistics, clock function, and row-cap config — the
/// four fields the `execute_readonly` path actually reads. Cheap to
/// `Clone` (each clone shares the underlying `PersistentVec` row
/// storage; only the trie root pointers copy). Send + Sync so a
/// snapshot can be moved across `tokio::task::spawn_blocking`
/// boundaries without coordination.
///
/// The contract: a snapshot reflects the engine's state at the
/// moment `clone_snapshot()` returned. Subsequent writes to the
/// engine are NOT visible. Callers who need fresher data take a
/// new snapshot.
#[derive(Debug, Clone)]
pub struct CatalogSnapshot {
    /// Clone of the catalog `Engine::clone_snapshot` saw.
    catalog: Catalog,
    /// Clone of the engine's statistics at snapshot time.
    statistics: statistics::Statistics,
    /// Copy of the engine's clock function, if one was installed.
    clock: Option<ClockFn>,
    /// Copy of the engine's per-query row cap (`None` = unlimited).
    max_query_rows: Option<usize>,
}
537
538#[derive(Debug, Default)]
539pub struct Engine {
540    /// Committed catalog — what survives `Engine::snapshot()` and what
541    /// outside-TX `SELECT`s read.
542    catalog: Catalog,
543    /// Active TX slots, keyed by `TxId`. Empty when no TX is in flight.
544    /// v4.41.1 runtime invariant: at most one entry (single-writer
545    /// model unchanged). v4.42 will let dispatch hold multiple entries
546    /// concurrently for group commit + engine MVCC.
547    tx_catalogs: BTreeMap<TxId, TxState>,
548    /// Which slot the next exec_* call should mutate. Set by
549    /// `execute_in(sql, tx_id)` at the entry point; legacy `execute(sql)`
550    /// sets it to `IMPLICIT_TX`. None when no TX is in flight (read /
551    /// write goes straight against `catalog`).
552    current_tx: Option<TxId>,
553    /// Monotonic counter for `alloc_tx_id`. Starts at 1 — slot 0 is
554    /// reserved for `IMPLICIT_TX`.
555    next_tx_id: u64,
556    /// Optional wall clock used to satisfy `NOW()` / `CURRENT_TIMESTAMP`
557    /// / `CURRENT_DATE`. Set by the host environment.
558    clock: Option<ClockFn>,
559    /// v4.1 cryptographic RNG for per-user password salt. Set by the
560    /// host. `None` means SQL-driven `CREATE USER` uses a
561    /// deterministic fallback — see `SaltFn`.
562    salt_fn: Option<SaltFn>,
563    /// v4.2 per-query row cap. `None` = unlimited. When set, a
564    /// SELECT that materialises more than `n` rows returns
565    /// `EngineError::RowLimitExceeded`. Enforced before the result
566    /// is shaped into wire frames so a runaway scan can't blow the
567    /// server's heap.
568    max_query_rows: Option<usize>,
569    /// v4.1 RBAC user table. Empty means "no RBAC configured yet" —
570    /// the server decides what that means at the auth boundary
571    /// (open mode vs legacy single-password mode). User CRUD goes
572    /// through `create_user`/`drop_user`/`verify_user`; persistence
573    /// rides the snapshot envelope alongside the catalog.
574    users: UserStore,
575    /// v6.1.2 logical-replication publication catalog. Empty until
576    /// `CREATE PUBLICATION` runs. Persistence rides the v3 envelope
577    /// trailer (see `build_envelope`).
578    publications: publications::Publications,
579    /// v6.1.4 logical-replication subscription catalog. Empty until
580    /// `CREATE SUBSCRIPTION` runs. Persistence rides the v4 envelope
581    /// trailer.
582    subscriptions: subscriptions::Subscriptions,
583    /// v6.2.0 — per-column statistics for the cost-based optimizer.
584    /// Populated by `ANALYZE`; queried via `spg_statistic` virtual
585    /// table. Persistence rides the v5 envelope trailer.
586    statistics: statistics::Statistics,
587    /// v6.3.0 — engine-level plan cache. Caches the post-`prepare()`
588    /// `Statement` keyed on SQL text. In-memory only — does NOT ride
589    /// the snapshot envelope (rebuilt on demand after restart).
590    plan_cache: plan_cache::PlanCache,
591    /// v6.5.1 — per-distinct-SQL execution stats. In-memory only,
592    /// surfaced via `spg_stat_query` virtual table. Updated by the
593    /// `execute_*` paths after a successful execute.
594    query_stats: query_stats::QueryStats,
595    /// v6.5.2 — connection-state provider callback. spg-server
596    /// registers a function at startup that snapshots its
597    /// per-pgwire-connection registry into `ActivityRow`s; engine
598    /// reads through it on every `SELECT * FROM spg_stat_activity`.
599    /// `None` ⇒ no-data (returns empty rows; matches the no_std
600    /// embedded callers that don't run pgwire).
601    activity_provider: Option<ActivityProvider>,
602    /// v6.5.3 — audit-chain provider + verifier. Same pattern as
603    /// activity_provider: spg-server registers both at startup;
604    /// engine reads through on `SELECT * FROM spg_audit_chain` and
605    /// `SELECT * FROM spg_audit_verify`. `None` ⇒ no-data.
606    audit_chain_provider: Option<AuditChainProvider>,
607    audit_verifier: Option<AuditVerifier>,
608    /// v6.5.6 — slow-query log threshold in microseconds. When set,
609    /// every successful execute whose elapsed exceeds the threshold
610    /// gets fed to the registered slow-query log callback (so
611    /// spg-server can emit a structured log line). Default `None`
612    /// = no slow-query logging.
613    slow_query_threshold_us: Option<u64>,
614    slow_query_logger: Option<SlowQueryLogger>,
615    /// v7.12.1 — session parameters set via `SET <name> = <value>`.
616    /// Only `default_text_search_config` is consumed by the engine
617    /// today (the FTS function dispatcher reads it when
618    /// `to_tsvector(text)` is called without an explicit config).
619    /// All other names are accepted + recorded so PG-dump output
620    /// loads, but have no behavioural effect.
621    session_params: BTreeMap<String, String>,
622    /// v7.12.7 — depth counter for trigger-emitted embedded SQL.
623    /// Each time the engine executes a `DeferredEmbeddedStmt` it
624    /// increments this; the recursive `execute_stmt_with_cancel`
625    /// inside that path checks against [`MAX_TRIGGER_RECURSION`]
626    /// to bound runaway cascades (trigger A's UPDATE on table B
627    /// fires trigger B which UPDATEs table A which fires trigger
628    /// A again…). Reset to 0 once the original DML returns.
629    trigger_recursion_depth: u32,
630}
631
/// v7.12.7 — hard cap on nested trigger-emitted embedded SQL
/// fires. 16 deep is well past anything a normal trigger graph
/// uses while still preventing infinite-loop wedging. Compared
/// against `Engine::trigger_recursion_depth`.
const MAX_TRIGGER_RECURSION: u32 = 16;

/// v6.5.6 — callback signature for slow-query log emission. Called
/// with `(sql, elapsed_us)` once per successful execute that crosses
/// the threshold (`Engine::slow_query_threshold_us`, in microseconds).
pub type SlowQueryLogger = fn(&str, u64);
641
642/// v6.5.4 — synthesise a `CREATE TABLE` statement from catalog
643/// state. Round-trips through `Engine::execute` to recreate the
644/// same schema (sans data + indexes — indexes are emitted as a
645/// separate `CREATE INDEX` chain in `spg_database_ddl`).
646fn render_create_table(name: &str, columns: &[ColumnSchema]) -> String {
647    let mut out = alloc::format!("CREATE TABLE {name} (");
648    for (i, col) in columns.iter().enumerate() {
649        if i > 0 {
650            out.push_str(", ");
651        }
652        out.push_str(&col.name);
653        out.push(' ');
654        out.push_str(&render_data_type(col.ty));
655        if !col.nullable {
656            out.push_str(" NOT NULL");
657        }
658        if col.auto_increment {
659            out.push_str(" AUTO_INCREMENT");
660        }
661    }
662    out.push(')');
663    out
664}
665
666fn render_data_type(ty: DataType) -> String {
667    match ty {
668        DataType::SmallInt => "SMALLINT".into(),
669        DataType::Int => "INT".into(),
670        DataType::BigInt => "BIGINT".into(),
671        DataType::Float => "FLOAT".into(),
672        DataType::Text => "TEXT".into(),
673        DataType::Varchar(n) => alloc::format!("VARCHAR({n})"),
674        DataType::Char(n) => alloc::format!("CHAR({n})"),
675        DataType::Bool => "BOOL".into(),
676        DataType::Vector { dim, encoding } => match encoding {
677            spg_storage::VecEncoding::F32 => alloc::format!("VECTOR({dim})"),
678            spg_storage::VecEncoding::Sq8 => alloc::format!("VECTOR({dim}) USING SQ8"),
679            spg_storage::VecEncoding::F16 => alloc::format!("VECTOR({dim}) USING HALF"),
680        },
681        DataType::Numeric { precision, scale } => {
682            alloc::format!("NUMERIC({precision},{scale})")
683        }
684        DataType::Date => "DATE".into(),
685        DataType::Timestamp => "TIMESTAMP".into(),
686        DataType::Interval => "INTERVAL".into(),
687        DataType::Json => "JSON".into(),
688        DataType::Jsonb => "JSONB".into(),
689        DataType::Timestamptz => "TIMESTAMPTZ".into(),
690        DataType::Bytes => "BYTEA".into(),
691        DataType::TextArray => "TEXT[]".into(),
692        DataType::IntArray => "INT[]".into(),
693        DataType::BigIntArray => "BIGINT[]".into(),
694        DataType::TsVector => "TSVECTOR".into(),
695        DataType::TsQuery => "TSQUERY".into(),
696    }
697}
698
/// v6.5.2 — one row of `spg_stat_activity`. Engine-public so
/// spg-server can construct rows without re-exporting internal
/// dispatch types.
// NOTE(review): field meanings are not documented in this file; the `_us`
// suffix presumably means microseconds (as with `ClockFn`) — confirm against
// the spg-server provider that builds these rows.
#[derive(Debug, Clone)]
pub struct ActivityRow {
    pub pid: u32,
    pub user: String,
    pub started_at_us: i64,
    pub current_sql: String,
    pub wait_event: String,
    pub elapsed_us: i64,
    pub in_transaction: bool,
}
712
/// v6.5.2 — provider callback type for `spg_stat_activity`. Fresh
/// snapshot returned each call; engine doesn't cache the returned rows.
pub type ActivityProvider = fn() -> Vec<ActivityRow>;
716
/// v6.5.3 — one row of `spg_audit_chain`. Engine-public so
/// spg-server can construct rows directly from `AuditEntry`.
// NOTE(review): `ts_ms` is presumably a millisecond timestamp and the
// `*_hash_hex` fields hex-encoded hashes, judging by the names only —
// confirm against spg-server's `AuditEntry`.
#[derive(Debug, Clone)]
pub struct AuditRow {
    pub seq: i64,
    pub ts_ms: i64,
    pub prev_hash_hex: String,
    pub entry_hash_hex: String,
    pub sql: String,
}
727
/// v6.5.3 — chain-table provider. spg-server registers a fn pointer
/// that snapshots the audit log into `AuditRow`s.
pub type AuditChainProvider = fn() -> Vec<AuditRow>;
/// v6.5.3 — chain verifier. spg-server registers a fn pointer that
/// verifies the audit log and returns `(verified_count, broken_at_seq)`
/// — `broken_at_seq` is `-1` on a clean chain.
pub type AuditVerifier = fn() -> (i64, i64);
734
735impl Engine {
    /// Creates an empty engine: a fresh catalog, no transaction in
    /// flight, no host callbacks installed (clock, salt, activity, audit,
    /// slow-query logger all `None`), no row cap, and empty user /
    /// publication / subscription / statistics / cache / session state.
    pub fn new() -> Self {
        Self {
            catalog: Catalog::new(),
            tx_catalogs: BTreeMap::new(),
            current_tx: None,
            // Slot 0 is reserved for `IMPLICIT_TX`, so allocation starts at 1.
            next_tx_id: 1,
            clock: None,
            salt_fn: None,
            max_query_rows: None,
            users: UserStore::new(),
            publications: publications::Publications::new(),
            subscriptions: subscriptions::Subscriptions::new(),
            statistics: statistics::Statistics::new(),
            plan_cache: plan_cache::PlanCache::new(),
            query_stats: query_stats::QueryStats::new(),
            activity_provider: None,
            audit_chain_provider: None,
            audit_verifier: None,
            slow_query_threshold_us: None,
            slow_query_logger: None,
            session_params: BTreeMap::new(),
            trigger_recursion_depth: 0,
        }
    }
760
761    /// v7.11.0 — clone the engine's committed catalog + read-time
762    /// state into a frozen `CatalogSnapshot`. Cheap (`Catalog` is
763    /// backed by `PersistentVec`; cloning is O(log n) per table).
764    /// Subsequent writes to this engine are invisible to the
765    /// snapshot; the snapshot is self-contained and can be moved
766    /// to another thread for concurrent `execute_readonly_on_snapshot`
767    /// calls. The basis for [`AsyncReadHandle`] in spg-embedded-tokio
768    /// and any other read-fanout pattern.
769    #[must_use]
770    pub fn clone_snapshot(&self) -> CatalogSnapshot {
771        CatalogSnapshot {
772            catalog: self.active_catalog().clone(),
773            statistics: self.statistics.clone(),
774            clock: self.clock,
775            max_query_rows: self.max_query_rows,
776        }
777    }
778
779    /// v7.11.1 — execute a read-only SQL statement against a
780    /// `CatalogSnapshot` without touching this engine. Same
781    /// semantics as `execute_readonly` but parameterised on the
782    /// snapshot's catalog. Reject DDL/DML the same way
783    /// `execute_readonly` does. Static-on-Self so the caller can
784    /// dispatch without holding an `Engine` borrow alongside the
785    /// snapshot.
786    pub fn execute_readonly_on_snapshot(
787        snapshot: &CatalogSnapshot,
788        sql: &str,
789    ) -> Result<QueryResult, EngineError> {
790        Self::execute_readonly_on_snapshot_with_cancel(snapshot, sql, CancelToken::none())
791    }
792
793    /// v7.11.1 — `execute_readonly_on_snapshot` with cooperative
794    /// cancellation. Builds a transient `Engine` over the snapshot
795    /// state, runs `execute_readonly_with_cancel`, drops. The
796    /// transient engine is cheap to construct (no I/O; everything
797    /// is just struct moves) and lets the existing read path stay
798    /// untouched.
799    pub fn execute_readonly_on_snapshot_with_cancel(
800        snapshot: &CatalogSnapshot,
801        sql: &str,
802        cancel: CancelToken<'_>,
803    ) -> Result<QueryResult, EngineError> {
804        let transient = Engine {
805            catalog: snapshot.catalog.clone(),
806            statistics: snapshot.statistics.clone(),
807            clock: snapshot.clock,
808            max_query_rows: snapshot.max_query_rows,
809            ..Engine::default()
810        };
811        transient.execute_readonly_with_cancel(sql, cancel)
812    }
813
814    /// Construct an engine restored from a previously-snapshotted catalog
815    /// (see `snapshot()`).
816    pub fn restore(catalog: Catalog) -> Self {
817        Self {
818            catalog,
819            tx_catalogs: BTreeMap::new(),
820            current_tx: None,
821            next_tx_id: 1,
822            clock: None,
823            salt_fn: None,
824            max_query_rows: None,
825            users: UserStore::new(),
826            publications: publications::Publications::new(),
827            subscriptions: subscriptions::Subscriptions::new(),
828            statistics: statistics::Statistics::new(),
829            plan_cache: plan_cache::PlanCache::new(),
830            query_stats: query_stats::QueryStats::new(),
831            activity_provider: None,
832            audit_chain_provider: None,
833            audit_verifier: None,
834            slow_query_threshold_us: None,
835            slow_query_logger: None,
836            session_params: BTreeMap::new(),
837            trigger_recursion_depth: 0,
838        }
839    }
840
841    /// Restore an engine + user table from a v4.1 envelope produced
842    /// by `snapshot_with_users()`. Falls back to plain catalog-only
843    /// restore if the envelope magic isn't present (so v3.x snapshot
844    /// files still load). v6.1.2 adds the optional publications
845    /// trailer (envelope v3); a v1/v2 envelope deserialises to an
846    /// empty publication table.
847    pub fn restore_envelope(buf: &[u8]) -> Result<Self, EngineError> {
848        match split_envelope(buf) {
849            EnvelopeParse::Pair {
850                catalog: catalog_bytes,
851                users: user_bytes,
852                publications: pub_bytes,
853                subscriptions: sub_bytes,
854                statistics: stats_bytes,
855            } => {
856                let catalog = Catalog::deserialize(catalog_bytes).map_err(EngineError::Storage)?;
857                let users = users::deserialize_users(user_bytes)
858                    .map_err(|e| EngineError::Unsupported(alloc::format!("users restore: {e}")))?;
859                let publications = match pub_bytes {
860                    Some(b) => publications::Publications::deserialize(b).map_err(|e| {
861                        EngineError::Unsupported(alloc::format!("publications restore: {e:?}"))
862                    })?,
863                    None => publications::Publications::new(),
864                };
865                let subscriptions = match sub_bytes {
866                    Some(b) => subscriptions::Subscriptions::deserialize(b).map_err(|e| {
867                        EngineError::Unsupported(alloc::format!("subscriptions restore: {e:?}"))
868                    })?,
869                    None => subscriptions::Subscriptions::new(),
870                };
871                let statistics = match stats_bytes {
872                    Some(b) => statistics::Statistics::deserialize(b).map_err(|e| {
873                        EngineError::Unsupported(alloc::format!("statistics restore: {e:?}"))
874                    })?,
875                    None => statistics::Statistics::new(),
876                };
877                Ok(Self {
878                    catalog,
879                    tx_catalogs: BTreeMap::new(),
880                    current_tx: None,
881                    next_tx_id: 1,
882                    clock: None,
883                    salt_fn: None,
884                    max_query_rows: None,
885                    users,
886                    publications,
887                    subscriptions,
888                    statistics,
889                    plan_cache: plan_cache::PlanCache::new(),
890                    query_stats: query_stats::QueryStats::new(),
891                    activity_provider: None,
892                    audit_chain_provider: None,
893                    audit_verifier: None,
894                    slow_query_threshold_us: None,
895                    slow_query_logger: None,
896                    session_params: BTreeMap::new(),
897                    trigger_recursion_depth: 0,
898                })
899            }
900            EnvelopeParse::CrcMismatch { expected, computed } => {
901                Err(EngineError::Storage(StorageError::Corrupt(alloc::format!(
902                    "snapshot envelope CRC32 mismatch (expected={expected:#010x}, computed={computed:#010x})"
903                ))))
904            }
905            EnvelopeParse::Bare => {
906                let catalog = Catalog::deserialize(buf).map_err(EngineError::Storage)?;
907                Ok(Self::restore(catalog))
908            }
909        }
910    }
911
912    pub const fn users(&self) -> &UserStore {
913        &self.users
914    }
915
916    /// `salt` is supplied by the caller (the host has a random
917    /// source; the engine is `no_std`). Caller should pass a fresh
918    /// 16-byte random value per user.
919    pub fn create_user(
920        &mut self,
921        name: &str,
922        password: &str,
923        role: Role,
924        salt: [u8; 16],
925    ) -> Result<(), UserError> {
926        self.users.create(name, password, role, salt)?;
927        // v4.8: also derive SCRAM-SHA-256 secrets so PG-wire SASL
928        // auth can verify without re-running PBKDF2 per attempt.
929        // Uses a fresh salt from the host RNG (falls back to a
930        // deterministic per-username salt when no RNG is wired, same
931        // as the legacy hash path).
932        let scram_salt = self.salt_fn.map_or_else(
933            || {
934                let mut s = [0u8; users::SCRAM_SALT_LEN];
935                let digest = spg_crypto::hash(name.as_bytes());
936                // Use bytes 16..32 of BLAKE3 so we don't reuse the
937                // exact same fallback salt as the BLAKE3 hash path.
938                s.copy_from_slice(&digest[16..32]);
939                s
940            },
941            |f| f(),
942        );
943        self.users
944            .enable_scram(name, password, scram_salt, users::SCRAM_DEFAULT_ITERS)?;
945        Ok(())
946    }
947
948    pub fn drop_user(&mut self, name: &str) -> Result<(), UserError> {
949        self.users.drop(name)
950    }
951
952    pub fn verify_user(&self, name: &str, password: &str) -> Option<Role> {
953        self.users.verify(name, password)
954    }
955
956    /// Builder: attach a wall clock so `NOW()` / `CURRENT_TIMESTAMP` /
957    /// `CURRENT_DATE` evaluate to a real value instead of erroring out.
958    #[must_use]
959    pub const fn with_clock(mut self, clock: ClockFn) -> Self {
960        self.clock = Some(clock);
961        self
962    }
963
964    /// Builder: attach an OS-backed RNG for per-user password salts.
965    /// The host (`spg-server`) typically wires this to `/dev/urandom`.
966    #[must_use]
967    pub const fn with_salt_fn(mut self, f: SaltFn) -> Self {
968        self.salt_fn = Some(f);
969        self
970    }
971
972    /// Builder: cap the number of rows a single SELECT may return.
973    /// Exceeding the cap raises `EngineError::RowLimitExceeded` —
974    /// the bound is checked inside the executor so a runaway
975    /// catalog scan can't allocate millions of rows before the
976    /// server gets a chance to reject the result.
977    #[must_use]
978    pub const fn with_max_query_rows(mut self, n: usize) -> Self {
979        self.max_query_rows = Some(n);
980        self
981    }
982
983    /// The *committed* catalog. Note: during a transaction this returns the
984    /// pre-TX state — `SELECT` inside a TX goes through `execute()` and reads
985    /// the shadow. Tests that inspect outside-TX state should use this.
986    pub const fn catalog(&self) -> &Catalog {
987        &self.catalog
988    }
989
990    /// Serialize the *committed* catalog to bytes. v0.6 was full-snapshot; v0.9
991    /// adds the rule that an open TX's shadow is never snapshotted — only the
992    /// post-COMMIT state is persisted. v4.1 wraps the catalog in an envelope
993    /// when there are users to persist; an empty user table snapshots as the
994    /// bare catalog format (backwards-compat with v3.x readers). v6.1.2
995    /// adds publications to the envelope condition: either non-empty
996    /// users OR non-empty publications now triggers the envelope path.
997    pub fn snapshot(&self) -> Vec<u8> {
998        if self.users.is_empty()
999            && self.publications.is_empty()
1000            && self.subscriptions.is_empty()
1001            && self.statistics.is_empty()
1002        {
1003            self.catalog.serialize()
1004        } else {
1005            build_envelope(
1006                &self.catalog.serialize(),
1007                &users::serialize_users(&self.users),
1008                &self.publications.serialize(),
1009                &self.subscriptions.serialize(),
1010                &self.statistics.serialize(),
1011            )
1012        }
1013    }
1014
1015    /// True when at least one TX slot is in flight. v4.41.1 runtime
1016    /// invariant: at most one slot active at a time (dispatch holds
1017    /// `engine.write()` across the entire wrap). v4.42 will let this
1018    /// return true with multiple slots concurrently.
1019    pub fn in_transaction(&self) -> bool {
1020        !self.tx_catalogs.is_empty()
1021    }
1022
1023    /// v4.41.1 allocate a fresh TX handle. Used by spg-server dispatch
1024    /// to scope each implicit-wrap BEGIN..stmt..COMMIT to its own slot
1025    /// in `tx_catalogs`. v4.42 — the commit-barrier leader allocates
1026    /// one of these per task in its group, runs `BEGIN`+sql+`COMMIT`
1027    /// sequentially under a single `engine.write()` so each task's
1028    /// mutations accumulate into shared state, then either keeps the
1029    /// accumulated state (fsync OK) or restores the pre-image via
1030    /// `replace_catalog` (fsync err).
1031    pub fn alloc_tx_id(&mut self) -> TxId {
1032        let id = TxId(self.next_tx_id);
1033        self.next_tx_id = self.next_tx_id.saturating_add(1);
1034        id
1035    }
1036
1037    /// v4.42 — atomically replace the live catalog. Used by the
1038    /// commit-barrier leader to roll back a group whose batched
1039    /// fsync failed: the leader snapshots `engine.catalog().clone()`
1040    /// (O(1) Arc bump after the v4.39/v4.40 persistent migration)
1041    /// at group start, sequentially applies each task's BEGIN+sql+
1042    /// COMMIT under the same write lock to accumulate mutations
1043    /// into shared state, batches the WAL bytes, fsyncs once, and
1044    /// on failure calls this with the pre-image to undo every
1045    /// task in the group at once.
1046    ///
1047    /// **Does NOT touch `tx_catalogs` / `current_tx`.** Any
1048    /// explicit-TX slot from a concurrent client (created via the
1049    /// legacy `IMPLICIT_TX`-less dispatch path or via the future
1050    /// MVCC-readers v5+ work) has its own snapshot baked into the
1051    /// slot — restoring `self.catalog` to the pre-image leaves
1052    /// those slots untouched, exactly as they were when the leader
1053    /// took the lock. The leader's own implicit-TX slots are all
1054    /// already discarded (`exec_commit` removed them as each
1055    /// task's COMMIT ran) by the time this is reached.
1056    pub fn replace_catalog(&mut self, catalog: Catalog) {
1057        self.catalog = catalog;
1058    }
1059
1060    /// v6.7.0 — public shim around `Catalog::freeze_oldest_to_cold`
1061    /// so tests + the spg-server freezer can drive a freeze without
1062    /// reaching into the private `active_catalog_mut`. v6.7.4
1063    /// parallel freezer will build on this surface.
1064    ///
1065    /// Marks the table's cached `cold_row_count` stale because the
1066    /// freeze added cold locators that ANALYZE hasn't yet refreshed.
1067    pub fn freeze_oldest_to_cold(
1068        &mut self,
1069        table_name: &str,
1070        index_name: &str,
1071        max_rows: usize,
1072    ) -> Result<spg_storage::FreezeReport, EngineError> {
1073        let report = self
1074            .active_catalog_mut()
1075            .freeze_oldest_to_cold(table_name, index_name, max_rows)
1076            .map_err(EngineError::Storage)?;
1077        if let Some(t) = self.active_catalog_mut().get_mut(table_name) {
1078            t.mark_cold_row_count_stale();
1079        }
1080        Ok(report)
1081    }
1082
1083    /// v6.7.5 — public shim used by the spg-server follower's
1084    /// segment-forwarding receiver. Registers a cold-tier segment
1085    /// at a specific id (the master's id, as transmitted on the
1086    /// wire) so the follower's BTree-Cold locators stay byte-
1087    /// identical with the master's. Wraps
1088    /// `Catalog::load_segment_bytes_at` under the standard
1089    /// clone-mutate-replace pattern.
1090    ///
1091    /// Returns `Ok(())` on success **and** on the "slot already
1092    /// occupied" case — a follower mid-reconnect may receive a
1093    /// segment chunk for a segment_id it already has on disk
1094    /// (forwarded last session); the caller should treat that
1095    /// path as a no-op rather than a fatal error.
1096    pub fn receive_cold_segment(
1097        &mut self,
1098        segment_id: u32,
1099        bytes: Vec<u8>,
1100    ) -> Result<(), EngineError> {
1101        let mut new_cat = self.catalog.clone();
1102        match new_cat.load_segment_bytes_at(segment_id, bytes) {
1103            Ok(()) => {
1104                self.replace_catalog(new_cat);
1105                Ok(())
1106            }
1107            Err(StorageError::Corrupt(msg)) if msg.contains("already occupied") => Ok(()),
1108            Err(e) => Err(EngineError::Storage(e)),
1109        }
1110    }
1111
1112    /// v6.7.3 — public shim around `Catalog::compact_cold_segments`
1113    /// driving every BTree index on every user table. Returns one
1114    /// `(table, index, report)` triple for each merge that
1115    /// actually happened (no-op (table, index) pairs are filtered
1116    /// out so callers can size persist-side work to the live
1117    /// merges). Caller is responsible for persisting each
1118    /// `report.merged_segment_bytes` and updating the on-disk
1119    /// segment registry; engine layer is no_std and never
1120    /// touches disk.
1121    ///
1122    /// Marks every touched table's cached `cold_row_count` stale
1123    /// — compaction GC'd some shadowed rows, so the count must be
1124    /// re-derived on the next ANALYZE.
1125    pub fn compact_cold_segments_with_target(
1126        &mut self,
1127        target_segment_bytes: u64,
1128    ) -> Result<Vec<(String, String, CompactReport)>, EngineError> {
1129        let table_names = self.active_catalog().table_names();
1130        let mut reports: Vec<(String, String, CompactReport)> = Vec::new();
1131        for tname in table_names {
1132            if is_internal_table_name(&tname) {
1133                continue;
1134            }
1135            let idx_names: Vec<String> = {
1136                let Some(t) = self.active_catalog().get(&tname) else {
1137                    continue;
1138                };
1139                t.indices()
1140                    .iter()
1141                    .filter(|i| matches!(i.kind, IndexKind::BTree(_)))
1142                    .map(|i| i.name.clone())
1143                    .collect()
1144            };
1145            for iname in idx_names {
1146                let report = self
1147                    .active_catalog_mut()
1148                    .compact_cold_segments(&tname, &iname, target_segment_bytes)
1149                    .map_err(EngineError::Storage)?;
1150                if report.merged_segment_id.is_some() {
1151                    if let Some(t) = self.active_catalog_mut().get_mut(&tname) {
1152                        t.mark_cold_row_count_stale();
1153                    }
1154                    reports.push((tname.clone(), iname, report));
1155                }
1156            }
1157        }
1158        Ok(reports)
1159    }
1160
1161    fn active_catalog(&self) -> &Catalog {
1162        match self.current_tx {
1163            Some(t) => self
1164                .tx_catalogs
1165                .get(&t)
1166                .map_or(&self.catalog, |s| &s.catalog),
1167            None => &self.catalog,
1168        }
1169    }
1170
1171    /// v7.12.4 — snapshot every row-level trigger on `table` that
1172    /// fires for `event` (`"INSERT"` / `"UPDATE"` / `"DELETE"`) at
1173    /// the given `timing` (`"BEFORE"` / `"AFTER"`), and clone its
1174    /// referenced function definition. Returned as a vec of owned
1175    /// `FunctionDef` so the row-write loop can fire them without
1176    /// holding a borrow on the catalog (which would conflict with
1177    /// the table.insert / update_row / delete mutable borrows).
1178    fn snapshot_row_triggers(
1179        &self,
1180        table: &str,
1181        event: &str,
1182        timing: &str,
1183    ) -> Vec<spg_storage::FunctionDef> {
1184        let cat = self.active_catalog();
1185        cat.triggers()
1186            .iter()
1187            .filter(|t| {
1188                t.table == table
1189                    && t.timing.eq_ignore_ascii_case(timing)
1190                    && t.for_each.eq_ignore_ascii_case("row")
1191                    && t.events.iter().any(|e| e.eq_ignore_ascii_case(event))
1192            })
1193            .filter_map(|t| cat.functions().get(&t.function).cloned())
1194            .collect()
1195    }
1196
1197    /// v7.12.7 — drain the trigger-emitted embedded SQL queue.
1198    /// Called by the INSERT / UPDATE / DELETE executors after
1199    /// their main row-write loop returns. Each statement runs
1200    /// inside the same cancel scope as the firing DML and bumps
1201    /// the recursion counter; nested embedded SQL beyond
1202    /// [`MAX_TRIGGER_RECURSION`] errors with a clear message so
1203    /// a trigger-graph cycle surfaces as a query failure instead
1204    /// of stack-blowing the engine.
1205    fn execute_deferred_trigger_stmts(
1206        &mut self,
1207        deferred: Vec<triggers::DeferredEmbeddedStmt>,
1208        cancel: CancelToken<'_>,
1209    ) -> Result<(), EngineError> {
1210        for d in deferred {
1211            if self.trigger_recursion_depth >= MAX_TRIGGER_RECURSION {
1212                return Err(EngineError::Storage(StorageError::Corrupt(alloc::format!(
1213                    "trigger embedded SQL recursion depth {} exceeded (trigger function \
1214                     {:?} would push past the {} cap — check for trigger cycles)",
1215                    self.trigger_recursion_depth,
1216                    d.function,
1217                    MAX_TRIGGER_RECURSION,
1218                ))));
1219            }
1220            self.trigger_recursion_depth += 1;
1221            let res = self.execute_stmt_with_cancel(d.stmt, cancel);
1222            self.trigger_recursion_depth -= 1;
1223            res?;
1224        }
1225        Ok(())
1226    }
1227
1228    fn active_catalog_mut(&mut self) -> &mut Catalog {
1229        let tx = self.current_tx;
1230        match tx {
1231            Some(t) => match self.tx_catalogs.get_mut(&t) {
1232                Some(s) => &mut s.catalog,
1233                None => &mut self.catalog,
1234            },
1235            None => &mut self.catalog,
1236        }
1237    }
1238
1239    /// Read-only execute path. Succeeds for `SELECT` / `SHOW TABLES`
1240    /// / `SHOW COLUMNS`; returns `EngineError::WriteRequired` for
1241    /// every other statement, so the caller can fall through to the
1242    /// `&mut self` `execute` path under a write lock. Engine state is
1243    /// not mutated even on the success path (`rewrite_clock_calls`
1244    /// and `resolve_order_by_position` both mutate the locally-owned
1245    /// AST, not `self`).
1246    ///
1247    /// **v4.0 concurrency**: this is the entry point the server takes
1248    /// under an `RwLock::read()` so multiple `SELECT` clients run in
1249    /// parallel without serialising on a single mutex.
1250    pub fn execute_readonly(&self, sql: &str) -> Result<QueryResult, EngineError> {
1251        self.execute_readonly_with_cancel(sql, CancelToken::none())
1252    }
1253
1254    /// v4.5 — read path with cooperative cancellation. Token's
1255    /// `is_cancelled` is checked at the start (so a watchdog that
1256    /// already fired returns Cancelled immediately) and at row-loop
1257    /// checkpoints inside `exec_select`. SHOW paths are O(small) and
1258    /// don't bother checking.
1259    pub fn execute_readonly_with_cancel(
1260        &self,
1261        sql: &str,
1262        cancel: CancelToken<'_>,
1263    ) -> Result<QueryResult, EngineError> {
1264        cancel.check()?;
1265        let mut stmt = parser::parse_statement(sql)?;
1266        let now_micros = self.clock.map(|f| f());
1267        rewrite_clock_calls(&mut stmt, now_micros);
1268        if let Statement::Select(s) = &mut stmt {
1269            resolve_order_by_position(s);
1270            // v6.2.3 — cost-based JOIN reorder (read path).
1271            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1272        }
1273        let result = match stmt {
1274            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1275            Statement::ShowTables => Ok(self.exec_show_tables()),
1276            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1277            Statement::ShowUsers => Ok(self.exec_show_users()),
1278            Statement::ShowPublications => Ok(self.exec_show_publications()),
1279            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1280            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1281                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1282            )),
1283            Statement::Explain(e) => self.exec_explain(&e, cancel),
1284            _ => Err(EngineError::WriteRequired),
1285        };
1286        self.enforce_row_limit(result)
1287    }
1288
1289    /// v4.2: cap result-set size. Applied after the executor
1290    /// materialises rows but before they leave the engine — wrapping
1291    /// every Rows-returning exec_* function would scatter the check.
1292    fn enforce_row_limit(
1293        &self,
1294        result: Result<QueryResult, EngineError>,
1295    ) -> Result<QueryResult, EngineError> {
1296        if let (Ok(QueryResult::Rows { rows, .. }), Some(cap)) = (&result, self.max_query_rows)
1297            && rows.len() > cap
1298        {
1299            return Err(EngineError::RowLimitExceeded(cap));
1300        }
1301        result
1302    }
1303
1304    pub fn execute(&mut self, sql: &str) -> Result<QueryResult, EngineError> {
1305        self.execute_in_with_cancel(sql, IMPLICIT_TX, CancelToken::none())
1306    }
1307
1308    /// v4.5 — write path with cooperative cancellation. Same dispatch
1309    /// as `execute_in_with_cancel(sql, IMPLICIT_TX, cancel)`. Kept as
1310    /// a separate entry point for backward-compat with the v4.5
1311    /// public API.
1312    pub fn execute_with_cancel(
1313        &mut self,
1314        sql: &str,
1315        cancel: CancelToken<'_>,
1316    ) -> Result<QueryResult, EngineError> {
1317        self.execute_in_with_cancel(sql, IMPLICIT_TX, cancel)
1318    }
1319
1320    /// v4.41.1 multi-slot write entry. Routes `sql` through the TX
1321    /// slot identified by `tx_id` so spg-server dispatch can scope
1322    /// each implicit-wrap BEGIN..stmt..COMMIT to its own slot in
1323    /// `tx_catalogs`. `IMPLICIT_TX` is the legacy single-slot path
1324    /// every other caller (engine self-tests, replay, spg-embedded)
1325    /// implicitly takes via `execute()` / `execute_with_cancel()`.
1326    pub fn execute_in(&mut self, sql: &str, tx_id: TxId) -> Result<QueryResult, EngineError> {
1327        self.execute_in_with_cancel(sql, tx_id, CancelToken::none())
1328    }
1329
1330    /// v4.41.1 write path with cooperative cancellation + explicit TX
1331    /// scope. Sets `self.current_tx` for the duration of the call so
1332    /// every `exec_*` helper transparently sees its TX's shadow
1333    /// catalog and savepoint stack; restores on exit so the field is
1334    /// only valid mid-call (no leakage across calls).
1335    pub fn execute_in_with_cancel(
1336        &mut self,
1337        sql: &str,
1338        tx_id: TxId,
1339        cancel: CancelToken<'_>,
1340    ) -> Result<QueryResult, EngineError> {
1341        let saved = self.current_tx;
1342        self.current_tx = Some(tx_id);
1343        let result = self.execute_inner_with_cancel(sql, cancel);
1344        self.current_tx = saved;
1345        result
1346    }
1347
1348    /// v6.1.1 — parse and pre-process a SQL string ONCE so the
1349    /// resulting [`Statement`] can be cached and re-executed via
1350    /// [`Engine::execute_prepared`]. Returns the same `Statement`
1351    /// the simple-query path would synthesise internally (clock
1352    /// rewrites + ORDER BY position-ref resolution applied at
1353    /// prepare time, since both are session-independent). The
1354    /// `$N` placeholders in the SQL stay as `Expr::Placeholder(n)`
1355    /// nodes; they're resolved to concrete values per-call by
1356    /// `execute_prepared`'s substitution walk.
1357    ///
1358    /// Pgwire's `Parse` (P) message lands here.
1359    pub fn prepare(&self, sql: &str) -> Result<Statement, ParseError> {
1360        let mut stmt = parser::parse_statement(sql)?;
1361        let now_micros = self.clock.map(|f| f());
1362        rewrite_clock_calls(&mut stmt, now_micros);
1363        if let Statement::Select(s) = &mut stmt {
1364            // v6.4.1 — expand `GROUP BY ALL` to every non-aggregate
1365            // SELECT-list item BEFORE position / alias resolution so
1366            // downstream passes see the explicit list.
1367            expand_group_by_all(s);
1368            resolve_order_by_position(s);
1369            // v6.2.3 — cost-based JOIN reorder. No-op for
1370            // single-table FROMs or any non-INNER join shape.
1371            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1372        }
1373        Ok(stmt)
1374    }
1375
1376    /// v6.3.0 — cached prepare. Returns a cloned `Statement` from
1377    /// the plan cache on hit, runs the full `prepare()` path on miss
1378    /// and inserts the resulting plan before returning. Skipping the
1379    /// parse + JOIN-reorder pipeline on hit is the dominant win for
1380    /// JDBC / sqlx / pgx clients that reuse the same SQL string.
1381    ///
1382    /// Returns a cloned `Statement` (not a borrow) because the
1383    /// pgwire layer owns its `PreparedStmt` map per-session and the
1384    /// engine-level cache must stay available for other sessions.
1385    /// Clone cost on a 5-table JOIN AST is well under the parse cost
1386    /// it replaces.
1387    pub fn prepare_cached(&mut self, sql: &str) -> Result<Statement, ParseError> {
1388        // v6.3.1 — version-aware lookup. If the cached plan was
1389        // prepared before the most recent ANALYZE, evict and replan.
1390        let current_version = self.statistics.version();
1391        if let Some(plan) = self.plan_cache.get(sql) {
1392            if plan.statistics_version == current_version {
1393                return Ok(plan.stmt.clone());
1394            }
1395            // Stale entry — fall through to evict + re-prepare.
1396        }
1397        self.plan_cache.evict(sql);
1398        let stmt = self.prepare(sql)?;
1399        let source_tables = plan_cache::collect_source_tables(&stmt);
1400        let plan = plan_cache::PreparedPlan {
1401            stmt: stmt.clone(),
1402            statistics_version: current_version,
1403            source_tables,
1404            describe_columns: alloc::vec::Vec::new(),
1405        };
1406        self.plan_cache.insert(String::from(sql), plan);
1407        Ok(stmt)
1408    }
1409
1410    /// v6.3.0 — read-only accessor for tests and v6.3.1 invalidation.
1411    pub fn plan_cache(&self) -> &plan_cache::PlanCache {
1412        &self.plan_cache
1413    }
1414
1415    /// v6.3.0 — mutable accessor for v6.3.1 invalidation hooks.
1416    pub fn plan_cache_mut(&mut self) -> &mut plan_cache::PlanCache {
1417        &mut self.plan_cache
1418    }
1419
1420    /// v6.3.3 — Describe a prepared `Statement` without executing.
1421    /// Returns `(parameter_oids, output_columns)`. Empty
1422    /// `output_columns` means the statement has no row-producing
1423    /// shape we could resolve here (JOIN, subquery, non-SELECT, …)
1424    /// — pgwire layer maps that to a `NoData` reply.
1425    pub fn describe_prepared(&self, stmt: &Statement) -> (Vec<u32>, Vec<ColumnSchema>) {
1426        describe::describe_prepared(stmt, self.active_catalog())
1427    }
1428
1429    /// v6.1.1 — execute a [`Statement`] previously returned by
1430    /// [`Engine::prepare`], substituting `Expr::Placeholder(n)`
1431    /// nodes for the corresponding [`Value`] in `params` (1-based
1432    /// per PG: `$1` → `params[0]`). Bind-time string parameters
1433    /// are decoded into typed `Value`s by the pgwire layer before
1434    /// this call so the resulting AST hits the same execution
1435    /// path as a simple query — no SQL re-parse.
1436    ///
1437    /// Pgwire's `Execute` (E) message after a `Bind` (B) lands here.
1438    pub fn execute_prepared(
1439        &mut self,
1440        mut stmt: Statement,
1441        params: &[Value],
1442    ) -> Result<QueryResult, EngineError> {
1443        substitute_placeholders(&mut stmt, params)?;
1444        self.execute_stmt_with_cancel(stmt, CancelToken::none())
1445    }
1446
1447    fn execute_inner_with_cancel(
1448        &mut self,
1449        sql: &str,
1450        cancel: CancelToken<'_>,
1451    ) -> Result<QueryResult, EngineError> {
1452        cancel.check()?;
1453        let stmt = self.prepare(sql)?;
1454        // v6.5.1 — wrap the executor with a wall-clock window so we
1455        // can record into spg_stat_query. Skip when the engine has
1456        // no clock attached (no_std embedded callers).
1457        let start_us = self.clock.map(|f| f());
1458        let result = self.execute_stmt_with_cancel(stmt, cancel);
1459        if let (Some(t0), Ok(_)) = (start_us, &result) {
1460            let now = self.clock.map_or(t0, |f| f());
1461            let elapsed = now.saturating_sub(t0).max(0) as u64;
1462            self.query_stats.record(sql, elapsed, now as u64);
1463            // v6.5.6 — slow-query log: fire callback when elapsed
1464            // exceeds the configured floor.
1465            if let (Some(threshold), Some(logger)) =
1466                (self.slow_query_threshold_us, self.slow_query_logger)
1467                && elapsed >= threshold
1468            {
1469                logger(sql, elapsed);
1470            }
1471        }
1472        result
1473    }
1474
1475    fn execute_stmt_with_cancel(
1476        &mut self,
1477        stmt: Statement,
1478        cancel: CancelToken<'_>,
1479    ) -> Result<QueryResult, EngineError> {
1480        cancel.check()?;
1481        let result = match stmt {
1482            Statement::CreateTable(s) => self.exec_create_table(s),
1483            // v7.9.15 — CREATE EXTENSION is a no-op on SPG. Returns
1484            // CommandOk with affected=0; modified_catalog=false so
1485            // the WAL doesn't grow a useless entry. mailrs F3.
1486            Statement::CreateExtension(_) => Ok(QueryResult::CommandOk {
1487                affected: 0,
1488                modified_catalog: false,
1489            }),
1490            // v7.9.27 — DO $$ ... $$ is also a no-op (SPG has no
1491            // PL/pgSQL). mailrs H1 + pg_dump compat.
1492            Statement::DoBlock => Ok(QueryResult::CommandOk {
1493                affected: 0,
1494                modified_catalog: false,
1495            }),
1496            Statement::CreateIndex(s) => self.exec_create_index(s),
1497            Statement::Insert(s) => self.exec_insert(s),
1498            Statement::Update(s) => self.exec_update_cancel(&s, cancel),
1499            Statement::Delete(s) => self.exec_delete_cancel(&s, cancel),
1500            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1501            Statement::Begin => self.exec_begin(),
1502            Statement::Commit => self.exec_commit(),
1503            Statement::Rollback => self.exec_rollback(),
1504            Statement::Savepoint(name) => self.exec_savepoint(name),
1505            Statement::RollbackToSavepoint(name) => self.exec_rollback_to_savepoint(&name),
1506            Statement::ReleaseSavepoint(name) => self.exec_release_savepoint(&name),
1507            Statement::ShowTables => Ok(self.exec_show_tables()),
1508            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1509            Statement::ShowUsers => Ok(self.exec_show_users()),
1510            Statement::ShowPublications => Ok(self.exec_show_publications()),
1511            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1512            Statement::CreateUser(s) => self.exec_create_user(&s),
1513            Statement::DropUser(name) => self.exec_drop_user(&name),
1514            Statement::Explain(e) => self.exec_explain(&e, cancel),
1515            Statement::AlterIndex(s) => self.exec_alter_index(s),
1516            Statement::AlterTable(s) => self.exec_alter_table(s),
1517            Statement::CreatePublication(s) => self.exec_create_publication(s),
1518            Statement::DropPublication(name) => self.exec_drop_publication(&name),
1519            Statement::CreateSubscription(s) => self.exec_create_subscription(s),
1520            Statement::DropSubscription(name) => self.exec_drop_subscription(&name),
1521            // v6.1.7 — WAIT FOR WAL POSITION needs `lag_state`,
1522            // which lives in spg-server's ServerState. The engine
1523            // surfaces a clear error; the server-layer dispatch
1524            // intercepts the SQL before it reaches the engine on
1525            // a server build, so this arm only fires for
1526            // engine-only callers (spg-embedded, lib tests).
1527            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1528                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1529            )),
1530            // v6.2.0 — ANALYZE recomputes per-column histograms.
1531            Statement::Analyze(target) => self.exec_analyze(target.as_deref()),
1532            // v6.7.3 — COMPACT COLD SEGMENTS.
1533            Statement::CompactColdSegments => self.exec_compact_cold_segments(),
1534            // v7.12.1 — SET / RESET session parameter. Engine
1535            // tracks the value in `session_params`; FTS dispatcher
1536            // reads `default_text_search_config`. Everything else
1537            // is a recorded no-op (PG dump compat).
1538            Statement::SetParameter { name, value } => {
1539                self.set_session_param(name, value);
1540                Ok(QueryResult::CommandOk {
1541                    affected: 0,
1542                    modified_catalog: false,
1543                })
1544            }
1545            // v7.12.4 — CREATE FUNCTION / CREATE TRIGGER / DROP …
1546            // for the PL/pgSQL trigger surface. exec_* methods are
1547            // defined alongside the existing CREATE handlers below.
1548            Statement::CreateFunction(s) => self.exec_create_function(s),
1549            Statement::CreateTrigger(s) => self.exec_create_trigger(s),
1550            Statement::DropTrigger {
1551                name,
1552                table,
1553                if_exists,
1554            } => self.exec_drop_trigger(&name, &table, if_exists),
1555            Statement::DropFunction { name, if_exists } => {
1556                self.exec_drop_function(&name, if_exists)
1557            }
1558            Statement::ResetParameter(target) => {
1559                match target {
1560                    None => self.session_params.clear(),
1561                    Some(name) => {
1562                        self.session_params.remove(&name.to_ascii_lowercase());
1563                    }
1564                }
1565                Ok(QueryResult::CommandOk {
1566                    affected: 0,
1567                    modified_catalog: false,
1568                })
1569            }
1570        };
1571        self.enforce_row_limit(result)
1572    }
1573
1574    /// v6.1.2 — `CREATE PUBLICATION` runtime path. Duplicate names
1575    /// surface as `EngineError::Unsupported` so the existing PG-wire
1576    /// error mapping stays uniform; the message carries the name so
1577    /// operators can grep replication-log noise. Inside-transaction
1578    /// invocation is rejected (matches `CREATE USER` / `DROP USER`
1579    /// stance) — replication-catalog mutation is a connection-level
1580    /// administrative op, not a transactional one.
1581    fn exec_create_publication(
1582        &mut self,
1583        s: CreatePublicationStatement,
1584    ) -> Result<QueryResult, EngineError> {
1585        // v6.1.4 — the v6.1.2 "no DDL inside a transaction" guard
1586        // was over-cautious: it also blocked the auto-commit wrap
1587        // path (which begins an internal TX around every WAL-
1588        // logged statement). PG itself allows CREATE PUBLICATION
1589        // inside a transaction (it rolls back with the TX).
1590        self.publications
1591            .create(s.name, s.scope)
1592            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE PUBLICATION: {e:?}")))?;
1593        Ok(QueryResult::CommandOk {
1594            affected: 1,
1595            modified_catalog: true,
1596        })
1597    }
1598
1599    /// v6.1.2 — `DROP PUBLICATION` runtime path. PG-compatible silent
1600    /// no-op when the publication doesn't exist (returns `affected=0`
1601    /// in that case so the wire-level command tag distinguishes
1602    /// "dropped" from "no-op", though both succeed).
1603    fn exec_drop_publication(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1604        let removed = self.publications.drop(name);
1605        Ok(QueryResult::CommandOk {
1606            affected: usize::from(removed),
1607            modified_catalog: removed,
1608        })
1609    }
1610
1611    /// v6.1.2 — read access to the publication catalog. Used by
1612    /// the v6.1.5 publisher-side WAL filter, by `SHOW PUBLICATIONS`
1613    /// (v6.1.3+), and by e2e tests that need to assert state without
1614    /// going through the wire.
1615    pub const fn publications(&self) -> &publications::Publications {
1616        &self.publications
1617    }
1618
1619    /// v6.1.4 — `CREATE SUBSCRIPTION` runtime path. Defaults
1620    /// `enabled = true` and `last_received_pos = 0` for a freshly-
1621    /// created subscription. The actual worker thread is spawned
1622    /// by spg-server once the engine returns success.
1623    fn exec_create_subscription(
1624        &mut self,
1625        s: CreateSubscriptionStatement,
1626    ) -> Result<QueryResult, EngineError> {
1627        // See exec_create_publication — the in_transaction gate
1628        // was over-cautious; the auto-commit wrap path holds an
1629        // internal TX that this check was incorrectly blocking.
1630        let sub = subscriptions::Subscription {
1631            conn_str: s.conn_str,
1632            publications: s.publications,
1633            enabled: true,
1634            last_received_pos: 0,
1635        };
1636        self.subscriptions
1637            .create(s.name, sub)
1638            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE SUBSCRIPTION: {e:?}")))?;
1639        Ok(QueryResult::CommandOk {
1640            affected: 1,
1641            modified_catalog: true,
1642        })
1643    }
1644
1645    /// v6.1.4 — `DROP SUBSCRIPTION`. Silent no-op when the name
1646    /// doesn't exist (PG-compatible). The associated worker is
1647    /// torn down by spg-server when it observes the catalog
1648    /// change at the next snapshot or via the engine's
1649    /// subscriptions accessor (the worker polls the catalog on
1650    /// reconnect; v6.1.5's filter-side will tighten this to an
1651    /// explicit signal).
1652    fn exec_drop_subscription(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1653        let removed = self.subscriptions.drop(name);
1654        Ok(QueryResult::CommandOk {
1655            affected: usize::from(removed),
1656            modified_catalog: removed,
1657        })
1658    }
1659
1660    /// v6.1.4 — read access to the subscription catalog. Used by
1661    /// the subscription worker (read its own row to find its
1662    /// publications + last applied position), by SHOW SUBSCRIPTIONS,
1663    /// and by e2e tests asserting state directly.
1664    pub const fn subscriptions(&self) -> &subscriptions::Subscriptions {
1665        &self.subscriptions
1666    }
1667
1668    /// v6.1.4 — write access to `last_received_pos`. Worker
1669    /// calls this after each apply batch (under the engine's
1670    /// write-lock). Returns `false` when the subscription was
1671    /// dropped between when the worker received the record and
1672    /// when this call landed.
1673    pub fn subscription_advance(&mut self, name: &str, pos: u64) -> bool {
1674        self.subscriptions.update_last_received_pos(name, pos)
1675    }
1676
1677    /// v6.1.4 — `SHOW SUBSCRIPTIONS` row materialisation. Returns
1678    /// `(name, conn_str, publications, enabled, last_received_pos)`
1679    /// ordered by subscription name. The `publications` column is
1680    /// the comma-joined list ("p1, p2") for ergonomic SHOW output;
1681    /// callers wanting structured access read `Engine::subscriptions`.
1682    fn exec_show_subscriptions(&self) -> QueryResult {
1683        let columns = alloc::vec![
1684            ColumnSchema::new("name", DataType::Text, false),
1685            ColumnSchema::new("conn_str", DataType::Text, false),
1686            ColumnSchema::new("publications", DataType::Text, false),
1687            ColumnSchema::new("enabled", DataType::Bool, false),
1688            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1689        ];
1690        let rows: Vec<Row> = self
1691            .subscriptions
1692            .iter()
1693            .map(|(name, sub)| {
1694                Row::new(alloc::vec![
1695                    Value::Text(name.clone()),
1696                    Value::Text(sub.conn_str.clone()),
1697                    Value::Text(sub.publications.join(", ")),
1698                    Value::Bool(sub.enabled),
1699                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1700                ])
1701            })
1702            .collect();
1703        QueryResult::Rows { columns, rows }
1704    }
1705
1706    /// v6.2.0 — materialise `spg_statistic` rows. One row per
1707    /// `(table, column)` pair tracked in `Statistics`, with
1708    /// `histogram_bounds` rendered as a `[v0, v1, ...]` string —
1709    /// the same canonical form vector literals use for round-trip.
1710    fn exec_spg_statistic(&self) -> QueryResult {
1711        let columns = alloc::vec![
1712            ColumnSchema::new("table_name", DataType::Text, false),
1713            ColumnSchema::new("column_name", DataType::Text, false),
1714            ColumnSchema::new("null_frac", DataType::Float, false),
1715            ColumnSchema::new("n_distinct", DataType::BigInt, false),
1716            ColumnSchema::new("histogram_bounds", DataType::Text, false),
1717            // v6.7.0 — appended column (v6.2.0 stability contract
1718            // allows APPEND to spg_statistic, not reorder/rename).
1719            // Reports the cached per-table cold-row count; same
1720            // value across every column row of the same table.
1721            ColumnSchema::new("cold_row_count", DataType::BigInt, false),
1722        ];
1723        let rows: Vec<Row> = self
1724            .statistics
1725            .iter()
1726            .map(|((t, c), s)| {
1727                let cold = self
1728                    .catalog
1729                    .get(t)
1730                    .map_or(0, |table| table.cold_row_count());
1731                Row::new(alloc::vec![
1732                    Value::Text(t.clone()),
1733                    Value::Text(c.clone()),
1734                    Value::Float(f64::from(s.null_frac)),
1735                    Value::BigInt(i64::try_from(s.n_distinct).unwrap_or(i64::MAX)),
1736                    Value::Text(render_histogram_bounds(&s.histogram_bounds)),
1737                    Value::BigInt(i64::try_from(cold).unwrap_or(i64::MAX)),
1738                ])
1739            })
1740            .collect();
1741        QueryResult::Rows { columns, rows }
1742    }
1743
1744    /// v6.5.0 — materialise `spg_stat_replication` rows. One row
1745    /// per subscription with `(name, conn_str, publications,
1746    /// last_received_pos, enabled)`. Surface mirrors
1747    /// `SHOW SUBSCRIPTIONS` but follows the virtual-table dispatch
1748    /// shape so it composes with SELECT clauses (WHERE, projection
1749    /// onto specific columns, etc).
1750    fn exec_spg_stat_replication(&self) -> QueryResult {
1751        let columns = alloc::vec![
1752            ColumnSchema::new("name", DataType::Text, false),
1753            ColumnSchema::new("conn_str", DataType::Text, false),
1754            ColumnSchema::new("publications", DataType::Text, false),
1755            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1756            ColumnSchema::new("enabled", DataType::Bool, false),
1757        ];
1758        let rows: Vec<Row> = self
1759            .subscriptions
1760            .iter()
1761            .map(|(name, sub)| {
1762                Row::new(alloc::vec![
1763                    Value::Text(name.clone()),
1764                    Value::Text(sub.conn_str.clone()),
1765                    Value::Text(sub.publications.join(",")),
1766                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1767                    Value::Bool(sub.enabled),
1768                ])
1769            })
1770            .collect();
1771        QueryResult::Rows { columns, rows }
1772    }
1773
1774    /// v6.5.0 — materialise `spg_stat_segment` rows. One row per
1775    /// cold-tier segment with `(segment_id, num_rows, num_pages,
1776    /// total_bytes)`.
1777    ///
1778    /// v6.7.0 — appended `table_name` column resolves the v6.5.0
1779    /// carve-out. Walks every user table's BTree indices to find
1780    /// which table's Cold locators point at each segment. Empty
1781    /// string for orphan segments (loaded via SPG_PRELOAD_COLD_SEGMENT
1782    /// before any index registered a locator). The walk is
1783    /// O(tables × indices × keys); cached per call, not across
1784    /// calls — re-walked on every `SELECT * FROM spg_stat_segment`.
1785    fn exec_spg_stat_segment(&self) -> QueryResult {
1786        let columns = alloc::vec![
1787            ColumnSchema::new("segment_id", DataType::BigInt, false),
1788            ColumnSchema::new("table_name", DataType::Text, false),
1789            ColumnSchema::new("num_rows", DataType::BigInt, false),
1790            ColumnSchema::new("num_pages", DataType::BigInt, false),
1791            ColumnSchema::new("total_bytes", DataType::BigInt, false),
1792        ];
1793        // v6.7.0 — build a segment_id → table_name map by walking
1794        // every user table's BTree indices once. O(tables × indices
1795        // × keys) for the v6.5.0 carve-out resolution; acceptable
1796        // because spg_stat_segment is operator-facing (not on a
1797        // hot-loop path).
1798        let mut segment_owners: alloc::collections::BTreeMap<u32, String> = BTreeMap::new();
1799        for tname in self.catalog.table_names() {
1800            if is_internal_table_name(&tname) {
1801                continue;
1802            }
1803            let Some(t) = self.catalog.get(&tname) else {
1804                continue;
1805            };
1806            for idx in t.indices() {
1807                if let spg_storage::IndexKind::BTree(map) = &idx.kind {
1808                    for (_, locs) in map.iter() {
1809                        for loc in locs {
1810                            if let spg_storage::RowLocator::Cold { segment_id, .. } = loc {
1811                                segment_owners
1812                                    .entry(*segment_id)
1813                                    .or_insert_with(|| tname.clone());
1814                            }
1815                        }
1816                    }
1817                }
1818            }
1819        }
1820        let rows: Vec<Row> = self
1821            .catalog
1822            .cold_segment_ids_global()
1823            .iter()
1824            .filter_map(|&id| {
1825                let seg = self.catalog.cold_segment(id)?;
1826                let meta = seg.meta();
1827                let owner = segment_owners.get(&id).cloned().unwrap_or_default();
1828                Some(Row::new(alloc::vec![
1829                    Value::BigInt(i64::from(id)),
1830                    Value::Text(owner),
1831                    Value::BigInt(i64::try_from(meta.num_rows).unwrap_or(i64::MAX)),
1832                    Value::BigInt(i64::from(meta.num_pages)),
1833                    Value::BigInt(i64::try_from(meta.total_bytes).unwrap_or(i64::MAX)),
1834                ]))
1835            })
1836            .collect();
1837        QueryResult::Rows { columns, rows }
1838    }
1839
1840    /// v6.5.1 — materialise `spg_stat_query` rows. One row per
1841    /// distinct SQL text recorded since the engine booted, capped
1842    /// at `QUERY_STATS_MAX` (1024). Columns:
1843    ///   sql, exec_count, total_us, mean_us, max_us, last_seen_us
1844    /// mean_us = total_us / exec_count (saturating).
1845    fn exec_spg_stat_query(&self) -> QueryResult {
1846        let columns = alloc::vec![
1847            ColumnSchema::new("sql", DataType::Text, false),
1848            ColumnSchema::new("exec_count", DataType::BigInt, false),
1849            ColumnSchema::new("total_us", DataType::BigInt, false),
1850            ColumnSchema::new("mean_us", DataType::BigInt, false),
1851            ColumnSchema::new("max_us", DataType::BigInt, false),
1852            ColumnSchema::new("last_seen_us", DataType::BigInt, false),
1853        ];
1854        let rows: Vec<Row> = self
1855            .query_stats
1856            .snapshot()
1857            .into_iter()
1858            .map(|(sql, s)| {
1859                let mean = if s.exec_count == 0 {
1860                    0
1861                } else {
1862                    s.total_us / s.exec_count
1863                };
1864                Row::new(alloc::vec![
1865                    Value::Text(sql),
1866                    Value::BigInt(i64::try_from(s.exec_count).unwrap_or(i64::MAX)),
1867                    Value::BigInt(i64::try_from(s.total_us).unwrap_or(i64::MAX)),
1868                    Value::BigInt(i64::try_from(mean).unwrap_or(i64::MAX)),
1869                    Value::BigInt(i64::try_from(s.max_us).unwrap_or(i64::MAX)),
1870                    Value::BigInt(i64::try_from(s.last_seen_us).unwrap_or(i64::MAX)),
1871                ])
1872            })
1873            .collect();
1874        QueryResult::Rows { columns, rows }
1875    }
1876
1877    /// v6.5.2 — register a connection-state provider. spg-server
1878    /// calls this at startup with a function that snapshots its
1879    /// per-pgwire-connection registry. Engine reads through the
1880    /// callback on `SELECT * FROM spg_stat_activity`.
1881    #[must_use]
1882    pub const fn with_activity_provider(mut self, f: ActivityProvider) -> Self {
1883        self.activity_provider = Some(f);
1884        self
1885    }
1886
1887    /// v6.5.3 — register audit chain provider + verifier.
1888    #[must_use]
1889    pub const fn with_audit_providers(
1890        mut self,
1891        chain: AuditChainProvider,
1892        verify: AuditVerifier,
1893    ) -> Self {
1894        self.audit_chain_provider = Some(chain);
1895        self.audit_verifier = Some(verify);
1896        self
1897    }
1898
1899    /// v6.5.6 — register a slow-query log callback. `threshold_us`
1900    /// is the floor (in microseconds); only executes above the floor
1901    /// fire the callback. spg-server wires this from
1902    /// `SPG_SLOW_QUERY_THRESHOLD_MS` (default 100 ms).
1903    #[must_use]
1904    pub const fn with_slow_query_log(mut self, threshold_us: u64, logger: SlowQueryLogger) -> Self {
1905        self.slow_query_threshold_us = Some(threshold_us);
1906        self.slow_query_logger = Some(logger);
1907        self
1908    }
1909
1910    /// v6.5.6 — operator knob for plan cache cap. spg-server reads
1911    /// `SPG_PLAN_CACHE_MAX` env at startup; uses this to override
1912    /// the compile-time default of 256.
1913    pub fn set_plan_cache_max(&mut self, n: usize) {
1914        self.plan_cache.set_max_entries(n);
1915    }
1916
1917    /// v6.5.2 — materialise `spg_stat_activity` rows. Pulls a fresh
1918    /// snapshot from the registered `ActivityProvider`. Returns an
1919    /// empty result set when no provider is registered (the no_std
1920    /// embedded path with no pgwire layer).
1921    fn exec_spg_stat_activity(&self) -> QueryResult {
1922        let columns = alloc::vec![
1923            ColumnSchema::new("pid", DataType::Int, false),
1924            ColumnSchema::new("user", DataType::Text, false),
1925            ColumnSchema::new("started_at_us", DataType::BigInt, false),
1926            ColumnSchema::new("current_sql", DataType::Text, false),
1927            ColumnSchema::new("wait_event", DataType::Text, false),
1928            ColumnSchema::new("elapsed_us", DataType::BigInt, false),
1929            ColumnSchema::new("in_transaction", DataType::Bool, false),
1930        ];
1931        let rows: Vec<Row> = self
1932            .activity_provider
1933            .map(|f| f())
1934            .unwrap_or_default()
1935            .into_iter()
1936            .map(|r| {
1937                Row::new(alloc::vec![
1938                    Value::Int(i32::try_from(r.pid).unwrap_or(i32::MAX)),
1939                    Value::Text(r.user),
1940                    Value::BigInt(r.started_at_us),
1941                    Value::Text(r.current_sql),
1942                    Value::Text(r.wait_event),
1943                    Value::BigInt(r.elapsed_us),
1944                    Value::Bool(r.in_transaction),
1945                ])
1946            })
1947            .collect();
1948        QueryResult::Rows { columns, rows }
1949    }
1950
1951    /// v6.5.4 — materialise `spg_table_ddl` rows. One row per user
1952    /// table with `(table_name, ddl)`. Reconstructed from catalog
1953    /// state on demand.
1954    fn exec_spg_table_ddl(&self) -> QueryResult {
1955        let columns = alloc::vec![
1956            ColumnSchema::new("table_name", DataType::Text, false),
1957            ColumnSchema::new("ddl", DataType::Text, false),
1958        ];
1959        let rows: Vec<Row> = self
1960            .catalog
1961            .table_names()
1962            .into_iter()
1963            .filter(|n| !is_internal_table_name(n))
1964            .filter_map(|name| {
1965                let table = self.catalog.get(&name)?;
1966                let ddl = render_create_table(&name, &table.schema().columns);
1967                Some(Row::new(alloc::vec![Value::Text(name), Value::Text(ddl),]))
1968            })
1969            .collect();
1970        QueryResult::Rows { columns, rows }
1971    }
1972
1973    /// v6.5.4 — materialise `spg_role_ddl` rows. One row per user
1974    /// with `(role_name, ddl)`. Password is redacted (matches the
1975    /// `Statement::CreateUser` Display which prints `'<redacted>'`).
1976    fn exec_spg_role_ddl(&self) -> QueryResult {
1977        let columns = alloc::vec![
1978            ColumnSchema::new("role_name", DataType::Text, false),
1979            ColumnSchema::new("ddl", DataType::Text, false),
1980        ];
1981        let rows: Vec<Row> = self
1982            .users
1983            .iter()
1984            .map(|(name, rec)| {
1985                let ddl = alloc::format!(
1986                    "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}'",
1987                    rec.role.as_str(),
1988                );
1989                Row::new(alloc::vec![
1990                    Value::Text(String::from(name)),
1991                    Value::Text(ddl)
1992                ])
1993            })
1994            .collect();
1995        QueryResult::Rows { columns, rows }
1996    }
1997
1998    /// v6.5.4 — materialise `spg_database_ddl`: single row whose
1999    /// `ddl` column concatenates every user table's CREATE +
2000    /// every role's CREATE in deterministic catalog order. Suitable
2001    /// for piping back through `Engine::execute` to recreate a
2002    /// schema-equivalent database.
2003    fn exec_spg_database_ddl(&self) -> QueryResult {
2004        let columns = alloc::vec![ColumnSchema::new("ddl", DataType::Text, false)];
2005        let mut out = String::new();
2006        for (name, rec) in self.users.iter() {
2007            out.push_str(&alloc::format!(
2008                "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}';\n",
2009                rec.role.as_str(),
2010            ));
2011        }
2012        for name in self.catalog.table_names() {
2013            if is_internal_table_name(&name) {
2014                continue;
2015            }
2016            if let Some(table) = self.catalog.get(&name) {
2017                out.push_str(&render_create_table(&name, &table.schema().columns));
2018                out.push_str(";\n");
2019            }
2020        }
2021        QueryResult::Rows {
2022            columns,
2023            rows: alloc::vec![Row::new(alloc::vec![Value::Text(out)])],
2024        }
2025    }
2026
2027    /// v6.5.3 — materialise `spg_audit_chain` rows. Pulls a fresh
2028    /// snapshot from the registered provider; empty when no
2029    /// provider is set.
2030    fn exec_spg_audit_chain(&self) -> QueryResult {
2031        let columns = alloc::vec![
2032            ColumnSchema::new("seq", DataType::BigInt, false),
2033            ColumnSchema::new("ts_ms", DataType::BigInt, false),
2034            ColumnSchema::new("prev_hash", DataType::Text, false),
2035            ColumnSchema::new("entry_hash", DataType::Text, false),
2036            ColumnSchema::new("sql", DataType::Text, false),
2037        ];
2038        let rows: Vec<Row> = self
2039            .audit_chain_provider
2040            .map(|f| f())
2041            .unwrap_or_default()
2042            .into_iter()
2043            .map(|r| {
2044                Row::new(alloc::vec![
2045                    Value::BigInt(r.seq),
2046                    Value::BigInt(r.ts_ms),
2047                    Value::Text(r.prev_hash_hex),
2048                    Value::Text(r.entry_hash_hex),
2049                    Value::Text(r.sql),
2050                ])
2051            })
2052            .collect();
2053        QueryResult::Rows { columns, rows }
2054    }
2055
2056    /// v6.5.3 — materialise `spg_audit_verify` single-row result.
2057    /// `(verified_count, broken_at_seq)` — broken_at_seq is `-1`
2058    /// on a clean chain. Returns one row with both values 0 when
2059    /// no verifier is registered (no-data fallback for embedded
2060    /// callers).
2061    fn exec_spg_audit_verify(&self) -> QueryResult {
2062        let columns = alloc::vec![
2063            ColumnSchema::new("verified_count", DataType::BigInt, false),
2064            ColumnSchema::new("broken_at_seq", DataType::BigInt, false),
2065        ];
2066        let (verified, broken) = self.audit_verifier.map(|f| f()).unwrap_or((0, -1));
2067        let row = Row::new(alloc::vec![Value::BigInt(verified), Value::BigInt(broken),]);
2068        QueryResult::Rows {
2069            columns,
2070            rows: alloc::vec![row],
2071        }
2072    }
2073
2074    /// v6.5.1 — read-only accessor for tests + v6.5.6 ops resets.
2075    pub fn query_stats(&self) -> &query_stats::QueryStats {
2076        &self.query_stats
2077    }
2078
2079    /// v6.5.1 — mutable accessor (clear, etc).
2080    pub fn query_stats_mut(&mut self) -> &mut query_stats::QueryStats {
2081        &mut self.query_stats
2082    }
2083
2084    /// v6.2.0 — read access to the per-column statistics table.
2085    /// Used by the planner (v6.2.2 selectivity functions read this),
2086    /// by `SELECT * FROM spg_statistic`, and by e2e tests.
2087    pub const fn statistics(&self) -> &statistics::Statistics {
2088        &self.statistics
2089    }
2090
2091    /// v6.2.1 — return tables whose modified-row count crossed the
2092    /// auto-analyze threshold since the last ANALYZE on that table.
2093    /// The threshold is `0.1 × max(row_count, MIN_ROWS_FOR_AUTO_
2094    /// ANALYZE)` — combines PG-style fractional + absolute lower
2095    /// bound so a fresh / tiny table doesn't get hammered on every
2096    /// INSERT.
2097    ///
2098    /// Designed to be cheap: walks every user table's
2099    /// `Catalog::table_names()` + reads `statistics::modified_
2100    /// since_last_analyze()` (BTreeMap lookup). The background
2101    /// worker calls this under `engine.read()` then drops the lock
2102    /// before re-acquiring `engine.write()` for the actual ANALYZE.
2103    pub fn tables_needing_analyze(&self) -> Vec<String> {
2104        const MIN_ROWS: u64 = 100;
2105        let mut out = Vec::new();
2106        for name in self.catalog.table_names() {
2107            if is_internal_table_name(&name) {
2108                continue;
2109            }
2110            let Some(table) = self.catalog.get(&name) else {
2111                continue;
2112            };
2113            let row_count = table.rows().len() as u64;
2114            let modified = self.statistics.modified_since_last_analyze(&name);
2115            // Threshold: ceil(0.1 × max(row_count, MIN_ROWS)),
2116            // computed in integer arithmetic so spg-engine stays
2117            // no_std without pulling in libm. `(n + 9) / 10` is
2118            // `ceil(n / 10)` for non-negative `n`.
2119            let base = row_count.max(MIN_ROWS);
2120            let threshold = base.saturating_add(9) / 10;
2121            if modified >= threshold {
2122                out.push(name);
2123            }
2124        }
2125        out
2126    }
2127
2128    /// v6.2.0 — `ANALYZE [<table>]` runtime. Bare `ANALYZE` walks
2129    /// every user table; `ANALYZE <name>` re-stats one. For each
2130    /// target table, single-pass scan + per-column histogram +
2131    /// `null_frac` + `n_distinct`. Replaces the table's prior
2132    /// stats; resets the modified-row counter.
2133    ///
2134    /// v6.2.0 doesn't sample — it scans the full table. v6.2.x
2135    /// can add reservoir sampling at the > 100 K-row mark; not a
2136    /// scope blocker for the current commit since rows ≤ 100 K
2137    /// analyse in milliseconds.
2138    fn exec_analyze(&mut self, target: Option<&str>) -> Result<QueryResult, EngineError> {
2139        let names: Vec<String> = if let Some(name) = target {
2140            // Verify the table exists; surface a clear error if not.
2141            if self.catalog.get(name).is_none() {
2142                return Err(EngineError::Storage(StorageError::TableNotFound {
2143                    name: name.to_string(),
2144                }));
2145            }
2146            alloc::vec![name.to_string()]
2147        } else {
2148            self.catalog
2149                .table_names()
2150                .into_iter()
2151                .filter(|n| !is_internal_table_name(n))
2152                .collect()
2153        };
2154        let mut analysed = 0usize;
2155        for table_name in &names {
2156            self.analyze_one_table(table_name)?;
2157            analysed += 1;
2158        }
2159        // v6.3.1 — plan cache invalidation. Bump stats version so
2160        // future lookups see the new generation, and selectively
2161        // evict every plan whose `source_tables` overlap with the
2162        // ANALYZE target set. Bare ANALYZE (all tables) clears the
2163        // whole cache.
2164        if analysed > 0 {
2165            self.statistics.bump_version();
2166            if target.is_some() {
2167                for t in &names {
2168                    self.plan_cache.evict_referencing(t);
2169                }
2170            } else {
2171                self.plan_cache.clear();
2172            }
2173        }
2174        Ok(QueryResult::CommandOk {
2175            affected: analysed,
2176            modified_catalog: true,
2177        })
2178    }
2179
2180    /// v6.7.3 — `COMPACT COLD SEGMENTS` runtime path. Drives the
2181    /// engine-layer compaction shim with the default
2182    /// 4 MiB segment-size threshold. spg-server intercepts the
2183    /// SQL before it reaches the engine on a server build —
2184    /// it reads `SPG_COMPACTION_TARGET_SEGMENT_BYTES`, calls
2185    /// `Engine::compact_cold_segments_with_target` directly with
2186    /// the env value, and persists every merged segment to
2187    /// v7.12.1 — record a `SET <name> = <value>` parameter. Names
2188    /// are case-folded to lowercase to match PG; values keep their
2189    /// caller-supplied form so observability paths see what was
2190    /// requested. Only `default_text_search_config` is consulted by
2191    /// the engine today.
2192    fn set_session_param(&mut self, name: String, value: spg_sql::ast::SetValue) {
2193        let normalised = match value {
2194            spg_sql::ast::SetValue::String(s) => s,
2195            spg_sql::ast::SetValue::Ident(s) => s,
2196            spg_sql::ast::SetValue::Number(s) => s,
2197            spg_sql::ast::SetValue::Default => String::new(),
2198        };
2199        self.session_params
2200            .insert(name.to_ascii_lowercase(), normalised);
2201    }
2202
2203    /// v7.12.1 — read a session parameter set via `SET`. Used by
2204    /// the FTS function dispatcher to resolve the default config
2205    /// for `to_tsvector(text)` / `plainto_tsquery(text)` etc.
2206    #[must_use]
2207    pub fn session_param(&self, name: &str) -> Option<&str> {
2208        self.session_params
2209            .get(&name.to_ascii_lowercase())
2210            .map(String::as_str)
2211    }
2212
2213    /// v7.12.1 — build an `EvalContext` chained with the session's
2214    /// `default_text_search_config`. Engine-internal callers use
2215    /// this instead of `EvalContext::new` so the FTS function
2216    /// dispatcher sees the SET configuration.
2217    fn ev_ctx<'a>(
2218        &'a self,
2219        columns: &'a [ColumnSchema],
2220        alias: Option<&'a str>,
2221    ) -> EvalContext<'a> {
2222        EvalContext::new(columns, alias)
2223            .with_default_text_search_config(self.session_param("default_text_search_config"))
2224    }
2225
2226    /// `<db>.spg/segments/`. This arm only fires for engine-only
2227    /// callers (spg-embedded, lib tests); in that mode merged
2228    /// segments live in memory and are dropped at process exit.
2229    fn exec_compact_cold_segments(&mut self) -> Result<QueryResult, EngineError> {
2230        let target = COMPACTION_TARGET_DEFAULT_BYTES;
2231        let reports = self.compact_cold_segments_with_target(target)?;
2232        let columns = alloc::vec![
2233            ColumnSchema::new("table_name", DataType::Text, false),
2234            ColumnSchema::new("index_name", DataType::Text, false),
2235            ColumnSchema::new("sources_merged", DataType::BigInt, false),
2236            ColumnSchema::new("merged_segment_id", DataType::BigInt, false),
2237            ColumnSchema::new("merged_rows", DataType::BigInt, false),
2238            ColumnSchema::new("deleted_rows_pruned", DataType::BigInt, false),
2239            ColumnSchema::new("bytes_reclaimed_estimate", DataType::BigInt, false),
2240        ];
2241        let rows: Vec<Row> = reports
2242            .into_iter()
2243            .map(|(tname, iname, report)| {
2244                Row::new(alloc::vec![
2245                    Value::Text(tname),
2246                    Value::Text(iname),
2247                    Value::BigInt(i64::try_from(report.sources.len()).unwrap_or(i64::MAX)),
2248                    Value::BigInt(i64::from(report.merged_segment_id.unwrap_or(0))),
2249                    Value::BigInt(i64::try_from(report.merged_rows).unwrap_or(i64::MAX)),
2250                    Value::BigInt(i64::try_from(report.deleted_rows_pruned).unwrap_or(i64::MAX),),
2251                    Value::BigInt(
2252                        i64::try_from(report.bytes_reclaimed_estimate).unwrap_or(i64::MAX),
2253                    ),
2254                ])
2255            })
2256            .collect();
2257        Ok(QueryResult::Rows { columns, rows })
2258    }
2259
2260    /// Walk a single table's rows once and (re-)populate per-column
2261    /// stats. Drops the existing stats for `table` first so columns
2262    /// that have been DROP-ed between ANALYZEs don't leave stale
2263    /// rows.
2264    fn analyze_one_table(&mut self, table_name: &str) -> Result<(), EngineError> {
2265        let table = self.catalog.get(table_name).ok_or_else(|| {
2266            EngineError::Storage(StorageError::TableNotFound {
2267                name: table_name.to_string(),
2268            })
2269        })?;
2270        let schema = table.schema().clone();
2271        let row_count = table.rows().len();
2272        // For each column, collect (sorted) non-NULL textual values
2273        // + count NULLs; then ask `statistics::build_histogram` to
2274        // produce the 101 bounds and `estimate_n_distinct` the
2275        // distinct count.
2276        self.statistics.clear_table(table_name);
2277        for (col_pos, col_schema) in schema.columns.iter().enumerate() {
2278            // v6.2.0 skip: vector columns have their own stats
2279            // shape (HNSW graph topology). v6.2 deliberation #1.
2280            if matches!(col_schema.ty, DataType::Vector { .. }) {
2281                continue;
2282            }
2283            let mut non_null_values: Vec<Value> = Vec::with_capacity(row_count);
2284            let mut nulls: u64 = 0;
2285            for row in table.rows() {
2286                match row.values.get(col_pos) {
2287                    Some(Value::Null) | None => nulls += 1,
2288                    Some(v) => non_null_values.push(v.clone()),
2289                }
2290            }
2291            // Sort by type-aware ordering (Int as int, Text as
2292            // lex, etc.) so histogram bounds reflect the column's
2293            // natural order — not lexicographic on the string
2294            // representation, which would put "9" after "49".
2295            non_null_values.sort_by(|a, b| sort_values_for_histogram(a, b));
2296            let non_null: Vec<String> = non_null_values.iter().map(canonical_value_repr).collect();
2297            let null_frac = if row_count == 0 {
2298                0.0
2299            } else {
2300                #[allow(clippy::cast_precision_loss)]
2301                let f = nulls as f32 / row_count as f32;
2302                f
2303            };
2304            let n_distinct = statistics::estimate_n_distinct(&non_null);
2305            let histogram_bounds = statistics::build_histogram(&non_null);
2306            self.statistics.set(
2307                table_name.to_string(),
2308                col_schema.name.clone(),
2309                statistics::ColumnStats {
2310                    null_frac,
2311                    n_distinct,
2312                    histogram_bounds,
2313                },
2314            );
2315        }
2316        self.statistics.reset_modified(table_name);
2317        // v6.7.0 — refresh the per-table cold_rows cache. Walk the
2318        // BTree indices and count Cold locators (MAX across
2319        // indices); store the result on the table. Surfaced via
2320        // `spg_statistic.cold_row_count` (new column) and
2321        // `spg_stat_segment.table_name` (new column).
2322        let cold_count = {
2323            let table = self
2324                .active_catalog()
2325                .get(table_name)
2326                .expect("table still present");
2327            table.count_cold_locators()
2328        };
2329        let table_mut = self
2330            .active_catalog_mut()
2331            .get_mut(table_name)
2332            .expect("table still present");
2333        table_mut.set_cold_row_count(cold_count);
2334        Ok(())
2335    }
2336
2337    /// v6.1.3 — `SHOW PUBLICATIONS` row materialisation. Returns
2338    /// `(name, scope, table_count)` ordered by publication name.
2339    ///   - `scope` is the human-readable string:
2340    ///       `"FOR ALL TABLES"` /
2341    ///       `"FOR TABLE t1, t2"` /
2342    ///       `"FOR ALL TABLES EXCEPT t1, t2"`.
2343    ///   - `table_count` is NULL for `AllTables`, the list length
2344    ///     otherwise. NULLability lets clients distinguish "publish
2345    ///     everything" from "publish exactly 0 tables" (the v6.1.3
2346    ///     parser forbids the empty list, but the column shape is
2347    ///     ready for the v6.1.5 publisher-side semantics).
2348    fn exec_show_publications(&self) -> QueryResult {
2349        let columns = alloc::vec![
2350            ColumnSchema::new("name", DataType::Text, false),
2351            ColumnSchema::new("scope", DataType::Text, false),
2352            ColumnSchema::new("table_count", DataType::Int, true),
2353        ];
2354        let rows: Vec<Row> = self
2355            .publications
2356            .iter()
2357            .map(|(name, scope)| {
2358                let (scope_str, count_val) = match scope {
2359                    spg_sql::ast::PublicationScope::AllTables => {
2360                        ("FOR ALL TABLES".to_string(), Value::Null)
2361                    }
2362                    spg_sql::ast::PublicationScope::ForTables(ts) => (
2363                        alloc::format!("FOR TABLE {}", ts.join(", ")),
2364                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2365                    ),
2366                    spg_sql::ast::PublicationScope::AllTablesExcept(ts) => (
2367                        alloc::format!("FOR ALL TABLES EXCEPT {}", ts.join(", ")),
2368                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2369                    ),
2370                };
2371                Row::new(alloc::vec![
2372                    Value::Text(name.clone()),
2373                    Value::Text(scope_str),
2374                    count_val,
2375                ])
2376            })
2377            .collect();
2378        QueryResult::Rows { columns, rows }
2379    }
2380
2381    /// v4.1 `SHOW USERS` — `(name, role)` per row, ordered by name.
2382    fn exec_show_users(&self) -> QueryResult {
2383        let columns = alloc::vec![
2384            ColumnSchema::new("name", DataType::Text, false),
2385            ColumnSchema::new("role", DataType::Text, false),
2386        ];
2387        let rows: Vec<Row> = self
2388            .users
2389            .iter()
2390            .map(|(name, rec)| {
2391                Row::new(alloc::vec![
2392                    Value::Text(name.to_string()),
2393                    Value::Text(rec.role.as_str().to_string()),
2394                ])
2395            })
2396            .collect();
2397        QueryResult::Rows { columns, rows }
2398    }
2399
2400    fn exec_create_user(&mut self, s: &CreateUserStatement) -> Result<QueryResult, EngineError> {
2401        if self.in_transaction() {
2402            return Err(EngineError::Unsupported(
2403                "CREATE USER is not allowed inside a transaction".into(),
2404            ));
2405        }
2406        let role = users::Role::parse(&s.role).ok_or_else(|| {
2407            EngineError::Unsupported(alloc::format!("invalid role: {:?}", s.role))
2408        })?;
2409        // Prefer the host-injected RNG. Falls back to a deterministic
2410        // salt derived from the username only when no RNG is wired —
2411        // acceptable for tests; the server always installs one.
2412        let salt = self.salt_fn.map_or_else(
2413            || {
2414                let mut s_bytes = [0u8; 16];
2415                let digest = spg_crypto::hash(s.name.as_bytes());
2416                s_bytes.copy_from_slice(&digest[..16]);
2417                s_bytes
2418            },
2419            |f| f(),
2420        );
2421        self.users
2422            .create(&s.name, &s.password, role, salt)
2423            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE USER: {e}")))?;
2424        Ok(QueryResult::CommandOk {
2425            affected: 1,
2426            modified_catalog: true,
2427        })
2428    }
2429
2430    fn exec_drop_user(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2431        if self.in_transaction() {
2432            return Err(EngineError::Unsupported(
2433                "DROP USER is not allowed inside a transaction".into(),
2434            ));
2435        }
2436        self.users
2437            .drop(name)
2438            .map_err(|e| EngineError::Unsupported(alloc::format!("DROP USER: {e}")))?;
2439        Ok(QueryResult::CommandOk {
2440            affected: 1,
2441            modified_catalog: true,
2442        })
2443    }
2444
2445    /// v7.12.4 — `CREATE [OR REPLACE] FUNCTION`. Stores the
2446    /// function metadata in the catalog. PL/pgSQL bodies are
2447    /// already parsed by the SQL parser; we re-canonicalise the
2448    /// body to source text for storage (the executor re-parses
2449    /// it at trigger fire time — see the trigger fire path).
2450    fn exec_create_function(
2451        &mut self,
2452        s: spg_sql::ast::CreateFunctionStatement,
2453    ) -> Result<QueryResult, EngineError> {
2454        let args_repr = render_function_args(&s.args);
2455        let returns = match &s.returns {
2456            spg_sql::ast::FunctionReturn::Trigger => alloc::string::String::from("TRIGGER"),
2457            spg_sql::ast::FunctionReturn::Void => alloc::string::String::from("VOID"),
2458            spg_sql::ast::FunctionReturn::Type(t) => alloc::format!("{t}"),
2459            spg_sql::ast::FunctionReturn::Other(s) => s.clone(),
2460        };
2461        let body_text = match &s.body {
2462            spg_sql::ast::FunctionBody::PlPgSql(b) => alloc::format!("{b}"),
2463            spg_sql::ast::FunctionBody::Raw(s) => s.clone(),
2464        };
2465        let def = spg_storage::FunctionDef {
2466            name: s.name.clone(),
2467            args_repr,
2468            returns,
2469            language: s.language.clone(),
2470            body: body_text,
2471        };
2472        self.active_catalog_mut()
2473            .create_function(def, s.or_replace)
2474            .map_err(EngineError::Storage)?;
2475        Ok(QueryResult::CommandOk {
2476            affected: 0,
2477            modified_catalog: true,
2478        })
2479    }
2480
2481    /// v7.12.4 — `CREATE [OR REPLACE] TRIGGER`. The referenced
2482    /// function must already exist in the catalog (forward
2483    /// references defer to a later release). Persists the
2484    /// trigger metadata for the row-write hooks below to consult.
2485    fn exec_create_trigger(
2486        &mut self,
2487        s: spg_sql::ast::CreateTriggerStatement,
2488    ) -> Result<QueryResult, EngineError> {
2489        let timing = match s.timing {
2490            spg_sql::ast::TriggerTiming::Before => "BEFORE",
2491            spg_sql::ast::TriggerTiming::After => "AFTER",
2492            spg_sql::ast::TriggerTiming::InsteadOf => "INSTEAD OF",
2493        };
2494        let events: Vec<alloc::string::String> = s
2495            .events
2496            .iter()
2497            .map(|e| match e {
2498                spg_sql::ast::TriggerEvent::Insert => alloc::string::String::from("INSERT"),
2499                spg_sql::ast::TriggerEvent::Update => alloc::string::String::from("UPDATE"),
2500                spg_sql::ast::TriggerEvent::Delete => alloc::string::String::from("DELETE"),
2501                spg_sql::ast::TriggerEvent::Truncate => alloc::string::String::from("TRUNCATE"),
2502            })
2503            .collect();
2504        let for_each = match s.for_each {
2505            spg_sql::ast::TriggerForEach::Row => "ROW",
2506            spg_sql::ast::TriggerForEach::Statement => "STATEMENT",
2507        };
2508        let def = spg_storage::TriggerDef {
2509            name: s.name.clone(),
2510            table: s.table.clone(),
2511            timing: alloc::string::String::from(timing),
2512            events,
2513            for_each: alloc::string::String::from(for_each),
2514            function: s.function.clone(),
2515        };
2516        self.active_catalog_mut()
2517            .create_trigger(def, s.or_replace)
2518            .map_err(EngineError::Storage)?;
2519        Ok(QueryResult::CommandOk {
2520            affected: 0,
2521            modified_catalog: true,
2522        })
2523    }
2524
2525    fn exec_drop_trigger(
2526        &mut self,
2527        name: &str,
2528        table: &str,
2529        if_exists: bool,
2530    ) -> Result<QueryResult, EngineError> {
2531        let removed = self.active_catalog_mut().drop_trigger(name, table);
2532        if !removed && !if_exists {
2533            return Err(EngineError::Storage(spg_storage::StorageError::Corrupt(
2534                alloc::format!("trigger {name:?} on {table:?} does not exist"),
2535            )));
2536        }
2537        Ok(QueryResult::CommandOk {
2538            affected: usize::from(removed),
2539            modified_catalog: removed,
2540        })
2541    }
2542
2543    fn exec_drop_function(
2544        &mut self,
2545        name: &str,
2546        if_exists: bool,
2547    ) -> Result<QueryResult, EngineError> {
2548        let removed = self.active_catalog_mut().drop_function(name);
2549        if !removed && !if_exists {
2550            return Err(EngineError::Storage(spg_storage::StorageError::Corrupt(
2551                alloc::format!("function {name:?} does not exist"),
2552            )));
2553        }
2554        Ok(QueryResult::CommandOk {
2555            affected: usize::from(removed),
2556            modified_catalog: removed,
2557        })
2558    }
2559
    /// v4.4 `UPDATE <table> SET col = expr [, ...] [WHERE cond]`.
    /// Filter pass uses the same WHERE eval as `exec_select`. Per
    /// matched row, evaluate each RHS expression against the *old*
    /// row, then call `Table::update_row` which rebuilds indices.
    /// Indexed columns are correctly reflected because rebuild
    /// happens after the cell rewrite.
    ///
    /// Stages, in execution order:
    ///   0. Snapshot the BEFORE/AFTER UPDATE row triggers and the
    ///      session FTS config.
    ///   1. If WHERE is a PK predicate on an indexed column,
    ///      promote a matching cold-tier row to the hot tier.
    ///   2. Resolve the SET targets, then walk the rows and plan
    ///      `(row position, new values)` without writing anything.
    ///   3. FK passes: outbound check of the planned values, plan
    ///      the inbound (child-table) reactions, apply them.
    ///   4. BEFORE triggers per planned row (may rewrite or skip).
    ///   5. `Table::update_row` + AFTER triggers per surviving row.
    ///   6. Drain trigger-emitted embedded SQL, record the
    ///      modification count (outside a transaction only), and
    ///      build the RETURNING rows or the `CommandOk` result.
    ///
    /// `cancel` is polled every 256 rows during the planning walk
    /// and is also handed to the deferred-statement drain.
    ///
    /// # Errors
    ///
    /// `TableNotFound` for an unknown table, `ColumnNotFound` for
    /// an unknown SET target, and any error propagated from
    /// expression evaluation, value coercion, the FK passes,
    /// `update_row`, the cancel check, or the deferred-statement
    /// drain. Trigger failures are wrapped in
    /// `StorageError::Corrupt`.
    fn exec_update_cancel(
        &mut self,
        stmt: &spg_sql::ast::UpdateStatement,
        cancel: CancelToken<'_>,
    ) -> Result<QueryResult, EngineError> {
        // v7.12.5 — snapshot BEFORE/AFTER UPDATE row triggers + the
        // session FTS config before the table mut-borrow opens (the
        // INSERT path uses the same pattern). Empty vecs are the
        // common "no triggers on this table" fast path.
        let before_update_triggers = self.snapshot_row_triggers(&stmt.table, "UPDATE", "BEFORE");
        let after_update_triggers = self.snapshot_row_triggers(&stmt.table, "UPDATE", "AFTER");
        // NOTE(review): this reads the same session key as `ts_cfg`
        // further down; the two snapshots look redundant.
        let trigger_session_cfg: Option<String> = self
            .session_params
            .get("default_text_search_config")
            .cloned();
        // v5.2.3: if the WHERE is a PK equality and matches a cold-
        // tier row, promote it back to the hot tier *before* the
        // hot-row walk. The promote pushes the row to the end of
        // `table.rows`, where the upcoming SET-evaluation loop will
        // pick it up and apply the assignments. Lookups for the key
        // never observe a gap because `promote_cold_row` inserts the
        // hot row before retiring the cold locator.
        if let Some(w) = &stmt.where_ {
            // Owned copy of the column list: the shared catalog
            // borrow must end before `active_catalog_mut()` below.
            let schema_cols = self
                .active_catalog()
                .get(&stmt.table)
                .ok_or_else(|| {
                    EngineError::Storage(StorageError::TableNotFound {
                        name: stmt.table.clone(),
                    })
                })?
                .schema()
                .columns
                .clone();
            // Only promote when the predicate resolves to a
            // (column, key) pair AND that column has an index.
            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
                && let Some(idx_name) = self
                    .active_catalog()
                    .get(&stmt.table)
                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
            {
                // Promote may be a no-op (key is hot-only or absent);
                // we don't care about the return value here — the
                // subsequent hot walk will either match or not.
                let _ = self
                    .active_catalog_mut()
                    .promote_cold_row(&stmt.table, &idx_name, &key);
            }
        }

        // v7.12.1 — cache session FTS config before the table
        // mut-borrow (same reason as exec_delete).
        let ts_cfg: Option<String> = self
            .session_param("default_text_search_config")
            .map(String::from);
        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
        // Resolve each SET target to a column position once, validate
        // up front so a typo'd column doesn't leave a partial mutation
        // behind.
        let mut targets: Vec<(usize, &Expr)> = Vec::with_capacity(stmt.assignments.len());
        for (col, expr) in &stmt.assignments {
            let pos = schema_cols
                .iter()
                .position(|c| c.name == *col)
                .ok_or_else(|| {
                    EngineError::Eval(EvalError::ColumnNotFound { name: col.clone() })
                })?;
            targets.push((pos, expr));
        }
        // Built by hand rather than via `ev_ctx`, presumably because
        // `table` holds a mutable borrow of the catalog here — hence
        // the `ts_cfg` snapshot above.
        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()))
            .with_default_text_search_config(ts_cfg.as_deref());
        // Walk every row, evaluate WHERE then SET expressions. We
        // gather (position, new_values) tuples first and apply them
        // afterwards so the WHERE/RHS evaluation reads the original
        // row state — matches PG semantics (UPDATE doesn't see its
        // own writes).
        let mut planned: Vec<(usize, Vec<Value>)> = Vec::new();
        for (i, row) in table.rows().iter().enumerate() {
            // v4.5: cooperative cancel checkpoint every 256 rows so
            // a runaway UPDATE without WHERE doesn't drag past the
            // server's query-timeout watchdog.
            if i.is_multiple_of(256) {
                cancel.check()?;
            }
            if let Some(w) = &stmt.where_ {
                let cond = eval::eval_expr(w, row, &ctx)?;
                // Only a definite `true` selects the row; any other
                // result (NULL included) skips it.
                if !matches!(cond, Value::Bool(true)) {
                    continue;
                }
            }
            let mut new_vals = row.values.clone();
            for (pos, expr) in &targets {
                let v = eval::eval_expr(expr, row, &ctx)?;
                // NOTE(review): indexes by schema position, so this
                // assumes every row has at least as many cells as
                // the schema has columns — confirm that invariant.
                new_vals[*pos] =
                    coerce_value(v, schema_cols[*pos].ty, &schema_cols[*pos].name, *pos)?;
            }
            planned.push((i, new_vals));
        }
        // v7.6.6 — capture pre-update row values for the FK
        // enforcement passes below. `planned` carries new values
        // only; pair them with the old row.
        // Tuple layout: (row position, old values, new values).
        let plan_with_old: Vec<(usize, Vec<Value>, Vec<Value>)> = planned
            .iter()
            .map(|(pos, new_vals)| (*pos, table.rows()[*pos].values.clone(), new_vals.clone()))
            .collect();
        let self_fks = table.schema().foreign_keys.clone();
        // v7.12.5 — `affected` is computed post-BEFORE-trigger
        // below (triggers may RETURN NULL to skip individual
        // rows). The pre-trigger len shape is no longer accurate.
        // Release mutable borrow on `table` for the FK passes.
        // NOTE(review): `let _ = table;` neither moves nor drops
        // the reference; the borrow actually ends because `table`
        // is not used again before it is re-bound below.
        let _ = table;
        // v7.6.6 — Stage 2a: outbound FK check. For every row whose
        // local FK columns changed, the new value must exist in the
        // parent.
        // NOTE(review): this pass and Stage 2b/3a see the planned
        // values from before the BEFORE triggers run; nothing in
        // this function re-checks a value a trigger rewrites, and
        // the child-side steps are applied even for rows a trigger
        // later skips — confirm this is intended.
        if !self_fks.is_empty() {
            let new_rows: Vec<Vec<Value>> = planned
                .iter()
                .map(|(_pos, new_vals)| new_vals.clone())
                .collect();
            enforce_fk_inserts(self.active_catalog(), &stmt.table, &self_fks, &new_rows)?;
        }
        // v7.6.6 — Stage 2b: inbound FK check. For every row that
        // changed value in a column that *some other table* uses as
        // a FK parent column, react per `on_update` action.
        let child_plan =
            plan_fk_parent_updates(self.active_catalog(), &stmt.table, &plan_with_old)?;
        // Stage 3a — apply each child-side action.
        for step in &child_plan {
            apply_fk_child_step(self.active_catalog_mut(), step)?;
        }
        // Stage 3b — apply the original UPDATE.
        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        // v7.12.5 — fire BEFORE/AFTER UPDATE row-level triggers
        // around the apply loop. BEFORE sees NEW=candidate +
        // OLD=current; may rewrite NEW or RETURN NULL to skip.
        // AFTER sees NEW=post-write + OLD=pre-write (both read-
        // only).
        //
        // Filter `planned` through the BEFORE pass first so the
        // RETURNING snapshot reflects what actually got written
        // (triggers may rewrite cells, including a cancellation).
        // Tuple layout: (row position, NEW row, OLD row).
        let mut applied_after_before: Vec<(usize, Row, Row)> = Vec::with_capacity(planned.len());
        // v7.12.7 — embedded SQL queue.
        let mut deferred_embedded: Vec<triggers::DeferredEmbeddedStmt> = Vec::new();
        for (pos, new_vals) in &planned {
            let old_row = table.rows()[*pos].clone();
            let mut new_row = Row::new(new_vals.clone());
            let mut skip = false;
            // Triggers chain: each one receives the NEW row as left
            // by the previous trigger; the first `Skip` drops the
            // row from the update.
            for fd in &before_update_triggers {
                // NOTE(review): the trailing `false` here vs `true`
                // in the AFTER pass is presumably a read-only flag,
                // and `&[]` an unused extra-argument slot — confirm
                // against `triggers::fire_row_trigger`.
                let (outcome, deferred) = triggers::fire_row_trigger(
                    fd,
                    Some(new_row.clone()),
                    Some(&old_row),
                    &stmt.table,
                    &schema_cols,
                    &[],
                    trigger_session_cfg.as_deref(),
                    false,
                )
                .map_err(|e| EngineError::Storage(StorageError::Corrupt(alloc::format!("{e}"))))?;
                deferred_embedded.extend(deferred);
                match outcome {
                    triggers::TriggerOutcome::Row(r) => new_row = r,
                    triggers::TriggerOutcome::Skip => {
                        skip = true;
                        break;
                    }
                }
            }
            if !skip {
                applied_after_before.push((*pos, new_row, old_row));
            }
        }
        // v7.9.4 — snapshot post-update values for RETURNING (post-
        // BEFORE-trigger because triggers can rewrite cells).
        let updated_for_returning: Vec<Vec<Value>> = if stmt.returning.is_some() {
            applied_after_before
                .iter()
                .map(|(_pos, new_row, _old)| new_row.values.clone())
                .collect()
        } else {
            Vec::new()
        };
        let affected = applied_after_before.len();
        // Apply, then fire AFTER triggers per row. AFTER runs read-
        // only against the freshly-written row; v7.12.4-shape
        // assignment errors with a clear message.
        for (pos, new_row, old_row) in applied_after_before {
            table.update_row(pos, new_row.values.clone())?;
            for fd in &after_update_triggers {
                // The AFTER outcome is ignored; only the embedded
                // statements the trigger queued are kept.
                let (_outcome, deferred) = triggers::fire_row_trigger(
                    fd,
                    Some(new_row.clone()),
                    Some(&old_row),
                    &stmt.table,
                    &schema_cols,
                    &[],
                    trigger_session_cfg.as_deref(),
                    true,
                )
                .map_err(|e| EngineError::Storage(StorageError::Corrupt(alloc::format!("{e}"))))?;
                deferred_embedded.extend(deferred);
            }
        }
        // As above, this statement is a no-op marker for the end of
        // the `table` borrow.
        let _ = table;
        // v7.12.7 — drain trigger-emitted embedded SQL for this UPDATE.
        self.execute_deferred_trigger_stmts(deferred_embedded, cancel)?;
        // v6.2.1 — auto-analyze modified-row tracking for UPDATE.
        // Skipped while a transaction is open.
        if !self.in_transaction() && affected > 0 {
            self.statistics
                .record_modifications(&stmt.table, affected as u64);
        }
        // v7.9.4 — RETURNING projection.
        if let Some(items) = &stmt.returning {
            return self.build_returning_rows(&stmt.table, items, updated_for_returning);
        }
        // `modified_catalog` is only reported outside a transaction.
        Ok(QueryResult::CommandOk {
            affected,
            modified_catalog: !self.in_transaction(),
        })
    }
2801
2802    /// v4.4 `DELETE FROM <table> [WHERE cond]`. Collects matching
2803    /// positions then delegates to `Table::delete_rows` (single index
2804    /// rebuild for the batch).
2805    fn exec_delete_cancel(
2806        &mut self,
2807        stmt: &spg_sql::ast::DeleteStatement,
2808        cancel: CancelToken<'_>,
2809    ) -> Result<QueryResult, EngineError> {
2810        // v7.12.5 — snapshot BEFORE/AFTER DELETE row triggers + the
2811        // session FTS config before the mut borrow (same shape as
2812        // INSERT / UPDATE).
2813        let before_delete_triggers = self.snapshot_row_triggers(&stmt.table, "DELETE", "BEFORE");
2814        let after_delete_triggers = self.snapshot_row_triggers(&stmt.table, "DELETE", "AFTER");
2815        let trigger_session_cfg: Option<String> = self
2816            .session_params
2817            .get("default_text_search_config")
2818            .cloned();
2819        // v5.2.3: PK-targeted DELETE → first retire any cold-tier
2820        // locator for the key. The cold row body stays in the
2821        // segment (becoming shadowed garbage that a future
2822        // compaction pass reclaims) but the index no longer
2823        // resolves it. The shadow count contributes to the
2824        // affected total; the subsequent hot walk handles any hot
2825        // rows for the same key.
2826        let mut cold_shadow_count: usize = 0;
2827        if let Some(w) = &stmt.where_ {
2828            let schema_cols = self
2829                .active_catalog()
2830                .get(&stmt.table)
2831                .ok_or_else(|| {
2832                    EngineError::Storage(StorageError::TableNotFound {
2833                        name: stmt.table.clone(),
2834                    })
2835                })?
2836                .schema()
2837                .columns
2838                .clone();
2839            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
2840                && let Some(idx_name) = self
2841                    .active_catalog()
2842                    .get(&stmt.table)
2843                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
2844            {
2845                cold_shadow_count = self
2846                    .active_catalog_mut()
2847                    .shadow_cold_row(&stmt.table, &idx_name, &key)
2848                    .unwrap_or(0);
2849            }
2850        }
2851
2852        // v7.12.1 — cache the session FTS config as an owned
2853        // String before the mutable table borrow below; the
2854        // ctx-builder then references it via `as_deref` so the
2855        // immutable read of `session_params` doesn't conflict
2856        // with the mut borrow chain.
2857        let ts_cfg: Option<String> = self
2858            .session_param("default_text_search_config")
2859            .map(String::from);
2860        let table = self
2861            .active_catalog_mut()
2862            .get_mut(&stmt.table)
2863            .ok_or_else(|| {
2864                EngineError::Storage(StorageError::TableNotFound {
2865                    name: stmt.table.clone(),
2866                })
2867            })?;
2868        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
2869        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()))
2870            .with_default_text_search_config(ts_cfg.as_deref());
2871        let mut positions: Vec<usize> = Vec::new();
2872        // v7.6.3 — collect every to-delete row's full Value tuple
2873        // alongside its position, so the FK enforcement pass can
2874        // run after the mut borrow drops.
2875        let mut to_delete_rows: Vec<Vec<Value>> = Vec::new();
2876        for (i, row) in table.rows().iter().enumerate() {
2877            if i.is_multiple_of(256) {
2878                cancel.check()?;
2879            }
2880            let keep = if let Some(w) = &stmt.where_ {
2881                let cond = eval::eval_expr(w, row, &ctx)?;
2882                !matches!(cond, Value::Bool(true))
2883            } else {
2884                false
2885            };
2886            if !keep {
2887                positions.push(i);
2888                to_delete_rows.push(row.values.clone());
2889            }
2890        }
2891        // v7.6.3 / v7.6.4 — Stage 2: FK enforcement on the immutable
2892        // catalog. Release the mut borrow and run reverse-scan
2893        // against every child table whose FK targets this table.
2894        // RESTRICT / NoAction raise an error; CASCADE returns a
2895        // cascade plan that stage 3 applies after the primary delete.
2896        // SET NULL / SET DEFAULT remain Unsupported until v7.6.5.
2897        let _ = table;
2898        // v7.12.5 — BEFORE DELETE row-level triggers. Each fires
2899        // with NEW=None / OLD=pre-delete row; RETURN OLD (or NEW)
2900        // = proceed, RETURN NULL = skip the row entirely. The
2901        // filter must run BEFORE the FK cascade plan so cascaded
2902        // child rows track the trigger's skip-decision on the
2903        // parent.
2904        // v7.12.7 — embedded SQL queue.
2905        let mut deferred_embedded: Vec<triggers::DeferredEmbeddedStmt> = Vec::new();
2906        if !before_delete_triggers.is_empty() {
2907            let mut filtered_positions: Vec<usize> = Vec::with_capacity(positions.len());
2908            let mut filtered_old_rows: Vec<Vec<Value>> = Vec::with_capacity(to_delete_rows.len());
2909            for (pos, old_vals) in positions.iter().zip(to_delete_rows.iter()) {
2910                let old_row = Row::new(old_vals.clone());
2911                let mut cancel_this = false;
2912                for fd in &before_delete_triggers {
2913                    let (outcome, deferred) = triggers::fire_row_trigger(
2914                        fd,
2915                        None,
2916                        Some(&old_row),
2917                        &stmt.table,
2918                        &schema_cols,
2919                        &[],
2920                        trigger_session_cfg.as_deref(),
2921                        false,
2922                    )
2923                    .map_err(|e| {
2924                        EngineError::Storage(StorageError::Corrupt(alloc::format!("{e}")))
2925                    })?;
2926                    deferred_embedded.extend(deferred);
2927                    if matches!(outcome, triggers::TriggerOutcome::Skip) {
2928                        cancel_this = true;
2929                        break;
2930                    }
2931                }
2932                if !cancel_this {
2933                    filtered_positions.push(*pos);
2934                    filtered_old_rows.push(old_vals.clone());
2935                }
2936            }
2937            positions = filtered_positions;
2938            to_delete_rows = filtered_old_rows;
2939        }
2940        let cascade_plan = plan_fk_parent_deletions(
2941            self.active_catalog(),
2942            &stmt.table,
2943            &positions,
2944            &to_delete_rows,
2945        )?;
2946        // Stage 3a — apply each FK child step (SET NULL / SET
2947        // DEFAULT / CASCADE delete) before deleting the parent.
2948        // The plan is already ordered: nulls/defaults first, then
2949        // cascade deletes (so a row mutated and later deleted
2950        // surfaces as deleted — though v7.6.5 doesn't produce
2951        // that overlap today).
2952        for step in &cascade_plan {
2953            apply_fk_child_step(self.active_catalog_mut(), step)?;
2954        }
2955        // Stage 3b — actually delete the original target rows.
2956        let table = self
2957            .active_catalog_mut()
2958            .get_mut(&stmt.table)
2959            .ok_or_else(|| {
2960                EngineError::Storage(StorageError::TableNotFound {
2961                    name: stmt.table.clone(),
2962                })
2963            })?;
2964        let affected = table.delete_rows(&positions) + cold_shadow_count;
2965        let _ = table;
2966        // v7.12.5 — AFTER DELETE row-level triggers fire post-write
2967        // with NEW=None / OLD=pre-delete row (each from the
2968        // already-snapshotted to_delete_rows). Return value is
2969        // ignored (matches PG AFTER semantics).
2970        if !after_delete_triggers.is_empty() {
2971            for old_vals in &to_delete_rows {
2972                let old_row = Row::new(old_vals.clone());
2973                for fd in &after_delete_triggers {
2974                    let (_outcome, deferred) = triggers::fire_row_trigger(
2975                        fd,
2976                        None,
2977                        Some(&old_row),
2978                        &stmt.table,
2979                        &schema_cols,
2980                        &[],
2981                        trigger_session_cfg.as_deref(),
2982                        true,
2983                    )
2984                    .map_err(|e| {
2985                        EngineError::Storage(StorageError::Corrupt(alloc::format!("{e}")))
2986                    })?;
2987                    deferred_embedded.extend(deferred);
2988                }
2989            }
2990        }
2991        // v7.12.7 — drain trigger-emitted embedded SQL for this DELETE.
2992        self.execute_deferred_trigger_stmts(deferred_embedded, cancel)?;
2993        // v6.2.1 — auto-analyze modified-row tracking for DELETE.
2994        if !self.in_transaction() && affected > 0 {
2995            self.statistics
2996                .record_modifications(&stmt.table, affected as u64);
2997        }
2998        // v7.9.4 — RETURNING projection over the soon-to-be-gone
2999        // rows. `to_delete_rows` was snapshotted in stage 1 before
3000        // mutation, so the projection sees the pre-delete state
3001        // (matches PG semantics: DELETE RETURNING returns the row
3002        // as it was just before removal).
3003        if let Some(items) = &stmt.returning {
3004            return self.build_returning_rows(&stmt.table, items, to_delete_rows);
3005        }
3006        Ok(QueryResult::CommandOk {
3007            affected,
3008            modified_catalog: !self.in_transaction(),
3009        })
3010    }
3011
3012    /// `SHOW TABLES` — one row per table in the active catalog.
3013    /// Column name is `name` so result-set consumers can downstream
3014    /// `SELECT name FROM ...` style logic if needed.
3015    /// v4.26: `EXPLAIN [ANALYZE] <select>`. Returns a single-column
3016    /// `QUERY PLAN` text table — first line names the top operator
3017    /// (Scan / Aggregate / Window / etc.), indented children list
3018    /// FROM joins, WHERE filters, ORDER BY / LIMIT, projection
3019    /// shape, and any active index hits. `ANALYZE` execs the inner
3020    /// SELECT and appends actual-row + elapsed-micros annotations.
3021    #[allow(clippy::format_push_string)]
3022    fn exec_explain(
3023        &self,
3024        e: &spg_sql::ast::ExplainStatement,
3025        cancel: CancelToken<'_>,
3026    ) -> Result<QueryResult, EngineError> {
3027        let mut lines = Vec::<String>::new();
3028        explain_select(&e.inner, self, 0, &mut lines);
3029        if e.suggest {
3030            // v6.8.3 — index advisor. Walks the SELECT's FROM
3031            // tables + WHERE column refs; for each (table, column)
3032            // pair that lacks an index, append a SUGGEST line with
3033            // a copy-pastable `CREATE INDEX` statement. This is a
3034            // pure-syntax heuristic — no cardinality estimation —
3035            // matching the v6.8.3 design intent of "tell the
3036            // operator where indexes are missing", not "give the
3037            // mathematically optimal index set".
3038            let suggestions = build_index_suggestions(&e.inner, self);
3039            for s in suggestions {
3040                lines.push(s);
3041            }
3042        } else if e.analyze {
3043            // v6.2.4 — EXPLAIN ANALYZE annotates each operator line
3044            // with `(rows=N)` where the row count is computable
3045            // without re-executing the full query:
3046            //   - Top-level operator (first non-indented line):
3047            //     rows = final result.len()
3048            //   - "From: <table> [full scan]" lines: rows =
3049            //     table.rows().len() (catalog read; no execution)
3050            //   - "From: <table> [index seek]": indeterminate —
3051            //     the index step would need re-execution; v6.2.5
3052            //     adds per-operator wall-clock + hot/cold rows
3053            //     instrumentation that makes this concrete.
3054            //   - Everything else: marked `(—)` so the surface
3055            //     stays well-defined without silently dropping
3056            //     stats. v6.2.5 fills in via inline executor
3057            //     instrumentation.
3058            // Total elapsed lands on a trailing `Total: …` line.
3059            let started = self.clock.map(|f| f());
3060            let exec = self.exec_select_cancel(&e.inner, cancel)?;
3061            let elapsed_micros = match (self.clock, started) {
3062                (Some(f), Some(s)) => Some(f().saturating_sub(s)),
3063                _ => None,
3064            };
3065            let row_count = if let QueryResult::Rows { rows, .. } = &exec {
3066                rows.len()
3067            } else {
3068                0
3069            };
3070            annotate_explain_lines(&mut lines, row_count, self);
3071            let mut total = alloc::format!("Total: rows={row_count}");
3072            if let Some(us) = elapsed_micros {
3073                total.push_str(&alloc::format!(" elapsed={us}us"));
3074            }
3075            lines.push(total);
3076        }
3077        let columns = alloc::vec![ColumnSchema::new("QUERY PLAN", DataType::Text, false)];
3078        let rows: Vec<Row> = lines
3079            .into_iter()
3080            .map(|l| Row::new(alloc::vec![Value::Text(l)]))
3081            .collect();
3082        Ok(QueryResult::Rows { columns, rows })
3083    }
3084
3085    fn exec_show_tables(&self) -> QueryResult {
3086        let columns = alloc::vec![ColumnSchema::new("name", DataType::Text, false)];
3087        let rows: Vec<Row> = self
3088            .active_catalog()
3089            .table_names()
3090            .into_iter()
3091            .map(|n| Row::new(alloc::vec![Value::Text(n)]))
3092            .collect();
3093        QueryResult::Rows { columns, rows }
3094    }
3095
3096    /// `SHOW COLUMNS FROM <table>` — one row per column with the
3097    /// declared name, SQL type rendering, and nullability flag.
3098    fn exec_show_columns(&self, table_name: &str) -> Result<QueryResult, EngineError> {
3099        let table =
3100            self.active_catalog()
3101                .get(table_name)
3102                .ok_or_else(|| StorageError::TableNotFound {
3103                    name: table_name.into(),
3104                })?;
3105        let columns = alloc::vec![
3106            ColumnSchema::new("name", DataType::Text, false),
3107            ColumnSchema::new("type", DataType::Text, false),
3108            ColumnSchema::new("nullable", DataType::Bool, false),
3109        ];
3110        let rows: Vec<Row> = table
3111            .schema()
3112            .columns
3113            .iter()
3114            .map(|c| {
3115                Row::new(alloc::vec![
3116                    Value::Text(c.name.clone()),
3117                    Value::Text(alloc::format!("{}", c.ty)),
3118                    Value::Bool(c.nullable),
3119                ])
3120            })
3121            .collect();
3122        Ok(QueryResult::Rows { columns, rows })
3123    }
3124
3125    fn exec_begin(&mut self) -> Result<QueryResult, EngineError> {
3126        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
3127        if self.tx_catalogs.contains_key(&tx_id) {
3128            return Err(EngineError::TransactionAlreadyOpen);
3129        }
3130        self.tx_catalogs.insert(
3131            tx_id,
3132            TxState {
3133                catalog: self.catalog.clone(),
3134                savepoints: Vec::new(),
3135            },
3136        );
3137        Ok(QueryResult::CommandOk {
3138            affected: 0,
3139            modified_catalog: false,
3140        })
3141    }
3142
3143    fn exec_commit(&mut self) -> Result<QueryResult, EngineError> {
3144        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
3145        let state = self
3146            .tx_catalogs
3147            .remove(&tx_id)
3148            .ok_or(EngineError::NoActiveTransaction)?;
3149        self.catalog = state.catalog;
3150        // All savepoints become permanent at COMMIT and the stack
3151        // resets for the next TX (`state.savepoints` is discarded with
3152        // `state`).
3153        Ok(QueryResult::CommandOk {
3154            affected: 0,
3155            modified_catalog: true,
3156        })
3157    }
3158
3159    fn exec_rollback(&mut self) -> Result<QueryResult, EngineError> {
3160        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
3161        if self.tx_catalogs.remove(&tx_id).is_none() {
3162            return Err(EngineError::NoActiveTransaction);
3163        }
3164        // savepoints discarded with the TxState
3165        Ok(QueryResult::CommandOk {
3166            affected: 0,
3167            modified_catalog: false,
3168        })
3169    }
3170
3171    fn exec_savepoint(&mut self, name: String) -> Result<QueryResult, EngineError> {
3172        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
3173        let state = self
3174            .tx_catalogs
3175            .get_mut(&tx_id)
3176            .ok_or(EngineError::NoActiveTransaction)?;
3177        // PG re-uses an existing savepoint name by dropping the older
3178        // entry and pushing a fresh one — match that behaviour so
3179        // application code can `SAVEPOINT sp; ...; SAVEPOINT sp` freely.
3180        state.savepoints.retain(|(n, _)| n != &name);
3181        let snapshot = state.catalog.clone();
3182        state.savepoints.push((name, snapshot));
3183        Ok(QueryResult::CommandOk {
3184            affected: 0,
3185            modified_catalog: false,
3186        })
3187    }
3188
3189    fn exec_rollback_to_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
3190        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
3191        let state = self
3192            .tx_catalogs
3193            .get_mut(&tx_id)
3194            .ok_or(EngineError::NoActiveTransaction)?;
3195        let pos = state
3196            .savepoints
3197            .iter()
3198            .rposition(|(n, _)| n == name)
3199            .ok_or_else(|| {
3200                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
3201            })?;
3202        // The savepoint stays on the stack (PG semantics): a later
3203        // `RELEASE` or further `ROLLBACK TO` is still allowed. Everything
3204        // after it is discarded.
3205        let snapshot = state.savepoints[pos].1.clone();
3206        state.savepoints.truncate(pos + 1);
3207        state.catalog = snapshot;
3208        Ok(QueryResult::CommandOk {
3209            affected: 0,
3210            modified_catalog: false,
3211        })
3212    }
3213
3214    fn exec_release_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
3215        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
3216        let state = self
3217            .tx_catalogs
3218            .get_mut(&tx_id)
3219            .ok_or(EngineError::NoActiveTransaction)?;
3220        let pos = state
3221            .savepoints
3222            .iter()
3223            .rposition(|(n, _)| n == name)
3224            .ok_or_else(|| {
3225                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
3226            })?;
3227        // RELEASE keeps the work since the savepoint, just discards the
3228        // bookmark plus everything nested under it.
3229        state.savepoints.truncate(pos);
3230        Ok(QueryResult::CommandOk {
3231            affected: 0,
3232            modified_catalog: false,
3233        })
3234    }
3235
3236    /// v6.0.4 — synchronous `ALTER INDEX <name> REBUILD [WITH
3237    /// (encoding = …)]`. Walks every table in the active catalog
3238    /// looking for an index matching `stmt.name`, then delegates the
3239    /// rebuild (including any encoding switch) to
3240    /// `Table::rebuild_nsw_index`. The "live" non-blocking
3241    /// optimisation is v6.0.4.1 / v6.1.x territory.
3242    /// v6.7.2 — `ALTER TABLE t SET hot_tier_bytes = X`. Dispatch
3243    /// arm. Currently the only setting is `hot_tier_bytes`; later
3244    /// v6.7.x can extend `AlterTableTarget` without touching this
3245    /// arm structure.
3246    fn exec_alter_table(
3247        &mut self,
3248        s: spg_sql::ast::AlterTableStatement,
3249    ) -> Result<QueryResult, EngineError> {
3250        match s.target {
3251            spg_sql::ast::AlterTableTarget::SetHotTierBytes(n) => {
3252                let table = self.active_catalog_mut().get_mut(&s.name).ok_or_else(|| {
3253                    EngineError::Storage(StorageError::TableNotFound {
3254                        name: s.name.clone(),
3255                    })
3256                })?;
3257                table.schema_mut().hot_tier_bytes = Some(n);
3258            }
3259            spg_sql::ast::AlterTableTarget::AddForeignKey(fk) => {
3260                // v7.6.8 — resolve FK against the live catalog first
3261                // (validates parent table, columns, indices). Then
3262                // verify every existing row in the child table
3263                // satisfies the new constraint. Then install it.
3264                let cols_snapshot = self
3265                    .active_catalog()
3266                    .get(&s.name)
3267                    .ok_or_else(|| {
3268                        EngineError::Storage(StorageError::TableNotFound {
3269                            name: s.name.clone(),
3270                        })
3271                    })?
3272                    .schema()
3273                    .columns
3274                    .clone();
3275                let storage_fk =
3276                    resolve_foreign_key(&s.name, &cols_snapshot, fk, self.active_catalog())?;
3277                // Verify existing rows. Treat them as a virtual
3278                // INSERT batch — reusing the v7.6.2 enforce helper.
3279                let existing_rows: Vec<Vec<Value>> = self
3280                    .active_catalog()
3281                    .get(&s.name)
3282                    .expect("checked above")
3283                    .rows()
3284                    .iter()
3285                    .map(|r| r.values.clone())
3286                    .collect();
3287                enforce_fk_inserts(
3288                    self.active_catalog(),
3289                    &s.name,
3290                    core::slice::from_ref(&storage_fk),
3291                    &existing_rows,
3292                )?;
3293                // Reject duplicate constraint name.
3294                let table = self
3295                    .active_catalog_mut()
3296                    .get_mut(&s.name)
3297                    .expect("checked above");
3298                if let Some(name) = &storage_fk.name
3299                    && table
3300                        .schema()
3301                        .foreign_keys
3302                        .iter()
3303                        .any(|f| f.name.as_ref() == Some(name))
3304                {
3305                    return Err(EngineError::Unsupported(alloc::format!(
3306                        "ALTER TABLE ADD CONSTRAINT: a constraint named {name:?} already exists"
3307                    )));
3308                }
3309                table.schema_mut().foreign_keys.push(storage_fk);
3310            }
3311            spg_sql::ast::AlterTableTarget::DropForeignKey(name) => {
3312                let table = self.active_catalog_mut().get_mut(&s.name).ok_or_else(|| {
3313                    EngineError::Storage(StorageError::TableNotFound {
3314                        name: s.name.clone(),
3315                    })
3316                })?;
3317                let fks = &mut table.schema_mut().foreign_keys;
3318                let before = fks.len();
3319                fks.retain(|f| f.name.as_ref() != Some(&name));
3320                if fks.len() == before {
3321                    return Err(EngineError::Unsupported(alloc::format!(
3322                        "ALTER TABLE DROP CONSTRAINT: no FK named {name:?} on {:?}",
3323                        s.name
3324                    )));
3325                }
3326            }
3327        }
3328        Ok(QueryResult::CommandOk {
3329            affected: 0,
3330            modified_catalog: !self.in_transaction(),
3331        })
3332    }
3333
3334    fn exec_alter_index(
3335        &mut self,
3336        stmt: spg_sql::ast::AlterIndexStatement,
3337    ) -> Result<QueryResult, EngineError> {
3338        // Translate the optional SQL-side encoding choice into the
3339        // storage-side enum; the same SqlVecEncoding -> VecEncoding
3340        // bridge `column_type_to_data_type` uses.
3341        let spg_sql::ast::AlterIndexStatement {
3342            name: idx_name,
3343            target,
3344        } = stmt;
3345        let spg_sql::ast::AlterIndexTarget::Rebuild { encoding } = target;
3346        let target = encoding.map(|e| match e {
3347            SqlVecEncoding::F32 => VecEncoding::F32,
3348            SqlVecEncoding::Sq8 => VecEncoding::Sq8,
3349            SqlVecEncoding::F16 => VecEncoding::F16,
3350        });
3351        // Linear scan: index names are globally unique within a
3352        // catalog (enforced by add_nsw_index_inner) so the first
3353        // match is the only one. Save the table name to avoid
3354        // borrowing while we then take a mut borrow.
3355        let table_name = {
3356            let cat = self.active_catalog();
3357            let mut found: Option<String> = None;
3358            for tname in cat.table_names() {
3359                if let Some(t) = cat.get(&tname)
3360                    && t.indices().iter().any(|i| i.name == idx_name)
3361                {
3362                    found = Some(tname);
3363                    break;
3364                }
3365            }
3366            found.ok_or_else(|| {
3367                EngineError::Storage(StorageError::IndexNotFound {
3368                    name: idx_name.clone(),
3369                })
3370            })?
3371        };
3372        let table = self
3373            .active_catalog_mut()
3374            .get_mut(&table_name)
3375            .expect("table found above");
3376        table.rebuild_nsw_index(&idx_name, target)?;
3377        // v6.3.1 — ALTER INDEX REBUILD potentially with new encoding
3378        // changes cost characteristics; evict any cached plans.
3379        self.plan_cache.evict_referencing(&table_name);
3380        Ok(QueryResult::CommandOk {
3381            affected: 0,
3382            modified_catalog: !self.in_transaction(),
3383        })
3384    }
3385
3386    fn exec_create_index(
3387        &mut self,
3388        stmt: CreateIndexStatement,
3389    ) -> Result<QueryResult, EngineError> {
3390        let table = self
3391            .active_catalog_mut()
3392            .get_mut(&stmt.table)
3393            .ok_or_else(|| {
3394                EngineError::Storage(StorageError::TableNotFound {
3395                    name: stmt.table.clone(),
3396                })
3397            })?;
3398        // `IF NOT EXISTS` reduces DuplicateIndex to a no-op CommandOk.
3399        if stmt.if_not_exists && table.indices().iter().any(|i| i.name == stmt.name) {
3400            return Ok(QueryResult::CommandOk {
3401                affected: 0,
3402                modified_catalog: false,
3403            });
3404        }
3405        // v7.9.14 — multi-column index parses through; engine
3406        // builds a single-column BTree on the leading column only.
3407        // The extras live on the AST so spg-server's dispatcher
3408        // can emit a PG-wire NoticeResponse / log line. Composite
3409        // BTree keys land in v7.10.
3410        let _ = &stmt.extra_columns; // intentional drop on engine side
3411        let table_name = stmt.table.clone();
3412        // v6.8.0 — resolve INCLUDE column names to positions. Done
3413        // before `add_index` so a typo error surfaces before any
3414        // catalog mutation lands.
3415        let included_positions: Vec<usize> = if stmt.included_columns.is_empty() {
3416            Vec::new()
3417        } else {
3418            let schema = table.schema();
3419            stmt.included_columns
3420                .iter()
3421                .map(|c| {
3422                    schema.column_position(c).ok_or_else(|| {
3423                        EngineError::Storage(StorageError::ColumnNotFound { column: c.clone() })
3424                    })
3425                })
3426                .collect::<Result<Vec<_>, _>>()?
3427        };
3428        match stmt.method {
3429            IndexMethod::BTree => table.add_index(stmt.name.clone(), &stmt.column)?,
3430            IndexMethod::Hnsw => {
3431                if !included_positions.is_empty() {
3432                    return Err(EngineError::Unsupported(
3433                        "INCLUDE columns are not supported on HNSW indexes".into(),
3434                    ));
3435                }
3436                table.add_nsw_index(stmt.name.clone(), &stmt.column, spg_storage::NSW_DEFAULT_M)?;
3437            }
3438            // v6.7.1 — BRIN. Pure metadata; no in-memory data.
3439            IndexMethod::Brin => {
3440                if !included_positions.is_empty() {
3441                    return Err(EngineError::Unsupported(
3442                        "INCLUDE columns are not supported on BRIN indexes".into(),
3443                    ));
3444                }
3445                table.add_brin_index(stmt.name.clone(), &stmt.column)?;
3446            }
3447            // v7.12.3 — GIN inverted index. Real posting-list-backed
3448            // GIN when the indexed column is `tsvector`; falls back
3449            // to a BTree on the leading column for any other column
3450            // type so v7.9.26b's `pg_dump` compatibility (GIN on
3451            // JSONB etc. silently loading as BTree) is preserved.
3452            // Operators see the real GIN only where it matters; old
3453            // schemas keep loading.
3454            IndexMethod::Gin => {
3455                if !included_positions.is_empty() {
3456                    return Err(EngineError::Unsupported(
3457                        "INCLUDE columns are not supported on GIN indexes".into(),
3458                    ));
3459                }
3460                let col_pos = table
3461                    .schema()
3462                    .column_position(&stmt.column)
3463                    .ok_or_else(|| {
3464                        EngineError::Storage(StorageError::ColumnNotFound {
3465                            column: stmt.column.clone(),
3466                        })
3467                    })?;
3468                if table.schema().columns[col_pos].ty == spg_storage::DataType::TsVector {
3469                    table
3470                        .add_gin_index(stmt.name.clone(), &stmt.column)
3471                        .map_err(EngineError::Storage)?;
3472                } else {
3473                    // v7.9.26b BTree fallback — the catalog still
3474                    // gets an index entry on the leading column so
3475                    // pg_dump scripts that name GIN on JSONB / etc.
3476                    // load clean; query-time gain stays opt-in for
3477                    // tsvector callers.
3478                    table.add_index(stmt.name.clone(), &stmt.column)?;
3479                }
3480            }
3481        }
3482        if !included_positions.is_empty()
3483            && let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name)
3484        {
3485            idx.included_columns = included_positions;
3486        }
3487        // v6.8.1 — persist partial-index predicate. Stored as the
3488        // expression's Display form so the catalog snapshot stays
3489        // pure (storage has no spg-sql dependency). The runtime
3490        // maintenance path treats partial indexes identically to
3491        // full indexes for v6.8.1 (over-maintenance is safe; the
3492        // planner-side "use partial when query WHERE implies the
3493        // predicate" pass is STABILITY carve-out).
3494        if let Some(pred_expr) = &stmt.partial_predicate {
3495            let canonical = pred_expr.to_string();
3496            if matches!(
3497                stmt.method,
3498                IndexMethod::Hnsw | IndexMethod::Brin | IndexMethod::Gin
3499            ) {
3500                return Err(EngineError::Unsupported(
3501                    "WHERE predicates are not supported on HNSW or BRIN indexes".into(),
3502                ));
3503            }
3504            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3505                idx.partial_predicate = Some(canonical);
3506            }
3507        }
3508        // v6.8.2 — persist expression index key. Same Display-form
3509        // storage; the runtime maintenance pass evaluates each
3510        // row's expression to derive the index key, but for v6.8.2
3511        // the engine falls through to the bare-column-reference
3512        // path and the expression is preserved for format-layer
3513        // round-trip + future planner work. Carved-out in
3514        // STABILITY § "Out of v6.8".
3515        if let Some(key_expr) = &stmt.expression {
3516            if matches!(
3517                stmt.method,
3518                IndexMethod::Hnsw | IndexMethod::Brin | IndexMethod::Gin
3519            ) {
3520                return Err(EngineError::Unsupported(
3521                    "Expression keys are not supported on HNSW or BRIN indexes".into(),
3522                ));
3523            }
3524            let canonical = key_expr.to_string();
3525            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3526                idx.expression = Some(canonical);
3527            }
3528        }
3529        // v7.9.29 — persist `is_unique` flag on the storage Index.
3530        // Combined with `partial_predicate`, INSERT enforcement
3531        // checks that no other row whose predicate evaluates true
3532        // shares the same indexed key. Parser already rejected
3533        // `UNIQUE` on HNSW / BRIN, so plain BTree here.
3534        // For multi-column UNIQUE INDEX the extras matter (the
3535        // full tuple is the uniqueness key), so resolve them to
3536        // column positions and persist on the index too.
3537        if stmt.is_unique {
3538            let mut extra_positions: alloc::vec::Vec<usize> = alloc::vec::Vec::new();
3539            for col_name in &stmt.extra_columns {
3540                let pos = table
3541                    .schema()
3542                    .columns
3543                    .iter()
3544                    .position(|c| c.name.eq_ignore_ascii_case(col_name))
3545                    .ok_or_else(|| {
3546                        EngineError::Unsupported(alloc::format!(
3547                            "UNIQUE INDEX {:?}: extra column {col_name:?} not in table {:?}",
3548                            stmt.name,
3549                            stmt.table
3550                        ))
3551                    })?;
3552                extra_positions.push(pos);
3553            }
3554            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3555                idx.is_unique = true;
3556                idx.extra_column_positions = extra_positions;
3557            }
3558            // At index-creation time, check the existing rows for
3559            // pre-existing duplicates that would have violated the
3560            // new constraint — otherwise CREATE UNIQUE INDEX would
3561            // silently leave duplicates in place.
3562            let snapshot_indices = table.indices().to_vec();
3563            let snapshot_rows: alloc::vec::Vec<spg_storage::Row> =
3564                table.rows().iter().cloned().collect();
3565            let snapshot_schema = table.schema().clone();
3566            let idx_ref = snapshot_indices
3567                .iter()
3568                .find(|i| i.name == stmt.name)
3569                .expect("just-added index");
3570            check_existing_unique_violation(idx_ref, &snapshot_schema, &snapshot_rows)?;
3571        }
3572        // v6.3.1 — adding an index can change the optimal plan for
3573        // any cached query that references this table.
3574        self.plan_cache.evict_referencing(&table_name);
3575        Ok(QueryResult::CommandOk {
3576            affected: 0,
3577            modified_catalog: !self.in_transaction(),
3578        })
3579    }
3580
3581    fn exec_create_table(
3582        &mut self,
3583        stmt: CreateTableStatement,
3584    ) -> Result<QueryResult, EngineError> {
3585        if stmt.if_not_exists && self.active_catalog().get(&stmt.name).is_some() {
3586            return Ok(QueryResult::CommandOk {
3587                affected: 0,
3588                modified_catalog: false,
3589            });
3590        }
3591        let table_name = stmt.name.clone();
3592        // v7.9.13 — pluck the names of any columns marked
3593        // `PRIMARY KEY` inline so the post-create-table pass can
3594        // build an implicit BTree index. mailrs F1.
3595        let inline_pk_columns: Vec<String> = stmt
3596            .columns
3597            .iter()
3598            .filter(|c| c.is_primary_key)
3599            .map(|c| c.name.clone())
3600            .collect();
3601        // v7.9.19 — table-level constraints: PRIMARY KEY (a, b, ...)
3602        // and UNIQUE (a, b, ...). Each builds a BTree index on the
3603        // leading column (the existing single-column storage tier)
3604        // and registers a UniquenessConstraint on the schema for
3605        // INSERT-time enforcement of the full tuple. mailrs G1/G6.
3606        let cols = stmt
3607            .columns
3608            .into_iter()
3609            .map(column_def_to_schema)
3610            .collect::<Result<Vec<_>, _>>()?;
3611        // Composite NOT-NULL implication for PRIMARY KEY columns.
3612        let mut cols = cols;
3613        for tc in &stmt.table_constraints {
3614            if let spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } = tc {
3615                for col_name in columns {
3616                    if let Some(col) = cols.iter_mut().find(|c| c.name == *col_name) {
3617                        col.nullable = false;
3618                    }
3619                }
3620            }
3621        }
3622        // v7.6.1 — resolve every FK in the statement against the
3623        // already-known catalog. Validates: parent table exists,
3624        // parent column names exist, arity matches, parent columns
3625        // have a PK / UNIQUE index. Self-referencing FKs (parent
3626        // table == this table) resolve against the column list we
3627        // just built — they don't need the catalog yet.
3628        let mut fks: Vec<spg_storage::ForeignKeyConstraint> =
3629            Vec::with_capacity(stmt.foreign_keys.len());
3630        for fk in stmt.foreign_keys {
3631            fks.push(resolve_foreign_key(
3632                &table_name,
3633                &cols,
3634                fk,
3635                self.active_catalog(),
3636            )?);
3637        }
3638        let mut schema = TableSchema::new(table_name.clone(), cols);
3639        schema.foreign_keys = fks;
3640        // v7.9.19 — translate AST table_constraints to storage
3641        // UniquenessConstraints (column name → position) so the
3642        // INSERT enforcement helper sees positions directly.
3643        let mut uc_storage: Vec<spg_storage::UniquenessConstraint> = Vec::new();
3644        for tc in &stmt.table_constraints {
3645            let (is_pk, names) = match tc {
3646                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3647                    (true, columns.clone())
3648                }
3649                spg_sql::ast::TableConstraint::Unique { columns, .. } => (false, columns.clone()),
3650            };
3651            let mut positions = Vec::with_capacity(names.len());
3652            for n in &names {
3653                let pos = schema
3654                    .columns
3655                    .iter()
3656                    .position(|c| c.name == *n)
3657                    .ok_or_else(|| {
3658                        EngineError::Unsupported(alloc::format!(
3659                            "table constraint references unknown column {n:?}"
3660                        ))
3661                    })?;
3662                positions.push(pos);
3663            }
3664            uc_storage.push(spg_storage::UniquenessConstraint {
3665                is_primary_key: is_pk,
3666                columns: positions,
3667            });
3668        }
3669        schema.uniqueness_constraints = uc_storage.clone();
3670        self.active_catalog_mut().create_table(schema)?;
3671        // v7.9.13 — implicit BTree per inline PK column +
3672        // v7.9.19 — implicit BTree on the leading column of every
3673        // table-level PRIMARY KEY / UNIQUE constraint.
3674        let table = self
3675            .active_catalog_mut()
3676            .get_mut(&table_name)
3677            .expect("just created");
3678        for (i, col_name) in inline_pk_columns.iter().enumerate() {
3679            let idx_name = if inline_pk_columns.len() == 1 {
3680                alloc::format!("{table_name}_pkey")
3681            } else {
3682                alloc::format!("{table_name}_pkey_{i}")
3683            };
3684            if let Err(e) = table.add_index(idx_name, col_name) {
3685                return Err(EngineError::Storage(e));
3686            }
3687        }
3688        for (i, tc) in stmt.table_constraints.iter().enumerate() {
3689            let (is_pk, names) = match tc {
3690                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => (true, columns),
3691                spg_sql::ast::TableConstraint::Unique { columns, .. } => (false, columns),
3692            };
3693            let leading = &names[0];
3694            // Skip if a same-column BTree already exists (e.g.
3695            // inline PK on the leading column).
3696            let already = table.indices().iter().any(|idx| {
3697                matches!(idx.kind, spg_storage::IndexKind::BTree(_))
3698                    && table.schema().columns[idx.column_position].name == *leading
3699            });
3700            if already {
3701                continue;
3702            }
3703            let suffix = if is_pk { "pkey" } else { "key" };
3704            let idx_name = if names.len() == 1 {
3705                alloc::format!("{table_name}_{leading}_{suffix}")
3706            } else {
3707                alloc::format!("{table_name}_{leading}_{suffix}_{i}")
3708            };
3709            if let Err(e) = table.add_index(idx_name, leading) {
3710                return Err(EngineError::Storage(e));
3711            }
3712        }
3713        Ok(QueryResult::CommandOk {
3714            affected: 0,
3715            modified_catalog: !self.in_transaction(),
3716        })
3717    }
3718
3719    fn exec_insert(&mut self, stmt: InsertStatement) -> Result<QueryResult, EngineError> {
3720        // v7.9.21 — snapshot the clock fn pointer before the mut
3721        // borrow on the catalog opens; runtime DEFAULT eval needs
3722        // it inside the row hot loop.
3723        let clock = self.clock;
3724        // v7.12.4 — snapshot row-level triggers + their referenced
3725        // functions before the mut borrow on the catalog opens.
3726        // Cloned out so the row hot loop can fire them without
3727        // re-borrowing the catalog (which would conflict with
3728        // table.insert's mutable borrow).
3729        let before_insert_triggers = self.snapshot_row_triggers(&stmt.table, "INSERT", "BEFORE");
3730        let after_insert_triggers = self.snapshot_row_triggers(&stmt.table, "INSERT", "AFTER");
3731        let trigger_session_cfg: Option<alloc::string::String> = self
3732            .session_params
3733            .get("default_text_search_config")
3734            .cloned();
3735        let table = self
3736            .active_catalog_mut()
3737            .get_mut(&stmt.table)
3738            .ok_or_else(|| {
3739                EngineError::Storage(StorageError::TableNotFound {
3740                    name: stmt.table.clone(),
3741                })
3742            })?;
3743        // v3.1.5: clone the columns vector only (not the whole
3744        // TableSchema — saves one String alloc for the table name).
3745        // We need an owned snapshot because we'll call `table.insert`
3746        // (mutable borrow on `table`) inside the row loop while
3747        // reading schema fields.
3748        let column_meta: Vec<ColumnSchema> = table.schema().columns.clone();
3749        let schema_cols_len = column_meta.len();
3750        // Build a permutation `tuple_pos[c] = Some(j)` meaning schema
3751        // column `c` is filled from the `j`-th tuple slot; `None` means
3752        // "fill with NULL". Validated once and reused for every row.
3753        let tuple_pos: Option<Vec<Option<usize>>> = match &stmt.columns {
3754            None => None, // 1-1 mapping, fast path
3755            Some(cols) => {
3756                let mut map = alloc::vec![None; schema_cols_len];
3757                for (j, name) in cols.iter().enumerate() {
3758                    let idx = column_meta
3759                        .iter()
3760                        .position(|c| c.name == *name)
3761                        .ok_or_else(|| {
3762                            EngineError::Eval(EvalError::ColumnNotFound { name: name.clone() })
3763                        })?;
3764                    if map[idx].is_some() {
3765                        return Err(EngineError::Storage(StorageError::ArityMismatch {
3766                            expected: schema_cols_len,
3767                            actual: cols.len(),
3768                        }));
3769                    }
3770                    map[idx] = Some(j);
3771                }
3772                // Omitted columns must either be nullable, carry a
3773                // DEFAULT, or be AUTO_INCREMENT. Catch NOT NULL
3774                // omissions up front so the WAL stays clean.
3775                for (i, col) in column_meta.iter().enumerate() {
3776                    if map[i].is_none()
3777                        && !col.nullable
3778                        && col.default.is_none()
3779                        && col.runtime_default.is_none()
3780                        && !col.auto_increment
3781                    {
3782                        return Err(EngineError::Storage(StorageError::NullInNotNull {
3783                            column: col.name.clone(),
3784                        }));
3785                    }
3786                }
3787                Some(map)
3788            }
3789        };
3790        let expected_tuple_len = stmt.columns.as_ref().map_or(schema_cols_len, Vec::len);
3791        // v7.6.2 — snapshot this table's FK list before the
3792        // mutable-borrow window so we can run parent lookups
3793        // against the immutable catalog after parsing. Empty vec is
3794        // the no-FK fast path; clone cost is O(fks * arity) which
3795        // is < 100 ns for typical schemas.
3796        let fks = table.schema().foreign_keys.clone();
3797        let mut affected = 0usize;
3798        // Stage 1 — parse + AUTO_INC + coerce all rows under the
3799        // single mutable borrow.
3800        let mut all_values: Vec<Vec<Value>> = Vec::with_capacity(stmt.rows.len());
3801        for tuple in stmt.rows {
3802            if tuple.len() != expected_tuple_len {
3803                return Err(EngineError::Storage(StorageError::ArityMismatch {
3804                    expected: expected_tuple_len,
3805                    actual: tuple.len(),
3806                }));
3807            }
3808            // Fast path: no column-list permutation → tuple slot j
3809            // maps to schema column j. We can zip schema with tuple
3810            // and skip the `raw_tuple` staging allocation entirely.
3811            let values: Vec<Value> = if let Some(map) = &tuple_pos {
3812                // Permuted path: still need raw_tuple to index by `map[i]`.
3813                let raw_tuple: Vec<Value> = tuple
3814                    .into_iter()
3815                    .map(literal_expr_to_value)
3816                    .collect::<Result<_, _>>()?;
3817                let mut out = Vec::with_capacity(schema_cols_len);
3818                for (i, col) in column_meta.iter().enumerate() {
3819                    let mut raw = match map[i] {
3820                        Some(j) => raw_tuple[j].clone(),
3821                        None => resolve_column_default_free(col, clock)?,
3822                    };
3823                    if col.auto_increment && raw.is_null() {
3824                        let next = table.next_auto_value(i).ok_or_else(|| {
3825                            EngineError::Unsupported(alloc::format!(
3826                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3827                                col.name
3828                            ))
3829                        })?;
3830                        raw = Value::BigInt(next);
3831                    }
3832                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3833                }
3834                out
3835            } else {
3836                // 1-1 mapping fast path: single Vec alloc, no raw_tuple.
3837                let mut out = Vec::with_capacity(schema_cols_len);
3838                for (i, (col, expr)) in column_meta.iter().zip(tuple).enumerate() {
3839                    let mut raw = literal_expr_to_value(expr)?;
3840                    if col.auto_increment && raw.is_null() {
3841                        let next = table.next_auto_value(i).ok_or_else(|| {
3842                            EngineError::Unsupported(alloc::format!(
3843                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3844                                col.name
3845                            ))
3846                        })?;
3847                        raw = Value::BigInt(next);
3848                    }
3849                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3850                }
3851                out
3852            };
3853            all_values.push(values);
3854        }
3855        // Stage 2 — FK enforcement on the immutable catalog.
3856        // Non-lexical lifetimes release the mutable borrow on
3857        // `table` here since stage 1 was the last use. The
3858        // parent-table lookup runs before any row is committed.
3859        let uniqueness = table.schema().uniqueness_constraints.clone();
3860        let _ = table;
3861        if !fks.is_empty() {
3862            enforce_fk_inserts(self.active_catalog(), &stmt.table, &fks, &all_values)?;
3863        }
3864        // v7.9.19 — composite UNIQUE / PRIMARY KEY enforcement.
3865        enforce_uniqueness_inserts(self.active_catalog(), &stmt.table, &uniqueness, &all_values)?;
3866        // v7.9.29 — CREATE UNIQUE INDEX [WHERE pred] enforcement.
3867        // Independent of table-level UniquenessConstraint (which
3868        // can't carry a predicate). Walks the table's indexes;
3869        // for each `is_unique` index, only rows whose
3870        // partial_predicate evaluates truthy are checked for
3871        // collision. mailrs K1.
3872        enforce_unique_index_inserts(self.active_catalog(), &stmt.table, &all_values)?;
3873        // v7.9.8 / v7.9.9 — ON CONFLICT handling.
3874        //   - `DO NOTHING` filters `all_values` to non-conflicting
3875        //     rows + drops within-batch duplicates.
3876        //   - `DO UPDATE SET …` ALSO filters, but for each
3877        //     conflicting row it queues an UPDATE on the existing
3878        //     row using the incoming row's values as `EXCLUDED.*`.
3879        let mut pending_updates: Vec<(usize, Vec<Value>)> = Vec::new();
3880        let mut skipped_count = 0usize;
3881        if let Some(clause) = &stmt.on_conflict {
3882            let conflict_cols = resolve_on_conflict_columns(
3883                self.active_catalog(),
3884                &stmt.table,
3885                clause.target_columns.as_slice(),
3886            )?;
3887            let mut kept: Vec<Vec<Value>> = Vec::with_capacity(all_values.len());
3888            let mut seen_keys: Vec<Vec<Value>> = Vec::new();
3889            for values in all_values {
3890                let key_tuple: Vec<&Value> = conflict_cols.iter().map(|&c| &values[c]).collect();
3891                // SQL spec: NULL in any conflict column means "no
3892                // conflict possible" (NULL ≠ NULL for uniqueness).
3893                let has_null_key = key_tuple.iter().any(|v| matches!(v, Value::Null));
3894                let collides_with_table = !has_null_key
3895                    && on_conflict_keys_exist(
3896                        self.active_catalog(),
3897                        &stmt.table,
3898                        &conflict_cols,
3899                        &key_tuple,
3900                    );
3901                let key_tuple_owned: Vec<Value> = key_tuple.iter().map(|v| (*v).clone()).collect();
3902                let collides_with_batch =
3903                    !has_null_key && seen_keys.iter().any(|k| k == &key_tuple_owned);
3904                let collides = collides_with_table || collides_with_batch;
3905                match (&clause.action, collides) {
3906                    (_, false) => {
3907                        seen_keys.push(key_tuple_owned);
3908                        kept.push(values);
3909                    }
3910                    (spg_sql::ast::OnConflictAction::Nothing, true) => {
3911                        skipped_count += 1;
3912                    }
3913                    (
3914                        spg_sql::ast::OnConflictAction::Update {
3915                            assignments,
3916                            where_,
3917                        },
3918                        true,
3919                    ) => {
3920                        if !collides_with_table {
3921                            skipped_count += 1;
3922                            continue;
3923                        }
3924                        let target_pos = lookup_row_position_by_keys(
3925                            self.active_catalog(),
3926                            &stmt.table,
3927                            &conflict_cols,
3928                            &key_tuple,
3929                        )
3930                        .ok_or_else(|| {
3931                            EngineError::Unsupported(
3932                                "ON CONFLICT DO UPDATE: conflict detected but row \
3933                                 position could not be resolved (cold-tier row?)"
3934                                    .into(),
3935                            )
3936                        })?;
3937                        let updated = apply_on_conflict_assignments(
3938                            self.active_catalog(),
3939                            &stmt.table,
3940                            target_pos,
3941                            &values,
3942                            assignments,
3943                            where_.as_ref(),
3944                        )?;
3945                        if let Some(new_row) = updated {
3946                            pending_updates.push((target_pos, new_row));
3947                        } else {
3948                            skipped_count += 1;
3949                        }
3950                    }
3951                }
3952            }
3953            all_values = kept;
3954        }
3955        // Stage 3 — insert all rows under a fresh mutable borrow.
3956        let table = self
3957            .active_catalog_mut()
3958            .get_mut(&stmt.table)
3959            .ok_or_else(|| {
3960                EngineError::Storage(StorageError::TableNotFound {
3961                    name: stmt.table.clone(),
3962                })
3963            })?;
3964        // v7.9.4 — keep RETURNING projection rows separate per
3965        // INSERT and per UPDATE branch so DO UPDATE pushes the new
3966        // post-update state, not the incoming-only values.
3967        let mut returning_rows: Vec<Vec<Value>> = Vec::new();
3968        // v7.12.7 — collect embedded SQL emitted by any trigger
3969        // fire across the row loop; engine drains the queue after
3970        // the table mut borrow drops.
3971        let mut deferred_embedded: Vec<triggers::DeferredEmbeddedStmt> = Vec::new();
3972        'rowloop: for values in all_values {
3973            let mut row = Row::new(values);
3974            // v7.12.4 — BEFORE INSERT row-level triggers. Each
3975            // trigger may rewrite NEW cells (e.g. populate
3976            // `search_vector := to_tsvector(...)`) and may return
3977            // NULL to skip the row entirely.
3978            for fd in &before_insert_triggers {
3979                let (outcome, deferred) = triggers::fire_row_trigger(
3980                    fd,
3981                    Some(row.clone()),
3982                    None,
3983                    &stmt.table,
3984                    &column_meta,
3985                    &[],
3986                    trigger_session_cfg.as_deref(),
3987                    false,
3988                )
3989                .map_err(|e| EngineError::Storage(StorageError::Corrupt(alloc::format!("{e}"))))?;
3990                deferred_embedded.extend(deferred);
3991                match outcome {
3992                    triggers::TriggerOutcome::Row(r) => row = r,
3993                    triggers::TriggerOutcome::Skip => continue 'rowloop,
3994                }
3995            }
3996            if stmt.returning.is_some() {
3997                returning_rows.push(row.values.clone());
3998            }
3999            // v7.12.4 — clone for the AFTER trigger view; insert
4000            // moves the row into the table.
4001            let inserted = row.clone();
4002            table.insert(row)?;
4003            affected += 1;
4004            // v7.12.4 — AFTER INSERT row-level triggers fire post-
4005            // write. Return value is ignored (PG semantics); we
4006            // surface any error from the body up to the caller.
4007            for fd in &after_insert_triggers {
4008                let (_outcome, deferred) = triggers::fire_row_trigger(
4009                    fd,
4010                    Some(inserted.clone()),
4011                    None,
4012                    &stmt.table,
4013                    &column_meta,
4014                    &[],
4015                    trigger_session_cfg.as_deref(),
4016                    true,
4017                )
4018                .map_err(|e| EngineError::Storage(StorageError::Corrupt(alloc::format!("{e}"))))?;
4019                deferred_embedded.extend(deferred);
4020            }
4021        }
4022        // v7.9.9 — apply ON CONFLICT DO UPDATE rewrites collected
4023        // in the conflict-resolution pass. update_row handles
4024        // index maintenance + body re-encoding.
4025        for (pos, new_row) in pending_updates {
4026            if stmt.returning.is_some() {
4027                returning_rows.push(new_row.clone());
4028            }
4029            table.update_row(pos, new_row)?;
4030            affected += 1;
4031        }
4032        let _ = skipped_count;
4033        // v7.12.7 — drop the table mut borrow and drain any
4034        // trigger-emitted embedded SQL queued during this INSERT.
4035        // The borrow has to release first because each deferred
4036        // stmt may UPDATE / INSERT / DELETE the same (or another)
4037        // table — including, in principle, this one.
4038        let _ = table;
4039        self.execute_deferred_trigger_stmts(deferred_embedded, CancelToken::none())?;
4040        // v7.9.4/v7.9.9 — RETURNING streams the rows that ended
4041        // up in the table after this statement (insert or
4042        // post-update on conflict).
4043        if let Some(items) = &stmt.returning {
4044            return self.build_returning_rows(&stmt.table, items, returning_rows);
4045        }
4046        // v6.2.1 — auto-analyze: track per-table modified-row
4047        // counter so the background sweep can decide when to
4048        // re-ANALYZE. Cheap path on the autocommit-wrap hot loop
4049        // — one BTreeMap entry update per INSERT batch.
4050        if !self.in_transaction() && affected > 0 {
4051            self.statistics
4052                .record_modifications(&stmt.table, affected as u64);
4053        }
4054        Ok(QueryResult::CommandOk {
4055            affected,
4056            modified_catalog: !self.in_transaction(),
4057        })
4058    }
4059
4060    /// v4.5: SELECT with cooperative cancellation. The token is
4061    /// honoured between UNION peers and inside the bare-SELECT row
4062    /// loop; HNSW kNN graph walks and the aggregate executor don't
4063    /// honour it yet (deferred — those paths bound their work
4064    /// internally by `LIMIT k` and `GROUP BY` cardinality).
4065    /// v6.10.2 — cold-tier time-travel scan. Resolves the segment
4066    /// by id, decodes each row body against the table's current
4067    /// schema, applies the SELECT's projection + optional WHERE +
4068    /// optional LIMIT, returns a `Rows` result. JOINs / aggregates
4069    /// / ORDER BY are unsupported on this path (STABILITY carve-
4070    /// out); operators wanting them should restore the segment
4071    /// into a regular table first.
4072    fn exec_select_as_of_segment(
4073        &self,
4074        stmt: &SelectStatement,
4075        from: &spg_sql::ast::FromClause,
4076        segment_id: u32,
4077    ) -> Result<QueryResult, EngineError> {
4078        // v6.10.2 scope: no joins, no aggregates, no ORDER BY,
4079        // no GROUP BY / HAVING / UNION / OFFSET / DISTINCT.
4080        if !from.joins.is_empty()
4081            || stmt.group_by.is_some()
4082            || stmt.having.is_some()
4083            || !stmt.unions.is_empty()
4084            || !stmt.order_by.is_empty()
4085            || stmt.offset.is_some()
4086            || stmt.distinct
4087            || aggregate::uses_aggregate(stmt)
4088        {
4089            return Err(EngineError::Unsupported(
4090                "AS OF SEGMENT supports SELECT projection + WHERE + LIMIT only \
4091                 (joins / aggregates / ORDER BY are STABILITY § \"Out of v6.10\")"
4092                    .into(),
4093            ));
4094        }
4095        let table = self
4096            .active_catalog()
4097            .get(&from.primary.name)
4098            .ok_or_else(|| StorageError::TableNotFound {
4099                name: from.primary.name.clone(),
4100            })?;
4101        let schema = table.schema().clone();
4102        let schema_cols = &schema.columns;
4103        let alias = from
4104            .primary
4105            .alias
4106            .as_deref()
4107            .unwrap_or(from.primary.name.as_str());
4108        let ctx = EvalContext::new(schema_cols, Some(alias));
4109        let seg = self
4110            .active_catalog()
4111            .cold_segment(segment_id)
4112            .ok_or_else(|| {
4113                EngineError::Unsupported(alloc::format!(
4114                    "AS OF SEGMENT: cold segment {segment_id} not registered"
4115                ))
4116            })?;
4117        let mut out_rows: Vec<Row> = Vec::new();
4118        let mut limit_remaining: Option<usize> =
4119            stmt.limit_literal().and_then(|n| usize::try_from(n).ok());
4120        for (_key, body) in seg.scan() {
4121            let (row, _consumed) =
4122                spg_storage::decode_row_body_dense(&body, &schema).map_err(EngineError::Storage)?;
4123            if let Some(where_expr) = &stmt.where_ {
4124                let cond = self.eval_expr_simple(where_expr, &row, &ctx)?;
4125                if !matches!(cond, Value::Bool(true)) {
4126                    continue;
4127                }
4128            }
4129            // Projection.
4130            let projected = self.project_row_simple(&row, &stmt.items, schema_cols, alias)?;
4131            out_rows.push(projected);
4132            if let Some(rem) = limit_remaining.as_mut() {
4133                if *rem == 0 {
4134                    out_rows.pop();
4135                    break;
4136                }
4137                *rem -= 1;
4138            }
4139        }
4140        // Output column schema: derive from SELECT items.
4141        let columns = self.derive_output_columns(&stmt.items, schema_cols, alias);
4142        Ok(QueryResult::Rows {
4143            columns,
4144            rows: out_rows,
4145        })
4146    }
4147
4148    /// v6.10.2 — simple-path WHERE eval that doesn't go through
4149    /// the correlated-subquery / Memoize machinery. AS OF SEGMENT
4150    /// scan paths predicate against a snapshot frozen segment, no
4151    /// cross-row state.
4152    fn eval_expr_simple(
4153        &self,
4154        expr: &Expr,
4155        row: &Row,
4156        ctx: &EvalContext,
4157    ) -> Result<Value, EngineError> {
4158        let cancel = CancelToken::none();
4159        self.eval_expr_with_correlated(expr, row, ctx, cancel, None)
4160    }
4161
4162    /// v7.9.4 — INSERT / UPDATE / DELETE RETURNING projector.
4163    /// Given the table name, the user-supplied projection items,
4164    /// and the mutated rows (post-insert / post-update values, or
4165    /// pre-delete snapshot), build a `QueryResult::Rows` whose
4166    /// schema describes the projected columns. Mailrs migration
4167    /// blocker #1.
4168    fn build_returning_rows(
4169        &self,
4170        table_name: &str,
4171        items: &[SelectItem],
4172        mutated_rows: Vec<Vec<Value>>,
4173    ) -> Result<QueryResult, EngineError> {
4174        let table = self.active_catalog().get(table_name).ok_or_else(|| {
4175            EngineError::Storage(StorageError::TableNotFound {
4176                name: table_name.into(),
4177            })
4178        })?;
4179        let schema_cols = table.schema().columns.clone();
4180        let columns = self.derive_output_columns(items, &schema_cols, table_name);
4181        let mut out_rows: Vec<Row> = Vec::with_capacity(mutated_rows.len());
4182        for values in mutated_rows {
4183            let row = Row::new(values);
4184            let projected = self.project_row_simple(&row, items, &schema_cols, table_name)?;
4185            out_rows.push(projected);
4186        }
4187        Ok(QueryResult::Rows {
4188            columns,
4189            rows: out_rows,
4190        })
4191    }
4192
4193    /// v6.10.2 — projection for AS OF SEGMENT. Resolves
4194    /// `SelectItem::Wildcard` to all schema columns and
4195    /// `SelectItem::Expr` via the regular eval path.
4196    fn project_row_simple(
4197        &self,
4198        row: &Row,
4199        items: &[SelectItem],
4200        schema_cols: &[ColumnSchema],
4201        alias: &str,
4202    ) -> Result<Row, EngineError> {
4203        let ctx = EvalContext::new(schema_cols, Some(alias));
4204        let cancel = CancelToken::none();
4205        let mut out_vals = Vec::new();
4206        for item in items {
4207            match item {
4208                SelectItem::Wildcard => {
4209                    out_vals.extend(row.values.iter().cloned());
4210                }
4211                SelectItem::Expr { expr, .. } => {
4212                    let v = self.eval_expr_with_correlated(expr, row, &ctx, cancel, None)?;
4213                    out_vals.push(v);
4214                }
4215            }
4216        }
4217        Ok(Row::new(out_vals))
4218    }
4219
4220    /// v6.10.2 — derive the output `ColumnSchema` list for an
4221    /// AS OF SEGMENT projection. Wildcards take the full schema;
4222    /// expressions take the alias if present or a synthetic
4223    /// `?column?` (PG convention) otherwise.
4224    fn derive_output_columns(
4225        &self,
4226        items: &[SelectItem],
4227        schema_cols: &[ColumnSchema],
4228        _alias: &str,
4229    ) -> Vec<ColumnSchema> {
4230        let mut out = Vec::new();
4231        for item in items {
4232            match item {
4233                SelectItem::Wildcard => {
4234                    out.extend(schema_cols.iter().cloned());
4235                }
4236                SelectItem::Expr { alias, .. } => {
4237                    let name = alias.clone().unwrap_or_else(|| "?column?".to_string());
4238                    // Default to Text; the caller's row values
4239                    // carry the actual type. v6.10.2 scope.
4240                    out.push(ColumnSchema::new(name, DataType::Text, true));
4241                }
4242            }
4243        }
4244        out
4245    }
4246
4247    fn exec_select_cancel(
4248        &self,
4249        stmt: &SelectStatement,
4250        cancel: CancelToken<'_>,
4251    ) -> Result<QueryResult, EngineError> {
4252        cancel.check()?;
4253        // v6.10.2 — cold-tier time-travel short-circuit. When the
4254        // primary TableRef carries `AS OF SEGMENT '<id>'`, run a
4255        // dedicated cold-segment scan instead of the regular
4256        // hot+index path. The scope is intentionally narrow for
4257        // v6.10.2 — bare `SELECT * FROM <t> AS OF SEGMENT 'id'`,
4258        // optionally with a single-column-equality WHERE. JOINs /
4259        // aggregates / ORDER BY / subqueries on top of a time-
4260        // travelled scan are STABILITY § "Out of v6.10".
4261        if let Some(from) = &stmt.from
4262            && let Some(seg_id) = from.primary.as_of_segment
4263        {
4264            return self.exec_select_as_of_segment(stmt, from, seg_id);
4265        }
4266        // v6.2.0 / v6.5.0 — virtual-table short-circuits. Detected
4267        // pre-CTE because they don't read from the catalog and
4268        // shouldn't participate in regular FROM resolution.
4269        if let Some(from) = &stmt.from
4270            && from.joins.is_empty()
4271            && stmt.where_.is_none()
4272            && stmt.group_by.is_none()
4273            && stmt.having.is_none()
4274            && stmt.unions.is_empty()
4275            && stmt.order_by.is_empty()
4276            && stmt.limit.is_none()
4277            && stmt.offset.is_none()
4278            && !stmt.distinct
4279            && stmt.items.iter().all(|i| matches!(i, SelectItem::Wildcard))
4280        {
4281            let lower = from.primary.name.to_ascii_lowercase();
4282            match lower.as_str() {
4283                "spg_statistic" => return Ok(self.exec_spg_statistic()),
4284                // v6.5.0 — observability v2 virtual tables.
4285                "spg_stat_replication" => return Ok(self.exec_spg_stat_replication()),
4286                "spg_stat_segment" => return Ok(self.exec_spg_stat_segment()),
4287                "spg_stat_query" => return Ok(self.exec_spg_stat_query()),
4288                "spg_stat_activity" => return Ok(self.exec_spg_stat_activity()),
4289                "spg_audit_chain" => return Ok(self.exec_spg_audit_chain()),
4290                "spg_audit_verify" => return Ok(self.exec_spg_audit_verify()),
4291                "spg_table_ddl" => return Ok(self.exec_spg_table_ddl()),
4292                "spg_role_ddl" => return Ok(self.exec_spg_role_ddl()),
4293                "spg_database_ddl" => return Ok(self.exec_spg_database_ddl()),
4294                _ => {}
4295            }
4296        }
4297        // v4.11: CTEs materialise into a temporary enriched catalog
4298        // *before* anything else — the body SELECT can then refer
4299        // to CTE names via the regular FROM-clause resolution.
4300        // Uncorrelated only: each CTE body runs once against the
4301        // current catalog, not against later CTEs' results (left-
4302        // to-right materialisation would relax this, but we keep
4303        // it simple for v4.11 MVP).
4304        if !stmt.ctes.is_empty() {
4305            return self.exec_with_ctes(stmt, cancel);
4306        }
4307        // v4.10: subqueries (uncorrelated) are resolved here, before
4308        // the executor sees the row loop. We clone the statement so
4309        // we can mutate without disturbing the caller's AST — most
4310        // queries pass through with no subquery nodes and the clone
4311        // is cheap; with subqueries the materialisation cost
4312        // dominates anyway.
4313        let mut stmt_owned;
4314        let stmt_ref: &SelectStatement = if expr_tree_has_subquery(stmt) {
4315            stmt_owned = stmt.clone();
4316            self.resolve_select_subqueries(&mut stmt_owned, cancel)?;
4317            &stmt_owned
4318        } else {
4319            stmt
4320        };
4321        if stmt_ref.unions.is_empty() {
4322            return self.exec_bare_select_cancel(stmt_ref, cancel);
4323        }
4324        // UNION path: clone-strip the head into a bare block (its own
4325        // DISTINCT and any inner ORDER BY are dropped by parser rule —
4326        // the wrapper SelectStatement carries them), execute, then chain
4327        // peers with left-associative dedup semantics.
4328        let mut head = stmt_ref.clone();
4329        head.unions = Vec::new();
4330        head.order_by = Vec::new();
4331        head.limit = None;
4332        let QueryResult::Rows { columns, mut rows } =
4333            self.exec_bare_select_cancel(&head, cancel)?
4334        else {
4335            unreachable!("bare SELECT cannot return CommandOk")
4336        };
4337        for (kind, peer) in &stmt_ref.unions {
4338            let QueryResult::Rows {
4339                columns: peer_cols,
4340                rows: peer_rows,
4341            } = self.exec_bare_select_cancel(peer, cancel)?
4342            else {
4343                unreachable!("bare SELECT cannot return CommandOk")
4344            };
4345            if peer_cols.len() != columns.len() {
4346                return Err(EngineError::Unsupported(alloc::format!(
4347                    "UNION arity mismatch: head has {} columns, peer has {}",
4348                    columns.len(),
4349                    peer_cols.len()
4350                )));
4351            }
4352            rows.extend(peer_rows);
4353            if matches!(kind, UnionKind::Distinct) {
4354                rows = dedup_rows(rows);
4355            }
4356        }
4357        // ORDER BY at the top of a UNION applies to the combined result.
4358        // Eval against the projected schema (NOT the source table).
4359        if !stmt.order_by.is_empty() {
4360            let synth_ctx = EvalContext::new(&columns, None);
4361            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4362            let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(rows.len());
4363            for r in rows {
4364                let keys = build_order_keys(&stmt.order_by, &r, &synth_ctx)?;
4365                tagged.push((keys, r));
4366            }
4367            sort_by_keys(&mut tagged, &descs);
4368            rows = tagged.into_iter().map(|(_, r)| r).collect();
4369        }
4370        apply_offset_and_limit(&mut rows, stmt.offset_literal(), stmt.limit_literal());
4371        Ok(QueryResult::Rows { columns, rows })
4372    }
4373
4374    #[allow(clippy::too_many_lines)]
4375    #[allow(clippy::too_many_lines)] // huge match — splitting fragments the planner
4376    /// v7.11.7 — execute `SELECT … FROM unnest(expr) [AS] alias …`.
4377    /// Synthesises a single-column virtual table whose column type
4378    /// is TEXT and whose rows are the array elements. Routes
4379    /// through the regular projection / WHERE / ORDER BY / LIMIT
4380    /// machinery so set-returning UNNEST composes naturally with
4381    /// the rest of the SELECT surface.
4382    fn exec_select_unnest(
4383        &self,
4384        stmt: &SelectStatement,
4385        primary: &TableRef,
4386        cancel: CancelToken<'_>,
4387    ) -> Result<QueryResult, EngineError> {
4388        let expr = primary
4389            .unnest_expr
4390            .as_deref()
4391            .expect("caller guards unnest_expr.is_some()");
4392        // Evaluate the array expression once. Empty schema / empty
4393        // row — uncorrelated UNNEST cannot reference outer columns.
4394        let empty_schema: alloc::vec::Vec<ColumnSchema> = alloc::vec::Vec::new();
4395        let ctx = EvalContext::new(&empty_schema, None);
4396        let dummy_row = Row::new(alloc::vec::Vec::new());
4397        // v7.11.13 — unnest dispatches per array element type so
4398        // INT[] / BIGINT[] surface their PG types in projection.
4399        let (elem_dtype, rows): (DataType, alloc::vec::Vec<Row>) =
4400            match eval::eval_expr(expr, &dummy_row, &ctx).map_err(EngineError::Eval)? {
4401                Value::Null => (DataType::Text, alloc::vec::Vec::new()),
4402                Value::TextArray(items) => {
4403                    let rows = items
4404                        .into_iter()
4405                        .map(|item| {
4406                            Row::new(alloc::vec![match item {
4407                                Some(s) => Value::Text(s),
4408                                None => Value::Null,
4409                            }])
4410                        })
4411                        .collect();
4412                    (DataType::Text, rows)
4413                }
4414                Value::IntArray(items) => {
4415                    let rows = items
4416                        .into_iter()
4417                        .map(|item| {
4418                            Row::new(alloc::vec![match item {
4419                                Some(n) => Value::Int(n),
4420                                None => Value::Null,
4421                            }])
4422                        })
4423                        .collect();
4424                    (DataType::Int, rows)
4425                }
4426                Value::BigIntArray(items) => {
4427                    let rows = items
4428                        .into_iter()
4429                        .map(|item| {
4430                            Row::new(alloc::vec![match item {
4431                                Some(n) => Value::BigInt(n),
4432                                None => Value::Null,
4433                            }])
4434                        })
4435                        .collect();
4436                    (DataType::BigInt, rows)
4437                }
4438                other => {
4439                    return Err(EngineError::Unsupported(alloc::format!(
4440                        "unnest() expects an array argument, got {:?}",
4441                        other.data_type()
4442                    )));
4443                }
4444            };
4445        let alias = primary
4446            .alias
4447            .clone()
4448            .unwrap_or_else(|| "unnest".to_string());
4449        let col_schema = ColumnSchema::new(alias.clone(), elem_dtype, true);
4450        let schema_cols = alloc::vec![col_schema.clone()];
4451        let scan_ctx = EvalContext::new(&schema_cols, Some(&alias));
4452        // Apply WHERE.
4453        let filtered: alloc::vec::Vec<Row> = if let Some(w) = &stmt.where_ {
4454            let mut out = alloc::vec::Vec::with_capacity(rows.len());
4455            for row in rows {
4456                cancel.check()?;
4457                let v = eval::eval_expr(w, &row, &scan_ctx).map_err(EngineError::Eval)?;
4458                if matches!(v, Value::Bool(true)) {
4459                    out.push(row);
4460                }
4461            }
4462            out
4463        } else {
4464            rows
4465        };
4466        // Projection.
4467        let projection = build_projection(&stmt.items, &schema_cols, &alias)?;
4468        let mut projected_rows: alloc::vec::Vec<Row> =
4469            alloc::vec::Vec::with_capacity(filtered.len());
4470        for row in &filtered {
4471            let mut vals = alloc::vec::Vec::with_capacity(projection.len());
4472            for p in &projection {
4473                vals.push(eval::eval_expr(&p.expr, row, &scan_ctx).map_err(EngineError::Eval)?);
4474            }
4475            projected_rows.push(Row::new(vals));
4476        }
4477        // ORDER BY / LIMIT — apply on the projected rows (cheap;
4478        // unnest result sets are small by design).
4479        let columns: alloc::vec::Vec<ColumnSchema> = projection
4480            .iter()
4481            .map(|p| ColumnSchema::new(p.output_name.clone(), p.ty, p.nullable))
4482            .collect();
4483        // Re-evaluate ORDER BY against the source schema (pre-projection
4484        // so col refs by name still resolve through `scan_ctx`).
4485        if !stmt.order_by.is_empty() {
4486            let mut indexed: alloc::vec::Vec<(usize, Vec<Value>)> = filtered
4487                .iter()
4488                .enumerate()
4489                .map(|(i, r)| -> Result<_, EngineError> {
4490                    let keys: Result<Vec<Value>, EngineError> = stmt
4491                        .order_by
4492                        .iter()
4493                        .map(|ob| {
4494                            eval::eval_expr(&ob.expr, r, &scan_ctx).map_err(EngineError::Eval)
4495                        })
4496                        .collect();
4497                    Ok((i, keys?))
4498                })
4499                .collect::<Result<_, _>>()?;
4500            indexed.sort_by(|a, b| {
4501                for (idx, (ka, kb)) in a.1.iter().zip(b.1.iter()).enumerate() {
4502                    let mut cmp = value_cmp(ka, kb);
4503                    if stmt.order_by[idx].desc {
4504                        cmp = cmp.reverse();
4505                    }
4506                    if cmp != core::cmp::Ordering::Equal {
4507                        return cmp;
4508                    }
4509                }
4510                core::cmp::Ordering::Equal
4511            });
4512            projected_rows = indexed
4513                .into_iter()
4514                .map(|(i, _)| projected_rows[i].clone())
4515                .collect();
4516        }
4517        // LIMIT / OFFSET — apply at the tail.
4518        if let Some(offset) = stmt.offset_literal() {
4519            let off = (offset as usize).min(projected_rows.len());
4520            projected_rows.drain(..off);
4521        }
4522        if let Some(limit) = stmt.limit_literal() {
4523            projected_rows.truncate(limit as usize);
4524        }
4525        Ok(QueryResult::Rows {
4526            columns,
4527            rows: projected_rows,
4528        })
4529    }
4530
4531    fn exec_bare_select_cancel(
4532        &self,
4533        stmt: &SelectStatement,
4534        cancel: CancelToken<'_>,
4535    ) -> Result<QueryResult, EngineError> {
4536        // v4.12: window-function path. When the projection contains
4537        // any `name(args) OVER (...)` we route to the dedicated
4538        // executor — partition + sort + per-row window value before
4539        // the regular projection.
4540        if select_has_window(stmt) {
4541            return self.exec_select_with_window(stmt, cancel);
4542        }
4543        // Constant SELECT (no FROM) — evaluate each item once against an
4544        // empty dummy row. Useful for `SELECT 1`, `SELECT coalesce(...)`,
4545        // `SELECT '7'::INT`. Column references will surface as
4546        // ColumnNotFound on eval since the schema is empty.
4547        let Some(from) = &stmt.from else {
4548            let empty_schema: Vec<ColumnSchema> = Vec::new();
4549            let ctx = self.ev_ctx(&empty_schema, None);
4550            let projection = build_projection(&stmt.items, &empty_schema, "")?;
4551            let dummy_row = Row::new(Vec::new());
4552            let mut values = Vec::with_capacity(projection.len());
4553            for p in &projection {
4554                values.push(eval::eval_expr(&p.expr, &dummy_row, &ctx)?);
4555            }
4556            let columns: Vec<ColumnSchema> = projection
4557                .into_iter()
4558                .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4559                .collect();
4560            return Ok(QueryResult::Rows {
4561                columns,
4562                rows: alloc::vec![Row::new(values)],
4563            });
4564        };
4565        // Multi-table FROM (one or more joined peers) goes through the
4566        // nested-loop join executor. Single-table FROM stays on the
4567        // existing scan + index-seek path.
4568        if !from.joins.is_empty() {
4569            return self.exec_joined_select(stmt, from);
4570        }
4571        // v7.11.7 — `FROM unnest(<expr>) [AS] <alias>`. Synthesise a
4572        // single-column table at SELECT entry by evaluating the
4573        // expression once against the empty row (UNNEST is
4574        // uncorrelated in v7.11; correlated / LATERAL unnest is a
4575        // v7.12 carve-out). Build a virtual `Table` in a heap-only
4576        // catalog, then route to the regular scan path.
4577        if from.primary.unnest_expr.is_some() {
4578            return self.exec_select_unnest(stmt, &from.primary, cancel);
4579        }
4580        let primary = &from.primary;
4581        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
4582            StorageError::TableNotFound {
4583                name: primary.name.clone(),
4584            }
4585        })?;
4586        let schema_cols = &table.schema().columns;
4587        // The qualifier accepted on column refs is the alias (if any) else the
4588        // bare table name.
4589        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
4590        let ctx = self.ev_ctx(schema_cols, Some(alias));
4591
4592        // NSW kNN planner: `ORDER BY col <-> literal LIMIT k` with no
4593        // WHERE and an NSW index on `col` skips the full scan. The
4594        // walk returns rows already in ascending-distance order, so
4595        // ORDER BY / LIMIT are honoured implicitly.
4596        if let Some(nsw_rows) = try_nsw_knn(stmt, table, schema_cols, alias) {
4597            return materialise_in_order(stmt, table, schema_cols, alias, &nsw_rows);
4598        }
4599
4600        // Index seek: if WHERE is `col = literal` (or commuted) and the
4601        // referenced column has an index, dispatch each locator through
4602        // the catalog (hot tier → borrow, cold tier → page-read +
4603        // decode) and iterate just those rows. Otherwise fall back to a
4604        // full scan over the hot tier (cold-tier rows are only reached
4605        // via index seek in v5.1 — full table scans against cold-tier
4606        // data ship in v5.2 with the freezer's per-segment scan API).
4607        let indexed_rows: Option<Vec<Cow<'_, Row>>> = stmt.where_.as_ref().and_then(|w| {
4608            // BTree / col=literal seek first — covers the v7.11.3 multi-
4609            // column AND case and the leading-column equality lookup.
4610            try_index_seek(w, schema_cols, self.active_catalog(), table, alias).or_else(|| {
4611                // v7.12.3 — GIN-accelerated `WHERE col @@ tsquery`
4612                // when the column has a `USING gin` index. Returns an
4613                // over-approximate candidate set; the WHERE re-eval
4614                // loop below verifies the full `@@` predicate per row.
4615                try_gin_seek(w, schema_cols, self.active_catalog(), table, alias, &ctx)
4616            })
4617        });
4618
4619        // Aggregate path: filter rows first, then hand off to the
4620        // aggregate executor which does its own projection + ORDER BY.
4621        if aggregate::uses_aggregate(stmt) {
4622            let mut filtered: Vec<&Row> = Vec::new();
4623            // v6.2.6 — Memoize: per-query LRU cache for correlated
4624            // scalar subqueries. Fresh per row-loop entry so each
4625            // SELECT execution gets an isolated cache.
4626            let mut memo = memoize::MemoizeCache::new();
4627            if let Some(rows) = &indexed_rows {
4628                for cow in rows {
4629                    let row = cow.as_ref();
4630                    if let Some(where_expr) = &stmt.where_ {
4631                        let cond = self.eval_expr_with_correlated(
4632                            where_expr,
4633                            row,
4634                            &ctx,
4635                            cancel,
4636                            Some(&mut memo),
4637                        )?;
4638                        if !matches!(cond, Value::Bool(true)) {
4639                            continue;
4640                        }
4641                    }
4642                    filtered.push(row);
4643                }
4644            } else {
4645                for i in 0..table.row_count() {
4646                    let row = &table.rows()[i];
4647                    if let Some(where_expr) = &stmt.where_ {
4648                        let cond = self.eval_expr_with_correlated(
4649                            where_expr,
4650                            row,
4651                            &ctx,
4652                            cancel,
4653                            Some(&mut memo),
4654                        )?;
4655                        if !matches!(cond, Value::Bool(true)) {
4656                            continue;
4657                        }
4658                    }
4659                    filtered.push(row);
4660                }
4661            }
4662            let mut agg = aggregate::run(stmt, &filtered, schema_cols, Some(alias))?;
4663            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
4664            return Ok(QueryResult::Rows {
4665                columns: agg.columns,
4666                rows: agg.rows,
4667            });
4668        }
4669
4670        let projection = build_projection(&stmt.items, schema_cols, alias)?;
4671
4672        // Materialise the filter pass into `(order_key, projected_row)`
4673        // tuples. The order key is `None` when there's no ORDER BY clause.
4674        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
4675        // v6.2.6 — Memoize per-row WHERE eval shares one cache.
4676        let mut memo = memoize::MemoizeCache::new();
4677        // Inline the per-row work in a closure so the indexed and full-
4678        // scan branches share the body.
4679        let mut process_row = |row: &Row, loop_idx: usize| -> Result<(), EngineError> {
4680            if loop_idx.is_multiple_of(256) {
4681                cancel.check()?;
4682            }
4683            if let Some(where_expr) = &stmt.where_ {
4684                let cond =
4685                    self.eval_expr_with_correlated(where_expr, row, &ctx, cancel, Some(&mut memo))?;
4686                if !matches!(cond, Value::Bool(true)) {
4687                    return Ok(());
4688                }
4689            }
4690            let mut values = Vec::with_capacity(projection.len());
4691            for p in &projection {
4692                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4693            }
4694            let order_keys = if stmt.order_by.is_empty() {
4695                Vec::new()
4696            } else {
4697                build_order_keys(&stmt.order_by, row, &ctx)?
4698            };
4699            tagged.push((order_keys, Row::new(values)));
4700            Ok(())
4701        };
4702        if let Some(rows) = &indexed_rows {
4703            for (loop_idx, cow) in rows.iter().enumerate() {
4704                process_row(cow.as_ref(), loop_idx)?;
4705            }
4706        } else {
4707            for i in 0..table.row_count() {
4708                process_row(&table.rows()[i], i)?;
4709            }
4710        }
4711
4712        if !stmt.order_by.is_empty() {
4713            // Partial-sort fast path: when LIMIT is small relative to
4714            // the row count, select_nth_unstable + sort just the
4715            // prefix is O(n + k log k) instead of O(n log n). DISTINCT
4716            // requires the full sort because de-dup happens after.
4717            let keep = if stmt.distinct {
4718                None
4719            } else {
4720                stmt.limit_literal()
4721                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
4722            };
4723            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4724            partial_sort_tagged(&mut tagged, keep, &descs);
4725        }
4726
4727        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4728        if stmt.distinct {
4729            output_rows = dedup_rows(output_rows);
4730        }
4731        apply_offset_and_limit(
4732            &mut output_rows,
4733            stmt.offset_literal(),
4734            stmt.limit_literal(),
4735        );
4736
4737        let columns: Vec<ColumnSchema> = projection
4738            .into_iter()
4739            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4740            .collect();
4741
4742        Ok(QueryResult::Rows {
4743            columns,
4744            rows: output_rows,
4745        })
4746    }
4747
4748    /// Multi-table SELECT executor (one or more JOIN peers).
4749    ///
4750    /// v1.10 builds the joined row set up-front via nested-loop joins,
4751    /// then runs WHERE + projection + ORDER BY against the combined
4752    /// rows. No index seek. Aggregates and DISTINCT still work because
4753    /// the executor delegates projection through the same shared paths.
4754    #[allow(clippy::too_many_lines)]
4755    fn exec_joined_select(
4756        &self,
4757        stmt: &SelectStatement,
4758        from: &FromClause,
4759    ) -> Result<QueryResult, EngineError> {
4760        // Resolve every table reference up front so we surface
4761        // TableNotFound before we start the cartesian work.
4762        let primary_table = self
4763            .active_catalog()
4764            .get(&from.primary.name)
4765            .ok_or_else(|| StorageError::TableNotFound {
4766                name: from.primary.name.clone(),
4767            })?;
4768        let primary_alias = from
4769            .primary
4770            .alias
4771            .as_deref()
4772            .unwrap_or(from.primary.name.as_str())
4773            .to_string();
4774        let mut joined_tables: Vec<(&Table, String, JoinKind, Option<&Expr>)> = Vec::new();
4775        for j in &from.joins {
4776            let t = self.active_catalog().get(&j.table.name).ok_or_else(|| {
4777                StorageError::TableNotFound {
4778                    name: j.table.name.clone(),
4779                }
4780            })?;
4781            let a = j
4782                .table
4783                .alias
4784                .as_deref()
4785                .unwrap_or(j.table.name.as_str())
4786                .to_string();
4787            joined_tables.push((t, a, j.kind, j.on.as_ref()));
4788        }
4789
4790        // Build the combined schema: composite "alias.col" names so the
4791        // qualified-column resolver can find anything by exact match.
4792        let mut combined_schema: Vec<ColumnSchema> = Vec::new();
4793        for col in &primary_table.schema().columns {
4794            combined_schema.push(ColumnSchema::new(
4795                alloc::format!("{primary_alias}.{}", col.name),
4796                col.ty,
4797                col.nullable,
4798            ));
4799        }
4800        for (t, a, _, _) in &joined_tables {
4801            for col in &t.schema().columns {
4802                combined_schema.push(ColumnSchema::new(
4803                    alloc::format!("{a}.{}", col.name),
4804                    col.ty,
4805                    col.nullable,
4806                ));
4807            }
4808        }
4809        let ctx = EvalContext::new(&combined_schema, None);
4810
4811        // Nested-loop join. Starting set: every primary row, padded with
4812        // (no joined columns yet).
4813        let mut working: Vec<Row> = primary_table.rows().iter().cloned().collect();
4814        let mut produced_len = primary_table.schema().columns.len();
4815        for (t, _, kind, on) in &joined_tables {
4816            let right_arity = t.schema().columns.len();
4817            let mut next: Vec<Row> = Vec::new();
4818            for left in &working {
4819                let mut left_matched = false;
4820                for right in t.rows() {
4821                    let mut combined_vals = left.values.clone();
4822                    combined_vals.extend(right.values.iter().cloned());
4823                    // Pad combined to the eventual full width so the
4824                    // partial schema still matches positions used by ON.
4825                    let combined = Row::new(combined_vals);
4826                    let keep = if let Some(on_expr) = on {
4827                        let cond = eval::eval_expr(on_expr, &combined, &ctx)?;
4828                        matches!(cond, Value::Bool(true))
4829                    } else {
4830                        // CROSS / comma-list: every pair survives.
4831                        true
4832                    };
4833                    if keep {
4834                        next.push(combined);
4835                        left_matched = true;
4836                    }
4837                }
4838                if !left_matched && matches!(kind, JoinKind::Left) {
4839                    // LEFT OUTER JOIN: emit the left row with NULLs on
4840                    // the right side when no peer matched.
4841                    let mut combined_vals = left.values.clone();
4842                    for _ in 0..right_arity {
4843                        combined_vals.push(Value::Null);
4844                    }
4845                    next.push(Row::new(combined_vals));
4846                }
4847            }
4848            working = next;
4849            produced_len += right_arity;
4850            debug_assert!(produced_len <= combined_schema.len());
4851        }
4852
4853        // WHERE filter against combined rows.
4854        let mut filtered: Vec<Row> = Vec::new();
4855        for row in working {
4856            if let Some(where_expr) = &stmt.where_ {
4857                let cond = eval::eval_expr(where_expr, &row, &ctx)?;
4858                if !matches!(cond, Value::Bool(true)) {
4859                    continue;
4860                }
4861            }
4862            filtered.push(row);
4863        }
4864
4865        // Aggregate path: handle GROUP BY / aggregate calls over the
4866        // joined+filtered rows.
4867        if aggregate::uses_aggregate(stmt) {
4868            let refs: Vec<&Row> = filtered.iter().collect();
4869            let mut agg = aggregate::run(stmt, &refs, &combined_schema, None)?;
4870            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
4871            return Ok(QueryResult::Rows {
4872                columns: agg.columns,
4873                rows: agg.rows,
4874            });
4875        }
4876
4877        let projection = build_projection(&stmt.items, &combined_schema, "")?;
4878        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
4879        for row in &filtered {
4880            let mut values = Vec::with_capacity(projection.len());
4881            for p in &projection {
4882                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4883            }
4884            let order_keys = if stmt.order_by.is_empty() {
4885                Vec::new()
4886            } else {
4887                build_order_keys(&stmt.order_by, row, &ctx)?
4888            };
4889            tagged.push((order_keys, Row::new(values)));
4890        }
4891        if !stmt.order_by.is_empty() {
4892            let keep = if stmt.distinct {
4893                None
4894            } else {
4895                stmt.limit_literal()
4896                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
4897            };
4898            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4899            partial_sort_tagged(&mut tagged, keep, &descs);
4900        }
4901        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4902        if stmt.distinct {
4903            output_rows = dedup_rows(output_rows);
4904        }
4905        apply_offset_and_limit(
4906            &mut output_rows,
4907            stmt.offset_literal(),
4908            stmt.limit_literal(),
4909        );
4910        let columns: Vec<ColumnSchema> = projection
4911            .into_iter()
4912            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4913            .collect();
4914        Ok(QueryResult::Rows {
4915            columns,
4916            rows: output_rows,
4917        })
4918    }
4919}
4920
4921/// One row-producing projection: an expression to evaluate, the resulting
4922/// column's user-visible name, its inferred type, and nullability.
4923#[derive(Debug, Clone)]
4924struct ProjectedItem {
4925    expr: Expr,
4926    output_name: String,
4927    ty: DataType,
4928    nullable: bool,
4929}
4930
4931/// Dedupe a row set, preserving first-seen order. `Row`'s `PartialEq` is
4932/// structural (`Vec<Value>` ⇒ pairwise `Value` equality), which gives SQL
4933/// `NULL = NULL → TRUE` and `NaN = NaN → FALSE`. The first agrees with
4934/// the spec's "two NULLs are not distinct"; the second is a tolerated
4935/// quirk for v1 (no NaN literals are reachable from the SQL surface).
4936fn dedup_rows(rows: Vec<Row>) -> Vec<Row> {
4937    let mut out: Vec<Row> = Vec::with_capacity(rows.len());
4938    for r in rows {
4939        if !out.iter().any(|seen| seen == &r) {
4940            out.push(r);
4941        }
4942    }
4943    out
4944}
4945
4946/// Coerce a `Value` to an `f64` sort key for ORDER BY. Numbers map directly;
4947/// NULL sorts last (treated as `+∞`); booleans are 0.0 / 1.0; text uses lex
4948/// order via the byte values; vectors are not sortable.
4949fn value_to_order_key(v: &Value) -> Result<f64, EngineError> {
4950    match v {
4951        Value::Null => Ok(f64::INFINITY),
4952        Value::SmallInt(n) => Ok(f64::from(*n)),
4953        Value::Int(n) => Ok(f64::from(*n)),
4954        Value::Date(d) => Ok(f64::from(*d)),
4955        #[allow(clippy::cast_precision_loss)]
4956        Value::Timestamp(t) => Ok(*t as f64),
4957        #[allow(clippy::cast_precision_loss)]
4958        Value::Numeric { scaled, scale } => {
4959            // Scaled integer / 10^scale, computed via f64 for sort
4960            // ordering only. Precision losses here only matter for
4961            // ORDER BY tie-breaks well past 15 significant digits.
4962            // `f64::powi` lives in std; we hand-roll the loop so the
4963            // no_std engine crate doesn't need it.
4964            let mut divisor = 1.0_f64;
4965            for _ in 0..*scale {
4966                divisor *= 10.0;
4967            }
4968            Ok((*scaled as f64) / divisor)
4969        }
4970        #[allow(clippy::cast_precision_loss)]
4971        Value::BigInt(n) => Ok(*n as f64),
4972        Value::Float(x) => Ok(*x),
4973        Value::Bool(b) => Ok(if *b { 1.0 } else { 0.0 }),
4974        Value::Text(s) => {
4975            // Lex order by codepoints — good enough for ORDER BY name.
4976            // Map first 8 bytes packed into u64 as a coarse key; ties fall to
4977            // partial_cmp Equal. v1.x can swap in a real string comparator.
4978            let mut key: u64 = 0;
4979            for &b in s.as_bytes().iter().take(8) {
4980                key = (key << 8) | u64::from(b);
4981            }
4982            #[allow(clippy::cast_precision_loss)]
4983            Ok(key as f64)
4984        }
4985        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
4986            Err(EngineError::Unsupported(
4987                "ORDER BY of a raw vector column is not meaningful — use `<->`".into(),
4988            ))
4989        }
4990        Value::Interval { .. } => Err(EngineError::Unsupported(
4991            "ORDER BY of an INTERVAL is not supported in v2.11 \
4992             (months vs micros has no single canonical ordering)"
4993                .into(),
4994        )),
4995        Value::Json(_) => Err(EngineError::Unsupported(
4996            "ORDER BY of a JSON value is not supported — cast the document to text first".into(),
4997        )),
4998        // v7.5.0 — Value is #[non_exhaustive]; future variants need
4999        // an explicit ORDER BY mapping. Surface as Unsupported until
5000        // engine support is added.
5001        _ => Err(EngineError::Unsupported(
5002            "ORDER BY of this value type is not supported".into(),
5003        )),
5004    }
5005}
5006
5007/// Try to plan a WHERE clause as an equality lookup against an existing
5008/// index. Returns the candidate row indices on success; `None` means the
5009/// caller should fall back to a full scan.
5010///
5011/// v0.8 recognises a single top-level `col = literal` (in either operand
5012/// order). AND chains and range scans land in later milestones.
5013/// Look for `ORDER BY col <dist-op> literal LIMIT k` against an
5014/// NSW-indexed vector column. Recognised distance ops: `<->` (L2),
5015/// `<#>` (inner product), `<=>` (cosine). When a WHERE clause is
5016/// present, the planner does an "over-fetch and filter" pass — it
5017/// asks the graph for `k * over_fetch` candidates, evaluates WHERE
5018/// against each, and trims back to `k`. Returns the row indices in
5019/// ascending-distance order when the plan applies.
5020fn try_nsw_knn(
5021    stmt: &SelectStatement,
5022    table: &Table,
5023    schema_cols: &[ColumnSchema],
5024    table_alias: &str,
5025) -> Option<Vec<usize>> {
5026    if stmt.distinct {
5027        return None;
5028    }
5029    let limit = usize::try_from(stmt.limit_literal()?).ok()?;
5030    if limit == 0 {
5031        return None;
5032    }
5033    // v6.4.0 — NSW kNN dispatch needs a single ORDER BY key on the
5034    // distance metric. Multi-key ORDER BY falls through to the
5035    // generic sort path.
5036    if stmt.order_by.len() != 1 {
5037        return None;
5038    }
5039    let order = &stmt.order_by[0];
5040    // NSW kNN returns rows ascending by distance — DESC inverts the
5041    // natural order, so the planner can't handle it without a sort
5042    // pass. Fall back to the generic ORDER BY path.
5043    if order.desc {
5044        return None;
5045    }
5046    let Expr::Binary { lhs, op, rhs } = &order.expr else {
5047        return None;
5048    };
5049    let metric = match op {
5050        BinOp::L2Distance => spg_storage::NswMetric::L2,
5051        BinOp::InnerProduct => spg_storage::NswMetric::InnerProduct,
5052        BinOp::CosineDistance => spg_storage::NswMetric::Cosine,
5053        _ => return None,
5054    };
5055    // Accept both `col <op> literal` and `literal <op> col`.
5056    let ((Expr::Column(col), literal) | (literal, Expr::Column(col))) =
5057        (lhs.as_ref(), rhs.as_ref())
5058    else {
5059        return None;
5060    };
5061    if let Some(q) = &col.qualifier
5062        && q != table_alias
5063    {
5064        return None;
5065    }
5066    let col_pos = schema_cols.iter().position(|s| s.name == col.name)?;
5067    let query = literal_to_vector(literal)?;
5068    let idx = spg_storage::nsw_index_on(table, col_pos)?;
5069    if let Some(where_expr) = &stmt.where_ {
5070        // Over-fetch and filter. The factor (10×) is a heuristic that
5071        // covers typical selectivity for the corpus tests; v2.x will
5072        // make it configurable.
5073        let over_fetch = limit.saturating_mul(10).max(NSW_OVER_FETCH_FLOOR);
5074        let candidates = spg_storage::nsw_query(table, &idx.name, &query, over_fetch, metric);
5075        let ctx = EvalContext::new(schema_cols, Some(table_alias));
5076        let mut kept: Vec<usize> = Vec::with_capacity(limit);
5077        for i in candidates {
5078            let row = &table.rows()[i];
5079            let cond = eval::eval_expr(where_expr, row, &ctx).ok()?;
5080            if matches!(cond, Value::Bool(true)) {
5081                kept.push(i);
5082                if kept.len() >= limit {
5083                    break;
5084                }
5085            }
5086        }
5087        Some(kept)
5088    } else {
5089        Some(spg_storage::nsw_query(
5090            table, &idx.name, &query, limit, metric,
5091        ))
5092    }
5093}
5094
5095/// Lower bound on the over-fetch pool when WHERE is present — even
5096/// for tiny `LIMIT 1` queries we keep enough candidates to absorb a
5097/// few WHERE rejections.
5098const NSW_OVER_FETCH_FLOOR: usize = 32;
5099
5100/// Pull a `Vec<f32>` out of a literal-or-cast expression. Returns
5101/// `None` for anything we can't fold at plan time.
5102fn literal_to_vector(e: &Expr) -> Option<Vec<f32>> {
5103    match e {
5104        Expr::Literal(Literal::Vector(v)) => Some(v.clone()),
5105        Expr::Cast { expr, .. } => literal_to_vector(expr),
5106        _ => None,
5107    }
5108}
5109
5110/// Materialise rows in a planner-supplied order (used by the NSW path)
5111/// without re-running ORDER BY. The projection + LIMIT slot mirror the
5112/// equivalent block in `exec_bare_select`.
5113fn materialise_in_order(
5114    stmt: &SelectStatement,
5115    table: &Table,
5116    schema_cols: &[ColumnSchema],
5117    table_alias: &str,
5118    ordered_rows: &[usize],
5119) -> Result<QueryResult, EngineError> {
5120    let ctx = EvalContext::new(schema_cols, Some(table_alias));
5121    let projection = build_projection(&stmt.items, schema_cols, table_alias)?;
5122    let mut output_rows: Vec<Row> = Vec::with_capacity(ordered_rows.len());
5123    for &i in ordered_rows {
5124        let row = &table.rows()[i];
5125        let mut values = Vec::with_capacity(projection.len());
5126        for p in &projection {
5127            values.push(eval::eval_expr(&p.expr, row, &ctx)?);
5128        }
5129        output_rows.push(Row::new(values));
5130    }
5131    apply_offset_and_limit(
5132        &mut output_rows,
5133        stmt.offset_literal(),
5134        stmt.limit_literal(),
5135    );
5136    let columns: Vec<ColumnSchema> = projection
5137        .into_iter()
5138        .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
5139        .collect();
5140    Ok(QueryResult::Rows {
5141        columns,
5142        rows: output_rows,
5143    })
5144}
5145
5146fn try_index_seek<'a>(
5147    where_expr: &Expr,
5148    schema_cols: &[ColumnSchema],
5149    catalog: &'a Catalog,
5150    table: &'a Table,
5151    table_alias: &str,
5152) -> Option<Vec<Cow<'a, Row>>> {
5153    // v7.11.3 — recurse through top-level `AND` so a PG-style
5154    // composite predicate like `WHERE id = 1 AND created_at > $1`
5155    // still hits the index on `id`. The caller re-applies the
5156    // full WHERE expression to each returned row, so dropping the
5157    // residual conjuncts here is correct — the index just narrows
5158    // the candidate set.
5159    if let Expr::Binary {
5160        lhs,
5161        op: BinOp::And,
5162        rhs,
5163    } = where_expr
5164    {
5165        // Try LHS first (typical convention: leading equality on
5166        // the indexed column comes first in user-written SQL).
5167        if let Some(rows) = try_index_seek(lhs, schema_cols, catalog, table, table_alias) {
5168            return Some(rows);
5169        }
5170        return try_index_seek(rhs, schema_cols, catalog, table, table_alias);
5171    }
5172    let Expr::Binary {
5173        lhs,
5174        op: BinOp::Eq,
5175        rhs,
5176    } = where_expr
5177    else {
5178        return None;
5179    };
5180    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
5181        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
5182    let idx = table.index_on(col_pos)?;
5183    let key = IndexKey::from_value(&value)?;
5184    let locators = idx.lookup_eq(&key);
5185    let table_name = table.schema().name.as_str();
5186    // v5.1: each locator dispatches to either the hot tier (zero-
5187    // copy borrow of `table.rows()[i]`) or a cold-tier segment
5188    // (one page read + dense row decode, ~µs scale). Cold rows are
5189    // returned as `Cow::Owned` so the caller's `&Row` iteration
5190    // doesn't see a tier distinction; pre-freezer (no cold
5191    // segments loaded) every locator is `Hot` and every entry is
5192    // `Cow::Borrowed` — identical cost to the pre-v5.1 path.
5193    let mut out: Vec<Cow<'a, Row>> = Vec::with_capacity(locators.len());
5194    for loc in locators {
5195        match *loc {
5196            spg_storage::RowLocator::Hot(i) => {
5197                if let Some(row) = table.rows().get(i) {
5198                    out.push(Cow::Borrowed(row));
5199                }
5200            }
5201            spg_storage::RowLocator::Cold { segment_id, .. } => {
5202                if let Some(row) = catalog.resolve_cold_locator(table_name, segment_id, &key) {
5203                    out.push(Cow::Owned(row));
5204                }
5205            }
5206        }
5207    }
5208    Some(out)
5209}
5210
5211/// v7.12.3 — GIN-accelerated candidate seek for `WHERE col @@ <ts_query>`.
5212///
5213/// Recurses through top-level `AND` like [`try_index_seek`] so a
5214/// composite predicate `WHERE search_vector @@ q AND id > $1` still
5215/// hits the GIN index on `search_vector` — the caller re-applies the
5216/// full WHERE expression to each returned candidate, so dropping the
5217/// `id > $1` residual here stays semantically correct.
5218///
5219/// Returns `None` when:
5220///   - no leaf is a `col @@ <rhs>` shape on a GIN-indexed column;
5221///   - the RHS can't be const-evaluated to a `Value::TsQuery`
5222///     (typically because it references row columns);
5223///   - the resolved `TsQuery` uses query shapes the MVP doesn't
5224///     accelerate (`Not`, `Phrase` — those fall through to full scan).
5225///
5226/// On `Some(rows)` the caller iterates only `rows` and re-evaluates
5227/// the full `@@` predicate per row, so an over-approximate candidate
5228/// set is safe.
5229fn try_gin_seek<'a>(
5230    where_expr: &Expr,
5231    schema_cols: &[ColumnSchema],
5232    catalog: &'a Catalog,
5233    table: &'a Table,
5234    table_alias: &str,
5235    ctx: &eval::EvalContext<'_>,
5236) -> Option<Vec<Cow<'a, Row>>> {
5237    if let Expr::Binary {
5238        lhs,
5239        op: BinOp::And,
5240        rhs,
5241    } = where_expr
5242    {
5243        if let Some(rows) = try_gin_seek(lhs, schema_cols, catalog, table, table_alias, ctx) {
5244            return Some(rows);
5245        }
5246        return try_gin_seek(rhs, schema_cols, catalog, table, table_alias, ctx);
5247    }
5248    let Expr::Binary {
5249        lhs,
5250        op: BinOp::TsMatch,
5251        rhs,
5252    } = where_expr
5253    else {
5254        return None;
5255    };
5256    // Either side can be the column; pgvector idiom (`vec @@ q`)
5257    // hits the first arm, FROM-clause-derived (`plainto_tsquery($1)
5258    // q ... WHERE search_vector @@ q`) the same. CROSS JOIN derived
5259    // tables resolve `q` to a Column too.
5260    let (col_pos, query) = resolve_gin_col_query(lhs, rhs, schema_cols, table_alias, ctx)
5261        .or_else(|| resolve_gin_col_query(rhs, lhs, schema_cols, table_alias, ctx))?;
5262    let idx = table
5263        .indices()
5264        .iter()
5265        .find(|i| i.column_position == col_pos && i.is_gin())?;
5266    let candidates = gin_query_candidates(idx, &query)?;
5267    let _ = catalog; // cold-tier row resolution unused in MVP; see below.
5268    let mut out: Vec<Cow<'a, Row>> = Vec::with_capacity(candidates.len());
5269    for loc in candidates {
5270        match loc {
5271            spg_storage::RowLocator::Hot(i) => {
5272                if let Some(row) = table.rows().get(i) {
5273                    out.push(Cow::Borrowed(row));
5274                }
5275            }
5276            // GIN cold-tier rows in the MVP: skipped, matching the
5277            // full-scan `@@` path which itself only iterates
5278            // `table.rows()` (hot tier). When v7.13+ adds cold-tier
5279            // scan-time materialisation for `@@`, the parallel
5280            // resolution lands here; until then both paths see the
5281            // same hot-only candidate set so correctness is preserved.
5282            spg_storage::RowLocator::Cold { .. } => {}
5283        }
5284    }
5285    Some(out)
5286}
5287
5288/// v7.12.3 — extract `(column_position, TsQueryAst)` when one side of
5289/// the binary is a column reference to a GIN-indexed tsvector column
5290/// and the other side const-evaluates to a `Value::TsQuery`. Returns
5291/// `None` if the column reference is for the wrong table alias, or if
5292/// the RHS expression depends on row data.
5293fn resolve_gin_col_query(
5294    col_side: &Expr,
5295    query_side: &Expr,
5296    schema_cols: &[ColumnSchema],
5297    table_alias: &str,
5298    ctx: &eval::EvalContext<'_>,
5299) -> Option<(usize, spg_storage::TsQueryAst)> {
5300    let Expr::Column(c) = col_side else {
5301        return None;
5302    };
5303    if let Some(q) = &c.qualifier
5304        && q != table_alias
5305    {
5306        return None;
5307    }
5308    let pos = schema_cols.iter().position(|s| s.name == c.name)?;
5309    // Const-evaluate the query side with an empty row — fails fast
5310    // (with a `ColumnNotFound` / similar) if the expression actually
5311    // depends on row data, which is exactly the bail signal we want.
5312    let empty_row = Row::new(Vec::new());
5313    let v = eval::eval_expr(query_side, &empty_row, ctx).ok()?;
5314    let Value::TsQuery(q) = v else { return None };
5315    Some((pos, q))
5316}
5317
5318/// v7.12.3 — walk a `TsQueryAst` against an [`IndexKind::Gin`] index
5319/// to produce a candidate row-locator set. Returns `None` for query
5320/// shapes the MVP doesn't accelerate (`Not` / `Phrase` — both bail to
5321/// full scan since their semantics need either complementation across
5322/// the whole row set or positional verification beyond what the
5323/// posting list carries).
5324///
5325/// Candidate sets are over-approximate — the caller re-applies the
5326/// full `@@` predicate per row, so reporting "row was in some
5327/// posting list" without verifying positions / weights stays correct.
5328fn gin_query_candidates(
5329    idx: &spg_storage::Index,
5330    query: &spg_storage::TsQueryAst,
5331) -> Option<Vec<spg_storage::RowLocator>> {
5332    use spg_storage::TsQueryAst;
5333    match query {
5334        TsQueryAst::Term { word, .. } => {
5335            let mut v: Vec<spg_storage::RowLocator> = idx.gin_lookup_word(word).to_vec();
5336            v.sort_by_key(locator_sort_key);
5337            v.dedup_by_key(|l| locator_sort_key(l));
5338            Some(v)
5339        }
5340        TsQueryAst::And(l, r) => {
5341            let mut left = gin_query_candidates(idx, l)?;
5342            let mut right = gin_query_candidates(idx, r)?;
5343            left.sort_by_key(locator_sort_key);
5344            right.sort_by_key(locator_sort_key);
5345            // Sorted-merge intersection.
5346            let mut out: Vec<spg_storage::RowLocator> = Vec::new();
5347            let (mut i, mut j) = (0usize, 0usize);
5348            while i < left.len() && j < right.len() {
5349                let lk = locator_sort_key(&left[i]);
5350                let rk = locator_sort_key(&right[j]);
5351                match lk.cmp(&rk) {
5352                    core::cmp::Ordering::Less => i += 1,
5353                    core::cmp::Ordering::Greater => j += 1,
5354                    core::cmp::Ordering::Equal => {
5355                        out.push(left[i]);
5356                        i += 1;
5357                        j += 1;
5358                    }
5359                }
5360            }
5361            Some(out)
5362        }
5363        TsQueryAst::Or(l, r) => {
5364            let mut out = gin_query_candidates(idx, l)?;
5365            out.extend(gin_query_candidates(idx, r)?);
5366            out.sort_by_key(locator_sort_key);
5367            out.dedup_by_key(|l| locator_sort_key(l));
5368            Some(out)
5369        }
5370        // Not / Phrase bail to full scan in the MVP. Not needs
5371        // complementation against the whole row set (not represented
5372        // in the posting-list view); Phrase needs positional
5373        // verification beyond what `word → rows` carries.
5374        TsQueryAst::Not(_) | TsQueryAst::Phrase { .. } => None,
5375    }
5376}
5377
5378/// v7.12.3 — total ordering on `RowLocator` for sort/dedup purposes
5379/// inside the GIN intersection / union loops. Hot rows order by their
5380/// row index; Cold rows order after all Hot rows, then by
5381/// `(segment_id, the cold sub-key)`.
5382fn locator_sort_key(l: &spg_storage::RowLocator) -> (u8, u64, u64) {
5383    match *l {
5384        spg_storage::RowLocator::Hot(i) => (0, i as u64, 0),
5385        spg_storage::RowLocator::Cold {
5386            segment_id,
5387            page_offset,
5388        } => (1, u64::from(segment_id), u64::from(page_offset)),
5389    }
5390}
5391
5392/// v5.2.3: extract `(column_position, IndexKey)` when `where_expr`
5393/// is a simple `col = literal` predicate suitable for a `BTree` index
5394/// seek. Used by `exec_update_cancel` / `exec_delete_cancel` to
5395/// decide whether a write touches a cold-tier row (which requires
5396/// promote-on-write / shadow-on-delete) before falling through to
5397/// the hot-tier row walk.
5398///
5399/// Returns `None` for any predicate shape the planner can't push
5400/// down to an index seek — complex WHERE clauses always take the
5401/// hot-only path (cold rows are immutable to non-indexed writes
5402/// until a future scan-fanout sub-version).
5403fn try_pk_predicate(
5404    where_expr: &Expr,
5405    schema_cols: &[ColumnSchema],
5406    table_alias: &str,
5407) -> Option<(usize, IndexKey)> {
5408    let Expr::Binary {
5409        lhs,
5410        op: BinOp::Eq,
5411        rhs,
5412    } = where_expr
5413    else {
5414        return None;
5415    };
5416    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
5417        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
5418    let key = IndexKey::from_value(&value)?;
5419    Some((col_pos, key))
5420}
5421
5422fn resolve_col_literal_pair(
5423    col_side: &Expr,
5424    lit_side: &Expr,
5425    schema_cols: &[ColumnSchema],
5426    table_alias: &str,
5427) -> Option<(usize, Value)> {
5428    let Expr::Column(c) = col_side else {
5429        return None;
5430    };
5431    if let Some(q) = &c.qualifier
5432        && q != table_alias
5433    {
5434        return None;
5435    }
5436    let pos = schema_cols.iter().position(|s| s.name == c.name)?;
5437    let Expr::Literal(l) = lit_side else {
5438        return None;
5439    };
5440    let v = match l {
5441        Literal::Integer(n) => {
5442            if let Ok(small) = i32::try_from(*n) {
5443                Value::Int(small)
5444            } else {
5445                Value::BigInt(*n)
5446            }
5447        }
5448        Literal::Float(x) => Value::Float(*x),
5449        Literal::String(s) => Value::Text(s.clone()),
5450        Literal::Bool(b) => Value::Bool(*b),
5451        Literal::Null => Value::Null,
5452        // Vector and Interval literals can't be used as B-tree index keys.
5453        // Tell the planner to fall back to full-scan.
5454        Literal::Vector(_) | Literal::Interval { .. } => return None,
5455    };
5456    Some((pos, v))
5457}
5458
5459/// Find the schema entry that a SELECT-list `Expr::Column` refers to.
5460/// Mirrors `resolve_column` in `eval.rs`, but returns a proper
5461/// `EngineError` so the projection-build path keeps `UnknownQualifier`
5462/// vs `ColumnNotFound` distinct.
5463fn resolve_projection_column<'a>(
5464    c: &ColumnName,
5465    schema_cols: &'a [ColumnSchema],
5466    table_alias: &str,
5467) -> Result<&'a ColumnSchema, EngineError> {
5468    if let Some(q) = &c.qualifier {
5469        let composite = alloc::format!("{q}.{name}", name = c.name);
5470        if let Some(s) = schema_cols.iter().find(|s| s.name == composite) {
5471            return Ok(s);
5472        }
5473        // Single-table case: the qualifier may equal the active alias —
5474        // then look for the bare column name.
5475        if q == table_alias
5476            && let Some(s) = schema_cols.iter().find(|s| s.name == c.name)
5477        {
5478            return Ok(s);
5479        }
5480        // For multi-table schemas the qualifier is unknown only if no
5481        // column bears the "<q>." prefix. For single-table, the alias
5482        // mismatch alone is enough.
5483        let prefix = alloc::format!("{q}.");
5484        let qualifier_known =
5485            q == table_alias || schema_cols.iter().any(|s| s.name.starts_with(&prefix));
5486        if !qualifier_known {
5487            return Err(EngineError::Eval(EvalError::UnknownQualifier {
5488                qualifier: q.clone(),
5489            }));
5490        }
5491        return Err(EngineError::Eval(EvalError::ColumnNotFound {
5492            name: c.name.clone(),
5493        }));
5494    }
5495    if let Some(s) = schema_cols.iter().find(|s| s.name == c.name) {
5496        return Ok(s);
5497    }
5498    let suffix = alloc::format!(".{name}", name = c.name);
5499    let mut matches = schema_cols.iter().filter(|s| s.name.ends_with(&suffix));
5500    let first = matches.next();
5501    let extra = matches.next();
5502    match (first, extra) {
5503        (Some(s), None) => Ok(s),
5504        (Some(_), Some(_)) => Err(EngineError::Eval(EvalError::TypeMismatch {
5505            detail: alloc::format!("ambiguous column reference: {}", c.name),
5506        })),
5507        _ => Err(EngineError::Eval(EvalError::ColumnNotFound {
5508            name: c.name.clone(),
5509        })),
5510    }
5511}
5512
5513fn build_projection(
5514    items: &[SelectItem],
5515    schema_cols: &[ColumnSchema],
5516    table_alias: &str,
5517) -> Result<Vec<ProjectedItem>, EngineError> {
5518    let mut out = Vec::new();
5519    for item in items {
5520        match item {
5521            SelectItem::Wildcard => {
5522                for col in schema_cols {
5523                    out.push(ProjectedItem {
5524                        expr: Expr::Column(ColumnName {
5525                            qualifier: None,
5526                            name: col.name.clone(),
5527                        }),
5528                        output_name: col.name.clone(),
5529                        ty: col.ty,
5530                        nullable: col.nullable,
5531                    });
5532                }
5533            }
5534            SelectItem::Expr { expr, alias } => {
5535                // Plain column ref keeps full schema info (real type +
5536                // nullability). Compound expressions evaluate fine but have
5537                // no static type — surface them as nullable TEXT, which is
5538                // what most clients render anyway.
5539                if let Expr::Column(c) = expr {
5540                    let sch = resolve_projection_column(c, schema_cols, table_alias)?;
5541                    let output_name = alias.clone().unwrap_or_else(|| c.name.clone());
5542                    out.push(ProjectedItem {
5543                        expr: expr.clone(),
5544                        output_name,
5545                        ty: sch.ty,
5546                        nullable: sch.nullable,
5547                    });
5548                } else {
5549                    let output_name = alias.clone().unwrap_or_else(|| expr.to_string());
5550                    out.push(ProjectedItem {
5551                        expr: expr.clone(),
5552                        output_name,
5553                        ty: DataType::Text,
5554                        nullable: true,
5555                    });
5556                }
5557            }
5558        }
5559    }
5560    Ok(out)
5561}
5562
5563/// Promote an integer to a NUMERIC value at the requested scale.
5564/// Rejects values that, after scaling, would overflow the column's
5565/// precision budget.
5566fn numeric_from_integer(
5567    n: i128,
5568    precision: u8,
5569    scale: u8,
5570    col_name: &str,
5571) -> Result<Value, EngineError> {
5572    let factor = pow10_i128(scale);
5573    let scaled = n.checked_mul(factor).ok_or_else(|| {
5574        EngineError::Unsupported(alloc::format!(
5575            "integer overflow scaling value for column `{col_name}` to scale {scale}"
5576        ))
5577    })?;
5578    check_precision(scaled, precision, col_name)?;
5579    Ok(Value::Numeric { scaled, scale })
5580}
5581
5582/// Float → NUMERIC. Uses round-half-away-from-zero on `x * 10^scale`,
5583/// then verifies the result fits the column's precision.
5584#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
5585fn numeric_from_float(
5586    x: f64,
5587    precision: u8,
5588    scale: u8,
5589    col_name: &str,
5590) -> Result<Value, EngineError> {
5591    if !x.is_finite() {
5592        return Err(EngineError::Unsupported(alloc::format!(
5593            "cannot store non-finite float in NUMERIC column `{col_name}`"
5594        )));
5595    }
5596    let mut factor = 1.0_f64;
5597    for _ in 0..scale {
5598        factor *= 10.0;
5599    }
5600    // Round half-away-from-zero by biasing then casting (`as i128`
5601    // truncates toward zero, so the bias + truncation gives the
5602    // desired rounding). `f64::floor` / `ceil` live in std; we don't
5603    // need them — the cast handles the truncation step.
5604    let shifted = x * factor;
5605    let biased = if shifted >= 0.0 {
5606        shifted + 0.5
5607    } else {
5608        shifted - 0.5
5609    };
5610    // Range-check before casting back to i128 — the cast itself is
5611    // saturating in Rust, which would silently truncate huge inputs.
5612    if !(-1e38..=1e38).contains(&biased) {
5613        return Err(EngineError::Unsupported(alloc::format!(
5614            "value {x} overflows NUMERIC range for column `{col_name}`"
5615        )));
5616    }
5617    let scaled = biased as i128;
5618    check_precision(scaled, precision, col_name)?;
5619    Ok(Value::Numeric { scaled, scale })
5620}
5621
5622/// Move a Numeric value from `src_scale` to `dst_scale`. Going up
5623/// multiplies by 10; going down rounds half-away-from-zero.
5624fn numeric_rescale(
5625    scaled: i128,
5626    src_scale: u8,
5627    precision: u8,
5628    dst_scale: u8,
5629    col_name: &str,
5630) -> Result<Value, EngineError> {
5631    let new_scaled = if dst_scale >= src_scale {
5632        let bump = pow10_i128(dst_scale - src_scale);
5633        scaled.checked_mul(bump).ok_or_else(|| {
5634            EngineError::Unsupported(alloc::format!(
5635                "overflow rescaling NUMERIC for column `{col_name}`"
5636            ))
5637        })?
5638    } else {
5639        let drop = pow10_i128(src_scale - dst_scale);
5640        let half = drop / 2;
5641        if scaled >= 0 {
5642            (scaled + half) / drop
5643        } else {
5644            (scaled - half) / drop
5645        }
5646    };
5647    check_precision(new_scaled, precision, col_name)?;
5648    Ok(Value::Numeric {
5649        scaled: new_scaled,
5650        scale: dst_scale,
5651    })
5652}
5653
5654/// Drop the fractional part of a scaled integer, returning the integer
5655/// portion (toward zero). Used for NUMERIC → INT casts.
5656const fn numeric_truncate_to_integer(scaled: i128, scale: u8) -> i128 {
5657    if scale == 0 {
5658        return scaled;
5659    }
5660    let factor = pow10_i128_const(scale);
5661    scaled / factor
5662}
5663
5664/// Verify a scaled NUMERIC value fits the column's declared precision.
5665/// `precision == 0` is the "unconstrained" form (bare `NUMERIC`); we
5666/// skip the check there.
5667fn check_precision(scaled: i128, precision: u8, col_name: &str) -> Result<(), EngineError> {
5668    if precision == 0 {
5669        return Ok(());
5670    }
5671    let limit = pow10_i128(precision);
5672    if scaled.unsigned_abs() >= limit.unsigned_abs() {
5673        return Err(EngineError::Unsupported(alloc::format!(
5674            "NUMERIC value exceeds precision {precision} for column `{col_name}`"
5675        )));
5676    }
5677    Ok(())
5678}
5679
/// `10^p` as an `i128`, usable in const contexts.
///
/// Only exponents up to 38 are representable; `10^39` exceeds
/// `i128::MAX`, so larger `p` overflows the multiplication.
const fn pow10_i128_const(p: u8) -> i128 {
    let mut result: i128 = 1;
    let mut remaining = p;
    // Plain countdown loop: `for` is not available in const fn.
    while remaining > 0 {
        result *= 10;
        remaining -= 1;
    }
    result
}
5689
/// `10^p` as an `i128`. Non-const entry point that delegates to
/// `pow10_i128_const`, so it shares that function's overflow behaviour
/// for exponents whose power of ten does not fit an `i128`.
fn pow10_i128(p: u8) -> i128 {
    pow10_i128_const(p)
}
5693
/// Clock rewrite: walk a parsed `Statement`, swapping any `NOW()` /
/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()` function calls for a
/// literal cast that wraps the engine's per-statement clock reading.
/// When `now_micros` is `None`, calls stay as-is and surface as
/// `unknown function` at eval time — this keeps the error path
/// explicit.
///
/// v4.10 subquery pre-resolution: pre-walk the WHERE / projection /
/// etc. of a SELECT and replace every subquery node with a
/// materialised literal. SPG only supports uncorrelated subqueries —
/// the inner SELECT does not see outer-row columns, so the result is
/// the same for every outer row and can be evaluated once.
///
/// Returns the rewritten statement; the caller passes this to the
/// regular row-loop executor, which no longer sees Subquery nodes
/// in its tree.
5708impl Engine {
5709    /// v4.12 window executor. Implements `ROW_NUMBER` / `RANK` /
5710    /// `DENSE_RANK` and the partition-aware aggregates `SUM` /
5711    /// `AVG` / `COUNT` / `MIN` / `MAX`. The plan is:
5712    /// 1. Apply the WHERE filter.
5713    /// 2. For each unique `WindowFunction` node in the projection,
5714    ///    partition + sort, compute the per-row value.
5715    /// 3. Append the window values as synthetic columns (`__win_N`)
5716    ///    to the row schema.
5717    /// 4. Rewrite the projection to read those columns.
5718    /// 5. Hand off to the regular project / ORDER BY / LIMIT pipe.
5719    #[allow(
5720        clippy::too_many_lines,
5721        clippy::type_complexity,
5722        clippy::needless_range_loop
5723    )] // window-eval is one cohesive pipe; splitting fragments
5724    fn exec_select_with_window(
5725        &self,
5726        stmt: &SelectStatement,
5727        cancel: CancelToken<'_>,
5728    ) -> Result<QueryResult, EngineError> {
5729        let from = stmt.from.as_ref().ok_or_else(|| {
5730            EngineError::Unsupported("window functions require a FROM clause".into())
5731        })?;
5732        // For v4.12 we only support a single-table FROM. Joins +
5733        // windows is queued for v5.x.
5734        if !from.joins.is_empty() {
5735            return Err(EngineError::Unsupported(
5736                "JOIN with window functions not yet supported".into(),
5737            ));
5738        }
5739        let primary = &from.primary;
5740        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
5741            StorageError::TableNotFound {
5742                name: primary.name.clone(),
5743            }
5744        })?;
5745        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
5746        let schema_cols = &table.schema().columns;
5747        let ctx = self.ev_ctx(schema_cols, Some(alias));
5748
5749        // 1) Filter pass.
5750        let mut filtered: Vec<&Row> = Vec::new();
5751        for (i, row) in table.rows().iter().enumerate() {
5752            if i.is_multiple_of(256) {
5753                cancel.check()?;
5754            }
5755            if let Some(w) = &stmt.where_ {
5756                let cond = eval::eval_expr(w, row, &ctx)?;
5757                if !matches!(cond, Value::Bool(true)) {
5758                    continue;
5759                }
5760            }
5761            filtered.push(row);
5762        }
5763        let n_rows = filtered.len();
5764
5765        // 2) Collect unique window function nodes from projection.
5766        let mut window_nodes: Vec<Expr> = Vec::new();
5767        for item in &stmt.items {
5768            if let SelectItem::Expr { expr, .. } = item {
5769                collect_window_nodes(expr, &mut window_nodes);
5770            }
5771        }
5772
5773        // 3) For each window, compute per-row value.
5774        // Index: same order as window_nodes; for row i, win_vals[w][i].
5775        let mut win_vals: Vec<Vec<Value>> = Vec::with_capacity(window_nodes.len());
5776        for wnode in &window_nodes {
5777            let Expr::WindowFunction {
5778                name,
5779                args,
5780                partition_by,
5781                order_by,
5782                frame,
5783                null_treatment,
5784            } = wnode
5785            else {
5786                unreachable!("collect_window_nodes pushes only WindowFunction");
5787            };
5788            // Compute (partition_key, order_key, original_index) for each row.
5789            let mut indexed: Vec<(Vec<Value>, Vec<(Value, bool)>, usize)> =
5790                Vec::with_capacity(n_rows);
5791            for (i, row) in filtered.iter().enumerate() {
5792                let pkey: Vec<Value> = partition_by
5793                    .iter()
5794                    .map(|p| eval::eval_expr(p, row, &ctx))
5795                    .collect::<Result<_, _>>()?;
5796                let okey: Vec<(Value, bool)> = order_by
5797                    .iter()
5798                    .map(|(e, desc)| eval::eval_expr(e, row, &ctx).map(|v| (v, *desc)))
5799                    .collect::<Result<_, _>>()?;
5800                indexed.push((pkey, okey, i));
5801            }
5802            // Sort by (partition_key, order_key). Partition key uses
5803            // a stable encoded form; order key respects ASC/DESC.
5804            indexed.sort_by(|a, b| {
5805                let p_cmp = partition_key_cmp(&a.0, &b.0);
5806                if p_cmp != core::cmp::Ordering::Equal {
5807                    return p_cmp;
5808                }
5809                order_key_cmp(&a.1, &b.1)
5810            });
5811            // Per-partition compute.
5812            let mut out_vals: Vec<Value> = alloc::vec![Value::Null; n_rows];
5813            let mut p_start = 0;
5814            while p_start < indexed.len() {
5815                let mut p_end = p_start + 1;
5816                while p_end < indexed.len()
5817                    && partition_key_cmp(&indexed[p_start].0, &indexed[p_end].0)
5818                        == core::cmp::Ordering::Equal
5819                {
5820                    p_end += 1;
5821                }
5822                // Compute the function within this partition slice.
5823                compute_window_partition(
5824                    name,
5825                    args,
5826                    !order_by.is_empty(),
5827                    frame.as_ref(),
5828                    *null_treatment,
5829                    &indexed[p_start..p_end],
5830                    &filtered,
5831                    &ctx,
5832                    &mut out_vals,
5833                )?;
5834                p_start = p_end;
5835            }
5836            win_vals.push(out_vals);
5837        }
5838
5839        // 4) Build extended schema: original columns + synthetic.
5840        let mut ext_cols = schema_cols.clone();
5841        for i in 0..window_nodes.len() {
5842            ext_cols.push(ColumnSchema::new(
5843                alloc::format!("__win_{i}"),
5844                DataType::Text, // type doesn't matter for projection eval
5845                true,
5846            ));
5847        }
5848        // 5) Build extended rows: each row gets its window values appended.
5849        let mut ext_rows: Vec<Row> = Vec::with_capacity(n_rows);
5850        for i in 0..n_rows {
5851            let mut values = filtered[i].values.clone();
5852            for w in 0..window_nodes.len() {
5853                values.push(win_vals[w][i].clone());
5854            }
5855            ext_rows.push(Row::new(values));
5856        }
5857        // 6) Rewrite the projection: WindowFunction nodes → Column(__win_N).
5858        let mut rewritten_items: Vec<SelectItem> = Vec::with_capacity(stmt.items.len());
5859        for item in &stmt.items {
5860            let new_item = match item {
5861                SelectItem::Wildcard => SelectItem::Wildcard,
5862                SelectItem::Expr { expr, alias } => {
5863                    let mut e = expr.clone();
5864                    rewrite_window_to_columns(&mut e, &window_nodes);
5865                    SelectItem::Expr {
5866                        expr: e,
5867                        alias: alias.clone(),
5868                    }
5869                }
5870            };
5871            rewritten_items.push(new_item);
5872        }
5873
5874        // 7) Project into final rows.
5875        let ext_ctx = EvalContext::new(&ext_cols, Some(alias));
5876        let projection = build_projection(&rewritten_items, &ext_cols, alias)?;
5877        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(n_rows);
5878        for (i, row) in ext_rows.iter().enumerate() {
5879            if i.is_multiple_of(256) {
5880                cancel.check()?;
5881            }
5882            let mut values = Vec::with_capacity(projection.len());
5883            for p in &projection {
5884                values.push(eval::eval_expr(&p.expr, row, &ext_ctx)?);
5885            }
5886            let order_keys = if stmt.order_by.is_empty() {
5887                Vec::new()
5888            } else {
5889                let mut keys = Vec::with_capacity(stmt.order_by.len());
5890                for o in &stmt.order_by {
5891                    let mut e = o.expr.clone();
5892                    rewrite_window_to_columns(&mut e, &window_nodes);
5893                    let key = eval::eval_expr(&e, row, &ext_ctx)?;
5894                    keys.push(value_to_order_key(&key)?);
5895                }
5896                keys
5897            };
5898            tagged.push((order_keys, Row::new(values)));
5899        }
5900        // ORDER BY + LIMIT/OFFSET on the projected rows.
5901        if !stmt.order_by.is_empty() {
5902            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
5903            sort_by_keys(&mut tagged, &descs);
5904        }
5905        let mut out_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
5906        apply_offset_and_limit(&mut out_rows, stmt.offset_literal(), stmt.limit_literal());
5907        let final_cols: Vec<ColumnSchema> = projection
5908            .into_iter()
5909            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
5910            .collect();
5911        Ok(QueryResult::Rows {
5912            columns: final_cols,
5913            rows: out_rows,
5914        })
5915    }
5916
5917    /// v4.11: materialise each CTE into a temp table inside a
5918    /// cloned catalog, then run the body SELECT against a fresh
5919    /// engine instance that owns the enriched catalog. The clone
5920    /// is moderately expensive — only paid by CTE-bearing queries.
5921    /// Subqueries inside CTE bodies / the main body resolve as
5922    /// usual; `clock_fn` is propagated so `NOW()` lines up.
5923    fn exec_with_ctes(
5924        &self,
5925        stmt: &SelectStatement,
5926        cancel: CancelToken<'_>,
5927    ) -> Result<QueryResult, EngineError> {
5928        cancel.check()?;
5929        let mut catalog = self.active_catalog().clone();
5930        for cte in &stmt.ctes {
5931            if catalog.get(&cte.name).is_some() {
5932                return Err(EngineError::Unsupported(alloc::format!(
5933                    "CTE name {:?} shadows an existing table; rename the CTE",
5934                    cte.name
5935                )));
5936            }
5937            let (columns, rows) = if cte.recursive {
5938                self.materialise_recursive_cte(cte, &catalog, cancel)?
5939            } else {
5940                let body_result = self.exec_select_cancel(&cte.body, cancel)?;
5941                let QueryResult::Rows { columns, rows } = body_result else {
5942                    return Err(EngineError::Unsupported(alloc::format!(
5943                        "CTE {:?} body did not return rows",
5944                        cte.name
5945                    )));
5946                };
5947                (columns, rows)
5948            };
5949            // v4.22: the projection builder labels any non-column
5950            // expression as Text — including literal SELECT 1.
5951            // Promote each column's type to whatever the rows
5952            // actually carry so the CTE storage table accepts them.
5953            let inferred = infer_column_types(&columns, &rows);
5954            let mut columns = inferred;
5955            // v4.22: apply optional `WITH name(a, b, c)` overrides.
5956            if !cte.column_overrides.is_empty() {
5957                if cte.column_overrides.len() != columns.len() {
5958                    return Err(EngineError::Unsupported(alloc::format!(
5959                        "CTE {:?} column list has {} names but body returns {} columns",
5960                        cte.name,
5961                        cte.column_overrides.len(),
5962                        columns.len()
5963                    )));
5964                }
5965                for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
5966                    col.name.clone_from(name);
5967                }
5968            }
5969            let schema = TableSchema::new(cte.name.clone(), columns);
5970            catalog.create_table(schema).map_err(EngineError::Storage)?;
5971            let table = catalog
5972                .get_mut(&cte.name)
5973                .expect("just-created CTE table must exist");
5974            for row in rows {
5975                table.insert(row).map_err(EngineError::Storage)?;
5976            }
5977        }
5978        // Strip CTEs from the body before running on the temp engine
5979        // so we don't recurse forever.
5980        let mut body = stmt.clone();
5981        body.ctes = Vec::new();
5982        let mut temp = Engine::restore(catalog);
5983        if let Some(c) = self.clock {
5984            temp = temp.with_clock(c);
5985        }
5986        if let Some(f) = self.salt_fn {
5987            temp = temp.with_salt_fn(f);
5988        }
5989        temp.exec_select_cancel(&body, cancel)
5990    }
5991
5992    /// v4.22: materialise a WITH RECURSIVE CTE. The body must be a
5993    /// UNION (or UNION ALL) of an anchor that does not reference
5994    /// the CTE name, and one or more recursive terms that do. The
5995    /// anchor runs first; each subsequent iteration runs the
5996    /// recursive term against a temp catalog where the CTE name is
5997    /// bound to the *previous* iteration's output. Iteration stops
5998    /// when the recursive term yields no rows; UNION (DISTINCT)
5999    /// deduplicates against the accumulated result, UNION ALL does
6000    /// not. A hard cap on total rows prevents runaway queries.
6001    #[allow(clippy::too_many_lines)]
6002    fn materialise_recursive_cte(
6003        &self,
6004        cte: &spg_sql::ast::Cte,
6005        base_catalog: &Catalog,
6006        cancel: CancelToken<'_>,
6007    ) -> Result<(Vec<ColumnSchema>, Vec<Row>), EngineError> {
6008        const MAX_TOTAL_ROWS: usize = 1_000_000;
6009        const MAX_ITERATIONS: usize = 100_000;
6010        cancel.check()?;
6011        if cte.body.unions.is_empty() {
6012            return Err(EngineError::Unsupported(alloc::format!(
6013                "WITH RECURSIVE {:?} body must be a UNION of an anchor and a recursive term",
6014                cte.name
6015            )));
6016        }
6017        // Anchor: the body's leading SELECT, with unions stripped.
6018        let mut anchor = cte.body.clone();
6019        let union_terms = core::mem::take(&mut anchor.unions);
6020        anchor.ctes = Vec::new();
6021        // Anchor must not reference the CTE name.
6022        if select_refers_to(&anchor, &cte.name) {
6023            return Err(EngineError::Unsupported(alloc::format!(
6024                "WITH RECURSIVE {:?}: the anchor must not reference the CTE itself",
6025                cte.name
6026            )));
6027        }
6028        let anchor_result = self.exec_select_cancel(&anchor, cancel)?;
6029        let QueryResult::Rows {
6030            columns: anchor_cols,
6031            rows: anchor_rows,
6032        } = anchor_result
6033        else {
6034            return Err(EngineError::Unsupported(alloc::format!(
6035                "WITH RECURSIVE {:?}: anchor did not return rows",
6036                cte.name
6037            )));
6038        };
6039        // The projection builder labels non-column expressions Text;
6040        // refine column types from the anchor's actual values so the
6041        // intermediate iter-catalog tables accept them.
6042        let mut columns = infer_column_types(&anchor_cols, &anchor_rows);
6043        if !cte.column_overrides.is_empty() {
6044            if cte.column_overrides.len() != columns.len() {
6045                return Err(EngineError::Unsupported(alloc::format!(
6046                    "CTE {:?} column list has {} names but anchor returns {} columns",
6047                    cte.name,
6048                    cte.column_overrides.len(),
6049                    columns.len()
6050                )));
6051            }
6052            for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
6053                col.name.clone_from(name);
6054            }
6055        }
6056        let mut all_rows: Vec<Row> = anchor_rows.clone();
6057        let mut working_set: Vec<Row> = anchor_rows;
6058        let mut seen: alloc::collections::BTreeSet<Vec<u8>> = alloc::collections::BTreeSet::new();
6059        // Track at least one "all UNION ALL" flag — if every union
6060        // kind is ALL we skip the dedup step (faster + matches PG).
6061        let all_union_all = union_terms.iter().all(|(k, _)| matches!(k, UnionKind::All));
6062        if !all_union_all {
6063            for r in &all_rows {
6064                seen.insert(encode_row_key(r));
6065            }
6066        }
6067        for iter in 0..MAX_ITERATIONS {
6068            cancel.check()?;
6069            if working_set.is_empty() {
6070                break;
6071            }
6072            // Build a fresh catalog: base + CTE bound to working_set.
6073            let mut iter_catalog = base_catalog.clone();
6074            let schema = TableSchema::new(cte.name.clone(), columns.clone());
6075            iter_catalog
6076                .create_table(schema)
6077                .map_err(EngineError::Storage)?;
6078            {
6079                let table = iter_catalog.get_mut(&cte.name).expect("just-created");
6080                for row in &working_set {
6081                    table.insert(row.clone()).map_err(EngineError::Storage)?;
6082                }
6083            }
6084            let mut iter_engine = Engine::restore(iter_catalog);
6085            if let Some(c) = self.clock {
6086                iter_engine = iter_engine.with_clock(c);
6087            }
6088            if let Some(f) = self.salt_fn {
6089                iter_engine = iter_engine.with_salt_fn(f);
6090            }
6091            // Run each recursive term in sequence and collect new rows.
6092            let mut next_set: Vec<Row> = Vec::new();
6093            for (_, term) in &union_terms {
6094                let mut term = term.clone();
6095                term.ctes = Vec::new();
6096                let r = iter_engine.exec_select_cancel(&term, cancel)?;
6097                let QueryResult::Rows {
6098                    columns: rc,
6099                    rows: rs,
6100                } = r
6101                else {
6102                    return Err(EngineError::Unsupported(alloc::format!(
6103                        "WITH RECURSIVE {:?}: recursive term did not return rows",
6104                        cte.name
6105                    )));
6106                };
6107                if rc.len() != columns.len() {
6108                    return Err(EngineError::Unsupported(alloc::format!(
6109                        "WITH RECURSIVE {:?}: column count of recursive term ({}) does not match anchor ({})",
6110                        cte.name,
6111                        rc.len(),
6112                        columns.len()
6113                    )));
6114                }
6115                for row in rs {
6116                    if !all_union_all {
6117                        let key = encode_row_key(&row);
6118                        if !seen.insert(key) {
6119                            continue;
6120                        }
6121                    }
6122                    next_set.push(row);
6123                }
6124            }
6125            if next_set.is_empty() {
6126                break;
6127            }
6128            all_rows.extend(next_set.iter().cloned());
6129            working_set = next_set;
6130            if all_rows.len() > MAX_TOTAL_ROWS {
6131                return Err(EngineError::Unsupported(alloc::format!(
6132                    "WITH RECURSIVE {:?}: produced more than {MAX_TOTAL_ROWS} rows — likely runaway recursion",
6133                    cte.name
6134                )));
6135            }
6136            if iter + 1 == MAX_ITERATIONS {
6137                return Err(EngineError::Unsupported(alloc::format!(
6138                    "WITH RECURSIVE {:?}: exceeded {MAX_ITERATIONS} iterations",
6139                    cte.name
6140                )));
6141            }
6142        }
6143        Ok((columns, all_rows))
6144    }
6145
6146    fn resolve_select_subqueries(
6147        &self,
6148        stmt: &mut SelectStatement,
6149        cancel: CancelToken<'_>,
6150    ) -> Result<(), EngineError> {
6151        for item in &mut stmt.items {
6152            if let SelectItem::Expr { expr, .. } = item {
6153                self.resolve_expr_subqueries(expr, cancel)?;
6154            }
6155        }
6156        if let Some(w) = &mut stmt.where_ {
6157            self.resolve_expr_subqueries(w, cancel)?;
6158        }
6159        if let Some(gs) = &mut stmt.group_by {
6160            for g in gs {
6161                self.resolve_expr_subqueries(g, cancel)?;
6162            }
6163        }
6164        if let Some(h) = &mut stmt.having {
6165            self.resolve_expr_subqueries(h, cancel)?;
6166        }
6167        for o in &mut stmt.order_by {
6168            self.resolve_expr_subqueries(&mut o.expr, cancel)?;
6169        }
6170        for (_, peer) in &mut stmt.unions {
6171            self.resolve_select_subqueries(peer, cancel)?;
6172        }
6173        Ok(())
6174    }
6175
6176    #[allow(clippy::only_used_in_recursion)] // engine handle reads aren't really pure
6177    fn resolve_expr_subqueries(
6178        &self,
6179        e: &mut Expr,
6180        cancel: CancelToken<'_>,
6181    ) -> Result<(), EngineError> {
6182        // Replace-on-this-node cases first.
6183        if let Some(replacement) = self.subquery_replacement(e, cancel)? {
6184            *e = replacement;
6185            return Ok(());
6186        }
6187        match e {
6188            Expr::Binary { lhs, rhs, .. } => {
6189                self.resolve_expr_subqueries(lhs, cancel)?;
6190                self.resolve_expr_subqueries(rhs, cancel)?;
6191            }
6192            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6193                self.resolve_expr_subqueries(expr, cancel)?;
6194            }
6195            Expr::FunctionCall { args, .. } => {
6196                for a in args {
6197                    self.resolve_expr_subqueries(a, cancel)?;
6198                }
6199            }
6200            Expr::Like { expr, pattern, .. } => {
6201                self.resolve_expr_subqueries(expr, cancel)?;
6202                self.resolve_expr_subqueries(pattern, cancel)?;
6203            }
6204            Expr::Extract { source, .. } => self.resolve_expr_subqueries(source, cancel)?,
6205            // v4.12 window functions — recurse into args + ORDER BY
6206            // + PARTITION BY in case they carry inner subqueries.
6207            Expr::WindowFunction {
6208                args,
6209                partition_by,
6210                order_by,
6211                ..
6212            } => {
6213                for a in args {
6214                    self.resolve_expr_subqueries(a, cancel)?;
6215                }
6216                for p in partition_by {
6217                    self.resolve_expr_subqueries(p, cancel)?;
6218                }
6219                for (e, _) in order_by {
6220                    self.resolve_expr_subqueries(e, cancel)?;
6221                }
6222            }
6223            // Subquery nodes are handled in subquery_replacement
6224            // (which returned None — defensive no-op); Literal /
6225            // Column are leaves.
6226            Expr::ScalarSubquery(_)
6227            | Expr::Exists { .. }
6228            | Expr::InSubquery { .. }
6229            | Expr::Literal(_)
6230            | Expr::Placeholder(_)
6231            | Expr::Column(_) => {}
6232            // v7.10.10 — recurse children.
6233            Expr::Array(items) => {
6234                for elem in items {
6235                    self.resolve_expr_subqueries(elem, cancel)?;
6236                }
6237            }
6238            Expr::ArraySubscript { target, index } => {
6239                self.resolve_expr_subqueries(target, cancel)?;
6240                self.resolve_expr_subqueries(index, cancel)?;
6241            }
6242            Expr::AnyAll { expr, array, .. } => {
6243                self.resolve_expr_subqueries(expr, cancel)?;
6244                self.resolve_expr_subqueries(array, cancel)?;
6245            }
6246        }
6247        Ok(())
6248    }
6249
6250    /// v4.23: per-row eval that handles correlated subqueries.
6251    /// Equivalent to `eval::eval_expr` when the expression has no
6252    /// subqueries; otherwise clones the expression, substitutes
6253    /// outer-row columns into each surviving subquery node, runs
6254    /// the inner SELECT, and replaces the node with the literal
6255    /// result. Only the WHERE-filter call sites use this path so
6256    /// the uncorrelated fast path is preserved everywhere else.
6257    fn eval_expr_with_correlated(
6258        &self,
6259        expr: &Expr,
6260        row: &Row,
6261        ctx: &EvalContext<'_>,
6262        cancel: CancelToken<'_>,
6263        memo: Option<&mut memoize::MemoizeCache>,
6264    ) -> Result<Value, EngineError> {
6265        if !expr_has_subquery(expr) {
6266            return eval::eval_expr(expr, row, ctx).map_err(EngineError::Eval);
6267        }
6268        let mut e = expr.clone();
6269        self.resolve_correlated_in_expr(&mut e, row, ctx, cancel, memo)?;
6270        eval::eval_expr(&e, row, ctx).map_err(EngineError::Eval)
6271    }
6272
6273    fn resolve_correlated_in_expr(
6274        &self,
6275        e: &mut Expr,
6276        row: &Row,
6277        ctx: &EvalContext<'_>,
6278        cancel: CancelToken<'_>,
6279        mut memo: Option<&mut memoize::MemoizeCache>,
6280    ) -> Result<(), EngineError> {
6281        match e {
6282            Expr::ScalarSubquery(inner) => {
6283                // v6.2.6 — Memoize: build the cache key from the
6284                // pre-substitution subquery repr + the outer row's
6285                // values. Two outer rows with identical correlated
6286                // values hit the same entry.
6287                let cache_key = memo.as_ref().map(|_| memoize::CacheKey {
6288                    subquery_repr: alloc::format!("{}", **inner),
6289                    outer_values: row.values.clone(),
6290                });
6291                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key.as_ref())
6292                    && let Some(cached) = cache.get(k)
6293                {
6294                    *e = value_to_literal_expr(cached)?;
6295                    return Ok(());
6296                }
6297                let mut s = (**inner).clone();
6298                substitute_outer_columns(&mut s, row, ctx);
6299                let r = self.exec_select_cancel(&s, cancel)?;
6300                let QueryResult::Rows { rows, .. } = r else {
6301                    return Err(EngineError::Unsupported(
6302                        "scalar subquery: inner did not return rows".into(),
6303                    ));
6304                };
6305                let value = match rows.as_slice() {
6306                    [] => Value::Null,
6307                    [r0] => r0.values.first().cloned().unwrap_or(Value::Null),
6308                    _ => {
6309                        return Err(EngineError::Unsupported(alloc::format!(
6310                            "scalar subquery returned {} rows; expected 0 or 1",
6311                            rows.len()
6312                        )));
6313                    }
6314                };
6315                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key) {
6316                    cache.insert(k, value.clone());
6317                }
6318                *e = value_to_literal_expr(value)?;
6319            }
6320            Expr::Exists { subquery, negated } => {
6321                let mut s = (**subquery).clone();
6322                substitute_outer_columns(&mut s, row, ctx);
6323                let r = self.exec_select_cancel(&s, cancel)?;
6324                let exists = matches!(r, QueryResult::Rows { rows, .. } if !rows.is_empty());
6325                let bit = if *negated { !exists } else { exists };
6326                *e = Expr::Literal(Literal::Bool(bit));
6327            }
6328            Expr::InSubquery {
6329                expr: lhs,
6330                subquery,
6331                negated,
6332            } => {
6333                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
6334                let lhs_val = eval::eval_expr(lhs, row, ctx).map_err(EngineError::Eval)?;
6335                let mut s = (**subquery).clone();
6336                substitute_outer_columns(&mut s, row, ctx);
6337                let r = self.exec_select_cancel(&s, cancel)?;
6338                let QueryResult::Rows { columns, rows, .. } = r else {
6339                    return Err(EngineError::Unsupported(
6340                        "IN-subquery: inner did not return rows".into(),
6341                    ));
6342                };
6343                if columns.len() != 1 {
6344                    return Err(EngineError::Unsupported(alloc::format!(
6345                        "IN-subquery must project exactly one column; got {}",
6346                        columns.len()
6347                    )));
6348                }
6349                let mut found = false;
6350                let mut any_null = false;
6351                for r0 in rows {
6352                    let v = r0.values.into_iter().next().unwrap_or(Value::Null);
6353                    if v.is_null() {
6354                        any_null = true;
6355                        continue;
6356                    }
6357                    if value_cmp(&v, &lhs_val) == core::cmp::Ordering::Equal {
6358                        found = true;
6359                        break;
6360                    }
6361                }
6362                let bit = if found {
6363                    !*negated
6364                } else if any_null {
6365                    return Err(EngineError::Unsupported(
6366                        "IN-subquery with NULL in result and no match: NULL semantics not yet implemented".into(),
6367                    ));
6368                } else {
6369                    *negated
6370                };
6371                *e = Expr::Literal(Literal::Bool(bit));
6372            }
6373            Expr::Binary { lhs, rhs, .. } => {
6374                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
6375                self.resolve_correlated_in_expr(rhs, row, ctx, cancel, memo.as_deref_mut())?;
6376            }
6377            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6378                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
6379            }
6380            Expr::Like { expr, pattern, .. } => {
6381                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
6382                self.resolve_correlated_in_expr(pattern, row, ctx, cancel, memo.as_deref_mut())?;
6383            }
6384            Expr::FunctionCall { args, .. } => {
6385                for a in args {
6386                    self.resolve_correlated_in_expr(a, row, ctx, cancel, memo.as_deref_mut())?;
6387                }
6388            }
6389            Expr::Extract { source, .. } => {
6390                self.resolve_correlated_in_expr(source, row, ctx, cancel, memo.as_deref_mut())?;
6391            }
6392            Expr::WindowFunction { .. }
6393            | Expr::Literal(_)
6394            | Expr::Placeholder(_)
6395            | Expr::Column(_) => {}
6396            // v7.10.10 — recurse children.
6397            Expr::Array(items) => {
6398                for elem in items {
6399                    self.resolve_correlated_in_expr(elem, row, ctx, cancel, memo.as_deref_mut())?;
6400                }
6401            }
6402            Expr::ArraySubscript { target, index } => {
6403                self.resolve_correlated_in_expr(target, row, ctx, cancel, memo.as_deref_mut())?;
6404                self.resolve_correlated_in_expr(index, row, ctx, cancel, memo.as_deref_mut())?;
6405            }
6406            Expr::AnyAll { expr, array, .. } => {
6407                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
6408                self.resolve_correlated_in_expr(array, row, ctx, cancel, memo.as_deref_mut())?;
6409            }
6410        }
6411        Ok(())
6412    }
6413
6414    fn subquery_replacement(
6415        &self,
6416        e: &Expr,
6417        cancel: CancelToken<'_>,
6418    ) -> Result<Option<Expr>, EngineError> {
6419        match e {
6420            Expr::ScalarSubquery(inner) => {
6421                let mut s = (**inner).clone();
6422                // Recurse into the inner SELECT first so nested
6423                // subqueries materialise bottom-up.
6424                self.resolve_select_subqueries(&mut s, cancel)?;
6425                let r = match self.exec_bare_select_cancel(&s, cancel) {
6426                    Ok(r) => r,
6427                    Err(e) if is_correlation_error(&e) => return Ok(None),
6428                    Err(e) => return Err(e),
6429                };
6430                let QueryResult::Rows { rows, .. } = r else {
6431                    return Err(EngineError::Unsupported(
6432                        "scalar subquery: inner statement did not return rows".into(),
6433                    ));
6434                };
6435                let value = match rows.as_slice() {
6436                    [] => Value::Null,
6437                    [row] => row.values.first().cloned().unwrap_or(Value::Null),
6438                    _ => {
6439                        return Err(EngineError::Unsupported(alloc::format!(
6440                            "scalar subquery returned {} rows; expected 0 or 1",
6441                            rows.len()
6442                        )));
6443                    }
6444                };
6445                Ok(Some(value_to_literal_expr(value)?))
6446            }
6447            Expr::Exists { subquery, negated } => {
6448                let mut s = (**subquery).clone();
6449                self.resolve_select_subqueries(&mut s, cancel)?;
6450                let r = match self.exec_bare_select_cancel(&s, cancel) {
6451                    Ok(r) => r,
6452                    Err(e) if is_correlation_error(&e) => return Ok(None),
6453                    Err(e) => return Err(e),
6454                };
6455                let exists = match r {
6456                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
6457                    QueryResult::CommandOk { .. } => false,
6458                };
6459                let bit = if *negated { !exists } else { exists };
6460                Ok(Some(Expr::Literal(Literal::Bool(bit))))
6461            }
6462            Expr::InSubquery {
6463                expr,
6464                subquery,
6465                negated,
6466            } => {
6467                let mut s = (**subquery).clone();
6468                self.resolve_select_subqueries(&mut s, cancel)?;
6469                let r = match self.exec_bare_select_cancel(&s, cancel) {
6470                    Ok(r) => r,
6471                    Err(e) if is_correlation_error(&e) => return Ok(None),
6472                    Err(e) => return Err(e),
6473                };
6474                let QueryResult::Rows { columns, rows, .. } = r else {
6475                    return Err(EngineError::Unsupported(
6476                        "IN-subquery: inner statement did not return rows".into(),
6477                    ));
6478                };
6479                if columns.len() != 1 {
6480                    return Err(EngineError::Unsupported(alloc::format!(
6481                        "IN-subquery must project exactly one column; got {}",
6482                        columns.len()
6483                    )));
6484                }
6485                // Build the same OR-Eq chain the parse-time literal-list
6486                // path constructs, with each value lifted into a Literal.
6487                let mut acc: Option<Expr> = None;
6488                for row in rows {
6489                    let v = row.values.into_iter().next().unwrap_or(Value::Null);
6490                    let lit = value_to_literal_expr(v)?;
6491                    let cmp = Expr::Binary {
6492                        lhs: expr.clone(),
6493                        op: BinOp::Eq,
6494                        rhs: Box::new(lit),
6495                    };
6496                    acc = Some(match acc {
6497                        None => cmp,
6498                        Some(prev) => Expr::Binary {
6499                            lhs: Box::new(prev),
6500                            op: BinOp::Or,
6501                            rhs: Box::new(cmp),
6502                        },
6503                    });
6504                }
6505                let combined = acc.unwrap_or(Expr::Literal(Literal::Bool(false)));
6506                let final_expr = if *negated {
6507                    Expr::Unary {
6508                        op: UnOp::Not,
6509                        expr: Box::new(combined),
6510                    }
6511                } else {
6512                    combined
6513                };
6514                Ok(Some(final_expr))
6515            }
6516            _ => Ok(None),
6517        }
6518    }
6519}
6520
6521// ---- v4.12 window-function helpers ----
6522// The (partition-key, order-key, original-index) tuple shape used
6523// across these helpers is intrinsic to the planner. Factoring it
6524// into a typedef adds indirection without making the code clearer,
6525// so several lints are allowed inline on the affected functions
6526// rather than module-wide.
6527
6528/// v4.22: cheap structural scan for `FROM <name>` (qualified or
6529/// not) inside a SELECT — used to verify the anchor of a WITH
6530/// RECURSIVE CTE doesn't recurse into itself. Conservative: walks
6531/// FROM joins, subqueries, and unions.
6532fn select_refers_to(stmt: &SelectStatement, target: &str) -> bool {
6533    if let Some(from) = &stmt.from
6534        && from_refers_to(from, target)
6535    {
6536        return true;
6537    }
6538    for (_, peer) in &stmt.unions {
6539        if select_refers_to(peer, target) {
6540            return true;
6541        }
6542    }
6543    for item in &stmt.items {
6544        if let SelectItem::Expr { expr, .. } = item
6545            && expr_refers_to(expr, target)
6546        {
6547            return true;
6548        }
6549    }
6550    if let Some(w) = &stmt.where_
6551        && expr_refers_to(w, target)
6552    {
6553        return true;
6554    }
6555    false
6556}
6557
6558fn from_refers_to(from: &FromClause, target: &str) -> bool {
6559    if from.primary.name.eq_ignore_ascii_case(target) {
6560        return true;
6561    }
6562    from.joins
6563        .iter()
6564        .any(|j| j.table.name.eq_ignore_ascii_case(target))
6565}
6566
6567fn expr_refers_to(e: &Expr, target: &str) -> bool {
6568    match e {
6569        Expr::ScalarSubquery(s) => select_refers_to(s, target),
6570        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
6571            select_refers_to(subquery, target)
6572        }
6573        Expr::Binary { lhs, rhs, .. } => expr_refers_to(lhs, target) || expr_refers_to(rhs, target),
6574        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6575            expr_refers_to(expr, target)
6576        }
6577        Expr::Like { expr, pattern, .. } => {
6578            expr_refers_to(expr, target) || expr_refers_to(pattern, target)
6579        }
6580        Expr::FunctionCall { args, .. } => args.iter().any(|a| expr_refers_to(a, target)),
6581        Expr::Extract { source, .. } => expr_refers_to(source, target),
6582        Expr::WindowFunction {
6583            args,
6584            partition_by,
6585            order_by,
6586            ..
6587        } => {
6588            args.iter().any(|a| expr_refers_to(a, target))
6589                || partition_by.iter().any(|p| expr_refers_to(p, target))
6590                || order_by.iter().any(|(o, _)| expr_refers_to(o, target))
6591        }
6592        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
6593        Expr::Array(items) => items.iter().any(|e| expr_refers_to(e, target)),
6594        Expr::ArraySubscript { target: t, index } => {
6595            expr_refers_to(t, target) || expr_refers_to(index, target)
6596        }
6597        Expr::AnyAll { expr, array, .. } => {
6598            expr_refers_to(expr, target) || expr_refers_to(array, target)
6599        }
6600    }
6601}
6602
6603/// v4.22: pick more specific column types from observed rows when
6604/// the projection builder defaulted to Text (the v1.x behavior for
6605/// non-column expressions). Lets `WITH t(n) AS (SELECT 1 ...)`
6606/// land an Int column in the CTE storage table rather than failing
6607/// the insert with "expected TEXT, got INT".
6608fn infer_column_types(columns: &[ColumnSchema], rows: &[Row]) -> Vec<ColumnSchema> {
6609    let mut out = columns.to_vec();
6610    for (col_idx, col) in out.iter_mut().enumerate() {
6611        if col.ty != DataType::Text {
6612            continue;
6613        }
6614        let mut inferred: Option<DataType> = None;
6615        let mut all_null = true;
6616        for row in rows {
6617            let Some(v) = row.values.get(col_idx) else {
6618                continue;
6619            };
6620            let ty = match v {
6621                Value::Null => continue,
6622                Value::SmallInt(_) => DataType::SmallInt,
6623                Value::Int(_) => DataType::Int,
6624                Value::BigInt(_) => DataType::BigInt,
6625                Value::Float(_) => DataType::Float,
6626                Value::Bool(_) => DataType::Bool,
6627                Value::Vector(_) => DataType::Vector {
6628                    dim: 0,
6629                    encoding: VecEncoding::F32,
6630                },
6631                _ => DataType::Text,
6632            };
6633            all_null = false;
6634            inferred = Some(match inferred {
6635                None => ty,
6636                Some(prev) if prev == ty => prev,
6637                Some(_) => DataType::Text,
6638            });
6639        }
6640        if let Some(t) = inferred {
6641            col.ty = t;
6642            col.nullable = true;
6643        } else if all_null {
6644            col.nullable = true;
6645        }
6646    }
6647    out
6648}
6649
6650/// v4.26: render a human-readable plan tree for `EXPLAIN <select>`.
6651/// Lines are pushed into `out`; `depth` controls indentation. We
6652/// describe the rewritten SELECT — what the executor *would* do —
6653/// using the engine handle to spot indexed lookups and table shapes.
6654#[allow(clippy::too_many_lines, clippy::format_push_string)]
6655/// v6.2.4 — Walk every line of the rendered plan tree and append
6656/// per-operator stats. Lines that name a known operator get
6657/// `(rows=N)` (`actual_rows` of the top-level operator equals the
6658/// final result row count; scans report their catalog row count
6659/// as the rows-considered metric). Other lines — Filter / Join /
6660/// GroupBy / OrderBy etc. — are marked `(—)` so the surface is
6661/// complete-by-construction; v6.2.5 fills these in via inline
6662/// executor counters.
6663/// v6.8.3 — surface "CREATE INDEX …" suggestions for every
6664/// `(table, column)` pair the query touches via WHERE / JOIN
6665/// that doesn't already have an index on the owning table.
6666/// Walks the SELECT's FROM clauses + WHERE expression tree;
6667/// returns one line per missing index. Deterministic order:
6668/// FROM-clause iteration order, then column-reference walk
6669/// order inside each WHERE. Each suggestion is a copy-pastable
6670/// DDL string.
6671fn build_index_suggestions(stmt: &SelectStatement, engine: &Engine) -> Vec<String> {
6672    use alloc::collections::BTreeSet;
6673    let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
6674    let mut out: Vec<String> = Vec::new();
6675    let cat = engine.active_catalog();
6676    // Build a (table, qualifier-or-alias) list from the FROM clause
6677    // so unqualified column refs in WHERE resolve to the correct
6678    // table.
6679    let Some(from) = &stmt.from else {
6680        return out;
6681    };
6682    let mut tables: Vec<String> = Vec::new();
6683    tables.push(from.primary.name.clone());
6684    for j in &from.joins {
6685        tables.push(j.table.name.clone());
6686    }
6687    // Collect column refs from the WHERE expression. JOIN ON
6688    // predicates also feed in.
6689    let mut col_refs: Vec<spg_sql::ast::ColumnName> = Vec::new();
6690    if let Some(w) = &stmt.where_ {
6691        collect_column_refs(w, &mut col_refs);
6692    }
6693    for j in &from.joins {
6694        if let Some(on) = &j.on {
6695            collect_column_refs(on, &mut col_refs);
6696        }
6697    }
6698    for cn in &col_refs {
6699        // Resolve owner table: explicit qualifier first, else
6700        // first table in FROM that has a column of this name.
6701        let owner: Option<String> = if let Some(q) = &cn.qualifier {
6702            tables.iter().find(|t| t == &q).cloned()
6703        } else {
6704            tables.iter().find_map(|t| {
6705                cat.get(t).and_then(|tbl| {
6706                    if tbl.schema().column_position(&cn.name).is_some() {
6707                        Some(t.clone())
6708                    } else {
6709                        None
6710                    }
6711                })
6712            })
6713        };
6714        let Some(owner) = owner else {
6715            continue;
6716        };
6717        let Some(tbl) = cat.get(&owner) else {
6718            continue;
6719        };
6720        let Some(col_pos) = tbl.schema().column_position(&cn.name) else {
6721            continue;
6722        };
6723        // Skip if any BTree index already covers this column as
6724        // its key.
6725        let already_indexed = tbl.indices().iter().any(|i| {
6726            matches!(i.kind, spg_storage::IndexKind::BTree(_))
6727                && i.column_position == col_pos
6728                && i.expression.is_none()
6729                && i.partial_predicate.is_none()
6730        });
6731        if already_indexed {
6732            continue;
6733        }
6734        if seen.insert((owner.clone(), cn.name.clone())) {
6735            out.push(alloc::format!(
6736                "SUGGEST: CREATE INDEX ix_{}_{} ON {} ({})",
6737                owner,
6738                cn.name,
6739                owner,
6740                cn.name
6741            ));
6742        }
6743    }
6744    out
6745}
6746
6747/// Walks an `Expr` and pushes every `ColumnName` it references.
6748/// Order is depth-first, left-to-right.
6749fn collect_column_refs(expr: &Expr, out: &mut Vec<spg_sql::ast::ColumnName>) {
6750    match expr {
6751        Expr::Column(cn) => out.push(cn.clone()),
6752        Expr::FunctionCall { args, .. } => {
6753            for a in args {
6754                collect_column_refs(a, out);
6755            }
6756        }
6757        Expr::Binary { lhs, rhs, .. } => {
6758            collect_column_refs(lhs, out);
6759            collect_column_refs(rhs, out);
6760        }
6761        Expr::Unary { expr: e, .. } => collect_column_refs(e, out),
6762        _ => {}
6763    }
6764}
6765
6766fn annotate_explain_lines(lines: &mut [String], total_rows: usize, engine: &Engine) {
6767    let catalog = engine.active_catalog();
6768    let cold_ids = catalog.cold_segment_ids_global();
6769    let any_cold = !cold_ids.is_empty();
6770    let cold_ids_repr = if any_cold {
6771        let mut s = alloc::string::String::from("[");
6772        for (i, id) in cold_ids.iter().enumerate() {
6773            if i > 0 {
6774                s.push(',');
6775            }
6776            s.push_str(&alloc::format!("{id}"));
6777        }
6778        s.push(']');
6779        s
6780    } else {
6781        alloc::string::String::new()
6782    };
6783    for (idx, line) in lines.iter_mut().enumerate() {
6784        let trimmed = line.trim_start();
6785        let is_top_level = idx == 0;
6786        if is_top_level {
6787            line.push_str(&alloc::format!(" (rows={total_rows})"));
6788            continue;
6789        }
6790        if let Some(rest) = trimmed.strip_prefix("From: ") {
6791            let (name, scan_kind) = match rest.split_once(" [") {
6792                Some((n, k)) => (n.trim(), k.trim_end_matches(']')),
6793                None => (rest.trim(), ""),
6794            };
6795            let bare = name.split_whitespace().next().unwrap_or(name);
6796            let hot = catalog.get(bare).map(|t| t.rows().len());
6797            // v6.2.7 — `cold_segments=[id0,id1,…]` enumerates every
6798            // cold-tier segment the scan COULD have walked. v6.2.x
6799            // can tighten to per-table by walking the table's
6800            // BTree-index cold locators.
6801            let annot = match (hot, scan_kind) {
6802                (Some(h), "full scan") => {
6803                    let mut s = alloc::format!(" (hot_rows={h}");
6804                    if any_cold {
6805                        s.push_str(&alloc::format!(
6806                            ", cold_tier=present, cold_segments={cold_ids_repr}"
6807                        ));
6808                    }
6809                    s.push(')');
6810                    s
6811                }
6812                (Some(h), "index seek") => {
6813                    let mut s = alloc::format!(" (hot_rows≤{h}");
6814                    if any_cold {
6815                        s.push_str(&alloc::format!(
6816                            ", cold_tier=present, cold_segments={cold_ids_repr}"
6817                        ));
6818                    }
6819                    s.push(')');
6820                    s
6821                }
6822                _ => " (rows=—)".to_string(),
6823            };
6824            line.push_str(&annot);
6825            continue;
6826        }
6827        // Filter / GroupBy / Having / OrderBy / Limit / Join etc.
6828        line.push_str(" (rows=—)");
6829    }
6830}
6831
6832fn explain_select(stmt: &SelectStatement, engine: &Engine, depth: usize, out: &mut Vec<String>) {
6833    let pad = "  ".repeat(depth);
6834    // 1) Top-level operator label.
6835    let top = if !stmt.ctes.is_empty() {
6836        if stmt.ctes.iter().any(|c| c.recursive) {
6837            "CTEScan (WITH RECURSIVE)"
6838        } else {
6839            "CTEScan (WITH)"
6840        }
6841    } else if !stmt.unions.is_empty() {
6842        "UnionScan"
6843    } else if select_has_window(stmt) {
6844        "WindowAgg"
6845    } else if aggregate::uses_aggregate(stmt) {
6846        "Aggregate"
6847    } else if stmt.distinct {
6848        "Distinct"
6849    } else if stmt.from.is_some() {
6850        "TableScan"
6851    } else {
6852        "Result"
6853    };
6854    out.push(alloc::format!("{pad}{top}"));
6855    let child = "  ".repeat(depth + 1);
6856    // 2) CTE bodies.
6857    for cte in &stmt.ctes {
6858        let head = if cte.recursive {
6859            alloc::format!("{child}CTE (recursive): {}", cte.name)
6860        } else {
6861            alloc::format!("{child}CTE: {}", cte.name)
6862        };
6863        out.push(head);
6864        explain_select(&cte.body, engine, depth + 2, out);
6865    }
6866    // 3) FROM details — primary table + joins, index hits.
6867    if let Some(from) = &stmt.from {
6868        let mut tag = alloc::format!("{child}From: {}", from.primary.name);
6869        if let Some(alias) = &from.primary.alias {
6870            tag.push_str(&alloc::format!(" AS {alias}"));
6871        }
6872        // Try to detect an index-seek opportunity on WHERE against
6873        // the primary table — same heuristic the executor uses.
6874        if let Some(w) = &stmt.where_
6875            && let Some(table) = engine.active_catalog().get(&from.primary.name)
6876        {
6877            let alias = from.primary.alias.as_deref().unwrap_or(&from.primary.name);
6878            let cols = &table.schema().columns;
6879            if try_index_seek(w, cols, engine.active_catalog(), table, alias).is_some() {
6880                tag.push_str(" [index seek]");
6881            } else {
6882                tag.push_str(" [full scan]");
6883            }
6884        } else {
6885            tag.push_str(" [full scan]");
6886        }
6887        out.push(tag);
6888        for j in &from.joins {
6889            let kind = match j.kind {
6890                spg_sql::ast::JoinKind::Inner => "INNER JOIN",
6891                spg_sql::ast::JoinKind::Left => "LEFT JOIN",
6892                spg_sql::ast::JoinKind::Cross => "CROSS JOIN",
6893            };
6894            let mut s = alloc::format!("{child}{kind}: {}", j.table.name);
6895            if let Some(alias) = &j.table.alias {
6896                s.push_str(&alloc::format!(" AS {alias}"));
6897            }
6898            if j.on.is_some() {
6899                s.push_str(" (ON …)");
6900            }
6901            out.push(s);
6902        }
6903    }
6904    // 4) WHERE / GROUP BY / HAVING / ORDER BY / LIMIT / OFFSET.
6905    if let Some(w) = &stmt.where_ {
6906        let mut s = alloc::format!("{child}Filter: {w}");
6907        if expr_has_subquery(w) {
6908            s.push_str(" [subquery]");
6909        }
6910        out.push(s);
6911    }
6912    if let Some(gs) = &stmt.group_by {
6913        let mut parts = Vec::new();
6914        for g in gs {
6915            parts.push(alloc::format!("{g}"));
6916        }
6917        out.push(alloc::format!("{child}GroupBy: {}", parts.join(", ")));
6918    }
6919    if let Some(h) = &stmt.having {
6920        out.push(alloc::format!("{child}Having: {h}"));
6921    }
6922    for o in &stmt.order_by {
6923        let dir = if o.desc { "DESC" } else { "ASC" };
6924        out.push(alloc::format!("{child}OrderBy: {} {dir}", o.expr));
6925    }
6926    if let Some(lim) = stmt.limit {
6927        out.push(alloc::format!("{child}Limit: {lim}"));
6928    }
6929    if let Some(off) = stmt.offset {
6930        out.push(alloc::format!("{child}Offset: {off}"));
6931    }
6932    // 5) Projection — collapse Wildcard or render N items.
6933    if stmt
6934        .items
6935        .iter()
6936        .any(|it| matches!(it, SelectItem::Wildcard))
6937    {
6938        out.push(alloc::format!("{child}Project: *"));
6939    } else {
6940        out.push(alloc::format!(
6941            "{child}Project: {} item(s)",
6942            stmt.items.len()
6943        ));
6944    }
6945    // 6) Recurse into UNION peers.
6946    for (kind, peer) in &stmt.unions {
6947        let label = match kind {
6948            UnionKind::All => "UNION ALL",
6949            UnionKind::Distinct => "UNION",
6950        };
6951        out.push(alloc::format!("{child}{label}"));
6952        explain_select(peer, engine, depth + 2, out);
6953    }
6954}
6955
6956/// v4.23: recognise the engine errors that indicate the inner
6957/// SELECT couldn't be evaluated in isolation because it references
6958/// an outer column — used by `subquery_replacement` to skip
6959/// materialisation and let row-eval handle it instead.
6960fn is_correlation_error(e: &EngineError) -> bool {
6961    matches!(
6962        e,
6963        EngineError::Eval(
6964            eval::EvalError::ColumnNotFound { .. } | eval::EvalError::UnknownQualifier { .. }
6965        )
6966    )
6967}
6968
6969/// v4.23: walk every Expr in `stmt` and replace each Column ref
6970/// that targets the outer scope (qualifier matches the outer
6971/// table alias) with a Literal carrying the outer row's value.
6972/// Conservative: only qualified refs are substituted, so the user
6973/// must write `outer_alias.col` to reference an outer column. This
6974/// matches PG's lexical scoping for correlated subqueries and
6975/// avoids accidentally rebinding inner columns of the same name.
6976fn substitute_outer_columns(stmt: &mut SelectStatement, row: &Row, ctx: &EvalContext<'_>) {
6977    let Some(outer_alias) = ctx.table_alias else {
6978        return;
6979    };
6980    substitute_in_select(stmt, row, ctx, outer_alias);
6981}
6982
6983fn substitute_in_select(
6984    stmt: &mut SelectStatement,
6985    row: &Row,
6986    ctx: &EvalContext<'_>,
6987    outer_alias: &str,
6988) {
6989    for item in &mut stmt.items {
6990        if let SelectItem::Expr { expr, .. } = item {
6991            substitute_in_expr(expr, row, ctx, outer_alias);
6992        }
6993    }
6994    if let Some(w) = &mut stmt.where_ {
6995        substitute_in_expr(w, row, ctx, outer_alias);
6996    }
6997    if let Some(gs) = &mut stmt.group_by {
6998        for g in gs {
6999            substitute_in_expr(g, row, ctx, outer_alias);
7000        }
7001    }
7002    if let Some(h) = &mut stmt.having {
7003        substitute_in_expr(h, row, ctx, outer_alias);
7004    }
7005    for o in &mut stmt.order_by {
7006        substitute_in_expr(&mut o.expr, row, ctx, outer_alias);
7007    }
7008    for (_, peer) in &mut stmt.unions {
7009        substitute_in_select(peer, row, ctx, outer_alias);
7010    }
7011}
7012
7013fn substitute_in_expr(e: &mut Expr, row: &Row, ctx: &EvalContext<'_>, outer_alias: &str) {
7014    if let Expr::Column(c) = e
7015        && let Some(qual) = &c.qualifier
7016        && qual.eq_ignore_ascii_case(outer_alias)
7017    {
7018        // Look up the column's index in the outer schema.
7019        if let Some(idx) = ctx
7020            .columns
7021            .iter()
7022            .position(|sc| sc.name.eq_ignore_ascii_case(&c.name))
7023        {
7024            let v = row.values.get(idx).cloned().unwrap_or(Value::Null);
7025            if let Ok(lit) = value_to_literal_expr(v) {
7026                *e = lit;
7027                return;
7028            }
7029        }
7030    }
7031    match e {
7032        Expr::Binary { lhs, rhs, .. } => {
7033            substitute_in_expr(lhs, row, ctx, outer_alias);
7034            substitute_in_expr(rhs, row, ctx, outer_alias);
7035        }
7036        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7037            substitute_in_expr(expr, row, ctx, outer_alias);
7038        }
7039        Expr::Like { expr, pattern, .. } => {
7040            substitute_in_expr(expr, row, ctx, outer_alias);
7041            substitute_in_expr(pattern, row, ctx, outer_alias);
7042        }
7043        Expr::FunctionCall { args, .. } => {
7044            for a in args {
7045                substitute_in_expr(a, row, ctx, outer_alias);
7046            }
7047        }
7048        Expr::Extract { source, .. } => substitute_in_expr(source, row, ctx, outer_alias),
7049        Expr::WindowFunction {
7050            args,
7051            partition_by,
7052            order_by,
7053            ..
7054        } => {
7055            for a in args {
7056                substitute_in_expr(a, row, ctx, outer_alias);
7057            }
7058            for p in partition_by {
7059                substitute_in_expr(p, row, ctx, outer_alias);
7060            }
7061            for (o, _) in order_by {
7062                substitute_in_expr(o, row, ctx, outer_alias);
7063            }
7064        }
7065        Expr::ScalarSubquery(s) => substitute_in_select(s, row, ctx, outer_alias),
7066        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
7067            substitute_in_select(subquery, row, ctx, outer_alias);
7068        }
7069        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
7070        Expr::Array(items) => {
7071            for elem in items {
7072                substitute_in_expr(elem, row, ctx, outer_alias);
7073            }
7074        }
7075        Expr::ArraySubscript { target, index } => {
7076            substitute_in_expr(target, row, ctx, outer_alias);
7077            substitute_in_expr(index, row, ctx, outer_alias);
7078        }
7079        Expr::AnyAll { expr, array, .. } => {
7080            substitute_in_expr(expr, row, ctx, outer_alias);
7081            substitute_in_expr(array, row, ctx, outer_alias);
7082        }
7083    }
7084}
7085
7086/// v4.22: encode a Row to a comparable byte key for UNION-DISTINCT
7087/// dedup inside the recursive iteration. Crude but deterministic
7088/// — Debug prints embed type discriminants so NULL ≠ "" ≠ 0.
7089fn encode_row_key(row: &Row) -> Vec<u8> {
7090    let mut out = Vec::new();
7091    for v in &row.values {
7092        let s = alloc::format!("{v:?}|");
7093        out.extend_from_slice(s.as_bytes());
7094    }
7095    out
7096}
7097
7098fn select_has_window(stmt: &SelectStatement) -> bool {
7099    for item in &stmt.items {
7100        if let SelectItem::Expr { expr, .. } = item
7101            && expr_has_window(expr)
7102        {
7103            return true;
7104        }
7105    }
7106    false
7107}
7108
7109fn expr_has_window(e: &Expr) -> bool {
7110    match e {
7111        Expr::WindowFunction { .. } => true,
7112        Expr::Binary { lhs, rhs, .. } => expr_has_window(lhs) || expr_has_window(rhs),
7113        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7114            expr_has_window(expr)
7115        }
7116        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_window),
7117        Expr::Like { expr, pattern, .. } => expr_has_window(expr) || expr_has_window(pattern),
7118        Expr::Extract { source, .. } => expr_has_window(source),
7119        Expr::ScalarSubquery(_)
7120        | Expr::Exists { .. }
7121        | Expr::InSubquery { .. }
7122        | Expr::Literal(_)
7123        | Expr::Placeholder(_)
7124        | Expr::Column(_) => false,
7125        Expr::Array(items) => items.iter().any(expr_has_window),
7126        Expr::ArraySubscript { target, index } => expr_has_window(target) || expr_has_window(index),
7127        Expr::AnyAll { expr, array, .. } => expr_has_window(expr) || expr_has_window(array),
7128    }
7129}
7130
7131fn collect_window_nodes(e: &Expr, out: &mut Vec<Expr>) {
7132    if let Expr::WindowFunction { .. } = e {
7133        // Deduplicate by structural equality on the expression
7134        // (cheap because window args + partition + order are
7135        // small). Without dedup we'd recompute identical windows
7136        // once per occurrence in the projection.
7137        if !out.iter().any(|x| x == e) {
7138            out.push(e.clone());
7139        }
7140        return;
7141    }
7142    match e {
7143        // Already handled by the early-return at the top.
7144        Expr::WindowFunction { .. } => unreachable!(),
7145        Expr::Binary { lhs, rhs, .. } => {
7146            collect_window_nodes(lhs, out);
7147            collect_window_nodes(rhs, out);
7148        }
7149        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7150            collect_window_nodes(expr, out);
7151        }
7152        Expr::FunctionCall { args, .. } => {
7153            for a in args {
7154                collect_window_nodes(a, out);
7155            }
7156        }
7157        Expr::Like { expr, pattern, .. } => {
7158            collect_window_nodes(expr, out);
7159            collect_window_nodes(pattern, out);
7160        }
7161        Expr::Extract { source, .. } => collect_window_nodes(source, out),
7162        _ => {}
7163    }
7164}
7165
7166fn rewrite_window_to_columns(e: &mut Expr, window_nodes: &[Expr]) {
7167    if let Expr::WindowFunction { .. } = e
7168        && let Some(idx) = window_nodes.iter().position(|w| w == e)
7169    {
7170        *e = Expr::Column(spg_sql::ast::ColumnName {
7171            qualifier: None,
7172            name: alloc::format!("__win_{idx}"),
7173        });
7174        return;
7175    }
7176    match e {
7177        Expr::Binary { lhs, rhs, .. } => {
7178            rewrite_window_to_columns(lhs, window_nodes);
7179            rewrite_window_to_columns(rhs, window_nodes);
7180        }
7181        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7182            rewrite_window_to_columns(expr, window_nodes);
7183        }
7184        Expr::FunctionCall { args, .. } => {
7185            for a in args {
7186                rewrite_window_to_columns(a, window_nodes);
7187            }
7188        }
7189        Expr::Like { expr, pattern, .. } => {
7190            rewrite_window_to_columns(expr, window_nodes);
7191            rewrite_window_to_columns(pattern, window_nodes);
7192        }
7193        Expr::Extract { source, .. } => rewrite_window_to_columns(source, window_nodes),
7194        _ => {}
7195    }
7196}
7197
7198/// Total order over partition-key tuples. NULL sorts as the
7199/// lowest value (matches the `<` partial order's NULL-last
7200/// behaviour with `INFINITY` flipped).
7201fn partition_key_cmp(a: &[Value], b: &[Value]) -> core::cmp::Ordering {
7202    for (x, y) in a.iter().zip(b.iter()) {
7203        let c = value_cmp(x, y);
7204        if c != core::cmp::Ordering::Equal {
7205            return c;
7206        }
7207    }
7208    a.len().cmp(&b.len())
7209}
7210
7211fn order_key_cmp(a: &[(Value, bool)], b: &[(Value, bool)]) -> core::cmp::Ordering {
7212    for ((va, desc), (vb, _)) in a.iter().zip(b.iter()) {
7213        let c = value_cmp(va, vb);
7214        let c = if *desc { c.reverse() } else { c };
7215        if c != core::cmp::Ordering::Equal {
7216            return c;
7217        }
7218    }
7219    a.len().cmp(&b.len())
7220}
7221
7222#[allow(clippy::match_same_arms)] // explicit arms per type document the supported pairs
7223fn value_cmp(a: &Value, b: &Value) -> core::cmp::Ordering {
7224    use core::cmp::Ordering;
7225    match (a, b) {
7226        (Value::Null, Value::Null) => Ordering::Equal,
7227        (Value::Null, _) => Ordering::Less,
7228        (_, Value::Null) => Ordering::Greater,
7229        (Value::Int(x), Value::Int(y)) => x.cmp(y),
7230        (Value::BigInt(x), Value::BigInt(y)) => x.cmp(y),
7231        (Value::SmallInt(x), Value::SmallInt(y)) => x.cmp(y),
7232        (Value::Text(x), Value::Text(y)) => x.cmp(y),
7233        (Value::Bool(x), Value::Bool(y)) => x.cmp(y),
7234        (Value::Float(x), Value::Float(y)) => x.partial_cmp(y).unwrap_or(Ordering::Equal),
7235        (Value::Date(x), Value::Date(y)) => x.cmp(y),
7236        (Value::Timestamp(x), Value::Timestamp(y)) => x.cmp(y),
7237        // Cross-type compare: fall back to the debug rendering —
7238        // same-partition is the goal, exact order is irrelevant.
7239        _ => alloc::format!("{a:?}").cmp(&alloc::format!("{b:?}")),
7240    }
7241}
7242
7243/// Compute the window function's per-row output for one partition.
7244/// `slice` has (partition key, order key, original-row-index)
7245/// tuples already sorted by order key. `filtered_rows` is the
7246/// full row list indexed by original-row-index. `out_vals` is
7247/// the destination, also indexed by original-row-index.
7248#[allow(
7249    clippy::too_many_arguments,
7250    clippy::cast_possible_truncation,
7251    clippy::cast_possible_wrap,
7252    clippy::cast_precision_loss,
7253    clippy::cast_sign_loss,
7254    clippy::doc_markdown,
7255    clippy::too_many_lines,
7256    clippy::type_complexity,
7257    clippy::match_same_arms
7258)]
7259fn compute_window_partition(
7260    name: &str,
7261    args: &[Expr],
7262    ordered: bool,
7263    frame: Option<&WindowFrame>,
7264    null_treatment: spg_sql::ast::NullTreatment,
7265    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
7266    filtered_rows: &[&Row],
7267    ctx: &EvalContext<'_>,
7268    out_vals: &mut [Value],
7269) -> Result<(), EngineError> {
7270    let ignore_nulls = matches!(null_treatment, spg_sql::ast::NullTreatment::Ignore);
7271    let lower = name.to_ascii_lowercase();
7272    match lower.as_str() {
7273        "row_number" => {
7274            for (rank, (_, _, idx)) in slice.iter().enumerate() {
7275                out_vals[*idx] = Value::BigInt((rank + 1) as i64);
7276            }
7277            Ok(())
7278        }
7279        "rank" => {
7280            let mut prev_key: Option<&[(Value, bool)]> = None;
7281            let mut current_rank: i64 = 1;
7282            for (i, (_, okey, idx)) in slice.iter().enumerate() {
7283                if let Some(p) = prev_key
7284                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
7285                {
7286                    current_rank = (i + 1) as i64;
7287                }
7288                if prev_key.is_none() {
7289                    current_rank = 1;
7290                }
7291                out_vals[*idx] = Value::BigInt(current_rank);
7292                prev_key = Some(okey.as_slice());
7293            }
7294            Ok(())
7295        }
7296        "dense_rank" => {
7297            let mut prev_key: Option<&[(Value, bool)]> = None;
7298            let mut current_rank: i64 = 0;
7299            for (_, okey, idx) in slice {
7300                if prev_key.is_none_or(|p| order_key_cmp(p, okey) != core::cmp::Ordering::Equal) {
7301                    current_rank += 1;
7302                }
7303                out_vals[*idx] = Value::BigInt(current_rank);
7304                prev_key = Some(okey.as_slice());
7305            }
7306            Ok(())
7307        }
7308        "sum" | "avg" | "min" | "max" | "count" | "count_star" => {
7309            // Pre-evaluate the function arg per row in the slice
7310            // (count_star has no arg).
7311            let arg_values: Vec<Value> = if lower == "count_star" || args.is_empty() {
7312                slice.iter().map(|_| Value::Null).collect()
7313            } else {
7314                slice
7315                    .iter()
7316                    .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
7317                    .collect::<Result<_, _>>()
7318                    .map_err(EngineError::Eval)?
7319            };
7320            // v4.20: pick the effective frame. Explicit frame
7321            // overrides the implicit default (running for ordered,
7322            // whole-partition for unordered).
7323            let eff = effective_frame(frame, ordered)?;
7324            #[allow(clippy::needless_range_loop)]
7325            for i in 0..slice.len() {
7326                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
7327                let mut sum: f64 = 0.0;
7328                let mut count: i64 = 0;
7329                let mut min_v: Option<f64> = None;
7330                let mut max_v: Option<f64> = None;
7331                let mut row_count: i64 = 0;
7332                if lo <= hi {
7333                    for j in lo..=hi {
7334                        let v = &arg_values[j];
7335                        match lower.as_str() {
7336                            "count_star" => row_count += 1,
7337                            "count" => {
7338                                if !v.is_null() {
7339                                    count += 1;
7340                                }
7341                            }
7342                            _ => {
7343                                if let Some(x) = value_to_f64(v) {
7344                                    sum += x;
7345                                    count += 1;
7346                                    min_v = Some(min_v.map_or(x, |m| m.min(x)));
7347                                    max_v = Some(max_v.map_or(x, |m| m.max(x)));
7348                                }
7349                            }
7350                        }
7351                    }
7352                }
7353                let value = match lower.as_str() {
7354                    "count_star" => Value::BigInt(row_count),
7355                    "count" => Value::BigInt(count),
7356                    "sum" => Value::Float(sum),
7357                    "avg" => {
7358                        if count == 0 {
7359                            Value::Null
7360                        } else {
7361                            Value::Float(sum / count as f64)
7362                        }
7363                    }
7364                    "min" => min_v.map_or(Value::Null, Value::Float),
7365                    "max" => max_v.map_or(Value::Null, Value::Float),
7366                    _ => unreachable!(),
7367                };
7368                let (_, _, idx) = &slice[i];
7369                out_vals[*idx] = value;
7370            }
7371            Ok(())
7372        }
7373        "lag" | "lead" => {
7374            // lag(expr [, offset [, default]])
7375            // lead(expr [, offset [, default]])
7376            if args.is_empty() {
7377                return Err(EngineError::Unsupported(alloc::format!(
7378                    "{lower}() requires at least one argument"
7379                )));
7380            }
7381            let offset: i64 = if args.len() >= 2 {
7382                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
7383                    .map_err(EngineError::Eval)?;
7384                match v {
7385                    Value::SmallInt(n) => i64::from(n),
7386                    Value::Int(n) => i64::from(n),
7387                    Value::BigInt(n) => n,
7388                    _ => {
7389                        return Err(EngineError::Unsupported(alloc::format!(
7390                            "{lower}() offset must be integer"
7391                        )));
7392                    }
7393                }
7394            } else {
7395                1
7396            };
7397            let default: Value = if args.len() >= 3 {
7398                eval::eval_expr(&args[2], filtered_rows[slice[0].2], ctx)
7399                    .map_err(EngineError::Eval)?
7400            } else {
7401                Value::Null
7402            };
7403            let values: Vec<Value> = slice
7404                .iter()
7405                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
7406                .collect::<Result<_, _>>()
7407                .map_err(EngineError::Eval)?;
7408            let n = slice.len();
7409            for (i, (_, _, idx)) in slice.iter().enumerate() {
7410                let signed_offset = if lower == "lag" { -offset } else { offset };
7411                let v = if ignore_nulls {
7412                    // v6.4.2 — IGNORE NULLS: walk in the offset direction
7413                    // skipping NULL values; the `offset`-th non-NULL
7414                    // encountered is the result.
7415                    let step: i64 = if signed_offset >= 0 { 1 } else { -1 };
7416                    let needed: i64 = signed_offset.abs();
7417                    if needed == 0 {
7418                        values[i].clone()
7419                    } else {
7420                        let mut j: i64 = i as i64;
7421                        let mut hits: i64 = 0;
7422                        let mut found: Option<Value> = None;
7423                        loop {
7424                            j += step;
7425                            if j < 0 || j >= n as i64 {
7426                                break;
7427                            }
7428                            #[allow(clippy::cast_sign_loss)]
7429                            let v = &values[j as usize];
7430                            if !v.is_null() {
7431                                hits += 1;
7432                                if hits == needed {
7433                                    found = Some(v.clone());
7434                                    break;
7435                                }
7436                            }
7437                        }
7438                        found.unwrap_or_else(|| default.clone())
7439                    }
7440                } else {
7441                    let target_signed = i64::try_from(i).unwrap_or(i64::MAX) + signed_offset;
7442                    if target_signed < 0 || target_signed >= i64::try_from(n).unwrap_or(i64::MAX) {
7443                        default.clone()
7444                    } else {
7445                        #[allow(clippy::cast_sign_loss)]
7446                        {
7447                            values[target_signed as usize].clone()
7448                        }
7449                    }
7450                };
7451                out_vals[*idx] = v;
7452            }
7453            Ok(())
7454        }
7455        "first_value" | "last_value" | "nth_value" => {
7456            if args.is_empty() {
7457                return Err(EngineError::Unsupported(alloc::format!(
7458                    "{lower}() requires at least one argument"
7459                )));
7460            }
7461            let values: Vec<Value> = slice
7462                .iter()
7463                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
7464                .collect::<Result<_, _>>()
7465                .map_err(EngineError::Eval)?;
7466            let nth: usize = if lower == "nth_value" {
7467                if args.len() < 2 {
7468                    return Err(EngineError::Unsupported(
7469                        "nth_value() requires (expr, n)".into(),
7470                    ));
7471                }
7472                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
7473                    .map_err(EngineError::Eval)?;
7474                let raw = match v {
7475                    Value::SmallInt(n) => i64::from(n),
7476                    Value::Int(n) => i64::from(n),
7477                    Value::BigInt(n) => n,
7478                    _ => {
7479                        return Err(EngineError::Unsupported(
7480                            "nth_value() n must be integer".into(),
7481                        ));
7482                    }
7483                };
7484                if raw < 1 {
7485                    return Err(EngineError::Unsupported(
7486                        "nth_value() n must be >= 1".into(),
7487                    ));
7488                }
7489                #[allow(clippy::cast_sign_loss)]
7490                {
7491                    raw as usize
7492                }
7493            } else {
7494                0
7495            };
7496            let eff = effective_frame(frame, ordered)?;
7497            for i in 0..slice.len() {
7498                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
7499                let (_, _, idx) = &slice[i];
7500                let v = if lo > hi {
7501                    Value::Null
7502                } else if ignore_nulls && matches!(lower.as_str(), "first_value" | "last_value") {
7503                    // v6.4.2 — IGNORE NULLS: skip NULL cells when
7504                    // selecting the boundary value within the frame.
7505                    if lower == "first_value" {
7506                        (lo..=hi)
7507                            .find_map(|j| {
7508                                let v = &values[j];
7509                                (!v.is_null()).then(|| v.clone())
7510                            })
7511                            .unwrap_or(Value::Null)
7512                    } else {
7513                        (lo..=hi)
7514                            .rev()
7515                            .find_map(|j| {
7516                                let v = &values[j];
7517                                (!v.is_null()).then(|| v.clone())
7518                            })
7519                            .unwrap_or(Value::Null)
7520                    }
7521                } else {
7522                    match lower.as_str() {
7523                        "first_value" => values[lo].clone(),
7524                        "last_value" => values[hi].clone(),
7525                        "nth_value" => {
7526                            let pos = lo + nth - 1;
7527                            if pos > hi {
7528                                Value::Null
7529                            } else {
7530                                values[pos].clone()
7531                            }
7532                        }
7533                        _ => unreachable!(),
7534                    }
7535                };
7536                out_vals[*idx] = v;
7537            }
7538            Ok(())
7539        }
7540        "ntile" => {
7541            if args.is_empty() {
7542                return Err(EngineError::Unsupported(
7543                    "ntile(n) requires an integer argument".into(),
7544                ));
7545            }
7546            let v = eval::eval_expr(&args[0], filtered_rows[slice[0].2], ctx)
7547                .map_err(EngineError::Eval)?;
7548            let bucket_count: i64 = match v {
7549                Value::SmallInt(n) => i64::from(n),
7550                Value::Int(n) => i64::from(n),
7551                Value::BigInt(n) => n,
7552                _ => {
7553                    return Err(EngineError::Unsupported(
7554                        "ntile() argument must be integer".into(),
7555                    ));
7556                }
7557            };
7558            if bucket_count < 1 {
7559                return Err(EngineError::Unsupported(
7560                    "ntile() argument must be >= 1".into(),
7561                ));
7562            }
7563            #[allow(clippy::cast_sign_loss)]
7564            let buckets = bucket_count as usize;
7565            let n = slice.len();
7566            // Each bucket gets `base` rows; the first `extras` buckets
7567            // get one extra. PG semantics.
7568            let base = n / buckets;
7569            let extras = n % buckets;
7570            let mut bucket: usize = 1;
7571            let mut remaining_in_bucket = if extras > 0 { base + 1 } else { base };
7572            let mut buckets_with_extra_remaining = extras;
7573            for (_, _, idx) in slice {
7574                if remaining_in_bucket == 0 {
7575                    bucket += 1;
7576                    buckets_with_extra_remaining = buckets_with_extra_remaining.saturating_sub(1);
7577                    remaining_in_bucket = if buckets_with_extra_remaining > 0 {
7578                        base + 1
7579                    } else {
7580                        base
7581                    };
7582                    // Edge: if base==0 and extras==0, all rows fit;
7583                    // shouldn't reach here, but guard anyway.
7584                    if remaining_in_bucket == 0 {
7585                        remaining_in_bucket = 1;
7586                    }
7587                }
7588                out_vals[*idx] = Value::BigInt(i64::try_from(bucket).unwrap_or(i64::MAX));
7589                remaining_in_bucket -= 1;
7590            }
7591            Ok(())
7592        }
7593        "percent_rank" => {
7594            // (rank - 1) / (n - 1) where rank is the standard RANK().
7595            // Single-row partitions get 0.
7596            let n = slice.len();
7597            let mut prev_key: Option<&[(Value, bool)]> = None;
7598            let mut current_rank: i64 = 1;
7599            for (i, (_, okey, idx)) in slice.iter().enumerate() {
7600                if let Some(p) = prev_key
7601                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
7602                {
7603                    current_rank = i64::try_from(i + 1).unwrap_or(i64::MAX);
7604                }
7605                if prev_key.is_none() {
7606                    current_rank = 1;
7607                }
7608                #[allow(clippy::cast_precision_loss)]
7609                let pr = if n <= 1 {
7610                    0.0
7611                } else {
7612                    (current_rank - 1) as f64 / (n - 1) as f64
7613                };
7614                out_vals[*idx] = Value::Float(pr);
7615                prev_key = Some(okey.as_slice());
7616            }
7617            Ok(())
7618        }
7619        "cume_dist" => {
7620            // # rows up to and including this row's peer group / n.
7621            let n = slice.len();
7622            // First pass: find peer-group-end rank for each row.
7623            for i in 0..slice.len() {
7624                let peer_end = peer_group_end(slice, i);
7625                #[allow(clippy::cast_precision_loss)]
7626                let cd = (peer_end + 1) as f64 / n as f64;
7627                let (_, _, idx) = &slice[i];
7628                out_vals[*idx] = Value::Float(cd);
7629            }
7630            Ok(())
7631        }
7632        other => Err(EngineError::Unsupported(alloc::format!(
7633            "window function {other:?} not supported (v4.21: row_number/rank/dense_rank/sum/avg/count/min/max/lag/lead/first_value/last_value/nth_value/ntile/percent_rank/cume_dist)"
7634        ))),
7635    }
7636}
7637
7638/// v4.20: resolve the user-provided frame down to a normalised
7639/// `(kind, start, end)`. `None` means default — derive from
7640/// `ordered`: ordered ⇒ RANGE UNBOUNDED PRECEDING AND CURRENT ROW,
7641/// unordered ⇒ ROWS UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
7642/// Single-bound shorthand (e.g. `ROWS 5 PRECEDING`) normalises
7643/// end → CURRENT ROW per the PG spec.
7644fn effective_frame(
7645    frame: Option<&WindowFrame>,
7646    ordered: bool,
7647) -> Result<(FrameKind, FrameBound, FrameBound), EngineError> {
7648    match frame {
7649        None => {
7650            if ordered {
7651                Ok((
7652                    FrameKind::Range,
7653                    FrameBound::UnboundedPreceding,
7654                    FrameBound::CurrentRow,
7655                ))
7656            } else {
7657                Ok((
7658                    FrameKind::Rows,
7659                    FrameBound::UnboundedPreceding,
7660                    FrameBound::UnboundedFollowing,
7661                ))
7662            }
7663        }
7664        Some(fr) => {
7665            let end = fr.end.clone().unwrap_or(FrameBound::CurrentRow);
7666            // Reject start > end (a few impossible combinations).
7667            if matches!(fr.start, FrameBound::UnboundedFollowing)
7668                || matches!(end, FrameBound::UnboundedPreceding)
7669            {
7670                return Err(EngineError::Unsupported(alloc::format!(
7671                    "invalid frame: start={:?} end={:?}",
7672                    fr.start,
7673                    end
7674                )));
7675            }
7676            // RANGE OFFSET PRECEDING / FOLLOWING needs value-typed
7677            // arithmetic on the ORDER BY key (e.g. `RANGE BETWEEN
7678            // INTERVAL '1 day' PRECEDING AND CURRENT ROW`). Not
7679            // implemented in v4.20.
7680            if fr.kind == FrameKind::Range
7681                && (matches!(
7682                    fr.start,
7683                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
7684                ) || matches!(
7685                    end,
7686                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
7687                ))
7688            {
7689                return Err(EngineError::Unsupported(
7690                    "RANGE with explicit offset bounds is not supported (v4.20: only UNBOUNDED / CURRENT ROW for RANGE)".into(),
7691                ));
7692            }
7693            Ok((fr.kind, fr.start.clone(), end))
7694        }
7695    }
7696}
7697
7698/// Compute `(lo, hi)` row-index bounds inside the partition slice
7699/// for the row at position `i`. Inclusive, clamped to
7700/// `[0, slice.len()-1]`. Empty result if `lo > hi`.
7701#[allow(clippy::type_complexity)]
7702fn frame_bounds_for_row(
7703    eff: &(FrameKind, FrameBound, FrameBound),
7704    i: usize,
7705    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
7706) -> (usize, usize) {
7707    let (kind, start, end) = eff;
7708    let n = slice.len();
7709    let last = n.saturating_sub(1);
7710    let (mut lo, mut hi) = match kind {
7711        FrameKind::Rows => {
7712            let lo = match start {
7713                FrameBound::UnboundedPreceding => 0,
7714                FrameBound::OffsetPreceding(k) => {
7715                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
7716                    i.saturating_sub(k)
7717                }
7718                FrameBound::CurrentRow => i,
7719                FrameBound::OffsetFollowing(k) => {
7720                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
7721                    i.saturating_add(k).min(last)
7722                }
7723                FrameBound::UnboundedFollowing => last,
7724            };
7725            let hi = match end {
7726                FrameBound::UnboundedPreceding => 0,
7727                FrameBound::OffsetPreceding(k) => {
7728                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
7729                    i.saturating_sub(k)
7730                }
7731                FrameBound::CurrentRow => i,
7732                FrameBound::OffsetFollowing(k) => {
7733                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
7734                    i.saturating_add(k).min(last)
7735                }
7736                FrameBound::UnboundedFollowing => last,
7737            };
7738            (lo, hi)
7739        }
7740        FrameKind::Range => {
7741            // RANGE bounds are peer-aware. With only UNBOUNDED and
7742            // CURRENT ROW supported (rejected at effective_frame for
7743            // explicit offsets), the start/end map to the
7744            // partition's full extent at the same-order-key peer
7745            // group boundary.
7746            let lo = match start {
7747                FrameBound::UnboundedPreceding => 0,
7748                FrameBound::CurrentRow => peer_group_start(slice, i),
7749                FrameBound::UnboundedFollowing => last,
7750                _ => unreachable!("offset bounds rejected for RANGE"),
7751            };
7752            let hi = match end {
7753                FrameBound::UnboundedPreceding => 0,
7754                FrameBound::CurrentRow => peer_group_end(slice, i),
7755                FrameBound::UnboundedFollowing => last,
7756                _ => unreachable!("offset bounds rejected for RANGE"),
7757            };
7758            (lo, hi)
7759        }
7760    };
7761    if hi >= n {
7762        hi = last;
7763    }
7764    if lo >= n {
7765        lo = last;
7766    }
7767    (lo, hi)
7768}
7769
7770/// Find the inclusive index of the first row with the same ORDER
7771/// BY key as `slice[i]`. Slice is already sorted by partition then
7772/// order, so peers are contiguous.
7773#[allow(clippy::type_complexity)]
7774fn peer_group_start(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
7775    let key = &slice[i].1;
7776    let mut j = i;
7777    while j > 0 && order_key_cmp(&slice[j - 1].1, key) == core::cmp::Ordering::Equal {
7778        j -= 1;
7779    }
7780    j
7781}
7782
7783/// Find the inclusive index of the last row with the same ORDER
7784/// BY key as `slice[i]`.
7785#[allow(clippy::type_complexity)]
7786fn peer_group_end(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
7787    let key = &slice[i].1;
7788    let mut j = i;
7789    while j + 1 < slice.len() && order_key_cmp(&slice[j + 1].1, key) == core::cmp::Ordering::Equal {
7790        j += 1;
7791    }
7792    j
7793}
7794
7795fn value_to_f64(v: &Value) -> Option<f64> {
7796    match v {
7797        Value::SmallInt(n) => Some(f64::from(*n)),
7798        Value::Int(n) => Some(f64::from(*n)),
7799        #[allow(clippy::cast_precision_loss)]
7800        Value::BigInt(n) => Some(*n as f64),
7801        Value::Float(x) => Some(*x),
7802        _ => None,
7803    }
7804}
7805
7806/// Quick scan for any subquery-bearing node in a SELECT's WHERE /
7807/// projection / `order_by` — saves cloning the AST when there are
7808/// none (the common case).
7809fn expr_tree_has_subquery(stmt: &SelectStatement) -> bool {
7810    let mut any = false;
7811    for item in &stmt.items {
7812        if let SelectItem::Expr { expr, .. } = item {
7813            any = any || expr_has_subquery(expr);
7814        }
7815    }
7816    if let Some(w) = &stmt.where_ {
7817        any = any || expr_has_subquery(w);
7818    }
7819    if let Some(h) = &stmt.having {
7820        any = any || expr_has_subquery(h);
7821    }
7822    for o in &stmt.order_by {
7823        any = any || expr_has_subquery(&o.expr);
7824    }
7825    for (_, peer) in &stmt.unions {
7826        any = any || expr_tree_has_subquery(peer);
7827    }
7828    any
7829}
7830
7831fn expr_has_subquery(e: &Expr) -> bool {
7832    match e {
7833        Expr::ScalarSubquery(_) | Expr::Exists { .. } | Expr::InSubquery { .. } => true,
7834        Expr::Binary { lhs, rhs, .. } => expr_has_subquery(lhs) || expr_has_subquery(rhs),
7835        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7836            expr_has_subquery(expr)
7837        }
7838        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_subquery),
7839        Expr::Like { expr, pattern, .. } => expr_has_subquery(expr) || expr_has_subquery(pattern),
7840        Expr::Extract { source, .. } => expr_has_subquery(source),
7841        Expr::WindowFunction {
7842            args,
7843            partition_by,
7844            order_by,
7845            ..
7846        } => {
7847            args.iter().any(expr_has_subquery)
7848                || partition_by.iter().any(expr_has_subquery)
7849                || order_by.iter().any(|(e, _)| expr_has_subquery(e))
7850        }
7851        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
7852        Expr::Array(items) => items.iter().any(expr_has_subquery),
7853        Expr::ArraySubscript { target, index } => {
7854            expr_has_subquery(target) || expr_has_subquery(index)
7855        }
7856        Expr::AnyAll { expr, array, .. } => expr_has_subquery(expr) || expr_has_subquery(array),
7857    }
7858}
7859
7860/// v4.10 helper: materialise a runtime `Value` back into an AST
7861/// `Expr::Literal` for the subquery-rewrite path. Supports the
7862/// types `Literal` can represent (Integer / Float / Text / Bool /
7863/// Null). Date / Timestamp / Numeric / Vector / Interval / JSON
7864/// would lose precision through Literal and aren't supported in
7865/// uncorrelated-subquery results; they error with a clear hint.
7866fn value_to_literal_expr(v: Value) -> Result<Expr, EngineError> {
7867    let lit = match v {
7868        Value::Null => Literal::Null,
7869        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
7870        Value::Int(n) => Literal::Integer(i64::from(n)),
7871        Value::BigInt(n) => Literal::Integer(n),
7872        Value::Float(x) => Literal::Float(x),
7873        Value::Text(s) | Value::Json(s) => Literal::String(s),
7874        Value::Bool(b) => Literal::Bool(b),
7875        other => {
7876            return Err(EngineError::Unsupported(alloc::format!(
7877                "subquery result type {:?} not yet materialisable; cast to text or integer in the inner SELECT",
7878                other.data_type()
7879            )));
7880        }
7881    };
7882    Ok(Expr::Literal(lit))
7883}
7884
7885/// v6.1.1 — walk the prepared `Statement` AST and replace every
7886/// `Expr::Placeholder(n)` with `Expr::Literal(value_to_literal(
7887/// params[n-1]))`. The dispatch downstream sees a `Statement`
7888/// indistinguishable from a simple-query parse, so the exec path
7889/// stays unchanged.
7890///
7891/// Errors fall into one shape: a `$N` references past the bound
7892/// `params.len()`. Out-of-range happens when the Bind didn't
7893/// supply enough values; pgwire surfaces this as a protocol error
7894/// to the client.
7895fn substitute_placeholders(stmt: &mut Statement, params: &[Value]) -> Result<(), EngineError> {
7896    match stmt {
7897        Statement::Select(s) => substitute_select(s, params)?,
7898        Statement::Insert(ins) => {
7899            for row in &mut ins.rows {
7900                for e in row {
7901                    substitute_expr(e, params)?;
7902                }
7903            }
7904        }
7905        Statement::Update(u) => {
7906            for (_, e) in &mut u.assignments {
7907                substitute_expr(e, params)?;
7908            }
7909            if let Some(w) = &mut u.where_ {
7910                substitute_expr(w, params)?;
7911            }
7912        }
7913        Statement::Delete(d) => {
7914            if let Some(w) = &mut d.where_ {
7915                substitute_expr(w, params)?;
7916            }
7917        }
7918        Statement::Explain(e) => substitute_select(&mut e.inner, params)?,
7919        // Other statements (CREATE / BEGIN / SHOW / …) have no
7920        // expression slots; no walk needed.
7921        _ => {}
7922    }
7923    Ok(())
7924}
7925
7926fn substitute_select(s: &mut SelectStatement, params: &[Value]) -> Result<(), EngineError> {
7927    for item in &mut s.items {
7928        if let SelectItem::Expr { expr, .. } = item {
7929            substitute_expr(expr, params)?;
7930        }
7931    }
7932    if let Some(w) = &mut s.where_ {
7933        substitute_expr(w, params)?;
7934    }
7935    if let Some(gs) = &mut s.group_by {
7936        for g in gs {
7937            substitute_expr(g, params)?;
7938        }
7939    }
7940    if let Some(h) = &mut s.having {
7941        substitute_expr(h, params)?;
7942    }
7943    for o in &mut s.order_by {
7944        substitute_expr(&mut o.expr, params)?;
7945    }
7946    for (_, peer) in &mut s.unions {
7947        substitute_select(peer, params)?;
7948    }
7949    // v7.9.24 — LIMIT $N / OFFSET $N placeholder resolution.
7950    // mailrs H2. After this pass each LIMIT/OFFSET that was a
7951    // Placeholder is rewritten to Literal so the existing
7952    // `LimitExpr::as_literal` path consumes a concrete u32.
7953    if let Some(le) = s.limit {
7954        s.limit = Some(resolve_limit_placeholder(le, params)?);
7955    }
7956    if let Some(le) = s.offset {
7957        s.offset = Some(resolve_limit_placeholder(le, params)?);
7958    }
7959    Ok(())
7960}
7961
7962fn resolve_limit_placeholder(
7963    le: spg_sql::ast::LimitExpr,
7964    params: &[Value],
7965) -> Result<spg_sql::ast::LimitExpr, EngineError> {
7966    use spg_sql::ast::LimitExpr;
7967    match le {
7968        LimitExpr::Literal(_) => Ok(le),
7969        LimitExpr::Placeholder(n) => {
7970            let idx = usize::from(n).saturating_sub(1);
7971            let v = params.get(idx).ok_or_else(|| {
7972                EngineError::Eval(EvalError::PlaceholderOutOfRange {
7973                    n,
7974                    bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
7975                })
7976            })?;
7977            let int = match v {
7978                Value::SmallInt(x) => Some(i64::from(*x)),
7979                Value::Int(x) => Some(i64::from(*x)),
7980                Value::BigInt(x) => Some(*x),
7981                _ => None,
7982            }
7983            .ok_or_else(|| {
7984                EngineError::Unsupported(alloc::format!(
7985                    "LIMIT/OFFSET ${n} bound to non-integer {v:?}"
7986                ))
7987            })?;
7988            if int < 0 {
7989                return Err(EngineError::Unsupported(alloc::format!(
7990                    "LIMIT/OFFSET ${n} bound to negative value {int}"
7991                )));
7992            }
7993            let bounded = u32::try_from(int).map_err(|_| {
7994                EngineError::Unsupported(alloc::format!(
7995                    "LIMIT/OFFSET ${n} value {int} exceeds u32 range"
7996                ))
7997            })?;
7998            Ok(LimitExpr::Literal(bounded))
7999        }
8000    }
8001}
8002
8003fn substitute_expr(e: &mut Expr, params: &[Value]) -> Result<(), EngineError> {
8004    if let Expr::Placeholder(n) = e {
8005        let idx = usize::from(*n).saturating_sub(1);
8006        let v = params.get(idx).ok_or_else(|| {
8007            EngineError::Eval(EvalError::PlaceholderOutOfRange {
8008                n: *n,
8009                bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
8010            })
8011        })?;
8012        *e = Expr::Literal(value_to_literal(v.clone()));
8013        return Ok(());
8014    }
8015    match e {
8016        Expr::Binary { lhs, rhs, .. } => {
8017            substitute_expr(lhs, params)?;
8018            substitute_expr(rhs, params)?;
8019        }
8020        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
8021            substitute_expr(expr, params)?;
8022        }
8023        Expr::FunctionCall { args, .. } => {
8024            for a in args {
8025                substitute_expr(a, params)?;
8026            }
8027        }
8028        Expr::Like { expr, pattern, .. } => {
8029            substitute_expr(expr, params)?;
8030            substitute_expr(pattern, params)?;
8031        }
8032        Expr::Extract { source, .. } => substitute_expr(source, params)?,
8033        Expr::ScalarSubquery(s) => substitute_select(s, params)?,
8034        Expr::Exists { subquery, .. } => substitute_select(subquery, params)?,
8035        Expr::InSubquery { expr, subquery, .. } => {
8036            substitute_expr(expr, params)?;
8037            substitute_select(subquery, params)?;
8038        }
8039        Expr::WindowFunction {
8040            args,
8041            partition_by,
8042            order_by,
8043            ..
8044        } => {
8045            for a in args {
8046                substitute_expr(a, params)?;
8047            }
8048            for p in partition_by {
8049                substitute_expr(p, params)?;
8050            }
8051            for (e, _) in order_by {
8052                substitute_expr(e, params)?;
8053            }
8054        }
8055        Expr::Literal(_) | Expr::Column(_) => {}
8056        // Already handled above.
8057        Expr::Placeholder(_) => unreachable!("Placeholder handled at top of fn"),
8058        Expr::Array(items) => {
8059            for elem in items {
8060                substitute_expr(elem, params)?;
8061            }
8062        }
8063        Expr::ArraySubscript { target, index } => {
8064            substitute_expr(target, params)?;
8065            substitute_expr(index, params)?;
8066        }
8067        Expr::AnyAll { expr, array, .. } => {
8068            substitute_expr(expr, params)?;
8069            substitute_expr(array, params)?;
8070        }
8071    }
8072    Ok(())
8073}
8074
// NOTE: the paragraph below documents `value_to_literal` (defined
// further down in this file), not the function that follows it. It
// is kept as a plain comment so rustdoc does not attach it to
// `sort_values_for_histogram`.
//
// v6.1.1 — convert a runtime `Value` into the closest matching
// `Literal` for the substitute walker. Lossless for the simple
// scalars (Int / Float / Text / Bool); Numeric / Date / Timestamp
// / Json / Interval render as their canonical text form so the
// downstream coerce_value can re-parse against the target column
// type. SQ8 / HalfVector cells are NOT expected as bind params;
// pgwire's Bind decodes vector params to the f32 representation
// before they reach this helper.
8083/// v6.2.0 — total ordering on `Value`s used by ANALYZE to sort a
8084/// column's non-NULL sample before histogram building. Cross-type
8085/// pairs (Int vs Float, Date vs Timestamp, …) compare via the
8086/// same widening the eval-side `compare` operator uses; everything
8087/// else (the genuinely-incompatible pairs) falls back to ordering
8088/// by canonical string form so the sort is still total + stable.
8089/// Vector / SQ8 / Half / Json / Numeric / Interval values reach
8090/// here only via the string-fallback path because vector columns
8091/// are filtered out upstream.
8092fn sort_values_for_histogram(a: &Value, b: &Value) -> core::cmp::Ordering {
8093    use core::cmp::Ordering;
8094    match (a, b) {
8095        (Value::SmallInt(a), Value::SmallInt(b)) => a.cmp(b),
8096        (Value::Int(a), Value::Int(b)) => a.cmp(b),
8097        (Value::BigInt(a), Value::BigInt(b)) => a.cmp(b),
8098        (Value::SmallInt(a), Value::Int(b)) => i32::from(*a).cmp(b),
8099        (Value::Int(a), Value::SmallInt(b)) => a.cmp(&i32::from(*b)),
8100        (Value::Int(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
8101        (Value::BigInt(a), Value::Int(b)) => a.cmp(&i64::from(*b)),
8102        (Value::SmallInt(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
8103        (Value::BigInt(a), Value::SmallInt(b)) => a.cmp(&i64::from(*b)),
8104        (Value::Float(a), Value::Float(b)) => a.partial_cmp(b).unwrap_or(Ordering::Equal),
8105        (Value::Text(a), Value::Text(b)) | (Value::Json(a), Value::Json(b)) => a.cmp(b),
8106        (Value::Bool(a), Value::Bool(b)) => a.cmp(b),
8107        (Value::Date(a), Value::Date(b)) => a.cmp(b),
8108        (Value::Timestamp(a), Value::Timestamp(b)) => a.cmp(b),
8109        // Mixed numeric/float — widen to f64 and compare.
8110        (Value::SmallInt(n), Value::Float(x)) => {
8111            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
8112        }
8113        (Value::Float(x), Value::SmallInt(n)) => {
8114            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
8115        }
8116        (Value::Int(n), Value::Float(x)) => {
8117            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
8118        }
8119        (Value::Float(x), Value::Int(n)) => {
8120            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
8121        }
8122        (Value::BigInt(n), Value::Float(x)) => {
8123            #[allow(clippy::cast_precision_loss)]
8124            let nf = *n as f64;
8125            nf.partial_cmp(x).unwrap_or(Ordering::Equal)
8126        }
8127        (Value::Float(x), Value::BigInt(n)) => {
8128            #[allow(clippy::cast_precision_loss)]
8129            let nf = *n as f64;
8130            x.partial_cmp(&nf).unwrap_or(Ordering::Equal)
8131        }
8132        // Cross-type fallback: lexicographic on canonical form.
8133        // Total + stable so the sort is well-defined.
8134        _ => canonical_value_repr(a).cmp(&canonical_value_repr(b)),
8135    }
8136}
8137
/// v6.2.0 — render histogram bounds as a `[v0, v1, ...]` string for
/// the `spg_statistic.histogram_bounds` column.
///
/// A bound is written bare unless it is empty or contains `,`, `[`,
/// `]` or `"`; in that case it is wrapped in double quotes, with
/// every `"` and `\` inside it prefixed by a backslash. The output is
/// meant for human consumption in v6.2.0; the quoting is conservative
/// so a future parser could round-trip it.
fn render_histogram_bounds(bounds: &[alloc::string::String]) -> alloc::string::String {
    let mut rendered = alloc::string::String::with_capacity(bounds.len() * 8 + 2);
    rendered.push('[');
    let mut separator = "";
    for bound in bounds {
        rendered.push_str(separator);
        separator = ", ";

        let is_bare = !bound.is_empty()
            && !bound
                .chars()
                .any(|c| matches!(c, ',' | '[' | ']' | '"'));
        if is_bare {
            rendered.push_str(bound);
            continue;
        }

        rendered.push('"');
        for c in bound.chars() {
            if matches!(c, '"' | '\\') {
                rendered.push('\\');
            }
            rendered.push(c);
        }
        rendered.push('"');
    }
    rendered.push(']');
    rendered
}
8168
8169/// v6.2.0 — canonical textual form of a `Value` for histogram
8170/// bound storage. Strings used by ANALYZE for sort + bound output.
8171/// INT / BIGINT → decimal; FLOAT → shortest-round-trip via
8172/// `{:?}`; TEXT pass-through; BOOL → `t` / `f`; DATE / TIMESTAMP →
8173/// the same form `format_date` / `format_timestamp` produce for
8174/// SQL Display. Vector / SQ8 / Half / Json / Numeric / Interval
8175/// reach this only via a non-Vector column (vector columns are
8176/// skipped upstream); they fall back to a Debug-derived form so
8177/// stats still serialise without crashing.
8178pub(crate) fn canonical_value_repr(v: &Value) -> alloc::string::String {
8179    match v {
8180        Value::Null => "NULL".to_string(),
8181        Value::SmallInt(n) => alloc::format!("{n}"),
8182        Value::Int(n) => alloc::format!("{n}"),
8183        Value::BigInt(n) => alloc::format!("{n}"),
8184        Value::Float(x) => alloc::format!("{x:?}"),
8185        Value::Text(s) | Value::Json(s) => s.clone(),
8186        Value::Bool(b) => if *b { "t" } else { "f" }.to_string(),
8187        Value::Date(d) => eval::format_date(*d),
8188        Value::Timestamp(t) => eval::format_timestamp(*t),
8189        Value::Interval { months, micros } => eval::format_interval(*months, *micros),
8190        Value::Numeric { scaled, scale } => eval::format_numeric(*scaled, *scale),
8191        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
8192            // Unreachable in practice (vector columns are filtered
8193            // out before this). Defensive fallback so a future
8194            // vector-stats path doesn't crash.
8195            alloc::format!("{v:?}")
8196        }
8197        // v7.5.0 — Value is #[non_exhaustive] for downstream
8198        // forward-compat. Future variants fall through to Debug
8199        // form here (same shape as the vector fallback above).
8200        _ => alloc::format!("{v:?}"),
8201    }
8202}
8203
/// v6.2.0 — whether `_name` is an engine-managed catalog table that
/// a bare `ANALYZE` (no target) should skip.
///
/// Always `false` today: as of v6.2.0 publications, subscriptions,
/// users and statistics live as engine fields rather than catalog
/// tables, so every user table is analysed. The function is a
/// reserved hook for when internal tables appear.
const fn is_internal_table_name(_name: &str) -> bool {
    // No engine-managed catalog tables exist yet.
    const HAS_INTERNAL_TABLES: bool = false;
    HAS_INTERNAL_TABLES
}
8213
8214fn value_to_literal(v: Value) -> Literal {
8215    match v {
8216        Value::Null => Literal::Null,
8217        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
8218        Value::Int(n) => Literal::Integer(i64::from(n)),
8219        Value::BigInt(n) => Literal::Integer(n),
8220        Value::Float(x) => Literal::Float(x),
8221        Value::Text(s) | Value::Json(s) => Literal::String(s),
8222        Value::Bool(b) => Literal::Bool(b),
8223        Value::Vector(v) => Literal::Vector(v),
8224        Value::Numeric { scaled, scale } => Literal::String(eval::format_numeric(scaled, scale)),
8225        Value::Date(d) => Literal::String(eval::format_date(d)),
8226        Value::Timestamp(t) => Literal::String(eval::format_timestamp(t)),
8227        Value::Interval { months, micros } => Literal::Interval {
8228            months,
8229            micros,
8230            text: eval::format_interval(months, micros),
8231        },
8232        // SQ8 / halfvec cells dequantise to f32 before reaching the
8233        // substitute walker; pgwire's Bind path handles that.
8234        Value::Sq8Vector(q) => Literal::Vector(spg_storage::quantize::dequantize(&q)),
8235        Value::HalfVector(h) => Literal::Vector(h.to_f32_vec()),
8236        // v7.5.0 — Value is #[non_exhaustive]; future variants
8237        // render as Debug-form String literal until explicit
8238        // mapping is added.
8239        v => Literal::String(alloc::format!("{v:?}")),
8240    }
8241}
8242
8243fn rewrite_clock_calls(stmt: &mut Statement, now_micros: Option<i64>) {
8244    let Some(now) = now_micros else {
8245        return;
8246    };
8247    match stmt {
8248        Statement::Select(s) => rewrite_select_clock(s, now),
8249        Statement::Insert(ins) => {
8250            for row in &mut ins.rows {
8251                for e in row {
8252                    rewrite_expr_clock(e, now);
8253                }
8254            }
8255        }
8256        _ => {}
8257    }
8258}
8259
8260fn rewrite_select_clock(s: &mut SelectStatement, now: i64) {
8261    for item in &mut s.items {
8262        if let SelectItem::Expr { expr, .. } = item {
8263            rewrite_expr_clock(expr, now);
8264        }
8265    }
8266    if let Some(w) = &mut s.where_ {
8267        rewrite_expr_clock(w, now);
8268    }
8269    if let Some(gs) = &mut s.group_by {
8270        for g in gs {
8271            rewrite_expr_clock(g, now);
8272        }
8273    }
8274    if let Some(h) = &mut s.having {
8275        rewrite_expr_clock(h, now);
8276    }
8277    for o in &mut s.order_by {
8278        rewrite_expr_clock(&mut o.expr, now);
8279    }
8280    for (_, peer) in &mut s.unions {
8281        rewrite_select_clock(peer, now);
8282    }
8283}
8284
8285/// v3.0.3 hot path: every recursion lands in exactly one `match` arm.
8286/// Literal / Column-with-qualifier (the dominant cases on a typical
8287/// AST) take a single pattern dispatch and exit. The clock-rewrite
8288/// targets (zero-arg `NOW` / `CURRENT_TIMESTAMP` / `CURRENT_DATE`
8289/// functions, and bare `CURRENT_TIMESTAMP` / `CURRENT_DATE` column
8290/// refs) sit on their own arms with match guards so the fall-through
8291/// to the recursive arms is unambiguous.
8292fn rewrite_expr_clock(e: &mut Expr, now: i64) {
8293    // Fast-path test on the no-recursion shapes first. We can't fold
8294    // them into the big match below because they need to *replace* `e`
8295    // outright; the recursive arms below match on its sub-fields.
8296    if let Some(replacement) = clock_replacement_for(e, now) {
8297        *e = replacement;
8298        return;
8299    }
8300    match e {
8301        Expr::Binary { lhs, rhs, .. } => {
8302            rewrite_expr_clock(lhs, now);
8303            rewrite_expr_clock(rhs, now);
8304        }
8305        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
8306            rewrite_expr_clock(expr, now);
8307        }
8308        Expr::FunctionCall { args, .. } => {
8309            for a in args {
8310                rewrite_expr_clock(a, now);
8311            }
8312        }
8313        Expr::Like { expr, pattern, .. } => {
8314            rewrite_expr_clock(expr, now);
8315            rewrite_expr_clock(pattern, now);
8316        }
8317        Expr::Extract { source, .. } => rewrite_expr_clock(source, now),
8318        // v4.10 subquery nodes — recurse into the inner SELECT's
8319        // expression slots so e.g. SELECT NOW() in a scalar
8320        // subquery picks up the same instant as the outer query.
8321        Expr::ScalarSubquery(s) => rewrite_select_clock(s, now),
8322        Expr::Exists { subquery, .. } => rewrite_select_clock(subquery, now),
8323        Expr::InSubquery { expr, subquery, .. } => {
8324            rewrite_expr_clock(expr, now);
8325            rewrite_select_clock(subquery, now);
8326        }
8327        // v4.12 window functions — args + PARTITION BY + ORDER BY
8328        // may all reference clock literals.
8329        Expr::WindowFunction {
8330            args,
8331            partition_by,
8332            order_by,
8333            ..
8334        } => {
8335            for a in args {
8336                rewrite_expr_clock(a, now);
8337            }
8338            for p in partition_by {
8339                rewrite_expr_clock(p, now);
8340            }
8341            for (e, _) in order_by {
8342                rewrite_expr_clock(e, now);
8343            }
8344        }
8345        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
8346        Expr::Array(items) => {
8347            for elem in items {
8348                rewrite_expr_clock(elem, now);
8349            }
8350        }
8351        Expr::ArraySubscript { target, index } => {
8352            rewrite_expr_clock(target, now);
8353            rewrite_expr_clock(index, now);
8354        }
8355        Expr::AnyAll { expr, array, .. } => {
8356            rewrite_expr_clock(expr, now);
8357            rewrite_expr_clock(array, now);
8358        }
8359    }
8360}
8361
8362/// Returns `Some(Expr)` when `e` is one of the clock-call shapes that
8363/// must be rewritten; otherwise `None` so the caller falls through to
8364/// the recursive walk. Identifies both function-call forms (`NOW()` /
8365/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()`) and bare-identifier forms
8366/// (`CURRENT_TIMESTAMP` / `CURRENT_DATE` as unqualified column refs,
8367/// which is how PG accepts them without parens).
8368fn clock_replacement_for(e: &Expr, now: i64) -> Option<Expr> {
8369    let (kind, name) = match e {
8370        Expr::FunctionCall { name, args } if args.is_empty() => (ClockSite::Fn, name.as_str()),
8371        Expr::Column(c) if c.qualifier.is_none() => (ClockSite::BareIdent, c.name.as_str()),
8372        _ => return None,
8373    };
8374    // ASCII case-insensitive name match. Limited to the three keywords
8375    // that actually need rewriting.
8376    let matched = match name.len() {
8377        3 if kind == ClockSite::Fn && name.eq_ignore_ascii_case("now") => Some(true),
8378        12 if name.eq_ignore_ascii_case("current_date") => Some(false),
8379        17 if name.eq_ignore_ascii_case("current_timestamp") => Some(true),
8380        _ => None,
8381    };
8382    let is_timestamp = matched?;
8383    let payload = if is_timestamp {
8384        now
8385    } else {
8386        now.div_euclid(86_400_000_000)
8387    };
8388    let target = if is_timestamp {
8389        spg_sql::ast::CastTarget::Timestamp
8390    } else {
8391        spg_sql::ast::CastTarget::Date
8392    };
8393    Some(Expr::Cast {
8394        expr: alloc::boxed::Box::new(Expr::Literal(spg_sql::ast::Literal::Integer(payload))),
8395        target,
8396    })
8397}
8398
/// Syntactic position in which `clock_replacement_for` found a
/// candidate clock keyword.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ClockSite {
    /// A zero-argument function call, e.g. `NOW()`.
    Fn,
    /// An unqualified bare identifier, e.g. `CURRENT_DATE`.
    BareIdent,
}
8404
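// NOTE: the paragraph below documents `resolve_order_by_position`
// (defined after `expand_group_by_all`). It is kept as a plain
// comment so rustdoc does not attach it to `expand_group_by_all`.
//
// `ORDER BY <integer>` references the N-th SELECT item (1-based).
// Swap the integer literal for the matching item's expression so the
// executor doesn't need a special-case branch. Recurses into UNION
// peers because each peer keeps its own SELECT list.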
8409/// v6.4.1 — expand `GROUP BY ALL` to every non-aggregate SELECT-list
8410/// item. Mirrors DuckDB / PG 19 semantics. Wildcards (`SELECT * …`)
8411/// are NOT expanded by GROUP BY ALL (PG 19 leaves the wildcard intact
8412/// and groups by whatever explicit non-aggregates remain — none in
8413/// the wildcard-only case, which still works for non-aggregate
8414/// queries).
8415fn expand_group_by_all(s: &mut SelectStatement) {
8416    if !s.group_by_all {
8417        for (_, peer) in &mut s.unions {
8418            expand_group_by_all(peer);
8419        }
8420        return;
8421    }
8422    let mut groups: Vec<Expr> = Vec::new();
8423    for item in &s.items {
8424        if let SelectItem::Expr { expr, .. } = item
8425            && !aggregate::contains_aggregate(expr)
8426        {
8427            groups.push(expr.clone());
8428        }
8429    }
8430    s.group_by = Some(groups);
8431    s.group_by_all = false;
8432    for (_, peer) in &mut s.unions {
8433        expand_group_by_all(peer);
8434    }
8435}
8436
8437fn resolve_order_by_position(s: &mut SelectStatement) {
8438    // v6.4.0 — iterate every ORDER BY key. Position references
8439    // (`ORDER BY 2`) bind to the 1-based projection index;
8440    // identifier references that match a SELECT-list alias bind to
8441    // the projected expression (Step 4 of L3a).
8442    for order in &mut s.order_by {
8443        match &order.expr {
8444            Expr::Literal(Literal::Integer(n)) if *n >= 1 => {
8445                if let Ok(idx_one_based) = usize::try_from(*n) {
8446                    let idx = idx_one_based - 1;
8447                    if idx < s.items.len()
8448                        && let SelectItem::Expr { expr, .. } = &s.items[idx]
8449                    {
8450                        order.expr = expr.clone();
8451                    }
8452                }
8453            }
8454            Expr::Column(c) if c.qualifier.is_none() => {
8455                // Alias-in-ORDER-BY lookup.
8456                for item in &s.items {
8457                    if let SelectItem::Expr {
8458                        expr,
8459                        alias: Some(a),
8460                    } = item
8461                        && a == &c.name
8462                    {
8463                        order.expr = expr.clone();
8464                        break;
8465                    }
8466                }
8467            }
8468            _ => {}
8469        }
8470    }
8471    for (_, peer) in &mut s.unions {
8472        resolve_order_by_position(peer);
8473    }
8474}
8475
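// Note: the following paragraph appears to describe `sort_by_keys`
// (defined below), not the partial-sort helper that follows.
// Sort `tagged` by `f64` key, reversing the comparator under DESC.
// Used by the UNION ORDER BY path; per-block paths inline the same
// comparator because they already hold `&OrderBy` directly.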
8479/// v3.1.1: partial-sort helper. When `keep` (= offset + limit) is
8480/// strictly less than `tagged.len()`, run `select_nth_unstable_by` to
8481/// partition the prefix in O(n), then sort just that prefix in O(k
8482/// log k). Total O(n + k log k), vs O(n log n) for a full sort. The
8483/// caller decides what `keep` is; passing `None` (no LIMIT) keeps the
8484/// full-sort behaviour.
8485///
8486/// `tagged` holds `(Option<f64>, Row)` (the SELECT path) — `None` keys
8487/// sort last in ascending order, mirroring NULL-sorts-last in SQL.
8488fn partial_sort_tagged(tagged: &mut Vec<(Vec<f64>, Row)>, keep: Option<usize>, descs: &[bool]) {
8489    let cmp = |a: &(Vec<f64>, Row), b: &(Vec<f64>, Row)| cmp_multi_key(&a.0, &b.0, descs);
8490    match keep {
8491        Some(k) if k < tagged.len() && k > 0 => {
8492            let pivot = k - 1;
8493            tagged.select_nth_unstable_by(pivot, cmp);
8494            tagged[..k].sort_by(cmp);
8495            tagged.truncate(k);
8496        }
8497        _ => {
8498            tagged.sort_by(cmp);
8499        }
8500    }
8501}
8502
8503fn sort_by_keys(tagged: &mut [(Vec<f64>, Row)], descs: &[bool]) {
8504    tagged.sort_by(|a, b| cmp_multi_key(&a.0, &b.0, descs));
8505}
8506
/// v6.4.0 — multi-key ORDER BY comparator. Each key's per-key DESC
/// flag is honored independently; a missing flag means ASC. NULL is
/// encoded as `f64::INFINITY` so it sorts last in ASC and first in DESC
/// (matches PG default).
///
/// Keys are compared pairwise, left to right; the first non-equal pair
/// decides. If one slice is shorter, only the common prefix is
/// compared.
///
/// The comparator is a total order even when a key is NaN: NaN compares
/// equal to NaN and greater than every other key (including the
/// `INFINITY` NULL sentinel) before the DESC flip is applied. Treating
/// NaN as "equal to everything" — the previous behaviour — is not
/// transitive, and the standard-library sort routines may panic or
/// return an unspecified order when handed such a comparator.
fn cmp_multi_key(a: &[f64], b: &[f64], descs: &[bool]) -> core::cmp::Ordering {
    for (i, (ka, kb)) in a.iter().zip(b.iter()).enumerate() {
        // `partial_cmp` is `None` only when at least one side is NaN;
        // rank by "is NaN" in that case (false < true).
        let ascending = ka
            .partial_cmp(kb)
            .unwrap_or_else(|| ka.is_nan().cmp(&kb.is_nan()));
        let ord = if descs.get(i).copied().unwrap_or(false) {
            ascending.reverse()
        } else {
            ascending
        };
        if ord != core::cmp::Ordering::Equal {
            return ord;
        }
    }
    core::cmp::Ordering::Equal
}
8525
8526/// v6.4.0 — eval every ORDER BY expression for a row and pack the
8527/// resulting keys into a `Vec<f64>`. NULL → `f64::INFINITY`.
8528fn build_order_keys(
8529    order_by: &[OrderBy],
8530    row: &Row,
8531    ctx: &EvalContext,
8532) -> Result<Vec<f64>, EngineError> {
8533    let mut keys = Vec::with_capacity(order_by.len());
8534    for o in order_by {
8535        let v = eval::eval_expr(&o.expr, row, ctx)?;
8536        keys.push(value_to_order_key(&v)?);
8537    }
8538    Ok(keys)
8539}
8540
8541/// Drop the first `offset` rows then truncate to `limit`. PG / `MySQL`
8542/// agree: OFFSET applies *after* ORDER BY but *before* LIMIT (so
8543/// `LIMIT 10 OFFSET 5` keeps rows 6..=15).
8544fn apply_offset_and_limit(rows: &mut Vec<Row>, offset: Option<u32>, limit: Option<u32>) {
8545    if let Some(off) = offset {
8546        let off = off as usize;
8547        if off >= rows.len() {
8548            rows.clear();
8549        } else {
8550            rows.drain(..off);
8551        }
8552    }
8553    if let Some(n) = limit {
8554        rows.truncate(n as usize);
8555    }
8556}
8557
8558/// v7.6.1 — resolve a parser-level `ForeignKeyConstraint` (column
8559/// names + parent table name) into the storage-layer shape (column
8560/// indices + same parent table). Validates everything the engine
8561/// needs to know about the FK at CREATE TABLE time:
8562///
8563///   - parent table exists (catalog lookup, unless self-referencing)
8564///   - parent columns exist on the parent table
8565///   - parent column list matches the local arity (defaults to the
8566///     parent's primary index column when omitted)
8567///   - parent columns are covered by a `BTree` UNIQUE-class index
8568///     (SPG's stand-in for `PRIMARY KEY`/`UNIQUE`) — required so
8569///     the v7.6.2 INSERT path can do an O(log n) parent lookup
8570///   - local columns exist on the table being created
8571fn resolve_foreign_key(
8572    local_table_name: &str,
8573    local_cols: &[ColumnSchema],
8574    fk: spg_sql::ast::ForeignKeyConstraint,
8575    catalog: &Catalog,
8576) -> Result<spg_storage::ForeignKeyConstraint, EngineError> {
8577    // Resolve local columns.
8578    let mut local_columns = Vec::with_capacity(fk.columns.len());
8579    for name in &fk.columns {
8580        let pos = local_cols
8581            .iter()
8582            .position(|c| c.name == *name)
8583            .ok_or_else(|| {
8584                EngineError::Unsupported(alloc::format!(
8585                    "FOREIGN KEY references unknown local column {name:?}"
8586                ))
8587            })?;
8588        local_columns.push(pos);
8589    }
8590    // Self-referencing FK: parent table is the one we're creating.
8591    // The parent column resolution uses the local column list since
8592    // the catalog doesn't have this table yet.
8593    let is_self_ref = fk.parent_table == local_table_name;
8594    let (parent_cols_for_lookup, parent_table_str): (&[ColumnSchema], &str) = if is_self_ref {
8595        (local_cols, local_table_name)
8596    } else {
8597        let parent_table = catalog.get(&fk.parent_table).ok_or_else(|| {
8598            EngineError::Storage(StorageError::TableNotFound {
8599                name: fk.parent_table.clone(),
8600            })
8601        })?;
8602        (
8603            parent_table.schema().columns.as_slice(),
8604            fk.parent_table.as_str(),
8605        )
8606    };
8607    // Resolve parent column names → positions. If the FK omitted the
8608    // parent column list, fall back to the parent's primary index
8609    // column (single-column only — composite default is rejected
8610    // because there's no unambiguous "PK" in SPG's index list).
8611    let parent_columns: Vec<usize> = if fk.parent_columns.is_empty() {
8612        if fk.columns.len() != 1 {
8613            return Err(EngineError::Unsupported(
8614                "composite FOREIGN KEY without explicit parent column list is not supported \
8615                 — list the parent columns explicitly"
8616                    .into(),
8617            ));
8618        }
8619        // Find a single BTree index on the parent and use its column.
8620        let pos = pick_pk_index_column(catalog, parent_table_str, is_self_ref, local_cols)
8621            .ok_or_else(|| {
8622                EngineError::Unsupported(alloc::format!(
8623                    "parent table {parent_table_str:?} has no PRIMARY-key / UNIQUE BTree index \
8624                     to default the FOREIGN KEY against"
8625                ))
8626            })?;
8627        alloc::vec![pos]
8628    } else {
8629        let mut out = Vec::with_capacity(fk.parent_columns.len());
8630        for name in &fk.parent_columns {
8631            let pos = parent_cols_for_lookup
8632                .iter()
8633                .position(|c| c.name == *name)
8634                .ok_or_else(|| {
8635                    EngineError::Unsupported(alloc::format!(
8636                        "FOREIGN KEY references unknown parent column \
8637                         {name:?} on table {parent_table_str:?}"
8638                    ))
8639                })?;
8640            out.push(pos);
8641        }
8642        out
8643    };
8644    if parent_columns.len() != local_columns.len() {
8645        return Err(EngineError::Unsupported(alloc::format!(
8646            "FOREIGN KEY arity mismatch: {} local columns vs {} parent columns",
8647            local_columns.len(),
8648            parent_columns.len()
8649        )));
8650    }
8651    // For non-self-referencing FKs, verify the parent column set is
8652    // covered by a BTree index. SPG doesn't have a `PRIMARY KEY`
8653    // declaration; the convention is "the parent column for FK
8654    // purposes must have a BTree index" — which the user creates via
8655    // `CREATE INDEX ... USING btree (col)` (the default). We accept
8656    // any single-column BTree index that covers a parent column;
8657    // composite parent column lists require an index whose `column_position`
8658    // matches the first parent column (multi-column BTree indices
8659    // are not in the v7.x roadmap).
8660    if !is_self_ref {
8661        let parent_table = catalog.get(&fk.parent_table).expect("checked above");
8662        let primary_parent_col = parent_columns[0];
8663        let has_btree = parent_table
8664            .schema()
8665            .columns
8666            .get(primary_parent_col)
8667            .is_some()
8668            && parent_table.indices().iter().any(|idx| {
8669                matches!(idx.kind, spg_storage::IndexKind::BTree(_))
8670                    && idx.column_position == primary_parent_col
8671                    && idx.partial_predicate.is_none()
8672            });
8673        if !has_btree {
8674            return Err(EngineError::Unsupported(alloc::format!(
8675                "FOREIGN KEY parent column on {:?} is not covered by an unconditional BTree \
8676                 index — create one with `CREATE INDEX ... ON {} ({})` first",
8677                parent_table_str,
8678                parent_table_str,
8679                parent_table.schema().columns[primary_parent_col].name,
8680            )));
8681        }
8682    }
8683    let on_delete = fk_action_sql_to_storage(fk.on_delete);
8684    let on_update = fk_action_sql_to_storage(fk.on_update);
8685    Ok(spg_storage::ForeignKeyConstraint {
8686        name: fk.name,
8687        local_columns,
8688        parent_table: fk.parent_table,
8689        parent_columns,
8690        on_delete,
8691        on_update,
8692    })
8693}
8694
8695/// v7.6.1 — pick a sentinel "primary key" column from the parent
8696/// table when the FK didn't name parent columns. Picks the first
8697/// single-column unconditional BTree index — that's the closest
8698/// thing SPG has to a PRIMARY KEY today. Self-referencing FKs use
8699/// `local_cols` as the column source.
8700fn pick_pk_index_column(
8701    catalog: &Catalog,
8702    parent_name: &str,
8703    is_self_ref: bool,
8704    local_cols: &[ColumnSchema],
8705) -> Option<usize> {
8706    if is_self_ref {
8707        // Self-ref FK omitted parent columns: pick column 0 by
8708        // convention (no catalog entry yet). Engine will widen this
8709        // when v7.6.7 lands; v7.6.1 only handles the explicit form.
8710        let _ = local_cols;
8711        return Some(0);
8712    }
8713    let parent = catalog.get(parent_name)?;
8714    parent.indices().iter().find_map(|idx| {
8715        if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
8716            && idx.partial_predicate.is_none()
8717            && idx.included_columns.is_empty()
8718            && idx.expression.is_none()
8719        {
8720            Some(idx.column_position)
8721        } else {
8722            None
8723        }
8724    })
8725}
8726
8727/// v7.9.8 / v7.9.10 — resolve the column positions that
8728/// identify a conflict for ON CONFLICT. Returns a Vec of
8729/// column positions (1 element for single-column form, N for
8730/// composite). When the user wrote bare `ON CONFLICT DO …`,
8731/// falls back to the table's first unconditional BTree index
8732/// (always single-column today).
8733fn resolve_on_conflict_columns(
8734    catalog: &Catalog,
8735    table_name: &str,
8736    target: &[String],
8737) -> Result<Vec<usize>, EngineError> {
8738    let table = catalog.get(table_name).ok_or_else(|| {
8739        EngineError::Storage(StorageError::TableNotFound {
8740            name: table_name.into(),
8741        })
8742    })?;
8743    if target.is_empty() {
8744        let pos = table
8745            .indices()
8746            .iter()
8747            .find_map(|idx| {
8748                if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
8749                    && idx.partial_predicate.is_none()
8750                    && idx.included_columns.is_empty()
8751                    && idx.expression.is_none()
8752                {
8753                    Some(idx.column_position)
8754                } else {
8755                    None
8756                }
8757            })
8758            .ok_or_else(|| {
8759                EngineError::Unsupported(alloc::format!(
8760                    "ON CONFLICT without target requires a UNIQUE BTree index on {table_name:?}"
8761                ))
8762            })?;
8763        return Ok(alloc::vec![pos]);
8764    }
8765    let mut out = Vec::with_capacity(target.len());
8766    for name in target {
8767        let pos = table
8768            .schema()
8769            .columns
8770            .iter()
8771            .position(|c| c.name == *name)
8772            .ok_or_else(|| {
8773                EngineError::Unsupported(alloc::format!(
8774                    "ON CONFLICT target column {name:?} not found on {table_name:?}"
8775                ))
8776            })?;
8777        out.push(pos);
8778    }
8779    Ok(out)
8780}
8781
8782/// v7.9.8 — check whether the BTree index on `column_pos` of
8783/// `table_name` already has a row with this key.
8784fn on_conflict_key_exists(
8785    catalog: &Catalog,
8786    table_name: &str,
8787    column_pos: usize,
8788    key: &Value,
8789) -> bool {
8790    let Some(table) = catalog.get(table_name) else {
8791        return false;
8792    };
8793    let Some(idx_key) = spg_storage::IndexKey::from_value(key) else {
8794        return false;
8795    };
8796    table.indices().iter().any(|idx| {
8797        matches!(idx.kind, spg_storage::IndexKind::BTree(_))
8798            && idx.column_position == column_pos
8799            && idx.partial_predicate.is_none()
8800            && !idx.lookup_eq(&idx_key).is_empty()
8801    })
8802}
8803
8804/// v7.9.9 / v7.9.10 — look up an existing row's position by
8805/// matching all `column_positions` against the incoming `key`
8806/// tuple. Single-column shape (one column) reduces to the
8807/// canonical PK lookup; composite shapes scan linearly until
8808/// every position matches.
8809fn lookup_row_position_by_keys(
8810    catalog: &Catalog,
8811    table_name: &str,
8812    column_positions: &[usize],
8813    key: &[&Value],
8814) -> Option<usize> {
8815    let table = catalog.get(table_name)?;
8816    table.rows().iter().position(|r| {
8817        column_positions
8818            .iter()
8819            .enumerate()
8820            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
8821    })
8822}
8823
8824/// v7.9.10 — does the table already contain a row whose
8825/// `column_positions` tuple equals `key`? Single-column shape
8826/// uses the existing BTree fast path; composite shapes fall
8827/// back to a row scan.
8828fn on_conflict_keys_exist(
8829    catalog: &Catalog,
8830    table_name: &str,
8831    column_positions: &[usize],
8832    key: &[&Value],
8833) -> bool {
8834    if column_positions.len() == 1 {
8835        return on_conflict_key_exists(catalog, table_name, column_positions[0], key[0]);
8836    }
8837    let Some(table) = catalog.get(table_name) else {
8838        return false;
8839    };
8840    table.rows().iter().any(|r| {
8841        column_positions
8842            .iter()
8843            .enumerate()
8844            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
8845    })
8846}
8847
8848/// v7.9.9 — apply ON CONFLICT DO UPDATE SET assignments to an
8849/// existing row.
8850///
8851/// `incoming` is the rejected INSERT row (used to resolve
8852/// `EXCLUDED.col` references in the assignment exprs);
8853/// `target_pos` is the position of the existing row in the table.
8854/// Each assignment substitutes `EXCLUDED.col` with the matching
8855/// incoming value, evaluates the resulting expression against
8856/// the existing row, and writes the new value into the
8857/// corresponding column of the returned `Vec<Value>`. If
8858/// `where_` evaluates falsy, returns Ok(None) — PG behaviour:
8859/// the conflicting row is silently kept unchanged.
8860fn apply_on_conflict_assignments(
8861    catalog: &Catalog,
8862    table_name: &str,
8863    target_pos: usize,
8864    incoming: &[Value],
8865    assignments: &[(String, Expr)],
8866    where_: Option<&Expr>,
8867) -> Result<Option<Vec<Value>>, EngineError> {
8868    let table = catalog.get(table_name).ok_or_else(|| {
8869        EngineError::Storage(StorageError::TableNotFound {
8870            name: table_name.into(),
8871        })
8872    })?;
8873    let schema_cols = table.schema().columns.clone();
8874    let existing = table
8875        .rows()
8876        .get(target_pos)
8877        .ok_or_else(|| {
8878            EngineError::Unsupported(alloc::format!(
8879                "ON CONFLICT DO UPDATE: row position {target_pos} out of bounds on {table_name:?}"
8880            ))
8881        })?
8882        .clone();
8883    let ctx = eval::EvalContext::new(&schema_cols, Some(table_name));
8884    // Optional WHERE filter on the conflict row.
8885    if let Some(w) = where_ {
8886        let pred = w.clone();
8887        let pred = substitute_excluded_refs(pred, &schema_cols, incoming);
8888        let v = eval::eval_expr(&pred, &existing, &ctx)?;
8889        if !matches!(v, Value::Bool(true)) {
8890            return Ok(None);
8891        }
8892    }
8893    let mut new_values = existing.values.clone();
8894    for (col_name, expr) in assignments {
8895        let target_idx = schema_cols
8896            .iter()
8897            .position(|c| c.name == *col_name)
8898            .ok_or_else(|| {
8899                EngineError::Eval(EvalError::ColumnNotFound {
8900                    name: col_name.clone(),
8901                })
8902            })?;
8903        let sub = substitute_excluded_refs(expr.clone(), &schema_cols, incoming);
8904        let v = eval::eval_expr(&sub, &existing, &ctx)?;
8905        new_values[target_idx] = coerce_value(v, schema_cols[target_idx].ty, col_name, target_idx)?;
8906    }
8907    Ok(Some(new_values))
8908}
8909
8910/// v7.9.9 — walk an `Expr` tree replacing any `Column { qualifier:
8911/// "EXCLUDED", name }` reference with a `Literal` of the matching
8912/// value from the incoming-row vec. Resolution against the
8913/// child-table column list (by name).
8914fn substitute_excluded_refs(expr: Expr, schema_cols: &[ColumnSchema], incoming: &[Value]) -> Expr {
8915    use spg_sql::ast::ColumnName;
8916    match expr {
8917        Expr::Column(ColumnName { qualifier, name })
8918            if qualifier
8919                .as_deref()
8920                .is_some_and(|q| q.eq_ignore_ascii_case("excluded")) =>
8921        {
8922            let pos = schema_cols.iter().position(|c| c.name == name);
8923            match pos {
8924                Some(p) => {
8925                    let v = incoming.get(p).cloned().unwrap_or(Value::Null);
8926                    value_to_literal_expr(v)
8927                        .unwrap_or_else(|_| Expr::Literal(spg_sql::ast::Literal::Null))
8928                }
8929                None => Expr::Column(ColumnName { qualifier, name }),
8930            }
8931        }
8932        Expr::Binary { op, lhs, rhs } => Expr::Binary {
8933            op,
8934            lhs: Box::new(substitute_excluded_refs(*lhs, schema_cols, incoming)),
8935            rhs: Box::new(substitute_excluded_refs(*rhs, schema_cols, incoming)),
8936        },
8937        Expr::Unary { op, expr } => Expr::Unary {
8938            op,
8939            expr: Box::new(substitute_excluded_refs(*expr, schema_cols, incoming)),
8940        },
8941        Expr::FunctionCall { name, args } => Expr::FunctionCall {
8942            name,
8943            args: args
8944                .into_iter()
8945                .map(|a| substitute_excluded_refs(a, schema_cols, incoming))
8946                .collect(),
8947        },
8948        other => other,
8949    }
8950}
8951
// Note: the following section documents `enforce_fk_inserts` (defined
// further below), not the uniqueness check that comes next.
//
// v7.6.2 / v7.6.7 — INSERT-side FK enforcement. For every row
// about to be inserted into `child_table`, every FK declared on
// that table is checked: the row's FK columns must either be
// NULL (SQL spec skip) or match an existing parent row via the
// parent's BTree PK / UNIQUE index.
//
// Returns `EngineError::Unsupported` with a `FOREIGN KEY violation`
// payload on first failure.
//
// **Self-referencing FKs (v7.6.7 widening):** when `fk.parent_table
// == child_table`, the parent rows visible to this check are
//  (a) rows already committed to the table, plus
//  (b) earlier rows from the *same* `rows` batch.
// This makes `INSERT INTO tree VALUES (1, NULL), (2, 1), (3, 2)`
// work in a single statement — a common pattern for bulk-loading
// hierarchies.
8968/// v7.9.19 — enforce table-level UNIQUE / PRIMARY KEY tuple
8969/// constraints at INSERT time. For each constraint declared on
8970/// the target table, check that no existing row + no earlier row
8971/// in the same batch has the same full-column tuple. NULL in
8972/// any column lifts the row out of the check (SQL spec: NULL
8973/// ≠ NULL for uniqueness). mailrs G1 + G6.
8974fn enforce_uniqueness_inserts(
8975    catalog: &Catalog,
8976    child_table: &str,
8977    constraints: &[spg_storage::UniquenessConstraint],
8978    rows: &[Vec<Value>],
8979) -> Result<(), EngineError> {
8980    if constraints.is_empty() {
8981        return Ok(());
8982    }
8983    let table = catalog.get(child_table).ok_or_else(|| {
8984        EngineError::Storage(StorageError::TableNotFound {
8985            name: child_table.into(),
8986        })
8987    })?;
8988    for uc in constraints {
8989        for (batch_idx, row_values) in rows.iter().enumerate() {
8990            let key: Vec<&Value> = uc.columns.iter().map(|&i| &row_values[i]).collect();
8991            let has_null = key.iter().any(|v| matches!(v, Value::Null));
8992            if has_null {
8993                continue;
8994            }
8995            // Table-side collision: scan existing rows.
8996            let collides_in_table = table.rows().iter().any(|prow| {
8997                uc.columns
8998                    .iter()
8999                    .enumerate()
9000                    .all(|(i, &p)| prow.values.get(p) == Some(key[i]))
9001            });
9002            // Batch-side collision: earlier rows in the same INSERT.
9003            let collides_in_batch = rows[..batch_idx].iter().any(|earlier| {
9004                uc.columns
9005                    .iter()
9006                    .enumerate()
9007                    .all(|(i, &p)| earlier.get(p) == Some(key[i]))
9008            });
9009            if collides_in_table || collides_in_batch {
9010                let kind = if uc.is_primary_key {
9011                    "PRIMARY KEY"
9012                } else {
9013                    "UNIQUE"
9014                };
9015                let col_names: Vec<String> = uc
9016                    .columns
9017                    .iter()
9018                    .map(|&i| table.schema().columns[i].name.clone())
9019                    .collect();
9020                return Err(EngineError::Unsupported(alloc::format!(
9021                    "{kind} violation on {child_table:?} columns {col_names:?}: \
9022                     row #{batch_idx} duplicates an existing key"
9023                )));
9024            }
9025        }
9026    }
9027    Ok(())
9028}
9029
9030/// v7.9.29 — `true` iff `v` counts as a truthy SQL value for a
9031/// WHERE-style predicate. NULL → false (three-valued logic
9032/// collapses to "skip this row" for index inclusion). Numeric
9033/// non-zero, BIGINT non-zero, TINYINT non-zero, BOOLEAN true → true.
9034/// Everything else (strings, vectors, JSON, …) is not a valid
9035/// predicate result and surfaces as `false` so a malformed
9036/// predicate degrades to "row not in index" rather than panicking.
9037fn predicate_truthy(v: &spg_storage::Value) -> bool {
9038    use spg_storage::Value as V;
9039    match v {
9040        V::Bool(b) => *b,
9041        V::Int(n) => *n != 0,
9042        V::BigInt(n) => *n != 0,
9043        V::SmallInt(n) => *n != 0,
9044        _ => false,
9045    }
9046}
9047
9048/// v7.9.29 — at CREATE UNIQUE INDEX time, scan the table's
9049/// committed rows for pre-existing duplicates. If any pair of rows
9050/// matches the predicate AND has the same index key, refuse to
9051/// create the index so the user fixes the data before retrying.
9052fn check_existing_unique_violation(
9053    idx: &spg_storage::Index,
9054    schema: &spg_storage::TableSchema,
9055    rows: &[spg_storage::Row],
9056) -> Result<(), EngineError> {
9057    let predicate_expr = match idx.partial_predicate.as_deref() {
9058        Some(s) => Some(spg_sql::parser::parse_expression(s).map_err(|e| {
9059            EngineError::Unsupported(alloc::format!(
9060                "stored partial predicate {s:?} failed to re-parse: {e:?}"
9061            ))
9062        })?),
9063        None => None,
9064    };
9065    let ctx = eval::EvalContext::new(&schema.columns, None);
9066    let key_positions = unique_key_positions(idx);
9067    let mut seen: alloc::vec::Vec<alloc::vec::Vec<spg_storage::Value>> = alloc::vec::Vec::new();
9068    for row in rows {
9069        if let Some(expr) = &predicate_expr {
9070            let v = eval::eval_expr(expr, row, &ctx).map_err(|e| {
9071                EngineError::Unsupported(alloc::format!(
9072                    "evaluating UNIQUE INDEX predicate against existing row: {e:?}"
9073                ))
9074            })?;
9075            if !predicate_truthy(&v) {
9076                continue;
9077            }
9078        }
9079        let key: alloc::vec::Vec<spg_storage::Value> = key_positions
9080            .iter()
9081            .map(|&p| {
9082                row.values
9083                    .get(p)
9084                    .cloned()
9085                    .unwrap_or(spg_storage::Value::Null)
9086            })
9087            .collect();
9088        if key.iter().any(|v| matches!(v, spg_storage::Value::Null)) {
9089            continue;
9090        }
9091        if seen.iter().any(|other| *other == key) {
9092            return Err(EngineError::Unsupported(alloc::format!(
9093                "CREATE UNIQUE INDEX {:?}: existing rows already violate the constraint",
9094                idx.name
9095            )));
9096        }
9097        seen.push(key);
9098    }
9099    Ok(())
9100}
9101
9102/// v7.9.29 — full key tuple for a UNIQUE INDEX (leading +
9103/// extra positions). For single-column indexes this is just
9104/// `[column_position]`.
9105fn unique_key_positions(idx: &spg_storage::Index) -> alloc::vec::Vec<usize> {
9106    let mut out = alloc::vec::Vec::with_capacity(1 + idx.extra_column_positions.len());
9107    out.push(idx.column_position);
9108    out.extend_from_slice(&idx.extra_column_positions);
9109    out
9110}
9111
9112/// v7.9.29 — at INSERT time, walk every `is_unique` index on the
9113/// target table. For each, eval the index's optional predicate
9114/// against (a) the candidate row and (b) every committed row plus
9115/// earlier batch rows; only rows where the predicate is truthy
9116/// participate. A duplicate key among predicate-matching rows is a
9117/// uniqueness violation. NULL keys lift the row out of the check
9118/// (matching PG's "UNIQUE allows multiple NULLs" semantics).
9119fn enforce_unique_index_inserts(
9120    catalog: &Catalog,
9121    table_name: &str,
9122    rows: &[alloc::vec::Vec<spg_storage::Value>],
9123) -> Result<(), EngineError> {
9124    let table = catalog.get(table_name).ok_or_else(|| {
9125        EngineError::Storage(StorageError::TableNotFound {
9126            name: table_name.into(),
9127        })
9128    })?;
9129    let schema = table.schema();
9130    let ctx = eval::EvalContext::new(&schema.columns, None);
9131    for idx in table.indices() {
9132        if !idx.is_unique {
9133            continue;
9134        }
9135        // Re-parse the predicate once per index per batch.
9136        let predicate_expr = match idx.partial_predicate.as_deref() {
9137            Some(s) => Some(spg_sql::parser::parse_expression(s).map_err(|e| {
9138                EngineError::Unsupported(alloc::format!(
9139                    "UNIQUE INDEX {:?} predicate {s:?} failed to re-parse: {e:?}",
9140                    idx.name
9141                ))
9142            })?),
9143            None => None,
9144        };
9145        let key_positions = unique_key_positions(idx);
9146        let key_of = |values: &[spg_storage::Value]| -> alloc::vec::Vec<spg_storage::Value> {
9147            key_positions
9148                .iter()
9149                .map(|&p| values.get(p).cloned().unwrap_or(spg_storage::Value::Null))
9150                .collect()
9151        };
9152        // Helper: does `values` participate in this index? (predicate
9153        // truthy when present.) Wraps `values` into a transient Row
9154        // because eval_expr requires &Row.
9155        let participates = |values: &[spg_storage::Value]| -> Result<bool, EngineError> {
9156            let Some(expr) = &predicate_expr else {
9157                return Ok(true);
9158            };
9159            let tmp_row = spg_storage::Row {
9160                values: values.to_vec(),
9161            };
9162            let v = eval::eval_expr(expr, &tmp_row, &ctx).map_err(|e| {
9163                EngineError::Unsupported(alloc::format!(
9164                    "UNIQUE INDEX {:?} predicate eval: {e:?}",
9165                    idx.name
9166                ))
9167            })?;
9168            Ok(predicate_truthy(&v))
9169        };
9170        for (batch_idx, row_values) in rows.iter().enumerate() {
9171            if !participates(row_values)? {
9172                continue;
9173            }
9174            let key = key_of(row_values);
9175            if key.iter().any(|v| matches!(v, spg_storage::Value::Null)) {
9176                continue;
9177            }
9178            // Committed-table collision.
9179            for prow in table.rows() {
9180                if !participates(&prow.values)? {
9181                    continue;
9182                }
9183                if key_of(&prow.values) == key {
9184                    return Err(EngineError::Unsupported(alloc::format!(
9185                        "UNIQUE INDEX {:?} violation on {table_name:?}: \
9186                         row #{batch_idx} duplicates an existing key",
9187                        idx.name
9188                    )));
9189                }
9190            }
9191            // Within-batch collision: earlier rows in the same INSERT.
9192            for earlier in &rows[..batch_idx] {
9193                if !participates(earlier)? {
9194                    continue;
9195                }
9196                if key_of(earlier) == key {
9197                    return Err(EngineError::Unsupported(alloc::format!(
9198                        "UNIQUE INDEX {:?} violation on {table_name:?}: \
9199                         row #{batch_idx} duplicates an earlier row in the same batch",
9200                        idx.name
9201                    )));
9202                }
9203            }
9204        }
9205    }
9206    Ok(())
9207}
9208
9209fn enforce_fk_inserts(
9210    catalog: &Catalog,
9211    child_table: &str,
9212    fks: &[spg_storage::ForeignKeyConstraint],
9213    rows: &[Vec<Value>],
9214) -> Result<(), EngineError> {
9215    for fk in fks {
9216        let parent_is_self = fk.parent_table == child_table;
9217        let parent = if parent_is_self {
9218            // Self-ref: read the current state of the same table.
9219            // The mut borrow on child has been dropped by the caller.
9220            catalog.get(child_table).ok_or_else(|| {
9221                EngineError::Storage(StorageError::TableNotFound {
9222                    name: child_table.into(),
9223                })
9224            })?
9225        } else {
9226            catalog.get(&fk.parent_table).ok_or_else(|| {
9227                EngineError::Storage(StorageError::TableNotFound {
9228                    name: fk.parent_table.clone(),
9229                })
9230            })?
9231        };
9232        for (batch_idx, row_values) in rows.iter().enumerate() {
9233            // Single-column FK fast path: try the parent's BTree
9234            // index for an O(log n) lookup. Composite FKs fall back
9235            // to a parent-row scan.
9236            if fk.local_columns.len() == 1 {
9237                let v = &row_values[fk.local_columns[0]];
9238                if matches!(v, Value::Null) {
9239                    continue;
9240                }
9241                let parent_col = fk.parent_columns[0];
9242                let key = spg_storage::IndexKey::from_value(v).ok_or_else(|| {
9243                    EngineError::Unsupported(alloc::format!(
9244                        "FOREIGN KEY column value of type {:?} is not index-eligible",
9245                        v.data_type()
9246                    ))
9247                })?;
9248                let present_committed = parent.indices().iter().any(|idx| {
9249                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
9250                        && idx.column_position == parent_col
9251                        && idx.partial_predicate.is_none()
9252                        && !idx.lookup_eq(&key).is_empty()
9253                });
9254                // v7.6.7 self-ref widening: also accept a match
9255                // against earlier rows in this same batch when the
9256                // FK points at the table being inserted into.
9257                let present_in_batch = parent_is_self
9258                    && rows[..batch_idx]
9259                        .iter()
9260                        .any(|earlier| earlier.get(parent_col) == Some(v));
9261                if !(present_committed || present_in_batch) {
9262                    return Err(EngineError::Unsupported(alloc::format!(
9263                        "FOREIGN KEY violation: no parent row in {:?} where {} = {:?}",
9264                        fk.parent_table,
9265                        parent
9266                            .schema()
9267                            .columns
9268                            .get(parent_col)
9269                            .map_or("?", |c| c.name.as_str()),
9270                        v,
9271                    )));
9272                }
9273            } else {
9274                // Composite FK: scan parent rows. v7.6.7 also
9275                // accepts a match against earlier rows in the same
9276                // batch (self-ref bulk-loading of hierarchies).
9277                if fk
9278                    .local_columns
9279                    .iter()
9280                    .all(|&i| matches!(row_values.get(i), Some(Value::Null)))
9281                {
9282                    continue;
9283                }
9284                let local: Vec<&Value> = fk.local_columns.iter().map(|&i| &row_values[i]).collect();
9285                let parent_match_committed = parent.rows().iter().any(|prow| {
9286                    fk.parent_columns
9287                        .iter()
9288                        .enumerate()
9289                        .all(|(i, &pi)| prow.values.get(pi) == Some(local[i]))
9290                });
9291                let parent_match_in_batch = parent_is_self
9292                    && rows[..batch_idx].iter().any(|earlier| {
9293                        fk.parent_columns
9294                            .iter()
9295                            .enumerate()
9296                            .all(|(i, &pi)| earlier.get(pi) == Some(local[i]))
9297                    });
9298                if !(parent_match_committed || parent_match_in_batch) {
9299                    return Err(EngineError::Unsupported(alloc::format!(
9300                        "FOREIGN KEY violation: no parent row in {:?} matching composite key",
9301                        fk.parent_table,
9302                    )));
9303                }
9304            }
9305        }
9306    }
9307    Ok(())
9308}
9309
/// v7.6.4 / v7.6.5 — one step of the FK action plan computed for a
/// DELETE on a parent. The plan is a list of these steps, stacked
/// across the FK graph by `plan_fk_parent_deletions`.
///
/// `plan_fk_parent_updates` emits the same step type for UPDATEs of
/// parent key columns; `apply_fk_child_step` executes one step.
#[derive(Debug, Clone)]
struct FkChildStep {
    /// Name of the table whose rows this step deletes or rewrites.
    child_table: String,
    /// What to do to the selected rows / cells of `child_table`.
    action: FkChildAction,
}
9318
/// The mutation carried by one `FkChildStep`.
///
/// In the per-cell variants `positions` and `columns` are parallel
/// vectors: entry `i` addresses the cell at row `positions[i]`,
/// column `columns[i]`.
#[derive(Debug, Clone)]
enum FkChildAction {
    /// CASCADE — remove these rows. Sorted, deduplicated positions.
    Delete { positions: Vec<usize> },
    /// SET NULL — for each (row, column) in the flat list, write
    /// NULL into that child cell. Multiple FKs on the same row may
    /// produce overlapping entries (deduped at plan time).
    SetNull {
        positions: Vec<usize>,
        columns: Vec<usize>,
    },
    /// SET DEFAULT — same shape as SetNull but writes the column's
    /// declared DEFAULT value (resolved at plan time). Columns
    /// without a DEFAULT raise an error during planning.
    ///
    /// `defaults[i]` is the value written into cell `i`. The UPDATE
    /// planner also reuses this variant for ON UPDATE CASCADE, in
    /// which case the values are the new parent key rather than
    /// declared defaults.
    SetDefault {
        positions: Vec<usize>,
        columns: Vec<usize>,
        defaults: Vec<Value>,
    },
}
9339
9340/// v7.6.3 → v7.6.5 — plan FK fallout for a DELETE on a parent table.
9341///
9342/// Walks every table in the catalog looking for FKs whose
9343/// `parent_table` is `parent_table_name`. For each such FK + each
9344/// to-be-deleted parent row:
9345///
9346///   - RESTRICT / NoAction → error, no plan returned
9347///   - CASCADE → child rows get scheduled for deletion; recursive
9348///   - SetNull → child FK column(s) scheduled to be NULL-ed.
9349///     Verified NULL-able at plan time.
9350///   - SetDefault → child FK column(s) scheduled to be reset to
9351///     their declared DEFAULT. Columns without a DEFAULT raise.
9352///
9353/// SET NULL / SET DEFAULT do NOT cascade further — the child row
9354/// stays; only one of its columns mutates.
9355fn plan_fk_parent_deletions(
9356    catalog: &Catalog,
9357    parent_table_name: &str,
9358    to_delete_positions: &[usize],
9359    to_delete_rows: &[Vec<Value>],
9360) -> Result<Vec<FkChildStep>, EngineError> {
9361    use alloc::collections::{BTreeMap, BTreeSet};
9362    if to_delete_rows.is_empty() {
9363        return Ok(Vec::new());
9364    }
9365    let mut delete_plan: BTreeMap<String, BTreeSet<usize>> = BTreeMap::new();
9366    // setnull / setdefault keyed by child_table → (row_idx, col_idx) → optional default
9367    let mut setnull_plan: BTreeMap<String, BTreeSet<(usize, usize)>> = BTreeMap::new();
9368    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
9369    let mut visited: BTreeSet<(String, usize)> = BTreeSet::new();
9370    for &p in to_delete_positions {
9371        visited.insert((parent_table_name.to_string(), p));
9372    }
9373    let mut work: Vec<(String, Vec<Value>)> = to_delete_rows
9374        .iter()
9375        .map(|r| (parent_table_name.to_string(), r.clone()))
9376        .collect();
9377    while let Some((cur_parent, parent_row)) = work.pop() {
9378        for child_name in catalog.table_names() {
9379            let child = catalog
9380                .get(&child_name)
9381                .expect("table_names → catalog.get round-trip is total");
9382            for fk in &child.schema().foreign_keys {
9383                if fk.parent_table != cur_parent {
9384                    continue;
9385                }
9386                let parent_key: Vec<&Value> = fk
9387                    .parent_columns
9388                    .iter()
9389                    .map(|&pi| &parent_row[pi])
9390                    .collect();
9391                if parent_key.iter().any(|v| matches!(v, Value::Null)) {
9392                    continue;
9393                }
9394                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
9395                    if child_name == cur_parent
9396                        && visited.contains(&(child_name.clone(), child_row_idx))
9397                    {
9398                        continue;
9399                    }
9400                    let matches_key = fk
9401                        .local_columns
9402                        .iter()
9403                        .enumerate()
9404                        .all(|(i, &li)| child_row.values.get(li) == Some(parent_key[i]));
9405                    if !matches_key {
9406                        continue;
9407                    }
9408                    match fk.on_delete {
9409                        spg_storage::FkAction::Restrict | spg_storage::FkAction::NoAction => {
9410                            return Err(EngineError::Unsupported(alloc::format!(
9411                                "FOREIGN KEY violation: DELETE on {cur_parent:?} is \
9412                                 restricted by FK from {child_name:?}.{:?}",
9413                                fk.local_columns,
9414                            )));
9415                        }
9416                        spg_storage::FkAction::Cascade => {
9417                            if visited.insert((child_name.clone(), child_row_idx)) {
9418                                delete_plan
9419                                    .entry(child_name.clone())
9420                                    .or_default()
9421                                    .insert(child_row_idx);
9422                                work.push((child_name.clone(), child_row.values.clone()));
9423                            }
9424                        }
9425                        spg_storage::FkAction::SetNull => {
9426                            // Verify every local FK column is NULL-able.
9427                            for &li in &fk.local_columns {
9428                                let col = child.schema().columns.get(li).ok_or_else(|| {
9429                                    EngineError::Unsupported(alloc::format!(
9430                                        "FK local column {li} missing in {child_name:?}"
9431                                    ))
9432                                })?;
9433                                if !col.nullable {
9434                                    return Err(EngineError::Unsupported(alloc::format!(
9435                                        "FOREIGN KEY ON DELETE SET NULL: column \
9436                                         {child_name:?}.{:?} is NOT NULL — cannot SET NULL",
9437                                        col.name,
9438                                    )));
9439                                }
9440                            }
9441                            let entry = setnull_plan.entry(child_name.clone()).or_default();
9442                            for &li in &fk.local_columns {
9443                                entry.insert((child_row_idx, li));
9444                            }
9445                        }
9446                        spg_storage::FkAction::SetDefault => {
9447                            // Resolve the DEFAULT for every local FK col.
9448                            let entry = setdefault_plan.entry(child_name.clone()).or_default();
9449                            for &li in &fk.local_columns {
9450                                let col = child.schema().columns.get(li).ok_or_else(|| {
9451                                    EngineError::Unsupported(alloc::format!(
9452                                        "FK local column {li} missing in {child_name:?}"
9453                                    ))
9454                                })?;
9455                                let default = col.default.clone().ok_or_else(|| {
9456                                    EngineError::Unsupported(alloc::format!(
9457                                        "FOREIGN KEY ON DELETE SET DEFAULT: column \
9458                                         {child_name:?}.{:?} has no DEFAULT declared",
9459                                        col.name,
9460                                    ))
9461                                })?;
9462                                entry.insert((child_row_idx, li), default);
9463                            }
9464                        }
9465                    }
9466                }
9467            }
9468        }
9469    }
9470    // Flatten the three plans into the ordered `FkChildStep` list.
9471    // Deletes are applied last per child (after any null/default
9472    // re-writes on the same child) so a child row that's both
9473    // re-written and then cascade-deleted only ends up deleted —
9474    // but in v7.6.5 SetNull/Cascade never overlap on the same row
9475    // (a single FK chooses exactly one action), so the order is
9476    // mostly a precaution.
9477    let mut steps: Vec<FkChildStep> = Vec::new();
9478    for (child_table, entries) in setnull_plan {
9479        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
9480        steps.push(FkChildStep {
9481            child_table,
9482            action: FkChildAction::SetNull { positions, columns },
9483        });
9484    }
9485    for (child_table, entries) in setdefault_plan {
9486        let mut positions = Vec::with_capacity(entries.len());
9487        let mut columns = Vec::with_capacity(entries.len());
9488        let mut defaults = Vec::with_capacity(entries.len());
9489        for ((p, c), v) in entries {
9490            positions.push(p);
9491            columns.push(c);
9492            defaults.push(v);
9493        }
9494        steps.push(FkChildStep {
9495            child_table,
9496            action: FkChildAction::SetDefault {
9497                positions,
9498                columns,
9499                defaults,
9500            },
9501        });
9502    }
9503    for (child_table, positions) in delete_plan {
9504        steps.push(FkChildStep {
9505            child_table,
9506            action: FkChildAction::Delete {
9507                positions: positions.into_iter().collect(),
9508            },
9509        });
9510    }
9511    Ok(steps)
9512}
9513
9514/// v7.6.6 — plan FK fallout for an UPDATE that mutates parent-side
9515/// PK/UNIQUE columns. Walks every other table whose FK references
9516/// `parent_table_name`; for each FK whose parent_columns overlap a
9517/// mutated column, decides the action by `fk.on_update`.
9518///
9519///   - RESTRICT / NoAction → error if any child references the OLD
9520///     value
9521///   - CASCADE → child FK columns get rewritten to the NEW parent
9522///     value (a SetNull-style update step with the new value)
9523///   - SetNull → child FK columns set to NULL
9524///   - SetDefault → child FK columns set to declared default
9525///
9526/// `plan_with_old` is `(row_position, old_values, new_values)` so
9527/// the planner can detect "did this row's parent key actually
9528/// change?" — only rows where at least one referenced parent
9529/// column moved trigger inbound work.
9530fn plan_fk_parent_updates(
9531    catalog: &Catalog,
9532    parent_table_name: &str,
9533    plan_with_old: &[(usize, Vec<Value>, Vec<Value>)],
9534) -> Result<Vec<FkChildStep>, EngineError> {
9535    use alloc::collections::BTreeMap;
9536    if plan_with_old.is_empty() {
9537        return Ok(Vec::new());
9538    }
9539    // For each child table we may touch, build per-child step
9540    // lists. UPDATE never deletes children — `delete_plan` stays
9541    // empty here but is kept structurally aligned with
9542    // `plan_fk_parent_deletions` for future use.
9543    let delete_plan: BTreeMap<String, alloc::collections::BTreeSet<usize>> = BTreeMap::new();
9544    let mut setnull_plan: BTreeMap<String, alloc::collections::BTreeSet<(usize, usize)>> =
9545        BTreeMap::new();
9546    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
9547    // Cascade-update plan: child_table → row_idx → col_idx → new_value
9548    let mut cascade_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
9549
9550    for child_name in catalog.table_names() {
9551        let child = catalog
9552            .get(&child_name)
9553            .expect("table_names → catalog.get total");
9554        for fk in &child.schema().foreign_keys {
9555            if fk.parent_table != parent_table_name {
9556                continue;
9557            }
9558            for (_pos, old_row, new_row) in plan_with_old {
9559                // Did any parent FK column change?
9560                let key_changed = fk
9561                    .parent_columns
9562                    .iter()
9563                    .any(|&pi| old_row.get(pi) != new_row.get(pi));
9564                if !key_changed {
9565                    continue;
9566                }
9567                // The OLD parent key — used to find referring children.
9568                let old_key: Vec<&Value> =
9569                    fk.parent_columns.iter().map(|&pi| &old_row[pi]).collect();
9570                if old_key.iter().any(|v| matches!(v, Value::Null)) {
9571                    // NULL parent has no children — skip.
9572                    continue;
9573                }
9574                let new_key: Vec<&Value> =
9575                    fk.parent_columns.iter().map(|&pi| &new_row[pi]).collect();
9576                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
9577                    // Self-ref same-row updates: a row updating its
9578                    // own PK doesn't restrict itself.
9579                    if child_name == parent_table_name
9580                        && plan_with_old.iter().any(|(p, _, _)| *p == child_row_idx)
9581                    {
9582                        continue;
9583                    }
9584                    let matches_key = fk
9585                        .local_columns
9586                        .iter()
9587                        .enumerate()
9588                        .all(|(i, &li)| child_row.values.get(li) == Some(old_key[i]));
9589                    if !matches_key {
9590                        continue;
9591                    }
9592                    match fk.on_update {
9593                        spg_storage::FkAction::Restrict | spg_storage::FkAction::NoAction => {
9594                            return Err(EngineError::Unsupported(alloc::format!(
9595                                "FOREIGN KEY violation: UPDATE on {parent_table_name:?} PK is \
9596                                 restricted by FK from {child_name:?}.{:?}",
9597                                fk.local_columns,
9598                            )));
9599                        }
9600                        spg_storage::FkAction::Cascade => {
9601                            // Rewrite child FK columns to new key.
9602                            let entry = cascade_plan.entry(child_name.clone()).or_default();
9603                            for (i, &li) in fk.local_columns.iter().enumerate() {
9604                                entry.insert((child_row_idx, li), new_key[i].clone());
9605                            }
9606                        }
9607                        spg_storage::FkAction::SetNull => {
9608                            for &li in &fk.local_columns {
9609                                let col = child.schema().columns.get(li).ok_or_else(|| {
9610                                    EngineError::Unsupported(alloc::format!(
9611                                        "FK local column {li} missing in {child_name:?}"
9612                                    ))
9613                                })?;
9614                                if !col.nullable {
9615                                    return Err(EngineError::Unsupported(alloc::format!(
9616                                        "FOREIGN KEY ON UPDATE SET NULL: column \
9617                                         {child_name:?}.{:?} is NOT NULL",
9618                                        col.name,
9619                                    )));
9620                                }
9621                            }
9622                            let entry = setnull_plan.entry(child_name.clone()).or_default();
9623                            for &li in &fk.local_columns {
9624                                entry.insert((child_row_idx, li));
9625                            }
9626                        }
9627                        spg_storage::FkAction::SetDefault => {
9628                            let entry = setdefault_plan.entry(child_name.clone()).or_default();
9629                            for &li in &fk.local_columns {
9630                                let col = child.schema().columns.get(li).ok_or_else(|| {
9631                                    EngineError::Unsupported(alloc::format!(
9632                                        "FK local column {li} missing in {child_name:?}"
9633                                    ))
9634                                })?;
9635                                let default = col.default.clone().ok_or_else(|| {
9636                                    EngineError::Unsupported(alloc::format!(
9637                                        "FOREIGN KEY ON UPDATE SET DEFAULT: column \
9638                                         {child_name:?}.{:?} has no DEFAULT",
9639                                        col.name,
9640                                    ))
9641                                })?;
9642                                entry.insert((child_row_idx, li), default);
9643                            }
9644                        }
9645                    }
9646                }
9647            }
9648        }
9649    }
9650    // Flatten into FkChildStep list. UPDATE doesn't produce
9651    // DeleteSteps (CASCADE on UPDATE just rewrites FK values).
9652    let mut steps: Vec<FkChildStep> = Vec::new();
9653    for (child_table, entries) in cascade_plan {
9654        let mut positions = Vec::with_capacity(entries.len());
9655        let mut columns = Vec::with_capacity(entries.len());
9656        let mut defaults = Vec::with_capacity(entries.len());
9657        for ((p, c), v) in entries {
9658            positions.push(p);
9659            columns.push(c);
9660            defaults.push(v);
9661        }
9662        // We reuse `FkChildAction::SetDefault` for cascade-update:
9663        // both shapes are "write a known value into specific cells"
9664        // — `apply_per_cell_writes` doesn't care whether the value
9665        // came from a DEFAULT declaration or a new parent key.
9666        steps.push(FkChildStep {
9667            child_table,
9668            action: FkChildAction::SetDefault {
9669                positions,
9670                columns,
9671                defaults,
9672            },
9673        });
9674    }
9675    for (child_table, entries) in setnull_plan {
9676        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
9677        steps.push(FkChildStep {
9678            child_table,
9679            action: FkChildAction::SetNull { positions, columns },
9680        });
9681    }
9682    for (child_table, entries) in setdefault_plan {
9683        let mut positions = Vec::with_capacity(entries.len());
9684        let mut columns = Vec::with_capacity(entries.len());
9685        let mut defaults = Vec::with_capacity(entries.len());
9686        for ((p, c), v) in entries {
9687            positions.push(p);
9688            columns.push(c);
9689            defaults.push(v);
9690        }
9691        steps.push(FkChildStep {
9692            child_table,
9693            action: FkChildAction::SetDefault {
9694                positions,
9695                columns,
9696                defaults,
9697            },
9698        });
9699    }
9700    let _ = delete_plan; // UPDATE never deletes children.
9701    Ok(steps)
9702}
9703
9704/// v7.6.5 — apply one FK child step to the catalog. Encapsulates
9705/// the three action variants so the DELETE executor stays a
9706/// simple loop over the planned steps.
9707fn apply_fk_child_step(catalog: &mut Catalog, step: &FkChildStep) -> Result<(), EngineError> {
9708    let child = catalog.get_mut(&step.child_table).ok_or_else(|| {
9709        EngineError::Storage(StorageError::TableNotFound {
9710            name: step.child_table.clone(),
9711        })
9712    })?;
9713    match &step.action {
9714        FkChildAction::Delete { positions } => {
9715            let _ = child.delete_rows(positions);
9716        }
9717        FkChildAction::SetNull { positions, columns } => {
9718            apply_per_cell_writes(child, positions, columns, |_| Value::Null)?;
9719        }
9720        FkChildAction::SetDefault {
9721            positions,
9722            columns,
9723            defaults,
9724        } => {
9725            apply_per_cell_writes(child, positions, columns, |i| defaults[i].clone())?;
9726        }
9727    }
9728    Ok(())
9729}
9730
9731/// v7.6.5 — write new values into selected child cells via
9732/// `Table::update_row` (the catalog's existing UPDATE entry).
9733/// Groups writes by row position so multi-column updates on the
9734/// same row only call `update_row` once. `value_for(i)` produces
9735/// the new value for the i-th (position, column) entry.
9736fn apply_per_cell_writes(
9737    child: &mut spg_storage::Table,
9738    positions: &[usize],
9739    columns: &[usize],
9740    mut value_for: impl FnMut(usize) -> Value,
9741) -> Result<(), EngineError> {
9742    use alloc::collections::BTreeMap;
9743    let mut by_row: BTreeMap<usize, Vec<(usize, Value)>> = BTreeMap::new();
9744    for i in 0..positions.len() {
9745        by_row
9746            .entry(positions[i])
9747            .or_default()
9748            .push((columns[i], value_for(i)));
9749    }
9750    for (pos, mutations) in by_row {
9751        let mut new_values = child.rows()[pos].values.clone();
9752        for (col, v) in mutations {
9753            if let Some(slot) = new_values.get_mut(col) {
9754                *slot = v;
9755            }
9756        }
9757        child
9758            .update_row(pos, new_values)
9759            .map_err(EngineError::Storage)?;
9760    }
9761    Ok(())
9762}
9763
9764fn fk_action_sql_to_storage(a: spg_sql::ast::FkAction) -> spg_storage::FkAction {
9765    match a {
9766        spg_sql::ast::FkAction::Restrict => spg_storage::FkAction::Restrict,
9767        spg_sql::ast::FkAction::Cascade => spg_storage::FkAction::Cascade,
9768        spg_sql::ast::FkAction::SetNull => spg_storage::FkAction::SetNull,
9769        spg_sql::ast::FkAction::SetDefault => spg_storage::FkAction::SetDefault,
9770        spg_sql::ast::FkAction::NoAction => spg_storage::FkAction::NoAction,
9771    }
9772}
9773
9774/// v7.9.21 — resolve a column's DEFAULT for INSERT-time
9775/// default-fill. Free fn (rather than `&self`) so callers
9776/// with an active `&mut Table` borrow can still use it.
9777/// Literal defaults take the cached path (`col.default`);
9778/// runtime defaults hit `clock_fn` at each call. mailrs G4.
9779fn resolve_column_default_free(
9780    col: &ColumnSchema,
9781    clock_fn: Option<ClockFn>,
9782) -> Result<Value, EngineError> {
9783    if let Some(rt) = &col.runtime_default {
9784        return eval_runtime_default_free(rt, col.ty, clock_fn);
9785    }
9786    Ok(col.default.clone().unwrap_or(Value::Null))
9787}
9788
9789fn eval_runtime_default_free(
9790    rt: &str,
9791    ty: DataType,
9792    clock_fn: Option<ClockFn>,
9793) -> Result<Value, EngineError> {
9794    let s = rt.trim().to_ascii_lowercase();
9795    let canonical = s.trim_end_matches("()");
9796    let now_us = match clock_fn {
9797        Some(f) => f(),
9798        None => 0,
9799    };
9800    let v = match canonical {
9801        "now" | "current_timestamp" | "localtimestamp" => Value::Timestamp(now_us),
9802        "current_date" => Value::Date((now_us / 86_400_000_000) as i32),
9803        "current_time" | "localtime" => Value::Timestamp(now_us),
9804        other => {
9805            return Err(EngineError::Unsupported(alloc::format!(
9806                "runtime DEFAULT expression {other:?} not supported \
9807                 (v7.9.21 whitelist: now() / current_timestamp / \
9808                 current_date / current_time / localtimestamp / \
9809                 localtime)"
9810            )));
9811        }
9812    };
9813    coerce_value(v, ty, "DEFAULT", 0)
9814}
9815
9816/// v7.9.21 — true when a DEFAULT expression needs INSERT-time
9817/// evaluation rather than being cacheable as a literal Value.
9818/// FunctionCall is the immediate case (`now()`,
9819/// `current_timestamp`). Literal expressions and simple sign-
9820/// flipped numerics still take the static-cache path.
9821fn is_runtime_default_expr(expr: &Expr) -> bool {
9822    match expr {
9823        Expr::FunctionCall { .. } => true,
9824        Expr::Unary { expr, .. } => is_runtime_default_expr(expr),
9825        _ => false,
9826    }
9827}
9828
9829fn column_def_to_schema(c: ColumnDef) -> Result<ColumnSchema, EngineError> {
9830    let ty = column_type_to_data_type(c.ty);
9831    let mut schema = ColumnSchema::new(c.name.clone(), ty, c.nullable);
9832    if let Some(default_expr) = c.default {
9833        // v7.9.21 — distinguish literal defaults (evaluated once
9834        // at CREATE TABLE) from expression defaults (deferred to
9835        // INSERT). Function calls (`now()`, `current_timestamp`
9836        // — see v7.9.20 keyword promotion) take the runtime path.
9837        // Literals continue to cache. mailrs G4.
9838        if is_runtime_default_expr(&default_expr) {
9839            let display = alloc::format!("{default_expr}");
9840            schema = schema.with_runtime_default(display);
9841        } else {
9842            let raw = literal_expr_to_value(default_expr)?;
9843            let coerced = coerce_value(raw, ty, &c.name, 0)?;
9844            schema = schema.with_default(coerced);
9845        }
9846    }
9847    if c.auto_increment {
9848        // AUTO_INCREMENT only makes sense on integer-shaped columns.
9849        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
9850            return Err(EngineError::Unsupported(alloc::format!(
9851                "AUTO_INCREMENT requires an integer column type, got {ty:?}"
9852            )));
9853        }
9854        schema = schema.with_auto_increment();
9855    }
9856    Ok(schema)
9857}
9858
/// v7.10.4 — decode a BYTEA literal. Accepts:
///   * `\xDEADBEEF` (case-insensitive hex; whitespace stripped)
///   * `Hello\000world` (backslash escape form; `\\` for literal backslash)
///   * Anything else → raw UTF-8 bytes of the input (PG accepts this too).
///
/// Surrounding whitespace is trimmed before the form is detected.
///
/// An octal escape is exactly three digits `\ooo` whose value fits in
/// one byte: first digit `0..=3`, remaining digits `0..=7`. A
/// backslash that starts neither `\\` nor such a triple is kept as a
/// literal byte rather than rejected.
///
/// # Errors
/// Only the hex form can fail: `"odd-length hex literal"` when the
/// digit count is odd, `"invalid hex digit"` for a non-hex character.
fn decode_bytea_literal(s: &str) -> Result<alloc::vec::Vec<u8>, &'static str> {
    let s = s.trim();
    if let Some(hex) = s.strip_prefix("\\x").or_else(|| s.strip_prefix("\\X")) {
        // Hex form. Each pair of hex digits → one byte.
        let cleaned: alloc::string::String = hex.chars().filter(|c| !c.is_whitespace()).collect();
        if cleaned.len() % 2 != 0 {
            return Err("odd-length hex literal");
        }
        let mut out = alloc::vec::Vec::with_capacity(cleaned.len() / 2);
        for pair in cleaned.as_bytes().chunks_exact(2) {
            out.push((hex_nibble(pair[0])? << 4) | hex_nibble(pair[1])?);
        }
        return Ok(out);
    }
    // Escape form or raw. Walk byte-by-byte; `\\` and `\ooo` octal
    // sequences decode; anything else is a literal byte.
    let bytes = s.as_bytes();
    let mut out = alloc::vec::Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'\\' {
            // `i < len`, so `i + 1..` is at worst the empty tail.
            match &bytes[i + 1..] {
                [b'\\', ..] => {
                    out.push(b'\\');
                    i += 2;
                    continue;
                }
                // Digits 8 and 9 are not octal, and a leading digit
                // above 3 would exceed 0o377, so neither is an escape.
                [hi @ b'0'..=b'3', mid @ b'0'..=b'7', lo @ b'0'..=b'7', ..] => {
                    out.push(((*hi - b'0') << 6) | ((*mid - b'0') << 3) | (*lo - b'0'));
                    i += 4;
                    continue;
                }
                _ => {}
            }
        }
        out.push(bytes[i]);
        i += 1;
    }
    Ok(out)
}

/// Value of a single ASCII hex digit (either case).
///
/// # Errors
/// Returns `"invalid hex digit"` for any other byte.
fn hex_nibble(b: u8) -> Result<u8, &'static str> {
    match b {
        b'0'..=b'9' => Ok(b - b'0'),
        b'a'..=b'f' => Ok(b - b'a' + 10),
        b'A'..=b'F' => Ok(b - b'A' + 10),
        _ => Err("invalid hex digit"),
    }
}
9922
9923/// v7.10.11 — decode a PG TEXT[] external array form
9924/// (`{a,b,NULL}` with optional double-quoted elements). The
9925/// engine takes a leading/trailing `{`/`}` and splits at commas.
9926/// Quoted elements (`"hello, world"`) preserve embedded commas;
9927/// `\\` and `\"` decode to literal backslash / quote. Plain
9928/// unquoted `NULL` (case-insensitive) maps to `None`.
9929/// v7.11.13 — pick the array type for `ARRAY[lit, …]` from the
9930/// element values. Single-element-type rules:
9931///   - all NULL / all Text → TextArray
9932///   - all Int (or Int+NULL) → IntArray
9933///   - any BigInt without Text → BigIntArray (widening)
9934///   - any Text → TextArray (fallback; non-string elements
9935///     render as text)
9936fn array_literal_widen(items: alloc::vec::Vec<Value>) -> Value {
9937    let mut has_text = false;
9938    let mut has_bigint = false;
9939    let mut has_int = false;
9940    for v in &items {
9941        match v {
9942            Value::Null => {}
9943            Value::Text(_) | Value::Json(_) => has_text = true,
9944            Value::BigInt(_) => has_bigint = true,
9945            Value::Int(_) | Value::SmallInt(_) => has_int = true,
9946            _ => has_text = true,
9947        }
9948    }
9949    if has_text || (!has_bigint && !has_int) {
9950        let out: alloc::vec::Vec<Option<alloc::string::String>> = items
9951            .into_iter()
9952            .map(|v| match v {
9953                Value::Null => None,
9954                Value::Text(s) | Value::Json(s) => Some(s),
9955                other => Some(alloc::format!("{other:?}")),
9956            })
9957            .collect();
9958        return Value::TextArray(out);
9959    }
9960    if has_bigint {
9961        let out: alloc::vec::Vec<Option<i64>> = items
9962            .into_iter()
9963            .map(|v| match v {
9964                Value::Null => None,
9965                Value::Int(n) => Some(i64::from(n)),
9966                Value::SmallInt(n) => Some(i64::from(n)),
9967                Value::BigInt(n) => Some(n),
9968                _ => unreachable!("widen: unexpected non-integer in BigInt path"),
9969            })
9970            .collect();
9971        return Value::BigIntArray(out);
9972    }
9973    let out: alloc::vec::Vec<Option<i32>> = items
9974        .into_iter()
9975        .map(|v| match v {
9976            Value::Null => None,
9977            Value::Int(n) => Some(n),
9978            Value::SmallInt(n) => Some(i32::from(n)),
9979            _ => unreachable!("widen: unexpected non-i32-compatible in Int path"),
9980        })
9981        .collect();
9982    Value::IntArray(out)
9983}
9984
/// v7.10.11 — decode a PG TEXT[] external array form
/// (`{a,b,NULL}` with optional double-quoted elements).
///
/// The input is trimmed, must be wrapped in `{`/`}`, and is split at
/// commas. Rules per element:
///   * Quoted (`"hello, world"`): embedded commas are preserved and a
///     backslash makes the following byte literal (so `\\` and `\"`
///     decode to backslash / quote). Multi-byte UTF-8 content is kept
///     intact.
///   * Unquoted: surrounding whitespace is trimmed; a plain `NULL`
///     (case-insensitive) maps to `None`.
/// An empty or whitespace-only body yields an empty vector. A trailing
/// comma yields a final empty-string element.
///
/// # Errors
/// Returns a static message when the braces are missing, a quoted
/// element is unterminated, or a quoted element is followed by
/// something other than a comma.
fn decode_text_array_literal(
    s: &str,
) -> Result<alloc::vec::Vec<Option<alloc::string::String>>, &'static str> {
    let trimmed = s.trim();
    let inner = trimmed
        .strip_prefix('{')
        .and_then(|x| x.strip_suffix('}'))
        .ok_or("TEXT[] literal must be enclosed in '{...}'")?;
    let mut out: alloc::vec::Vec<Option<alloc::string::String>> = alloc::vec::Vec::new();
    if inner.trim().is_empty() {
        return Ok(out);
    }
    let bytes = inner.as_bytes();
    let mut i = 0;
    // `<=` on purpose: after a trailing comma `i == len` and one more
    // (empty) element is still read.
    while i <= bytes.len() {
        // Skip leading whitespace.
        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
            i += 1;
        }
        // Quoted element.
        if i < bytes.len() && bytes[i] == b'"' {
            i += 1; // open quote
            // Collect raw bytes and validate once at the end. Casting
            // each byte to `char` would reinterpret multi-byte UTF-8
            // sequences as Latin-1 and corrupt non-ASCII text.
            let mut buf: alloc::vec::Vec<u8> = alloc::vec::Vec::new();
            while i < bytes.len() && bytes[i] != b'"' {
                if bytes[i] == b'\\' && i + 1 < bytes.len() {
                    buf.push(bytes[i + 1]);
                    i += 2;
                } else {
                    buf.push(bytes[i]);
                    i += 1;
                }
            }
            if i >= bytes.len() {
                return Err("unterminated quoted element");
            }
            i += 1; // close quote
            // Only ASCII `\` bytes were dropped from valid UTF-8 input,
            // so this conversion is not expected to fail.
            let text = alloc::string::String::from_utf8(buf)
                .map_err(|_| "invalid UTF-8 in quoted TEXT[] element")?;
            out.push(Some(text));
        } else {
            // Unquoted element — read until next comma or end. The
            // slice boundaries sit on ASCII bytes, so they are valid
            // `str` boundaries.
            let start = i;
            while i < bytes.len() && bytes[i] != b',' {
                i += 1;
            }
            let raw = inner[start..i].trim();
            if raw.eq_ignore_ascii_case("NULL") {
                out.push(None);
            } else {
                out.push(Some(alloc::string::ToString::to_string(raw)));
            }
        }
        // Skip whitespace, expect comma or end.
        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
            i += 1;
        }
        if i >= bytes.len() {
            break;
        }
        if bytes[i] != b',' {
            return Err("expected ',' between TEXT[] elements");
        }
        i += 1;
    }
    Ok(out)
}
10049
/// v7.10.11 — encode a TEXT[] back into the PG external array
/// form. NULL elements become the literal `NULL`; elements that are
/// empty, spell `NULL` (any case), or contain commas, quotes,
/// backslashes, braces, or any whitespace are double-quoted with
/// `\\` / `\"` escapes.
///
/// Every whitespace character forces quoting — not just space and
/// tab — because `decode_text_array_literal` trims all whitespace
/// around unquoted elements; an unquoted `"a\n"` would otherwise
/// read back as `"a"`.
fn encode_text_array(items: &[Option<alloc::string::String>]) -> alloc::string::String {
    let mut out = alloc::string::String::with_capacity(2 + items.len() * 8);
    out.push('{');
    for (i, item) in items.iter().enumerate() {
        if i > 0 {
            out.push(',');
        }
        match item {
            None => out.push_str("NULL"),
            Some(s) => {
                let needs_quote = s.is_empty()
                    || s.eq_ignore_ascii_case("NULL")
                    || s.chars()
                        .any(|c| matches!(c, ',' | '{' | '}' | '"' | '\\') || c.is_whitespace());
                if needs_quote {
                    out.push('"');
                    for c in s.chars() {
                        // Only the quote and the backslash need an
                        // escape inside a quoted element.
                        if c == '"' || c == '\\' {
                            out.push('\\');
                        }
                        out.push(c);
                    }
                    out.push('"');
                } else {
                    out.push_str(s);
                }
            }
        }
    }
    out.push('}');
    out
}
10086
/// v7.10.4 — render BYTEA bytes in PG hex output format: a `\x`
/// prefix followed by two lowercase hex digits per byte. Used by the
/// Text-side round-trip and the wire layer's text-mode encoder.
fn encode_bytea_hex(b: &[u8]) -> alloc::string::String {
    let mut text = alloc::string::String::with_capacity(2 + 2 * b.len());
    text.push('\\');
    text.push('x');
    for &byte in b {
        // High nibble first, then low nibble.
        text.extend([byte >> 4, byte & 0x0F].iter().map(|&nibble| hex_digit(nibble)));
    }
    text
}

/// Lowercase hex digit for a nibble value `0..=15`; any larger value
/// yields `'?'`.
const fn hex_digit(n: u8) -> char {
    if n < 10 {
        (b'0' + n) as char
    } else if n < 16 {
        (b'a' + (n - 10)) as char
    } else {
        '?'
    }
}
10109
10110const fn column_type_to_data_type(t: ColumnTypeName) -> DataType {
10111    match t {
10112        ColumnTypeName::SmallInt => DataType::SmallInt,
10113        ColumnTypeName::Int => DataType::Int,
10114        ColumnTypeName::BigInt => DataType::BigInt,
10115        ColumnTypeName::Float => DataType::Float,
10116        ColumnTypeName::Text => DataType::Text,
10117        ColumnTypeName::Varchar(n) => DataType::Varchar(n),
10118        ColumnTypeName::Char(n) => DataType::Char(n),
10119        ColumnTypeName::Bool => DataType::Bool,
10120        ColumnTypeName::Vector { dim, encoding } => DataType::Vector {
10121            dim,
10122            encoding: match encoding {
10123                SqlVecEncoding::F32 => VecEncoding::F32,
10124                SqlVecEncoding::Sq8 => VecEncoding::Sq8,
10125                SqlVecEncoding::F16 => VecEncoding::F16,
10126            },
10127        },
10128        ColumnTypeName::Numeric(precision, scale) => DataType::Numeric { precision, scale },
10129        ColumnTypeName::Date => DataType::Date,
10130        ColumnTypeName::Timestamp => DataType::Timestamp,
10131        ColumnTypeName::Timestamptz => DataType::Timestamptz,
10132        ColumnTypeName::Json => DataType::Json,
10133        ColumnTypeName::Jsonb => DataType::Jsonb,
10134        ColumnTypeName::Bytes => DataType::Bytes,
10135        ColumnTypeName::TextArray => DataType::TextArray,
10136        ColumnTypeName::IntArray => DataType::IntArray,
10137        ColumnTypeName::BigIntArray => DataType::BigIntArray,
10138        ColumnTypeName::TsVector => DataType::TsVector,
10139        ColumnTypeName::TsQuery => DataType::TsQuery,
10140    }
10141}
10142
10143/// Convert an INSERT VALUES expression to a storage Value. Supports literal
10144/// expressions, unary-minus over numeric literals, and pgvector-style
10145/// `'[..]'::vector` cast (v1.2). Anything more complex returns `Unsupported`.
10146fn literal_expr_to_value(expr: Expr) -> Result<Value, EngineError> {
10147    match expr {
10148        Expr::Literal(l) => Ok(literal_to_value(l)),
10149        Expr::Cast { expr, target } => {
10150            let inner_value = literal_expr_to_value(*expr)?;
10151            crate::eval::cast_value(inner_value, target).map_err(EngineError::Eval)
10152        }
10153        Expr::Unary {
10154            op: UnOp::Neg,
10155            expr,
10156        } => match *expr {
10157            Expr::Literal(Literal::Integer(n)) => {
10158                // Fold to i32 if it fits, else BigInt. Parser emits Integer(i64)
10159                // — overflow on negate of i64::MIN is the one edge case.
10160                let neg = n.checked_neg().ok_or_else(|| {
10161                    EngineError::Unsupported("integer literal overflow on negation".into())
10162                })?;
10163                Ok(int_value_for(neg))
10164            }
10165            Expr::Literal(Literal::Float(x)) => Ok(Value::Float(-x)),
10166            other => Err(EngineError::Unsupported(alloc::format!(
10167                "unary minus over non-literal expression: {other:?}"
10168            ))),
10169        },
10170        // v7.10.10 — `ARRAY[lit, lit, …]` constructor accepted at
10171        // INSERT-time. Each element must reduce to a Value through
10172        // `literal_expr_to_value`; NULL elements become `None`.
10173        // v7.11.13 — deduce shape from element values: all Int →
10174        // IntArray; any BigInt → BigIntArray (widening); any Text
10175        // → TextArray. Cast targets (`ARRAY[]::INT[]`) flow through
10176        // the outer Cast arm before reaching here and re-coerce.
10177        Expr::Array(items) => {
10178            let mut materialised: alloc::vec::Vec<Value> =
10179                alloc::vec::Vec::with_capacity(items.len());
10180            for elem in items {
10181                materialised.push(literal_expr_to_value(elem)?);
10182            }
10183            Ok(array_literal_widen(materialised))
10184        }
10185        other => Err(EngineError::Unsupported(alloc::format!(
10186            "non-literal INSERT value expression: {other:?}"
10187        ))),
10188    }
10189}
10190
10191fn literal_to_value(l: Literal) -> Value {
10192    match l {
10193        Literal::Integer(n) => int_value_for(n),
10194        Literal::Float(x) => Value::Float(x),
10195        Literal::String(s) => Value::Text(s),
10196        Literal::Bool(b) => Value::Bool(b),
10197        Literal::Null => Value::Null,
10198        Literal::Vector(v) => Value::Vector(v),
10199        Literal::Interval { months, micros, .. } => Value::Interval { months, micros },
10200    }
10201}
10202
10203/// Pick `Int` (`i32`) when the literal fits, else `BigInt`. `INT` vs `BIGINT`
10204/// columns will still enforce the right tag downstream — this is just the
10205/// default we synthesise from an unannotated integer literal.
10206fn int_value_for(n: i64) -> Value {
10207    if let Ok(small) = i32::try_from(n) {
10208        Value::Int(small)
10209    } else {
10210        Value::BigInt(n)
10211    }
10212}
10213
10214/// Widen / narrow `v` to fit `expected`. Numerics permit safe widening
10215/// (`Int → BigInt`, `Int/BigInt → Float`) and best-effort narrowing
10216/// (`BigInt → Int` succeeds only when the value fits in `i32`). Everything
10217/// else returns `TypeMismatch` carrying the column name for caller diagnostics.
10218/// `NULL` is always permitted; the nullability check happens later in storage.
10219#[allow(clippy::too_many_lines)]
10220fn coerce_value(
10221    v: Value,
10222    expected: DataType,
10223    col_name: &str,
10224    position: usize,
10225) -> Result<Value, EngineError> {
10226    if v.is_null() {
10227        return Ok(Value::Null);
10228    }
10229    let actual = v.data_type().expect("non-null");
10230    if actual == expected {
10231        return Ok(v);
10232    }
10233    let coerced = match (v, expected) {
10234        (Value::Int(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
10235        (Value::Int(n), DataType::Float) => Some(Value::Float(f64::from(n))),
10236        (Value::Int(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
10237        (Value::Int(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
10238            i128::from(n),
10239            precision,
10240            scale,
10241            col_name,
10242        )?),
10243        (Value::SmallInt(n), DataType::Int) => Some(Value::Int(i32::from(n))),
10244        (Value::SmallInt(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
10245        (Value::SmallInt(n), DataType::Float) => Some(Value::Float(f64::from(n))),
10246        (Value::SmallInt(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
10247            i128::from(n),
10248            precision,
10249            scale,
10250            col_name,
10251        )?),
10252        (Value::BigInt(n), DataType::Int) => i32::try_from(n).ok().map(Value::Int),
10253        (Value::BigInt(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
10254        #[allow(clippy::cast_precision_loss)]
10255        (Value::BigInt(n), DataType::Float) => Some(Value::Float(n as f64)),
10256        (Value::BigInt(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
10257            i128::from(n),
10258            precision,
10259            scale,
10260            col_name,
10261        )?),
10262        (Value::Float(x), DataType::Numeric { precision, scale }) => {
10263            Some(numeric_from_float(x, precision, scale, col_name)?)
10264        }
10265        // Text → DATE / TIMESTAMP: parse canonical text forms.
10266        (Value::Text(s), DataType::Date) => {
10267            let d = eval::parse_date_literal(&s).ok_or_else(|| {
10268                EngineError::Eval(EvalError::TypeMismatch {
10269                    detail: alloc::format!("cannot parse {s:?} as DATE for column `{col_name}`"),
10270                })
10271            })?;
10272            Some(Value::Date(d))
10273        }
10274        // v4.9: Text ↔ JSON coercion. No structural validation —
10275        // any text literal is accepted; the responsibility for
10276        // valid JSON lies with the producer.
10277        (Value::Text(s), DataType::Json | DataType::Jsonb) => Some(Value::Json(s)),
10278        (Value::Json(s), DataType::Text) => Some(Value::Text(s)),
10279        // v7.10.4 — Text → BYTEA. Decode PG-style literal forms:
10280        //   - Hex:    `\x48656c6c6f`  (case-insensitive hex pairs)
10281        //   - Escape: `Hello\\000world`  (backslash + octal triples)
10282        //   - Plain:  any string → raw UTF-8 bytes (PG also accepts)
10283        // Errors surface as TypeMismatch so the operator gets a
10284        // clear "this literal isn't a bytea literal" hint.
10285        (Value::Text(s), DataType::Bytes) => {
10286            let bytes = decode_bytea_literal(&s).map_err(|e| {
10287                EngineError::Eval(EvalError::TypeMismatch {
10288                    detail: alloc::format!(
10289                        "cannot parse {s:?} as BYTEA for column `{col_name}`: {e}"
10290                    ),
10291                })
10292            })?;
10293            Some(Value::Bytes(bytes))
10294        }
10295        // v7.10.4 — BYTEA → Text round-trip uses the PG hex
10296        // output (lowercase, `\x` prefix). Important when a
10297        // SELECT pulls a bytea cell through a Text column path.
10298        (Value::Bytes(b), DataType::Text) => Some(Value::Text(encode_bytea_hex(&b))),
10299        // v7.10.11 — Text → TEXT[]. Decode PG's external array
10300        // form `'{a,b,NULL}'`. NULL element token (case-insensitive)
10301        // is the literal `NULL`; everything else is a quoted or
10302        // unquoted text element. mailrs `'{label1,label2}'::TEXT[]`.
10303        (Value::Text(s), DataType::TextArray) => {
10304            let arr = decode_text_array_literal(&s).map_err(|e| {
10305                EngineError::Eval(EvalError::TypeMismatch {
10306                    detail: alloc::format!(
10307                        "cannot parse {s:?} as TEXT[] for column `{col_name}`: {e}"
10308                    ),
10309                })
10310            })?;
10311            Some(Value::TextArray(arr))
10312        }
10313        // v7.10.11 — TEXT[] → Text round-trip uses PG's
10314        // external array form (`{a,b,NULL}`). Lets a SELECT
10315        // pull an array column through any Text-side codepath.
10316        (Value::TextArray(items), DataType::Text) => Some(Value::Text(encode_text_array(&items))),
10317        (Value::Text(s), DataType::Timestamp | DataType::Timestamptz) => {
10318            let t = eval::parse_timestamp_literal(&s).ok_or_else(|| {
10319                EngineError::Eval(EvalError::TypeMismatch {
10320                    detail: alloc::format!(
10321                        "cannot parse {s:?} as TIMESTAMP for column `{col_name}`"
10322                    ),
10323                })
10324            })?;
10325            Some(Value::Timestamp(t))
10326        }
10327        // DATE ↔ TIMESTAMP convertibility (DATE → midnight,
10328        // TIMESTAMP → day truncation).
10329        (Value::Date(d), DataType::Timestamp | DataType::Timestamptz) => {
10330            Some(Value::Timestamp(i64::from(d) * 86_400_000_000))
10331        }
10332        // v7.9.21 — Value::Timestamp lands in either Timestamp
10333        // or Timestamptz columns; the on-disk layout is the
10334        // same i64 microseconds UTC.
10335        (Value::Timestamp(t), DataType::Timestamptz) => Some(Value::Timestamp(t)),
10336        (Value::Timestamp(t), DataType::Date) => {
10337            let days = t.div_euclid(86_400_000_000);
10338            i32::try_from(days).ok().map(Value::Date)
10339        }
10340        (
10341            Value::Numeric {
10342                scaled,
10343                scale: src_scale,
10344            },
10345            DataType::Numeric { precision, scale },
10346        ) => Some(numeric_rescale(
10347            scaled, src_scale, precision, scale, col_name,
10348        )?),
10349        #[allow(clippy::cast_precision_loss)]
10350        (Value::Numeric { scaled, scale }, DataType::Float) => {
10351            let mut div = 1.0_f64;
10352            for _ in 0..scale {
10353                div *= 10.0;
10354            }
10355            Some(Value::Float((scaled as f64) / div))
10356        }
10357        (Value::Numeric { scaled, scale }, DataType::Int) => {
10358            let truncated = numeric_truncate_to_integer(scaled, scale);
10359            i32::try_from(truncated).ok().map(Value::Int)
10360        }
10361        (Value::Numeric { scaled, scale }, DataType::BigInt) => {
10362            let truncated = numeric_truncate_to_integer(scaled, scale);
10363            i64::try_from(truncated).ok().map(Value::BigInt)
10364        }
10365        (Value::Numeric { scaled, scale }, DataType::SmallInt) => {
10366            let truncated = numeric_truncate_to_integer(scaled, scale);
10367            i16::try_from(truncated).ok().map(Value::SmallInt)
10368        }
10369        // VARCHAR(n) enforces an upper bound on character count.
10370        (Value::Text(s), DataType::Varchar(max)) => {
10371            if u32::try_from(s.chars().count()).unwrap_or(u32::MAX) <= max {
10372                Some(Value::Text(s))
10373            } else {
10374                return Err(EngineError::Unsupported(alloc::format!(
10375                    "value for VARCHAR({max}) column `{col_name}` exceeds length: \
10376                     {} chars",
10377                    s.chars().count()
10378                )));
10379            }
10380        }
10381        // v6.0.1: f32 → SQ8 INSERT-time quantisation. Triggered
10382        // when the column declares `VECTOR(N) USING SQ8` and
10383        // the INSERT VALUES expression yields a raw f32 vector
10384        // (the normal pgvector-shape literal). Dim mismatch
10385        // falls through the `_ => None` arm and surfaces as
10386        // `TypeMismatch` with the expected SQ8 column type —
10387        // matching the F32 path's existing error.
10388        (
10389            Value::Vector(v),
10390            DataType::Vector {
10391                dim,
10392                encoding: VecEncoding::Sq8,
10393            },
10394        ) if v.len() == dim as usize => Some(Value::Sq8Vector(spg_storage::quantize::quantize(&v))),
10395        // v6.0.3: f32 → f16 INSERT-time conversion for HALF
10396        // columns. Bit-exact at the storage layer (modulo
10397        // half-precision rounding); no rerank pass needed at
10398        // search time.
10399        (
10400            Value::Vector(v),
10401            DataType::Vector {
10402                dim,
10403                encoding: VecEncoding::F16,
10404            },
10405        ) if v.len() == dim as usize => Some(Value::HalfVector(
10406            spg_storage::halfvec::HalfVector::from_f32_slice(&v),
10407        )),
10408        // CHAR(n) right-pads with U+0020 to exactly n chars; if the input
10409        // is already longer we reject (PG truncates trailing-space-only;
10410        // staying strict for v1).
10411        (Value::Text(s), DataType::Char(size)) => {
10412            let len = u32::try_from(s.chars().count()).unwrap_or(u32::MAX);
10413            if len > size {
10414                return Err(EngineError::Unsupported(alloc::format!(
10415                    "value for CHAR({size}) column `{col_name}` exceeds length: \
10416                     {len} chars"
10417                )));
10418            }
10419            let need = (size - len) as usize;
10420            let mut padded = s;
10421            padded.reserve(need);
10422            for _ in 0..need {
10423                padded.push(' ');
10424            }
10425            Some(Value::Text(padded))
10426        }
10427        _ => None,
10428    };
10429    coerced.ok_or(EngineError::Storage(StorageError::TypeMismatch {
10430        column: col_name.into(),
10431        expected,
10432        actual,
10433        position,
10434    }))
10435}
10436
10437/// v7.12.4 — render a function arg list into the
10438/// canonical form the storage layer caches as
10439/// [`spg_storage::FunctionDef::args_repr`]. The catalogue uses
10440/// this string for both display + as a coarse signature key
10441/// for the (deferred) overload resolution v7.12.5+ adds.
10442fn render_function_args(args: &[spg_sql::ast::FunctionArg]) -> alloc::string::String {
10443    use core::fmt::Write;
10444    let mut out = alloc::string::String::from("(");
10445    for (i, a) in args.iter().enumerate() {
10446        if i > 0 {
10447            out.push_str(", ");
10448        }
10449        match a.mode {
10450            spg_sql::ast::FunctionArgMode::In => {}
10451            spg_sql::ast::FunctionArgMode::Out => out.push_str("OUT "),
10452            spg_sql::ast::FunctionArgMode::InOut => out.push_str("INOUT "),
10453        }
10454        if let Some(n) = &a.name {
10455            out.push_str(n);
10456            out.push(' ');
10457        }
10458        match &a.ty {
10459            spg_sql::ast::FunctionArgType::Typed(t) => {
10460                let _ = write!(out, "{t}");
10461            }
10462            spg_sql::ast::FunctionArgType::Raw(s) => out.push_str(s),
10463        }
10464    }
10465    out.push(')');
10466    out
10467}
10468
10469#[cfg(test)]
10470mod tests {
10471    use super::*;
10472    use alloc::vec;
10473
10474    fn unwrap_command_ok(r: &QueryResult) -> usize {
10475        match r {
10476            QueryResult::CommandOk { affected, .. } => *affected,
10477            QueryResult::Rows { .. } => panic!("expected CommandOk, got Rows"),
10478        }
10479    }
10480
10481    #[test]
10482    fn create_table_registers_schema() {
10483        let mut e = Engine::new();
10484        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT)")
10485            .unwrap();
10486        assert_eq!(e.catalog().table_count(), 1);
10487        let t = e.catalog().get("foo").unwrap();
10488        assert_eq!(t.schema().columns.len(), 2);
10489        assert_eq!(t.schema().columns[0].ty, DataType::Int);
10490        assert!(!t.schema().columns[0].nullable);
10491        assert_eq!(t.schema().columns[1].ty, DataType::Text);
10492    }
10493
10494    #[test]
10495    fn create_table_vector_default_is_f32_encoded() {
10496        let mut e = Engine::new();
10497        e.execute("CREATE TABLE t (v VECTOR(8))").unwrap();
10498        let t = e.catalog().get("t").unwrap();
10499        assert_eq!(
10500            t.schema().columns[0].ty,
10501            DataType::Vector {
10502                dim: 8,
10503                encoding: VecEncoding::F32,
10504            },
10505        );
10506    }
10507
10508    #[test]
10509    fn create_table_vector_using_sq8_succeeds() {
10510        // v6.0.1 step 3: the step-1 fence in `column_def_to_schema`
10511        // is lifted. CREATE TABLE persists an SQ8 column type in
10512        // the catalog; INSERT (next test) quantises raw f32 input.
10513        let mut e = Engine::new();
10514        e.execute("CREATE TABLE t (v VECTOR(8) USING SQ8)").unwrap();
10515        let t = e.catalog().get("t").unwrap();
10516        assert_eq!(
10517            t.schema().columns[0].ty,
10518            DataType::Vector {
10519                dim: 8,
10520                encoding: VecEncoding::Sq8,
10521            },
10522        );
10523    }
10524
10525    #[test]
10526    fn insert_into_sq8_column_quantises_f32_payload() {
10527        // v6.0.1 step 3: INSERT-time `coerce_value` rewrites a raw
10528        // `Value::Vector(Vec<f32>)` literal into the column's
10529        // quantised representation. The row that lands in the
10530        // catalog must therefore hold a `Value::Sq8Vector`, not the
10531        // original f32 buffer — that's the bit that delivers the
10532        // 4× compression target.
10533        let mut e = Engine::new();
10534        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
10535        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
10536            .unwrap();
10537        let t = e.catalog().get("t").unwrap();
10538        assert_eq!(t.rows().len(), 1);
10539        match &t.rows()[0].values[0] {
10540            Value::Sq8Vector(q) => {
10541                assert_eq!(q.bytes.len(), 4);
10542                // min/max are derived from the payload: min=0.0, max=1.0.
10543                assert!((q.min - 0.0).abs() < 1e-6);
10544                assert!((q.max - 1.0).abs() < 1e-6);
10545            }
10546            other => panic!("expected Sq8Vector cell, got {other:?}"),
10547        }
10548    }
10549
10550    #[test]
10551    fn create_table_vector_using_half_succeeds_and_insert_converts_to_f16() {
10552        // v6.0.3: CREATE TABLE accepts USING HALF; INSERT path
10553        // converts the incoming `Value::Vector(Vec<f32>)` cell
10554        // into `Value::HalfVector(HalfVector)` via the new
10555        // `coerce_value` arm. The dequantised round-trip is
10556        // bit-exact for f16-representable values, so 0.0 / 0.25
10557        // / 0.5 / 1.0 hit their grid points exactly.
10558        let mut e = Engine::new();
10559        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
10560            .unwrap();
10561        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
10562            .unwrap();
10563        let t = e.catalog().get("t").unwrap();
10564        assert_eq!(t.rows().len(), 1);
10565        match &t.rows()[0].values[0] {
10566            Value::HalfVector(h) => {
10567                assert_eq!(h.dim(), 4);
10568                let back = h.to_f32_vec();
10569                let expected = alloc::vec![0.0_f32, 0.25, 0.5, 1.0];
10570                for (g, e) in back.iter().zip(expected.iter()) {
10571                    assert!(
10572                        (g - e).abs() < 1e-6,
10573                        "{g} vs {e} should be exact on f16 grid"
10574                    );
10575                }
10576            }
10577            other => panic!("expected HalfVector cell, got {other:?}"),
10578        }
10579    }
10580
10581    #[test]
10582    fn alter_index_rebuild_in_place_succeeds() {
10583        // v6.0.4: bare REBUILD (no encoding switch) walks every
10584        // row again to rebuild the NSW graph. Verifies the engine
10585        // dispatch + storage helper plumbing without changing any
10586        // cell encoding.
10587        let mut e = Engine::new();
10588        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
10589            .unwrap();
10590        for i in 0..8_i32 {
10591            #[allow(clippy::cast_precision_loss)]
10592            let base = (i as f32) * 0.1;
10593            e.execute(&alloc::format!(
10594                "INSERT INTO t VALUES ({i}, [{base}, {b1}, {b2}])",
10595                b1 = base + 0.01,
10596                b2 = base + 0.02,
10597            ))
10598            .unwrap();
10599        }
10600        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
10601        e.execute("ALTER INDEX t_idx REBUILD").unwrap();
10602        // Schema encoding stays F32 (no encoding clause).
10603        assert_eq!(
10604            e.catalog().get("t").unwrap().schema().columns[1].ty,
10605            DataType::Vector {
10606                dim: 3,
10607                encoding: VecEncoding::F32,
10608            },
10609        );
10610    }
10611
10612    #[test]
10613    fn alter_index_rebuild_with_encoding_switches_cell_type() {
10614        // v6.0.4: REBUILD WITH (encoding = SQ8) recodes every
10615        // stored cell from F32 → SQ8 + rebuilds the graph atop the
10616        // new encoding. Post-rebuild, cells must be Sq8Vector and
10617        // the schema must report encoding = Sq8.
10618        let mut e = Engine::new();
10619        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(4) NOT NULL)")
10620            .unwrap();
10621        e.execute("INSERT INTO t VALUES (1, [0.0, 0.25, 0.5, 1.0])")
10622            .unwrap();
10623        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
10624        e.execute("ALTER INDEX t_idx REBUILD WITH (encoding = SQ8)")
10625            .unwrap();
10626        let t = e.catalog().get("t").unwrap();
10627        assert_eq!(
10628            t.schema().columns[1].ty,
10629            DataType::Vector {
10630                dim: 4,
10631                encoding: VecEncoding::Sq8,
10632            },
10633        );
10634        assert!(matches!(t.rows()[0].values[1], Value::Sq8Vector(_)));
10635    }
10636
10637    #[test]
10638    fn alter_index_rebuild_unknown_index_errors() {
10639        let mut e = Engine::new();
10640        let err = e.execute("ALTER INDEX nope REBUILD").unwrap_err();
10641        assert!(
10642            matches!(
10643                &err,
10644                EngineError::Storage(StorageError::IndexNotFound { name }) if name == "nope"
10645            ),
10646            "got: {err}"
10647        );
10648    }
10649
10650    #[test]
10651    fn alter_index_rebuild_on_btree_index_errors() {
10652        // REBUILD on a B-tree index has no semantic meaning in
10653        // v6.0.4 — rejected at the storage layer with `Unsupported`.
10654        let mut e = Engine::new();
10655        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10656        e.execute("INSERT INTO t VALUES (1)").unwrap();
10657        e.execute("CREATE INDEX t_idx ON t (id)").unwrap();
10658        let err = e.execute("ALTER INDEX t_idx REBUILD").unwrap_err();
10659        assert!(
10660            matches!(&err, EngineError::Storage(StorageError::Unsupported(_))),
10661            "got: {err}"
10662        );
10663    }
10664
10665    #[test]
10666    fn prepared_insert_substitutes_placeholders() {
10667        // v6.1.1: prepare() parses once; execute_prepared() walks the
10668        // AST and replaces $1/$2 with the param Values BEFORE the
10669        // dispatch sees them. Same logical result as a simple-query
10670        // INSERT, but parse happens once per *statement*, not per
10671        // execution.
10672        let mut e = Engine::new();
10673        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT NOT NULL)")
10674            .unwrap();
10675        let stmt = e.prepare("INSERT INTO t VALUES ($1, $2)").unwrap();
10676        for (id, name) in [(1, "alice"), (2, "bob"), (3, "carol")] {
10677            e.execute_prepared(stmt.clone(), &[Value::Int(id), Value::Text(name.into())])
10678                .unwrap();
10679        }
10680        // Read back via simple-query SELECT.
10681        let rows_result = e.execute("SELECT id, name FROM t").unwrap();
10682        let QueryResult::Rows { rows, .. } = rows_result else {
10683            panic!("expected Rows")
10684        };
10685        assert_eq!(rows.len(), 3);
10686    }
10687
10688    #[test]
10689    fn prepared_select_with_placeholder_filters_rows() {
10690        let mut e = Engine::new();
10691        e.execute("CREATE TABLE t (id INT NOT NULL, v INT NOT NULL)")
10692            .unwrap();
10693        for i in 0..10_i32 {
10694            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, {})", i * 7))
10695                .unwrap();
10696        }
10697        let stmt = e.prepare("SELECT id FROM t WHERE v = $1").unwrap();
10698        let QueryResult::Rows { rows, .. } = e.execute_prepared(stmt, &[Value::Int(35)]).unwrap()
10699        else {
10700            panic!("expected Rows")
10701        };
10702        // v = 35 means i*7 = 35 → i = 5.
10703        assert_eq!(rows.len(), 1);
10704        assert_eq!(rows[0].values[0], Value::Int(5));
10705    }
10706
10707    #[test]
10708    fn prepared_too_few_params_errors() {
10709        let mut e = Engine::new();
10710        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10711        let stmt = e.prepare("INSERT INTO t VALUES ($1)").unwrap();
10712        let err = e.execute_prepared(stmt, &[]).unwrap_err();
10713        assert!(
10714            matches!(
10715                &err,
10716                EngineError::Eval(EvalError::PlaceholderOutOfRange { n: 1, bound: 0 })
10717            ),
10718            "got: {err}"
10719        );
10720    }
10721
10722    #[test]
10723    fn insert_into_half_column_dim_mismatch_errors() {
10724        let mut e = Engine::new();
10725        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
10726            .unwrap();
10727        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
10728        assert!(matches!(
10729            &err,
10730            EngineError::Storage(StorageError::TypeMismatch { .. })
10731        ));
10732    }
10733
10734    #[test]
10735    fn insert_into_sq8_column_dim_mismatch_errors() {
10736        // Dim mismatch falls through the `coerce_value` Vector→Sq8
10737        // arm's guard and surfaces as `TypeMismatch` — the same
10738        // error the F32 path produces today, so client error
10739        // handling stays uniform across encodings.
10740        let mut e = Engine::new();
10741        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
10742        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
10743        assert!(
10744            matches!(
10745                &err,
10746                EngineError::Storage(StorageError::TypeMismatch { .. })
10747            ),
10748            "got: {err}",
10749        );
10750    }
10751
10752    #[test]
10753    fn create_table_duplicate_errors() {
10754        let mut e = Engine::new();
10755        e.execute("CREATE TABLE foo (a INT)").unwrap();
10756        let err = e.execute("CREATE TABLE foo (a INT)").unwrap_err();
10757        assert!(matches!(
10758            err,
10759            EngineError::Storage(StorageError::DuplicateTable { ref name }) if name == "foo"
10760        ));
10761    }
10762
10763    #[test]
10764    fn insert_into_unknown_table_errors() {
10765        let mut e = Engine::new();
10766        let err = e.execute("INSERT INTO ghost VALUES (1)").unwrap_err();
10767        assert!(matches!(
10768            err,
10769            EngineError::Storage(StorageError::TableNotFound { ref name }) if name == "ghost"
10770        ));
10771    }
10772
10773    #[test]
10774    fn insert_happy_path_reports_one_affected() {
10775        let mut e = Engine::new();
10776        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
10777        let r = e.execute("INSERT INTO foo VALUES (42)").unwrap();
10778        assert_eq!(unwrap_command_ok(&r), 1);
10779        assert_eq!(e.catalog().get("foo").unwrap().row_count(), 1);
10780    }
10781
10782    #[test]
10783    fn insert_arity_mismatch_propagates() {
10784        let mut e = Engine::new();
10785        e.execute("CREATE TABLE foo (a INT, b TEXT)").unwrap();
10786        let err = e.execute("INSERT INTO foo VALUES (1)").unwrap_err();
10787        assert!(matches!(
10788            err,
10789            EngineError::Storage(StorageError::ArityMismatch { .. })
10790        ));
10791    }
10792
10793    #[test]
10794    fn insert_negative_integer_via_unary_minus() {
10795        let mut e = Engine::new();
10796        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
10797        e.execute("INSERT INTO foo VALUES (-7)").unwrap();
10798        let rows = e.catalog().get("foo").unwrap().rows();
10799        assert_eq!(rows[0].values[0], Value::Int(-7));
10800    }
10801
10802    #[test]
10803    fn insert_non_literal_expr_unsupported() {
10804        let mut e = Engine::new();
10805        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
10806        let err = e.execute("INSERT INTO foo VALUES (1 + 2)").unwrap_err();
10807        assert!(matches!(err, EngineError::Unsupported(_)));
10808    }
10809
10810    #[test]
10811    fn select_star_returns_all_rows_in_insertion_order() {
10812        let mut e = Engine::new();
10813        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT NOT NULL)")
10814            .unwrap();
10815        e.execute("INSERT INTO foo VALUES (1, 'one')").unwrap();
10816        e.execute("INSERT INTO foo VALUES (2, 'two')").unwrap();
10817        e.execute("INSERT INTO foo VALUES (3, 'three')").unwrap();
10818
10819        let r = e.execute("SELECT * FROM foo").unwrap();
10820        let QueryResult::Rows { columns, rows } = r else {
10821            panic!("expected Rows")
10822        };
10823        assert_eq!(columns.len(), 2);
10824        assert_eq!(columns[0].name, "a");
10825        assert_eq!(rows.len(), 3);
10826        assert_eq!(
10827            rows[1].values,
10828            vec![Value::Int(2), Value::Text("two".into())]
10829        );
10830    }
10831
10832    #[test]
10833    fn select_star_on_empty_table_returns_zero_rows() {
10834        let mut e = Engine::new();
10835        e.execute("CREATE TABLE foo (a INT)").unwrap();
10836        let r = e.execute("SELECT * FROM foo").unwrap();
10837        match r {
10838            QueryResult::Rows { rows, .. } => assert!(rows.is_empty()),
10839            QueryResult::CommandOk { .. } => panic!("expected Rows"),
10840        }
10841    }
10842
10843    // --- v0.4: WHERE + projection ------------------------------------------
10844
10845    fn make_three_row_users(e: &mut Engine) {
10846        e.execute("CREATE TABLE users (id INT NOT NULL, name TEXT NOT NULL, score INT)")
10847            .unwrap();
10848        e.execute("INSERT INTO users VALUES (1, 'alice', 90)")
10849            .unwrap();
10850        e.execute("INSERT INTO users VALUES (2, 'bob', NULL)")
10851            .unwrap();
10852        e.execute("INSERT INTO users VALUES (3, 'cara', 70)")
10853            .unwrap();
10854    }
10855
10856    fn unwrap_rows(r: QueryResult) -> (Vec<ColumnSchema>, Vec<Row>) {
10857        match r {
10858            QueryResult::Rows { columns, rows } => (columns, rows),
10859            QueryResult::CommandOk { .. } => panic!("expected Rows"),
10860        }
10861    }
10862
10863    #[test]
10864    fn where_filter_passes_only_true_rows() {
10865        let mut e = Engine::new();
10866        make_three_row_users(&mut e);
10867        let r = e.execute("SELECT * FROM users WHERE id > 1").unwrap();
10868        let (_, rows) = unwrap_rows(r);
10869        assert_eq!(rows.len(), 2);
10870        assert_eq!(rows[0].values[0], Value::Int(2));
10871        assert_eq!(rows[1].values[0], Value::Int(3));
10872    }
10873
10874    #[test]
10875    fn where_with_null_result_filters_out_row() {
10876        let mut e = Engine::new();
10877        make_three_row_users(&mut e);
10878        // score is NULL for bob → score > 80 is NULL → row excluded
10879        let r = e.execute("SELECT * FROM users WHERE score > 80").unwrap();
10880        let (_, rows) = unwrap_rows(r);
10881        assert_eq!(rows.len(), 1);
10882        assert_eq!(rows[0].values[1], Value::Text("alice".into()));
10883    }
10884
10885    #[test]
10886    fn projection_named_columns() {
10887        let mut e = Engine::new();
10888        make_three_row_users(&mut e);
10889        let r = e.execute("SELECT name, score FROM users").unwrap();
10890        let (cols, rows) = unwrap_rows(r);
10891        assert_eq!(cols.len(), 2);
10892        assert_eq!(cols[0].name, "name");
10893        assert_eq!(cols[1].name, "score");
10894        assert_eq!(rows.len(), 3);
10895        assert_eq!(
10896            rows[0].values,
10897            vec![Value::Text("alice".into()), Value::Int(90)]
10898        );
10899    }
10900
10901    #[test]
10902    fn projection_with_column_alias() {
10903        let mut e = Engine::new();
10904        make_three_row_users(&mut e);
10905        let r = e
10906            .execute("SELECT name AS who FROM users WHERE id = 1")
10907            .unwrap();
10908        let (cols, rows) = unwrap_rows(r);
10909        assert_eq!(cols[0].name, "who");
10910        assert_eq!(rows.len(), 1);
10911        assert_eq!(rows[0].values[0], Value::Text("alice".into()));
10912    }
10913
10914    #[test]
10915    fn qualified_column_with_table_alias_resolves() {
10916        let mut e = Engine::new();
10917        make_three_row_users(&mut e);
10918        let r = e
10919            .execute("SELECT u.id, u.name FROM users AS u WHERE u.id < 3")
10920            .unwrap();
10921        let (cols, rows) = unwrap_rows(r);
10922        assert_eq!(cols.len(), 2);
10923        assert_eq!(rows.len(), 2);
10924    }
10925
10926    #[test]
10927    fn qualified_column_with_wrong_alias_errors() {
10928        let mut e = Engine::new();
10929        make_three_row_users(&mut e);
10930        let err = e.execute("SELECT x.id FROM users AS u").unwrap_err();
10931        assert!(matches!(
10932            err,
10933            EngineError::Eval(EvalError::UnknownQualifier { ref qualifier }) if qualifier == "x"
10934        ));
10935    }
10936
10937    #[test]
10938    fn select_unknown_column_errors_in_projection() {
10939        let mut e = Engine::new();
10940        make_three_row_users(&mut e);
10941        let err = e.execute("SELECT ghost FROM users").unwrap_err();
10942        assert!(matches!(
10943            err,
10944            EngineError::Eval(EvalError::ColumnNotFound { ref name }) if name == "ghost"
10945        ));
10946    }
10947
10948    #[test]
10949    fn where_unknown_column_errors() {
10950        let mut e = Engine::new();
10951        make_three_row_users(&mut e);
10952        let err = e
10953            .execute("SELECT * FROM users WHERE ghost = 1")
10954            .unwrap_err();
10955        assert!(matches!(
10956            err,
10957            EngineError::Eval(EvalError::ColumnNotFound { .. })
10958        ));
10959    }
10960
10961    #[test]
10962    fn expression_projection_evaluates_and_renders() {
10963        // Compound expressions in the SELECT list are evaluated per row;
10964        // the output column is typed TEXT, name defaults to the expression.
10965        let mut e = Engine::new();
10966        e.execute("CREATE TABLE t (a INT NOT NULL)").unwrap();
10967        e.execute("INSERT INTO t VALUES (3)").unwrap();
10968        let (_, rows) = unwrap_rows(e.execute("SELECT 1 + 2 FROM t").unwrap());
10969        assert_eq!(rows.len(), 1);
10970        // The expression evaluates to integer 3; rendered as the cell value
10971        // (storage::Value::Int(3) since arithmetic kept ints).
10972        assert_eq!(rows[0].values[0], Value::Int(3));
10973    }
10974
10975    #[test]
10976    fn select_unknown_table_errors() {
10977        let mut e = Engine::new();
10978        let err = e.execute("SELECT * FROM ghost").unwrap_err();
10979        assert!(matches!(
10980            err,
10981            EngineError::Storage(StorageError::TableNotFound { .. })
10982        ));
10983    }
10984
10985    #[test]
10986    fn invalid_sql_returns_parse_error() {
10987        // v4.4: UPDATE is now real SQL, so use a true syntactic
10988        // garbage payload for the parse-error path.
10989        let mut e = Engine::new();
10990        let err = e.execute("THIS_IS_NOT_A_KEYWORD foo bar baz").unwrap_err();
10991        assert!(matches!(err, EngineError::Parse(_)));
10992    }
10993
10994    // --- v0.8 CREATE INDEX + index seek ------------------------------------
10995
10996    #[test]
10997    fn create_index_registers_on_table() {
10998        let mut e = Engine::new();
10999        make_three_row_users(&mut e);
11000        e.execute("CREATE INDEX by_name ON users (name)").unwrap();
11001        let t = e.catalog().get("users").unwrap();
11002        assert_eq!(t.indices().len(), 1);
11003        assert_eq!(t.indices()[0].name, "by_name");
11004    }
11005
11006    #[test]
11007    fn create_index_on_unknown_table_errors() {
11008        let mut e = Engine::new();
11009        let err = e.execute("CREATE INDEX i ON ghost (a)").unwrap_err();
11010        assert!(matches!(
11011            err,
11012            EngineError::Storage(StorageError::TableNotFound { .. })
11013        ));
11014    }
11015
11016    #[test]
11017    fn create_index_on_unknown_column_errors() {
11018        let mut e = Engine::new();
11019        make_three_row_users(&mut e);
11020        let err = e.execute("CREATE INDEX i ON users (ghost)").unwrap_err();
11021        assert!(matches!(
11022            err,
11023            EngineError::Storage(StorageError::ColumnNotFound { .. })
11024        ));
11025    }
11026
11027    #[test]
11028    fn select_eq_uses_index_returns_same_rows_as_scan() {
11029        // Build two engines: one with an index, one without. Same query →
11030        // same row set (index is a planner optimisation, not a semantic
11031        // change).
11032        let mut without = Engine::new();
11033        make_three_row_users(&mut without);
11034        let mut with = Engine::new();
11035        make_three_row_users(&mut with);
11036        with.execute("CREATE INDEX by_id ON users (id)").unwrap();
11037
11038        let q = "SELECT * FROM users WHERE id = 2";
11039        let (_, no_idx_rows) = unwrap_rows(without.execute(q).unwrap());
11040        let (_, idx_rows) = unwrap_rows(with.execute(q).unwrap());
11041        assert_eq!(no_idx_rows, idx_rows);
11042        assert_eq!(idx_rows.len(), 1);
11043    }
11044
11045    #[test]
11046    fn select_eq_with_no_matching_index_value_returns_empty() {
11047        let mut e = Engine::new();
11048        make_three_row_users(&mut e);
11049        e.execute("CREATE INDEX by_id ON users (id)").unwrap();
11050        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM users WHERE id = 999").unwrap());
11051        assert_eq!(rows.len(), 0);
11052    }
11053
11054    // --- v0.9 transactions -------------------------------------------------
11055
11056    #[test]
11057    fn begin_sets_in_transaction_flag() {
11058        let mut e = Engine::new();
11059        assert!(!e.in_transaction());
11060        e.execute("BEGIN").unwrap();
11061        assert!(e.in_transaction());
11062    }
11063
11064    #[test]
11065    fn double_begin_errors() {
11066        let mut e = Engine::new();
11067        e.execute("BEGIN").unwrap();
11068        let err = e.execute("BEGIN").unwrap_err();
11069        assert_eq!(err, EngineError::TransactionAlreadyOpen);
11070    }
11071
11072    #[test]
11073    fn commit_without_begin_errors() {
11074        let mut e = Engine::new();
11075        let err = e.execute("COMMIT").unwrap_err();
11076        assert_eq!(err, EngineError::NoActiveTransaction);
11077    }
11078
11079    #[test]
11080    fn rollback_without_begin_errors() {
11081        let mut e = Engine::new();
11082        let err = e.execute("ROLLBACK").unwrap_err();
11083        assert_eq!(err, EngineError::NoActiveTransaction);
11084    }
11085
11086    #[test]
11087    fn commit_applies_shadow_to_committed_catalog() {
11088        let mut e = Engine::new();
11089        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
11090        e.execute("BEGIN").unwrap();
11091        e.execute("INSERT INTO t VALUES (1)").unwrap();
11092        e.execute("INSERT INTO t VALUES (2)").unwrap();
11093        e.execute("COMMIT").unwrap();
11094        assert!(!e.in_transaction());
11095        assert_eq!(e.catalog().get("t").unwrap().row_count(), 2);
11096    }
11097
11098    #[test]
11099    fn rollback_discards_shadow() {
11100        let mut e = Engine::new();
11101        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
11102        e.execute("BEGIN").unwrap();
11103        e.execute("INSERT INTO t VALUES (1)").unwrap();
11104        e.execute("INSERT INTO t VALUES (2)").unwrap();
11105        e.execute("ROLLBACK").unwrap();
11106        assert!(!e.in_transaction());
11107        assert_eq!(e.catalog().get("t").unwrap().row_count(), 0);
11108    }
11109
11110    #[test]
11111    fn select_during_tx_sees_uncommitted_writes_own_session() {
11112        // The shadow catalog is read by SELECTs while a TX is open — the
11113        // session can see its own pending writes.
11114        let mut e = Engine::new();
11115        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
11116        e.execute("BEGIN").unwrap();
11117        e.execute("INSERT INTO t VALUES (42)").unwrap();
11118        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM t").unwrap());
11119        assert_eq!(rows.len(), 1);
11120        assert_eq!(rows[0].values[0], Value::Int(42));
11121    }
11122
11123    #[test]
11124    fn snapshot_with_no_users_is_bare_catalog_format() {
11125        let mut e = Engine::new();
11126        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
11127        let bytes = e.snapshot();
11128        assert_eq!(
11129            &bytes[..8],
11130            b"SPGDB001",
11131            "must be the bare v3.x catalog magic"
11132        );
11133        let e2 = Engine::restore_envelope(&bytes).unwrap();
11134        assert!(e2.users().is_empty());
11135        assert_eq!(e2.catalog().table_count(), 1);
11136    }
11137
11138    #[test]
11139    fn snapshot_with_users_round_trips_both_via_envelope() {
11140        let mut e = Engine::new();
11141        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
11142        e.create_user("alice", "pw1", Role::Admin, [9; 16]).unwrap();
11143        e.create_user("bob", "pw2", Role::ReadOnly, [5; 16])
11144            .unwrap();
11145        let bytes = e.snapshot();
11146        assert_eq!(&bytes[..8], b"SPGENV01", "must be the v4.1 envelope magic");
11147        let e2 = Engine::restore_envelope(&bytes).unwrap();
11148        assert_eq!(e2.users().len(), 2);
11149        assert_eq!(e2.verify_user("alice", "pw1"), Some(Role::Admin));
11150        assert_eq!(e2.verify_user("bob", "pw2"), Some(Role::ReadOnly));
11151        assert_eq!(e2.verify_user("alice", "wrong"), None);
11152        assert_eq!(e2.catalog().table_count(), 1);
11153    }
11154
11155    #[test]
11156    fn ddl_inside_tx_also_rolled_back() {
11157        let mut e = Engine::new();
11158        e.execute("BEGIN").unwrap();
11159        e.execute("CREATE TABLE t (v INT)").unwrap();
11160        // Visible inside the TX.
11161        e.execute("SELECT * FROM t").unwrap();
11162        e.execute("ROLLBACK").unwrap();
11163        // Gone after rollback.
11164        let err = e.execute("SELECT * FROM t").unwrap_err();
11165        assert!(matches!(
11166            err,
11167            EngineError::Storage(StorageError::TableNotFound { .. })
11168        ));
11169    }
11170
11171    // ── v6.1.2: CREATE / DROP PUBLICATION (engine-side) ──────
11172
11173    #[test]
11174    fn create_publication_lands_in_catalog() {
11175        let mut e = Engine::new();
11176        assert!(e.publications().is_empty());
11177        e.execute("CREATE PUBLICATION pub_a").unwrap();
11178        assert_eq!(e.publications().len(), 1);
11179        assert!(e.publications().contains("pub_a"));
11180    }
11181
11182    #[test]
11183    fn create_publication_duplicate_errors() {
11184        let mut e = Engine::new();
11185        e.execute("CREATE PUBLICATION pub_a").unwrap();
11186        let err = e.execute("CREATE PUBLICATION pub_a").unwrap_err();
11187        assert!(
11188            alloc::format!("{err:?}").contains("DuplicateName"),
11189            "got {err:?}"
11190        );
11191    }
11192
11193    #[test]
11194    fn drop_publication_silent_when_absent() {
11195        let mut e = Engine::new();
11196        // PG-compatible: DROP a publication that doesn't exist
11197        // succeeds (no-op) but reports zero affected.
11198        let r = e.execute("DROP PUBLICATION nope").unwrap();
11199        match r {
11200            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
11201            other => panic!("expected CommandOk, got {other:?}"),
11202        }
11203    }
11204
11205    #[test]
11206    fn drop_publication_present_reports_one_affected() {
11207        let mut e = Engine::new();
11208        e.execute("CREATE PUBLICATION pub_a").unwrap();
11209        let r = e.execute("DROP PUBLICATION pub_a").unwrap();
11210        match r {
11211            QueryResult::CommandOk {
11212                affected,
11213                modified_catalog,
11214            } => {
11215                assert_eq!(affected, 1);
11216                assert!(modified_catalog);
11217            }
11218            other => panic!("expected CommandOk, got {other:?}"),
11219        }
11220        assert!(e.publications().is_empty());
11221    }
11222
11223    #[test]
11224    fn publications_persist_across_snapshot_restore() {
11225        // The persist-across-restart ship-gate at the engine layer —
11226        // snapshot → restore_envelope round trip must preserve the
11227        // publication catalog. The spg-server e2e covers the
11228        // process-restart variant.
11229        let mut e = Engine::new();
11230        e.execute("CREATE PUBLICATION pub_a").unwrap();
11231        e.execute("CREATE PUBLICATION pub_b FOR ALL TABLES")
11232            .unwrap();
11233        let snap = e.snapshot();
11234        let e2 = Engine::restore_envelope(&snap).unwrap();
11235        assert_eq!(e2.publications().len(), 2);
11236        assert!(e2.publications().contains("pub_a"));
11237        assert!(e2.publications().contains("pub_b"));
11238    }
11239
11240    #[test]
11241    fn create_publication_allowed_inside_transaction() {
11242        // v6.1.4 dropped the v6.1.2 in-TX guard — PG allows
11243        // CREATE PUBLICATION inside a TX and the auto-commit
11244        // wrap path needs the same allowance.
11245        let mut e = Engine::new();
11246        e.execute("BEGIN").unwrap();
11247        e.execute("CREATE PUBLICATION pub_a").unwrap();
11248        e.execute("COMMIT").unwrap();
11249        assert!(e.publications().contains("pub_a"));
11250    }
11251
11252    // ── v6.1.3: SHOW PUBLICATIONS + FOR-list variants ───────
11253
11254    #[test]
11255    fn create_publication_for_table_list_lands_with_scope() {
11256        let mut e = Engine::new();
11257        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
11258        e.execute("CREATE TABLE t2 (id INT NOT NULL)").unwrap();
11259        e.execute("CREATE PUBLICATION pub_a FOR TABLE t1, t2")
11260            .unwrap();
11261        let scope = e.publications().get("pub_a").cloned();
11262        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = scope else {
11263            panic!("expected ForTables scope, got {scope:?}")
11264        };
11265        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
11266    }
11267
11268    #[test]
11269    fn create_publication_all_tables_except_lands_with_scope() {
11270        let mut e = Engine::new();
11271        e.execute("CREATE PUBLICATION pub_a FOR ALL TABLES EXCEPT t3")
11272            .unwrap();
11273        let scope = e.publications().get("pub_a").cloned();
11274        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = scope else {
11275            panic!("expected AllTablesExcept scope, got {scope:?}")
11276        };
11277        assert_eq!(ts, alloc::vec!["t3".to_string()]);
11278    }
11279
11280    #[test]
11281    fn show_publications_empty_returns_zero_rows() {
11282        let e = Engine::new();
11283        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
11284        let QueryResult::Rows { rows, columns } = r else {
11285            panic!()
11286        };
11287        assert!(rows.is_empty());
11288        assert_eq!(columns.len(), 3);
11289        assert_eq!(columns[0].name, "name");
11290        assert_eq!(columns[1].name, "scope");
11291        assert_eq!(columns[2].name, "table_count");
11292    }
11293
11294    #[test]
11295    fn show_publications_returns_one_row_per_publication_ordered_by_name() {
11296        let mut e = Engine::new();
11297        e.execute("CREATE PUBLICATION z_pub").unwrap();
11298        e.execute("CREATE PUBLICATION a_pub FOR TABLE t1, t2")
11299            .unwrap();
11300        e.execute("CREATE PUBLICATION m_pub FOR ALL TABLES EXCEPT bad")
11301            .unwrap();
11302        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
11303        let QueryResult::Rows { rows, .. } = r else {
11304            panic!()
11305        };
11306        assert_eq!(rows.len(), 3);
11307        // Alphabetical order: a_pub, m_pub, z_pub.
11308        let names: Vec<&str> = rows
11309            .iter()
11310            .map(|r| {
11311                if let Value::Text(s) = &r.values[0] {
11312                    s.as_str()
11313                } else {
11314                    panic!()
11315                }
11316            })
11317            .collect();
11318        assert_eq!(names, alloc::vec!["a_pub", "m_pub", "z_pub"]);
11319        // Row 0 — a_pub scope summary + table_count = 2.
11320        match &rows[0].values[1] {
11321            Value::Text(s) => assert_eq!(s, "FOR TABLE t1, t2"),
11322            other => panic!("expected Text, got {other:?}"),
11323        }
11324        assert_eq!(rows[0].values[2], Value::Int(2));
11325        // Row 1 — m_pub.
11326        match &rows[1].values[1] {
11327            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES EXCEPT bad"),
11328            other => panic!("expected Text, got {other:?}"),
11329        }
11330        assert_eq!(rows[1].values[2], Value::Int(1));
11331        // Row 2 — z_pub (AllTables → NULL count).
11332        match &rows[2].values[1] {
11333            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES"),
11334            other => panic!("expected Text, got {other:?}"),
11335        }
11336        assert_eq!(rows[2].values[2], Value::Null);
11337    }
11338
11339    #[test]
11340    fn for_list_scopes_persist_across_snapshot() {
11341        // The v6.1.2 envelope-v3 round-trip exercised AllTables;
11342        // v6.1.3 needs the scope-1 / scope-2 tags to survive too.
11343        let mut e = Engine::new();
11344        e.execute("CREATE PUBLICATION p1 FOR TABLE t1, t2").unwrap();
11345        e.execute("CREATE PUBLICATION p2 FOR ALL TABLES EXCEPT bad, worse")
11346            .unwrap();
11347        let snap = e.snapshot();
11348        let e2 = Engine::restore_envelope(&snap).unwrap();
11349        assert_eq!(e2.publications().len(), 2);
11350        let p1 = e2.publications().get("p1").cloned();
11351        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = p1 else {
11352            panic!("p1 scope lost: {p1:?}")
11353        };
11354        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
11355        let p2 = e2.publications().get("p2").cloned();
11356        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = p2 else {
11357            panic!("p2 scope lost: {p2:?}")
11358        };
11359        assert_eq!(ts, alloc::vec!["bad".to_string(), "worse".to_string()]);
11360    }
11361
11362    // ── v6.1.4: CREATE / DROP SUBSCRIPTION + SHOW + envelope v4 ─
11363
11364    #[test]
11365    fn create_subscription_lands_in_catalog_with_defaults() {
11366        let mut e = Engine::new();
11367        e.execute(
11368            "CREATE SUBSCRIPTION sub_a CONNECTION 'host=127.0.0.1 port=20002' PUBLICATION pub_a",
11369        )
11370        .unwrap();
11371        let s = e.subscriptions().get("sub_a").cloned().expect("present");
11372        assert_eq!(s.conn_str, "host=127.0.0.1 port=20002");
11373        assert_eq!(s.publications, alloc::vec!["pub_a".to_string()]);
11374        assert!(s.enabled);
11375        assert_eq!(s.last_received_pos, 0);
11376    }
11377
11378    #[test]
11379    fn create_subscription_duplicate_name_errors() {
11380        let mut e = Engine::new();
11381        e.execute("CREATE SUBSCRIPTION s CONNECTION 'host=x' PUBLICATION p")
11382            .unwrap();
11383        let err = e
11384            .execute("CREATE SUBSCRIPTION s CONNECTION 'host=y' PUBLICATION p")
11385            .unwrap_err();
11386        assert!(
11387            alloc::format!("{err:?}").contains("DuplicateName"),
11388            "got {err:?}"
11389        );
11390    }
11391
11392    #[test]
11393    fn drop_subscription_silent_when_absent() {
11394        let mut e = Engine::new();
11395        let r = e.execute("DROP SUBSCRIPTION never").unwrap();
11396        match r {
11397            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
11398            other => panic!("expected CommandOk, got {other:?}"),
11399        }
11400    }
11401
11402    #[test]
11403    fn subscription_advance_updates_last_pos_monotone() {
11404        let mut e = Engine::new();
11405        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
11406            .unwrap();
11407        assert!(e.subscription_advance("s", 100));
11408        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
11409        assert!(e.subscription_advance("s", 50)); // stale → ignored
11410        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
11411        assert!(e.subscription_advance("s", 200));
11412        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 200);
11413        assert!(!e.subscription_advance("missing", 1));
11414    }
11415
11416    #[test]
11417    fn show_subscriptions_returns_rows_ordered_by_name() {
11418        let mut e = Engine::new();
11419        e.execute("CREATE SUBSCRIPTION z_sub CONNECTION 'h=x' PUBLICATION p1, p2")
11420            .unwrap();
11421        e.execute("CREATE SUBSCRIPTION a_sub CONNECTION 'h=y' PUBLICATION p3")
11422            .unwrap();
11423        let r = e.execute_readonly("SHOW SUBSCRIPTIONS").unwrap();
11424        let QueryResult::Rows { rows, columns } = r else {
11425            panic!()
11426        };
11427        assert_eq!(rows.len(), 2);
11428        assert_eq!(columns.len(), 5);
11429        assert_eq!(columns[0].name, "name");
11430        assert_eq!(columns[4].name, "last_received_pos");
11431        // Alphabetical: a_sub, z_sub.
11432        let names: Vec<&str> = rows
11433            .iter()
11434            .map(|r| {
11435                if let Value::Text(s) = &r.values[0] {
11436                    s.as_str()
11437                } else {
11438                    panic!()
11439                }
11440            })
11441            .collect();
11442        assert_eq!(names, alloc::vec!["a_sub", "z_sub"]);
11443        // Row 0: a_sub
11444        assert_eq!(rows[0].values[1], Value::Text("h=y".to_string()));
11445        assert_eq!(rows[0].values[2], Value::Text("p3".to_string()));
11446        assert_eq!(rows[0].values[3], Value::Bool(true));
11447        assert_eq!(rows[0].values[4], Value::BigInt(0));
11448        // Row 1: z_sub — publications join with ", "
11449        assert_eq!(rows[1].values[2], Value::Text("p1, p2".to_string()));
11450    }
11451
11452    #[test]
11453    fn subscriptions_persist_across_snapshot_envelope_v4() {
11454        let mut e = Engine::new();
11455        e.execute("CREATE SUBSCRIPTION s1 CONNECTION 'h=A' PUBLICATION p1, p2")
11456            .unwrap();
11457        e.execute("CREATE SUBSCRIPTION s2 CONNECTION 'h=B' PUBLICATION p3")
11458            .unwrap();
11459        e.subscription_advance("s2", 42);
11460        let snap = e.snapshot();
11461        let e2 = Engine::restore_envelope(&snap).unwrap();
11462        assert_eq!(e2.subscriptions().len(), 2);
11463        let s1 = e2.subscriptions().get("s1").unwrap();
11464        assert_eq!(s1.conn_str, "h=A");
11465        assert_eq!(
11466            s1.publications,
11467            alloc::vec!["p1".to_string(), "p2".to_string()]
11468        );
11469        assert_eq!(s1.last_received_pos, 0);
11470        let s2 = e2.subscriptions().get("s2").unwrap();
11471        assert_eq!(s2.last_received_pos, 42);
11472    }
11473
11474    #[test]
11475    fn v3_envelope_loads_with_empty_subscriptions() {
11476        // v3 snapshot (publications-only). Forge it by hand so we
11477        // verify v6.1.4 readers don't panic — they must surface
11478        // empty subscriptions and a populated publication table.
11479        let mut e = Engine::new();
11480        e.execute("CREATE PUBLICATION pub_legacy").unwrap();
11481        let catalog = e.catalog.serialize();
11482        let users = crate::users::serialize_users(&e.users);
11483        let pubs = e.publications.serialize();
11484        let mut buf = Vec::new();
11485        buf.extend_from_slice(b"SPGENV01");
11486        buf.push(3u8); // v3
11487        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
11488        buf.extend_from_slice(&catalog);
11489        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
11490        buf.extend_from_slice(&users);
11491        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
11492        buf.extend_from_slice(&pubs);
11493        let crc = spg_crypto::crc32::crc32(&buf);
11494        buf.extend_from_slice(&crc.to_le_bytes());
11495
11496        let e2 = Engine::restore_envelope(&buf).expect("v3 envelope restores under v4 reader");
11497        assert!(e2.subscriptions().is_empty());
11498        assert!(e2.publications().contains("pub_legacy"));
11499    }
11500
11501    #[test]
11502    fn create_subscription_allowed_inside_transaction() {
11503        let mut e = Engine::new();
11504        e.execute("BEGIN").unwrap();
11505        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
11506            .unwrap();
11507        e.execute("COMMIT").unwrap();
11508        assert!(e.subscriptions().contains("s"));
11509    }
11510
11511    // ── v6.2.0: ANALYZE + spg_statistic + envelope v5 ──────────
11512    #[test]
11513    fn analyze_populates_histogram_bounds() {
11514        let mut e = Engine::new();
11515        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT)")
11516            .unwrap();
11517        for i in 0..50 {
11518            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, 'name{i}')"))
11519                .unwrap();
11520        }
11521        e.execute("ANALYZE t").unwrap();
11522        let stats = e.statistics();
11523        let id_stats = stats.get("t", "id").unwrap();
11524        assert!(id_stats.histogram_bounds.len() >= 2);
11525        assert_eq!(id_stats.histogram_bounds.first().unwrap(), "0");
11526        assert_eq!(id_stats.histogram_bounds.last().unwrap(), "49");
11527        assert!((id_stats.null_frac - 0.0).abs() < 1e-6);
11528        assert_eq!(id_stats.n_distinct, 50);
11529    }
11530
11531    #[test]
11532    fn reanalyze_overwrites_prior_stats() {
11533        let mut e = Engine::new();
11534        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
11535        for i in 0..10 {
11536            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})"))
11537                .unwrap();
11538        }
11539        e.execute("ANALYZE t").unwrap();
11540        let n1 = e.statistics().get("t", "id").unwrap().n_distinct;
11541        assert_eq!(n1, 10);
11542        for i in 10..30 {
11543            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})"))
11544                .unwrap();
11545        }
11546        e.execute("ANALYZE t").unwrap();
11547        let n2 = e.statistics().get("t", "id").unwrap().n_distinct;
11548        assert_eq!(n2, 30);
11549    }
11550
11551    #[test]
11552    fn analyze_unknown_table_errors() {
11553        let mut e = Engine::new();
11554        let err = e.execute("ANALYZE nonexistent").unwrap_err();
11555        assert!(matches!(
11556            err,
11557            EngineError::Storage(StorageError::TableNotFound { .. })
11558        ));
11559    }
11560
11561    #[test]
11562    fn bare_analyze_covers_all_user_tables() {
11563        let mut e = Engine::new();
11564        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
11565        e.execute("CREATE TABLE t2 (name TEXT NOT NULL)").unwrap();
11566        e.execute("INSERT INTO t1 VALUES (1)").unwrap();
11567        e.execute("INSERT INTO t2 VALUES ('alice')").unwrap();
11568        let r = e.execute("ANALYZE").unwrap();
11569        match r {
11570            QueryResult::CommandOk {
11571                affected,
11572                modified_catalog,
11573            } => {
11574                assert_eq!(affected, 2);
11575                assert!(modified_catalog);
11576            }
11577            other => panic!("expected CommandOk, got {other:?}"),
11578        }
11579        assert!(e.statistics().get("t1", "id").is_some());
11580        assert!(e.statistics().get("t2", "name").is_some());
11581    }
11582
11583    #[test]
11584    fn select_from_spg_statistic_returns_rows_per_column() {
11585        let mut e = Engine::new();
11586        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
11587            .unwrap();
11588        e.execute("INSERT INTO t VALUES (1, 'a')").unwrap();
11589        e.execute("INSERT INTO t VALUES (2, 'b')").unwrap();
11590        e.execute("ANALYZE t").unwrap();
11591        let r = e.execute_readonly("SELECT * FROM spg_statistic").unwrap();
11592        let QueryResult::Rows { rows, columns } = r else {
11593            panic!()
11594        };
11595        // v6.7.0 — spg_statistic gained a `cold_row_count` column.
11596        assert_eq!(columns.len(), 6);
11597        assert_eq!(columns[0].name, "table_name");
11598        assert_eq!(columns[4].name, "histogram_bounds");
11599        assert_eq!(columns[5].name, "cold_row_count");
11600        assert_eq!(rows.len(), 2, "one row per column of t");
11601        // Sorted by (table_name, column_name).
11602        match (&rows[0].values[0], &rows[0].values[1]) {
11603            (Value::Text(t), Value::Text(c)) => {
11604                assert_eq!(t, "t");
11605                // BTreeMap orders (table, column); columns "id" < "label".
11606                assert_eq!(c, "id");
11607            }
11608            _ => panic!(),
11609        }
11610    }
11611
11612    #[test]
11613    fn analyze_skips_vector_columns() {
11614        // Vector columns have their own stats shape (HNSW graph);
11615        // ANALYZE leaves them out of spg_statistic.
11616        let mut e = Engine::new();
11617        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
11618            .unwrap();
11619        e.execute("INSERT INTO t VALUES (1, [1, 2, 3])").unwrap();
11620        e.execute("ANALYZE t").unwrap();
11621        assert!(e.statistics().get("t", "id").is_some());
11622        assert!(e.statistics().get("t", "v").is_none());
11623    }
11624
11625    #[test]
11626    fn statistics_persist_across_envelope_v5_round_trip() {
11627        let mut e = Engine::new();
11628        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
11629        for i in 0..20 {
11630            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})"))
11631                .unwrap();
11632        }
11633        e.execute("ANALYZE").unwrap();
11634        let snap = e.snapshot();
11635        let e2 = Engine::restore_envelope(&snap).unwrap();
11636        let s = e2.statistics().get("t", "id").unwrap();
11637        assert_eq!(s.n_distinct, 20);
11638    }
11639
11640    // ── v6.2.1 auto-analyze threshold ───────────────────────────
11641
11642    #[test]
11643    fn auto_analyze_threshold_fires_after_10pct_of_min_rows_on_small_table() {
11644        // For a table with 0 rows then 10 inserts → modified=10,
11645        // row_count=10. Threshold = 0.1 × max(10, 100) = 10. So
11646        // after the 10th INSERT the threshold is met.
11647        let mut e = Engine::new();
11648        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
11649        for i in 0..9 {
11650            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})"))
11651                .unwrap();
11652        }
11653        assert!(e.tables_needing_analyze().is_empty(), "9 < threshold");
11654        e.execute("INSERT INTO t VALUES (9)").unwrap();
11655        let needs = e.tables_needing_analyze();
11656        assert_eq!(needs, alloc::vec!["t".to_string()]);
11657    }
11658
11659    #[test]
11660    fn auto_analyze_threshold_uses_10pct_of_row_count_for_large_tables() {
11661        // After ANALYZE on 1000 rows, threshold = 0.1 × row_count.
11662        // Each new INSERT bumps both modified and row_count, so to
11663        // trigger from N=1000 we need modifications ≥ 0.1 × (1000+M),
11664        // i.e. M ≥ 112. The test inserts 50 (no fire), then 150
11665        // more (200 total mods, row_count=1200, threshold=120 → fire).
11666        let mut e = Engine::new();
11667        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
11668        for i in 0..1000 {
11669            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})"))
11670                .unwrap();
11671        }
11672        e.execute("ANALYZE t").unwrap();
11673        assert!(e.tables_needing_analyze().is_empty(), "fresh ANALYZE");
11674        for i in 1000..1050 {
11675            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})"))
11676                .unwrap();
11677        }
11678        assert!(
11679            e.tables_needing_analyze().is_empty(),
11680            "50 inserts < threshold of ~105"
11681        );
11682        for i in 1050..1200 {
11683            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})"))
11684                .unwrap();
11685        }
11686        assert_eq!(
11687            e.tables_needing_analyze(),
11688            alloc::vec!["t".to_string()],
11689            "200 inserts > 0.1 × 1200 threshold"
11690        );
11691    }
11692
11693    #[test]
11694    fn auto_analyze_threshold_resets_after_analyze() {
11695        let mut e = Engine::new();
11696        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
11697        for i in 0..200 {
11698            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})"))
11699                .unwrap();
11700        }
11701        assert!(!e.tables_needing_analyze().is_empty());
11702        e.execute("ANALYZE").unwrap();
11703        assert!(
11704            e.tables_needing_analyze().is_empty(),
11705            "ANALYZE must reset the counter"
11706        );
11707    }
11708
11709    #[test]
11710    fn auto_analyze_threshold_tracks_updates_and_deletes() {
11711        let mut e = Engine::new();
11712        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
11713            .unwrap();
11714        for i in 0..50 {
11715            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, 'x')"))
11716                .unwrap();
11717        }
11718        e.execute("ANALYZE t").unwrap();
11719        // UPDATE 20 rows + DELETE 5 → modified=25. Threshold = 0.1
11720        // × max(50, 100) = 10. So 25 >= 10 → trigger.
11721        e.execute("UPDATE t SET label = 'y' WHERE id < 20").unwrap();
11722        e.execute("DELETE FROM t WHERE id >= 45").unwrap();
11723        assert_eq!(e.tables_needing_analyze(), alloc::vec!["t".to_string()]);
11724    }
11725
11726    #[test]
11727    fn v4_envelope_loads_with_empty_statistics() {
11728        // Forge a v4 envelope by hand: catalog + users + pubs +
11729        // subs trailer, no statistics. A v6.2.0 reader must accept
11730        // it and surface an empty Statistics.
11731        let mut e = Engine::new();
11732        e.create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
11733            .unwrap();
11734        let catalog = e.catalog.serialize();
11735        let users = crate::users::serialize_users(&e.users);
11736        let pubs = e.publications.serialize();
11737        let subs = e.subscriptions.serialize();
11738        let mut buf = Vec::new();
11739        buf.extend_from_slice(b"SPGENV01");
11740        buf.push(4u8);
11741        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
11742        buf.extend_from_slice(&catalog);
11743        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
11744        buf.extend_from_slice(&users);
11745        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
11746        buf.extend_from_slice(&pubs);
11747        buf.extend_from_slice(&u32::try_from(subs.len()).unwrap().to_le_bytes());
11748        buf.extend_from_slice(&subs);
11749        let crc = spg_crypto::crc32::crc32(&buf);
11750        buf.extend_from_slice(&crc.to_le_bytes());
11751        let e2 = Engine::restore_envelope(&buf).expect("v4 envelope restores");
11752        assert!(e2.statistics().is_empty());
11753    }
11754
11755    #[test]
11756    fn v1_v2_envelope_loads_with_empty_publications() {
11757        // A snapshot taken before v6.1.2 (no publication trailer,
11758        // envelope v2) must still deserialise — and the resulting
11759        // engine must report zero publications. Use the engine's own
11760        // round-trip with no publications: that emits v3 but with an
11761        // empty pubs block. Then forge a v2 envelope by hand to lock
11762        // the back-compat path.
11763        let mut e = Engine::new();
11764        // Force users to be non-empty so the snapshot takes the
11765        // envelope path rather than the bare-catalog fallback.
11766        e.create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
11767            .unwrap();
11768
11769        // Forge an envelope v2: same shape as v3 but no pubs trailer.
11770        let catalog = e.catalog.serialize();
11771        let users = crate::users::serialize_users(&e.users);
11772        let mut buf = Vec::new();
11773        buf.extend_from_slice(b"SPGENV01");
11774        buf.push(2u8); // v2
11775        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
11776        buf.extend_from_slice(&catalog);
11777        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
11778        buf.extend_from_slice(&users);
11779        let crc = spg_crypto::crc32::crc32(&buf);
11780        buf.extend_from_slice(&crc.to_le_bytes());
11781
11782        let e2 = Engine::restore_envelope(&buf).expect("v2 envelope restores");
11783        assert!(e2.publications().is_empty());
11784    }
11785}