Skip to main content

spg_engine/
lib.rs

1//! SPG execution engine — v0.3 wires the SQL front-end to the in-memory
2//! storage layer. Implements `CREATE TABLE`, single-row `INSERT VALUES`, and
3//! `SELECT * FROM <table>` (no WHERE yet — that lands in v0.4 alongside
4//! expression evaluation against rows).
5#![no_std]
6
7extern crate alloc;
8
9pub mod aggregate;
10pub mod describe;
11pub mod eval;
12pub mod json;
13pub mod memoize;
14pub mod plan_cache;
15pub mod publications;
16pub mod query_stats;
17pub mod reorder;
18pub mod selectivity;
19pub mod statistics;
20pub mod subscriptions;
21pub mod users;
22
23pub use crate::users::{Role, ScramSecrets, UserError, UserStore};
24
25use alloc::borrow::Cow;
26use alloc::boxed::Box;
27use alloc::collections::BTreeMap;
28use alloc::string::{String, ToString};
29use alloc::vec::Vec;
30use core::fmt;
31
32use spg_sql::ast::{
33    BinOp, ColumnDef, ColumnName, ColumnTypeName, CreateIndexStatement,
34    CreatePublicationStatement, CreateSubscriptionStatement, CreateTableStatement,
35    CreateUserStatement, Expr, FrameBound, FrameKind, FromClause, IndexMethod, InsertStatement,
36    JoinKind, Literal, OrderBy, SelectItem, SelectStatement, Statement, UnOp, UnionKind,
37    VecEncoding as SqlVecEncoding, WindowFrame,
38};
39use spg_sql::parser::{self, ParseError};
40use spg_storage::{
41    Catalog, ColumnSchema, CompactReport, DataType, IndexKey, IndexKind, Row, StorageError, Table,
42    TableSchema, Value, VecEncoding,
43};
44
45use crate::eval::{EvalContext, EvalError};
46
/// Result of executing one statement.
///
/// The enum is `#[non_exhaustive]`, so a `match` outside this crate needs
/// a wildcard arm.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum QueryResult {
    /// DDL or DML succeeded.
    ///
    /// `affected` is the row count for `INSERT` and 0 elsewhere.
    /// `modified_catalog` tells the server whether this statement
    /// caused the *committed* catalog to change — it's the signal to
    /// snapshot/audit. False for `BEGIN`/`ROLLBACK`, false for writeful
    /// statements executed inside a transaction (those only touch the
    /// shadow), and true for `COMMIT` and for writes outside a TX.
    CommandOk {
        affected: usize,
        modified_catalog: bool,
    },
    /// `SELECT` returned a (possibly empty) row set.
    ///
    /// `columns` holds the schema entries of the result and `rows` the
    /// result rows. NOTE(review): presumably every row carries one value
    /// per entry in `columns`, in the same order — confirm against the
    /// executor.
    Rows {
        columns: Vec<ColumnSchema>,
        rows: Vec<Row>,
    },
}
69
/// All errors the engine can return.
///
/// Marked `#[non_exhaustive]` from v7.5.0 onward: external `match`
/// must include a `_` arm so new variants in subsequent v7.x releases
/// are not breaking changes.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum EngineError {
    /// Error reported by the SQL front-end (`spg_sql::parser::ParseError`).
    /// Produced by the `From<ParseError>` conversion, so `?` works directly.
    Parse(ParseError),
    /// Error reported by the storage layer (`spg_storage::StorageError`).
    /// Produced by the `From<StorageError>` conversion.
    Storage(StorageError),
    /// Error reported by the expression evaluator (`crate::eval::EvalError`).
    /// Produced by the `From<EvalError>` conversion.
    Eval(EvalError),
    /// Front-end accepted a construct that the v0.x executor doesn't support.
    Unsupported(String),
    /// `BEGIN` while another transaction is already open.
    TransactionAlreadyOpen,
    /// `COMMIT` / `ROLLBACK` with no active transaction.
    NoActiveTransaction,
    /// v4.0 sentinel: `execute_readonly` got a statement that
    /// mutates engine state (INSERT / CREATE / BEGIN / COMMIT / …).
    /// The caller should retake the write lock and dispatch through
    /// `execute(&mut self)` instead.
    WriteRequired,
    /// v4.2: a SELECT would have returned more rows than the
    /// configured `max_query_rows` cap. Carries the cap.
    RowLimitExceeded(usize),
    /// v4.5: cooperative cancellation — the host (server's
    /// per-query watchdog) set the cancel flag while a long-running
    /// SELECT / UPDATE / DELETE was scanning rows. The partial work
    /// is discarded; the caller should surface this as a timeout
    /// to the client.
    Cancelled,
}
102
103impl fmt::Display for EngineError {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        match self {
106            Self::Parse(e) => write!(f, "parse: {e}"),
107            Self::Storage(e) => write!(f, "storage: {e}"),
108            Self::Eval(e) => write!(f, "eval: {e}"),
109            Self::Unsupported(s) => write!(f, "unsupported: {s}"),
110            Self::TransactionAlreadyOpen => f.write_str("a transaction is already open"),
111            Self::NoActiveTransaction => f.write_str("no active transaction"),
112            Self::WriteRequired => {
113                f.write_str("statement requires a write lock (use execute, not execute_readonly)")
114            }
115            Self::RowLimitExceeded(n) => {
116                write!(f, "query exceeded max_query_rows={n}")
117            }
118            Self::Cancelled => f.write_str("query cancelled (timeout or client request)"),
119        }
120    }
121}
122
123impl From<ParseError> for EngineError {
124    fn from(e: ParseError) -> Self {
125        Self::Parse(e)
126    }
127}
128impl From<StorageError> for EngineError {
129    fn from(e: StorageError) -> Self {
130        Self::Storage(e)
131    }
132}
133impl From<EvalError> for EngineError {
134    fn from(e: EvalError) -> Self {
135        Self::Eval(e)
136    }
137}
138
// NOTE: the paragraph below describes `Engine` (declared further down in
// this file), not `ClockFn`. It is kept as a plain comment so rustdoc does
// not attach it to the type alias.
//
// The execution engine. Holds the catalog and (later) other server-scope
// state. `Engine::new()` is intentionally cheap so callers can construct one
// per database, per test.

/// Function pointer that returns "now" as microseconds since Unix
/// epoch. The engine is `no_std`, so it can't reach for `std::time`
/// itself — callers (`spg-server`, the sqllogictest runner) inject a
/// concrete implementation. The engine stores it as an
/// `Option<ClockFn>`; `None` means `NOW()` / `CURRENT_*` raise
/// `Unsupported`.
pub type ClockFn = fn() -> i64;
148
/// Function pointer that produces 16 cryptographically random bytes.
///
/// Like `ClockFn`, the engine is `no_std` and can't reach for /dev/urandom
/// itself — the host (`spg-server`) injects an OS-backed source. The engine
/// stores it as an `Option<SaltFn>`; `None` means SQL-driven `CREATE USER`
/// falls back to a deterministic salt derived from the username
/// (acceptable in tests; the server always installs a real RNG so
/// production paths never see this).
pub type SaltFn = fn() -> [u8; 16];
156
157/// v4.5 cooperative cancellation token. A long-running SELECT /
158/// UPDATE / DELETE checks `is_cancelled` at row-loop checkpoints
159/// and bails with `EngineError::Cancelled`. The host
160/// (`spg-server`) creates an `AtomicBool` per query, spawns a
161/// watchdog thread that sets it after `SPG_QUERY_TIMEOUT_MS`,
162/// and passes it via `execute_with_cancel` / `execute_readonly_with_cancel`.
163///
164/// `CancelToken::none()` is a no-op — used by the legacy `execute`
165/// and `execute_readonly` entry points so existing callers don't
166/// change.
167#[derive(Debug, Clone, Copy)]
168pub struct CancelToken<'a> {
169    flag: Option<&'a core::sync::atomic::AtomicBool>,
170}
171
172impl<'a> CancelToken<'a> {
173    #[must_use]
174    pub const fn none() -> Self {
175        Self { flag: None }
176    }
177
178    #[must_use]
179    pub const fn from_flag(f: &'a core::sync::atomic::AtomicBool) -> Self {
180        Self { flag: Some(f) }
181    }
182
183    #[must_use]
184    pub fn is_cancelled(self) -> bool {
185        self.flag
186            .is_some_and(|f| f.load(core::sync::atomic::Ordering::Relaxed))
187    }
188
189    /// Returns `Err(Cancelled)` if the token has been tripped.
190    /// Used at row-loop checkpoints to bail cooperatively without
191    /// scattering raw `is_cancelled` checks across the executor.
192    #[inline]
193    pub fn check(self) -> Result<(), EngineError> {
194        if self.is_cancelled() {
195            Err(EngineError::Cancelled)
196        } else {
197            Ok(())
198        }
199    }
200}
201
// ---- snapshot envelope (v4.1, extended with CRC32 in v4.37,      ----
// ----   publications in v6.1.2 v3, subscriptions in v6.1.4 v4,    ----
// ----   statistics in v6.2.0 v5)                                  ----
204//
205// Wraps a catalog blob + a user blob behind a small header so the
206// server can persist both atomically without inventing a new file.
207// Bare catalog blobs (v3.x) still load via `restore_envelope` since
208// the magic check fails fast and the function falls back to
209// `Catalog::deserialize`.
210//
211// Layout — v1 (v4.1, no CRC):
212//   [8 bytes magic "SPGENV01"]
213//   [u8 version = 1]
214//   [u32 catalog_len][catalog bytes]
215//   [u32 users_len][users bytes]
216//
217// Layout — v2 (v4.37, CRC32 of body):
218//   [8 bytes magic "SPGENV01"]
219//   [u8 version = 2]
220//   [u32 catalog_len][catalog bytes]
221//   [u32 users_len][users bytes]
222//   [u32 crc32]                      ← CRC32 of every byte before it.
223//
224// Layout — v3 (v6.1.2, publications trailer):
225//   [8 bytes magic "SPGENV01"]
226//   [u8 version = 3]
227//   [u32 catalog_len][catalog bytes]
228//   [u32 users_len][users bytes]
229//   [u32 pubs_len][publications bytes]
230//   [u32 crc32]
231//
232// Layout — v4 (v6.1.4, subscriptions trailer):
233//   [8 bytes magic "SPGENV01"]
234//   [u8 version = 4]
235//   [u32 catalog_len][catalog bytes]
236//   [u32 users_len][users bytes]
237//   [u32 pubs_len][publications bytes]
238//   [u32 subs_len][subscriptions bytes]
239//   [u32 crc32]
240//
241// Layout — v5 (v6.2.0, statistics trailer):
242//   [8 bytes magic "SPGENV01"]
243//   [u8 version = 5]
244//   [u32 catalog_len][catalog bytes]
245//   [u32 users_len][users bytes]
246//   [u32 pubs_len][publications bytes]
247//   [u32 subs_len][subscriptions bytes]
248//   [u32 stats_len][statistics bytes]      ← NEW
249//   [u32 crc32]
250//
251// Writers emit v5 from v6.2.0 on. Readers accept all of {v1, v2,
252// v3, v4, v5}: v1/v2 load with empty publications / subscriptions /
253// statistics; v3 loads with empty subscriptions + statistics; v4
254// loads with empty statistics; v5 deserialises all three. Older
255// SPG versions reading a v5 envelope fall through the version
256// match to `EnvelopeParse::Bare` — pre-v6.2.0 binaries cannot
257// open v6.2.0+ snapshots (matches the v6.1.2 / v6.1.4 breaks).
258
/// Eight-byte magic that opens every snapshot envelope.
const ENVELOPE_MAGIC: &[u8; 8] = b"SPGENV01";
/// Envelope v1 (v4.1): catalog + users sections, no CRC.
const ENVELOPE_VERSION_V1: u8 = 1;
/// Envelope v2 (v4.37): v1 plus a trailing CRC32 of the body.
const ENVELOPE_VERSION_V2: u8 = 2;
/// Envelope v3 (v6.1.2): adds the publications section before the CRC.
const ENVELOPE_VERSION_V3: u8 = 3;
/// Envelope v4 (v6.1.4): adds the subscriptions section before the CRC.
const ENVELOPE_VERSION_V4: u8 = 4;
/// Envelope v5 (v6.2.0): adds the statistics section before the CRC.
/// This is the version `build_envelope` writes.
const ENVELOPE_VERSION_V5: u8 = 5;
265
266fn build_envelope(
267    catalog: &[u8],
268    users: &[u8],
269    pubs: &[u8],
270    subs: &[u8],
271    stats: &[u8],
272) -> Vec<u8> {
273    let mut out = Vec::with_capacity(
274        8 + 1
275            + 4
276            + catalog.len()
277            + 4
278            + users.len()
279            + 4
280            + pubs.len()
281            + 4
282            + subs.len()
283            + 4
284            + stats.len()
285            + 4,
286    );
287    out.extend_from_slice(ENVELOPE_MAGIC);
288    out.push(ENVELOPE_VERSION_V5);
289    out.extend_from_slice(
290        &u32::try_from(catalog.len())
291            .expect("≤ 4G catalog")
292            .to_le_bytes(),
293    );
294    out.extend_from_slice(catalog);
295    out.extend_from_slice(
296        &u32::try_from(users.len())
297            .expect("≤ 4G users")
298            .to_le_bytes(),
299    );
300    out.extend_from_slice(users);
301    out.extend_from_slice(
302        &u32::try_from(pubs.len())
303            .expect("≤ 4G publications")
304            .to_le_bytes(),
305    );
306    out.extend_from_slice(pubs);
307    out.extend_from_slice(
308        &u32::try_from(subs.len())
309            .expect("≤ 4G subscriptions")
310            .to_le_bytes(),
311    );
312    out.extend_from_slice(subs);
313    out.extend_from_slice(
314        &u32::try_from(stats.len())
315            .expect("≤ 4G statistics")
316            .to_le_bytes(),
317    );
318    out.extend_from_slice(stats);
319    let crc = spg_crypto::crc32::crc32(&out);
320    out.extend_from_slice(&crc.to_le_bytes());
321    out
322}
323
/// Outcome of envelope parsing: a bare-catalog fallback, the sections
/// split out of a v1–v5 envelope, or an explicit corruption error from
/// a CRC mismatch (v2 and later carry a CRC).
enum EnvelopeParse<'a> {
    /// The buffer is not a recognisable envelope. The caller falls back
    /// to treating the whole buffer as a bare catalog blob, which
    /// preserves v3.x readability.
    Bare,
    /// Sections of a well-formed envelope, borrowed from the input
    /// buffer. `catalog` and `users` exist in every envelope version;
    /// the optional sections are `None` when the envelope version
    /// predates them.
    Pair {
        catalog: &'a [u8],
        users: &'a [u8],
        /// `Some` for v3, v4 and v5 envelopes.
        publications: Option<&'a [u8]>,
        /// `Some` for v4 and v5 envelopes.
        subscriptions: Option<&'a [u8]>,
        /// `Some` for v5 envelopes.
        statistics: Option<&'a [u8]>,
    },
    /// The envelope was structurally valid but the stored CRC32
    /// (`expected`) differs from the CRC32 recomputed over the body
    /// (`computed`).
    CrcMismatch {
        expected: u32,
        computed: u32,
    },
}
344
345/// Returns `EnvelopeParse::Pair` for a valid v1 / v2 / v3 envelope,
346/// `Bare` for a buffer that doesn't look like an envelope (v3.x
347/// bare catalog fallback), and `CrcMismatch` for a v2/v3 envelope
348/// whose trailing CRC32 doesn't match the body.
349fn split_envelope(buf: &[u8]) -> EnvelopeParse<'_> {
350    if buf.len() < 8 + 1 + 4 || &buf[..8] != ENVELOPE_MAGIC {
351        return EnvelopeParse::Bare;
352    }
353    let version = buf[8];
354    if !matches!(
355        version,
356        ENVELOPE_VERSION_V1
357            | ENVELOPE_VERSION_V2
358            | ENVELOPE_VERSION_V3
359            | ENVELOPE_VERSION_V4
360            | ENVELOPE_VERSION_V5
361    ) {
362        return EnvelopeParse::Bare;
363    }
364    let mut p = 9usize;
365    let Some(cat_len_bytes) = buf.get(p..p + 4) else {
366        return EnvelopeParse::Bare;
367    };
368    let Ok(cat_len_arr) = cat_len_bytes.try_into() else {
369        return EnvelopeParse::Bare;
370    };
371    let cat_len = u32::from_le_bytes(cat_len_arr) as usize;
372    p += 4;
373    if p + cat_len + 4 > buf.len() {
374        return EnvelopeParse::Bare;
375    }
376    let catalog = &buf[p..p + cat_len];
377    p += cat_len;
378    let Some(user_len_bytes) = buf.get(p..p + 4) else {
379        return EnvelopeParse::Bare;
380    };
381    let Ok(user_len_arr) = user_len_bytes.try_into() else {
382        return EnvelopeParse::Bare;
383    };
384    let user_len = u32::from_le_bytes(user_len_arr) as usize;
385    p += 4;
386    if p + user_len > buf.len() {
387        return EnvelopeParse::Bare;
388    }
389    let users = &buf[p..p + user_len];
390    p += user_len;
391    let publications = if matches!(
392        version,
393        ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
394    ) {
395        // [u32 pubs_len][publications bytes]
396        let Some(pubs_len_bytes) = buf.get(p..p + 4) else {
397            return EnvelopeParse::Bare;
398        };
399        let Ok(pubs_len_arr) = pubs_len_bytes.try_into() else {
400            return EnvelopeParse::Bare;
401        };
402        let pubs_len = u32::from_le_bytes(pubs_len_arr) as usize;
403        p += 4;
404        if p + pubs_len > buf.len() {
405            return EnvelopeParse::Bare;
406        }
407        let pubs_slice = &buf[p..p + pubs_len];
408        p += pubs_len;
409        Some(pubs_slice)
410    } else {
411        None
412    };
413    let subscriptions = if matches!(version, ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5) {
414        // [u32 subs_len][subscriptions bytes]
415        let Some(subs_len_bytes) = buf.get(p..p + 4) else {
416            return EnvelopeParse::Bare;
417        };
418        let Ok(subs_len_arr) = subs_len_bytes.try_into() else {
419            return EnvelopeParse::Bare;
420        };
421        let subs_len = u32::from_le_bytes(subs_len_arr) as usize;
422        p += 4;
423        if p + subs_len > buf.len() {
424            return EnvelopeParse::Bare;
425        }
426        let subs_slice = &buf[p..p + subs_len];
427        p += subs_len;
428        Some(subs_slice)
429    } else {
430        None
431    };
432    let statistics = if version == ENVELOPE_VERSION_V5 {
433        // [u32 stats_len][statistics bytes]
434        let Some(stats_len_bytes) = buf.get(p..p + 4) else {
435            return EnvelopeParse::Bare;
436        };
437        let Ok(stats_len_arr) = stats_len_bytes.try_into() else {
438            return EnvelopeParse::Bare;
439        };
440        let stats_len = u32::from_le_bytes(stats_len_arr) as usize;
441        p += 4;
442        if p + stats_len > buf.len() {
443            return EnvelopeParse::Bare;
444        }
445        let stats_slice = &buf[p..p + stats_len];
446        p += stats_len;
447        Some(stats_slice)
448    } else {
449        None
450    };
451    if matches!(
452        version,
453        ENVELOPE_VERSION_V2 | ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
454    ) {
455        if p + 4 != buf.len() {
456            return EnvelopeParse::Bare;
457        }
458        let Ok(crc_arr) = buf[p..p + 4].try_into() else {
459            return EnvelopeParse::Bare;
460        };
461        let expected = u32::from_le_bytes(crc_arr);
462        let computed = spg_crypto::crc32::crc32(&buf[..p]);
463        if expected != computed {
464            return EnvelopeParse::CrcMismatch { expected, computed };
465        }
466    } else if p != buf.len() {
467        // v1: must end exactly at the users section.
468        return EnvelopeParse::Bare;
469    }
470    EnvelopeParse::Pair {
471        catalog,
472        users,
473        publications,
474        subscriptions,
475        statistics,
476    }
477}
478
/// v4.41.1 opaque transaction handle.
///
/// Returned by `Engine::alloc_tx_id` and threaded through
/// `Engine::execute_in` so dispatch can identify which in-flight TX a
/// statement belongs to. `IMPLICIT_TX` is the reserved slot every legacy
/// caller — engine self-tests, spg-cli, spg-embedded, startup replay —
/// implicitly uses through the unchanged `Engine::execute(sql)` API.
///
/// v4.41.1 keeps at most one active slot at runtime (dispatch holds
/// `engine.write()` across the wrap, same as v4.34); the map shape is
/// here to let v4.42 turn on N in-flight implicit TXs without
/// reshuffling the engine internals.
///
/// The derived `Ord` is what lets the handle key the engine's
/// `BTreeMap<TxId, TxState>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TxId(pub u64);

/// Reserved slot used by `Engine::execute(sql)` — the legacy single-
/// global-shadow path. New `alloc_tx_id` handles start at 1, so they
/// never collide with this value.
pub const IMPLICIT_TX: TxId = TxId(0);
494
/// v6.7.3 — default segment-size threshold used by `COMPACT COLD
/// SEGMENTS` when no explicit target is supplied: 4 MiB
/// (`4 * 1024 * 1024` bytes). Segments whose
/// `OwnedSegment::bytes().len()` is **strictly** less than this
/// value are eligible to merge. spg-server reads
/// `SPG_COMPACTION_TARGET_SEGMENT_BYTES` to override.
pub const COMPACTION_TARGET_DEFAULT_BYTES: u64 = 4 * 1024 * 1024;
501
/// Per-slot transaction state: a shadow catalog plus its savepoint stack.
///
/// Held inside `tx_catalogs[tx_id]` for the lifetime of a BEGIN..COMMIT
/// (or BEGIN..ROLLBACK) window. Drops when the TX commits (its `catalog`
/// is moved over `Engine.catalog`) or rolls back (slot removed, catalog
/// discarded).
#[derive(Debug, Default, Clone)]
struct TxState {
    /// The TX's shadow copy of the catalog. Started as a clone of
    /// `Engine.catalog` at BEGIN time; writes flow into it; COMMIT
    /// installs it over `Engine.catalog`. `Catalog::clone()` is O(1)
    /// since v4.40 (`PersistentVec` rows + `PersistentBTreeMap` indices).
    catalog: Catalog,
    /// Per-TX savepoint stack, oldest first. Each entry pairs the
    /// savepoint name with a clone of `catalog` at the moment
    /// `SAVEPOINT <name>` fired. `ROLLBACK TO <name>` restores from the
    /// entry and pops everything after it; `RELEASE <name>` discards the
    /// entry and everything after; COMMIT/ROLLBACK clears the whole stack.
    savepoints: Vec<(String, Catalog)>,
}
520
521#[derive(Debug, Default)]
522pub struct Engine {
523    /// Committed catalog — what survives `Engine::snapshot()` and what
524    /// outside-TX `SELECT`s read.
525    catalog: Catalog,
526    /// Active TX slots, keyed by `TxId`. Empty when no TX is in flight.
527    /// v4.41.1 runtime invariant: at most one entry (single-writer
528    /// model unchanged). v4.42 will let dispatch hold multiple entries
529    /// concurrently for group commit + engine MVCC.
530    tx_catalogs: BTreeMap<TxId, TxState>,
531    /// Which slot the next exec_* call should mutate. Set by
532    /// `execute_in(sql, tx_id)` at the entry point; legacy `execute(sql)`
533    /// sets it to `IMPLICIT_TX`. None when no TX is in flight (read /
534    /// write goes straight against `catalog`).
535    current_tx: Option<TxId>,
536    /// Monotonic counter for `alloc_tx_id`. Starts at 1 — slot 0 is
537    /// reserved for `IMPLICIT_TX`.
538    next_tx_id: u64,
539    /// Optional wall clock used to satisfy `NOW()` / `CURRENT_TIMESTAMP`
540    /// / `CURRENT_DATE`. Set by the host environment.
541    clock: Option<ClockFn>,
542    /// v4.1 cryptographic RNG for per-user password salt. Set by the
543    /// host. `None` means SQL-driven `CREATE USER` uses a
544    /// deterministic fallback — see `SaltFn`.
545    salt_fn: Option<SaltFn>,
546    /// v4.2 per-query row cap. `None` = unlimited. When set, a
547    /// SELECT that materialises more than `n` rows returns
548    /// `EngineError::RowLimitExceeded`. Enforced before the result
549    /// is shaped into wire frames so a runaway scan can't blow the
550    /// server's heap.
551    max_query_rows: Option<usize>,
552    /// v4.1 RBAC user table. Empty means "no RBAC configured yet" —
553    /// the server decides what that means at the auth boundary
554    /// (open mode vs legacy single-password mode). User CRUD goes
555    /// through `create_user`/`drop_user`/`verify_user`; persistence
556    /// rides the snapshot envelope alongside the catalog.
557    users: UserStore,
558    /// v6.1.2 logical-replication publication catalog. Empty until
559    /// `CREATE PUBLICATION` runs. Persistence rides the v3 envelope
560    /// trailer (see `build_envelope`).
561    publications: publications::Publications,
562    /// v6.1.4 logical-replication subscription catalog. Empty until
563    /// `CREATE SUBSCRIPTION` runs. Persistence rides the v4 envelope
564    /// trailer.
565    subscriptions: subscriptions::Subscriptions,
566    /// v6.2.0 — per-column statistics for the cost-based optimizer.
567    /// Populated by `ANALYZE`; queried via `spg_statistic` virtual
568    /// table. Persistence rides the v5 envelope trailer.
569    statistics: statistics::Statistics,
570    /// v6.3.0 — engine-level plan cache. Caches the post-`prepare()`
571    /// `Statement` keyed on SQL text. In-memory only — does NOT ride
572    /// the snapshot envelope (rebuilt on demand after restart).
573    plan_cache: plan_cache::PlanCache,
574    /// v6.5.1 — per-distinct-SQL execution stats. In-memory only,
575    /// surfaced via `spg_stat_query` virtual table. Updated by the
576    /// `execute_*` paths after a successful execute.
577    query_stats: query_stats::QueryStats,
578    /// v6.5.2 — connection-state provider callback. spg-server
579    /// registers a function at startup that snapshots its
580    /// per-pgwire-connection registry into `ActivityRow`s; engine
581    /// reads through it on every `SELECT * FROM spg_stat_activity`.
582    /// `None` ⇒ no-data (returns empty rows; matches the no_std
583    /// embedded callers that don't run pgwire).
584    activity_provider: Option<ActivityProvider>,
585    /// v6.5.3 — audit-chain provider + verifier. Same pattern as
586    /// activity_provider: spg-server registers both at startup;
587    /// engine reads through on `SELECT * FROM spg_audit_chain` and
588    /// `SELECT * FROM spg_audit_verify`. `None` ⇒ no-data.
589    audit_chain_provider: Option<AuditChainProvider>,
590    audit_verifier: Option<AuditVerifier>,
591    /// v6.5.6 — slow-query log threshold in microseconds. When set,
592    /// every successful execute whose elapsed exceeds the threshold
593    /// gets fed to the registered slow-query log callback (so
594    /// spg-server can emit a structured log line). Default `None`
595    /// = no slow-query logging.
596    slow_query_threshold_us: Option<u64>,
597    slow_query_logger: Option<SlowQueryLogger>,
598}
599
/// v6.5.6 — callback signature for slow-query log emission. Called
/// with `(sql, elapsed_us)` once per successful execute that crosses
/// the threshold (`elapsed_us` is in microseconds, matching the
/// engine's `slow_query_threshold_us` setting).
pub type SlowQueryLogger = fn(&str, u64);
604
605/// v6.5.4 — synthesise a `CREATE TABLE` statement from catalog
606/// state. Round-trips through `Engine::execute` to recreate the
607/// same schema (sans data + indexes — indexes are emitted as a
608/// separate `CREATE INDEX` chain in `spg_database_ddl`).
609fn render_create_table(name: &str, columns: &[ColumnSchema]) -> String {
610    let mut out = alloc::format!("CREATE TABLE {name} (");
611    for (i, col) in columns.iter().enumerate() {
612        if i > 0 {
613            out.push_str(", ");
614        }
615        out.push_str(&col.name);
616        out.push(' ');
617        out.push_str(&render_data_type(col.ty));
618        if !col.nullable {
619            out.push_str(" NOT NULL");
620        }
621        if col.auto_increment {
622            out.push_str(" AUTO_INCREMENT");
623        }
624    }
625    out.push(')');
626    out
627}
628
629fn render_data_type(ty: DataType) -> String {
630    match ty {
631        DataType::SmallInt => "SMALLINT".into(),
632        DataType::Int => "INT".into(),
633        DataType::BigInt => "BIGINT".into(),
634        DataType::Float => "FLOAT".into(),
635        DataType::Text => "TEXT".into(),
636        DataType::Varchar(n) => alloc::format!("VARCHAR({n})"),
637        DataType::Char(n) => alloc::format!("CHAR({n})"),
638        DataType::Bool => "BOOL".into(),
639        DataType::Vector { dim, encoding } => match encoding {
640            spg_storage::VecEncoding::F32 => alloc::format!("VECTOR({dim})"),
641            spg_storage::VecEncoding::Sq8 => alloc::format!("VECTOR({dim}) USING SQ8"),
642            spg_storage::VecEncoding::F16 => alloc::format!("VECTOR({dim}) USING HALF"),
643        },
644        DataType::Numeric { precision, scale } => {
645            alloc::format!("NUMERIC({precision},{scale})")
646        }
647        DataType::Date => "DATE".into(),
648        DataType::Timestamp => "TIMESTAMP".into(),
649        DataType::Interval => "INTERVAL".into(),
650        DataType::Json => "JSON".into(),
651        DataType::Jsonb => "JSONB".into(),
652        DataType::Timestamptz => "TIMESTAMPTZ".into(),
653    }
654}
655
/// v6.5.2 — one row of `spg_stat_activity`. Engine-public so
/// spg-server can construct rows without re-exporting internal
/// dispatch types.
///
/// NOTE(review): the field meanings below are inferred from the field
/// names only — confirm against the spg-server code that fills them in.
#[derive(Debug, Clone)]
pub struct ActivityRow {
    // Presumably the identifier of the connection / backend.
    pub pid: u32,
    // Presumably the authenticated user name of the connection.
    pub user: String,
    // Presumably a start timestamp in microseconds (`_us` suffix).
    pub started_at_us: i64,
    // Presumably the SQL text the connection is currently running.
    pub current_sql: String,
    // Presumably what the connection is currently waiting on.
    pub wait_event: String,
    // Presumably elapsed time in microseconds (`_us` suffix).
    pub elapsed_us: i64,
    // Presumably whether the connection has an open transaction.
    pub in_transaction: bool,
}

/// v6.5.2 — provider callback type. Fresh snapshot returned each
/// call; engine doesn't cache the slice.
pub type ActivityProvider = fn() -> Vec<ActivityRow>;
673
/// v6.5.3 — one row of `spg_audit_chain`. Engine-public so
/// spg-server can construct rows directly from `AuditEntry`.
///
/// NOTE(review): the field meanings below are inferred from the field
/// names only — confirm against spg-server's `AuditEntry`.
#[derive(Debug, Clone)]
pub struct AuditRow {
    // Presumably the entry's sequence number within the chain.
    pub seq: i64,
    // Presumably the entry's timestamp in milliseconds (`_ms` suffix).
    pub ts_ms: i64,
    // Presumably the hex-encoded hash of the preceding entry.
    pub prev_hash_hex: String,
    // Presumably the hex-encoded hash of this entry.
    pub entry_hash_hex: String,
    // Presumably the SQL text that was audited.
    pub sql: String,
}

/// v6.5.3 — chain-table provider. spg-server registers a fn pointer
/// that snapshots the audit log into `AuditRow`s.
pub type AuditChainProvider = fn() -> Vec<AuditRow>;
/// v6.5.3 — chain verifier. spg-server registers a fn pointer that
/// verifies the audit log and returns `(verified_count, broken_at_seq)`;
/// `broken_at_seq` is `-1` on a clean chain.
pub type AuditVerifier = fn() -> (i64, i64);
691
692impl Engine {
693    pub fn new() -> Self {
694        Self {
695            catalog: Catalog::new(),
696            tx_catalogs: BTreeMap::new(),
697            current_tx: None,
698            next_tx_id: 1,
699            clock: None,
700            salt_fn: None,
701            max_query_rows: None,
702            users: UserStore::new(),
703            publications: publications::Publications::new(),
704            subscriptions: subscriptions::Subscriptions::new(),
705            statistics: statistics::Statistics::new(),
706            plan_cache: plan_cache::PlanCache::new(),
707            query_stats: query_stats::QueryStats::new(),
708            activity_provider: None,
709            audit_chain_provider: None,
710            audit_verifier: None,
711            slow_query_threshold_us: None,
712            slow_query_logger: None,
713        }
714    }
715
    /// Construct an engine restored from a previously-snapshotted catalog
    /// (see `snapshot()`).
    ///
    /// Only the catalog is taken from the caller. Everything else starts
    /// from scratch: no open transactions, `next_tx_id` at 1 (slot 0 is
    /// reserved for `IMPLICIT_TX`), no clock / salt source / row cap, empty
    /// user, publication, subscription and statistics stores, a fresh plan
    /// cache and query-stats table, and no host callbacks registered.
    pub fn restore(catalog: Catalog) -> Self {
        Self {
            catalog,
            tx_catalogs: BTreeMap::new(),
            current_tx: None,
            next_tx_id: 1,
            clock: None,
            salt_fn: None,
            max_query_rows: None,
            users: UserStore::new(),
            publications: publications::Publications::new(),
            subscriptions: subscriptions::Subscriptions::new(),
            statistics: statistics::Statistics::new(),
            plan_cache: plan_cache::PlanCache::new(),
            query_stats: query_stats::QueryStats::new(),
            activity_provider: None,
            audit_chain_provider: None,
            audit_verifier: None,
            slow_query_threshold_us: None,
            slow_query_logger: None,
        }
    }
740
741    /// Restore an engine + user table from a v4.1 envelope produced
742    /// by `snapshot_with_users()`. Falls back to plain catalog-only
743    /// restore if the envelope magic isn't present (so v3.x snapshot
744    /// files still load). v6.1.2 adds the optional publications
745    /// trailer (envelope v3); a v1/v2 envelope deserialises to an
746    /// empty publication table.
747    pub fn restore_envelope(buf: &[u8]) -> Result<Self, EngineError> {
748        match split_envelope(buf) {
749            EnvelopeParse::Pair {
750                catalog: catalog_bytes,
751                users: user_bytes,
752                publications: pub_bytes,
753                subscriptions: sub_bytes,
754                statistics: stats_bytes,
755            } => {
756                let catalog = Catalog::deserialize(catalog_bytes).map_err(EngineError::Storage)?;
757                let users = users::deserialize_users(user_bytes)
758                    .map_err(|e| EngineError::Unsupported(alloc::format!("users restore: {e}")))?;
759                let publications = match pub_bytes {
760                    Some(b) => publications::Publications::deserialize(b).map_err(|e| {
761                        EngineError::Unsupported(alloc::format!("publications restore: {e:?}"))
762                    })?,
763                    None => publications::Publications::new(),
764                };
765                let subscriptions = match sub_bytes {
766                    Some(b) => subscriptions::Subscriptions::deserialize(b).map_err(|e| {
767                        EngineError::Unsupported(alloc::format!("subscriptions restore: {e:?}"))
768                    })?,
769                    None => subscriptions::Subscriptions::new(),
770                };
771                let statistics = match stats_bytes {
772                    Some(b) => statistics::Statistics::deserialize(b).map_err(|e| {
773                        EngineError::Unsupported(alloc::format!("statistics restore: {e:?}"))
774                    })?,
775                    None => statistics::Statistics::new(),
776                };
777                Ok(Self {
778                    catalog,
779                    tx_catalogs: BTreeMap::new(),
780                    current_tx: None,
781                    next_tx_id: 1,
782                    clock: None,
783                    salt_fn: None,
784                    max_query_rows: None,
785                    users,
786                    publications,
787                    subscriptions,
788                    statistics,
789                    plan_cache: plan_cache::PlanCache::new(),
790                    query_stats: query_stats::QueryStats::new(),
791                    activity_provider: None,
792                    audit_chain_provider: None,
793                    audit_verifier: None,
794                    slow_query_threshold_us: None,
795                    slow_query_logger: None,
796                })
797            }
798            EnvelopeParse::CrcMismatch { expected, computed } => {
799                Err(EngineError::Storage(StorageError::Corrupt(alloc::format!(
800                    "snapshot envelope CRC32 mismatch (expected={expected:#010x}, computed={computed:#010x})"
801                ))))
802            }
803            EnvelopeParse::Bare => {
804                let catalog = Catalog::deserialize(buf).map_err(EngineError::Storage)?;
805                Ok(Self::restore(catalog))
806            }
807        }
808    }
809
810    pub const fn users(&self) -> &UserStore {
811        &self.users
812    }
813
814    /// `salt` is supplied by the caller (the host has a random
815    /// source; the engine is `no_std`). Caller should pass a fresh
816    /// 16-byte random value per user.
817    pub fn create_user(
818        &mut self,
819        name: &str,
820        password: &str,
821        role: Role,
822        salt: [u8; 16],
823    ) -> Result<(), UserError> {
824        self.users.create(name, password, role, salt)?;
825        // v4.8: also derive SCRAM-SHA-256 secrets so PG-wire SASL
826        // auth can verify without re-running PBKDF2 per attempt.
827        // Uses a fresh salt from the host RNG (falls back to a
828        // deterministic per-username salt when no RNG is wired, same
829        // as the legacy hash path).
830        let scram_salt = self.salt_fn.map_or_else(
831            || {
832                let mut s = [0u8; users::SCRAM_SALT_LEN];
833                let digest = spg_crypto::hash(name.as_bytes());
834                // Use bytes 16..32 of BLAKE3 so we don't reuse the
835                // exact same fallback salt as the BLAKE3 hash path.
836                s.copy_from_slice(&digest[16..32]);
837                s
838            },
839            |f| f(),
840        );
841        self.users
842            .enable_scram(name, password, scram_salt, users::SCRAM_DEFAULT_ITERS)?;
843        Ok(())
844    }
845
846    pub fn drop_user(&mut self, name: &str) -> Result<(), UserError> {
847        self.users.drop(name)
848    }
849
850    pub fn verify_user(&self, name: &str, password: &str) -> Option<Role> {
851        self.users.verify(name, password)
852    }
853
854    /// Builder: attach a wall clock so `NOW()` / `CURRENT_TIMESTAMP` /
855    /// `CURRENT_DATE` evaluate to a real value instead of erroring out.
856    #[must_use]
857    pub const fn with_clock(mut self, clock: ClockFn) -> Self {
858        self.clock = Some(clock);
859        self
860    }
861
862    /// Builder: attach an OS-backed RNG for per-user password salts.
863    /// The host (`spg-server`) typically wires this to `/dev/urandom`.
864    #[must_use]
865    pub const fn with_salt_fn(mut self, f: SaltFn) -> Self {
866        self.salt_fn = Some(f);
867        self
868    }
869
870    /// Builder: cap the number of rows a single SELECT may return.
871    /// Exceeding the cap raises `EngineError::RowLimitExceeded` —
872    /// the bound is checked inside the executor so a runaway
873    /// catalog scan can't allocate millions of rows before the
874    /// server gets a chance to reject the result.
875    #[must_use]
876    pub const fn with_max_query_rows(mut self, n: usize) -> Self {
877        self.max_query_rows = Some(n);
878        self
879    }
880
881    /// The *committed* catalog. Note: during a transaction this returns the
882    /// pre-TX state — `SELECT` inside a TX goes through `execute()` and reads
883    /// the shadow. Tests that inspect outside-TX state should use this.
884    pub const fn catalog(&self) -> &Catalog {
885        &self.catalog
886    }
887
888    /// Serialize the *committed* catalog to bytes. v0.6 was full-snapshot; v0.9
889    /// adds the rule that an open TX's shadow is never snapshotted — only the
890    /// post-COMMIT state is persisted. v4.1 wraps the catalog in an envelope
891    /// when there are users to persist; an empty user table snapshots as the
892    /// bare catalog format (backwards-compat with v3.x readers). v6.1.2
893    /// adds publications to the envelope condition: either non-empty
894    /// users OR non-empty publications now triggers the envelope path.
895    pub fn snapshot(&self) -> Vec<u8> {
896        if self.users.is_empty()
897            && self.publications.is_empty()
898            && self.subscriptions.is_empty()
899            && self.statistics.is_empty()
900        {
901            self.catalog.serialize()
902        } else {
903            build_envelope(
904                &self.catalog.serialize(),
905                &users::serialize_users(&self.users),
906                &self.publications.serialize(),
907                &self.subscriptions.serialize(),
908                &self.statistics.serialize(),
909            )
910        }
911    }
912
913    /// True when at least one TX slot is in flight. v4.41.1 runtime
914    /// invariant: at most one slot active at a time (dispatch holds
915    /// `engine.write()` across the entire wrap). v4.42 will let this
916    /// return true with multiple slots concurrently.
917    pub fn in_transaction(&self) -> bool {
918        !self.tx_catalogs.is_empty()
919    }
920
921    /// v4.41.1 allocate a fresh TX handle. Used by spg-server dispatch
922    /// to scope each implicit-wrap BEGIN..stmt..COMMIT to its own slot
923    /// in `tx_catalogs`. v4.42 — the commit-barrier leader allocates
924    /// one of these per task in its group, runs `BEGIN`+sql+`COMMIT`
925    /// sequentially under a single `engine.write()` so each task's
926    /// mutations accumulate into shared state, then either keeps the
927    /// accumulated state (fsync OK) or restores the pre-image via
928    /// `replace_catalog` (fsync err).
929    pub fn alloc_tx_id(&mut self) -> TxId {
930        let id = TxId(self.next_tx_id);
931        self.next_tx_id = self.next_tx_id.saturating_add(1);
932        id
933    }
934
935    /// v4.42 — atomically replace the live catalog. Used by the
936    /// commit-barrier leader to roll back a group whose batched
937    /// fsync failed: the leader snapshots `engine.catalog().clone()`
938    /// (O(1) Arc bump after the v4.39/v4.40 persistent migration)
939    /// at group start, sequentially applies each task's BEGIN+sql+
940    /// COMMIT under the same write lock to accumulate mutations
941    /// into shared state, batches the WAL bytes, fsyncs once, and
942    /// on failure calls this with the pre-image to undo every
943    /// task in the group at once.
944    ///
945    /// **Does NOT touch `tx_catalogs` / `current_tx`.** Any
946    /// explicit-TX slot from a concurrent client (created via the
947    /// legacy `IMPLICIT_TX`-less dispatch path or via the future
948    /// MVCC-readers v5+ work) has its own snapshot baked into the
949    /// slot — restoring `self.catalog` to the pre-image leaves
950    /// those slots untouched, exactly as they were when the leader
951    /// took the lock. The leader's own implicit-TX slots are all
952    /// already discarded (`exec_commit` removed them as each
953    /// task's COMMIT ran) by the time this is reached.
954    pub fn replace_catalog(&mut self, catalog: Catalog) {
955        self.catalog = catalog;
956    }
957
958    /// v6.7.0 — public shim around `Catalog::freeze_oldest_to_cold`
959    /// so tests + the spg-server freezer can drive a freeze without
960    /// reaching into the private `active_catalog_mut`. v6.7.4
961    /// parallel freezer will build on this surface.
962    ///
963    /// Marks the table's cached `cold_row_count` stale because the
964    /// freeze added cold locators that ANALYZE hasn't yet refreshed.
965    pub fn freeze_oldest_to_cold(
966        &mut self,
967        table_name: &str,
968        index_name: &str,
969        max_rows: usize,
970    ) -> Result<spg_storage::FreezeReport, EngineError> {
971        let report = self
972            .active_catalog_mut()
973            .freeze_oldest_to_cold(table_name, index_name, max_rows)
974            .map_err(EngineError::Storage)?;
975        if let Some(t) = self.active_catalog_mut().get_mut(table_name) {
976            t.mark_cold_row_count_stale();
977        }
978        Ok(report)
979    }
980
981    /// v6.7.5 — public shim used by the spg-server follower's
982    /// segment-forwarding receiver. Registers a cold-tier segment
983    /// at a specific id (the master's id, as transmitted on the
984    /// wire) so the follower's BTree-Cold locators stay byte-
985    /// identical with the master's. Wraps
986    /// `Catalog::load_segment_bytes_at` under the standard
987    /// clone-mutate-replace pattern.
988    ///
989    /// Returns `Ok(())` on success **and** on the "slot already
990    /// occupied" case — a follower mid-reconnect may receive a
991    /// segment chunk for a segment_id it already has on disk
992    /// (forwarded last session); the caller should treat that
993    /// path as a no-op rather than a fatal error.
994    pub fn receive_cold_segment(
995        &mut self,
996        segment_id: u32,
997        bytes: Vec<u8>,
998    ) -> Result<(), EngineError> {
999        let mut new_cat = self.catalog.clone();
1000        match new_cat.load_segment_bytes_at(segment_id, bytes) {
1001            Ok(()) => {
1002                self.replace_catalog(new_cat);
1003                Ok(())
1004            }
1005            Err(StorageError::Corrupt(msg)) if msg.contains("already occupied") => Ok(()),
1006            Err(e) => Err(EngineError::Storage(e)),
1007        }
1008    }
1009
1010    /// v6.7.3 — public shim around `Catalog::compact_cold_segments`
1011    /// driving every BTree index on every user table. Returns one
1012    /// `(table, index, report)` triple for each merge that
1013    /// actually happened (no-op (table, index) pairs are filtered
1014    /// out so callers can size persist-side work to the live
1015    /// merges). Caller is responsible for persisting each
1016    /// `report.merged_segment_bytes` and updating the on-disk
1017    /// segment registry; engine layer is no_std and never
1018    /// touches disk.
1019    ///
1020    /// Marks every touched table's cached `cold_row_count` stale
1021    /// — compaction GC'd some shadowed rows, so the count must be
1022    /// re-derived on the next ANALYZE.
1023    pub fn compact_cold_segments_with_target(
1024        &mut self,
1025        target_segment_bytes: u64,
1026    ) -> Result<Vec<(String, String, CompactReport)>, EngineError> {
1027        let table_names = self.active_catalog().table_names();
1028        let mut reports: Vec<(String, String, CompactReport)> = Vec::new();
1029        for tname in table_names {
1030            if is_internal_table_name(&tname) {
1031                continue;
1032            }
1033            let idx_names: Vec<String> = {
1034                let Some(t) = self.active_catalog().get(&tname) else {
1035                    continue;
1036                };
1037                t.indices()
1038                    .iter()
1039                    .filter(|i| matches!(i.kind, IndexKind::BTree(_)))
1040                    .map(|i| i.name.clone())
1041                    .collect()
1042            };
1043            for iname in idx_names {
1044                let report = self
1045                    .active_catalog_mut()
1046                    .compact_cold_segments(&tname, &iname, target_segment_bytes)
1047                    .map_err(EngineError::Storage)?;
1048                if report.merged_segment_id.is_some() {
1049                    if let Some(t) = self.active_catalog_mut().get_mut(&tname) {
1050                        t.mark_cold_row_count_stale();
1051                    }
1052                    reports.push((tname.clone(), iname, report));
1053                }
1054            }
1055        }
1056        Ok(reports)
1057    }
1058
1059    fn active_catalog(&self) -> &Catalog {
1060        match self.current_tx {
1061            Some(t) => self
1062                .tx_catalogs
1063                .get(&t)
1064                .map_or(&self.catalog, |s| &s.catalog),
1065            None => &self.catalog,
1066        }
1067    }
1068
1069    fn active_catalog_mut(&mut self) -> &mut Catalog {
1070        let tx = self.current_tx;
1071        match tx {
1072            Some(t) => match self.tx_catalogs.get_mut(&t) {
1073                Some(s) => &mut s.catalog,
1074                None => &mut self.catalog,
1075            },
1076            None => &mut self.catalog,
1077        }
1078    }
1079
1080    /// Read-only execute path. Succeeds for `SELECT` / `SHOW TABLES`
1081    /// / `SHOW COLUMNS`; returns `EngineError::WriteRequired` for
1082    /// every other statement, so the caller can fall through to the
1083    /// `&mut self` `execute` path under a write lock. Engine state is
1084    /// not mutated even on the success path (`rewrite_clock_calls`
1085    /// and `resolve_order_by_position` both mutate the locally-owned
1086    /// AST, not `self`).
1087    ///
1088    /// **v4.0 concurrency**: this is the entry point the server takes
1089    /// under an `RwLock::read()` so multiple `SELECT` clients run in
1090    /// parallel without serialising on a single mutex.
1091    pub fn execute_readonly(&self, sql: &str) -> Result<QueryResult, EngineError> {
1092        self.execute_readonly_with_cancel(sql, CancelToken::none())
1093    }
1094
1095    /// v4.5 — read path with cooperative cancellation. Token's
1096    /// `is_cancelled` is checked at the start (so a watchdog that
1097    /// already fired returns Cancelled immediately) and at row-loop
1098    /// checkpoints inside `exec_select`. SHOW paths are O(small) and
1099    /// don't bother checking.
1100    pub fn execute_readonly_with_cancel(
1101        &self,
1102        sql: &str,
1103        cancel: CancelToken<'_>,
1104    ) -> Result<QueryResult, EngineError> {
1105        cancel.check()?;
1106        let mut stmt = parser::parse_statement(sql)?;
1107        let now_micros = self.clock.map(|f| f());
1108        rewrite_clock_calls(&mut stmt, now_micros);
1109        if let Statement::Select(s) = &mut stmt {
1110            resolve_order_by_position(s);
1111            // v6.2.3 — cost-based JOIN reorder (read path).
1112            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1113        }
1114        let result = match stmt {
1115            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1116            Statement::ShowTables => Ok(self.exec_show_tables()),
1117            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1118            Statement::ShowUsers => Ok(self.exec_show_users()),
1119            Statement::ShowPublications => Ok(self.exec_show_publications()),
1120            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1121            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1122                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1123            )),
1124            Statement::Explain(e) => self.exec_explain(&e, cancel),
1125            _ => Err(EngineError::WriteRequired),
1126        };
1127        self.enforce_row_limit(result)
1128    }
1129
1130    /// v4.2: cap result-set size. Applied after the executor
1131    /// materialises rows but before they leave the engine — wrapping
1132    /// every Rows-returning exec_* function would scatter the check.
1133    fn enforce_row_limit(
1134        &self,
1135        result: Result<QueryResult, EngineError>,
1136    ) -> Result<QueryResult, EngineError> {
1137        if let (Ok(QueryResult::Rows { rows, .. }), Some(cap)) = (&result, self.max_query_rows)
1138            && rows.len() > cap
1139        {
1140            return Err(EngineError::RowLimitExceeded(cap));
1141        }
1142        result
1143    }
1144
1145    pub fn execute(&mut self, sql: &str) -> Result<QueryResult, EngineError> {
1146        self.execute_in_with_cancel(sql, IMPLICIT_TX, CancelToken::none())
1147    }
1148
1149    /// v4.5 — write path with cooperative cancellation. Same dispatch
1150    /// as `execute_in_with_cancel(sql, IMPLICIT_TX, cancel)`. Kept as
1151    /// a separate entry point for backward-compat with the v4.5
1152    /// public API.
1153    pub fn execute_with_cancel(
1154        &mut self,
1155        sql: &str,
1156        cancel: CancelToken<'_>,
1157    ) -> Result<QueryResult, EngineError> {
1158        self.execute_in_with_cancel(sql, IMPLICIT_TX, cancel)
1159    }
1160
1161    /// v4.41.1 multi-slot write entry. Routes `sql` through the TX
1162    /// slot identified by `tx_id` so spg-server dispatch can scope
1163    /// each implicit-wrap BEGIN..stmt..COMMIT to its own slot in
1164    /// `tx_catalogs`. `IMPLICIT_TX` is the legacy single-slot path
1165    /// every other caller (engine self-tests, replay, spg-embedded)
1166    /// implicitly takes via `execute()` / `execute_with_cancel()`.
1167    pub fn execute_in(&mut self, sql: &str, tx_id: TxId) -> Result<QueryResult, EngineError> {
1168        self.execute_in_with_cancel(sql, tx_id, CancelToken::none())
1169    }
1170
1171    /// v4.41.1 write path with cooperative cancellation + explicit TX
1172    /// scope. Sets `self.current_tx` for the duration of the call so
1173    /// every `exec_*` helper transparently sees its TX's shadow
1174    /// catalog and savepoint stack; restores on exit so the field is
1175    /// only valid mid-call (no leakage across calls).
1176    pub fn execute_in_with_cancel(
1177        &mut self,
1178        sql: &str,
1179        tx_id: TxId,
1180        cancel: CancelToken<'_>,
1181    ) -> Result<QueryResult, EngineError> {
1182        let saved = self.current_tx;
1183        self.current_tx = Some(tx_id);
1184        let result = self.execute_inner_with_cancel(sql, cancel);
1185        self.current_tx = saved;
1186        result
1187    }
1188
1189    /// v6.1.1 — parse and pre-process a SQL string ONCE so the
1190    /// resulting [`Statement`] can be cached and re-executed via
1191    /// [`Engine::execute_prepared`]. Returns the same `Statement`
1192    /// the simple-query path would synthesise internally (clock
1193    /// rewrites + ORDER BY position-ref resolution applied at
1194    /// prepare time, since both are session-independent). The
1195    /// `$N` placeholders in the SQL stay as `Expr::Placeholder(n)`
1196    /// nodes; they're resolved to concrete values per-call by
1197    /// `execute_prepared`'s substitution walk.
1198    ///
1199    /// Pgwire's `Parse` (P) message lands here.
1200    pub fn prepare(&self, sql: &str) -> Result<Statement, ParseError> {
1201        let mut stmt = parser::parse_statement(sql)?;
1202        let now_micros = self.clock.map(|f| f());
1203        rewrite_clock_calls(&mut stmt, now_micros);
1204        if let Statement::Select(s) = &mut stmt {
1205            // v6.4.1 — expand `GROUP BY ALL` to every non-aggregate
1206            // SELECT-list item BEFORE position / alias resolution so
1207            // downstream passes see the explicit list.
1208            expand_group_by_all(s);
1209            resolve_order_by_position(s);
1210            // v6.2.3 — cost-based JOIN reorder. No-op for
1211            // single-table FROMs or any non-INNER join shape.
1212            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1213        }
1214        Ok(stmt)
1215    }
1216
1217    /// v6.3.0 — cached prepare. Returns a cloned `Statement` from
1218    /// the plan cache on hit, runs the full `prepare()` path on miss
1219    /// and inserts the resulting plan before returning. Skipping the
1220    /// parse + JOIN-reorder pipeline on hit is the dominant win for
1221    /// JDBC / sqlx / pgx clients that reuse the same SQL string.
1222    ///
1223    /// Returns a cloned `Statement` (not a borrow) because the
1224    /// pgwire layer owns its `PreparedStmt` map per-session and the
1225    /// engine-level cache must stay available for other sessions.
1226    /// Clone cost on a 5-table JOIN AST is well under the parse cost
1227    /// it replaces.
1228    pub fn prepare_cached(&mut self, sql: &str) -> Result<Statement, ParseError> {
1229        // v6.3.1 — version-aware lookup. If the cached plan was
1230        // prepared before the most recent ANALYZE, evict and replan.
1231        let current_version = self.statistics.version();
1232        if let Some(plan) = self.plan_cache.get(sql) {
1233            if plan.statistics_version == current_version {
1234                return Ok(plan.stmt.clone());
1235            }
1236            // Stale entry — fall through to evict + re-prepare.
1237        }
1238        self.plan_cache.evict(sql);
1239        let stmt = self.prepare(sql)?;
1240        let source_tables = plan_cache::collect_source_tables(&stmt);
1241        let plan = plan_cache::PreparedPlan {
1242            stmt: stmt.clone(),
1243            statistics_version: current_version,
1244            source_tables,
1245            describe_columns: alloc::vec::Vec::new(),
1246        };
1247        self.plan_cache.insert(String::from(sql), plan);
1248        Ok(stmt)
1249    }
1250
1251    /// v6.3.0 — read-only accessor for tests and v6.3.1 invalidation.
1252    pub fn plan_cache(&self) -> &plan_cache::PlanCache {
1253        &self.plan_cache
1254    }
1255
1256    /// v6.3.0 — mutable accessor for v6.3.1 invalidation hooks.
1257    pub fn plan_cache_mut(&mut self) -> &mut plan_cache::PlanCache {
1258        &mut self.plan_cache
1259    }
1260
1261    /// v6.3.3 — Describe a prepared `Statement` without executing.
1262    /// Returns `(parameter_oids, output_columns)`. Empty
1263    /// `output_columns` means the statement has no row-producing
1264    /// shape we could resolve here (JOIN, subquery, non-SELECT, …)
1265    /// — pgwire layer maps that to a `NoData` reply.
1266    pub fn describe_prepared(
1267        &self,
1268        stmt: &Statement,
1269    ) -> (Vec<u32>, Vec<ColumnSchema>) {
1270        describe::describe_prepared(stmt, self.active_catalog())
1271    }
1272
1273    /// v6.1.1 — execute a [`Statement`] previously returned by
1274    /// [`Engine::prepare`], substituting `Expr::Placeholder(n)`
1275    /// nodes for the corresponding [`Value`] in `params` (1-based
1276    /// per PG: `$1` → `params[0]`). Bind-time string parameters
1277    /// are decoded into typed `Value`s by the pgwire layer before
1278    /// this call so the resulting AST hits the same execution
1279    /// path as a simple query — no SQL re-parse.
1280    ///
1281    /// Pgwire's `Execute` (E) message after a `Bind` (B) lands here.
1282    pub fn execute_prepared(
1283        &mut self,
1284        mut stmt: Statement,
1285        params: &[Value],
1286    ) -> Result<QueryResult, EngineError> {
1287        substitute_placeholders(&mut stmt, params)?;
1288        self.execute_stmt_with_cancel(stmt, CancelToken::none())
1289    }
1290
1291    fn execute_inner_with_cancel(
1292        &mut self,
1293        sql: &str,
1294        cancel: CancelToken<'_>,
1295    ) -> Result<QueryResult, EngineError> {
1296        cancel.check()?;
1297        let stmt = self.prepare(sql)?;
1298        // v6.5.1 — wrap the executor with a wall-clock window so we
1299        // can record into spg_stat_query. Skip when the engine has
1300        // no clock attached (no_std embedded callers).
1301        let start_us = self.clock.map(|f| f());
1302        let result = self.execute_stmt_with_cancel(stmt, cancel);
1303        if let (Some(t0), Ok(_)) = (start_us, &result) {
1304            let now = self.clock.map_or(t0, |f| f());
1305            let elapsed = now.saturating_sub(t0).max(0) as u64;
1306            self.query_stats.record(sql, elapsed, now as u64);
1307            // v6.5.6 — slow-query log: fire callback when elapsed
1308            // exceeds the configured floor.
1309            if let (Some(threshold), Some(logger)) =
1310                (self.slow_query_threshold_us, self.slow_query_logger)
1311                && elapsed >= threshold
1312            {
1313                logger(sql, elapsed);
1314            }
1315        }
1316        result
1317    }
1318
1319    fn execute_stmt_with_cancel(
1320        &mut self,
1321        stmt: Statement,
1322        cancel: CancelToken<'_>,
1323    ) -> Result<QueryResult, EngineError> {
1324        cancel.check()?;
1325        let result = match stmt {
1326            Statement::CreateTable(s) => self.exec_create_table(s),
1327            // v7.9.15 — CREATE EXTENSION is a no-op on SPG. Returns
1328            // CommandOk with affected=0; modified_catalog=false so
1329            // the WAL doesn't grow a useless entry. mailrs F3.
1330            Statement::CreateExtension(_) => Ok(QueryResult::CommandOk {
1331                affected: 0,
1332                modified_catalog: false,
1333            }),
1334            // v7.9.27 — DO $$ ... $$ is also a no-op (SPG has no
1335            // PL/pgSQL). mailrs H1 + pg_dump compat.
1336            Statement::DoBlock => Ok(QueryResult::CommandOk {
1337                affected: 0,
1338                modified_catalog: false,
1339            }),
1340            Statement::CreateIndex(s) => self.exec_create_index(s),
1341            Statement::Insert(s) => self.exec_insert(s),
1342            Statement::Update(s) => self.exec_update_cancel(&s, cancel),
1343            Statement::Delete(s) => self.exec_delete_cancel(&s, cancel),
1344            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1345            Statement::Begin => self.exec_begin(),
1346            Statement::Commit => self.exec_commit(),
1347            Statement::Rollback => self.exec_rollback(),
1348            Statement::Savepoint(name) => self.exec_savepoint(name),
1349            Statement::RollbackToSavepoint(name) => self.exec_rollback_to_savepoint(&name),
1350            Statement::ReleaseSavepoint(name) => self.exec_release_savepoint(&name),
1351            Statement::ShowTables => Ok(self.exec_show_tables()),
1352            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1353            Statement::ShowUsers => Ok(self.exec_show_users()),
1354            Statement::ShowPublications => Ok(self.exec_show_publications()),
1355            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1356            Statement::CreateUser(s) => self.exec_create_user(&s),
1357            Statement::DropUser(name) => self.exec_drop_user(&name),
1358            Statement::Explain(e) => self.exec_explain(&e, cancel),
1359            Statement::AlterIndex(s) => self.exec_alter_index(s),
1360            Statement::AlterTable(s) => self.exec_alter_table(s),
1361            Statement::CreatePublication(s) => self.exec_create_publication(s),
1362            Statement::DropPublication(name) => self.exec_drop_publication(&name),
1363            Statement::CreateSubscription(s) => self.exec_create_subscription(s),
1364            Statement::DropSubscription(name) => self.exec_drop_subscription(&name),
1365            // v6.1.7 — WAIT FOR WAL POSITION needs `lag_state`,
1366            // which lives in spg-server's ServerState. The engine
1367            // surfaces a clear error; the server-layer dispatch
1368            // intercepts the SQL before it reaches the engine on
1369            // a server build, so this arm only fires for
1370            // engine-only callers (spg-embedded, lib tests).
1371            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1372                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1373            )),
1374            // v6.2.0 — ANALYZE recomputes per-column histograms.
1375            Statement::Analyze(target) => self.exec_analyze(target.as_deref()),
1376            // v6.7.3 — COMPACT COLD SEGMENTS.
1377            Statement::CompactColdSegments => self.exec_compact_cold_segments(),
1378        };
1379        self.enforce_row_limit(result)
1380    }
1381
1382    /// v6.1.2 — `CREATE PUBLICATION` runtime path. Duplicate names
1383    /// surface as `EngineError::Unsupported` so the existing PG-wire
1384    /// error mapping stays uniform; the message carries the name so
1385    /// operators can grep replication-log noise. Inside-transaction
1386    /// invocation is rejected (matches `CREATE USER` / `DROP USER`
1387    /// stance) — replication-catalog mutation is a connection-level
1388    /// administrative op, not a transactional one.
1389    fn exec_create_publication(
1390        &mut self,
1391        s: CreatePublicationStatement,
1392    ) -> Result<QueryResult, EngineError> {
1393        // v6.1.4 — the v6.1.2 "no DDL inside a transaction" guard
1394        // was over-cautious: it also blocked the auto-commit wrap
1395        // path (which begins an internal TX around every WAL-
1396        // logged statement). PG itself allows CREATE PUBLICATION
1397        // inside a transaction (it rolls back with the TX).
1398        self.publications
1399            .create(s.name, s.scope)
1400            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE PUBLICATION: {e:?}")))?;
1401        Ok(QueryResult::CommandOk {
1402            affected: 1,
1403            modified_catalog: true,
1404        })
1405    }
1406
1407    /// v6.1.2 — `DROP PUBLICATION` runtime path. PG-compatible silent
1408    /// no-op when the publication doesn't exist (returns `affected=0`
1409    /// in that case so the wire-level command tag distinguishes
1410    /// "dropped" from "no-op", though both succeed).
1411    fn exec_drop_publication(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1412        let removed = self.publications.drop(name);
1413        Ok(QueryResult::CommandOk {
1414            affected: usize::from(removed),
1415            modified_catalog: removed,
1416        })
1417    }
1418
1419    /// v6.1.2 — read access to the publication catalog. Used by
1420    /// the v6.1.5 publisher-side WAL filter, by `SHOW PUBLICATIONS`
1421    /// (v6.1.3+), and by e2e tests that need to assert state without
1422    /// going through the wire.
1423    pub const fn publications(&self) -> &publications::Publications {
1424        &self.publications
1425    }
1426
1427    /// v6.1.4 — `CREATE SUBSCRIPTION` runtime path. Defaults
1428    /// `enabled = true` and `last_received_pos = 0` for a freshly-
1429    /// created subscription. The actual worker thread is spawned
1430    /// by spg-server once the engine returns success.
1431    fn exec_create_subscription(
1432        &mut self,
1433        s: CreateSubscriptionStatement,
1434    ) -> Result<QueryResult, EngineError> {
1435        // See exec_create_publication — the in_transaction gate
1436        // was over-cautious; the auto-commit wrap path holds an
1437        // internal TX that this check was incorrectly blocking.
1438        let sub = subscriptions::Subscription {
1439            conn_str: s.conn_str,
1440            publications: s.publications,
1441            enabled: true,
1442            last_received_pos: 0,
1443        };
1444        self.subscriptions
1445            .create(s.name, sub)
1446            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE SUBSCRIPTION: {e:?}")))?;
1447        Ok(QueryResult::CommandOk {
1448            affected: 1,
1449            modified_catalog: true,
1450        })
1451    }
1452
1453    /// v6.1.4 — `DROP SUBSCRIPTION`. Silent no-op when the name
1454    /// doesn't exist (PG-compatible). The associated worker is
1455    /// torn down by spg-server when it observes the catalog
1456    /// change at the next snapshot or via the engine's
1457    /// subscriptions accessor (the worker polls the catalog on
1458    /// reconnect; v6.1.5's filter-side will tighten this to an
1459    /// explicit signal).
1460    fn exec_drop_subscription(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1461        let removed = self.subscriptions.drop(name);
1462        Ok(QueryResult::CommandOk {
1463            affected: usize::from(removed),
1464            modified_catalog: removed,
1465        })
1466    }
1467
1468    /// v6.1.4 — read access to the subscription catalog. Used by
1469    /// the subscription worker (read its own row to find its
1470    /// publications + last applied position), by SHOW SUBSCRIPTIONS,
1471    /// and by e2e tests asserting state directly.
1472    pub const fn subscriptions(&self) -> &subscriptions::Subscriptions {
1473        &self.subscriptions
1474    }
1475
1476    /// v6.1.4 — write access to `last_received_pos`. Worker
1477    /// calls this after each apply batch (under the engine's
1478    /// write-lock). Returns `false` when the subscription was
1479    /// dropped between when the worker received the record and
1480    /// when this call landed.
1481    pub fn subscription_advance(&mut self, name: &str, pos: u64) -> bool {
1482        self.subscriptions.update_last_received_pos(name, pos)
1483    }
1484
1485    /// v6.1.4 — `SHOW SUBSCRIPTIONS` row materialisation. Returns
1486    /// `(name, conn_str, publications, enabled, last_received_pos)`
1487    /// ordered by subscription name. The `publications` column is
1488    /// the comma-joined list ("p1, p2") for ergonomic SHOW output;
1489    /// callers wanting structured access read `Engine::subscriptions`.
1490    fn exec_show_subscriptions(&self) -> QueryResult {
1491        let columns = alloc::vec![
1492            ColumnSchema::new("name", DataType::Text, false),
1493            ColumnSchema::new("conn_str", DataType::Text, false),
1494            ColumnSchema::new("publications", DataType::Text, false),
1495            ColumnSchema::new("enabled", DataType::Bool, false),
1496            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1497        ];
1498        let rows: Vec<Row> = self
1499            .subscriptions
1500            .iter()
1501            .map(|(name, sub)| {
1502                Row::new(alloc::vec![
1503                    Value::Text(name.clone()),
1504                    Value::Text(sub.conn_str.clone()),
1505                    Value::Text(sub.publications.join(", ")),
1506                    Value::Bool(sub.enabled),
1507                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1508                ])
1509            })
1510            .collect();
1511        QueryResult::Rows { columns, rows }
1512    }
1513
1514    /// v6.2.0 — materialise `spg_statistic` rows. One row per
1515    /// `(table, column)` pair tracked in `Statistics`, with
1516    /// `histogram_bounds` rendered as a `[v0, v1, ...]` string —
1517    /// the same canonical form vector literals use for round-trip.
1518    fn exec_spg_statistic(&self) -> QueryResult {
1519        let columns = alloc::vec![
1520            ColumnSchema::new("table_name", DataType::Text, false),
1521            ColumnSchema::new("column_name", DataType::Text, false),
1522            ColumnSchema::new("null_frac", DataType::Float, false),
1523            ColumnSchema::new("n_distinct", DataType::BigInt, false),
1524            ColumnSchema::new("histogram_bounds", DataType::Text, false),
1525            // v6.7.0 — appended column (v6.2.0 stability contract
1526            // allows APPEND to spg_statistic, not reorder/rename).
1527            // Reports the cached per-table cold-row count; same
1528            // value across every column row of the same table.
1529            ColumnSchema::new("cold_row_count", DataType::BigInt, false),
1530        ];
1531        let rows: Vec<Row> = self
1532            .statistics
1533            .iter()
1534            .map(|((t, c), s)| {
1535                let cold = self
1536                    .catalog
1537                    .get(t)
1538                    .map_or(0, |table| table.cold_row_count());
1539                Row::new(alloc::vec![
1540                    Value::Text(t.clone()),
1541                    Value::Text(c.clone()),
1542                    Value::Float(f64::from(s.null_frac)),
1543                    Value::BigInt(i64::try_from(s.n_distinct).unwrap_or(i64::MAX)),
1544                    Value::Text(render_histogram_bounds(&s.histogram_bounds)),
1545                    Value::BigInt(i64::try_from(cold).unwrap_or(i64::MAX)),
1546                ])
1547            })
1548            .collect();
1549        QueryResult::Rows { columns, rows }
1550    }
1551
1552    /// v6.5.0 — materialise `spg_stat_replication` rows. One row
1553    /// per subscription with `(name, conn_str, publications,
1554    /// last_received_pos, enabled)`. Surface mirrors
1555    /// `SHOW SUBSCRIPTIONS` but follows the virtual-table dispatch
1556    /// shape so it composes with SELECT clauses (WHERE, projection
1557    /// onto specific columns, etc).
1558    fn exec_spg_stat_replication(&self) -> QueryResult {
1559        let columns = alloc::vec![
1560            ColumnSchema::new("name", DataType::Text, false),
1561            ColumnSchema::new("conn_str", DataType::Text, false),
1562            ColumnSchema::new("publications", DataType::Text, false),
1563            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1564            ColumnSchema::new("enabled", DataType::Bool, false),
1565        ];
1566        let rows: Vec<Row> = self
1567            .subscriptions
1568            .iter()
1569            .map(|(name, sub)| {
1570                Row::new(alloc::vec![
1571                    Value::Text(name.clone()),
1572                    Value::Text(sub.conn_str.clone()),
1573                    Value::Text(sub.publications.join(",")),
1574                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1575                    Value::Bool(sub.enabled),
1576                ])
1577            })
1578            .collect();
1579        QueryResult::Rows { columns, rows }
1580    }
1581
1582    /// v6.5.0 — materialise `spg_stat_segment` rows. One row per
1583    /// cold-tier segment with `(segment_id, num_rows, num_pages,
1584    /// total_bytes)`.
1585    ///
1586    /// v6.7.0 — appended `table_name` column resolves the v6.5.0
1587    /// carve-out. Walks every user table's BTree indices to find
1588    /// which table's Cold locators point at each segment. Empty
1589    /// string for orphan segments (loaded via SPG_PRELOAD_COLD_SEGMENT
1590    /// before any index registered a locator). The walk is
1591    /// O(tables × indices × keys); cached per call, not across
1592    /// calls — re-walked on every `SELECT * FROM spg_stat_segment`.
1593    fn exec_spg_stat_segment(&self) -> QueryResult {
1594        let columns = alloc::vec![
1595            ColumnSchema::new("segment_id", DataType::BigInt, false),
1596            ColumnSchema::new("table_name", DataType::Text, false),
1597            ColumnSchema::new("num_rows", DataType::BigInt, false),
1598            ColumnSchema::new("num_pages", DataType::BigInt, false),
1599            ColumnSchema::new("total_bytes", DataType::BigInt, false),
1600        ];
1601        // v6.7.0 — build a segment_id → table_name map by walking
1602        // every user table's BTree indices once. O(tables × indices
1603        // × keys) for the v6.5.0 carve-out resolution; acceptable
1604        // because spg_stat_segment is operator-facing (not on a
1605        // hot-loop path).
1606        let mut segment_owners: alloc::collections::BTreeMap<u32, String> = BTreeMap::new();
1607        for tname in self.catalog.table_names() {
1608            if is_internal_table_name(&tname) {
1609                continue;
1610            }
1611            let Some(t) = self.catalog.get(&tname) else {
1612                continue;
1613            };
1614            for idx in t.indices() {
1615                if let spg_storage::IndexKind::BTree(map) = &idx.kind {
1616                    for (_, locs) in map.iter() {
1617                        for loc in locs {
1618                            if let spg_storage::RowLocator::Cold { segment_id, .. } = loc {
1619                                segment_owners.entry(*segment_id).or_insert_with(|| tname.clone());
1620                            }
1621                        }
1622                    }
1623                }
1624            }
1625        }
1626        let rows: Vec<Row> = self
1627            .catalog
1628            .cold_segment_ids_global()
1629            .iter()
1630            .filter_map(|&id| {
1631                let seg = self.catalog.cold_segment(id)?;
1632                let meta = seg.meta();
1633                let owner = segment_owners
1634                    .get(&id)
1635                    .cloned()
1636                    .unwrap_or_default();
1637                Some(Row::new(alloc::vec![
1638                    Value::BigInt(i64::from(id)),
1639                    Value::Text(owner),
1640                    Value::BigInt(i64::try_from(meta.num_rows).unwrap_or(i64::MAX)),
1641                    Value::BigInt(i64::from(meta.num_pages)),
1642                    Value::BigInt(i64::try_from(meta.total_bytes).unwrap_or(i64::MAX)),
1643                ]))
1644            })
1645            .collect();
1646        QueryResult::Rows { columns, rows }
1647    }
1648
1649    /// v6.5.1 — materialise `spg_stat_query` rows. One row per
1650    /// distinct SQL text recorded since the engine booted, capped
1651    /// at `QUERY_STATS_MAX` (1024). Columns:
1652    ///   sql, exec_count, total_us, mean_us, max_us, last_seen_us
1653    /// mean_us = total_us / exec_count (saturating).
1654    fn exec_spg_stat_query(&self) -> QueryResult {
1655        let columns = alloc::vec![
1656            ColumnSchema::new("sql", DataType::Text, false),
1657            ColumnSchema::new("exec_count", DataType::BigInt, false),
1658            ColumnSchema::new("total_us", DataType::BigInt, false),
1659            ColumnSchema::new("mean_us", DataType::BigInt, false),
1660            ColumnSchema::new("max_us", DataType::BigInt, false),
1661            ColumnSchema::new("last_seen_us", DataType::BigInt, false),
1662        ];
1663        let rows: Vec<Row> = self
1664            .query_stats
1665            .snapshot()
1666            .into_iter()
1667            .map(|(sql, s)| {
1668                let mean = if s.exec_count == 0 {
1669                    0
1670                } else {
1671                    s.total_us / s.exec_count
1672                };
1673                Row::new(alloc::vec![
1674                    Value::Text(sql),
1675                    Value::BigInt(i64::try_from(s.exec_count).unwrap_or(i64::MAX)),
1676                    Value::BigInt(i64::try_from(s.total_us).unwrap_or(i64::MAX)),
1677                    Value::BigInt(i64::try_from(mean).unwrap_or(i64::MAX)),
1678                    Value::BigInt(i64::try_from(s.max_us).unwrap_or(i64::MAX)),
1679                    Value::BigInt(i64::try_from(s.last_seen_us).unwrap_or(i64::MAX)),
1680                ])
1681            })
1682            .collect();
1683        QueryResult::Rows { columns, rows }
1684    }
1685
1686    /// v6.5.2 — register a connection-state provider. spg-server
1687    /// calls this at startup with a function that snapshots its
1688    /// per-pgwire-connection registry. Engine reads through the
1689    /// callback on `SELECT * FROM spg_stat_activity`.
1690    #[must_use]
1691    pub const fn with_activity_provider(mut self, f: ActivityProvider) -> Self {
1692        self.activity_provider = Some(f);
1693        self
1694    }
1695
1696    /// v6.5.3 — register audit chain provider + verifier.
1697    #[must_use]
1698    pub const fn with_audit_providers(
1699        mut self,
1700        chain: AuditChainProvider,
1701        verify: AuditVerifier,
1702    ) -> Self {
1703        self.audit_chain_provider = Some(chain);
1704        self.audit_verifier = Some(verify);
1705        self
1706    }
1707
1708    /// v6.5.6 — register a slow-query log callback. `threshold_us`
1709    /// is the floor (in microseconds); only executes above the floor
1710    /// fire the callback. spg-server wires this from
1711    /// `SPG_SLOW_QUERY_THRESHOLD_MS` (default 100 ms).
1712    #[must_use]
1713    pub const fn with_slow_query_log(
1714        mut self,
1715        threshold_us: u64,
1716        logger: SlowQueryLogger,
1717    ) -> Self {
1718        self.slow_query_threshold_us = Some(threshold_us);
1719        self.slow_query_logger = Some(logger);
1720        self
1721    }
1722
1723    /// v6.5.6 — operator knob for plan cache cap. spg-server reads
1724    /// `SPG_PLAN_CACHE_MAX` env at startup; uses this to override
1725    /// the compile-time default of 256.
1726    pub fn set_plan_cache_max(&mut self, n: usize) {
1727        self.plan_cache.set_max_entries(n);
1728    }
1729
1730    /// v6.5.2 — materialise `spg_stat_activity` rows. Pulls a fresh
1731    /// snapshot from the registered `ActivityProvider`. Returns an
1732    /// empty result set when no provider is registered (the no_std
1733    /// embedded path with no pgwire layer).
1734    fn exec_spg_stat_activity(&self) -> QueryResult {
1735        let columns = alloc::vec![
1736            ColumnSchema::new("pid", DataType::Int, false),
1737            ColumnSchema::new("user", DataType::Text, false),
1738            ColumnSchema::new("started_at_us", DataType::BigInt, false),
1739            ColumnSchema::new("current_sql", DataType::Text, false),
1740            ColumnSchema::new("wait_event", DataType::Text, false),
1741            ColumnSchema::new("elapsed_us", DataType::BigInt, false),
1742            ColumnSchema::new("in_transaction", DataType::Bool, false),
1743        ];
1744        let rows: Vec<Row> = self
1745            .activity_provider
1746            .map(|f| f())
1747            .unwrap_or_default()
1748            .into_iter()
1749            .map(|r| {
1750                Row::new(alloc::vec![
1751                    Value::Int(i32::try_from(r.pid).unwrap_or(i32::MAX)),
1752                    Value::Text(r.user),
1753                    Value::BigInt(r.started_at_us),
1754                    Value::Text(r.current_sql),
1755                    Value::Text(r.wait_event),
1756                    Value::BigInt(r.elapsed_us),
1757                    Value::Bool(r.in_transaction),
1758                ])
1759            })
1760            .collect();
1761        QueryResult::Rows { columns, rows }
1762    }
1763
1764    /// v6.5.4 — materialise `spg_table_ddl` rows. One row per user
1765    /// table with `(table_name, ddl)`. Reconstructed from catalog
1766    /// state on demand.
1767    fn exec_spg_table_ddl(&self) -> QueryResult {
1768        let columns = alloc::vec![
1769            ColumnSchema::new("table_name", DataType::Text, false),
1770            ColumnSchema::new("ddl", DataType::Text, false),
1771        ];
1772        let rows: Vec<Row> = self
1773            .catalog
1774            .table_names()
1775            .into_iter()
1776            .filter(|n| !is_internal_table_name(n))
1777            .filter_map(|name| {
1778                let table = self.catalog.get(&name)?;
1779                let ddl = render_create_table(&name, &table.schema().columns);
1780                Some(Row::new(alloc::vec![
1781                    Value::Text(name),
1782                    Value::Text(ddl),
1783                ]))
1784            })
1785            .collect();
1786        QueryResult::Rows { columns, rows }
1787    }
1788
1789    /// v6.5.4 — materialise `spg_role_ddl` rows. One row per user
1790    /// with `(role_name, ddl)`. Password is redacted (matches the
1791    /// `Statement::CreateUser` Display which prints `'<redacted>'`).
1792    fn exec_spg_role_ddl(&self) -> QueryResult {
1793        let columns = alloc::vec![
1794            ColumnSchema::new("role_name", DataType::Text, false),
1795            ColumnSchema::new("ddl", DataType::Text, false),
1796        ];
1797        let rows: Vec<Row> = self
1798            .users
1799            .iter()
1800            .map(|(name, rec)| {
1801                let ddl = alloc::format!(
1802                    "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}'",
1803                    rec.role.as_str(),
1804                );
1805                Row::new(alloc::vec![Value::Text(String::from(name)), Value::Text(ddl)])
1806            })
1807            .collect();
1808        QueryResult::Rows { columns, rows }
1809    }
1810
1811    /// v6.5.4 — materialise `spg_database_ddl`: single row whose
1812    /// `ddl` column concatenates every user table's CREATE +
1813    /// every role's CREATE in deterministic catalog order. Suitable
1814    /// for piping back through `Engine::execute` to recreate a
1815    /// schema-equivalent database.
1816    fn exec_spg_database_ddl(&self) -> QueryResult {
1817        let columns = alloc::vec![ColumnSchema::new("ddl", DataType::Text, false)];
1818        let mut out = String::new();
1819        for (name, rec) in self.users.iter() {
1820            out.push_str(&alloc::format!(
1821                "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}';\n",
1822                rec.role.as_str(),
1823            ));
1824        }
1825        for name in self.catalog.table_names() {
1826            if is_internal_table_name(&name) {
1827                continue;
1828            }
1829            if let Some(table) = self.catalog.get(&name) {
1830                out.push_str(&render_create_table(&name, &table.schema().columns));
1831                out.push_str(";\n");
1832            }
1833        }
1834        QueryResult::Rows {
1835            columns,
1836            rows: alloc::vec![Row::new(alloc::vec![Value::Text(out)])],
1837        }
1838    }
1839
1840    /// v6.5.3 — materialise `spg_audit_chain` rows. Pulls a fresh
1841    /// snapshot from the registered provider; empty when no
1842    /// provider is set.
1843    fn exec_spg_audit_chain(&self) -> QueryResult {
1844        let columns = alloc::vec![
1845            ColumnSchema::new("seq", DataType::BigInt, false),
1846            ColumnSchema::new("ts_ms", DataType::BigInt, false),
1847            ColumnSchema::new("prev_hash", DataType::Text, false),
1848            ColumnSchema::new("entry_hash", DataType::Text, false),
1849            ColumnSchema::new("sql", DataType::Text, false),
1850        ];
1851        let rows: Vec<Row> = self
1852            .audit_chain_provider
1853            .map(|f| f())
1854            .unwrap_or_default()
1855            .into_iter()
1856            .map(|r| {
1857                Row::new(alloc::vec![
1858                    Value::BigInt(r.seq),
1859                    Value::BigInt(r.ts_ms),
1860                    Value::Text(r.prev_hash_hex),
1861                    Value::Text(r.entry_hash_hex),
1862                    Value::Text(r.sql),
1863                ])
1864            })
1865            .collect();
1866        QueryResult::Rows { columns, rows }
1867    }
1868
1869    /// v6.5.3 — materialise `spg_audit_verify` single-row result.
1870    /// `(verified_count, broken_at_seq)` — broken_at_seq is `-1`
1871    /// on a clean chain. Returns one row with both values 0 when
1872    /// no verifier is registered (no-data fallback for embedded
1873    /// callers).
1874    fn exec_spg_audit_verify(&self) -> QueryResult {
1875        let columns = alloc::vec![
1876            ColumnSchema::new("verified_count", DataType::BigInt, false),
1877            ColumnSchema::new("broken_at_seq", DataType::BigInt, false),
1878        ];
1879        let (verified, broken) = self.audit_verifier.map(|f| f()).unwrap_or((0, -1));
1880        let row = Row::new(alloc::vec![
1881            Value::BigInt(verified),
1882            Value::BigInt(broken),
1883        ]);
1884        QueryResult::Rows {
1885            columns,
1886            rows: alloc::vec![row],
1887        }
1888    }
1889
1890    /// v6.5.1 — read-only accessor for tests + v6.5.6 ops resets.
1891    pub fn query_stats(&self) -> &query_stats::QueryStats {
1892        &self.query_stats
1893    }
1894
1895    /// v6.5.1 — mutable accessor (clear, etc).
1896    pub fn query_stats_mut(&mut self) -> &mut query_stats::QueryStats {
1897        &mut self.query_stats
1898    }
1899
1900    /// v6.2.0 — read access to the per-column statistics table.
1901    /// Used by the planner (v6.2.2 selectivity functions read this),
1902    /// by `SELECT * FROM spg_statistic`, and by e2e tests.
1903    pub const fn statistics(&self) -> &statistics::Statistics {
1904        &self.statistics
1905    }
1906
1907    /// v6.2.1 — return tables whose modified-row count crossed the
1908    /// auto-analyze threshold since the last ANALYZE on that table.
1909    /// The threshold is `0.1 × max(row_count, MIN_ROWS_FOR_AUTO_
1910    /// ANALYZE)` — combines PG-style fractional + absolute lower
1911    /// bound so a fresh / tiny table doesn't get hammered on every
1912    /// INSERT.
1913    ///
1914    /// Designed to be cheap: walks every user table's
1915    /// `Catalog::table_names()` + reads `statistics::modified_
1916    /// since_last_analyze()` (BTreeMap lookup). The background
1917    /// worker calls this under `engine.read()` then drops the lock
1918    /// before re-acquiring `engine.write()` for the actual ANALYZE.
1919    pub fn tables_needing_analyze(&self) -> Vec<String> {
1920        const MIN_ROWS: u64 = 100;
1921        let mut out = Vec::new();
1922        for name in self.catalog.table_names() {
1923            if is_internal_table_name(&name) {
1924                continue;
1925            }
1926            let Some(table) = self.catalog.get(&name) else {
1927                continue;
1928            };
1929            let row_count = table.rows().len() as u64;
1930            let modified = self.statistics.modified_since_last_analyze(&name);
1931            // Threshold: ceil(0.1 × max(row_count, MIN_ROWS)),
1932            // computed in integer arithmetic so spg-engine stays
1933            // no_std without pulling in libm. `(n + 9) / 10` is
1934            // `ceil(n / 10)` for non-negative `n`.
1935            let base = row_count.max(MIN_ROWS);
1936            let threshold = base.saturating_add(9) / 10;
1937            if modified >= threshold {
1938                out.push(name);
1939            }
1940        }
1941        out
1942    }
1943
1944    /// v6.2.0 — `ANALYZE [<table>]` runtime. Bare `ANALYZE` walks
1945    /// every user table; `ANALYZE <name>` re-stats one. For each
1946    /// target table, single-pass scan + per-column histogram +
1947    /// `null_frac` + `n_distinct`. Replaces the table's prior
1948    /// stats; resets the modified-row counter.
1949    ///
1950    /// v6.2.0 doesn't sample — it scans the full table. v6.2.x
1951    /// can add reservoir sampling at the > 100 K-row mark; not a
1952    /// scope blocker for the current commit since rows ≤ 100 K
1953    /// analyse in milliseconds.
1954    fn exec_analyze(&mut self, target: Option<&str>) -> Result<QueryResult, EngineError> {
1955        let names: Vec<String> = if let Some(name) = target {
1956            // Verify the table exists; surface a clear error if not.
1957            if self.catalog.get(name).is_none() {
1958                return Err(EngineError::Storage(StorageError::TableNotFound {
1959                    name: name.to_string(),
1960                }));
1961            }
1962            alloc::vec![name.to_string()]
1963        } else {
1964            self.catalog
1965                .table_names()
1966                .into_iter()
1967                .filter(|n| !is_internal_table_name(n))
1968                .collect()
1969        };
1970        let mut analysed = 0usize;
1971        for table_name in &names {
1972            self.analyze_one_table(table_name)?;
1973            analysed += 1;
1974        }
1975        // v6.3.1 — plan cache invalidation. Bump stats version so
1976        // future lookups see the new generation, and selectively
1977        // evict every plan whose `source_tables` overlap with the
1978        // ANALYZE target set. Bare ANALYZE (all tables) clears the
1979        // whole cache.
1980        if analysed > 0 {
1981            self.statistics.bump_version();
1982            if target.is_some() {
1983                for t in &names {
1984                    self.plan_cache.evict_referencing(t);
1985                }
1986            } else {
1987                self.plan_cache.clear();
1988            }
1989        }
1990        Ok(QueryResult::CommandOk {
1991            affected: analysed,
1992            modified_catalog: true,
1993        })
1994    }
1995
1996    /// v6.7.3 — `COMPACT COLD SEGMENTS` runtime path. Drives the
1997    /// engine-layer compaction shim with the default
1998    /// 4 MiB segment-size threshold. spg-server intercepts the
1999    /// SQL before it reaches the engine on a server build —
2000    /// it reads `SPG_COMPACTION_TARGET_SEGMENT_BYTES`, calls
2001    /// `Engine::compact_cold_segments_with_target` directly with
2002    /// the env value, and persists every merged segment to
2003    /// `<db>.spg/segments/`. This arm only fires for engine-only
2004    /// callers (spg-embedded, lib tests); in that mode merged
2005    /// segments live in memory and are dropped at process exit.
2006    fn exec_compact_cold_segments(&mut self) -> Result<QueryResult, EngineError> {
2007        let target = COMPACTION_TARGET_DEFAULT_BYTES;
2008        let reports = self.compact_cold_segments_with_target(target)?;
2009        let columns = alloc::vec![
2010            ColumnSchema::new("table_name", DataType::Text, false),
2011            ColumnSchema::new("index_name", DataType::Text, false),
2012            ColumnSchema::new("sources_merged", DataType::BigInt, false),
2013            ColumnSchema::new("merged_segment_id", DataType::BigInt, false),
2014            ColumnSchema::new("merged_rows", DataType::BigInt, false),
2015            ColumnSchema::new("deleted_rows_pruned", DataType::BigInt, false),
2016            ColumnSchema::new("bytes_reclaimed_estimate", DataType::BigInt, false),
2017        ];
2018        let rows: Vec<Row> = reports
2019            .into_iter()
2020            .map(|(tname, iname, report)| {
2021                Row::new(alloc::vec![
2022                    Value::Text(tname),
2023                    Value::Text(iname),
2024                    Value::BigInt(i64::try_from(report.sources.len()).unwrap_or(i64::MAX)),
2025                    Value::BigInt(i64::from(report.merged_segment_id.unwrap_or(0))),
2026                    Value::BigInt(i64::try_from(report.merged_rows).unwrap_or(i64::MAX)),
2027                    Value::BigInt(
2028                        i64::try_from(report.deleted_rows_pruned).unwrap_or(i64::MAX),
2029                    ),
2030                    Value::BigInt(
2031                        i64::try_from(report.bytes_reclaimed_estimate).unwrap_or(i64::MAX),
2032                    ),
2033                ])
2034            })
2035            .collect();
2036        Ok(QueryResult::Rows { columns, rows })
2037    }
2038
2039    /// Walk a single table's rows once and (re-)populate per-column
2040    /// stats. Drops the existing stats for `table` first so columns
2041    /// that have been DROP-ed between ANALYZEs don't leave stale
2042    /// rows.
2043    fn analyze_one_table(&mut self, table_name: &str) -> Result<(), EngineError> {
2044        let table = self.catalog.get(table_name).ok_or_else(|| {
2045            EngineError::Storage(StorageError::TableNotFound {
2046                name: table_name.to_string(),
2047            })
2048        })?;
2049        let schema = table.schema().clone();
2050        let row_count = table.rows().len();
2051        // For each column, collect (sorted) non-NULL textual values
2052        // + count NULLs; then ask `statistics::build_histogram` to
2053        // produce the 101 bounds and `estimate_n_distinct` the
2054        // distinct count.
2055        self.statistics.clear_table(table_name);
2056        for (col_pos, col_schema) in schema.columns.iter().enumerate() {
2057            // v6.2.0 skip: vector columns have their own stats
2058            // shape (HNSW graph topology). v6.2 deliberation #1.
2059            if matches!(col_schema.ty, DataType::Vector { .. }) {
2060                continue;
2061            }
2062            let mut non_null_values: Vec<Value> = Vec::with_capacity(row_count);
2063            let mut nulls: u64 = 0;
2064            for row in table.rows() {
2065                match row.values.get(col_pos) {
2066                    Some(Value::Null) | None => nulls += 1,
2067                    Some(v) => non_null_values.push(v.clone()),
2068                }
2069            }
2070            // Sort by type-aware ordering (Int as int, Text as
2071            // lex, etc.) so histogram bounds reflect the column's
2072            // natural order — not lexicographic on the string
2073            // representation, which would put "9" after "49".
2074            non_null_values.sort_by(|a, b| sort_values_for_histogram(a, b));
2075            let non_null: Vec<String> = non_null_values
2076                .iter()
2077                .map(canonical_value_repr)
2078                .collect();
2079            let null_frac = if row_count == 0 {
2080                0.0
2081            } else {
2082                #[allow(clippy::cast_precision_loss)]
2083                let f = nulls as f32 / row_count as f32;
2084                f
2085            };
2086            let n_distinct = statistics::estimate_n_distinct(&non_null);
2087            let histogram_bounds = statistics::build_histogram(&non_null);
2088            self.statistics.set(
2089                table_name.to_string(),
2090                col_schema.name.clone(),
2091                statistics::ColumnStats {
2092                    null_frac,
2093                    n_distinct,
2094                    histogram_bounds,
2095                },
2096            );
2097        }
2098        self.statistics.reset_modified(table_name);
2099        // v6.7.0 — refresh the per-table cold_rows cache. Walk the
2100        // BTree indices and count Cold locators (MAX across
2101        // indices); store the result on the table. Surfaced via
2102        // `spg_statistic.cold_row_count` (new column) and
2103        // `spg_stat_segment.table_name` (new column).
2104        let cold_count = {
2105            let table = self
2106                .active_catalog()
2107                .get(table_name)
2108                .expect("table still present");
2109            table.count_cold_locators()
2110        };
2111        let table_mut = self
2112            .active_catalog_mut()
2113            .get_mut(table_name)
2114            .expect("table still present");
2115        table_mut.set_cold_row_count(cold_count);
2116        Ok(())
2117    }
2118
2119    /// v6.1.3 — `SHOW PUBLICATIONS` row materialisation. Returns
2120    /// `(name, scope, table_count)` ordered by publication name.
2121    ///   - `scope` is the human-readable string:
2122    ///       `"FOR ALL TABLES"` /
2123    ///       `"FOR TABLE t1, t2"` /
2124    ///       `"FOR ALL TABLES EXCEPT t1, t2"`.
2125    ///   - `table_count` is NULL for `AllTables`, the list length
2126    ///     otherwise. NULLability lets clients distinguish "publish
2127    ///     everything" from "publish exactly 0 tables" (the v6.1.3
2128    ///     parser forbids the empty list, but the column shape is
2129    ///     ready for the v6.1.5 publisher-side semantics).
2130    fn exec_show_publications(&self) -> QueryResult {
2131        let columns = alloc::vec![
2132            ColumnSchema::new("name", DataType::Text, false),
2133            ColumnSchema::new("scope", DataType::Text, false),
2134            ColumnSchema::new("table_count", DataType::Int, true),
2135        ];
2136        let rows: Vec<Row> = self
2137            .publications
2138            .iter()
2139            .map(|(name, scope)| {
2140                let (scope_str, count_val) = match scope {
2141                    spg_sql::ast::PublicationScope::AllTables => {
2142                        ("FOR ALL TABLES".to_string(), Value::Null)
2143                    }
2144                    spg_sql::ast::PublicationScope::ForTables(ts) => (
2145                        alloc::format!("FOR TABLE {}", ts.join(", ")),
2146                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2147                    ),
2148                    spg_sql::ast::PublicationScope::AllTablesExcept(ts) => (
2149                        alloc::format!("FOR ALL TABLES EXCEPT {}", ts.join(", ")),
2150                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2151                    ),
2152                };
2153                Row::new(alloc::vec![
2154                    Value::Text(name.clone()),
2155                    Value::Text(scope_str),
2156                    count_val,
2157                ])
2158            })
2159            .collect();
2160        QueryResult::Rows { columns, rows }
2161    }
2162
2163    /// v4.1 `SHOW USERS` — `(name, role)` per row, ordered by name.
2164    fn exec_show_users(&self) -> QueryResult {
2165        let columns = alloc::vec![
2166            ColumnSchema::new("name", DataType::Text, false),
2167            ColumnSchema::new("role", DataType::Text, false),
2168        ];
2169        let rows: Vec<Row> = self
2170            .users
2171            .iter()
2172            .map(|(name, rec)| {
2173                Row::new(alloc::vec![
2174                    Value::Text(name.to_string()),
2175                    Value::Text(rec.role.as_str().to_string()),
2176                ])
2177            })
2178            .collect();
2179        QueryResult::Rows { columns, rows }
2180    }
2181
2182    fn exec_create_user(&mut self, s: &CreateUserStatement) -> Result<QueryResult, EngineError> {
2183        if self.in_transaction() {
2184            return Err(EngineError::Unsupported(
2185                "CREATE USER is not allowed inside a transaction".into(),
2186            ));
2187        }
2188        let role = users::Role::parse(&s.role).ok_or_else(|| {
2189            EngineError::Unsupported(alloc::format!("invalid role: {:?}", s.role))
2190        })?;
2191        // Prefer the host-injected RNG. Falls back to a deterministic
2192        // salt derived from the username only when no RNG is wired —
2193        // acceptable for tests; the server always installs one.
2194        let salt = self.salt_fn.map_or_else(
2195            || {
2196                let mut s_bytes = [0u8; 16];
2197                let digest = spg_crypto::hash(s.name.as_bytes());
2198                s_bytes.copy_from_slice(&digest[..16]);
2199                s_bytes
2200            },
2201            |f| f(),
2202        );
2203        self.users
2204            .create(&s.name, &s.password, role, salt)
2205            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE USER: {e}")))?;
2206        Ok(QueryResult::CommandOk {
2207            affected: 1,
2208            modified_catalog: true,
2209        })
2210    }
2211
2212    fn exec_drop_user(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2213        if self.in_transaction() {
2214            return Err(EngineError::Unsupported(
2215                "DROP USER is not allowed inside a transaction".into(),
2216            ));
2217        }
2218        self.users
2219            .drop(name)
2220            .map_err(|e| EngineError::Unsupported(alloc::format!("DROP USER: {e}")))?;
2221        Ok(QueryResult::CommandOk {
2222            affected: 1,
2223            modified_catalog: true,
2224        })
2225    }
2226
2227    /// v4.4 `UPDATE <table> SET col = expr [, ...] [WHERE cond]`.
2228    /// Filter pass uses the same WHERE eval as `exec_select`. Per
2229    /// matched row, evaluate each RHS expression against the *old*
2230    /// row, then call `Table::update_row` which rebuilds indices.
2231    /// Indexed columns are correctly reflected because rebuild
2232    /// happens after the cell rewrite.
2233    fn exec_update_cancel(
2234        &mut self,
2235        stmt: &spg_sql::ast::UpdateStatement,
2236        cancel: CancelToken<'_>,
2237    ) -> Result<QueryResult, EngineError> {
2238        // v5.2.3: if the WHERE is a PK equality and matches a cold-
2239        // tier row, promote it back to the hot tier *before* the
2240        // hot-row walk. The promote pushes the row to the end of
2241        // `table.rows`, where the upcoming SET-evaluation loop will
2242        // pick it up and apply the assignments. Lookups for the key
2243        // never observe a gap because `promote_cold_row` inserts the
2244        // hot row before retiring the cold locator.
2245        if let Some(w) = &stmt.where_ {
2246            let schema_cols = self
2247                .active_catalog()
2248                .get(&stmt.table)
2249                .ok_or_else(|| {
2250                    EngineError::Storage(StorageError::TableNotFound {
2251                        name: stmt.table.clone(),
2252                    })
2253                })?
2254                .schema()
2255                .columns
2256                .clone();
2257            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
2258                && let Some(idx_name) = self
2259                    .active_catalog()
2260                    .get(&stmt.table)
2261                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
2262            {
2263                // Promote may be a no-op (key is hot-only or absent);
2264                // we don't care about the return value here — the
2265                // subsequent hot walk will either match or not.
2266                let _ = self
2267                    .active_catalog_mut()
2268                    .promote_cold_row(&stmt.table, &idx_name, &key);
2269            }
2270        }
2271
2272        let table = self
2273            .active_catalog_mut()
2274            .get_mut(&stmt.table)
2275            .ok_or_else(|| {
2276                EngineError::Storage(StorageError::TableNotFound {
2277                    name: stmt.table.clone(),
2278                })
2279            })?;
2280        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
2281        // Resolve each SET target to a column position once, validate
2282        // up front so a typo'd column doesn't leave a partial mutation
2283        // behind.
2284        let mut targets: Vec<(usize, &Expr)> = Vec::with_capacity(stmt.assignments.len());
2285        for (col, expr) in &stmt.assignments {
2286            let pos = schema_cols
2287                .iter()
2288                .position(|c| c.name == *col)
2289                .ok_or_else(|| {
2290                    EngineError::Eval(EvalError::ColumnNotFound { name: col.clone() })
2291                })?;
2292            targets.push((pos, expr));
2293        }
2294        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
2295        // Walk every row, evaluate WHERE then SET expressions. We
2296        // gather (position, new_values) tuples first and apply them
2297        // afterwards so the WHERE/RHS evaluation reads the original
2298        // row state — matches PG semantics (UPDATE doesn't see its
2299        // own writes).
2300        let mut planned: Vec<(usize, Vec<Value>)> = Vec::new();
2301        for (i, row) in table.rows().iter().enumerate() {
2302            // v4.5: cooperative cancel checkpoint every 256 rows so
2303            // a runaway UPDATE without WHERE doesn't drag past the
2304            // server's query-timeout watchdog.
2305            if i.is_multiple_of(256) {
2306                cancel.check()?;
2307            }
2308            if let Some(w) = &stmt.where_ {
2309                let cond = eval::eval_expr(w, row, &ctx)?;
2310                if !matches!(cond, Value::Bool(true)) {
2311                    continue;
2312                }
2313            }
2314            let mut new_vals = row.values.clone();
2315            for (pos, expr) in &targets {
2316                let v = eval::eval_expr(expr, row, &ctx)?;
2317                new_vals[*pos] =
2318                    coerce_value(v, schema_cols[*pos].ty, &schema_cols[*pos].name, *pos)?;
2319            }
2320            planned.push((i, new_vals));
2321        }
2322        // v7.6.6 — capture pre-update row values for the FK
2323        // enforcement passes below. `planned` carries new values
2324        // only; pair them with the old row.
2325        let plan_with_old: Vec<(usize, Vec<Value>, Vec<Value>)> = planned
2326            .iter()
2327            .map(|(pos, new_vals)| (*pos, table.rows()[*pos].values.clone(), new_vals.clone()))
2328            .collect();
2329        let self_fks = table.schema().foreign_keys.clone();
2330        let affected = planned.len();
2331        // Release mutable borrow on `table` for the FK passes.
2332        let _ = table;
2333        // v7.6.6 — Stage 2a: outbound FK check. For every row whose
2334        // local FK columns changed, the new value must exist in the
2335        // parent.
2336        if !self_fks.is_empty() {
2337            let new_rows: Vec<Vec<Value>> = planned
2338                .iter()
2339                .map(|(_pos, new_vals)| new_vals.clone())
2340                .collect();
2341            enforce_fk_inserts(self.active_catalog(), &stmt.table, &self_fks, &new_rows)?;
2342        }
2343        // v7.6.6 — Stage 2b: inbound FK check. For every row that
2344        // changed value in a column that *some other table* uses as
2345        // a FK parent column, react per `on_update` action.
2346        let child_plan = plan_fk_parent_updates(self.active_catalog(), &stmt.table, &plan_with_old)?;
2347        // Stage 3a — apply each child-side action.
2348        for step in &child_plan {
2349            apply_fk_child_step(self.active_catalog_mut(), step)?;
2350        }
2351        // Stage 3b — apply the original UPDATE.
2352        let table = self
2353            .active_catalog_mut()
2354            .get_mut(&stmt.table)
2355            .ok_or_else(|| {
2356                EngineError::Storage(StorageError::TableNotFound {
2357                    name: stmt.table.clone(),
2358                })
2359            })?;
2360        // v7.9.4 — snapshot post-update values for RETURNING.
2361        let updated_for_returning: Vec<Vec<Value>> =
2362            if stmt.returning.is_some() {
2363                planned.iter().map(|(_pos, vals)| vals.clone()).collect()
2364            } else {
2365                Vec::new()
2366            };
2367        for (pos, vals) in planned {
2368            table.update_row(pos, vals)?;
2369        }
2370        let _ = table;
2371        // v6.2.1 — auto-analyze modified-row tracking for UPDATE.
2372        if !self.in_transaction() && affected > 0 {
2373            self.statistics
2374                .record_modifications(&stmt.table, affected as u64);
2375        }
2376        // v7.9.4 — RETURNING projection.
2377        if let Some(items) = &stmt.returning {
2378            return self.build_returning_rows(
2379                &stmt.table,
2380                items,
2381                updated_for_returning,
2382            );
2383        }
2384        Ok(QueryResult::CommandOk {
2385            affected,
2386            modified_catalog: !self.in_transaction(),
2387        })
2388    }
2389
2390    /// v4.4 `DELETE FROM <table> [WHERE cond]`. Collects matching
2391    /// positions then delegates to `Table::delete_rows` (single index
2392    /// rebuild for the batch).
2393    fn exec_delete_cancel(
2394        &mut self,
2395        stmt: &spg_sql::ast::DeleteStatement,
2396        cancel: CancelToken<'_>,
2397    ) -> Result<QueryResult, EngineError> {
2398        // v5.2.3: PK-targeted DELETE → first retire any cold-tier
2399        // locator for the key. The cold row body stays in the
2400        // segment (becoming shadowed garbage that a future
2401        // compaction pass reclaims) but the index no longer
2402        // resolves it. The shadow count contributes to the
2403        // affected total; the subsequent hot walk handles any hot
2404        // rows for the same key.
2405        let mut cold_shadow_count: usize = 0;
2406        if let Some(w) = &stmt.where_ {
2407            let schema_cols = self
2408                .active_catalog()
2409                .get(&stmt.table)
2410                .ok_or_else(|| {
2411                    EngineError::Storage(StorageError::TableNotFound {
2412                        name: stmt.table.clone(),
2413                    })
2414                })?
2415                .schema()
2416                .columns
2417                .clone();
2418            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
2419                && let Some(idx_name) = self
2420                    .active_catalog()
2421                    .get(&stmt.table)
2422                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
2423            {
2424                cold_shadow_count = self
2425                    .active_catalog_mut()
2426                    .shadow_cold_row(&stmt.table, &idx_name, &key)
2427                    .unwrap_or(0);
2428            }
2429        }
2430
2431        let table = self
2432            .active_catalog_mut()
2433            .get_mut(&stmt.table)
2434            .ok_or_else(|| {
2435                EngineError::Storage(StorageError::TableNotFound {
2436                    name: stmt.table.clone(),
2437                })
2438            })?;
2439        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
2440        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
2441        let mut positions: Vec<usize> = Vec::new();
2442        // v7.6.3 — collect every to-delete row's full Value tuple
2443        // alongside its position, so the FK enforcement pass can
2444        // run after the mut borrow drops.
2445        let mut to_delete_rows: Vec<Vec<Value>> = Vec::new();
2446        for (i, row) in table.rows().iter().enumerate() {
2447            if i.is_multiple_of(256) {
2448                cancel.check()?;
2449            }
2450            let keep = if let Some(w) = &stmt.where_ {
2451                let cond = eval::eval_expr(w, row, &ctx)?;
2452                !matches!(cond, Value::Bool(true))
2453            } else {
2454                false
2455            };
2456            if !keep {
2457                positions.push(i);
2458                to_delete_rows.push(row.values.clone());
2459            }
2460        }
2461        // v7.6.3 / v7.6.4 — Stage 2: FK enforcement on the immutable
2462        // catalog. Release the mut borrow and run reverse-scan
2463        // against every child table whose FK targets this table.
2464        // RESTRICT / NoAction raise an error; CASCADE returns a
2465        // cascade plan that stage 3 applies after the primary delete.
2466        // SET NULL / SET DEFAULT remain Unsupported until v7.6.5.
2467        let _ = table;
2468        let cascade_plan = plan_fk_parent_deletions(
2469            self.active_catalog(),
2470            &stmt.table,
2471            &positions,
2472            &to_delete_rows,
2473        )?;
2474        // Stage 3a — apply each FK child step (SET NULL / SET
2475        // DEFAULT / CASCADE delete) before deleting the parent.
2476        // The plan is already ordered: nulls/defaults first, then
2477        // cascade deletes (so a row mutated and later deleted
2478        // surfaces as deleted — though v7.6.5 doesn't produce
2479        // that overlap today).
2480        for step in &cascade_plan {
2481            apply_fk_child_step(self.active_catalog_mut(), step)?;
2482        }
2483        // Stage 3b — actually delete the original target rows.
2484        let table = self
2485            .active_catalog_mut()
2486            .get_mut(&stmt.table)
2487            .ok_or_else(|| {
2488                EngineError::Storage(StorageError::TableNotFound {
2489                    name: stmt.table.clone(),
2490                })
2491            })?;
2492        let affected = table.delete_rows(&positions) + cold_shadow_count;
2493        let _ = table;
2494        // v6.2.1 — auto-analyze modified-row tracking for DELETE.
2495        if !self.in_transaction() && affected > 0 {
2496            self.statistics
2497                .record_modifications(&stmt.table, affected as u64);
2498        }
2499        // v7.9.4 — RETURNING projection over the soon-to-be-gone
2500        // rows. `to_delete_rows` was snapshotted in stage 1 before
2501        // mutation, so the projection sees the pre-delete state
2502        // (matches PG semantics: DELETE RETURNING returns the row
2503        // as it was just before removal).
2504        if let Some(items) = &stmt.returning {
2505            return self.build_returning_rows(
2506                &stmt.table,
2507                items,
2508                to_delete_rows,
2509            );
2510        }
2511        Ok(QueryResult::CommandOk {
2512            affected,
2513            modified_catalog: !self.in_transaction(),
2514        })
2515    }
2516
2517    /// `SHOW TABLES` — one row per table in the active catalog.
2518    /// Column name is `name` so result-set consumers can downstream
2519    /// `SELECT name FROM ...` style logic if needed.
2520    /// v4.26: `EXPLAIN [ANALYZE] <select>`. Returns a single-column
2521    /// `QUERY PLAN` text table — first line names the top operator
2522    /// (Scan / Aggregate / Window / etc.), indented children list
2523    /// FROM joins, WHERE filters, ORDER BY / LIMIT, projection
2524    /// shape, and any active index hits. `ANALYZE` execs the inner
2525    /// SELECT and appends actual-row + elapsed-micros annotations.
2526    #[allow(clippy::format_push_string)]
2527    fn exec_explain(
2528        &self,
2529        e: &spg_sql::ast::ExplainStatement,
2530        cancel: CancelToken<'_>,
2531    ) -> Result<QueryResult, EngineError> {
2532        let mut lines = Vec::<String>::new();
2533        explain_select(&e.inner, self, 0, &mut lines);
2534        if e.suggest {
2535            // v6.8.3 — index advisor. Walks the SELECT's FROM
2536            // tables + WHERE column refs; for each (table, column)
2537            // pair that lacks an index, append a SUGGEST line with
2538            // a copy-pastable `CREATE INDEX` statement. This is a
2539            // pure-syntax heuristic — no cardinality estimation —
2540            // matching the v6.8.3 design intent of "tell the
2541            // operator where indexes are missing", not "give the
2542            // mathematically optimal index set".
2543            let suggestions = build_index_suggestions(&e.inner, self);
2544            for s in suggestions {
2545                lines.push(s);
2546            }
2547        } else if e.analyze {
2548            // v6.2.4 — EXPLAIN ANALYZE annotates each operator line
2549            // with `(rows=N)` where the row count is computable
2550            // without re-executing the full query:
2551            //   - Top-level operator (first non-indented line):
2552            //     rows = final result.len()
2553            //   - "From: <table> [full scan]" lines: rows =
2554            //     table.rows().len() (catalog read; no execution)
2555            //   - "From: <table> [index seek]": indeterminate —
2556            //     the index step would need re-execution; v6.2.5
2557            //     adds per-operator wall-clock + hot/cold rows
2558            //     instrumentation that makes this concrete.
2559            //   - Everything else: marked `(—)` so the surface
2560            //     stays well-defined without silently dropping
2561            //     stats. v6.2.5 fills in via inline executor
2562            //     instrumentation.
2563            // Total elapsed lands on a trailing `Total: …` line.
2564            let started = self.clock.map(|f| f());
2565            let exec = self.exec_select_cancel(&e.inner, cancel)?;
2566            let elapsed_micros = match (self.clock, started) {
2567                (Some(f), Some(s)) => Some(f().saturating_sub(s)),
2568                _ => None,
2569            };
2570            let row_count = if let QueryResult::Rows { rows, .. } = &exec {
2571                rows.len()
2572            } else {
2573                0
2574            };
2575            annotate_explain_lines(&mut lines, row_count, self);
2576            let mut total = alloc::format!("Total: rows={row_count}");
2577            if let Some(us) = elapsed_micros {
2578                total.push_str(&alloc::format!(" elapsed={us}us"));
2579            }
2580            lines.push(total);
2581        }
2582        let columns = alloc::vec![ColumnSchema::new("QUERY PLAN", DataType::Text, false)];
2583        let rows: Vec<Row> = lines
2584            .into_iter()
2585            .map(|l| Row::new(alloc::vec![Value::Text(l)]))
2586            .collect();
2587        Ok(QueryResult::Rows { columns, rows })
2588    }
2589
2590    fn exec_show_tables(&self) -> QueryResult {
2591        let columns = alloc::vec![ColumnSchema::new("name", DataType::Text, false)];
2592        let rows: Vec<Row> = self
2593            .active_catalog()
2594            .table_names()
2595            .into_iter()
2596            .map(|n| Row::new(alloc::vec![Value::Text(n)]))
2597            .collect();
2598        QueryResult::Rows { columns, rows }
2599    }
2600
2601    /// `SHOW COLUMNS FROM <table>` — one row per column with the
2602    /// declared name, SQL type rendering, and nullability flag.
2603    fn exec_show_columns(&self, table_name: &str) -> Result<QueryResult, EngineError> {
2604        let table =
2605            self.active_catalog()
2606                .get(table_name)
2607                .ok_or_else(|| StorageError::TableNotFound {
2608                    name: table_name.into(),
2609                })?;
2610        let columns = alloc::vec![
2611            ColumnSchema::new("name", DataType::Text, false),
2612            ColumnSchema::new("type", DataType::Text, false),
2613            ColumnSchema::new("nullable", DataType::Bool, false),
2614        ];
2615        let rows: Vec<Row> = table
2616            .schema()
2617            .columns
2618            .iter()
2619            .map(|c| {
2620                Row::new(alloc::vec![
2621                    Value::Text(c.name.clone()),
2622                    Value::Text(alloc::format!("{}", c.ty)),
2623                    Value::Bool(c.nullable),
2624                ])
2625            })
2626            .collect();
2627        Ok(QueryResult::Rows { columns, rows })
2628    }
2629
2630    fn exec_begin(&mut self) -> Result<QueryResult, EngineError> {
2631        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2632        if self.tx_catalogs.contains_key(&tx_id) {
2633            return Err(EngineError::TransactionAlreadyOpen);
2634        }
2635        self.tx_catalogs.insert(
2636            tx_id,
2637            TxState {
2638                catalog: self.catalog.clone(),
2639                savepoints: Vec::new(),
2640            },
2641        );
2642        Ok(QueryResult::CommandOk {
2643            affected: 0,
2644            modified_catalog: false,
2645        })
2646    }
2647
2648    fn exec_commit(&mut self) -> Result<QueryResult, EngineError> {
2649        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2650        let state = self
2651            .tx_catalogs
2652            .remove(&tx_id)
2653            .ok_or(EngineError::NoActiveTransaction)?;
2654        self.catalog = state.catalog;
2655        // All savepoints become permanent at COMMIT and the stack
2656        // resets for the next TX (`state.savepoints` is discarded with
2657        // `state`).
2658        Ok(QueryResult::CommandOk {
2659            affected: 0,
2660            modified_catalog: true,
2661        })
2662    }
2663
2664    fn exec_rollback(&mut self) -> Result<QueryResult, EngineError> {
2665        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2666        if self.tx_catalogs.remove(&tx_id).is_none() {
2667            return Err(EngineError::NoActiveTransaction);
2668        }
2669        // savepoints discarded with the TxState
2670        Ok(QueryResult::CommandOk {
2671            affected: 0,
2672            modified_catalog: false,
2673        })
2674    }
2675
2676    fn exec_savepoint(&mut self, name: String) -> Result<QueryResult, EngineError> {
2677        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2678        let state = self
2679            .tx_catalogs
2680            .get_mut(&tx_id)
2681            .ok_or(EngineError::NoActiveTransaction)?;
2682        // PG re-uses an existing savepoint name by dropping the older
2683        // entry and pushing a fresh one — match that behaviour so
2684        // application code can `SAVEPOINT sp; ...; SAVEPOINT sp` freely.
2685        state.savepoints.retain(|(n, _)| n != &name);
2686        let snapshot = state.catalog.clone();
2687        state.savepoints.push((name, snapshot));
2688        Ok(QueryResult::CommandOk {
2689            affected: 0,
2690            modified_catalog: false,
2691        })
2692    }
2693
2694    fn exec_rollback_to_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2695        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2696        let state = self
2697            .tx_catalogs
2698            .get_mut(&tx_id)
2699            .ok_or(EngineError::NoActiveTransaction)?;
2700        let pos = state
2701            .savepoints
2702            .iter()
2703            .rposition(|(n, _)| n == name)
2704            .ok_or_else(|| {
2705                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2706            })?;
2707        // The savepoint stays on the stack (PG semantics): a later
2708        // `RELEASE` or further `ROLLBACK TO` is still allowed. Everything
2709        // after it is discarded.
2710        let snapshot = state.savepoints[pos].1.clone();
2711        state.savepoints.truncate(pos + 1);
2712        state.catalog = snapshot;
2713        Ok(QueryResult::CommandOk {
2714            affected: 0,
2715            modified_catalog: false,
2716        })
2717    }
2718
2719    fn exec_release_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2720        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2721        let state = self
2722            .tx_catalogs
2723            .get_mut(&tx_id)
2724            .ok_or(EngineError::NoActiveTransaction)?;
2725        let pos = state
2726            .savepoints
2727            .iter()
2728            .rposition(|(n, _)| n == name)
2729            .ok_or_else(|| {
2730                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2731            })?;
2732        // RELEASE keeps the work since the savepoint, just discards the
2733        // bookmark plus everything nested under it.
2734        state.savepoints.truncate(pos);
2735        Ok(QueryResult::CommandOk {
2736            affected: 0,
2737            modified_catalog: false,
2738        })
2739    }
2740
2741    /// v6.0.4 — synchronous `ALTER INDEX <name> REBUILD [WITH
2742    /// (encoding = …)]`. Walks every table in the active catalog
2743    /// looking for an index matching `stmt.name`, then delegates the
2744    /// rebuild (including any encoding switch) to
2745    /// `Table::rebuild_nsw_index`. The "live" non-blocking
2746    /// optimisation is v6.0.4.1 / v6.1.x territory.
2747    /// v6.7.2 — `ALTER TABLE t SET hot_tier_bytes = X`. Dispatch
2748    /// arm. Currently the only setting is `hot_tier_bytes`; later
2749    /// v6.7.x can extend `AlterTableTarget` without touching this
2750    /// arm structure.
2751    fn exec_alter_table(
2752        &mut self,
2753        s: spg_sql::ast::AlterTableStatement,
2754    ) -> Result<QueryResult, EngineError> {
2755        match s.target {
2756            spg_sql::ast::AlterTableTarget::SetHotTierBytes(n) => {
2757                let table = self
2758                    .active_catalog_mut()
2759                    .get_mut(&s.name)
2760                    .ok_or_else(|| {
2761                        EngineError::Storage(StorageError::TableNotFound {
2762                            name: s.name.clone(),
2763                        })
2764                    })?;
2765                table.schema_mut().hot_tier_bytes = Some(n);
2766            }
2767            spg_sql::ast::AlterTableTarget::AddForeignKey(fk) => {
2768                // v7.6.8 — resolve FK against the live catalog first
2769                // (validates parent table, columns, indices). Then
2770                // verify every existing row in the child table
2771                // satisfies the new constraint. Then install it.
2772                let cols_snapshot = self
2773                    .active_catalog()
2774                    .get(&s.name)
2775                    .ok_or_else(|| {
2776                        EngineError::Storage(StorageError::TableNotFound {
2777                            name: s.name.clone(),
2778                        })
2779                    })?
2780                    .schema()
2781                    .columns
2782                    .clone();
2783                let storage_fk = resolve_foreign_key(
2784                    &s.name,
2785                    &cols_snapshot,
2786                    fk,
2787                    self.active_catalog(),
2788                )?;
2789                // Verify existing rows. Treat them as a virtual
2790                // INSERT batch — reusing the v7.6.2 enforce helper.
2791                let existing_rows: Vec<Vec<Value>> = self
2792                    .active_catalog()
2793                    .get(&s.name)
2794                    .expect("checked above")
2795                    .rows()
2796                    .iter()
2797                    .map(|r| r.values.clone())
2798                    .collect();
2799                enforce_fk_inserts(
2800                    self.active_catalog(),
2801                    &s.name,
2802                    core::slice::from_ref(&storage_fk),
2803                    &existing_rows,
2804                )?;
2805                // Reject duplicate constraint name.
2806                let table = self
2807                    .active_catalog_mut()
2808                    .get_mut(&s.name)
2809                    .expect("checked above");
2810                if let Some(name) = &storage_fk.name
2811                    && table
2812                        .schema()
2813                        .foreign_keys
2814                        .iter()
2815                        .any(|f| f.name.as_ref() == Some(name))
2816                {
2817                    return Err(EngineError::Unsupported(alloc::format!(
2818                        "ALTER TABLE ADD CONSTRAINT: a constraint named {name:?} already exists"
2819                    )));
2820                }
2821                table.schema_mut().foreign_keys.push(storage_fk);
2822            }
2823            spg_sql::ast::AlterTableTarget::DropForeignKey(name) => {
2824                let table = self
2825                    .active_catalog_mut()
2826                    .get_mut(&s.name)
2827                    .ok_or_else(|| {
2828                        EngineError::Storage(StorageError::TableNotFound {
2829                            name: s.name.clone(),
2830                        })
2831                    })?;
2832                let fks = &mut table.schema_mut().foreign_keys;
2833                let before = fks.len();
2834                fks.retain(|f| f.name.as_ref() != Some(&name));
2835                if fks.len() == before {
2836                    return Err(EngineError::Unsupported(alloc::format!(
2837                        "ALTER TABLE DROP CONSTRAINT: no FK named {name:?} on {:?}",
2838                        s.name
2839                    )));
2840                }
2841            }
2842        }
2843        Ok(QueryResult::CommandOk {
2844            affected: 0,
2845            modified_catalog: !self.in_transaction(),
2846        })
2847    }
2848
2849    fn exec_alter_index(
2850        &mut self,
2851        stmt: spg_sql::ast::AlterIndexStatement,
2852    ) -> Result<QueryResult, EngineError> {
2853        // Translate the optional SQL-side encoding choice into the
2854        // storage-side enum; the same SqlVecEncoding -> VecEncoding
2855        // bridge `column_type_to_data_type` uses.
2856        let spg_sql::ast::AlterIndexStatement {
2857            name: idx_name,
2858            target,
2859        } = stmt;
2860        let spg_sql::ast::AlterIndexTarget::Rebuild { encoding } = target;
2861        let target = encoding.map(|e| match e {
2862            SqlVecEncoding::F32 => VecEncoding::F32,
2863            SqlVecEncoding::Sq8 => VecEncoding::Sq8,
2864            SqlVecEncoding::F16 => VecEncoding::F16,
2865        });
2866        // Linear scan: index names are globally unique within a
2867        // catalog (enforced by add_nsw_index_inner) so the first
2868        // match is the only one. Save the table name to avoid
2869        // borrowing while we then take a mut borrow.
2870        let table_name = {
2871            let cat = self.active_catalog();
2872            let mut found: Option<String> = None;
2873            for tname in cat.table_names() {
2874                if let Some(t) = cat.get(&tname)
2875                    && t.indices().iter().any(|i| i.name == idx_name)
2876                {
2877                    found = Some(tname);
2878                    break;
2879                }
2880            }
2881            found.ok_or_else(|| {
2882                EngineError::Storage(StorageError::IndexNotFound {
2883                    name: idx_name.clone(),
2884                })
2885            })?
2886        };
2887        let table = self
2888            .active_catalog_mut()
2889            .get_mut(&table_name)
2890            .expect("table found above");
2891        table.rebuild_nsw_index(&idx_name, target)?;
2892        // v6.3.1 — ALTER INDEX REBUILD potentially with new encoding
2893        // changes cost characteristics; evict any cached plans.
2894        self.plan_cache.evict_referencing(&table_name);
2895        Ok(QueryResult::CommandOk {
2896            affected: 0,
2897            modified_catalog: !self.in_transaction(),
2898        })
2899    }
2900
2901    fn exec_create_index(
2902        &mut self,
2903        stmt: CreateIndexStatement,
2904    ) -> Result<QueryResult, EngineError> {
2905        let table = self
2906            .active_catalog_mut()
2907            .get_mut(&stmt.table)
2908            .ok_or_else(|| {
2909                EngineError::Storage(StorageError::TableNotFound {
2910                    name: stmt.table.clone(),
2911                })
2912            })?;
2913        // `IF NOT EXISTS` reduces DuplicateIndex to a no-op CommandOk.
2914        if stmt.if_not_exists && table.indices().iter().any(|i| i.name == stmt.name) {
2915            return Ok(QueryResult::CommandOk {
2916                affected: 0,
2917                modified_catalog: false,
2918            });
2919        }
2920        // v7.9.14 — multi-column index parses through; engine
2921        // builds a single-column BTree on the leading column only.
2922        // The extras live on the AST so spg-server's dispatcher
2923        // can emit a PG-wire NoticeResponse / log line. Composite
2924        // BTree keys land in v7.10.
2925        let _ = &stmt.extra_columns; // intentional drop on engine side
2926        let table_name = stmt.table.clone();
2927        // v6.8.0 — resolve INCLUDE column names to positions. Done
2928        // before `add_index` so a typo error surfaces before any
2929        // catalog mutation lands.
2930        let included_positions: Vec<usize> = if stmt.included_columns.is_empty() {
2931            Vec::new()
2932        } else {
2933            let schema = table.schema();
2934            stmt.included_columns
2935                .iter()
2936                .map(|c| {
2937                    schema.column_position(c).ok_or_else(|| {
2938                        EngineError::Storage(StorageError::ColumnNotFound {
2939                            column: c.clone(),
2940                        })
2941                    })
2942                })
2943                .collect::<Result<Vec<_>, _>>()?
2944        };
2945        match stmt.method {
2946            IndexMethod::BTree => table.add_index(stmt.name.clone(), &stmt.column)?,
2947            IndexMethod::Hnsw => {
2948                if !included_positions.is_empty() {
2949                    return Err(EngineError::Unsupported(
2950                        "INCLUDE columns are not supported on HNSW indexes".into(),
2951                    ));
2952                }
2953                table.add_nsw_index(stmt.name.clone(), &stmt.column, spg_storage::NSW_DEFAULT_M)?;
2954            }
2955            // v6.7.1 — BRIN. Pure metadata; no in-memory data.
2956            IndexMethod::Brin => {
2957                if !included_positions.is_empty() {
2958                    return Err(EngineError::Unsupported(
2959                        "INCLUDE columns are not supported on BRIN indexes".into(),
2960                    ));
2961                }
2962                table.add_brin_index(stmt.name.clone(), &stmt.column)?;
2963            }
2964        }
2965        if !included_positions.is_empty()
2966            && let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name)
2967        {
2968            idx.included_columns = included_positions;
2969        }
2970        // v6.8.1 — persist partial-index predicate. Stored as the
2971        // expression's Display form so the catalog snapshot stays
2972        // pure (storage has no spg-sql dependency). The runtime
2973        // maintenance path treats partial indexes identically to
2974        // full indexes for v6.8.1 (over-maintenance is safe; the
2975        // planner-side "use partial when query WHERE implies the
2976        // predicate" pass is STABILITY carve-out).
2977        if let Some(pred_expr) = &stmt.partial_predicate {
2978            let canonical = pred_expr.to_string();
2979            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2980                return Err(EngineError::Unsupported(
2981                    "WHERE predicates are not supported on HNSW or BRIN indexes".into(),
2982                ));
2983            }
2984            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
2985                idx.partial_predicate = Some(canonical);
2986            }
2987        }
2988        // v6.8.2 — persist expression index key. Same Display-form
2989        // storage; the runtime maintenance pass evaluates each
2990        // row's expression to derive the index key, but for v6.8.2
2991        // the engine falls through to the bare-column-reference
2992        // path and the expression is preserved for format-layer
2993        // round-trip + future planner work. Carved-out in
2994        // STABILITY § "Out of v6.8".
2995        if let Some(key_expr) = &stmt.expression {
2996            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2997                return Err(EngineError::Unsupported(
2998                    "Expression keys are not supported on HNSW or BRIN indexes".into(),
2999                ));
3000            }
3001            let canonical = key_expr.to_string();
3002            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3003                idx.expression = Some(canonical);
3004            }
3005        }
3006        // v6.3.1 — adding an index can change the optimal plan for
3007        // any cached query that references this table.
3008        self.plan_cache.evict_referencing(&table_name);
3009        Ok(QueryResult::CommandOk {
3010            affected: 0,
3011            modified_catalog: !self.in_transaction(),
3012        })
3013    }
3014
3015    fn exec_create_table(
3016        &mut self,
3017        stmt: CreateTableStatement,
3018    ) -> Result<QueryResult, EngineError> {
3019        if stmt.if_not_exists && self.active_catalog().get(&stmt.name).is_some() {
3020            return Ok(QueryResult::CommandOk {
3021                affected: 0,
3022                modified_catalog: false,
3023            });
3024        }
3025        let table_name = stmt.name.clone();
3026        // v7.9.13 — pluck the names of any columns marked
3027        // `PRIMARY KEY` inline so the post-create-table pass can
3028        // build an implicit BTree index. mailrs F1.
3029        let inline_pk_columns: Vec<String> = stmt
3030            .columns
3031            .iter()
3032            .filter(|c| c.is_primary_key)
3033            .map(|c| c.name.clone())
3034            .collect();
3035        // v7.9.19 — table-level constraints: PRIMARY KEY (a, b, ...)
3036        // and UNIQUE (a, b, ...). Each builds a BTree index on the
3037        // leading column (the existing single-column storage tier)
3038        // and registers a UniquenessConstraint on the schema for
3039        // INSERT-time enforcement of the full tuple. mailrs G1/G6.
3040        let cols = stmt
3041            .columns
3042            .into_iter()
3043            .map(column_def_to_schema)
3044            .collect::<Result<Vec<_>, _>>()?;
3045        // Composite NOT-NULL implication for PRIMARY KEY columns.
3046        let mut cols = cols;
3047        for tc in &stmt.table_constraints {
3048            if let spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } = tc {
3049                for col_name in columns {
3050                    if let Some(col) = cols.iter_mut().find(|c| c.name == *col_name) {
3051                        col.nullable = false;
3052                    }
3053                }
3054            }
3055        }
3056        // v7.6.1 — resolve every FK in the statement against the
3057        // already-known catalog. Validates: parent table exists,
3058        // parent column names exist, arity matches, parent columns
3059        // have a PK / UNIQUE index. Self-referencing FKs (parent
3060        // table == this table) resolve against the column list we
3061        // just built — they don't need the catalog yet.
3062        let mut fks: Vec<spg_storage::ForeignKeyConstraint> =
3063            Vec::with_capacity(stmt.foreign_keys.len());
3064        for fk in stmt.foreign_keys {
3065            fks.push(resolve_foreign_key(
3066                &table_name,
3067                &cols,
3068                fk,
3069                self.active_catalog(),
3070            )?);
3071        }
3072        let mut schema = TableSchema::new(table_name.clone(), cols);
3073        schema.foreign_keys = fks;
3074        // v7.9.19 — translate AST table_constraints to storage
3075        // UniquenessConstraints (column name → position) so the
3076        // INSERT enforcement helper sees positions directly.
3077        let mut uc_storage: Vec<spg_storage::UniquenessConstraint> = Vec::new();
3078        for tc in &stmt.table_constraints {
3079            let (is_pk, names) = match tc {
3080                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3081                    (true, columns.clone())
3082                }
3083                spg_sql::ast::TableConstraint::Unique { columns, .. } => {
3084                    (false, columns.clone())
3085                }
3086            };
3087            let mut positions = Vec::with_capacity(names.len());
3088            for n in &names {
3089                let pos = schema
3090                    .columns
3091                    .iter()
3092                    .position(|c| c.name == *n)
3093                    .ok_or_else(|| {
3094                        EngineError::Unsupported(alloc::format!(
3095                            "table constraint references unknown column {n:?}"
3096                        ))
3097                    })?;
3098                positions.push(pos);
3099            }
3100            uc_storage.push(spg_storage::UniquenessConstraint {
3101                is_primary_key: is_pk,
3102                columns: positions,
3103            });
3104        }
3105        schema.uniqueness_constraints = uc_storage.clone();
3106        self.active_catalog_mut().create_table(schema)?;
3107        // v7.9.13 — implicit BTree per inline PK column +
3108        // v7.9.19 — implicit BTree on the leading column of every
3109        // table-level PRIMARY KEY / UNIQUE constraint.
3110        let table = self
3111            .active_catalog_mut()
3112            .get_mut(&table_name)
3113            .expect("just created");
3114        for (i, col_name) in inline_pk_columns.iter().enumerate() {
3115            let idx_name = if inline_pk_columns.len() == 1 {
3116                alloc::format!("{table_name}_pkey")
3117            } else {
3118                alloc::format!("{table_name}_pkey_{i}")
3119            };
3120            if let Err(e) = table.add_index(idx_name, col_name) {
3121                return Err(EngineError::Storage(e));
3122            }
3123        }
3124        for (i, tc) in stmt.table_constraints.iter().enumerate() {
3125            let (is_pk, names) = match tc {
3126                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3127                    (true, columns)
3128                }
3129                spg_sql::ast::TableConstraint::Unique { columns, .. } => {
3130                    (false, columns)
3131                }
3132            };
3133            let leading = &names[0];
3134            // Skip if a same-column BTree already exists (e.g.
3135            // inline PK on the leading column).
3136            let already = table
3137                .indices()
3138                .iter()
3139                .any(|idx| {
3140                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
3141                        && table.schema().columns[idx.column_position].name == *leading
3142                });
3143            if already {
3144                continue;
3145            }
3146            let suffix = if is_pk { "pkey" } else { "key" };
3147            let idx_name = if names.len() == 1 {
3148                alloc::format!("{table_name}_{leading}_{suffix}")
3149            } else {
3150                alloc::format!("{table_name}_{leading}_{suffix}_{i}")
3151            };
3152            if let Err(e) = table.add_index(idx_name, leading) {
3153                return Err(EngineError::Storage(e));
3154            }
3155        }
3156        Ok(QueryResult::CommandOk {
3157            affected: 0,
3158            modified_catalog: !self.in_transaction(),
3159        })
3160    }
3161
3162    fn exec_insert(&mut self, stmt: InsertStatement) -> Result<QueryResult, EngineError> {
3163        // v7.9.21 — snapshot the clock fn pointer before the mut
3164        // borrow on the catalog opens; runtime DEFAULT eval needs
3165        // it inside the row hot loop.
3166        let clock = self.clock;
3167        let table = self
3168            .active_catalog_mut()
3169            .get_mut(&stmt.table)
3170            .ok_or_else(|| {
3171                EngineError::Storage(StorageError::TableNotFound {
3172                    name: stmt.table.clone(),
3173                })
3174            })?;
3175        // v3.1.5: clone the columns vector only (not the whole
3176        // TableSchema — saves one String alloc for the table name).
3177        // We need an owned snapshot because we'll call `table.insert`
3178        // (mutable borrow on `table`) inside the row loop while
3179        // reading schema fields.
3180        let column_meta: Vec<ColumnSchema> = table.schema().columns.clone();
3181        let schema_cols_len = column_meta.len();
3182        // Build a permutation `tuple_pos[c] = Some(j)` meaning schema
3183        // column `c` is filled from the `j`-th tuple slot; `None` means
3184        // "fill with NULL". Validated once and reused for every row.
3185        let tuple_pos: Option<Vec<Option<usize>>> = match &stmt.columns {
3186            None => None, // 1-1 mapping, fast path
3187            Some(cols) => {
3188                let mut map = alloc::vec![None; schema_cols_len];
3189                for (j, name) in cols.iter().enumerate() {
3190                    let idx = column_meta
3191                        .iter()
3192                        .position(|c| c.name == *name)
3193                        .ok_or_else(|| {
3194                            EngineError::Eval(EvalError::ColumnNotFound { name: name.clone() })
3195                        })?;
3196                    if map[idx].is_some() {
3197                        return Err(EngineError::Storage(StorageError::ArityMismatch {
3198                            expected: schema_cols_len,
3199                            actual: cols.len(),
3200                        }));
3201                    }
3202                    map[idx] = Some(j);
3203                }
3204                // Omitted columns must either be nullable, carry a
3205                // DEFAULT, or be AUTO_INCREMENT. Catch NOT NULL
3206                // omissions up front so the WAL stays clean.
3207                for (i, col) in column_meta.iter().enumerate() {
3208                    if map[i].is_none()
3209                        && !col.nullable
3210                        && col.default.is_none()
3211                        && col.runtime_default.is_none()
3212                        && !col.auto_increment
3213                    {
3214                        return Err(EngineError::Storage(StorageError::NullInNotNull {
3215                            column: col.name.clone(),
3216                        }));
3217                    }
3218                }
3219                Some(map)
3220            }
3221        };
3222        let expected_tuple_len = stmt.columns.as_ref().map_or(schema_cols_len, Vec::len);
3223        // v7.6.2 — snapshot this table's FK list before the
3224        // mutable-borrow window so we can run parent lookups
3225        // against the immutable catalog after parsing. Empty vec is
3226        // the no-FK fast path; clone cost is O(fks * arity) which
3227        // is < 100 ns for typical schemas.
3228        let fks = table.schema().foreign_keys.clone();
3229        let mut affected = 0usize;
3230        // Stage 1 — parse + AUTO_INC + coerce all rows under the
3231        // single mutable borrow.
3232        let mut all_values: Vec<Vec<Value>> = Vec::with_capacity(stmt.rows.len());
3233        for tuple in stmt.rows {
3234            if tuple.len() != expected_tuple_len {
3235                return Err(EngineError::Storage(StorageError::ArityMismatch {
3236                    expected: expected_tuple_len,
3237                    actual: tuple.len(),
3238                }));
3239            }
3240            // Fast path: no column-list permutation → tuple slot j
3241            // maps to schema column j. We can zip schema with tuple
3242            // and skip the `raw_tuple` staging allocation entirely.
3243            let values: Vec<Value> = if let Some(map) = &tuple_pos {
3244                // Permuted path: still need raw_tuple to index by `map[i]`.
3245                let raw_tuple: Vec<Value> = tuple
3246                    .into_iter()
3247                    .map(literal_expr_to_value)
3248                    .collect::<Result<_, _>>()?;
3249                let mut out = Vec::with_capacity(schema_cols_len);
3250                for (i, col) in column_meta.iter().enumerate() {
3251                    let mut raw = match map[i] {
3252                        Some(j) => raw_tuple[j].clone(),
3253                        None => resolve_column_default_free(col, clock)?,
3254                    };
3255                    if col.auto_increment && raw.is_null() {
3256                        let next = table.next_auto_value(i).ok_or_else(|| {
3257                            EngineError::Unsupported(alloc::format!(
3258                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3259                                col.name
3260                            ))
3261                        })?;
3262                        raw = Value::BigInt(next);
3263                    }
3264                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3265                }
3266                out
3267            } else {
3268                // 1-1 mapping fast path: single Vec alloc, no raw_tuple.
3269                let mut out = Vec::with_capacity(schema_cols_len);
3270                for (i, (col, expr)) in column_meta.iter().zip(tuple).enumerate() {
3271                    let mut raw = literal_expr_to_value(expr)?;
3272                    if col.auto_increment && raw.is_null() {
3273                        let next = table.next_auto_value(i).ok_or_else(|| {
3274                            EngineError::Unsupported(alloc::format!(
3275                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3276                                col.name
3277                            ))
3278                        })?;
3279                        raw = Value::BigInt(next);
3280                    }
3281                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3282                }
3283                out
3284            };
3285            all_values.push(values);
3286        }
3287        // Stage 2 — FK enforcement on the immutable catalog.
3288        // Non-lexical lifetimes release the mutable borrow on
3289        // `table` here since stage 1 was the last use. The
3290        // parent-table lookup runs before any row is committed.
3291        let uniqueness = table.schema().uniqueness_constraints.clone();
3292        let _ = table;
3293        if !fks.is_empty() {
3294            enforce_fk_inserts(self.active_catalog(), &stmt.table, &fks, &all_values)?;
3295        }
3296        // v7.9.19 — composite UNIQUE / PRIMARY KEY enforcement.
3297        enforce_uniqueness_inserts(
3298            self.active_catalog(),
3299            &stmt.table,
3300            &uniqueness,
3301            &all_values,
3302        )?;
3303        // v7.9.8 / v7.9.9 — ON CONFLICT handling.
3304        //   - `DO NOTHING` filters `all_values` to non-conflicting
3305        //     rows + drops within-batch duplicates.
3306        //   - `DO UPDATE SET …` ALSO filters, but for each
3307        //     conflicting row it queues an UPDATE on the existing
3308        //     row using the incoming row's values as `EXCLUDED.*`.
3309        let mut pending_updates: Vec<(usize, Vec<Value>)> = Vec::new();
3310        let mut skipped_count = 0usize;
3311        if let Some(clause) = &stmt.on_conflict {
3312            let conflict_cols = resolve_on_conflict_columns(
3313                self.active_catalog(),
3314                &stmt.table,
3315                clause.target_columns.as_slice(),
3316            )?;
3317            let mut kept: Vec<Vec<Value>> = Vec::with_capacity(all_values.len());
3318            let mut seen_keys: Vec<Vec<Value>> = Vec::new();
3319            for values in all_values {
3320                let key_tuple: Vec<&Value> =
3321                    conflict_cols.iter().map(|&c| &values[c]).collect();
3322                // SQL spec: NULL in any conflict column means "no
3323                // conflict possible" (NULL ≠ NULL for uniqueness).
3324                let has_null_key = key_tuple.iter().any(|v| matches!(v, Value::Null));
3325                let collides_with_table = !has_null_key
3326                    && on_conflict_keys_exist(
3327                        self.active_catalog(),
3328                        &stmt.table,
3329                        &conflict_cols,
3330                        &key_tuple,
3331                    );
3332                let key_tuple_owned: Vec<Value> =
3333                    key_tuple.iter().map(|v| (*v).clone()).collect();
3334                let collides_with_batch = !has_null_key
3335                    && seen_keys.iter().any(|k| k == &key_tuple_owned);
3336                let collides = collides_with_table || collides_with_batch;
3337                match (&clause.action, collides) {
3338                    (_, false) => {
3339                        seen_keys.push(key_tuple_owned);
3340                        kept.push(values);
3341                    }
3342                    (spg_sql::ast::OnConflictAction::Nothing, true) => {
3343                        skipped_count += 1;
3344                    }
3345                    (
3346                        spg_sql::ast::OnConflictAction::Update {
3347                            assignments,
3348                            where_,
3349                        },
3350                        true,
3351                    ) => {
3352                        if !collides_with_table {
3353                            skipped_count += 1;
3354                            continue;
3355                        }
3356                        let target_pos = lookup_row_position_by_keys(
3357                            self.active_catalog(),
3358                            &stmt.table,
3359                            &conflict_cols,
3360                            &key_tuple,
3361                        )
3362                        .ok_or_else(|| {
3363                            EngineError::Unsupported(
3364                                "ON CONFLICT DO UPDATE: conflict detected but row \
3365                                 position could not be resolved (cold-tier row?)"
3366                                    .into(),
3367                            )
3368                        })?;
3369                        let updated = apply_on_conflict_assignments(
3370                            self.active_catalog(),
3371                            &stmt.table,
3372                            target_pos,
3373                            &values,
3374                            assignments,
3375                            where_.as_ref(),
3376                        )?;
3377                        if let Some(new_row) = updated {
3378                            pending_updates.push((target_pos, new_row));
3379                        } else {
3380                            skipped_count += 1;
3381                        }
3382                    }
3383                }
3384            }
3385            all_values = kept;
3386        }
3387        // Stage 3 — insert all rows under a fresh mutable borrow.
3388        let table = self
3389            .active_catalog_mut()
3390            .get_mut(&stmt.table)
3391            .ok_or_else(|| {
3392                EngineError::Storage(StorageError::TableNotFound {
3393                    name: stmt.table.clone(),
3394                })
3395            })?;
3396        // v7.9.4 — keep RETURNING projection rows separate per
3397        // INSERT and per UPDATE branch so DO UPDATE pushes the new
3398        // post-update state, not the incoming-only values.
3399        let mut returning_rows: Vec<Vec<Value>> = Vec::new();
3400        for values in all_values {
3401            if stmt.returning.is_some() {
3402                returning_rows.push(values.clone());
3403            }
3404            table.insert(Row::new(values))?;
3405            affected += 1;
3406        }
3407        // v7.9.9 — apply ON CONFLICT DO UPDATE rewrites collected
3408        // in the conflict-resolution pass. update_row handles
3409        // index maintenance + body re-encoding.
3410        for (pos, new_row) in pending_updates {
3411            if stmt.returning.is_some() {
3412                returning_rows.push(new_row.clone());
3413            }
3414            table.update_row(pos, new_row)?;
3415            affected += 1;
3416        }
3417        let _ = skipped_count;
3418        // v7.9.4/v7.9.9 — RETURNING streams the rows that ended
3419        // up in the table after this statement (insert or
3420        // post-update on conflict).
3421        if let Some(items) = &stmt.returning {
3422            let _ = table;
3423            return self.build_returning_rows(
3424                &stmt.table,
3425                items,
3426                returning_rows,
3427            );
3428        }
3429        // v6.2.1 — auto-analyze: track per-table modified-row
3430        // counter so the background sweep can decide when to
3431        // re-ANALYZE. Cheap path on the autocommit-wrap hot loop
3432        // — one BTreeMap entry update per INSERT batch.
3433        if !self.in_transaction() && affected > 0 {
3434            self.statistics
3435                .record_modifications(&stmt.table, affected as u64);
3436        }
3437        Ok(QueryResult::CommandOk {
3438            affected,
3439            modified_catalog: !self.in_transaction(),
3440        })
3441    }
3442
3443    /// v4.5: SELECT with cooperative cancellation. The token is
3444    /// honoured between UNION peers and inside the bare-SELECT row
3445    /// loop; HNSW kNN graph walks and the aggregate executor don't
3446    /// honour it yet (deferred — those paths bound their work
3447    /// internally by `LIMIT k` and `GROUP BY` cardinality).
3448    /// v6.10.2 — cold-tier time-travel scan. Resolves the segment
3449    /// by id, decodes each row body against the table's current
3450    /// schema, applies the SELECT's projection + optional WHERE +
3451    /// optional LIMIT, returns a `Rows` result. JOINs / aggregates
3452    /// / ORDER BY are unsupported on this path (STABILITY carve-
3453    /// out); operators wanting them should restore the segment
3454    /// into a regular table first.
3455    fn exec_select_as_of_segment(
3456        &self,
3457        stmt: &SelectStatement,
3458        from: &spg_sql::ast::FromClause,
3459        segment_id: u32,
3460    ) -> Result<QueryResult, EngineError> {
3461        // v6.10.2 scope: no joins, no aggregates, no ORDER BY,
3462        // no GROUP BY / HAVING / UNION / OFFSET / DISTINCT.
3463        if !from.joins.is_empty()
3464            || stmt.group_by.is_some()
3465            || stmt.having.is_some()
3466            || !stmt.unions.is_empty()
3467            || !stmt.order_by.is_empty()
3468            || stmt.offset.is_some()
3469            || stmt.distinct
3470            || aggregate::uses_aggregate(stmt)
3471        {
3472            return Err(EngineError::Unsupported(
3473                "AS OF SEGMENT supports SELECT projection + WHERE + LIMIT only \
3474                 (joins / aggregates / ORDER BY are STABILITY § \"Out of v6.10\")"
3475                    .into(),
3476            ));
3477        }
3478        let table = self
3479            .active_catalog()
3480            .get(&from.primary.name)
3481            .ok_or_else(|| StorageError::TableNotFound {
3482                name: from.primary.name.clone(),
3483            })?;
3484        let schema = table.schema().clone();
3485        let schema_cols = &schema.columns;
3486        let alias = from
3487            .primary
3488            .alias
3489            .as_deref()
3490            .unwrap_or(from.primary.name.as_str());
3491        let ctx = EvalContext::new(schema_cols, Some(alias));
3492        let seg = self
3493            .active_catalog()
3494            .cold_segment(segment_id)
3495            .ok_or_else(|| {
3496                EngineError::Unsupported(alloc::format!(
3497                    "AS OF SEGMENT: cold segment {segment_id} not registered"
3498                ))
3499            })?;
3500        let mut out_rows: Vec<Row> = Vec::new();
3501        let mut limit_remaining: Option<usize> =
3502            stmt.limit_literal().and_then(|n| usize::try_from(n).ok());
3503        for (_key, body) in seg.scan() {
3504            let (row, _consumed) = spg_storage::decode_row_body_dense(&body, &schema)
3505                .map_err(EngineError::Storage)?;
3506            if let Some(where_expr) = &stmt.where_ {
3507                let cond = self.eval_expr_simple(where_expr, &row, &ctx)?;
3508                if !matches!(cond, Value::Bool(true)) {
3509                    continue;
3510                }
3511            }
3512            // Projection.
3513            let projected = self.project_row_simple(&row, &stmt.items, schema_cols, alias)?;
3514            out_rows.push(projected);
3515            if let Some(rem) = limit_remaining.as_mut() {
3516                if *rem == 0 {
3517                    out_rows.pop();
3518                    break;
3519                }
3520                *rem -= 1;
3521            }
3522        }
3523        // Output column schema: derive from SELECT items.
3524        let columns = self.derive_output_columns(&stmt.items, schema_cols, alias);
3525        Ok(QueryResult::Rows {
3526            columns,
3527            rows: out_rows,
3528        })
3529    }
3530
3531    /// v6.10.2 — simple-path WHERE eval that doesn't go through
3532    /// the correlated-subquery / Memoize machinery. AS OF SEGMENT
3533    /// scan paths predicate against a snapshot frozen segment, no
3534    /// cross-row state.
3535    fn eval_expr_simple(
3536        &self,
3537        expr: &Expr,
3538        row: &Row,
3539        ctx: &EvalContext,
3540    ) -> Result<Value, EngineError> {
3541        let cancel = CancelToken::none();
3542        self.eval_expr_with_correlated(expr, row, ctx, cancel, None)
3543    }
3544
3545    /// v7.9.4 — INSERT / UPDATE / DELETE RETURNING projector.
3546    /// Given the table name, the user-supplied projection items,
3547    /// and the mutated rows (post-insert / post-update values, or
3548    /// pre-delete snapshot), build a `QueryResult::Rows` whose
3549    /// schema describes the projected columns. Mailrs migration
3550    /// blocker #1.
3551    fn build_returning_rows(
3552        &self,
3553        table_name: &str,
3554        items: &[SelectItem],
3555        mutated_rows: Vec<Vec<Value>>,
3556    ) -> Result<QueryResult, EngineError> {
3557        let table = self.active_catalog().get(table_name).ok_or_else(|| {
3558            EngineError::Storage(StorageError::TableNotFound {
3559                name: table_name.into(),
3560            })
3561        })?;
3562        let schema_cols = table.schema().columns.clone();
3563        let columns = self.derive_output_columns(items, &schema_cols, table_name);
3564        let mut out_rows: Vec<Row> = Vec::with_capacity(mutated_rows.len());
3565        for values in mutated_rows {
3566            let row = Row::new(values);
3567            let projected = self.project_row_simple(&row, items, &schema_cols, table_name)?;
3568            out_rows.push(projected);
3569        }
3570        Ok(QueryResult::Rows {
3571            columns,
3572            rows: out_rows,
3573        })
3574    }
3575
3576    /// v6.10.2 — projection for AS OF SEGMENT. Resolves
3577    /// `SelectItem::Wildcard` to all schema columns and
3578    /// `SelectItem::Expr` via the regular eval path.
3579    fn project_row_simple(
3580        &self,
3581        row: &Row,
3582        items: &[SelectItem],
3583        schema_cols: &[ColumnSchema],
3584        alias: &str,
3585    ) -> Result<Row, EngineError> {
3586        let ctx = EvalContext::new(schema_cols, Some(alias));
3587        let cancel = CancelToken::none();
3588        let mut out_vals = Vec::new();
3589        for item in items {
3590            match item {
3591                SelectItem::Wildcard => {
3592                    out_vals.extend(row.values.iter().cloned());
3593                }
3594                SelectItem::Expr { expr, .. } => {
3595                    let v = self.eval_expr_with_correlated(expr, row, &ctx, cancel, None)?;
3596                    out_vals.push(v);
3597                }
3598            }
3599        }
3600        Ok(Row::new(out_vals))
3601    }
3602
3603    /// v6.10.2 — derive the output `ColumnSchema` list for an
3604    /// AS OF SEGMENT projection. Wildcards take the full schema;
3605    /// expressions take the alias if present or a synthetic
3606    /// `?column?` (PG convention) otherwise.
3607    fn derive_output_columns(
3608        &self,
3609        items: &[SelectItem],
3610        schema_cols: &[ColumnSchema],
3611        _alias: &str,
3612    ) -> Vec<ColumnSchema> {
3613        let mut out = Vec::new();
3614        for item in items {
3615            match item {
3616                SelectItem::Wildcard => {
3617                    out.extend(schema_cols.iter().cloned());
3618                }
3619                SelectItem::Expr { alias, .. } => {
3620                    let name = alias
3621                        .clone()
3622                        .unwrap_or_else(|| "?column?".to_string());
3623                    // Default to Text; the caller's row values
3624                    // carry the actual type. v6.10.2 scope.
3625                    out.push(ColumnSchema::new(name, DataType::Text, true));
3626                }
3627            }
3628        }
3629        out
3630    }
3631
3632    fn exec_select_cancel(
3633        &self,
3634        stmt: &SelectStatement,
3635        cancel: CancelToken<'_>,
3636    ) -> Result<QueryResult, EngineError> {
3637        cancel.check()?;
3638        // v6.10.2 — cold-tier time-travel short-circuit. When the
3639        // primary TableRef carries `AS OF SEGMENT '<id>'`, run a
3640        // dedicated cold-segment scan instead of the regular
3641        // hot+index path. The scope is intentionally narrow for
3642        // v6.10.2 — bare `SELECT * FROM <t> AS OF SEGMENT 'id'`,
3643        // optionally with a single-column-equality WHERE. JOINs /
3644        // aggregates / ORDER BY / subqueries on top of a time-
3645        // travelled scan are STABILITY § "Out of v6.10".
3646        if let Some(from) = &stmt.from
3647            && let Some(seg_id) = from.primary.as_of_segment
3648        {
3649            return self.exec_select_as_of_segment(stmt, from, seg_id);
3650        }
3651        // v6.2.0 / v6.5.0 — virtual-table short-circuits. Detected
3652        // pre-CTE because they don't read from the catalog and
3653        // shouldn't participate in regular FROM resolution.
3654        if let Some(from) = &stmt.from
3655            && from.joins.is_empty()
3656            && stmt.where_.is_none()
3657            && stmt.group_by.is_none()
3658            && stmt.having.is_none()
3659            && stmt.unions.is_empty()
3660            && stmt.order_by.is_empty()
3661            && stmt.limit.is_none()
3662            && stmt.offset.is_none()
3663            && !stmt.distinct
3664            && stmt.items.iter().all(|i| matches!(i, SelectItem::Wildcard))
3665        {
3666            let lower = from.primary.name.to_ascii_lowercase();
3667            match lower.as_str() {
3668                "spg_statistic" => return Ok(self.exec_spg_statistic()),
3669                // v6.5.0 — observability v2 virtual tables.
3670                "spg_stat_replication" => return Ok(self.exec_spg_stat_replication()),
3671                "spg_stat_segment" => return Ok(self.exec_spg_stat_segment()),
3672                "spg_stat_query" => return Ok(self.exec_spg_stat_query()),
3673                "spg_stat_activity" => return Ok(self.exec_spg_stat_activity()),
3674                "spg_audit_chain" => return Ok(self.exec_spg_audit_chain()),
3675                "spg_audit_verify" => return Ok(self.exec_spg_audit_verify()),
3676                "spg_table_ddl" => return Ok(self.exec_spg_table_ddl()),
3677                "spg_role_ddl" => return Ok(self.exec_spg_role_ddl()),
3678                "spg_database_ddl" => return Ok(self.exec_spg_database_ddl()),
3679                _ => {}
3680            }
3681        }
3682        // v4.11: CTEs materialise into a temporary enriched catalog
3683        // *before* anything else — the body SELECT can then refer
3684        // to CTE names via the regular FROM-clause resolution.
3685        // Uncorrelated only: each CTE body runs once against the
3686        // current catalog, not against later CTEs' results (left-
3687        // to-right materialisation would relax this, but we keep
3688        // it simple for v4.11 MVP).
3689        if !stmt.ctes.is_empty() {
3690            return self.exec_with_ctes(stmt, cancel);
3691        }
3692        // v4.10: subqueries (uncorrelated) are resolved here, before
3693        // the executor sees the row loop. We clone the statement so
3694        // we can mutate without disturbing the caller's AST — most
3695        // queries pass through with no subquery nodes and the clone
3696        // is cheap; with subqueries the materialisation cost
3697        // dominates anyway.
3698        let mut stmt_owned;
3699        let stmt_ref: &SelectStatement = if expr_tree_has_subquery(stmt) {
3700            stmt_owned = stmt.clone();
3701            self.resolve_select_subqueries(&mut stmt_owned, cancel)?;
3702            &stmt_owned
3703        } else {
3704            stmt
3705        };
3706        if stmt_ref.unions.is_empty() {
3707            return self.exec_bare_select_cancel(stmt_ref, cancel);
3708        }
3709        // UNION path: clone-strip the head into a bare block (its own
3710        // DISTINCT and any inner ORDER BY are dropped by parser rule —
3711        // the wrapper SelectStatement carries them), execute, then chain
3712        // peers with left-associative dedup semantics.
3713        let mut head = stmt_ref.clone();
3714        head.unions = Vec::new();
3715        head.order_by = Vec::new();
3716        head.limit = None;
3717        let QueryResult::Rows { columns, mut rows } =
3718            self.exec_bare_select_cancel(&head, cancel)?
3719        else {
3720            unreachable!("bare SELECT cannot return CommandOk")
3721        };
3722        for (kind, peer) in &stmt_ref.unions {
3723            let QueryResult::Rows {
3724                columns: peer_cols,
3725                rows: peer_rows,
3726            } = self.exec_bare_select_cancel(peer, cancel)?
3727            else {
3728                unreachable!("bare SELECT cannot return CommandOk")
3729            };
3730            if peer_cols.len() != columns.len() {
3731                return Err(EngineError::Unsupported(alloc::format!(
3732                    "UNION arity mismatch: head has {} columns, peer has {}",
3733                    columns.len(),
3734                    peer_cols.len()
3735                )));
3736            }
3737            rows.extend(peer_rows);
3738            if matches!(kind, UnionKind::Distinct) {
3739                rows = dedup_rows(rows);
3740            }
3741        }
3742        // ORDER BY at the top of a UNION applies to the combined result.
3743        // Eval against the projected schema (NOT the source table).
3744        if !stmt.order_by.is_empty() {
3745            let synth_ctx = EvalContext::new(&columns, None);
3746            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3747            let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(rows.len());
3748            for r in rows {
3749                let keys = build_order_keys(&stmt.order_by, &r, &synth_ctx)?;
3750                tagged.push((keys, r));
3751            }
3752            sort_by_keys(&mut tagged, &descs);
3753            rows = tagged.into_iter().map(|(_, r)| r).collect();
3754        }
3755        apply_offset_and_limit(&mut rows, stmt.offset_literal(), stmt.limit_literal());
3756        Ok(QueryResult::Rows { columns, rows })
3757    }
3758
3759    #[allow(clippy::too_many_lines)]
3760    #[allow(clippy::too_many_lines)] // huge match — splitting fragments the planner
3761    fn exec_bare_select_cancel(
3762        &self,
3763        stmt: &SelectStatement,
3764        cancel: CancelToken<'_>,
3765    ) -> Result<QueryResult, EngineError> {
3766        // v4.12: window-function path. When the projection contains
3767        // any `name(args) OVER (...)` we route to the dedicated
3768        // executor — partition + sort + per-row window value before
3769        // the regular projection.
3770        if select_has_window(stmt) {
3771            return self.exec_select_with_window(stmt, cancel);
3772        }
3773        // Constant SELECT (no FROM) — evaluate each item once against an
3774        // empty dummy row. Useful for `SELECT 1`, `SELECT coalesce(...)`,
3775        // `SELECT '7'::INT`. Column references will surface as
3776        // ColumnNotFound on eval since the schema is empty.
3777        let Some(from) = &stmt.from else {
3778            let empty_schema: Vec<ColumnSchema> = Vec::new();
3779            let ctx = EvalContext::new(&empty_schema, None);
3780            let projection = build_projection(&stmt.items, &empty_schema, "")?;
3781            let dummy_row = Row::new(Vec::new());
3782            let mut values = Vec::with_capacity(projection.len());
3783            for p in &projection {
3784                values.push(eval::eval_expr(&p.expr, &dummy_row, &ctx)?);
3785            }
3786            let columns: Vec<ColumnSchema> = projection
3787                .into_iter()
3788                .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3789                .collect();
3790            return Ok(QueryResult::Rows {
3791                columns,
3792                rows: alloc::vec![Row::new(values)],
3793            });
3794        };
3795        // Multi-table FROM (one or more joined peers) goes through the
3796        // nested-loop join executor. Single-table FROM stays on the
3797        // existing scan + index-seek path.
3798        if !from.joins.is_empty() {
3799            return self.exec_joined_select(stmt, from);
3800        }
3801        let primary = &from.primary;
3802        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
3803            StorageError::TableNotFound {
3804                name: primary.name.clone(),
3805            }
3806        })?;
3807        let schema_cols = &table.schema().columns;
3808        // The qualifier accepted on column refs is the alias (if any) else the
3809        // bare table name.
3810        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
3811        let ctx = EvalContext::new(schema_cols, Some(alias));
3812
3813        // NSW kNN planner: `ORDER BY col <-> literal LIMIT k` with no
3814        // WHERE and an NSW index on `col` skips the full scan. The
3815        // walk returns rows already in ascending-distance order, so
3816        // ORDER BY / LIMIT are honoured implicitly.
3817        if let Some(nsw_rows) = try_nsw_knn(stmt, table, schema_cols, alias) {
3818            return materialise_in_order(stmt, table, schema_cols, alias, &nsw_rows);
3819        }
3820
3821        // Index seek: if WHERE is `col = literal` (or commuted) and the
3822        // referenced column has an index, dispatch each locator through
3823        // the catalog (hot tier → borrow, cold tier → page-read +
3824        // decode) and iterate just those rows. Otherwise fall back to a
3825        // full scan over the hot tier (cold-tier rows are only reached
3826        // via index seek in v5.1 — full table scans against cold-tier
3827        // data ship in v5.2 with the freezer's per-segment scan API).
3828        let indexed_rows: Option<Vec<Cow<'_, Row>>> = stmt
3829            .where_
3830            .as_ref()
3831            .and_then(|w| try_index_seek(w, schema_cols, self.active_catalog(), table, alias));
3832
3833        // Aggregate path: filter rows first, then hand off to the
3834        // aggregate executor which does its own projection + ORDER BY.
3835        if aggregate::uses_aggregate(stmt) {
3836            let mut filtered: Vec<&Row> = Vec::new();
3837            // v6.2.6 — Memoize: per-query LRU cache for correlated
3838            // scalar subqueries. Fresh per row-loop entry so each
3839            // SELECT execution gets an isolated cache.
3840            let mut memo = memoize::MemoizeCache::new();
3841            if let Some(rows) = &indexed_rows {
3842                for cow in rows {
3843                    let row = cow.as_ref();
3844                    if let Some(where_expr) = &stmt.where_ {
3845                        let cond = self.eval_expr_with_correlated(
3846                            where_expr,
3847                            row,
3848                            &ctx,
3849                            cancel,
3850                            Some(&mut memo),
3851                        )?;
3852                        if !matches!(cond, Value::Bool(true)) {
3853                            continue;
3854                        }
3855                    }
3856                    filtered.push(row);
3857                }
3858            } else {
3859                for i in 0..table.row_count() {
3860                    let row = &table.rows()[i];
3861                    if let Some(where_expr) = &stmt.where_ {
3862                        let cond = self.eval_expr_with_correlated(
3863                            where_expr,
3864                            row,
3865                            &ctx,
3866                            cancel,
3867                            Some(&mut memo),
3868                        )?;
3869                        if !matches!(cond, Value::Bool(true)) {
3870                            continue;
3871                        }
3872                    }
3873                    filtered.push(row);
3874                }
3875            }
3876            let mut agg = aggregate::run(stmt, &filtered, schema_cols, Some(alias))?;
3877            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
3878            return Ok(QueryResult::Rows {
3879                columns: agg.columns,
3880                rows: agg.rows,
3881            });
3882        }
3883
3884        let projection = build_projection(&stmt.items, schema_cols, alias)?;
3885
3886        // Materialise the filter pass into `(order_key, projected_row)`
3887        // tuples. The order key is `None` when there's no ORDER BY clause.
3888        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
3889        // v6.2.6 — Memoize per-row WHERE eval shares one cache.
3890        let mut memo = memoize::MemoizeCache::new();
3891        // Inline the per-row work in a closure so the indexed and full-
3892        // scan branches share the body.
3893        let mut process_row = |row: &Row, loop_idx: usize| -> Result<(), EngineError> {
3894            if loop_idx.is_multiple_of(256) {
3895                cancel.check()?;
3896            }
3897            if let Some(where_expr) = &stmt.where_ {
3898                let cond = self.eval_expr_with_correlated(
3899                    where_expr,
3900                    row,
3901                    &ctx,
3902                    cancel,
3903                    Some(&mut memo),
3904                )?;
3905                if !matches!(cond, Value::Bool(true)) {
3906                    return Ok(());
3907                }
3908            }
3909            let mut values = Vec::with_capacity(projection.len());
3910            for p in &projection {
3911                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
3912            }
3913            let order_keys = if stmt.order_by.is_empty() {
3914                Vec::new()
3915            } else {
3916                build_order_keys(&stmt.order_by, row, &ctx)?
3917            };
3918            tagged.push((order_keys, Row::new(values)));
3919            Ok(())
3920        };
3921        if let Some(rows) = &indexed_rows {
3922            for (loop_idx, cow) in rows.iter().enumerate() {
3923                process_row(cow.as_ref(), loop_idx)?;
3924            }
3925        } else {
3926            for i in 0..table.row_count() {
3927                process_row(&table.rows()[i], i)?;
3928            }
3929        }
3930
3931        if !stmt.order_by.is_empty() {
3932            // Partial-sort fast path: when LIMIT is small relative to
3933            // the row count, select_nth_unstable + sort just the
3934            // prefix is O(n + k log k) instead of O(n log n). DISTINCT
3935            // requires the full sort because de-dup happens after.
3936            let keep = if stmt.distinct {
3937                None
3938            } else {
3939                stmt.limit_literal()
3940                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
3941            };
3942            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3943            partial_sort_tagged(&mut tagged, keep, &descs);
3944        }
3945
3946        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
3947        if stmt.distinct {
3948            output_rows = dedup_rows(output_rows);
3949        }
3950        apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
3951
3952        let columns: Vec<ColumnSchema> = projection
3953            .into_iter()
3954            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3955            .collect();
3956
3957        Ok(QueryResult::Rows {
3958            columns,
3959            rows: output_rows,
3960        })
3961    }
3962
3963    /// Multi-table SELECT executor (one or more JOIN peers).
3964    ///
3965    /// v1.10 builds the joined row set up-front via nested-loop joins,
3966    /// then runs WHERE + projection + ORDER BY against the combined
3967    /// rows. No index seek. Aggregates and DISTINCT still work because
3968    /// the executor delegates projection through the same shared paths.
3969    #[allow(clippy::too_many_lines)]
3970    fn exec_joined_select(
3971        &self,
3972        stmt: &SelectStatement,
3973        from: &FromClause,
3974    ) -> Result<QueryResult, EngineError> {
3975        // Resolve every table reference up front so we surface
3976        // TableNotFound before we start the cartesian work.
3977        let primary_table = self
3978            .active_catalog()
3979            .get(&from.primary.name)
3980            .ok_or_else(|| StorageError::TableNotFound {
3981                name: from.primary.name.clone(),
3982            })?;
3983        let primary_alias = from
3984            .primary
3985            .alias
3986            .as_deref()
3987            .unwrap_or(from.primary.name.as_str())
3988            .to_string();
3989        let mut joined_tables: Vec<(&Table, String, JoinKind, Option<&Expr>)> = Vec::new();
3990        for j in &from.joins {
3991            let t = self.active_catalog().get(&j.table.name).ok_or_else(|| {
3992                StorageError::TableNotFound {
3993                    name: j.table.name.clone(),
3994                }
3995            })?;
3996            let a = j
3997                .table
3998                .alias
3999                .as_deref()
4000                .unwrap_or(j.table.name.as_str())
4001                .to_string();
4002            joined_tables.push((t, a, j.kind, j.on.as_ref()));
4003        }
4004
4005        // Build the combined schema: composite "alias.col" names so the
4006        // qualified-column resolver can find anything by exact match.
4007        let mut combined_schema: Vec<ColumnSchema> = Vec::new();
4008        for col in &primary_table.schema().columns {
4009            combined_schema.push(ColumnSchema::new(
4010                alloc::format!("{primary_alias}.{}", col.name),
4011                col.ty,
4012                col.nullable,
4013            ));
4014        }
4015        for (t, a, _, _) in &joined_tables {
4016            for col in &t.schema().columns {
4017                combined_schema.push(ColumnSchema::new(
4018                    alloc::format!("{a}.{}", col.name),
4019                    col.ty,
4020                    col.nullable,
4021                ));
4022            }
4023        }
4024        let ctx = EvalContext::new(&combined_schema, None);
4025
4026        // Nested-loop join. Starting set: every primary row, padded with
4027        // (no joined columns yet).
4028        let mut working: Vec<Row> = primary_table.rows().iter().cloned().collect();
4029        let mut produced_len = primary_table.schema().columns.len();
4030        for (t, _, kind, on) in &joined_tables {
4031            let right_arity = t.schema().columns.len();
4032            let mut next: Vec<Row> = Vec::new();
4033            for left in &working {
4034                let mut left_matched = false;
4035                for right in t.rows() {
4036                    let mut combined_vals = left.values.clone();
4037                    combined_vals.extend(right.values.iter().cloned());
4038                    // Pad combined to the eventual full width so the
4039                    // partial schema still matches positions used by ON.
4040                    let combined = Row::new(combined_vals);
4041                    let keep = if let Some(on_expr) = on {
4042                        let cond = eval::eval_expr(on_expr, &combined, &ctx)?;
4043                        matches!(cond, Value::Bool(true))
4044                    } else {
4045                        // CROSS / comma-list: every pair survives.
4046                        true
4047                    };
4048                    if keep {
4049                        next.push(combined);
4050                        left_matched = true;
4051                    }
4052                }
4053                if !left_matched && matches!(kind, JoinKind::Left) {
4054                    // LEFT OUTER JOIN: emit the left row with NULLs on
4055                    // the right side when no peer matched.
4056                    let mut combined_vals = left.values.clone();
4057                    for _ in 0..right_arity {
4058                        combined_vals.push(Value::Null);
4059                    }
4060                    next.push(Row::new(combined_vals));
4061                }
4062            }
4063            working = next;
4064            produced_len += right_arity;
4065            debug_assert!(produced_len <= combined_schema.len());
4066        }
4067
4068        // WHERE filter against combined rows.
4069        let mut filtered: Vec<Row> = Vec::new();
4070        for row in working {
4071            if let Some(where_expr) = &stmt.where_ {
4072                let cond = eval::eval_expr(where_expr, &row, &ctx)?;
4073                if !matches!(cond, Value::Bool(true)) {
4074                    continue;
4075                }
4076            }
4077            filtered.push(row);
4078        }
4079
4080        // Aggregate path: handle GROUP BY / aggregate calls over the
4081        // joined+filtered rows.
4082        if aggregate::uses_aggregate(stmt) {
4083            let refs: Vec<&Row> = filtered.iter().collect();
4084            let mut agg = aggregate::run(stmt, &refs, &combined_schema, None)?;
4085            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
4086            return Ok(QueryResult::Rows {
4087                columns: agg.columns,
4088                rows: agg.rows,
4089            });
4090        }
4091
4092        let projection = build_projection(&stmt.items, &combined_schema, "")?;
4093        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
4094        for row in &filtered {
4095            let mut values = Vec::with_capacity(projection.len());
4096            for p in &projection {
4097                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4098            }
4099            let order_keys = if stmt.order_by.is_empty() {
4100                Vec::new()
4101            } else {
4102                build_order_keys(&stmt.order_by, row, &ctx)?
4103            };
4104            tagged.push((order_keys, Row::new(values)));
4105        }
4106        if !stmt.order_by.is_empty() {
4107            let keep = if stmt.distinct {
4108                None
4109            } else {
4110                stmt.limit_literal()
4111                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
4112            };
4113            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4114            partial_sort_tagged(&mut tagged, keep, &descs);
4115        }
4116        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4117        if stmt.distinct {
4118            output_rows = dedup_rows(output_rows);
4119        }
4120        apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4121        let columns: Vec<ColumnSchema> = projection
4122            .into_iter()
4123            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4124            .collect();
4125        Ok(QueryResult::Rows {
4126            columns,
4127            rows: output_rows,
4128        })
4129    }
4130}
4131
4132/// One row-producing projection: an expression to evaluate, the resulting
4133/// column's user-visible name, its inferred type, and nullability.
4134#[derive(Debug, Clone)]
4135struct ProjectedItem {
4136    expr: Expr,
4137    output_name: String,
4138    ty: DataType,
4139    nullable: bool,
4140}
4141
4142/// Dedupe a row set, preserving first-seen order. `Row`'s `PartialEq` is
4143/// structural (`Vec<Value>` ⇒ pairwise `Value` equality), which gives SQL
4144/// `NULL = NULL → TRUE` and `NaN = NaN → FALSE`. The first agrees with
4145/// the spec's "two NULLs are not distinct"; the second is a tolerated
4146/// quirk for v1 (no NaN literals are reachable from the SQL surface).
4147fn dedup_rows(rows: Vec<Row>) -> Vec<Row> {
4148    let mut out: Vec<Row> = Vec::with_capacity(rows.len());
4149    for r in rows {
4150        if !out.iter().any(|seen| seen == &r) {
4151            out.push(r);
4152        }
4153    }
4154    out
4155}
4156
4157/// Coerce a `Value` to an `f64` sort key for ORDER BY. Numbers map directly;
4158/// NULL sorts last (treated as `+∞`); booleans are 0.0 / 1.0; text uses lex
4159/// order via the byte values; vectors are not sortable.
4160fn value_to_order_key(v: &Value) -> Result<f64, EngineError> {
4161    match v {
4162        Value::Null => Ok(f64::INFINITY),
4163        Value::SmallInt(n) => Ok(f64::from(*n)),
4164        Value::Int(n) => Ok(f64::from(*n)),
4165        Value::Date(d) => Ok(f64::from(*d)),
4166        #[allow(clippy::cast_precision_loss)]
4167        Value::Timestamp(t) => Ok(*t as f64),
4168        #[allow(clippy::cast_precision_loss)]
4169        Value::Numeric { scaled, scale } => {
4170            // Scaled integer / 10^scale, computed via f64 for sort
4171            // ordering only. Precision losses here only matter for
4172            // ORDER BY tie-breaks well past 15 significant digits.
4173            // `f64::powi` lives in std; we hand-roll the loop so the
4174            // no_std engine crate doesn't need it.
4175            let mut divisor = 1.0_f64;
4176            for _ in 0..*scale {
4177                divisor *= 10.0;
4178            }
4179            Ok((*scaled as f64) / divisor)
4180        }
4181        #[allow(clippy::cast_precision_loss)]
4182        Value::BigInt(n) => Ok(*n as f64),
4183        Value::Float(x) => Ok(*x),
4184        Value::Bool(b) => Ok(if *b { 1.0 } else { 0.0 }),
4185        Value::Text(s) => {
4186            // Lex order by codepoints — good enough for ORDER BY name.
4187            // Map first 8 bytes packed into u64 as a coarse key; ties fall to
4188            // partial_cmp Equal. v1.x can swap in a real string comparator.
4189            let mut key: u64 = 0;
4190            for &b in s.as_bytes().iter().take(8) {
4191                key = (key << 8) | u64::from(b);
4192            }
4193            #[allow(clippy::cast_precision_loss)]
4194            Ok(key as f64)
4195        }
4196        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
4197            Err(EngineError::Unsupported(
4198                "ORDER BY of a raw vector column is not meaningful — use `<->`".into(),
4199            ))
4200        }
4201        Value::Interval { .. } => Err(EngineError::Unsupported(
4202            "ORDER BY of an INTERVAL is not supported in v2.11 \
4203             (months vs micros has no single canonical ordering)"
4204                .into(),
4205        )),
4206        Value::Json(_) => Err(EngineError::Unsupported(
4207            "ORDER BY of a JSON value is not supported — cast the document to text first".into(),
4208        )),
4209        // v7.5.0 — Value is #[non_exhaustive]; future variants need
4210        // an explicit ORDER BY mapping. Surface as Unsupported until
4211        // engine support is added.
4212        _ => Err(EngineError::Unsupported(
4213            "ORDER BY of this value type is not supported".into(),
4214        )),
4215    }
4216}
4217
4218/// Try to plan a WHERE clause as an equality lookup against an existing
4219/// index. Returns the candidate row indices on success; `None` means the
4220/// caller should fall back to a full scan.
4221///
4222/// v0.8 recognises a single top-level `col = literal` (in either operand
4223/// order). AND chains and range scans land in later milestones.
4224/// Look for `ORDER BY col <dist-op> literal LIMIT k` against an
4225/// NSW-indexed vector column. Recognised distance ops: `<->` (L2),
4226/// `<#>` (inner product), `<=>` (cosine). When a WHERE clause is
4227/// present, the planner does an "over-fetch and filter" pass — it
4228/// asks the graph for `k * over_fetch` candidates, evaluates WHERE
4229/// against each, and trims back to `k`. Returns the row indices in
4230/// ascending-distance order when the plan applies.
4231fn try_nsw_knn(
4232    stmt: &SelectStatement,
4233    table: &Table,
4234    schema_cols: &[ColumnSchema],
4235    table_alias: &str,
4236) -> Option<Vec<usize>> {
4237    if stmt.distinct {
4238        return None;
4239    }
4240    let limit = usize::try_from(stmt.limit_literal()?).ok()?;
4241    if limit == 0 {
4242        return None;
4243    }
4244    // v6.4.0 — NSW kNN dispatch needs a single ORDER BY key on the
4245    // distance metric. Multi-key ORDER BY falls through to the
4246    // generic sort path.
4247    if stmt.order_by.len() != 1 {
4248        return None;
4249    }
4250    let order = &stmt.order_by[0];
4251    // NSW kNN returns rows ascending by distance — DESC inverts the
4252    // natural order, so the planner can't handle it without a sort
4253    // pass. Fall back to the generic ORDER BY path.
4254    if order.desc {
4255        return None;
4256    }
4257    let Expr::Binary { lhs, op, rhs } = &order.expr else {
4258        return None;
4259    };
4260    let metric = match op {
4261        BinOp::L2Distance => spg_storage::NswMetric::L2,
4262        BinOp::InnerProduct => spg_storage::NswMetric::InnerProduct,
4263        BinOp::CosineDistance => spg_storage::NswMetric::Cosine,
4264        _ => return None,
4265    };
4266    // Accept both `col <op> literal` and `literal <op> col`.
4267    let ((Expr::Column(col), literal) | (literal, Expr::Column(col))) =
4268        (lhs.as_ref(), rhs.as_ref())
4269    else {
4270        return None;
4271    };
4272    if let Some(q) = &col.qualifier
4273        && q != table_alias
4274    {
4275        return None;
4276    }
4277    let col_pos = schema_cols.iter().position(|s| s.name == col.name)?;
4278    let query = literal_to_vector(literal)?;
4279    let idx = spg_storage::nsw_index_on(table, col_pos)?;
4280    if let Some(where_expr) = &stmt.where_ {
4281        // Over-fetch and filter. The factor (10×) is a heuristic that
4282        // covers typical selectivity for the corpus tests; v2.x will
4283        // make it configurable.
4284        let over_fetch = limit.saturating_mul(10).max(NSW_OVER_FETCH_FLOOR);
4285        let candidates = spg_storage::nsw_query(table, &idx.name, &query, over_fetch, metric);
4286        let ctx = EvalContext::new(schema_cols, Some(table_alias));
4287        let mut kept: Vec<usize> = Vec::with_capacity(limit);
4288        for i in candidates {
4289            let row = &table.rows()[i];
4290            let cond = eval::eval_expr(where_expr, row, &ctx).ok()?;
4291            if matches!(cond, Value::Bool(true)) {
4292                kept.push(i);
4293                if kept.len() >= limit {
4294                    break;
4295                }
4296            }
4297        }
4298        Some(kept)
4299    } else {
4300        Some(spg_storage::nsw_query(
4301            table, &idx.name, &query, limit, metric,
4302        ))
4303    }
4304}
4305
/// Lower bound on the over-fetch pool when WHERE is present — even
/// for tiny `LIMIT 1` queries we keep enough candidates to absorb a
/// few WHERE rejections. `try_nsw_knn` requests the larger of this
/// floor and its 10× over-fetch heuristic.
const NSW_OVER_FETCH_FLOOR: usize = 32;
4310
4311/// Pull a `Vec<f32>` out of a literal-or-cast expression. Returns
4312/// `None` for anything we can't fold at plan time.
4313fn literal_to_vector(e: &Expr) -> Option<Vec<f32>> {
4314    match e {
4315        Expr::Literal(Literal::Vector(v)) => Some(v.clone()),
4316        Expr::Cast { expr, .. } => literal_to_vector(expr),
4317        _ => None,
4318    }
4319}
4320
4321/// Materialise rows in a planner-supplied order (used by the NSW path)
4322/// without re-running ORDER BY. The projection + LIMIT slot mirror the
4323/// equivalent block in `exec_bare_select`.
4324fn materialise_in_order(
4325    stmt: &SelectStatement,
4326    table: &Table,
4327    schema_cols: &[ColumnSchema],
4328    table_alias: &str,
4329    ordered_rows: &[usize],
4330) -> Result<QueryResult, EngineError> {
4331    let ctx = EvalContext::new(schema_cols, Some(table_alias));
4332    let projection = build_projection(&stmt.items, schema_cols, table_alias)?;
4333    let mut output_rows: Vec<Row> = Vec::with_capacity(ordered_rows.len());
4334    for &i in ordered_rows {
4335        let row = &table.rows()[i];
4336        let mut values = Vec::with_capacity(projection.len());
4337        for p in &projection {
4338            values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4339        }
4340        output_rows.push(Row::new(values));
4341    }
4342    apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4343    let columns: Vec<ColumnSchema> = projection
4344        .into_iter()
4345        .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4346        .collect();
4347    Ok(QueryResult::Rows {
4348        columns,
4349        rows: output_rows,
4350    })
4351}
4352
4353fn try_index_seek<'a>(
4354    where_expr: &Expr,
4355    schema_cols: &[ColumnSchema],
4356    catalog: &'a Catalog,
4357    table: &'a Table,
4358    table_alias: &str,
4359) -> Option<Vec<Cow<'a, Row>>> {
4360    let Expr::Binary {
4361        lhs,
4362        op: BinOp::Eq,
4363        rhs,
4364    } = where_expr
4365    else {
4366        return None;
4367    };
4368    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4369        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4370    let idx = table.index_on(col_pos)?;
4371    let key = IndexKey::from_value(&value)?;
4372    let locators = idx.lookup_eq(&key);
4373    let table_name = table.schema().name.as_str();
4374    // v5.1: each locator dispatches to either the hot tier (zero-
4375    // copy borrow of `table.rows()[i]`) or a cold-tier segment
4376    // (one page read + dense row decode, ~µs scale). Cold rows are
4377    // returned as `Cow::Owned` so the caller's `&Row` iteration
4378    // doesn't see a tier distinction; pre-freezer (no cold
4379    // segments loaded) every locator is `Hot` and every entry is
4380    // `Cow::Borrowed` — identical cost to the pre-v5.1 path.
4381    let mut out: Vec<Cow<'a, Row>> = Vec::with_capacity(locators.len());
4382    for loc in locators {
4383        match *loc {
4384            spg_storage::RowLocator::Hot(i) => {
4385                if let Some(row) = table.rows().get(i) {
4386                    out.push(Cow::Borrowed(row));
4387                }
4388            }
4389            spg_storage::RowLocator::Cold { segment_id, .. } => {
4390                if let Some(row) = catalog.resolve_cold_locator(table_name, segment_id, &key) {
4391                    out.push(Cow::Owned(row));
4392                }
4393            }
4394        }
4395    }
4396    Some(out)
4397}
4398
4399/// v5.2.3: extract `(column_position, IndexKey)` when `where_expr`
4400/// is a simple `col = literal` predicate suitable for a `BTree` index
4401/// seek. Used by `exec_update_cancel` / `exec_delete_cancel` to
4402/// decide whether a write touches a cold-tier row (which requires
4403/// promote-on-write / shadow-on-delete) before falling through to
4404/// the hot-tier row walk.
4405///
4406/// Returns `None` for any predicate shape the planner can't push
4407/// down to an index seek — complex WHERE clauses always take the
4408/// hot-only path (cold rows are immutable to non-indexed writes
4409/// until a future scan-fanout sub-version).
4410fn try_pk_predicate(
4411    where_expr: &Expr,
4412    schema_cols: &[ColumnSchema],
4413    table_alias: &str,
4414) -> Option<(usize, IndexKey)> {
4415    let Expr::Binary {
4416        lhs,
4417        op: BinOp::Eq,
4418        rhs,
4419    } = where_expr
4420    else {
4421        return None;
4422    };
4423    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4424        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4425    let key = IndexKey::from_value(&value)?;
4426    Some((col_pos, key))
4427}
4428
4429fn resolve_col_literal_pair(
4430    col_side: &Expr,
4431    lit_side: &Expr,
4432    schema_cols: &[ColumnSchema],
4433    table_alias: &str,
4434) -> Option<(usize, Value)> {
4435    let Expr::Column(c) = col_side else {
4436        return None;
4437    };
4438    if let Some(q) = &c.qualifier
4439        && q != table_alias
4440    {
4441        return None;
4442    }
4443    let pos = schema_cols.iter().position(|s| s.name == c.name)?;
4444    let Expr::Literal(l) = lit_side else {
4445        return None;
4446    };
4447    let v = match l {
4448        Literal::Integer(n) => {
4449            if let Ok(small) = i32::try_from(*n) {
4450                Value::Int(small)
4451            } else {
4452                Value::BigInt(*n)
4453            }
4454        }
4455        Literal::Float(x) => Value::Float(*x),
4456        Literal::String(s) => Value::Text(s.clone()),
4457        Literal::Bool(b) => Value::Bool(*b),
4458        Literal::Null => Value::Null,
4459        // Vector and Interval literals can't be used as B-tree index keys.
4460        // Tell the planner to fall back to full-scan.
4461        Literal::Vector(_) | Literal::Interval { .. } => return None,
4462    };
4463    Some((pos, v))
4464}
4465
4466/// Find the schema entry that a SELECT-list `Expr::Column` refers to.
4467/// Mirrors `resolve_column` in `eval.rs`, but returns a proper
4468/// `EngineError` so the projection-build path keeps `UnknownQualifier`
4469/// vs `ColumnNotFound` distinct.
4470fn resolve_projection_column<'a>(
4471    c: &ColumnName,
4472    schema_cols: &'a [ColumnSchema],
4473    table_alias: &str,
4474) -> Result<&'a ColumnSchema, EngineError> {
4475    if let Some(q) = &c.qualifier {
4476        let composite = alloc::format!("{q}.{name}", name = c.name);
4477        if let Some(s) = schema_cols.iter().find(|s| s.name == composite) {
4478            return Ok(s);
4479        }
4480        // Single-table case: the qualifier may equal the active alias —
4481        // then look for the bare column name.
4482        if q == table_alias
4483            && let Some(s) = schema_cols.iter().find(|s| s.name == c.name)
4484        {
4485            return Ok(s);
4486        }
4487        // For multi-table schemas the qualifier is unknown only if no
4488        // column bears the "<q>." prefix. For single-table, the alias
4489        // mismatch alone is enough.
4490        let prefix = alloc::format!("{q}.");
4491        let qualifier_known =
4492            q == table_alias || schema_cols.iter().any(|s| s.name.starts_with(&prefix));
4493        if !qualifier_known {
4494            return Err(EngineError::Eval(EvalError::UnknownQualifier {
4495                qualifier: q.clone(),
4496            }));
4497        }
4498        return Err(EngineError::Eval(EvalError::ColumnNotFound {
4499            name: c.name.clone(),
4500        }));
4501    }
4502    if let Some(s) = schema_cols.iter().find(|s| s.name == c.name) {
4503        return Ok(s);
4504    }
4505    let suffix = alloc::format!(".{name}", name = c.name);
4506    let mut matches = schema_cols.iter().filter(|s| s.name.ends_with(&suffix));
4507    let first = matches.next();
4508    let extra = matches.next();
4509    match (first, extra) {
4510        (Some(s), None) => Ok(s),
4511        (Some(_), Some(_)) => Err(EngineError::Eval(EvalError::TypeMismatch {
4512            detail: alloc::format!("ambiguous column reference: {}", c.name),
4513        })),
4514        _ => Err(EngineError::Eval(EvalError::ColumnNotFound {
4515            name: c.name.clone(),
4516        })),
4517    }
4518}
4519
4520fn build_projection(
4521    items: &[SelectItem],
4522    schema_cols: &[ColumnSchema],
4523    table_alias: &str,
4524) -> Result<Vec<ProjectedItem>, EngineError> {
4525    let mut out = Vec::new();
4526    for item in items {
4527        match item {
4528            SelectItem::Wildcard => {
4529                for col in schema_cols {
4530                    out.push(ProjectedItem {
4531                        expr: Expr::Column(ColumnName {
4532                            qualifier: None,
4533                            name: col.name.clone(),
4534                        }),
4535                        output_name: col.name.clone(),
4536                        ty: col.ty,
4537                        nullable: col.nullable,
4538                    });
4539                }
4540            }
4541            SelectItem::Expr { expr, alias } => {
4542                // Plain column ref keeps full schema info (real type +
4543                // nullability). Compound expressions evaluate fine but have
4544                // no static type — surface them as nullable TEXT, which is
4545                // what most clients render anyway.
4546                if let Expr::Column(c) = expr {
4547                    let sch = resolve_projection_column(c, schema_cols, table_alias)?;
4548                    let output_name = alias.clone().unwrap_or_else(|| c.name.clone());
4549                    out.push(ProjectedItem {
4550                        expr: expr.clone(),
4551                        output_name,
4552                        ty: sch.ty,
4553                        nullable: sch.nullable,
4554                    });
4555                } else {
4556                    let output_name = alias.clone().unwrap_or_else(|| expr.to_string());
4557                    out.push(ProjectedItem {
4558                        expr: expr.clone(),
4559                        output_name,
4560                        ty: DataType::Text,
4561                        nullable: true,
4562                    });
4563                }
4564            }
4565        }
4566    }
4567    Ok(out)
4568}
4569
4570/// Promote an integer to a NUMERIC value at the requested scale.
4571/// Rejects values that, after scaling, would overflow the column's
4572/// precision budget.
4573fn numeric_from_integer(
4574    n: i128,
4575    precision: u8,
4576    scale: u8,
4577    col_name: &str,
4578) -> Result<Value, EngineError> {
4579    let factor = pow10_i128(scale);
4580    let scaled = n.checked_mul(factor).ok_or_else(|| {
4581        EngineError::Unsupported(alloc::format!(
4582            "integer overflow scaling value for column `{col_name}` to scale {scale}"
4583        ))
4584    })?;
4585    check_precision(scaled, precision, col_name)?;
4586    Ok(Value::Numeric { scaled, scale })
4587}
4588
4589/// Float → NUMERIC. Uses round-half-away-from-zero on `x * 10^scale`,
4590/// then verifies the result fits the column's precision.
4591#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
4592fn numeric_from_float(
4593    x: f64,
4594    precision: u8,
4595    scale: u8,
4596    col_name: &str,
4597) -> Result<Value, EngineError> {
4598    if !x.is_finite() {
4599        return Err(EngineError::Unsupported(alloc::format!(
4600            "cannot store non-finite float in NUMERIC column `{col_name}`"
4601        )));
4602    }
4603    let mut factor = 1.0_f64;
4604    for _ in 0..scale {
4605        factor *= 10.0;
4606    }
4607    // Round half-away-from-zero by biasing then casting (`as i128`
4608    // truncates toward zero, so the bias + truncation gives the
4609    // desired rounding). `f64::floor` / `ceil` live in std; we don't
4610    // need them — the cast handles the truncation step.
4611    let shifted = x * factor;
4612    let biased = if shifted >= 0.0 {
4613        shifted + 0.5
4614    } else {
4615        shifted - 0.5
4616    };
4617    // Range-check before casting back to i128 — the cast itself is
4618    // saturating in Rust, which would silently truncate huge inputs.
4619    if !(-1e38..=1e38).contains(&biased) {
4620        return Err(EngineError::Unsupported(alloc::format!(
4621            "value {x} overflows NUMERIC range for column `{col_name}`"
4622        )));
4623    }
4624    let scaled = biased as i128;
4625    check_precision(scaled, precision, col_name)?;
4626    Ok(Value::Numeric { scaled, scale })
4627}
4628
4629/// Move a Numeric value from `src_scale` to `dst_scale`. Going up
4630/// multiplies by 10; going down rounds half-away-from-zero.
4631fn numeric_rescale(
4632    scaled: i128,
4633    src_scale: u8,
4634    precision: u8,
4635    dst_scale: u8,
4636    col_name: &str,
4637) -> Result<Value, EngineError> {
4638    let new_scaled = if dst_scale >= src_scale {
4639        let bump = pow10_i128(dst_scale - src_scale);
4640        scaled.checked_mul(bump).ok_or_else(|| {
4641            EngineError::Unsupported(alloc::format!(
4642                "overflow rescaling NUMERIC for column `{col_name}`"
4643            ))
4644        })?
4645    } else {
4646        let drop = pow10_i128(src_scale - dst_scale);
4647        let half = drop / 2;
4648        if scaled >= 0 {
4649            (scaled + half) / drop
4650        } else {
4651            (scaled - half) / drop
4652        }
4653    };
4654    check_precision(new_scaled, precision, col_name)?;
4655    Ok(Value::Numeric {
4656        scaled: new_scaled,
4657        scale: dst_scale,
4658    })
4659}
4660
4661/// Drop the fractional part of a scaled integer, returning the integer
4662/// portion (toward zero). Used for NUMERIC → INT casts.
4663const fn numeric_truncate_to_integer(scaled: i128, scale: u8) -> i128 {
4664    if scale == 0 {
4665        return scaled;
4666    }
4667    let factor = pow10_i128_const(scale);
4668    scaled / factor
4669}
4670
4671/// Verify a scaled NUMERIC value fits the column's declared precision.
4672/// `precision == 0` is the "unconstrained" form (bare `NUMERIC`); we
4673/// skip the check there.
4674fn check_precision(scaled: i128, precision: u8, col_name: &str) -> Result<(), EngineError> {
4675    if precision == 0 {
4676        return Ok(());
4677    }
4678    let limit = pow10_i128(precision);
4679    if scaled.unsigned_abs() >= limit.unsigned_abs() {
4680        return Err(EngineError::Unsupported(alloc::format!(
4681            "NUMERIC value exceeds precision {precision} for column `{col_name}`"
4682        )));
4683    }
4684    Ok(())
4685}
4686
/// `10^p` as an `i128`, usable in `const` contexts.
///
/// `10^38` is the largest power of ten an `i128` can hold. For `p > 38`
/// the result saturates at `i128::MAX` instead of overflowing (which
/// would panic in debug builds and wrap to a meaningless value in release
/// builds).
const fn pow10_i128_const(p: u8) -> i128 {
    let mut acc: i128 = 1;
    let mut i = 0;
    while i < p {
        acc = acc.saturating_mul(10);
        i += 1;
    }
    acc
}
4696
/// Non-`const` entry point for `10^p`; forwards to `pow10_i128_const`.
fn pow10_i128(p: u8) -> i128 {
    pow10_i128_const(p)
}
4700
4701/// Walk a parsed `Statement`, swapping any `NOW()` /
4702/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()` function calls for a
4703/// literal cast that wraps the engine's per-statement clock reading.
4704/// When `now_micros` is `None`, calls stay as-is and surface as
4705/// `unknown function` at eval time — keeps the error path explicit.
4706/// v4.10: pre-walk the WHERE / projection / etc. of a SELECT and
4707/// replace every subquery node with a materialised literal. SPG
4708/// only supports uncorrelated subqueries — the inner SELECT does
4709/// not see outer-row columns, so the result is the same for every
4710/// outer row and can be evaluated once.
4711///
4712/// Returns the rewritten statement; the caller passes this to the
4713/// regular row-loop executor which no longer sees Subquery nodes
4714/// in its tree.
4715impl Engine {
4716    /// v4.12 window executor. Implements `ROW_NUMBER` / `RANK` /
4717    /// `DENSE_RANK` and the partition-aware aggregates `SUM` /
4718    /// `AVG` / `COUNT` / `MIN` / `MAX`. The plan is:
4719    /// 1. Apply the WHERE filter.
4720    /// 2. For each unique `WindowFunction` node in the projection,
4721    ///    partition + sort, compute the per-row value.
4722    /// 3. Append the window values as synthetic columns (`__win_N`)
4723    ///    to the row schema.
4724    /// 4. Rewrite the projection to read those columns.
4725    /// 5. Hand off to the regular project / ORDER BY / LIMIT pipe.
4726    #[allow(
4727        clippy::too_many_lines,
4728        clippy::type_complexity,
4729        clippy::needless_range_loop
4730    )] // window-eval is one cohesive pipe; splitting fragments
4731    fn exec_select_with_window(
4732        &self,
4733        stmt: &SelectStatement,
4734        cancel: CancelToken<'_>,
4735    ) -> Result<QueryResult, EngineError> {
4736        let from = stmt.from.as_ref().ok_or_else(|| {
4737            EngineError::Unsupported("window functions require a FROM clause".into())
4738        })?;
4739        // For v4.12 we only support a single-table FROM. Joins +
4740        // windows is queued for v5.x.
4741        if !from.joins.is_empty() {
4742            return Err(EngineError::Unsupported(
4743                "JOIN with window functions not yet supported".into(),
4744            ));
4745        }
4746        let primary = &from.primary;
4747        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
4748            StorageError::TableNotFound {
4749                name: primary.name.clone(),
4750            }
4751        })?;
4752        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
4753        let schema_cols = &table.schema().columns;
4754        let ctx = EvalContext::new(schema_cols, Some(alias));
4755
4756        // 1) Filter pass.
4757        let mut filtered: Vec<&Row> = Vec::new();
4758        for (i, row) in table.rows().iter().enumerate() {
4759            if i.is_multiple_of(256) {
4760                cancel.check()?;
4761            }
4762            if let Some(w) = &stmt.where_ {
4763                let cond = eval::eval_expr(w, row, &ctx)?;
4764                if !matches!(cond, Value::Bool(true)) {
4765                    continue;
4766                }
4767            }
4768            filtered.push(row);
4769        }
4770        let n_rows = filtered.len();
4771
4772        // 2) Collect unique window function nodes from projection.
4773        let mut window_nodes: Vec<Expr> = Vec::new();
4774        for item in &stmt.items {
4775            if let SelectItem::Expr { expr, .. } = item {
4776                collect_window_nodes(expr, &mut window_nodes);
4777            }
4778        }
4779
4780        // 3) For each window, compute per-row value.
4781        // Index: same order as window_nodes; for row i, win_vals[w][i].
4782        let mut win_vals: Vec<Vec<Value>> = Vec::with_capacity(window_nodes.len());
4783        for wnode in &window_nodes {
4784            let Expr::WindowFunction {
4785                name,
4786                args,
4787                partition_by,
4788                order_by,
4789                frame,
4790                null_treatment,
4791            } = wnode
4792            else {
4793                unreachable!("collect_window_nodes pushes only WindowFunction");
4794            };
4795            // Compute (partition_key, order_key, original_index) for each row.
4796            let mut indexed: Vec<(Vec<Value>, Vec<(Value, bool)>, usize)> =
4797                Vec::with_capacity(n_rows);
4798            for (i, row) in filtered.iter().enumerate() {
4799                let pkey: Vec<Value> = partition_by
4800                    .iter()
4801                    .map(|p| eval::eval_expr(p, row, &ctx))
4802                    .collect::<Result<_, _>>()?;
4803                let okey: Vec<(Value, bool)> = order_by
4804                    .iter()
4805                    .map(|(e, desc)| eval::eval_expr(e, row, &ctx).map(|v| (v, *desc)))
4806                    .collect::<Result<_, _>>()?;
4807                indexed.push((pkey, okey, i));
4808            }
4809            // Sort by (partition_key, order_key). Partition key uses
4810            // a stable encoded form; order key respects ASC/DESC.
4811            indexed.sort_by(|a, b| {
4812                let p_cmp = partition_key_cmp(&a.0, &b.0);
4813                if p_cmp != core::cmp::Ordering::Equal {
4814                    return p_cmp;
4815                }
4816                order_key_cmp(&a.1, &b.1)
4817            });
4818            // Per-partition compute.
4819            let mut out_vals: Vec<Value> = alloc::vec![Value::Null; n_rows];
4820            let mut p_start = 0;
4821            while p_start < indexed.len() {
4822                let mut p_end = p_start + 1;
4823                while p_end < indexed.len()
4824                    && partition_key_cmp(&indexed[p_start].0, &indexed[p_end].0)
4825                        == core::cmp::Ordering::Equal
4826                {
4827                    p_end += 1;
4828                }
4829                // Compute the function within this partition slice.
4830                compute_window_partition(
4831                    name,
4832                    args,
4833                    !order_by.is_empty(),
4834                    frame.as_ref(),
4835                    *null_treatment,
4836                    &indexed[p_start..p_end],
4837                    &filtered,
4838                    &ctx,
4839                    &mut out_vals,
4840                )?;
4841                p_start = p_end;
4842            }
4843            win_vals.push(out_vals);
4844        }
4845
4846        // 4) Build extended schema: original columns + synthetic.
4847        let mut ext_cols = schema_cols.clone();
4848        for i in 0..window_nodes.len() {
4849            ext_cols.push(ColumnSchema::new(
4850                alloc::format!("__win_{i}"),
4851                DataType::Text, // type doesn't matter for projection eval
4852                true,
4853            ));
4854        }
4855        // 5) Build extended rows: each row gets its window values appended.
4856        let mut ext_rows: Vec<Row> = Vec::with_capacity(n_rows);
4857        for i in 0..n_rows {
4858            let mut values = filtered[i].values.clone();
4859            for w in 0..window_nodes.len() {
4860                values.push(win_vals[w][i].clone());
4861            }
4862            ext_rows.push(Row::new(values));
4863        }
4864        // 6) Rewrite the projection: WindowFunction nodes → Column(__win_N).
4865        let mut rewritten_items: Vec<SelectItem> = Vec::with_capacity(stmt.items.len());
4866        for item in &stmt.items {
4867            let new_item = match item {
4868                SelectItem::Wildcard => SelectItem::Wildcard,
4869                SelectItem::Expr { expr, alias } => {
4870                    let mut e = expr.clone();
4871                    rewrite_window_to_columns(&mut e, &window_nodes);
4872                    SelectItem::Expr {
4873                        expr: e,
4874                        alias: alias.clone(),
4875                    }
4876                }
4877            };
4878            rewritten_items.push(new_item);
4879        }
4880
4881        // 7) Project into final rows.
4882        let ext_ctx = EvalContext::new(&ext_cols, Some(alias));
4883        let projection = build_projection(&rewritten_items, &ext_cols, alias)?;
4884        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(n_rows);
4885        for (i, row) in ext_rows.iter().enumerate() {
4886            if i.is_multiple_of(256) {
4887                cancel.check()?;
4888            }
4889            let mut values = Vec::with_capacity(projection.len());
4890            for p in &projection {
4891                values.push(eval::eval_expr(&p.expr, row, &ext_ctx)?);
4892            }
4893            let order_keys = if stmt.order_by.is_empty() {
4894                Vec::new()
4895            } else {
4896                let mut keys = Vec::with_capacity(stmt.order_by.len());
4897                for o in &stmt.order_by {
4898                    let mut e = o.expr.clone();
4899                    rewrite_window_to_columns(&mut e, &window_nodes);
4900                    let key = eval::eval_expr(&e, row, &ext_ctx)?;
4901                    keys.push(value_to_order_key(&key)?);
4902                }
4903                keys
4904            };
4905            tagged.push((order_keys, Row::new(values)));
4906        }
4907        // ORDER BY + LIMIT/OFFSET on the projected rows.
4908        if !stmt.order_by.is_empty() {
4909            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4910            sort_by_keys(&mut tagged, &descs);
4911        }
4912        let mut out_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4913        apply_offset_and_limit(&mut out_rows, stmt.offset_literal(), stmt.limit_literal());
4914        let final_cols: Vec<ColumnSchema> = projection
4915            .into_iter()
4916            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4917            .collect();
4918        Ok(QueryResult::Rows {
4919            columns: final_cols,
4920            rows: out_rows,
4921        })
4922    }
4923
4924    /// v4.11: materialise each CTE into a temp table inside a
4925    /// cloned catalog, then run the body SELECT against a fresh
4926    /// engine instance that owns the enriched catalog. The clone
4927    /// is moderately expensive — only paid by CTE-bearing queries.
4928    /// Subqueries inside CTE bodies / the main body resolve as
4929    /// usual; `clock_fn` is propagated so `NOW()` lines up.
4930    fn exec_with_ctes(
4931        &self,
4932        stmt: &SelectStatement,
4933        cancel: CancelToken<'_>,
4934    ) -> Result<QueryResult, EngineError> {
4935        cancel.check()?;
4936        let mut catalog = self.active_catalog().clone();
4937        for cte in &stmt.ctes {
4938            if catalog.get(&cte.name).is_some() {
4939                return Err(EngineError::Unsupported(alloc::format!(
4940                    "CTE name {:?} shadows an existing table; rename the CTE",
4941                    cte.name
4942                )));
4943            }
4944            let (columns, rows) = if cte.recursive {
4945                self.materialise_recursive_cte(cte, &catalog, cancel)?
4946            } else {
4947                let body_result = self.exec_select_cancel(&cte.body, cancel)?;
4948                let QueryResult::Rows { columns, rows } = body_result else {
4949                    return Err(EngineError::Unsupported(alloc::format!(
4950                        "CTE {:?} body did not return rows",
4951                        cte.name
4952                    )));
4953                };
4954                (columns, rows)
4955            };
4956            // v4.22: the projection builder labels any non-column
4957            // expression as Text — including literal SELECT 1.
4958            // Promote each column's type to whatever the rows
4959            // actually carry so the CTE storage table accepts them.
4960            let inferred = infer_column_types(&columns, &rows);
4961            let mut columns = inferred;
4962            // v4.22: apply optional `WITH name(a, b, c)` overrides.
4963            if !cte.column_overrides.is_empty() {
4964                if cte.column_overrides.len() != columns.len() {
4965                    return Err(EngineError::Unsupported(alloc::format!(
4966                        "CTE {:?} column list has {} names but body returns {} columns",
4967                        cte.name,
4968                        cte.column_overrides.len(),
4969                        columns.len()
4970                    )));
4971                }
4972                for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
4973                    col.name.clone_from(name);
4974                }
4975            }
4976            let schema = TableSchema::new(cte.name.clone(), columns);
4977            catalog.create_table(schema).map_err(EngineError::Storage)?;
4978            let table = catalog
4979                .get_mut(&cte.name)
4980                .expect("just-created CTE table must exist");
4981            for row in rows {
4982                table.insert(row).map_err(EngineError::Storage)?;
4983            }
4984        }
4985        // Strip CTEs from the body before running on the temp engine
4986        // so we don't recurse forever.
4987        let mut body = stmt.clone();
4988        body.ctes = Vec::new();
4989        let mut temp = Engine::restore(catalog);
4990        if let Some(c) = self.clock {
4991            temp = temp.with_clock(c);
4992        }
4993        if let Some(f) = self.salt_fn {
4994            temp = temp.with_salt_fn(f);
4995        }
4996        temp.exec_select_cancel(&body, cancel)
4997    }
4998
4999    /// v4.22: materialise a WITH RECURSIVE CTE. The body must be a
5000    /// UNION (or UNION ALL) of an anchor that does not reference
5001    /// the CTE name, and one or more recursive terms that do. The
5002    /// anchor runs first; each subsequent iteration runs the
5003    /// recursive term against a temp catalog where the CTE name is
5004    /// bound to the *previous* iteration's output. Iteration stops
5005    /// when the recursive term yields no rows; UNION (DISTINCT)
5006    /// deduplicates against the accumulated result, UNION ALL does
5007    /// not. A hard cap on total rows prevents runaway queries.
5008    #[allow(clippy::too_many_lines)]
5009    fn materialise_recursive_cte(
5010        &self,
5011        cte: &spg_sql::ast::Cte,
5012        base_catalog: &Catalog,
5013        cancel: CancelToken<'_>,
5014    ) -> Result<(Vec<ColumnSchema>, Vec<Row>), EngineError> {
5015        const MAX_TOTAL_ROWS: usize = 1_000_000;
5016        const MAX_ITERATIONS: usize = 100_000;
5017        cancel.check()?;
5018        if cte.body.unions.is_empty() {
5019            return Err(EngineError::Unsupported(alloc::format!(
5020                "WITH RECURSIVE {:?} body must be a UNION of an anchor and a recursive term",
5021                cte.name
5022            )));
5023        }
5024        // Anchor: the body's leading SELECT, with unions stripped.
5025        let mut anchor = cte.body.clone();
5026        let union_terms = core::mem::take(&mut anchor.unions);
5027        anchor.ctes = Vec::new();
5028        // Anchor must not reference the CTE name.
5029        if select_refers_to(&anchor, &cte.name) {
5030            return Err(EngineError::Unsupported(alloc::format!(
5031                "WITH RECURSIVE {:?}: the anchor must not reference the CTE itself",
5032                cte.name
5033            )));
5034        }
5035        let anchor_result = self.exec_select_cancel(&anchor, cancel)?;
5036        let QueryResult::Rows {
5037            columns: anchor_cols,
5038            rows: anchor_rows,
5039        } = anchor_result
5040        else {
5041            return Err(EngineError::Unsupported(alloc::format!(
5042                "WITH RECURSIVE {:?}: anchor did not return rows",
5043                cte.name
5044            )));
5045        };
5046        // The projection builder labels non-column expressions Text;
5047        // refine column types from the anchor's actual values so the
5048        // intermediate iter-catalog tables accept them.
5049        let mut columns = infer_column_types(&anchor_cols, &anchor_rows);
5050        if !cte.column_overrides.is_empty() {
5051            if cte.column_overrides.len() != columns.len() {
5052                return Err(EngineError::Unsupported(alloc::format!(
5053                    "CTE {:?} column list has {} names but anchor returns {} columns",
5054                    cte.name,
5055                    cte.column_overrides.len(),
5056                    columns.len()
5057                )));
5058            }
5059            for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
5060                col.name.clone_from(name);
5061            }
5062        }
5063        let mut all_rows: Vec<Row> = anchor_rows.clone();
5064        let mut working_set: Vec<Row> = anchor_rows;
5065        let mut seen: alloc::collections::BTreeSet<Vec<u8>> = alloc::collections::BTreeSet::new();
5066        // Track at least one "all UNION ALL" flag — if every union
5067        // kind is ALL we skip the dedup step (faster + matches PG).
5068        let all_union_all = union_terms.iter().all(|(k, _)| matches!(k, UnionKind::All));
5069        if !all_union_all {
5070            for r in &all_rows {
5071                seen.insert(encode_row_key(r));
5072            }
5073        }
5074        for iter in 0..MAX_ITERATIONS {
5075            cancel.check()?;
5076            if working_set.is_empty() {
5077                break;
5078            }
5079            // Build a fresh catalog: base + CTE bound to working_set.
5080            let mut iter_catalog = base_catalog.clone();
5081            let schema = TableSchema::new(cte.name.clone(), columns.clone());
5082            iter_catalog
5083                .create_table(schema)
5084                .map_err(EngineError::Storage)?;
5085            {
5086                let table = iter_catalog.get_mut(&cte.name).expect("just-created");
5087                for row in &working_set {
5088                    table.insert(row.clone()).map_err(EngineError::Storage)?;
5089                }
5090            }
5091            let mut iter_engine = Engine::restore(iter_catalog);
5092            if let Some(c) = self.clock {
5093                iter_engine = iter_engine.with_clock(c);
5094            }
5095            if let Some(f) = self.salt_fn {
5096                iter_engine = iter_engine.with_salt_fn(f);
5097            }
5098            // Run each recursive term in sequence and collect new rows.
5099            let mut next_set: Vec<Row> = Vec::new();
5100            for (_, term) in &union_terms {
5101                let mut term = term.clone();
5102                term.ctes = Vec::new();
5103                let r = iter_engine.exec_select_cancel(&term, cancel)?;
5104                let QueryResult::Rows {
5105                    columns: rc,
5106                    rows: rs,
5107                } = r
5108                else {
5109                    return Err(EngineError::Unsupported(alloc::format!(
5110                        "WITH RECURSIVE {:?}: recursive term did not return rows",
5111                        cte.name
5112                    )));
5113                };
5114                if rc.len() != columns.len() {
5115                    return Err(EngineError::Unsupported(alloc::format!(
5116                        "WITH RECURSIVE {:?}: column count of recursive term ({}) does not match anchor ({})",
5117                        cte.name,
5118                        rc.len(),
5119                        columns.len()
5120                    )));
5121                }
5122                for row in rs {
5123                    if !all_union_all {
5124                        let key = encode_row_key(&row);
5125                        if !seen.insert(key) {
5126                            continue;
5127                        }
5128                    }
5129                    next_set.push(row);
5130                }
5131            }
5132            if next_set.is_empty() {
5133                break;
5134            }
5135            all_rows.extend(next_set.iter().cloned());
5136            working_set = next_set;
5137            if all_rows.len() > MAX_TOTAL_ROWS {
5138                return Err(EngineError::Unsupported(alloc::format!(
5139                    "WITH RECURSIVE {:?}: produced more than {MAX_TOTAL_ROWS} rows — likely runaway recursion",
5140                    cte.name
5141                )));
5142            }
5143            if iter + 1 == MAX_ITERATIONS {
5144                return Err(EngineError::Unsupported(alloc::format!(
5145                    "WITH RECURSIVE {:?}: exceeded {MAX_ITERATIONS} iterations",
5146                    cte.name
5147                )));
5148            }
5149        }
5150        Ok((columns, all_rows))
5151    }
5152
5153    fn resolve_select_subqueries(
5154        &self,
5155        stmt: &mut SelectStatement,
5156        cancel: CancelToken<'_>,
5157    ) -> Result<(), EngineError> {
5158        for item in &mut stmt.items {
5159            if let SelectItem::Expr { expr, .. } = item {
5160                self.resolve_expr_subqueries(expr, cancel)?;
5161            }
5162        }
5163        if let Some(w) = &mut stmt.where_ {
5164            self.resolve_expr_subqueries(w, cancel)?;
5165        }
5166        if let Some(gs) = &mut stmt.group_by {
5167            for g in gs {
5168                self.resolve_expr_subqueries(g, cancel)?;
5169            }
5170        }
5171        if let Some(h) = &mut stmt.having {
5172            self.resolve_expr_subqueries(h, cancel)?;
5173        }
5174        for o in &mut stmt.order_by {
5175            self.resolve_expr_subqueries(&mut o.expr, cancel)?;
5176        }
5177        for (_, peer) in &mut stmt.unions {
5178            self.resolve_select_subqueries(peer, cancel)?;
5179        }
5180        Ok(())
5181    }
5182
5183    #[allow(clippy::only_used_in_recursion)] // engine handle reads aren't really pure
5184    fn resolve_expr_subqueries(
5185        &self,
5186        e: &mut Expr,
5187        cancel: CancelToken<'_>,
5188    ) -> Result<(), EngineError> {
5189        // Replace-on-this-node cases first.
5190        if let Some(replacement) = self.subquery_replacement(e, cancel)? {
5191            *e = replacement;
5192            return Ok(());
5193        }
5194        match e {
5195            Expr::Binary { lhs, rhs, .. } => {
5196                self.resolve_expr_subqueries(lhs, cancel)?;
5197                self.resolve_expr_subqueries(rhs, cancel)?;
5198            }
5199            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5200                self.resolve_expr_subqueries(expr, cancel)?;
5201            }
5202            Expr::FunctionCall { args, .. } => {
5203                for a in args {
5204                    self.resolve_expr_subqueries(a, cancel)?;
5205                }
5206            }
5207            Expr::Like { expr, pattern, .. } => {
5208                self.resolve_expr_subqueries(expr, cancel)?;
5209                self.resolve_expr_subqueries(pattern, cancel)?;
5210            }
5211            Expr::Extract { source, .. } => self.resolve_expr_subqueries(source, cancel)?,
5212            // v4.12 window functions — recurse into args + ORDER BY
5213            // + PARTITION BY in case they carry inner subqueries.
5214            Expr::WindowFunction {
5215                args,
5216                partition_by,
5217                order_by,
5218                ..
5219            } => {
5220                for a in args {
5221                    self.resolve_expr_subqueries(a, cancel)?;
5222                }
5223                for p in partition_by {
5224                    self.resolve_expr_subqueries(p, cancel)?;
5225                }
5226                for (e, _) in order_by {
5227                    self.resolve_expr_subqueries(e, cancel)?;
5228                }
5229            }
5230            // Subquery nodes are handled in subquery_replacement
5231            // (which returned None — defensive no-op); Literal /
5232            // Column are leaves.
5233            Expr::ScalarSubquery(_)
5234            | Expr::Exists { .. }
5235            | Expr::InSubquery { .. }
5236            | Expr::Literal(_)
5237            | Expr::Placeholder(_)
5238            | Expr::Column(_) => {}
5239        }
5240        Ok(())
5241    }
5242
5243    /// v4.23: per-row eval that handles correlated subqueries.
5244    /// Equivalent to `eval::eval_expr` when the expression has no
5245    /// subqueries; otherwise clones the expression, substitutes
5246    /// outer-row columns into each surviving subquery node, runs
5247    /// the inner SELECT, and replaces the node with the literal
5248    /// result. Only the WHERE-filter call sites use this path so
5249    /// the uncorrelated fast path is preserved everywhere else.
5250    fn eval_expr_with_correlated(
5251        &self,
5252        expr: &Expr,
5253        row: &Row,
5254        ctx: &EvalContext<'_>,
5255        cancel: CancelToken<'_>,
5256        memo: Option<&mut memoize::MemoizeCache>,
5257    ) -> Result<Value, EngineError> {
5258        if !expr_has_subquery(expr) {
5259            return eval::eval_expr(expr, row, ctx).map_err(EngineError::Eval);
5260        }
5261        let mut e = expr.clone();
5262        self.resolve_correlated_in_expr(&mut e, row, ctx, cancel, memo)?;
5263        eval::eval_expr(&e, row, ctx).map_err(EngineError::Eval)
5264    }
5265
5266    fn resolve_correlated_in_expr(
5267        &self,
5268        e: &mut Expr,
5269        row: &Row,
5270        ctx: &EvalContext<'_>,
5271        cancel: CancelToken<'_>,
5272        mut memo: Option<&mut memoize::MemoizeCache>,
5273    ) -> Result<(), EngineError> {
5274        match e {
5275            Expr::ScalarSubquery(inner) => {
5276                // v6.2.6 — Memoize: build the cache key from the
5277                // pre-substitution subquery repr + the outer row's
5278                // values. Two outer rows with identical correlated
5279                // values hit the same entry.
5280                let cache_key = memo.as_ref().map(|_| memoize::CacheKey {
5281                    subquery_repr: alloc::format!("{}", **inner),
5282                    outer_values: row.values.clone(),
5283                });
5284                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key.as_ref())
5285                    && let Some(cached) = cache.get(k)
5286                {
5287                    *e = value_to_literal_expr(cached)?;
5288                    return Ok(());
5289                }
5290                let mut s = (**inner).clone();
5291                substitute_outer_columns(&mut s, row, ctx);
5292                let r = self.exec_select_cancel(&s, cancel)?;
5293                let QueryResult::Rows { rows, .. } = r else {
5294                    return Err(EngineError::Unsupported(
5295                        "scalar subquery: inner did not return rows".into(),
5296                    ));
5297                };
5298                let value = match rows.as_slice() {
5299                    [] => Value::Null,
5300                    [r0] => r0.values.first().cloned().unwrap_or(Value::Null),
5301                    _ => {
5302                        return Err(EngineError::Unsupported(alloc::format!(
5303                            "scalar subquery returned {} rows; expected 0 or 1",
5304                            rows.len()
5305                        )));
5306                    }
5307                };
5308                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key) {
5309                    cache.insert(k, value.clone());
5310                }
5311                *e = value_to_literal_expr(value)?;
5312            }
5313            Expr::Exists { subquery, negated } => {
5314                let mut s = (**subquery).clone();
5315                substitute_outer_columns(&mut s, row, ctx);
5316                let r = self.exec_select_cancel(&s, cancel)?;
5317                let exists = matches!(r, QueryResult::Rows { rows, .. } if !rows.is_empty());
5318                let bit = if *negated { !exists } else { exists };
5319                *e = Expr::Literal(Literal::Bool(bit));
5320            }
5321            Expr::InSubquery {
5322                expr: lhs,
5323                subquery,
5324                negated,
5325            } => {
5326                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5327                let lhs_val = eval::eval_expr(lhs, row, ctx).map_err(EngineError::Eval)?;
5328                let mut s = (**subquery).clone();
5329                substitute_outer_columns(&mut s, row, ctx);
5330                let r = self.exec_select_cancel(&s, cancel)?;
5331                let QueryResult::Rows { columns, rows, .. } = r else {
5332                    return Err(EngineError::Unsupported(
5333                        "IN-subquery: inner did not return rows".into(),
5334                    ));
5335                };
5336                if columns.len() != 1 {
5337                    return Err(EngineError::Unsupported(alloc::format!(
5338                        "IN-subquery must project exactly one column; got {}",
5339                        columns.len()
5340                    )));
5341                }
5342                let mut found = false;
5343                let mut any_null = false;
5344                for r0 in rows {
5345                    let v = r0.values.into_iter().next().unwrap_or(Value::Null);
5346                    if v.is_null() {
5347                        any_null = true;
5348                        continue;
5349                    }
5350                    if value_cmp(&v, &lhs_val) == core::cmp::Ordering::Equal {
5351                        found = true;
5352                        break;
5353                    }
5354                }
5355                let bit = if found {
5356                    !*negated
5357                } else if any_null {
5358                    return Err(EngineError::Unsupported(
5359                        "IN-subquery with NULL in result and no match: NULL semantics not yet implemented".into(),
5360                    ));
5361                } else {
5362                    *negated
5363                };
5364                *e = Expr::Literal(Literal::Bool(bit));
5365            }
5366            Expr::Binary { lhs, rhs, .. } => {
5367                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5368                self.resolve_correlated_in_expr(rhs, row, ctx, cancel, memo.as_deref_mut())?;
5369            }
5370            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5371                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5372            }
5373            Expr::Like { expr, pattern, .. } => {
5374                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5375                self.resolve_correlated_in_expr(pattern, row, ctx, cancel, memo.as_deref_mut())?;
5376            }
5377            Expr::FunctionCall { args, .. } => {
5378                for a in args {
5379                    self.resolve_correlated_in_expr(a, row, ctx, cancel, memo.as_deref_mut())?;
5380                }
5381            }
5382            Expr::Extract { source, .. } => {
5383                self.resolve_correlated_in_expr(source, row, ctx, cancel, memo.as_deref_mut())?;
5384            }
5385            Expr::WindowFunction { .. } | Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
5386        }
5387        Ok(())
5388    }
5389
5390    fn subquery_replacement(
5391        &self,
5392        e: &Expr,
5393        cancel: CancelToken<'_>,
5394    ) -> Result<Option<Expr>, EngineError> {
5395        match e {
5396            Expr::ScalarSubquery(inner) => {
5397                let mut s = (**inner).clone();
5398                // Recurse into the inner SELECT first so nested
5399                // subqueries materialise bottom-up.
5400                self.resolve_select_subqueries(&mut s, cancel)?;
5401                let r = match self.exec_bare_select_cancel(&s, cancel) {
5402                    Ok(r) => r,
5403                    Err(e) if is_correlation_error(&e) => return Ok(None),
5404                    Err(e) => return Err(e),
5405                };
5406                let QueryResult::Rows { rows, .. } = r else {
5407                    return Err(EngineError::Unsupported(
5408                        "scalar subquery: inner statement did not return rows".into(),
5409                    ));
5410                };
5411                let value = match rows.as_slice() {
5412                    [] => Value::Null,
5413                    [row] => row.values.first().cloned().unwrap_or(Value::Null),
5414                    _ => {
5415                        return Err(EngineError::Unsupported(alloc::format!(
5416                            "scalar subquery returned {} rows; expected 0 or 1",
5417                            rows.len()
5418                        )));
5419                    }
5420                };
5421                Ok(Some(value_to_literal_expr(value)?))
5422            }
5423            Expr::Exists { subquery, negated } => {
5424                let mut s = (**subquery).clone();
5425                self.resolve_select_subqueries(&mut s, cancel)?;
5426                let r = match self.exec_bare_select_cancel(&s, cancel) {
5427                    Ok(r) => r,
5428                    Err(e) if is_correlation_error(&e) => return Ok(None),
5429                    Err(e) => return Err(e),
5430                };
5431                let exists = match r {
5432                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
5433                    QueryResult::CommandOk { .. } => false,
5434                };
5435                let bit = if *negated { !exists } else { exists };
5436                Ok(Some(Expr::Literal(Literal::Bool(bit))))
5437            }
5438            Expr::InSubquery {
5439                expr,
5440                subquery,
5441                negated,
5442            } => {
5443                let mut s = (**subquery).clone();
5444                self.resolve_select_subqueries(&mut s, cancel)?;
5445                let r = match self.exec_bare_select_cancel(&s, cancel) {
5446                    Ok(r) => r,
5447                    Err(e) if is_correlation_error(&e) => return Ok(None),
5448                    Err(e) => return Err(e),
5449                };
5450                let QueryResult::Rows { columns, rows, .. } = r else {
5451                    return Err(EngineError::Unsupported(
5452                        "IN-subquery: inner statement did not return rows".into(),
5453                    ));
5454                };
5455                if columns.len() != 1 {
5456                    return Err(EngineError::Unsupported(alloc::format!(
5457                        "IN-subquery must project exactly one column; got {}",
5458                        columns.len()
5459                    )));
5460                }
5461                // Build the same OR-Eq chain the parse-time literal-list
5462                // path constructs, with each value lifted into a Literal.
5463                let mut acc: Option<Expr> = None;
5464                for row in rows {
5465                    let v = row.values.into_iter().next().unwrap_or(Value::Null);
5466                    let lit = value_to_literal_expr(v)?;
5467                    let cmp = Expr::Binary {
5468                        lhs: expr.clone(),
5469                        op: BinOp::Eq,
5470                        rhs: Box::new(lit),
5471                    };
5472                    acc = Some(match acc {
5473                        None => cmp,
5474                        Some(prev) => Expr::Binary {
5475                            lhs: Box::new(prev),
5476                            op: BinOp::Or,
5477                            rhs: Box::new(cmp),
5478                        },
5479                    });
5480                }
5481                let combined = acc.unwrap_or(Expr::Literal(Literal::Bool(false)));
5482                let final_expr = if *negated {
5483                    Expr::Unary {
5484                        op: UnOp::Not,
5485                        expr: Box::new(combined),
5486                    }
5487                } else {
5488                    combined
5489                };
5490                Ok(Some(final_expr))
5491            }
5492            _ => Ok(None),
5493        }
5494    }
5495}
5496
5497// ---- v4.12 window-function helpers ----
5498// The (partition-key, order-key, original-index) tuple shape used
5499// across these helpers is intrinsic to the planner. Factoring it
5500// into a typedef adds indirection without making the code clearer,
5501// so several lints are allowed inline on the affected functions
5502// rather than module-wide.
5503
5504/// v4.22: cheap structural scan for `FROM <name>` (qualified or
5505/// not) inside a SELECT — used to verify the anchor of a WITH
5506/// RECURSIVE CTE doesn't recurse into itself. Conservative: walks
5507/// FROM joins, subqueries, and unions.
5508fn select_refers_to(stmt: &SelectStatement, target: &str) -> bool {
5509    if let Some(from) = &stmt.from
5510        && from_refers_to(from, target)
5511    {
5512        return true;
5513    }
5514    for (_, peer) in &stmt.unions {
5515        if select_refers_to(peer, target) {
5516            return true;
5517        }
5518    }
5519    for item in &stmt.items {
5520        if let SelectItem::Expr { expr, .. } = item
5521            && expr_refers_to(expr, target)
5522        {
5523            return true;
5524        }
5525    }
5526    if let Some(w) = &stmt.where_
5527        && expr_refers_to(w, target)
5528    {
5529        return true;
5530    }
5531    false
5532}
5533
5534fn from_refers_to(from: &FromClause, target: &str) -> bool {
5535    if from.primary.name.eq_ignore_ascii_case(target) {
5536        return true;
5537    }
5538    from.joins
5539        .iter()
5540        .any(|j| j.table.name.eq_ignore_ascii_case(target))
5541}
5542
5543fn expr_refers_to(e: &Expr, target: &str) -> bool {
5544    match e {
5545        Expr::ScalarSubquery(s) => select_refers_to(s, target),
5546        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
5547            select_refers_to(subquery, target)
5548        }
5549        Expr::Binary { lhs, rhs, .. } => expr_refers_to(lhs, target) || expr_refers_to(rhs, target),
5550        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5551            expr_refers_to(expr, target)
5552        }
5553        Expr::Like { expr, pattern, .. } => {
5554            expr_refers_to(expr, target) || expr_refers_to(pattern, target)
5555        }
5556        Expr::FunctionCall { args, .. } => args.iter().any(|a| expr_refers_to(a, target)),
5557        Expr::Extract { source, .. } => expr_refers_to(source, target),
5558        Expr::WindowFunction {
5559            args,
5560            partition_by,
5561            order_by,
5562            ..
5563        } => {
5564            args.iter().any(|a| expr_refers_to(a, target))
5565                || partition_by.iter().any(|p| expr_refers_to(p, target))
5566                || order_by.iter().any(|(o, _)| expr_refers_to(o, target))
5567        }
5568        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
5569    }
5570}
5571
5572/// v4.22: pick more specific column types from observed rows when
5573/// the projection builder defaulted to Text (the v1.x behavior for
5574/// non-column expressions). Lets `WITH t(n) AS (SELECT 1 ...)`
5575/// land an Int column in the CTE storage table rather than failing
5576/// the insert with "expected TEXT, got INT".
5577fn infer_column_types(columns: &[ColumnSchema], rows: &[Row]) -> Vec<ColumnSchema> {
5578    let mut out = columns.to_vec();
5579    for (col_idx, col) in out.iter_mut().enumerate() {
5580        if col.ty != DataType::Text {
5581            continue;
5582        }
5583        let mut inferred: Option<DataType> = None;
5584        let mut all_null = true;
5585        for row in rows {
5586            let Some(v) = row.values.get(col_idx) else {
5587                continue;
5588            };
5589            let ty = match v {
5590                Value::Null => continue,
5591                Value::SmallInt(_) => DataType::SmallInt,
5592                Value::Int(_) => DataType::Int,
5593                Value::BigInt(_) => DataType::BigInt,
5594                Value::Float(_) => DataType::Float,
5595                Value::Bool(_) => DataType::Bool,
5596                Value::Vector(_) => DataType::Vector {
5597                    dim: 0,
5598                    encoding: VecEncoding::F32,
5599                },
5600                _ => DataType::Text,
5601            };
5602            all_null = false;
5603            inferred = Some(match inferred {
5604                None => ty,
5605                Some(prev) if prev == ty => prev,
5606                Some(_) => DataType::Text,
5607            });
5608        }
5609        if let Some(t) = inferred {
5610            col.ty = t;
5611            col.nullable = true;
5612        } else if all_null {
5613            col.nullable = true;
5614        }
5615    }
5616    out
5617}
5618
5619/// v4.26: render a human-readable plan tree for `EXPLAIN <select>`.
5620/// Lines are pushed into `out`; `depth` controls indentation. We
5621/// describe the rewritten SELECT — what the executor *would* do —
5622/// using the engine handle to spot indexed lookups and table shapes.
5623#[allow(clippy::too_many_lines, clippy::format_push_string)]
5624/// v6.2.4 — Walk every line of the rendered plan tree and append
5625/// per-operator stats. Lines that name a known operator get
5626/// `(rows=N)` (`actual_rows` of the top-level operator equals the
5627/// final result row count; scans report their catalog row count
5628/// as the rows-considered metric). Other lines — Filter / Join /
5629/// GroupBy / OrderBy etc. — are marked `(—)` so the surface is
5630/// complete-by-construction; v6.2.5 fills these in via inline
5631/// executor counters.
5632/// v6.8.3 — surface "CREATE INDEX …" suggestions for every
5633/// `(table, column)` pair the query touches via WHERE / JOIN
5634/// that doesn't already have an index on the owning table.
5635/// Walks the SELECT's FROM clauses + WHERE expression tree;
5636/// returns one line per missing index. Deterministic order:
5637/// FROM-clause iteration order, then column-reference walk
5638/// order inside each WHERE. Each suggestion is a copy-pastable
5639/// DDL string.
5640fn build_index_suggestions(stmt: &SelectStatement, engine: &Engine) -> Vec<String> {
5641    use alloc::collections::BTreeSet;
5642    let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
5643    let mut out: Vec<String> = Vec::new();
5644    let cat = engine.active_catalog();
5645    // Build a (table, qualifier-or-alias) list from the FROM clause
5646    // so unqualified column refs in WHERE resolve to the correct
5647    // table.
5648    let Some(from) = &stmt.from else {
5649        return out;
5650    };
5651    let mut tables: Vec<String> = Vec::new();
5652    tables.push(from.primary.name.clone());
5653    for j in &from.joins {
5654        tables.push(j.table.name.clone());
5655    }
5656    // Collect column refs from the WHERE expression. JOIN ON
5657    // predicates also feed in.
5658    let mut col_refs: Vec<spg_sql::ast::ColumnName> = Vec::new();
5659    if let Some(w) = &stmt.where_ {
5660        collect_column_refs(w, &mut col_refs);
5661    }
5662    for j in &from.joins {
5663        if let Some(on) = &j.on {
5664            collect_column_refs(on, &mut col_refs);
5665        }
5666    }
5667    for cn in &col_refs {
5668        // Resolve owner table: explicit qualifier first, else
5669        // first table in FROM that has a column of this name.
5670        let owner: Option<String> = if let Some(q) = &cn.qualifier {
5671            tables.iter().find(|t| t == &q).cloned()
5672        } else {
5673            tables.iter().find_map(|t| {
5674                cat.get(t).and_then(|tbl| {
5675                    if tbl.schema().column_position(&cn.name).is_some() {
5676                        Some(t.clone())
5677                    } else {
5678                        None
5679                    }
5680                })
5681            })
5682        };
5683        let Some(owner) = owner else {
5684            continue;
5685        };
5686        let Some(tbl) = cat.get(&owner) else {
5687            continue;
5688        };
5689        let Some(col_pos) = tbl.schema().column_position(&cn.name) else {
5690            continue;
5691        };
5692        // Skip if any BTree index already covers this column as
5693        // its key.
5694        let already_indexed = tbl.indices().iter().any(|i| {
5695            matches!(i.kind, spg_storage::IndexKind::BTree(_))
5696                && i.column_position == col_pos
5697                && i.expression.is_none()
5698                && i.partial_predicate.is_none()
5699        });
5700        if already_indexed {
5701            continue;
5702        }
5703        if seen.insert((owner.clone(), cn.name.clone())) {
5704            out.push(alloc::format!(
5705                "SUGGEST: CREATE INDEX ix_{}_{} ON {} ({})",
5706                owner,
5707                cn.name,
5708                owner,
5709                cn.name
5710            ));
5711        }
5712    }
5713    out
5714}
5715
5716/// Walks an `Expr` and pushes every `ColumnName` it references.
5717/// Order is depth-first, left-to-right.
5718fn collect_column_refs(expr: &Expr, out: &mut Vec<spg_sql::ast::ColumnName>) {
5719    match expr {
5720        Expr::Column(cn) => out.push(cn.clone()),
5721        Expr::FunctionCall { args, .. } => {
5722            for a in args {
5723                collect_column_refs(a, out);
5724            }
5725        }
5726        Expr::Binary { lhs, rhs, .. } => {
5727            collect_column_refs(lhs, out);
5728            collect_column_refs(rhs, out);
5729        }
5730        Expr::Unary { expr: e, .. } => collect_column_refs(e, out),
5731        _ => {}
5732    }
5733}
5734
5735fn annotate_explain_lines(lines: &mut [String], total_rows: usize, engine: &Engine) {
5736    let catalog = engine.active_catalog();
5737    let cold_ids = catalog.cold_segment_ids_global();
5738    let any_cold = !cold_ids.is_empty();
5739    let cold_ids_repr = if any_cold {
5740        let mut s = alloc::string::String::from("[");
5741        for (i, id) in cold_ids.iter().enumerate() {
5742            if i > 0 {
5743                s.push(',');
5744            }
5745            s.push_str(&alloc::format!("{id}"));
5746        }
5747        s.push(']');
5748        s
5749    } else {
5750        alloc::string::String::new()
5751    };
5752    for (idx, line) in lines.iter_mut().enumerate() {
5753        let trimmed = line.trim_start();
5754        let is_top_level = idx == 0;
5755        if is_top_level {
5756            line.push_str(&alloc::format!(" (rows={total_rows})"));
5757            continue;
5758        }
5759        if let Some(rest) = trimmed.strip_prefix("From: ") {
5760            let (name, scan_kind) = match rest.split_once(" [") {
5761                Some((n, k)) => (n.trim(), k.trim_end_matches(']')),
5762                None => (rest.trim(), ""),
5763            };
5764            let bare = name.split_whitespace().next().unwrap_or(name);
5765            let hot = catalog.get(bare).map(|t| t.rows().len());
5766            // v6.2.7 — `cold_segments=[id0,id1,…]` enumerates every
5767            // cold-tier segment the scan COULD have walked. v6.2.x
5768            // can tighten to per-table by walking the table's
5769            // BTree-index cold locators.
5770            let annot = match (hot, scan_kind) {
5771                (Some(h), "full scan") => {
5772                    let mut s = alloc::format!(" (hot_rows={h}");
5773                    if any_cold {
5774                        s.push_str(&alloc::format!(
5775                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5776                        ));
5777                    }
5778                    s.push(')');
5779                    s
5780                }
5781                (Some(h), "index seek") => {
5782                    let mut s = alloc::format!(" (hot_rows≤{h}");
5783                    if any_cold {
5784                        s.push_str(&alloc::format!(
5785                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5786                        ));
5787                    }
5788                    s.push(')');
5789                    s
5790                }
5791                _ => " (rows=—)".to_string(),
5792            };
5793            line.push_str(&annot);
5794            continue;
5795        }
5796        // Filter / GroupBy / Having / OrderBy / Limit / Join etc.
5797        line.push_str(" (rows=—)");
5798    }
5799}
5800
5801fn explain_select(stmt: &SelectStatement, engine: &Engine, depth: usize, out: &mut Vec<String>) {
5802    let pad = "  ".repeat(depth);
5803    // 1) Top-level operator label.
5804    let top = if !stmt.ctes.is_empty() {
5805        if stmt.ctes.iter().any(|c| c.recursive) {
5806            "CTEScan (WITH RECURSIVE)"
5807        } else {
5808            "CTEScan (WITH)"
5809        }
5810    } else if !stmt.unions.is_empty() {
5811        "UnionScan"
5812    } else if select_has_window(stmt) {
5813        "WindowAgg"
5814    } else if aggregate::uses_aggregate(stmt) {
5815        "Aggregate"
5816    } else if stmt.distinct {
5817        "Distinct"
5818    } else if stmt.from.is_some() {
5819        "TableScan"
5820    } else {
5821        "Result"
5822    };
5823    out.push(alloc::format!("{pad}{top}"));
5824    let child = "  ".repeat(depth + 1);
5825    // 2) CTE bodies.
5826    for cte in &stmt.ctes {
5827        let head = if cte.recursive {
5828            alloc::format!("{child}CTE (recursive): {}", cte.name)
5829        } else {
5830            alloc::format!("{child}CTE: {}", cte.name)
5831        };
5832        out.push(head);
5833        explain_select(&cte.body, engine, depth + 2, out);
5834    }
5835    // 3) FROM details — primary table + joins, index hits.
5836    if let Some(from) = &stmt.from {
5837        let mut tag = alloc::format!("{child}From: {}", from.primary.name);
5838        if let Some(alias) = &from.primary.alias {
5839            tag.push_str(&alloc::format!(" AS {alias}"));
5840        }
5841        // Try to detect an index-seek opportunity on WHERE against
5842        // the primary table — same heuristic the executor uses.
5843        if let Some(w) = &stmt.where_
5844            && let Some(table) = engine.active_catalog().get(&from.primary.name)
5845        {
5846            let alias = from.primary.alias.as_deref().unwrap_or(&from.primary.name);
5847            let cols = &table.schema().columns;
5848            if try_index_seek(w, cols, engine.active_catalog(), table, alias).is_some() {
5849                tag.push_str(" [index seek]");
5850            } else {
5851                tag.push_str(" [full scan]");
5852            }
5853        } else {
5854            tag.push_str(" [full scan]");
5855        }
5856        out.push(tag);
5857        for j in &from.joins {
5858            let kind = match j.kind {
5859                spg_sql::ast::JoinKind::Inner => "INNER JOIN",
5860                spg_sql::ast::JoinKind::Left => "LEFT JOIN",
5861                spg_sql::ast::JoinKind::Cross => "CROSS JOIN",
5862            };
5863            let mut s = alloc::format!("{child}{kind}: {}", j.table.name);
5864            if let Some(alias) = &j.table.alias {
5865                s.push_str(&alloc::format!(" AS {alias}"));
5866            }
5867            if j.on.is_some() {
5868                s.push_str(" (ON …)");
5869            }
5870            out.push(s);
5871        }
5872    }
5873    // 4) WHERE / GROUP BY / HAVING / ORDER BY / LIMIT / OFFSET.
5874    if let Some(w) = &stmt.where_ {
5875        let mut s = alloc::format!("{child}Filter: {w}");
5876        if expr_has_subquery(w) {
5877            s.push_str(" [subquery]");
5878        }
5879        out.push(s);
5880    }
5881    if let Some(gs) = &stmt.group_by {
5882        let mut parts = Vec::new();
5883        for g in gs {
5884            parts.push(alloc::format!("{g}"));
5885        }
5886        out.push(alloc::format!("{child}GroupBy: {}", parts.join(", ")));
5887    }
5888    if let Some(h) = &stmt.having {
5889        out.push(alloc::format!("{child}Having: {h}"));
5890    }
5891    for o in &stmt.order_by {
5892        let dir = if o.desc { "DESC" } else { "ASC" };
5893        out.push(alloc::format!("{child}OrderBy: {} {dir}", o.expr));
5894    }
5895    if let Some(lim) = stmt.limit {
5896        out.push(alloc::format!("{child}Limit: {lim}"));
5897    }
5898    if let Some(off) = stmt.offset {
5899        out.push(alloc::format!("{child}Offset: {off}"));
5900    }
5901    // 5) Projection — collapse Wildcard or render N items.
5902    if stmt
5903        .items
5904        .iter()
5905        .any(|it| matches!(it, SelectItem::Wildcard))
5906    {
5907        out.push(alloc::format!("{child}Project: *"));
5908    } else {
5909        out.push(alloc::format!(
5910            "{child}Project: {} item(s)",
5911            stmt.items.len()
5912        ));
5913    }
5914    // 6) Recurse into UNION peers.
5915    for (kind, peer) in &stmt.unions {
5916        let label = match kind {
5917            UnionKind::All => "UNION ALL",
5918            UnionKind::Distinct => "UNION",
5919        };
5920        out.push(alloc::format!("{child}{label}"));
5921        explain_select(peer, engine, depth + 2, out);
5922    }
5923}
5924
5925/// v4.23: recognise the engine errors that indicate the inner
5926/// SELECT couldn't be evaluated in isolation because it references
5927/// an outer column — used by `subquery_replacement` to skip
5928/// materialisation and let row-eval handle it instead.
5929fn is_correlation_error(e: &EngineError) -> bool {
5930    matches!(
5931        e,
5932        EngineError::Eval(
5933            eval::EvalError::ColumnNotFound { .. } | eval::EvalError::UnknownQualifier { .. }
5934        )
5935    )
5936}
5937
5938/// v4.23: walk every Expr in `stmt` and replace each Column ref
5939/// that targets the outer scope (qualifier matches the outer
5940/// table alias) with a Literal carrying the outer row's value.
5941/// Conservative: only qualified refs are substituted, so the user
5942/// must write `outer_alias.col` to reference an outer column. This
5943/// matches PG's lexical scoping for correlated subqueries and
5944/// avoids accidentally rebinding inner columns of the same name.
5945fn substitute_outer_columns(stmt: &mut SelectStatement, row: &Row, ctx: &EvalContext<'_>) {
5946    let Some(outer_alias) = ctx.table_alias else {
5947        return;
5948    };
5949    substitute_in_select(stmt, row, ctx, outer_alias);
5950}
5951
5952fn substitute_in_select(
5953    stmt: &mut SelectStatement,
5954    row: &Row,
5955    ctx: &EvalContext<'_>,
5956    outer_alias: &str,
5957) {
5958    for item in &mut stmt.items {
5959        if let SelectItem::Expr { expr, .. } = item {
5960            substitute_in_expr(expr, row, ctx, outer_alias);
5961        }
5962    }
5963    if let Some(w) = &mut stmt.where_ {
5964        substitute_in_expr(w, row, ctx, outer_alias);
5965    }
5966    if let Some(gs) = &mut stmt.group_by {
5967        for g in gs {
5968            substitute_in_expr(g, row, ctx, outer_alias);
5969        }
5970    }
5971    if let Some(h) = &mut stmt.having {
5972        substitute_in_expr(h, row, ctx, outer_alias);
5973    }
5974    for o in &mut stmt.order_by {
5975        substitute_in_expr(&mut o.expr, row, ctx, outer_alias);
5976    }
5977    for (_, peer) in &mut stmt.unions {
5978        substitute_in_select(peer, row, ctx, outer_alias);
5979    }
5980}
5981
5982fn substitute_in_expr(e: &mut Expr, row: &Row, ctx: &EvalContext<'_>, outer_alias: &str) {
5983    if let Expr::Column(c) = e
5984        && let Some(qual) = &c.qualifier
5985        && qual.eq_ignore_ascii_case(outer_alias)
5986    {
5987        // Look up the column's index in the outer schema.
5988        if let Some(idx) = ctx
5989            .columns
5990            .iter()
5991            .position(|sc| sc.name.eq_ignore_ascii_case(&c.name))
5992        {
5993            let v = row.values.get(idx).cloned().unwrap_or(Value::Null);
5994            if let Ok(lit) = value_to_literal_expr(v) {
5995                *e = lit;
5996                return;
5997            }
5998        }
5999    }
6000    match e {
6001        Expr::Binary { lhs, rhs, .. } => {
6002            substitute_in_expr(lhs, row, ctx, outer_alias);
6003            substitute_in_expr(rhs, row, ctx, outer_alias);
6004        }
6005        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6006            substitute_in_expr(expr, row, ctx, outer_alias);
6007        }
6008        Expr::Like { expr, pattern, .. } => {
6009            substitute_in_expr(expr, row, ctx, outer_alias);
6010            substitute_in_expr(pattern, row, ctx, outer_alias);
6011        }
6012        Expr::FunctionCall { args, .. } => {
6013            for a in args {
6014                substitute_in_expr(a, row, ctx, outer_alias);
6015            }
6016        }
6017        Expr::Extract { source, .. } => substitute_in_expr(source, row, ctx, outer_alias),
6018        Expr::WindowFunction {
6019            args,
6020            partition_by,
6021            order_by,
6022            ..
6023        } => {
6024            for a in args {
6025                substitute_in_expr(a, row, ctx, outer_alias);
6026            }
6027            for p in partition_by {
6028                substitute_in_expr(p, row, ctx, outer_alias);
6029            }
6030            for (o, _) in order_by {
6031                substitute_in_expr(o, row, ctx, outer_alias);
6032            }
6033        }
6034        Expr::ScalarSubquery(s) => substitute_in_select(s, row, ctx, outer_alias),
6035        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
6036            substitute_in_select(subquery, row, ctx, outer_alias);
6037        }
6038        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
6039    }
6040}
6041
6042/// v4.22: encode a Row to a comparable byte key for UNION-DISTINCT
6043/// dedup inside the recursive iteration. Crude but deterministic
6044/// — Debug prints embed type discriminants so NULL ≠ "" ≠ 0.
6045fn encode_row_key(row: &Row) -> Vec<u8> {
6046    let mut out = Vec::new();
6047    for v in &row.values {
6048        let s = alloc::format!("{v:?}|");
6049        out.extend_from_slice(s.as_bytes());
6050    }
6051    out
6052}
6053
6054fn select_has_window(stmt: &SelectStatement) -> bool {
6055    for item in &stmt.items {
6056        if let SelectItem::Expr { expr, .. } = item
6057            && expr_has_window(expr)
6058        {
6059            return true;
6060        }
6061    }
6062    false
6063}
6064
6065fn expr_has_window(e: &Expr) -> bool {
6066    match e {
6067        Expr::WindowFunction { .. } => true,
6068        Expr::Binary { lhs, rhs, .. } => expr_has_window(lhs) || expr_has_window(rhs),
6069        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6070            expr_has_window(expr)
6071        }
6072        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_window),
6073        Expr::Like { expr, pattern, .. } => expr_has_window(expr) || expr_has_window(pattern),
6074        Expr::Extract { source, .. } => expr_has_window(source),
6075        Expr::ScalarSubquery(_)
6076        | Expr::Exists { .. }
6077        | Expr::InSubquery { .. }
6078        | Expr::Literal(_)
6079        | Expr::Placeholder(_)
6080        | Expr::Column(_) => false,
6081    }
6082}
6083
6084fn collect_window_nodes(e: &Expr, out: &mut Vec<Expr>) {
6085    if let Expr::WindowFunction { .. } = e {
6086        // Deduplicate by structural equality on the expression
6087        // (cheap because window args + partition + order are
6088        // small). Without dedup we'd recompute identical windows
6089        // once per occurrence in the projection.
6090        if !out.iter().any(|x| x == e) {
6091            out.push(e.clone());
6092        }
6093        return;
6094    }
6095    match e {
6096        // Already handled by the early-return at the top.
6097        Expr::WindowFunction { .. } => unreachable!(),
6098        Expr::Binary { lhs, rhs, .. } => {
6099            collect_window_nodes(lhs, out);
6100            collect_window_nodes(rhs, out);
6101        }
6102        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6103            collect_window_nodes(expr, out);
6104        }
6105        Expr::FunctionCall { args, .. } => {
6106            for a in args {
6107                collect_window_nodes(a, out);
6108            }
6109        }
6110        Expr::Like { expr, pattern, .. } => {
6111            collect_window_nodes(expr, out);
6112            collect_window_nodes(pattern, out);
6113        }
6114        Expr::Extract { source, .. } => collect_window_nodes(source, out),
6115        _ => {}
6116    }
6117}
6118
6119fn rewrite_window_to_columns(e: &mut Expr, window_nodes: &[Expr]) {
6120    if let Expr::WindowFunction { .. } = e
6121        && let Some(idx) = window_nodes.iter().position(|w| w == e)
6122    {
6123        *e = Expr::Column(spg_sql::ast::ColumnName {
6124            qualifier: None,
6125            name: alloc::format!("__win_{idx}"),
6126        });
6127        return;
6128    }
6129    match e {
6130        Expr::Binary { lhs, rhs, .. } => {
6131            rewrite_window_to_columns(lhs, window_nodes);
6132            rewrite_window_to_columns(rhs, window_nodes);
6133        }
6134        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6135            rewrite_window_to_columns(expr, window_nodes);
6136        }
6137        Expr::FunctionCall { args, .. } => {
6138            for a in args {
6139                rewrite_window_to_columns(a, window_nodes);
6140            }
6141        }
6142        Expr::Like { expr, pattern, .. } => {
6143            rewrite_window_to_columns(expr, window_nodes);
6144            rewrite_window_to_columns(pattern, window_nodes);
6145        }
6146        Expr::Extract { source, .. } => rewrite_window_to_columns(source, window_nodes),
6147        _ => {}
6148    }
6149}
6150
6151/// Total order over partition-key tuples. NULL sorts as the
6152/// lowest value (matches the `<` partial order's NULL-last
6153/// behaviour with `INFINITY` flipped).
6154fn partition_key_cmp(a: &[Value], b: &[Value]) -> core::cmp::Ordering {
6155    for (x, y) in a.iter().zip(b.iter()) {
6156        let c = value_cmp(x, y);
6157        if c != core::cmp::Ordering::Equal {
6158            return c;
6159        }
6160    }
6161    a.len().cmp(&b.len())
6162}
6163
6164fn order_key_cmp(a: &[(Value, bool)], b: &[(Value, bool)]) -> core::cmp::Ordering {
6165    for ((va, desc), (vb, _)) in a.iter().zip(b.iter()) {
6166        let c = value_cmp(va, vb);
6167        let c = if *desc { c.reverse() } else { c };
6168        if c != core::cmp::Ordering::Equal {
6169            return c;
6170        }
6171    }
6172    a.len().cmp(&b.len())
6173}
6174
6175#[allow(clippy::match_same_arms)] // explicit arms per type document the supported pairs
6176fn value_cmp(a: &Value, b: &Value) -> core::cmp::Ordering {
6177    use core::cmp::Ordering;
6178    match (a, b) {
6179        (Value::Null, Value::Null) => Ordering::Equal,
6180        (Value::Null, _) => Ordering::Less,
6181        (_, Value::Null) => Ordering::Greater,
6182        (Value::Int(x), Value::Int(y)) => x.cmp(y),
6183        (Value::BigInt(x), Value::BigInt(y)) => x.cmp(y),
6184        (Value::SmallInt(x), Value::SmallInt(y)) => x.cmp(y),
6185        (Value::Text(x), Value::Text(y)) => x.cmp(y),
6186        (Value::Bool(x), Value::Bool(y)) => x.cmp(y),
6187        (Value::Float(x), Value::Float(y)) => x.partial_cmp(y).unwrap_or(Ordering::Equal),
6188        (Value::Date(x), Value::Date(y)) => x.cmp(y),
6189        (Value::Timestamp(x), Value::Timestamp(y)) => x.cmp(y),
6190        // Cross-type compare: fall back to the debug rendering —
6191        // same-partition is the goal, exact order is irrelevant.
6192        _ => alloc::format!("{a:?}").cmp(&alloc::format!("{b:?}")),
6193    }
6194}
6195
6196/// Compute the window function's per-row output for one partition.
6197/// `slice` has (partition key, order key, original-row-index)
6198/// tuples already sorted by order key. `filtered_rows` is the
6199/// full row list indexed by original-row-index. `out_vals` is
6200/// the destination, also indexed by original-row-index.
6201#[allow(
6202    clippy::too_many_arguments,
6203    clippy::cast_possible_truncation,
6204    clippy::cast_possible_wrap,
6205    clippy::cast_precision_loss,
6206    clippy::cast_sign_loss,
6207    clippy::doc_markdown,
6208    clippy::too_many_lines,
6209    clippy::type_complexity,
6210    clippy::match_same_arms
6211)]
6212fn compute_window_partition(
6213    name: &str,
6214    args: &[Expr],
6215    ordered: bool,
6216    frame: Option<&WindowFrame>,
6217    null_treatment: spg_sql::ast::NullTreatment,
6218    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6219    filtered_rows: &[&Row],
6220    ctx: &EvalContext<'_>,
6221    out_vals: &mut [Value],
6222) -> Result<(), EngineError> {
6223    let ignore_nulls = matches!(null_treatment, spg_sql::ast::NullTreatment::Ignore);
6224    let lower = name.to_ascii_lowercase();
6225    match lower.as_str() {
6226        "row_number" => {
6227            for (rank, (_, _, idx)) in slice.iter().enumerate() {
6228                out_vals[*idx] = Value::BigInt((rank + 1) as i64);
6229            }
6230            Ok(())
6231        }
6232        "rank" => {
6233            let mut prev_key: Option<&[(Value, bool)]> = None;
6234            let mut current_rank: i64 = 1;
6235            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6236                if let Some(p) = prev_key
6237                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6238                {
6239                    current_rank = (i + 1) as i64;
6240                }
6241                if prev_key.is_none() {
6242                    current_rank = 1;
6243                }
6244                out_vals[*idx] = Value::BigInt(current_rank);
6245                prev_key = Some(okey.as_slice());
6246            }
6247            Ok(())
6248        }
6249        "dense_rank" => {
6250            let mut prev_key: Option<&[(Value, bool)]> = None;
6251            let mut current_rank: i64 = 0;
6252            for (_, okey, idx) in slice {
6253                if prev_key.is_none_or(|p| order_key_cmp(p, okey) != core::cmp::Ordering::Equal) {
6254                    current_rank += 1;
6255                }
6256                out_vals[*idx] = Value::BigInt(current_rank);
6257                prev_key = Some(okey.as_slice());
6258            }
6259            Ok(())
6260        }
6261        "sum" | "avg" | "min" | "max" | "count" | "count_star" => {
6262            // Pre-evaluate the function arg per row in the slice
6263            // (count_star has no arg).
6264            let arg_values: Vec<Value> = if lower == "count_star" || args.is_empty() {
6265                slice.iter().map(|_| Value::Null).collect()
6266            } else {
6267                slice
6268                    .iter()
6269                    .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6270                    .collect::<Result<_, _>>()
6271                    .map_err(EngineError::Eval)?
6272            };
6273            // v4.20: pick the effective frame. Explicit frame
6274            // overrides the implicit default (running for ordered,
6275            // whole-partition for unordered).
6276            let eff = effective_frame(frame, ordered)?;
6277            #[allow(clippy::needless_range_loop)]
6278            for i in 0..slice.len() {
6279                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6280                let mut sum: f64 = 0.0;
6281                let mut count: i64 = 0;
6282                let mut min_v: Option<f64> = None;
6283                let mut max_v: Option<f64> = None;
6284                let mut row_count: i64 = 0;
6285                if lo <= hi {
6286                    for j in lo..=hi {
6287                        let v = &arg_values[j];
6288                        match lower.as_str() {
6289                            "count_star" => row_count += 1,
6290                            "count" => {
6291                                if !v.is_null() {
6292                                    count += 1;
6293                                }
6294                            }
6295                            _ => {
6296                                if let Some(x) = value_to_f64(v) {
6297                                    sum += x;
6298                                    count += 1;
6299                                    min_v = Some(min_v.map_or(x, |m| m.min(x)));
6300                                    max_v = Some(max_v.map_or(x, |m| m.max(x)));
6301                                }
6302                            }
6303                        }
6304                    }
6305                }
6306                let value = match lower.as_str() {
6307                    "count_star" => Value::BigInt(row_count),
6308                    "count" => Value::BigInt(count),
6309                    "sum" => Value::Float(sum),
6310                    "avg" => {
6311                        if count == 0 {
6312                            Value::Null
6313                        } else {
6314                            Value::Float(sum / count as f64)
6315                        }
6316                    }
6317                    "min" => min_v.map_or(Value::Null, Value::Float),
6318                    "max" => max_v.map_or(Value::Null, Value::Float),
6319                    _ => unreachable!(),
6320                };
6321                let (_, _, idx) = &slice[i];
6322                out_vals[*idx] = value;
6323            }
6324            Ok(())
6325        }
6326        "lag" | "lead" => {
6327            // lag(expr [, offset [, default]])
6328            // lead(expr [, offset [, default]])
6329            if args.is_empty() {
6330                return Err(EngineError::Unsupported(alloc::format!(
6331                    "{lower}() requires at least one argument"
6332                )));
6333            }
6334            let offset: i64 = if args.len() >= 2 {
6335                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6336                    .map_err(EngineError::Eval)?;
6337                match v {
6338                    Value::SmallInt(n) => i64::from(n),
6339                    Value::Int(n) => i64::from(n),
6340                    Value::BigInt(n) => n,
6341                    _ => {
6342                        return Err(EngineError::Unsupported(alloc::format!(
6343                            "{lower}() offset must be integer"
6344                        )));
6345                    }
6346                }
6347            } else {
6348                1
6349            };
6350            let default: Value = if args.len() >= 3 {
6351                eval::eval_expr(&args[2], filtered_rows[slice[0].2], ctx)
6352                    .map_err(EngineError::Eval)?
6353            } else {
6354                Value::Null
6355            };
6356            let values: Vec<Value> = slice
6357                .iter()
6358                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6359                .collect::<Result<_, _>>()
6360                .map_err(EngineError::Eval)?;
6361            let n = slice.len();
6362            for (i, (_, _, idx)) in slice.iter().enumerate() {
6363                let signed_offset = if lower == "lag" { -offset } else { offset };
6364                let v = if ignore_nulls {
6365                    // v6.4.2 — IGNORE NULLS: walk in the offset direction
6366                    // skipping NULL values; the `offset`-th non-NULL
6367                    // encountered is the result.
6368                    let step: i64 = if signed_offset >= 0 { 1 } else { -1 };
6369                    let needed: i64 = signed_offset.abs();
6370                    if needed == 0 {
6371                        values[i].clone()
6372                    } else {
6373                        let mut j: i64 = i as i64;
6374                        let mut hits: i64 = 0;
6375                        let mut found: Option<Value> = None;
6376                        loop {
6377                            j += step;
6378                            if j < 0 || j >= n as i64 {
6379                                break;
6380                            }
6381                            #[allow(clippy::cast_sign_loss)]
6382                            let v = &values[j as usize];
6383                            if !v.is_null() {
6384                                hits += 1;
6385                                if hits == needed {
6386                                    found = Some(v.clone());
6387                                    break;
6388                                }
6389                            }
6390                        }
6391                        found.unwrap_or_else(|| default.clone())
6392                    }
6393                } else {
6394                    let target_signed = i64::try_from(i).unwrap_or(i64::MAX) + signed_offset;
6395                    if target_signed < 0
6396                        || target_signed >= i64::try_from(n).unwrap_or(i64::MAX)
6397                    {
6398                        default.clone()
6399                    } else {
6400                        #[allow(clippy::cast_sign_loss)]
6401                        {
6402                            values[target_signed as usize].clone()
6403                        }
6404                    }
6405                };
6406                out_vals[*idx] = v;
6407            }
6408            Ok(())
6409        }
6410        "first_value" | "last_value" | "nth_value" => {
6411            if args.is_empty() {
6412                return Err(EngineError::Unsupported(alloc::format!(
6413                    "{lower}() requires at least one argument"
6414                )));
6415            }
6416            let values: Vec<Value> = slice
6417                .iter()
6418                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6419                .collect::<Result<_, _>>()
6420                .map_err(EngineError::Eval)?;
6421            let nth: usize = if lower == "nth_value" {
6422                if args.len() < 2 {
6423                    return Err(EngineError::Unsupported(
6424                        "nth_value() requires (expr, n)".into(),
6425                    ));
6426                }
6427                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6428                    .map_err(EngineError::Eval)?;
6429                let raw = match v {
6430                    Value::SmallInt(n) => i64::from(n),
6431                    Value::Int(n) => i64::from(n),
6432                    Value::BigInt(n) => n,
6433                    _ => {
6434                        return Err(EngineError::Unsupported(
6435                            "nth_value() n must be integer".into(),
6436                        ));
6437                    }
6438                };
6439                if raw < 1 {
6440                    return Err(EngineError::Unsupported(
6441                        "nth_value() n must be >= 1".into(),
6442                    ));
6443                }
6444                #[allow(clippy::cast_sign_loss)]
6445                {
6446                    raw as usize
6447                }
6448            } else {
6449                0
6450            };
6451            let eff = effective_frame(frame, ordered)?;
6452            for i in 0..slice.len() {
6453                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6454                let (_, _, idx) = &slice[i];
6455                let v = if lo > hi {
6456                    Value::Null
6457                } else if ignore_nulls && matches!(lower.as_str(), "first_value" | "last_value") {
6458                    // v6.4.2 — IGNORE NULLS: skip NULL cells when
6459                    // selecting the boundary value within the frame.
6460                    if lower == "first_value" {
6461                        (lo..=hi)
6462                            .find_map(|j| {
6463                                let v = &values[j];
6464                                (!v.is_null()).then(|| v.clone())
6465                            })
6466                            .unwrap_or(Value::Null)
6467                    } else {
6468                        (lo..=hi)
6469                            .rev()
6470                            .find_map(|j| {
6471                                let v = &values[j];
6472                                (!v.is_null()).then(|| v.clone())
6473                            })
6474                            .unwrap_or(Value::Null)
6475                    }
6476                } else {
6477                    match lower.as_str() {
6478                        "first_value" => values[lo].clone(),
6479                        "last_value" => values[hi].clone(),
6480                        "nth_value" => {
6481                            let pos = lo + nth - 1;
6482                            if pos > hi {
6483                                Value::Null
6484                            } else {
6485                                values[pos].clone()
6486                            }
6487                        }
6488                        _ => unreachable!(),
6489                    }
6490                };
6491                out_vals[*idx] = v;
6492            }
6493            Ok(())
6494        }
6495        "ntile" => {
6496            if args.is_empty() {
6497                return Err(EngineError::Unsupported(
6498                    "ntile(n) requires an integer argument".into(),
6499                ));
6500            }
6501            let v = eval::eval_expr(&args[0], filtered_rows[slice[0].2], ctx)
6502                .map_err(EngineError::Eval)?;
6503            let bucket_count: i64 = match v {
6504                Value::SmallInt(n) => i64::from(n),
6505                Value::Int(n) => i64::from(n),
6506                Value::BigInt(n) => n,
6507                _ => {
6508                    return Err(EngineError::Unsupported(
6509                        "ntile() argument must be integer".into(),
6510                    ));
6511                }
6512            };
6513            if bucket_count < 1 {
6514                return Err(EngineError::Unsupported(
6515                    "ntile() argument must be >= 1".into(),
6516                ));
6517            }
6518            #[allow(clippy::cast_sign_loss)]
6519            let buckets = bucket_count as usize;
6520            let n = slice.len();
6521            // Each bucket gets `base` rows; the first `extras` buckets
6522            // get one extra. PG semantics.
6523            let base = n / buckets;
6524            let extras = n % buckets;
6525            let mut bucket: usize = 1;
6526            let mut remaining_in_bucket = if extras > 0 { base + 1 } else { base };
6527            let mut buckets_with_extra_remaining = extras;
6528            for (_, _, idx) in slice {
6529                if remaining_in_bucket == 0 {
6530                    bucket += 1;
6531                    buckets_with_extra_remaining = buckets_with_extra_remaining.saturating_sub(1);
6532                    remaining_in_bucket = if buckets_with_extra_remaining > 0 {
6533                        base + 1
6534                    } else {
6535                        base
6536                    };
6537                    // Edge: if base==0 and extras==0, all rows fit;
6538                    // shouldn't reach here, but guard anyway.
6539                    if remaining_in_bucket == 0 {
6540                        remaining_in_bucket = 1;
6541                    }
6542                }
6543                out_vals[*idx] = Value::BigInt(i64::try_from(bucket).unwrap_or(i64::MAX));
6544                remaining_in_bucket -= 1;
6545            }
6546            Ok(())
6547        }
6548        "percent_rank" => {
6549            // (rank - 1) / (n - 1) where rank is the standard RANK().
6550            // Single-row partitions get 0.
6551            let n = slice.len();
6552            let mut prev_key: Option<&[(Value, bool)]> = None;
6553            let mut current_rank: i64 = 1;
6554            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6555                if let Some(p) = prev_key
6556                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6557                {
6558                    current_rank = i64::try_from(i + 1).unwrap_or(i64::MAX);
6559                }
6560                if prev_key.is_none() {
6561                    current_rank = 1;
6562                }
6563                #[allow(clippy::cast_precision_loss)]
6564                let pr = if n <= 1 {
6565                    0.0
6566                } else {
6567                    (current_rank - 1) as f64 / (n - 1) as f64
6568                };
6569                out_vals[*idx] = Value::Float(pr);
6570                prev_key = Some(okey.as_slice());
6571            }
6572            Ok(())
6573        }
6574        "cume_dist" => {
6575            // # rows up to and including this row's peer group / n.
6576            let n = slice.len();
6577            // First pass: find peer-group-end rank for each row.
6578            for i in 0..slice.len() {
6579                let peer_end = peer_group_end(slice, i);
6580                #[allow(clippy::cast_precision_loss)]
6581                let cd = (peer_end + 1) as f64 / n as f64;
6582                let (_, _, idx) = &slice[i];
6583                out_vals[*idx] = Value::Float(cd);
6584            }
6585            Ok(())
6586        }
6587        other => Err(EngineError::Unsupported(alloc::format!(
6588            "window function {other:?} not supported (v4.21: row_number/rank/dense_rank/sum/avg/count/min/max/lag/lead/first_value/last_value/nth_value/ntile/percent_rank/cume_dist)"
6589        ))),
6590    }
6591}
6592
6593/// v4.20: resolve the user-provided frame down to a normalised
6594/// `(kind, start, end)`. `None` means default — derive from
6595/// `ordered`: ordered ⇒ RANGE UNBOUNDED PRECEDING AND CURRENT ROW,
6596/// unordered ⇒ ROWS UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
6597/// Single-bound shorthand (e.g. `ROWS 5 PRECEDING`) normalises
6598/// end → CURRENT ROW per the PG spec.
6599fn effective_frame(
6600    frame: Option<&WindowFrame>,
6601    ordered: bool,
6602) -> Result<(FrameKind, FrameBound, FrameBound), EngineError> {
6603    match frame {
6604        None => {
6605            if ordered {
6606                Ok((
6607                    FrameKind::Range,
6608                    FrameBound::UnboundedPreceding,
6609                    FrameBound::CurrentRow,
6610                ))
6611            } else {
6612                Ok((
6613                    FrameKind::Rows,
6614                    FrameBound::UnboundedPreceding,
6615                    FrameBound::UnboundedFollowing,
6616                ))
6617            }
6618        }
6619        Some(fr) => {
6620            let end = fr.end.clone().unwrap_or(FrameBound::CurrentRow);
6621            // Reject start > end (a few impossible combinations).
6622            if matches!(fr.start, FrameBound::UnboundedFollowing)
6623                || matches!(end, FrameBound::UnboundedPreceding)
6624            {
6625                return Err(EngineError::Unsupported(alloc::format!(
6626                    "invalid frame: start={:?} end={:?}",
6627                    fr.start,
6628                    end
6629                )));
6630            }
6631            // RANGE OFFSET PRECEDING / FOLLOWING needs value-typed
6632            // arithmetic on the ORDER BY key (e.g. `RANGE BETWEEN
6633            // INTERVAL '1 day' PRECEDING AND CURRENT ROW`). Not
6634            // implemented in v4.20.
6635            if fr.kind == FrameKind::Range
6636                && (matches!(
6637                    fr.start,
6638                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6639                ) || matches!(
6640                    end,
6641                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6642                ))
6643            {
6644                return Err(EngineError::Unsupported(
6645                    "RANGE with explicit offset bounds is not supported (v4.20: only UNBOUNDED / CURRENT ROW for RANGE)".into(),
6646                ));
6647            }
6648            Ok((fr.kind, fr.start.clone(), end))
6649        }
6650    }
6651}
6652
6653/// Compute `(lo, hi)` row-index bounds inside the partition slice
6654/// for the row at position `i`. Inclusive, clamped to
6655/// `[0, slice.len()-1]`. Empty result if `lo > hi`.
6656#[allow(clippy::type_complexity)]
6657fn frame_bounds_for_row(
6658    eff: &(FrameKind, FrameBound, FrameBound),
6659    i: usize,
6660    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6661) -> (usize, usize) {
6662    let (kind, start, end) = eff;
6663    let n = slice.len();
6664    let last = n.saturating_sub(1);
6665    let (mut lo, mut hi) = match kind {
6666        FrameKind::Rows => {
6667            let lo = match start {
6668                FrameBound::UnboundedPreceding => 0,
6669                FrameBound::OffsetPreceding(k) => {
6670                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6671                    i.saturating_sub(k)
6672                }
6673                FrameBound::CurrentRow => i,
6674                FrameBound::OffsetFollowing(k) => {
6675                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6676                    i.saturating_add(k).min(last)
6677                }
6678                FrameBound::UnboundedFollowing => last,
6679            };
6680            let hi = match end {
6681                FrameBound::UnboundedPreceding => 0,
6682                FrameBound::OffsetPreceding(k) => {
6683                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6684                    i.saturating_sub(k)
6685                }
6686                FrameBound::CurrentRow => i,
6687                FrameBound::OffsetFollowing(k) => {
6688                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6689                    i.saturating_add(k).min(last)
6690                }
6691                FrameBound::UnboundedFollowing => last,
6692            };
6693            (lo, hi)
6694        }
6695        FrameKind::Range => {
6696            // RANGE bounds are peer-aware. With only UNBOUNDED and
6697            // CURRENT ROW supported (rejected at effective_frame for
6698            // explicit offsets), the start/end map to the
6699            // partition's full extent at the same-order-key peer
6700            // group boundary.
6701            let lo = match start {
6702                FrameBound::UnboundedPreceding => 0,
6703                FrameBound::CurrentRow => peer_group_start(slice, i),
6704                FrameBound::UnboundedFollowing => last,
6705                _ => unreachable!("offset bounds rejected for RANGE"),
6706            };
6707            let hi = match end {
6708                FrameBound::UnboundedPreceding => 0,
6709                FrameBound::CurrentRow => peer_group_end(slice, i),
6710                FrameBound::UnboundedFollowing => last,
6711                _ => unreachable!("offset bounds rejected for RANGE"),
6712            };
6713            (lo, hi)
6714        }
6715    };
6716    if hi >= n {
6717        hi = last;
6718    }
6719    if lo >= n {
6720        lo = last;
6721    }
6722    (lo, hi)
6723}
6724
6725/// Find the inclusive index of the first row with the same ORDER
6726/// BY key as `slice[i]`. Slice is already sorted by partition then
6727/// order, so peers are contiguous.
6728#[allow(clippy::type_complexity)]
6729fn peer_group_start(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6730    let key = &slice[i].1;
6731    let mut j = i;
6732    while j > 0 && order_key_cmp(&slice[j - 1].1, key) == core::cmp::Ordering::Equal {
6733        j -= 1;
6734    }
6735    j
6736}
6737
6738/// Find the inclusive index of the last row with the same ORDER
6739/// BY key as `slice[i]`.
6740#[allow(clippy::type_complexity)]
6741fn peer_group_end(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6742    let key = &slice[i].1;
6743    let mut j = i;
6744    while j + 1 < slice.len() && order_key_cmp(&slice[j + 1].1, key) == core::cmp::Ordering::Equal {
6745        j += 1;
6746    }
6747    j
6748}
6749
6750fn value_to_f64(v: &Value) -> Option<f64> {
6751    match v {
6752        Value::SmallInt(n) => Some(f64::from(*n)),
6753        Value::Int(n) => Some(f64::from(*n)),
6754        #[allow(clippy::cast_precision_loss)]
6755        Value::BigInt(n) => Some(*n as f64),
6756        Value::Float(x) => Some(*x),
6757        _ => None,
6758    }
6759}
6760
6761/// Quick scan for any subquery-bearing node in a SELECT's WHERE /
6762/// projection / `order_by` — saves cloning the AST when there are
6763/// none (the common case).
6764fn expr_tree_has_subquery(stmt: &SelectStatement) -> bool {
6765    let mut any = false;
6766    for item in &stmt.items {
6767        if let SelectItem::Expr { expr, .. } = item {
6768            any = any || expr_has_subquery(expr);
6769        }
6770    }
6771    if let Some(w) = &stmt.where_ {
6772        any = any || expr_has_subquery(w);
6773    }
6774    if let Some(h) = &stmt.having {
6775        any = any || expr_has_subquery(h);
6776    }
6777    for o in &stmt.order_by {
6778        any = any || expr_has_subquery(&o.expr);
6779    }
6780    for (_, peer) in &stmt.unions {
6781        any = any || expr_tree_has_subquery(peer);
6782    }
6783    any
6784}
6785
6786fn expr_has_subquery(e: &Expr) -> bool {
6787    match e {
6788        Expr::ScalarSubquery(_) | Expr::Exists { .. } | Expr::InSubquery { .. } => true,
6789        Expr::Binary { lhs, rhs, .. } => expr_has_subquery(lhs) || expr_has_subquery(rhs),
6790        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6791            expr_has_subquery(expr)
6792        }
6793        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_subquery),
6794        Expr::Like { expr, pattern, .. } => expr_has_subquery(expr) || expr_has_subquery(pattern),
6795        Expr::Extract { source, .. } => expr_has_subquery(source),
6796        Expr::WindowFunction {
6797            args,
6798            partition_by,
6799            order_by,
6800            ..
6801        } => {
6802            args.iter().any(expr_has_subquery)
6803                || partition_by.iter().any(expr_has_subquery)
6804                || order_by.iter().any(|(e, _)| expr_has_subquery(e))
6805        }
6806        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
6807    }
6808}
6809
6810/// v4.10 helper: materialise a runtime `Value` back into an AST
6811/// `Expr::Literal` for the subquery-rewrite path. Supports the
6812/// types `Literal` can represent (Integer / Float / Text / Bool /
6813/// Null). Date / Timestamp / Numeric / Vector / Interval / JSON
6814/// would lose precision through Literal and aren't supported in
6815/// uncorrelated-subquery results; they error with a clear hint.
6816fn value_to_literal_expr(v: Value) -> Result<Expr, EngineError> {
6817    let lit = match v {
6818        Value::Null => Literal::Null,
6819        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
6820        Value::Int(n) => Literal::Integer(i64::from(n)),
6821        Value::BigInt(n) => Literal::Integer(n),
6822        Value::Float(x) => Literal::Float(x),
6823        Value::Text(s) | Value::Json(s) => Literal::String(s),
6824        Value::Bool(b) => Literal::Bool(b),
6825        other => {
6826            return Err(EngineError::Unsupported(alloc::format!(
6827                "subquery result type {:?} not yet materialisable; cast to text or integer in the inner SELECT",
6828                other.data_type()
6829            )));
6830        }
6831    };
6832    Ok(Expr::Literal(lit))
6833}
6834
6835/// v6.1.1 — walk the prepared `Statement` AST and replace every
6836/// `Expr::Placeholder(n)` with `Expr::Literal(value_to_literal(
6837/// params[n-1]))`. The dispatch downstream sees a `Statement`
6838/// indistinguishable from a simple-query parse, so the exec path
6839/// stays unchanged.
6840///
6841/// Errors fall into one shape: a `$N` references past the bound
6842/// `params.len()`. Out-of-range happens when the Bind didn't
6843/// supply enough values; pgwire surfaces this as a protocol error
6844/// to the client.
6845fn substitute_placeholders(stmt: &mut Statement, params: &[Value]) -> Result<(), EngineError> {
6846    match stmt {
6847        Statement::Select(s) => substitute_select(s, params)?,
6848        Statement::Insert(ins) => {
6849            for row in &mut ins.rows {
6850                for e in row {
6851                    substitute_expr(e, params)?;
6852                }
6853            }
6854        }
6855        Statement::Update(u) => {
6856            for (_, e) in &mut u.assignments {
6857                substitute_expr(e, params)?;
6858            }
6859            if let Some(w) = &mut u.where_ {
6860                substitute_expr(w, params)?;
6861            }
6862        }
6863        Statement::Delete(d) => {
6864            if let Some(w) = &mut d.where_ {
6865                substitute_expr(w, params)?;
6866            }
6867        }
6868        Statement::Explain(e) => substitute_select(&mut e.inner, params)?,
6869        // Other statements (CREATE / BEGIN / SHOW / …) have no
6870        // expression slots; no walk needed.
6871        _ => {}
6872    }
6873    Ok(())
6874}
6875
6876fn substitute_select(
6877    s: &mut SelectStatement,
6878    params: &[Value],
6879) -> Result<(), EngineError> {
6880    for item in &mut s.items {
6881        if let SelectItem::Expr { expr, .. } = item {
6882            substitute_expr(expr, params)?;
6883        }
6884    }
6885    if let Some(w) = &mut s.where_ {
6886        substitute_expr(w, params)?;
6887    }
6888    if let Some(gs) = &mut s.group_by {
6889        for g in gs {
6890            substitute_expr(g, params)?;
6891        }
6892    }
6893    if let Some(h) = &mut s.having {
6894        substitute_expr(h, params)?;
6895    }
6896    for o in &mut s.order_by {
6897        substitute_expr(&mut o.expr, params)?;
6898    }
6899    for (_, peer) in &mut s.unions {
6900        substitute_select(peer, params)?;
6901    }
6902    // v7.9.24 — LIMIT $N / OFFSET $N placeholder resolution.
6903    // mailrs H2. After this pass each LIMIT/OFFSET that was a
6904    // Placeholder is rewritten to Literal so the existing
6905    // `LimitExpr::as_literal` path consumes a concrete u32.
6906    if let Some(le) = s.limit {
6907        s.limit = Some(resolve_limit_placeholder(le, params)?);
6908    }
6909    if let Some(le) = s.offset {
6910        s.offset = Some(resolve_limit_placeholder(le, params)?);
6911    }
6912    Ok(())
6913}
6914
6915fn resolve_limit_placeholder(
6916    le: spg_sql::ast::LimitExpr,
6917    params: &[Value],
6918) -> Result<spg_sql::ast::LimitExpr, EngineError> {
6919    use spg_sql::ast::LimitExpr;
6920    match le {
6921        LimitExpr::Literal(_) => Ok(le),
6922        LimitExpr::Placeholder(n) => {
6923            let idx = usize::from(n).saturating_sub(1);
6924            let v = params.get(idx).ok_or_else(|| {
6925                EngineError::Eval(EvalError::PlaceholderOutOfRange {
6926                    n,
6927                    bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
6928                })
6929            })?;
6930            let int = match v {
6931                Value::SmallInt(x) => Some(i64::from(*x)),
6932                Value::Int(x) => Some(i64::from(*x)),
6933                Value::BigInt(x) => Some(*x),
6934                _ => None,
6935            }
6936            .ok_or_else(|| {
6937                EngineError::Unsupported(alloc::format!(
6938                    "LIMIT/OFFSET ${n} bound to non-integer {v:?}"
6939                ))
6940            })?;
6941            if int < 0 {
6942                return Err(EngineError::Unsupported(alloc::format!(
6943                    "LIMIT/OFFSET ${n} bound to negative value {int}"
6944                )));
6945            }
6946            let bounded = u32::try_from(int).map_err(|_| {
6947                EngineError::Unsupported(alloc::format!(
6948                    "LIMIT/OFFSET ${n} value {int} exceeds u32 range"
6949                ))
6950            })?;
6951            Ok(LimitExpr::Literal(bounded))
6952        }
6953    }
6954}
6955
6956fn substitute_expr(e: &mut Expr, params: &[Value]) -> Result<(), EngineError> {
6957    if let Expr::Placeholder(n) = e {
6958        let idx = usize::from(*n).saturating_sub(1);
6959        let v = params.get(idx).ok_or_else(|| {
6960            EngineError::Eval(EvalError::PlaceholderOutOfRange {
6961                n: *n,
6962                bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
6963            })
6964        })?;
6965        *e = Expr::Literal(value_to_literal(v.clone()));
6966        return Ok(());
6967    }
6968    match e {
6969        Expr::Binary { lhs, rhs, .. } => {
6970            substitute_expr(lhs, params)?;
6971            substitute_expr(rhs, params)?;
6972        }
6973        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6974            substitute_expr(expr, params)?;
6975        }
6976        Expr::FunctionCall { args, .. } => {
6977            for a in args {
6978                substitute_expr(a, params)?;
6979            }
6980        }
6981        Expr::Like { expr, pattern, .. } => {
6982            substitute_expr(expr, params)?;
6983            substitute_expr(pattern, params)?;
6984        }
6985        Expr::Extract { source, .. } => substitute_expr(source, params)?,
6986        Expr::ScalarSubquery(s) => substitute_select(s, params)?,
6987        Expr::Exists { subquery, .. } => substitute_select(subquery, params)?,
6988        Expr::InSubquery { expr, subquery, .. } => {
6989            substitute_expr(expr, params)?;
6990            substitute_select(subquery, params)?;
6991        }
6992        Expr::WindowFunction {
6993            args,
6994            partition_by,
6995            order_by,
6996            ..
6997        } => {
6998            for a in args {
6999                substitute_expr(a, params)?;
7000            }
7001            for p in partition_by {
7002                substitute_expr(p, params)?;
7003            }
7004            for (e, _) in order_by {
7005                substitute_expr(e, params)?;
7006            }
7007        }
7008        Expr::Literal(_) | Expr::Column(_) => {}
7009        // Already handled above.
7010        Expr::Placeholder(_) => unreachable!("Placeholder handled at top of fn"),
7011    }
7012    Ok(())
7013}
7014
// NOTE: the paragraph below describes `value_to_literal` (defined
// further down in this file), not the function that follows it. It is
// kept as a plain comment so rustdoc does not attach it to the wrong
// item.
//
// v6.1.1 — convert a runtime `Value` into the closest matching
// `Literal` for the substitute walker. Lossless for the simple
// scalars (Int / Float / Text / Bool); Numeric / Date / Timestamp
// / Json / Interval render as their canonical text form so the
// downstream `coerce_value` can re-parse against the target column
// type. SQ8 / HalfVector cells are NOT expected as bind params;
// pgwire's Bind decodes vector params to the f32 representation
// before they reach this helper.

7023/// v6.2.0 — total ordering on `Value`s used by ANALYZE to sort a
7024/// column's non-NULL sample before histogram building. Cross-type
7025/// pairs (Int vs Float, Date vs Timestamp, …) compare via the
7026/// same widening the eval-side `compare` operator uses; everything
7027/// else (the genuinely-incompatible pairs) falls back to ordering
7028/// by canonical string form so the sort is still total + stable.
7029/// Vector / SQ8 / Half / Json / Numeric / Interval values reach
7030/// here only via the string-fallback path because vector columns
7031/// are filtered out upstream.
7032fn sort_values_for_histogram(a: &Value, b: &Value) -> core::cmp::Ordering {
7033    use core::cmp::Ordering;
7034    match (a, b) {
7035        (Value::SmallInt(a), Value::SmallInt(b)) => a.cmp(b),
7036        (Value::Int(a), Value::Int(b)) => a.cmp(b),
7037        (Value::BigInt(a), Value::BigInt(b)) => a.cmp(b),
7038        (Value::SmallInt(a), Value::Int(b)) => i32::from(*a).cmp(b),
7039        (Value::Int(a), Value::SmallInt(b)) => a.cmp(&i32::from(*b)),
7040        (Value::Int(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
7041        (Value::BigInt(a), Value::Int(b)) => a.cmp(&i64::from(*b)),
7042        (Value::SmallInt(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
7043        (Value::BigInt(a), Value::SmallInt(b)) => a.cmp(&i64::from(*b)),
7044        (Value::Float(a), Value::Float(b)) => a.partial_cmp(b).unwrap_or(Ordering::Equal),
7045        (Value::Text(a), Value::Text(b)) | (Value::Json(a), Value::Json(b)) => a.cmp(b),
7046        (Value::Bool(a), Value::Bool(b)) => a.cmp(b),
7047        (Value::Date(a), Value::Date(b)) => a.cmp(b),
7048        (Value::Timestamp(a), Value::Timestamp(b)) => a.cmp(b),
7049        // Mixed numeric/float — widen to f64 and compare.
7050        (Value::SmallInt(n), Value::Float(x)) => {
7051            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
7052        }
7053        (Value::Float(x), Value::SmallInt(n)) => {
7054            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
7055        }
7056        (Value::Int(n), Value::Float(x)) => {
7057            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
7058        }
7059        (Value::Float(x), Value::Int(n)) => {
7060            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
7061        }
7062        (Value::BigInt(n), Value::Float(x)) => {
7063            #[allow(clippy::cast_precision_loss)]
7064            let nf = *n as f64;
7065            nf.partial_cmp(x).unwrap_or(Ordering::Equal)
7066        }
7067        (Value::Float(x), Value::BigInt(n)) => {
7068            #[allow(clippy::cast_precision_loss)]
7069            let nf = *n as f64;
7070            x.partial_cmp(&nf).unwrap_or(Ordering::Equal)
7071        }
7072        // Cross-type fallback: lexicographic on canonical form.
7073        // Total + stable so the sort is well-defined.
7074        _ => canonical_value_repr(a).cmp(&canonical_value_repr(b)),
7075    }
7076}
7077
/// v6.2.0 — render the histogram bounds list as a `[v0, v1, ...]`
/// string for the `spg_statistic.histogram_bounds` column.
///
/// Entries are separated by `", "`. An entry is wrapped in double
/// quotes when it is empty or contains any of `,`, `[`, `]`, `"`;
/// inside the quotes, `"` and `\` are each prefixed with a backslash.
/// All other entries are emitted verbatim. An empty slice renders as
/// `[]`.
fn render_histogram_bounds(bounds: &[alloc::string::String]) -> alloc::string::String {
    let mut rendered = alloc::string::String::with_capacity(bounds.len() * 8 + 2);
    rendered.push('[');

    // Empty before the first entry, ", " before every later one.
    let mut separator = "";
    for bound in bounds {
        rendered.push_str(separator);
        separator = ", ";

        let is_plain =
            !bound.is_empty() && !bound.chars().any(|c| matches!(c, ',' | '[' | ']' | '"'));
        if is_plain {
            rendered.push_str(bound);
            continue;
        }

        rendered.push('"');
        for c in bound.chars() {
            if matches!(c, '"' | '\\') {
                rendered.push('\\');
            }
            rendered.push(c);
        }
        rendered.push('"');
    }

    rendered.push(']');
    rendered
}
7108
7109/// v6.2.0 — canonical textual form of a `Value` for histogram
7110/// bound storage. Strings used by ANALYZE for sort + bound output.
7111/// INT / BIGINT → decimal; FLOAT → shortest-round-trip via
7112/// `{:?}`; TEXT pass-through; BOOL → `t` / `f`; DATE / TIMESTAMP →
7113/// the same form `format_date` / `format_timestamp` produce for
7114/// SQL Display. Vector / SQ8 / Half / Json / Numeric / Interval
7115/// reach this only via a non-Vector column (vector columns are
7116/// skipped upstream); they fall back to a Debug-derived form so
7117/// stats still serialise without crashing.
7118pub(crate) fn canonical_value_repr(v: &Value) -> alloc::string::String {
7119    match v {
7120        Value::Null => "NULL".to_string(),
7121        Value::SmallInt(n) => alloc::format!("{n}"),
7122        Value::Int(n) => alloc::format!("{n}"),
7123        Value::BigInt(n) => alloc::format!("{n}"),
7124        Value::Float(x) => alloc::format!("{x:?}"),
7125        Value::Text(s) | Value::Json(s) => s.clone(),
7126        Value::Bool(b) => if *b { "t" } else { "f" }.to_string(),
7127        Value::Date(d) => eval::format_date(*d),
7128        Value::Timestamp(t) => eval::format_timestamp(*t),
7129        Value::Interval { months, micros } => eval::format_interval(*months, *micros),
7130        Value::Numeric { scaled, scale } => eval::format_numeric(*scaled, *scale),
7131        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
7132            // Unreachable in practice (vector columns are filtered
7133            // out before this). Defensive fallback so a future
7134            // vector-stats path doesn't crash.
7135            alloc::format!("{v:?}")
7136        }
7137        // v7.5.0 — Value is #[non_exhaustive] for downstream
7138        // forward-compat. Future variants fall through to Debug
7139        // form here (same shape as the vector fallback above).
7140        _ => alloc::format!("{v:?}"),
7141    }
7142}
7143
/// v6.2.0 — reports whether `_name` is an engine-managed catalog
/// table that a bare `ANALYZE` (no target) should skip.
///
/// Always `false` for now: publications, subscriptions, users and
/// statistics are kept as engine fields rather than catalog tables,
/// so there is nothing to exclude and every user table is analysed.
/// The function is a reserved hook for when internal tables appear.
const fn is_internal_table_name(_name: &str) -> bool {
    false
}
7153
7154fn value_to_literal(v: Value) -> Literal {
7155    match v {
7156        Value::Null => Literal::Null,
7157        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
7158        Value::Int(n) => Literal::Integer(i64::from(n)),
7159        Value::BigInt(n) => Literal::Integer(n),
7160        Value::Float(x) => Literal::Float(x),
7161        Value::Text(s) | Value::Json(s) => Literal::String(s),
7162        Value::Bool(b) => Literal::Bool(b),
7163        Value::Vector(v) => Literal::Vector(v),
7164        Value::Numeric { scaled, scale } => {
7165            Literal::String(eval::format_numeric(scaled, scale))
7166        }
7167        Value::Date(d) => Literal::String(eval::format_date(d)),
7168        Value::Timestamp(t) => Literal::String(eval::format_timestamp(t)),
7169        Value::Interval { months, micros } => Literal::Interval {
7170            months,
7171            micros,
7172            text: eval::format_interval(months, micros),
7173        },
7174        // SQ8 / halfvec cells dequantise to f32 before reaching the
7175        // substitute walker; pgwire's Bind path handles that.
7176        Value::Sq8Vector(q) => Literal::Vector(spg_storage::quantize::dequantize(&q)),
7177        Value::HalfVector(h) => Literal::Vector(h.to_f32_vec()),
7178        // v7.5.0 — Value is #[non_exhaustive]; future variants
7179        // render as Debug-form String literal until explicit
7180        // mapping is added.
7181        v => Literal::String(alloc::format!("{v:?}")),
7182    }
7183}
7184
7185fn rewrite_clock_calls(stmt: &mut Statement, now_micros: Option<i64>) {
7186    let Some(now) = now_micros else {
7187        return;
7188    };
7189    match stmt {
7190        Statement::Select(s) => rewrite_select_clock(s, now),
7191        Statement::Insert(ins) => {
7192            for row in &mut ins.rows {
7193                for e in row {
7194                    rewrite_expr_clock(e, now);
7195                }
7196            }
7197        }
7198        _ => {}
7199    }
7200}
7201
7202fn rewrite_select_clock(s: &mut SelectStatement, now: i64) {
7203    for item in &mut s.items {
7204        if let SelectItem::Expr { expr, .. } = item {
7205            rewrite_expr_clock(expr, now);
7206        }
7207    }
7208    if let Some(w) = &mut s.where_ {
7209        rewrite_expr_clock(w, now);
7210    }
7211    if let Some(gs) = &mut s.group_by {
7212        for g in gs {
7213            rewrite_expr_clock(g, now);
7214        }
7215    }
7216    if let Some(h) = &mut s.having {
7217        rewrite_expr_clock(h, now);
7218    }
7219    for o in &mut s.order_by {
7220        rewrite_expr_clock(&mut o.expr, now);
7221    }
7222    for (_, peer) in &mut s.unions {
7223        rewrite_select_clock(peer, now);
7224    }
7225}
7226
7227/// v3.0.3 hot path: every recursion lands in exactly one `match` arm.
7228/// Literal / Column-with-qualifier (the dominant cases on a typical
7229/// AST) take a single pattern dispatch and exit. The clock-rewrite
7230/// targets (zero-arg `NOW` / `CURRENT_TIMESTAMP` / `CURRENT_DATE`
7231/// functions, and bare `CURRENT_TIMESTAMP` / `CURRENT_DATE` column
7232/// refs) sit on their own arms with match guards so the fall-through
7233/// to the recursive arms is unambiguous.
7234fn rewrite_expr_clock(e: &mut Expr, now: i64) {
7235    // Fast-path test on the no-recursion shapes first. We can't fold
7236    // them into the big match below because they need to *replace* `e`
7237    // outright; the recursive arms below match on its sub-fields.
7238    if let Some(replacement) = clock_replacement_for(e, now) {
7239        *e = replacement;
7240        return;
7241    }
7242    match e {
7243        Expr::Binary { lhs, rhs, .. } => {
7244            rewrite_expr_clock(lhs, now);
7245            rewrite_expr_clock(rhs, now);
7246        }
7247        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7248            rewrite_expr_clock(expr, now);
7249        }
7250        Expr::FunctionCall { args, .. } => {
7251            for a in args {
7252                rewrite_expr_clock(a, now);
7253            }
7254        }
7255        Expr::Like { expr, pattern, .. } => {
7256            rewrite_expr_clock(expr, now);
7257            rewrite_expr_clock(pattern, now);
7258        }
7259        Expr::Extract { source, .. } => rewrite_expr_clock(source, now),
7260        // v4.10 subquery nodes — recurse into the inner SELECT's
7261        // expression slots so e.g. SELECT NOW() in a scalar
7262        // subquery picks up the same instant as the outer query.
7263        Expr::ScalarSubquery(s) => rewrite_select_clock(s, now),
7264        Expr::Exists { subquery, .. } => rewrite_select_clock(subquery, now),
7265        Expr::InSubquery { expr, subquery, .. } => {
7266            rewrite_expr_clock(expr, now);
7267            rewrite_select_clock(subquery, now);
7268        }
7269        // v4.12 window functions — args + PARTITION BY + ORDER BY
7270        // may all reference clock literals.
7271        Expr::WindowFunction {
7272            args,
7273            partition_by,
7274            order_by,
7275            ..
7276        } => {
7277            for a in args {
7278                rewrite_expr_clock(a, now);
7279            }
7280            for p in partition_by {
7281                rewrite_expr_clock(p, now);
7282            }
7283            for (e, _) in order_by {
7284                rewrite_expr_clock(e, now);
7285            }
7286        }
7287        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
7288    }
7289}
7290
7291/// Returns `Some(Expr)` when `e` is one of the clock-call shapes that
7292/// must be rewritten; otherwise `None` so the caller falls through to
7293/// the recursive walk. Identifies both function-call forms (`NOW()` /
7294/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()`) and bare-identifier forms
7295/// (`CURRENT_TIMESTAMP` / `CURRENT_DATE` as unqualified column refs,
7296/// which is how PG accepts them without parens).
7297fn clock_replacement_for(e: &Expr, now: i64) -> Option<Expr> {
7298    let (kind, name) = match e {
7299        Expr::FunctionCall { name, args } if args.is_empty() => (ClockSite::Fn, name.as_str()),
7300        Expr::Column(c) if c.qualifier.is_none() => (ClockSite::BareIdent, c.name.as_str()),
7301        _ => return None,
7302    };
7303    // ASCII case-insensitive name match. Limited to the three keywords
7304    // that actually need rewriting.
7305    let matched = match name.len() {
7306        3 if kind == ClockSite::Fn && name.eq_ignore_ascii_case("now") => Some(true),
7307        12 if name.eq_ignore_ascii_case("current_date") => Some(false),
7308        17 if name.eq_ignore_ascii_case("current_timestamp") => Some(true),
7309        _ => None,
7310    };
7311    let is_timestamp = matched?;
7312    let payload = if is_timestamp {
7313        now
7314    } else {
7315        now.div_euclid(86_400_000_000)
7316    };
7317    let target = if is_timestamp {
7318        spg_sql::ast::CastTarget::Timestamp
7319    } else {
7320        spg_sql::ast::CastTarget::Date
7321    };
7322    Some(Expr::Cast {
7323        expr: alloc::boxed::Box::new(Expr::Literal(spg_sql::ast::Literal::Integer(payload))),
7324        target,
7325    })
7326}
7327
/// Syntactic position in which `clock_replacement_for` found a
/// candidate clock name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ClockSite {
    /// A function call with an empty argument list, e.g. `NOW()`.
    Fn,
    /// An unqualified identifier without parentheses, e.g.
    /// `CURRENT_DATE`.
    BareIdent,
}
7333
7334/// `ORDER BY <integer>` references the N-th SELECT item (1-based).
7335/// Swap the integer literal for the matching item's expression so the
7336/// executor doesn't need a special-case branch. Recurses into UNION
7337/// peers because each peer keeps its own SELECT list.
7338/// v6.4.1 — expand `GROUP BY ALL` to every non-aggregate SELECT-list
7339/// item. Mirrors DuckDB / PG 19 semantics. Wildcards (`SELECT * …`)
7340/// are NOT expanded by GROUP BY ALL (PG 19 leaves the wildcard intact
7341/// and groups by whatever explicit non-aggregates remain — none in
7342/// the wildcard-only case, which still works for non-aggregate
7343/// queries).
7344fn expand_group_by_all(s: &mut SelectStatement) {
7345    if !s.group_by_all {
7346        for (_, peer) in &mut s.unions {
7347            expand_group_by_all(peer);
7348        }
7349        return;
7350    }
7351    let mut groups: Vec<Expr> = Vec::new();
7352    for item in &s.items {
7353        if let SelectItem::Expr { expr, .. } = item
7354            && !aggregate::contains_aggregate(expr)
7355        {
7356            groups.push(expr.clone());
7357        }
7358    }
7359    s.group_by = Some(groups);
7360    s.group_by_all = false;
7361    for (_, peer) in &mut s.unions {
7362        expand_group_by_all(peer);
7363    }
7364}
7365
7366fn resolve_order_by_position(s: &mut SelectStatement) {
7367    // v6.4.0 — iterate every ORDER BY key. Position references
7368    // (`ORDER BY 2`) bind to the 1-based projection index;
7369    // identifier references that match a SELECT-list alias bind to
7370    // the projected expression (Step 4 of L3a).
7371    for order in &mut s.order_by {
7372        match &order.expr {
7373            Expr::Literal(Literal::Integer(n)) if *n >= 1 => {
7374                if let Ok(idx_one_based) = usize::try_from(*n) {
7375                    let idx = idx_one_based - 1;
7376                    if idx < s.items.len()
7377                        && let SelectItem::Expr { expr, .. } = &s.items[idx]
7378                    {
7379                        order.expr = expr.clone();
7380                    }
7381                }
7382            }
7383            Expr::Column(c) if c.qualifier.is_none() => {
7384                // Alias-in-ORDER-BY lookup.
7385                for item in &s.items {
7386                    if let SelectItem::Expr {
7387                        expr,
7388                        alias: Some(a),
7389                    } = item
7390                        && a == &c.name
7391                    {
7392                        order.expr = expr.clone();
7393                        break;
7394                    }
7395                }
7396            }
7397            _ => {}
7398        }
7399    }
7400    for (_, peer) in &mut s.unions {
7401        resolve_order_by_position(peer);
7402    }
7403}
7404
7405/// Sort `tagged` by `f64` key, reversing the comparator under DESC.
7406/// Used by the UNION ORDER BY path; per-block paths inline the same
7407/// comparator because they already hold `&OrderBy` directly.
7408/// v3.1.1: partial-sort helper. When `keep` (= offset + limit) is
7409/// strictly less than `tagged.len()`, run `select_nth_unstable_by` to
7410/// partition the prefix in O(n), then sort just that prefix in O(k
7411/// log k). Total O(n + k log k), vs O(n log n) for a full sort. The
7412/// caller decides what `keep` is; passing `None` (no LIMIT) keeps the
7413/// full-sort behaviour.
7414///
7415/// `tagged` holds `(Option<f64>, Row)` (the SELECT path) — `None` keys
7416/// sort last in ascending order, mirroring NULL-sorts-last in SQL.
7417fn partial_sort_tagged(
7418    tagged: &mut Vec<(Vec<f64>, Row)>,
7419    keep: Option<usize>,
7420    descs: &[bool],
7421) {
7422    let cmp = |a: &(Vec<f64>, Row), b: &(Vec<f64>, Row)| cmp_multi_key(&a.0, &b.0, descs);
7423    match keep {
7424        Some(k) if k < tagged.len() && k > 0 => {
7425            let pivot = k - 1;
7426            tagged.select_nth_unstable_by(pivot, cmp);
7427            tagged[..k].sort_by(cmp);
7428            tagged.truncate(k);
7429        }
7430        _ => {
7431            tagged.sort_by(cmp);
7432        }
7433    }
7434}
7435
7436fn sort_by_keys(tagged: &mut [(Vec<f64>, Row)], descs: &[bool]) {
7437    tagged.sort_by(|a, b| cmp_multi_key(&a.0, &b.0, descs));
7438}
7439
/// v6.4.0 — multi-key ORDER BY comparator.
///
/// Keys are compared pairwise, left to right; the first pair that
/// differs decides the result. Each key's DESC flag in `descs` is
/// honoured independently, and a key with no flag is treated as
/// ascending. If one slice is shorter, the surplus keys of the
/// other are ignored.
///
/// NULL is encoded as `f64::INFINITY`, so it sorts last in ASC and
/// first in DESC (matches PG default).
///
/// NaN keys are ordered explicitly: NaN compares equal to NaN and
/// greater than every other key (including the NULL sentinel).
/// Treating NaN as equal to everything would not be a total order,
/// and `sort_by` / `select_nth_unstable_by` may panic or produce an
/// unspecified order when handed such a comparator.
fn cmp_multi_key(a: &[f64], b: &[f64], descs: &[bool]) -> core::cmp::Ordering {
    use core::cmp::Ordering;
    for (i, (ka, kb)) in a.iter().zip(b.iter()).enumerate() {
        // `partial_cmp` is `None` only when at least one side is
        // NaN; `false < true` then places the NaN side last.
        let ord = ka
            .partial_cmp(kb)
            .unwrap_or_else(|| ka.is_nan().cmp(&kb.is_nan()));
        let ord = if descs.get(i).copied().unwrap_or(false) {
            ord.reverse()
        } else {
            ord
        };
        if ord != Ordering::Equal {
            return ord;
        }
    }
    Ordering::Equal
}
7458
7459/// v6.4.0 — eval every ORDER BY expression for a row and pack the
7460/// resulting keys into a `Vec<f64>`. NULL → `f64::INFINITY`.
7461fn build_order_keys(
7462    order_by: &[OrderBy],
7463    row: &Row,
7464    ctx: &EvalContext,
7465) -> Result<Vec<f64>, EngineError> {
7466    let mut keys = Vec::with_capacity(order_by.len());
7467    for o in order_by {
7468        let v = eval::eval_expr(&o.expr, row, ctx)?;
7469        keys.push(value_to_order_key(&v)?);
7470    }
7471    Ok(keys)
7472}
7473
7474/// Drop the first `offset` rows then truncate to `limit`. PG / `MySQL`
7475/// agree: OFFSET applies *after* ORDER BY but *before* LIMIT (so
7476/// `LIMIT 10 OFFSET 5` keeps rows 6..=15).
7477fn apply_offset_and_limit(rows: &mut Vec<Row>, offset: Option<u32>, limit: Option<u32>) {
7478    if let Some(off) = offset {
7479        let off = off as usize;
7480        if off >= rows.len() {
7481            rows.clear();
7482        } else {
7483            rows.drain(..off);
7484        }
7485    }
7486    if let Some(n) = limit {
7487        rows.truncate(n as usize);
7488    }
7489}
7490
/// v7.6.1 — resolve a parser-level `ForeignKeyConstraint` (column
/// names + parent table name) into the storage-layer shape (column
/// indices + same parent table). Called at CREATE TABLE time.
///
/// Checks performed, in order:
///
///   - every local column named by the FK exists in `local_cols`
///   - the parent table exists in `catalog` (skipped when the FK is
///     self-referencing, i.e. names `local_table_name`)
///   - when the parent column list is omitted: the FK must be
///     single-column, and the parent column is defaulted through
///     `pick_pk_index_column`
///   - when the parent column list is given: every named column
///     exists on the parent (on `local_cols` for a self-reference)
///   - local and parent column counts match
///   - for non-self-referencing FKs only: the *first* parent column
///     is covered by an unconditional (non-partial) `BTree` index —
///     SPG's stand-in for `PRIMARY KEY`/`UNIQUE`, required so the
///     v7.6.2 INSERT path can do an O(log n) parent lookup
///
/// # Errors
///
/// `EngineError::Storage(TableNotFound)` for a missing parent table;
/// `EngineError::Unsupported` for every other failed check.
///
/// # Panics
///
/// The second catalog lookup uses `expect`; it can only fail if the
/// parent table found by the first lookup is no longer present.
fn resolve_foreign_key(
    local_table_name: &str,
    local_cols: &[ColumnSchema],
    fk: spg_sql::ast::ForeignKeyConstraint,
    catalog: &Catalog,
) -> Result<spg_storage::ForeignKeyConstraint, EngineError> {
    // Resolve local column names → positions in `local_cols`.
    let mut local_columns = Vec::with_capacity(fk.columns.len());
    for name in &fk.columns {
        let pos = local_cols
            .iter()
            .position(|c| c.name == *name)
            .ok_or_else(|| {
                EngineError::Unsupported(alloc::format!(
                    "FOREIGN KEY references unknown local column {name:?}"
                ))
            })?;
        local_columns.push(pos);
    }
    // Self-referencing FK: parent table is the one we're creating.
    // The parent column resolution uses the local column list since
    // the catalog doesn't have this table yet.
    let is_self_ref = fk.parent_table == local_table_name;
    let (parent_cols_for_lookup, parent_table_str): (&[ColumnSchema], &str) = if is_self_ref {
        (local_cols, local_table_name)
    } else {
        let parent_table = catalog.get(&fk.parent_table).ok_or_else(|| {
            EngineError::Storage(StorageError::TableNotFound {
                name: fk.parent_table.clone(),
            })
        })?;
        (parent_table.schema().columns.as_slice(), fk.parent_table.as_str())
    };
    // Resolve parent column names → positions. If the FK omitted the
    // parent column list, fall back to the parent's primary index
    // column (single-column only — composite default is rejected
    // because there's no unambiguous "PK" in SPG's index list).
    let parent_columns: Vec<usize> = if fk.parent_columns.is_empty() {
        if fk.columns.len() != 1 {
            return Err(EngineError::Unsupported(
                "composite FOREIGN KEY without explicit parent column list is not supported \
                 — list the parent columns explicitly"
                    .into(),
            ));
        }
        // Find a single BTree index on the parent and use its column.
        // (For a self-reference the helper returns position 0.)
        let pos = pick_pk_index_column(catalog, parent_table_str, is_self_ref, local_cols)
            .ok_or_else(|| {
                EngineError::Unsupported(alloc::format!(
                    "parent table {parent_table_str:?} has no PRIMARY-key / UNIQUE BTree index \
                     to default the FOREIGN KEY against"
                ))
            })?;
        alloc::vec![pos]
    } else {
        let mut out = Vec::with_capacity(fk.parent_columns.len());
        for name in &fk.parent_columns {
            let pos = parent_cols_for_lookup
                .iter()
                .position(|c| c.name == *name)
                .ok_or_else(|| {
                    EngineError::Unsupported(alloc::format!(
                        "FOREIGN KEY references unknown parent column \
                         {name:?} on table {parent_table_str:?}"
                    ))
                })?;
            out.push(pos);
        }
        out
    };
    // Only reachable with an explicit parent list: the defaulted path
    // above always yields exactly one column for a one-column FK.
    if parent_columns.len() != local_columns.len() {
        return Err(EngineError::Unsupported(alloc::format!(
            "FOREIGN KEY arity mismatch: {} local columns vs {} parent columns",
            local_columns.len(),
            parent_columns.len()
        )));
    }
    // For non-self-referencing FKs, verify the parent column set is
    // covered by a BTree index. SPG doesn't have a `PRIMARY KEY`
    // declaration; the convention is "the parent column for FK
    // purposes must have a BTree index" — which the user creates via
    // `CREATE INDEX ... USING btree (col)` (the default). We accept
    // any single-column BTree index that covers a parent column;
    // composite parent column lists require an index whose `column_position`
    // matches the first parent column (multi-column BTree indices
    // are not in the v7.x roadmap).
    //
    // NOTE(review): unlike `pick_pk_index_column`, this check does not
    // look at `idx.expression` or `idx.included_columns`, so an index
    // with either set is accepted here — confirm that is intended.
    // NOTE(review): self-referencing FKs skip this coverage check
    // entirely (the table has no catalog entry yet).
    if !is_self_ref {
        let parent_table = catalog
            .get(&fk.parent_table)
            .expect("checked above");
        // Only the first parent column is checked for index coverage.
        let primary_parent_col = parent_columns[0];
        let has_btree = parent_table.schema().columns.get(primary_parent_col).is_some()
            && parent_table
                .indices()
                .iter()
                .any(|idx| {
                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
                        && idx.column_position == primary_parent_col
                        && idx.partial_predicate.is_none()
                });
        if !has_btree {
            return Err(EngineError::Unsupported(alloc::format!(
                "FOREIGN KEY parent column on {:?} is not covered by an unconditional BTree \
                 index — create one with `CREATE INDEX ... ON {} ({})` first",
                parent_table_str,
                parent_table_str,
                parent_table.schema().columns[primary_parent_col].name,
            )));
        }
    }
    // Translate the referential actions from the SQL AST to storage.
    let on_delete = fk_action_sql_to_storage(fk.on_delete);
    let on_update = fk_action_sql_to_storage(fk.on_update);
    Ok(spg_storage::ForeignKeyConstraint {
        name: fk.name,
        local_columns,
        parent_table: fk.parent_table,
        parent_columns,
        on_delete,
        on_update,
    })
}
7625
7626/// v7.6.1 — pick a sentinel "primary key" column from the parent
7627/// table when the FK didn't name parent columns. Picks the first
7628/// single-column unconditional BTree index — that's the closest
7629/// thing SPG has to a PRIMARY KEY today. Self-referencing FKs use
7630/// `local_cols` as the column source.
7631fn pick_pk_index_column(
7632    catalog: &Catalog,
7633    parent_name: &str,
7634    is_self_ref: bool,
7635    local_cols: &[ColumnSchema],
7636) -> Option<usize> {
7637    if is_self_ref {
7638        // Self-ref FK omitted parent columns: pick column 0 by
7639        // convention (no catalog entry yet). Engine will widen this
7640        // when v7.6.7 lands; v7.6.1 only handles the explicit form.
7641        let _ = local_cols;
7642        return Some(0);
7643    }
7644    let parent = catalog.get(parent_name)?;
7645    parent.indices().iter().find_map(|idx| {
7646        if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7647            && idx.partial_predicate.is_none()
7648            && idx.included_columns.is_empty()
7649            && idx.expression.is_none()
7650        {
7651            Some(idx.column_position)
7652        } else {
7653            None
7654        }
7655    })
7656}
7657
7658/// v7.9.8 / v7.9.10 — resolve the column positions that
7659/// identify a conflict for ON CONFLICT. Returns a Vec of
7660/// column positions (1 element for single-column form, N for
7661/// composite). When the user wrote bare `ON CONFLICT DO …`,
7662/// falls back to the table's first unconditional BTree index
7663/// (always single-column today).
7664fn resolve_on_conflict_columns(
7665    catalog: &Catalog,
7666    table_name: &str,
7667    target: &[String],
7668) -> Result<Vec<usize>, EngineError> {
7669    let table = catalog.get(table_name).ok_or_else(|| {
7670        EngineError::Storage(StorageError::TableNotFound {
7671            name: table_name.into(),
7672        })
7673    })?;
7674    if target.is_empty() {
7675        let pos = table
7676            .indices()
7677            .iter()
7678            .find_map(|idx| {
7679                if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7680                    && idx.partial_predicate.is_none()
7681                    && idx.included_columns.is_empty()
7682                    && idx.expression.is_none()
7683                {
7684                    Some(idx.column_position)
7685                } else {
7686                    None
7687                }
7688            })
7689            .ok_or_else(|| {
7690                EngineError::Unsupported(alloc::format!(
7691                    "ON CONFLICT without target requires a UNIQUE BTree index on {table_name:?}"
7692                ))
7693            })?;
7694        return Ok(alloc::vec![pos]);
7695    }
7696    let mut out = Vec::with_capacity(target.len());
7697    for name in target {
7698        let pos = table
7699            .schema()
7700            .columns
7701            .iter()
7702            .position(|c| c.name == *name)
7703            .ok_or_else(|| {
7704                EngineError::Unsupported(alloc::format!(
7705                    "ON CONFLICT target column {name:?} not found on {table_name:?}"
7706                ))
7707            })?;
7708        out.push(pos);
7709    }
7710    Ok(out)
7711}
7712
7713/// v7.9.8 — check whether the BTree index on `column_pos` of
7714/// `table_name` already has a row with this key.
7715fn on_conflict_key_exists(
7716    catalog: &Catalog,
7717    table_name: &str,
7718    column_pos: usize,
7719    key: &Value,
7720) -> bool {
7721    let Some(table) = catalog.get(table_name) else {
7722        return false;
7723    };
7724    let Some(idx_key) = spg_storage::IndexKey::from_value(key) else {
7725        return false;
7726    };
7727    table.indices().iter().any(|idx| {
7728        matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7729            && idx.column_position == column_pos
7730            && idx.partial_predicate.is_none()
7731            && !idx.lookup_eq(&idx_key).is_empty()
7732    })
7733}
7734
7735/// v7.9.9 / v7.9.10 — look up an existing row's position by
7736/// matching all `column_positions` against the incoming `key`
7737/// tuple. Single-column shape (one column) reduces to the
7738/// canonical PK lookup; composite shapes scan linearly until
7739/// every position matches.
7740fn lookup_row_position_by_keys(
7741    catalog: &Catalog,
7742    table_name: &str,
7743    column_positions: &[usize],
7744    key: &[&Value],
7745) -> Option<usize> {
7746    let table = catalog.get(table_name)?;
7747    table.rows().iter().position(|r| {
7748        column_positions
7749            .iter()
7750            .enumerate()
7751            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7752    })
7753}
7754
7755/// v7.9.10 — does the table already contain a row whose
7756/// `column_positions` tuple equals `key`? Single-column shape
7757/// uses the existing BTree fast path; composite shapes fall
7758/// back to a row scan.
7759fn on_conflict_keys_exist(
7760    catalog: &Catalog,
7761    table_name: &str,
7762    column_positions: &[usize],
7763    key: &[&Value],
7764) -> bool {
7765    if column_positions.len() == 1 {
7766        return on_conflict_key_exists(
7767            catalog,
7768            table_name,
7769            column_positions[0],
7770            key[0],
7771        );
7772    }
7773    let Some(table) = catalog.get(table_name) else {
7774        return false;
7775    };
7776    table.rows().iter().any(|r| {
7777        column_positions
7778            .iter()
7779            .enumerate()
7780            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7781    })
7782}
7783
7784/// v7.9.9 — apply ON CONFLICT DO UPDATE SET assignments to an
7785/// existing row.
7786///
7787/// `incoming` is the rejected INSERT row (used to resolve
7788/// `EXCLUDED.col` references in the assignment exprs);
7789/// `target_pos` is the position of the existing row in the table.
7790/// Each assignment substitutes `EXCLUDED.col` with the matching
7791/// incoming value, evaluates the resulting expression against
7792/// the existing row, and writes the new value into the
7793/// corresponding column of the returned `Vec<Value>`. If
7794/// `where_` evaluates falsy, returns Ok(None) — PG behaviour:
7795/// the conflicting row is silently kept unchanged.
7796fn apply_on_conflict_assignments(
7797    catalog: &Catalog,
7798    table_name: &str,
7799    target_pos: usize,
7800    incoming: &[Value],
7801    assignments: &[(String, Expr)],
7802    where_: Option<&Expr>,
7803) -> Result<Option<Vec<Value>>, EngineError> {
7804    let table = catalog.get(table_name).ok_or_else(|| {
7805        EngineError::Storage(StorageError::TableNotFound {
7806            name: table_name.into(),
7807        })
7808    })?;
7809    let schema_cols = table.schema().columns.clone();
7810    let existing = table
7811        .rows()
7812        .get(target_pos)
7813        .ok_or_else(|| {
7814            EngineError::Unsupported(alloc::format!(
7815                "ON CONFLICT DO UPDATE: row position {target_pos} out of bounds on {table_name:?}"
7816            ))
7817        })?
7818        .clone();
7819    let ctx = eval::EvalContext::new(&schema_cols, Some(table_name));
7820    // Optional WHERE filter on the conflict row.
7821    if let Some(w) = where_ {
7822        let pred = w.clone();
7823        let pred = substitute_excluded_refs(pred, &schema_cols, incoming);
7824        let v = eval::eval_expr(&pred, &existing, &ctx)?;
7825        if !matches!(v, Value::Bool(true)) {
7826            return Ok(None);
7827        }
7828    }
7829    let mut new_values = existing.values.clone();
7830    for (col_name, expr) in assignments {
7831        let target_idx = schema_cols
7832            .iter()
7833            .position(|c| c.name == *col_name)
7834            .ok_or_else(|| {
7835                EngineError::Eval(EvalError::ColumnNotFound {
7836                    name: col_name.clone(),
7837                })
7838            })?;
7839        let sub = substitute_excluded_refs(expr.clone(), &schema_cols, incoming);
7840        let v = eval::eval_expr(&sub, &existing, &ctx)?;
7841        new_values[target_idx] =
7842            coerce_value(v, schema_cols[target_idx].ty, col_name, target_idx)?;
7843    }
7844    Ok(Some(new_values))
7845}
7846
7847/// v7.9.9 — walk an `Expr` tree replacing any `Column { qualifier:
7848/// "EXCLUDED", name }` reference with a `Literal` of the matching
7849/// value from the incoming-row vec. Resolution against the
7850/// child-table column list (by name).
7851fn substitute_excluded_refs(
7852    expr: Expr,
7853    schema_cols: &[ColumnSchema],
7854    incoming: &[Value],
7855) -> Expr {
7856    use spg_sql::ast::ColumnName;
7857    match expr {
7858        Expr::Column(ColumnName { qualifier, name })
7859            if qualifier
7860                .as_deref()
7861                .is_some_and(|q| q.eq_ignore_ascii_case("excluded")) =>
7862        {
7863            let pos = schema_cols.iter().position(|c| c.name == name);
7864            match pos {
7865                Some(p) => {
7866                    let v = incoming.get(p).cloned().unwrap_or(Value::Null);
7867                    value_to_literal_expr(v).unwrap_or_else(|_| {
7868                        Expr::Literal(spg_sql::ast::Literal::Null)
7869                    })
7870                }
7871                None => Expr::Column(ColumnName { qualifier, name }),
7872            }
7873        }
7874        Expr::Binary { op, lhs, rhs } => Expr::Binary {
7875            op,
7876            lhs: Box::new(substitute_excluded_refs(*lhs, schema_cols, incoming)),
7877            rhs: Box::new(substitute_excluded_refs(*rhs, schema_cols, incoming)),
7878        },
7879        Expr::Unary { op, expr } => Expr::Unary {
7880            op,
7881            expr: Box::new(substitute_excluded_refs(*expr, schema_cols, incoming)),
7882        },
7883        Expr::FunctionCall { name, args } => Expr::FunctionCall {
7884            name,
7885            args: args
7886                .into_iter()
7887                .map(|a| substitute_excluded_refs(a, schema_cols, incoming))
7888                .collect(),
7889        },
7890        other => other,
7891    }
7892}
7893
7894/// v7.6.2 / v7.6.7 — INSERT-side FK enforcement. For every row
7895/// about to be inserted into `child_table`, every FK declared on
7896/// that table is checked: the row's FK columns must either be
7897/// NULL (SQL spec skip) or match an existing parent row via the
7898/// parent's BTree PK / UNIQUE index.
7899///
7900/// Returns `EngineError::Unsupported` with a `FOREIGN KEY violation`
7901/// payload on first failure.
7902///
7903/// **Self-referencing FKs (v7.6.7 widening):** when `fk.parent_table
7904/// == child_table`, the parent rows visible to this check are
7905///  (a) rows already committed to the table, plus
7906///  (b) earlier rows from the *same* `rows` batch.
7907/// This makes `INSERT INTO tree VALUES (1, NULL), (2, 1), (3, 2)`
7908/// work in a single statement — common pattern for bulk-loading
7909/// hierarchies.
7910/// v7.9.19 — enforce table-level UNIQUE / PRIMARY KEY tuple
7911/// constraints at INSERT time. For each constraint declared on
7912/// the target table, check that no existing row + no earlier row
7913/// in the same batch has the same full-column tuple. NULL in
7914/// any column lifts the row out of the check (SQL spec: NULL
7915/// ≠ NULL for uniqueness). mailrs G1 + G6.
7916fn enforce_uniqueness_inserts(
7917    catalog: &Catalog,
7918    child_table: &str,
7919    constraints: &[spg_storage::UniquenessConstraint],
7920    rows: &[Vec<Value>],
7921) -> Result<(), EngineError> {
7922    if constraints.is_empty() {
7923        return Ok(());
7924    }
7925    let table = catalog.get(child_table).ok_or_else(|| {
7926        EngineError::Storage(StorageError::TableNotFound {
7927            name: child_table.into(),
7928        })
7929    })?;
7930    for uc in constraints {
7931        for (batch_idx, row_values) in rows.iter().enumerate() {
7932            let key: Vec<&Value> = uc.columns.iter().map(|&i| &row_values[i]).collect();
7933            let has_null = key.iter().any(|v| matches!(v, Value::Null));
7934            if has_null {
7935                continue;
7936            }
7937            // Table-side collision: scan existing rows.
7938            let collides_in_table = table.rows().iter().any(|prow| {
7939                uc.columns
7940                    .iter()
7941                    .enumerate()
7942                    .all(|(i, &p)| prow.values.get(p) == Some(key[i]))
7943            });
7944            // Batch-side collision: earlier rows in the same INSERT.
7945            let collides_in_batch = rows[..batch_idx].iter().any(|earlier| {
7946                uc.columns
7947                    .iter()
7948                    .enumerate()
7949                    .all(|(i, &p)| earlier.get(p) == Some(key[i]))
7950            });
7951            if collides_in_table || collides_in_batch {
7952                let kind = if uc.is_primary_key { "PRIMARY KEY" } else { "UNIQUE" };
7953                let col_names: Vec<String> = uc
7954                    .columns
7955                    .iter()
7956                    .map(|&i| table.schema().columns[i].name.clone())
7957                    .collect();
7958                return Err(EngineError::Unsupported(alloc::format!(
7959                    "{kind} violation on {child_table:?} columns {col_names:?}: \
7960                     row #{batch_idx} duplicates an existing key"
7961                )));
7962            }
7963        }
7964    }
7965    Ok(())
7966}
7967
7968fn enforce_fk_inserts(
7969    catalog: &Catalog,
7970    child_table: &str,
7971    fks: &[spg_storage::ForeignKeyConstraint],
7972    rows: &[Vec<Value>],
7973) -> Result<(), EngineError> {
7974    for fk in fks {
7975        let parent_is_self = fk.parent_table == child_table;
7976        let parent = if parent_is_self {
7977            // Self-ref: read the current state of the same table.
7978            // The mut borrow on child has been dropped by the caller.
7979            catalog.get(child_table).ok_or_else(|| {
7980                EngineError::Storage(StorageError::TableNotFound {
7981                    name: child_table.into(),
7982                })
7983            })?
7984        } else {
7985            catalog.get(&fk.parent_table).ok_or_else(|| {
7986                EngineError::Storage(StorageError::TableNotFound {
7987                    name: fk.parent_table.clone(),
7988                })
7989            })?
7990        };
7991        for (batch_idx, row_values) in rows.iter().enumerate() {
7992            // Single-column FK fast path: try the parent's BTree
7993            // index for an O(log n) lookup. Composite FKs fall back
7994            // to a parent-row scan.
7995            if fk.local_columns.len() == 1 {
7996                let v = &row_values[fk.local_columns[0]];
7997                if matches!(v, Value::Null) {
7998                    continue;
7999                }
8000                let parent_col = fk.parent_columns[0];
8001                let key = spg_storage::IndexKey::from_value(v).ok_or_else(|| {
8002                    EngineError::Unsupported(alloc::format!(
8003                        "FOREIGN KEY column value of type {:?} is not index-eligible",
8004                        v.data_type()
8005                    ))
8006                })?;
8007                let present_committed = parent.indices().iter().any(|idx| {
8008                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
8009                        && idx.column_position == parent_col
8010                        && idx.partial_predicate.is_none()
8011                        && !idx.lookup_eq(&key).is_empty()
8012                });
8013                // v7.6.7 self-ref widening: also accept a match
8014                // against earlier rows in this same batch when the
8015                // FK points at the table being inserted into.
8016                let present_in_batch = parent_is_self
8017                    && rows[..batch_idx].iter().any(|earlier| {
8018                        earlier.get(parent_col) == Some(v)
8019                    });
8020                if !(present_committed || present_in_batch) {
8021                    return Err(EngineError::Unsupported(alloc::format!(
8022                        "FOREIGN KEY violation: no parent row in {:?} where {} = {:?}",
8023                        fk.parent_table,
8024                        parent
8025                            .schema()
8026                            .columns
8027                            .get(parent_col)
8028                            .map_or("?", |c| c.name.as_str()),
8029                        v,
8030                    )));
8031                }
8032            } else {
8033                // Composite FK: scan parent rows. v7.6.7 also
8034                // accepts a match against earlier rows in the same
8035                // batch (self-ref bulk-loading of hierarchies).
8036                if fk.local_columns
8037                    .iter()
8038                    .all(|&i| matches!(row_values.get(i), Some(Value::Null)))
8039                {
8040                    continue;
8041                }
8042                let local: Vec<&Value> = fk.local_columns.iter().map(|&i| &row_values[i]).collect();
8043                let parent_match_committed = parent.rows().iter().any(|prow| {
8044                    fk.parent_columns
8045                        .iter()
8046                        .enumerate()
8047                        .all(|(i, &pi)| prow.values.get(pi) == Some(local[i]))
8048                });
8049                let parent_match_in_batch = parent_is_self
8050                    && rows[..batch_idx].iter().any(|earlier| {
8051                        fk.parent_columns
8052                            .iter()
8053                            .enumerate()
8054                            .all(|(i, &pi)| earlier.get(pi) == Some(local[i]))
8055                    });
8056                if !(parent_match_committed || parent_match_in_batch) {
8057                    return Err(EngineError::Unsupported(alloc::format!(
8058                        "FOREIGN KEY violation: no parent row in {:?} matching composite key",
8059                        fk.parent_table,
8060                    )));
8061                }
8062            }
8063        }
8064    }
8065    Ok(())
8066}
8067
/// v7.6.4 / v7.6.5 — one step of the FK action plan computed for a
/// DELETE or UPDATE on a parent table. `plan_fk_parent_deletions`
/// and `plan_fk_parent_updates` each return a list of these steps;
/// `apply_fk_child_step` executes a single step against the
/// catalog.
#[derive(Debug, Clone)]
struct FkChildStep {
    /// Name of the table whose rows this step touches.
    child_table: String,
    /// The mutation to perform on that table.
    action: FkChildAction,
}
8076
/// The mutation a single `FkChildStep` performs on its child table.
///
/// In the cell-write variants, `positions`, `columns` (and
/// `defaults`) are parallel vectors: entry `i` of each describes
/// one cell — row position, column position, and value to write.
#[derive(Debug, Clone)]
enum FkChildAction {
    /// CASCADE — remove these rows. Sorted, deduplicated positions.
    Delete { positions: Vec<usize> },
    /// SET NULL — for each (row, column) in the flat list, write
    /// NULL into that child cell. Multiple FKs on the same row may
    /// produce overlapping entries (deduped at plan time).
    SetNull {
        positions: Vec<usize>,
        columns: Vec<usize>,
    },
    /// SET DEFAULT — same shape as SetNull but writes the column's
    /// declared DEFAULT value (resolved at plan time). Columns
    /// without a DEFAULT raise an error during planning.
    ///
    /// `plan_fk_parent_updates` also emits this variant for
    /// ON UPDATE CASCADE, with the new parent key values carried
    /// in `defaults`.
    SetDefault {
        positions: Vec<usize>,
        columns: Vec<usize>,
        defaults: Vec<Value>,
    },
}
8097
8098/// v7.6.3 → v7.6.5 — plan FK fallout for a DELETE on a parent table.
8099///
8100/// Walks every table in the catalog looking for FKs whose
8101/// `parent_table` is `parent_table_name`. For each such FK + each
8102/// to-be-deleted parent row:
8103///
8104///   - RESTRICT / NoAction → error, no plan returned
8105///   - CASCADE → child rows get scheduled for deletion; recursive
8106///   - SetNull → child FK column(s) scheduled to be NULL-ed.
8107///     Verified NULL-able at plan time.
8108///   - SetDefault → child FK column(s) scheduled to be reset to
8109///     their declared DEFAULT. Columns without a DEFAULT raise.
8110///
8111/// SET NULL / SET DEFAULT do NOT cascade further — the child row
8112/// stays; only one of its columns mutates.
8113fn plan_fk_parent_deletions(
8114    catalog: &Catalog,
8115    parent_table_name: &str,
8116    to_delete_positions: &[usize],
8117    to_delete_rows: &[Vec<Value>],
8118) -> Result<Vec<FkChildStep>, EngineError> {
8119    use alloc::collections::{BTreeMap, BTreeSet};
8120    if to_delete_rows.is_empty() {
8121        return Ok(Vec::new());
8122    }
8123    let mut delete_plan: BTreeMap<String, BTreeSet<usize>> = BTreeMap::new();
8124    // setnull / setdefault keyed by child_table → (row_idx, col_idx) → optional default
8125    let mut setnull_plan: BTreeMap<String, BTreeSet<(usize, usize)>> = BTreeMap::new();
8126    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8127        BTreeMap::new();
8128    let mut visited: BTreeSet<(String, usize)> = BTreeSet::new();
8129    for &p in to_delete_positions {
8130        visited.insert((parent_table_name.to_string(), p));
8131    }
8132    let mut work: Vec<(String, Vec<Value>)> = to_delete_rows
8133        .iter()
8134        .map(|r| (parent_table_name.to_string(), r.clone()))
8135        .collect();
8136    while let Some((cur_parent, parent_row)) = work.pop() {
8137        for child_name in catalog.table_names() {
8138            let child = catalog
8139                .get(&child_name)
8140                .expect("table_names → catalog.get round-trip is total");
8141            for fk in &child.schema().foreign_keys {
8142                if fk.parent_table != cur_parent {
8143                    continue;
8144                }
8145                let parent_key: Vec<&Value> = fk
8146                    .parent_columns
8147                    .iter()
8148                    .map(|&pi| &parent_row[pi])
8149                    .collect();
8150                if parent_key.iter().any(|v| matches!(v, Value::Null)) {
8151                    continue;
8152                }
8153                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8154                    if child_name == cur_parent
8155                        && visited.contains(&(child_name.clone(), child_row_idx))
8156                    {
8157                        continue;
8158                    }
8159                    let matches_key = fk
8160                        .local_columns
8161                        .iter()
8162                        .enumerate()
8163                        .all(|(i, &li)| child_row.values.get(li) == Some(parent_key[i]));
8164                    if !matches_key {
8165                        continue;
8166                    }
8167                    match fk.on_delete {
8168                        spg_storage::FkAction::Restrict
8169                        | spg_storage::FkAction::NoAction => {
8170                            return Err(EngineError::Unsupported(alloc::format!(
8171                                "FOREIGN KEY violation: DELETE on {cur_parent:?} is \
8172                                 restricted by FK from {child_name:?}.{:?}",
8173                                fk.local_columns,
8174                            )));
8175                        }
8176                        spg_storage::FkAction::Cascade => {
8177                            if visited.insert((child_name.clone(), child_row_idx)) {
8178                                delete_plan
8179                                    .entry(child_name.clone())
8180                                    .or_default()
8181                                    .insert(child_row_idx);
8182                                work.push((child_name.clone(), child_row.values.clone()));
8183                            }
8184                        }
8185                        spg_storage::FkAction::SetNull => {
8186                            // Verify every local FK column is NULL-able.
8187                            for &li in &fk.local_columns {
8188                                let col = child.schema().columns.get(li).ok_or_else(|| {
8189                                    EngineError::Unsupported(alloc::format!(
8190                                        "FK local column {li} missing in {child_name:?}"
8191                                    ))
8192                                })?;
8193                                if !col.nullable {
8194                                    return Err(EngineError::Unsupported(alloc::format!(
8195                                        "FOREIGN KEY ON DELETE SET NULL: column \
8196                                         {child_name:?}.{:?} is NOT NULL — cannot SET NULL",
8197                                        col.name,
8198                                    )));
8199                                }
8200                            }
8201                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8202                            for &li in &fk.local_columns {
8203                                entry.insert((child_row_idx, li));
8204                            }
8205                        }
8206                        spg_storage::FkAction::SetDefault => {
8207                            // Resolve the DEFAULT for every local FK col.
8208                            let entry =
8209                                setdefault_plan.entry(child_name.clone()).or_default();
8210                            for &li in &fk.local_columns {
8211                                let col = child.schema().columns.get(li).ok_or_else(|| {
8212                                    EngineError::Unsupported(alloc::format!(
8213                                        "FK local column {li} missing in {child_name:?}"
8214                                    ))
8215                                })?;
8216                                let default = col.default.clone().ok_or_else(|| {
8217                                    EngineError::Unsupported(alloc::format!(
8218                                        "FOREIGN KEY ON DELETE SET DEFAULT: column \
8219                                         {child_name:?}.{:?} has no DEFAULT declared",
8220                                        col.name,
8221                                    ))
8222                                })?;
8223                                entry.insert((child_row_idx, li), default);
8224                            }
8225                        }
8226                    }
8227                }
8228            }
8229        }
8230    }
8231    // Flatten the three plans into the ordered `FkChildStep` list.
8232    // Deletes are applied last per child (after any null/default
8233    // re-writes on the same child) so a child row that's both
8234    // re-written and then cascade-deleted only ends up deleted —
8235    // but in v7.6.5 SetNull/Cascade never overlap on the same row
8236    // (a single FK chooses exactly one action), so the order is
8237    // mostly a precaution.
8238    let mut steps: Vec<FkChildStep> = Vec::new();
8239    for (child_table, entries) in setnull_plan {
8240        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8241        steps.push(FkChildStep {
8242            child_table,
8243            action: FkChildAction::SetNull { positions, columns },
8244        });
8245    }
8246    for (child_table, entries) in setdefault_plan {
8247        let mut positions = Vec::with_capacity(entries.len());
8248        let mut columns = Vec::with_capacity(entries.len());
8249        let mut defaults = Vec::with_capacity(entries.len());
8250        for ((p, c), v) in entries {
8251            positions.push(p);
8252            columns.push(c);
8253            defaults.push(v);
8254        }
8255        steps.push(FkChildStep {
8256            child_table,
8257            action: FkChildAction::SetDefault {
8258                positions,
8259                columns,
8260                defaults,
8261            },
8262        });
8263    }
8264    for (child_table, positions) in delete_plan {
8265        steps.push(FkChildStep {
8266            child_table,
8267            action: FkChildAction::Delete {
8268                positions: positions.into_iter().collect(),
8269            },
8270        });
8271    }
8272    Ok(steps)
8273}
8274
8275/// v7.6.6 — plan FK fallout for an UPDATE that mutates parent-side
8276/// PK/UNIQUE columns. Walks every other table whose FK references
8277/// `parent_table_name`; for each FK whose parent_columns overlap a
8278/// mutated column, decides the action by `fk.on_update`.
8279///
8280///   - RESTRICT / NoAction → error if any child references the OLD
8281///     value
8282///   - CASCADE → child FK columns get rewritten to the NEW parent
8283///     value (a SetNull-style update step with the new value)
8284///   - SetNull → child FK columns set to NULL
8285///   - SetDefault → child FK columns set to declared default
8286///
8287/// `plan_with_old` is `(row_position, old_values, new_values)` so
8288/// the planner can detect "did this row's parent key actually
8289/// change?" — only rows where at least one referenced parent
8290/// column moved trigger inbound work.
8291fn plan_fk_parent_updates(
8292    catalog: &Catalog,
8293    parent_table_name: &str,
8294    plan_with_old: &[(usize, Vec<Value>, Vec<Value>)],
8295) -> Result<Vec<FkChildStep>, EngineError> {
8296    use alloc::collections::BTreeMap;
8297    if plan_with_old.is_empty() {
8298        return Ok(Vec::new());
8299    }
8300    // For each child table we may touch, build per-child step
8301    // lists. UPDATE never deletes children — `delete_plan` stays
8302    // empty here but is kept structurally aligned with
8303    // `plan_fk_parent_deletions` for future use.
8304    let delete_plan: BTreeMap<String, alloc::collections::BTreeSet<usize>> = BTreeMap::new();
8305    let mut setnull_plan: BTreeMap<
8306        String,
8307        alloc::collections::BTreeSet<(usize, usize)>,
8308    > = BTreeMap::new();
8309    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8310        BTreeMap::new();
8311    // Cascade-update plan: child_table → row_idx → col_idx → new_value
8312    let mut cascade_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
8313
8314    for child_name in catalog.table_names() {
8315        let child = catalog
8316            .get(&child_name)
8317            .expect("table_names → catalog.get total");
8318        for fk in &child.schema().foreign_keys {
8319            if fk.parent_table != parent_table_name {
8320                continue;
8321            }
8322            for (_pos, old_row, new_row) in plan_with_old {
8323                // Did any parent FK column change?
8324                let key_changed = fk
8325                    .parent_columns
8326                    .iter()
8327                    .any(|&pi| old_row.get(pi) != new_row.get(pi));
8328                if !key_changed {
8329                    continue;
8330                }
8331                // The OLD parent key — used to find referring children.
8332                let old_key: Vec<&Value> = fk
8333                    .parent_columns
8334                    .iter()
8335                    .map(|&pi| &old_row[pi])
8336                    .collect();
8337                if old_key.iter().any(|v| matches!(v, Value::Null)) {
8338                    // NULL parent has no children — skip.
8339                    continue;
8340                }
8341                let new_key: Vec<&Value> = fk
8342                    .parent_columns
8343                    .iter()
8344                    .map(|&pi| &new_row[pi])
8345                    .collect();
8346                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8347                    // Self-ref same-row updates: a row updating its
8348                    // own PK doesn't restrict itself.
8349                    if child_name == parent_table_name
8350                        && plan_with_old
8351                            .iter()
8352                            .any(|(p, _, _)| *p == child_row_idx)
8353                    {
8354                        continue;
8355                    }
8356                    let matches_key = fk
8357                        .local_columns
8358                        .iter()
8359                        .enumerate()
8360                        .all(|(i, &li)| child_row.values.get(li) == Some(old_key[i]));
8361                    if !matches_key {
8362                        continue;
8363                    }
8364                    match fk.on_update {
8365                        spg_storage::FkAction::Restrict
8366                        | spg_storage::FkAction::NoAction => {
8367                            return Err(EngineError::Unsupported(alloc::format!(
8368                                "FOREIGN KEY violation: UPDATE on {parent_table_name:?} PK is \
8369                                 restricted by FK from {child_name:?}.{:?}",
8370                                fk.local_columns,
8371                            )));
8372                        }
8373                        spg_storage::FkAction::Cascade => {
8374                            // Rewrite child FK columns to new key.
8375                            let entry = cascade_plan.entry(child_name.clone()).or_default();
8376                            for (i, &li) in fk.local_columns.iter().enumerate() {
8377                                entry.insert((child_row_idx, li), new_key[i].clone());
8378                            }
8379                        }
8380                        spg_storage::FkAction::SetNull => {
8381                            for &li in &fk.local_columns {
8382                                let col = child.schema().columns.get(li).ok_or_else(|| {
8383                                    EngineError::Unsupported(alloc::format!(
8384                                        "FK local column {li} missing in {child_name:?}"
8385                                    ))
8386                                })?;
8387                                if !col.nullable {
8388                                    return Err(EngineError::Unsupported(alloc::format!(
8389                                        "FOREIGN KEY ON UPDATE SET NULL: column \
8390                                         {child_name:?}.{:?} is NOT NULL",
8391                                        col.name,
8392                                    )));
8393                                }
8394                            }
8395                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8396                            for &li in &fk.local_columns {
8397                                entry.insert((child_row_idx, li));
8398                            }
8399                        }
8400                        spg_storage::FkAction::SetDefault => {
8401                            let entry =
8402                                setdefault_plan.entry(child_name.clone()).or_default();
8403                            for &li in &fk.local_columns {
8404                                let col = child.schema().columns.get(li).ok_or_else(|| {
8405                                    EngineError::Unsupported(alloc::format!(
8406                                        "FK local column {li} missing in {child_name:?}"
8407                                    ))
8408                                })?;
8409                                let default = col.default.clone().ok_or_else(|| {
8410                                    EngineError::Unsupported(alloc::format!(
8411                                        "FOREIGN KEY ON UPDATE SET DEFAULT: column \
8412                                         {child_name:?}.{:?} has no DEFAULT",
8413                                        col.name,
8414                                    ))
8415                                })?;
8416                                entry.insert((child_row_idx, li), default);
8417                            }
8418                        }
8419                    }
8420                }
8421            }
8422        }
8423    }
8424    // Flatten into FkChildStep list. UPDATE doesn't produce
8425    // DeleteSteps (CASCADE on UPDATE just rewrites FK values).
8426    let mut steps: Vec<FkChildStep> = Vec::new();
8427    for (child_table, entries) in cascade_plan {
8428        let mut positions = Vec::with_capacity(entries.len());
8429        let mut columns = Vec::with_capacity(entries.len());
8430        let mut defaults = Vec::with_capacity(entries.len());
8431        for ((p, c), v) in entries {
8432            positions.push(p);
8433            columns.push(c);
8434            defaults.push(v);
8435        }
8436        // We reuse `FkChildAction::SetDefault` for cascade-update:
8437        // both shapes are "write a known value into specific cells"
8438        // — `apply_per_cell_writes` doesn't care whether the value
8439        // came from a DEFAULT declaration or a new parent key.
8440        steps.push(FkChildStep {
8441            child_table,
8442            action: FkChildAction::SetDefault {
8443                positions,
8444                columns,
8445                defaults,
8446            },
8447        });
8448    }
8449    for (child_table, entries) in setnull_plan {
8450        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8451        steps.push(FkChildStep {
8452            child_table,
8453            action: FkChildAction::SetNull { positions, columns },
8454        });
8455    }
8456    for (child_table, entries) in setdefault_plan {
8457        let mut positions = Vec::with_capacity(entries.len());
8458        let mut columns = Vec::with_capacity(entries.len());
8459        let mut defaults = Vec::with_capacity(entries.len());
8460        for ((p, c), v) in entries {
8461            positions.push(p);
8462            columns.push(c);
8463            defaults.push(v);
8464        }
8465        steps.push(FkChildStep {
8466            child_table,
8467            action: FkChildAction::SetDefault {
8468                positions,
8469                columns,
8470                defaults,
8471            },
8472        });
8473    }
8474    let _ = delete_plan; // UPDATE never deletes children.
8475    Ok(steps)
8476}
8477
8478/// v7.6.5 — apply one FK child step to the catalog. Encapsulates
8479/// the three action variants so the DELETE executor stays a
8480/// simple loop over the planned steps.
8481fn apply_fk_child_step(
8482    catalog: &mut Catalog,
8483    step: &FkChildStep,
8484) -> Result<(), EngineError> {
8485    let child = catalog.get_mut(&step.child_table).ok_or_else(|| {
8486        EngineError::Storage(StorageError::TableNotFound {
8487            name: step.child_table.clone(),
8488        })
8489    })?;
8490    match &step.action {
8491        FkChildAction::Delete { positions } => {
8492            let _ = child.delete_rows(positions);
8493        }
8494        FkChildAction::SetNull { positions, columns } => {
8495            apply_per_cell_writes(child, positions, columns, |_| Value::Null)?;
8496        }
8497        FkChildAction::SetDefault {
8498            positions,
8499            columns,
8500            defaults,
8501        } => {
8502            apply_per_cell_writes(child, positions, columns, |i| defaults[i].clone())?;
8503        }
8504    }
8505    Ok(())
8506}
8507
8508/// v7.6.5 — write new values into selected child cells via
8509/// `Table::update_row` (the catalog's existing UPDATE entry).
8510/// Groups writes by row position so multi-column updates on the
8511/// same row only call `update_row` once. `value_for(i)` produces
8512/// the new value for the i-th (position, column) entry.
8513fn apply_per_cell_writes(
8514    child: &mut spg_storage::Table,
8515    positions: &[usize],
8516    columns: &[usize],
8517    mut value_for: impl FnMut(usize) -> Value,
8518) -> Result<(), EngineError> {
8519    use alloc::collections::BTreeMap;
8520    let mut by_row: BTreeMap<usize, Vec<(usize, Value)>> = BTreeMap::new();
8521    for i in 0..positions.len() {
8522        by_row
8523            .entry(positions[i])
8524            .or_default()
8525            .push((columns[i], value_for(i)));
8526    }
8527    for (pos, mutations) in by_row {
8528        let mut new_values = child.rows()[pos].values.clone();
8529        for (col, v) in mutations {
8530            if let Some(slot) = new_values.get_mut(col) {
8531                *slot = v;
8532            }
8533        }
8534        child
8535            .update_row(pos, new_values)
8536            .map_err(EngineError::Storage)?;
8537    }
8538    Ok(())
8539}
8540
8541fn fk_action_sql_to_storage(a: spg_sql::ast::FkAction) -> spg_storage::FkAction {
8542    match a {
8543        spg_sql::ast::FkAction::Restrict => spg_storage::FkAction::Restrict,
8544        spg_sql::ast::FkAction::Cascade => spg_storage::FkAction::Cascade,
8545        spg_sql::ast::FkAction::SetNull => spg_storage::FkAction::SetNull,
8546        spg_sql::ast::FkAction::SetDefault => spg_storage::FkAction::SetDefault,
8547        spg_sql::ast::FkAction::NoAction => spg_storage::FkAction::NoAction,
8548    }
8549}
8550
8551/// v7.9.21 — resolve a column's DEFAULT for INSERT-time
8552/// default-fill. Free fn (rather than `&self`) so callers
8553/// with an active `&mut Table` borrow can still use it.
8554/// Literal defaults take the cached path (`col.default`);
8555/// runtime defaults hit `clock_fn` at each call. mailrs G4.
8556fn resolve_column_default_free(
8557    col: &ColumnSchema,
8558    clock_fn: Option<ClockFn>,
8559) -> Result<Value, EngineError> {
8560    if let Some(rt) = &col.runtime_default {
8561        return eval_runtime_default_free(rt, col.ty, clock_fn);
8562    }
8563    Ok(col.default.clone().unwrap_or(Value::Null))
8564}
8565
8566fn eval_runtime_default_free(
8567    rt: &str,
8568    ty: DataType,
8569    clock_fn: Option<ClockFn>,
8570) -> Result<Value, EngineError> {
8571    let s = rt.trim().to_ascii_lowercase();
8572    let canonical = s.trim_end_matches("()");
8573    let now_us = match clock_fn {
8574        Some(f) => f(),
8575        None => 0,
8576    };
8577    let v = match canonical {
8578        "now" | "current_timestamp" | "localtimestamp" => {
8579            Value::Timestamp(now_us)
8580        }
8581        "current_date" => Value::Date((now_us / 86_400_000_000) as i32),
8582        "current_time" | "localtime" => Value::Timestamp(now_us),
8583        other => {
8584            return Err(EngineError::Unsupported(alloc::format!(
8585                "runtime DEFAULT expression {other:?} not supported \
8586                 (v7.9.21 whitelist: now() / current_timestamp / \
8587                 current_date / current_time / localtimestamp / \
8588                 localtime)"
8589            )));
8590        }
8591    };
8592    coerce_value(v, ty, "DEFAULT", 0)
8593}
8594
8595/// v7.9.21 — true when a DEFAULT expression needs INSERT-time
8596/// evaluation rather than being cacheable as a literal Value.
8597/// FunctionCall is the immediate case (`now()`,
8598/// `current_timestamp`). Literal expressions and simple sign-
8599/// flipped numerics still take the static-cache path.
8600fn is_runtime_default_expr(expr: &Expr) -> bool {
8601    match expr {
8602        Expr::FunctionCall { .. } => true,
8603        Expr::Unary { expr, .. } => is_runtime_default_expr(expr),
8604        _ => false,
8605    }
8606}
8607
8608fn column_def_to_schema(c: ColumnDef) -> Result<ColumnSchema, EngineError> {
8609    let ty = column_type_to_data_type(c.ty);
8610    let mut schema = ColumnSchema::new(c.name.clone(), ty, c.nullable);
8611    if let Some(default_expr) = c.default {
8612        // v7.9.21 — distinguish literal defaults (evaluated once
8613        // at CREATE TABLE) from expression defaults (deferred to
8614        // INSERT). Function calls (`now()`, `current_timestamp`
8615        // — see v7.9.20 keyword promotion) take the runtime path.
8616        // Literals continue to cache. mailrs G4.
8617        if is_runtime_default_expr(&default_expr) {
8618            let display = alloc::format!("{default_expr}");
8619            schema = schema.with_runtime_default(display);
8620        } else {
8621            let raw = literal_expr_to_value(default_expr)?;
8622            let coerced = coerce_value(raw, ty, &c.name, 0)?;
8623            schema = schema.with_default(coerced);
8624        }
8625    }
8626    if c.auto_increment {
8627        // AUTO_INCREMENT only makes sense on integer-shaped columns.
8628        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
8629            return Err(EngineError::Unsupported(alloc::format!(
8630                "AUTO_INCREMENT requires an integer column type, got {ty:?}"
8631            )));
8632        }
8633        schema = schema.with_auto_increment();
8634    }
8635    Ok(schema)
8636}
8637
8638const fn column_type_to_data_type(t: ColumnTypeName) -> DataType {
8639    match t {
8640        ColumnTypeName::SmallInt => DataType::SmallInt,
8641        ColumnTypeName::Int => DataType::Int,
8642        ColumnTypeName::BigInt => DataType::BigInt,
8643        ColumnTypeName::Float => DataType::Float,
8644        ColumnTypeName::Text => DataType::Text,
8645        ColumnTypeName::Varchar(n) => DataType::Varchar(n),
8646        ColumnTypeName::Char(n) => DataType::Char(n),
8647        ColumnTypeName::Bool => DataType::Bool,
8648        ColumnTypeName::Vector { dim, encoding } => DataType::Vector {
8649            dim,
8650            encoding: match encoding {
8651                SqlVecEncoding::F32 => VecEncoding::F32,
8652                SqlVecEncoding::Sq8 => VecEncoding::Sq8,
8653                SqlVecEncoding::F16 => VecEncoding::F16,
8654            },
8655        },
8656        ColumnTypeName::Numeric(precision, scale) => DataType::Numeric { precision, scale },
8657        ColumnTypeName::Date => DataType::Date,
8658        ColumnTypeName::Timestamp => DataType::Timestamp,
8659        ColumnTypeName::Timestamptz => DataType::Timestamptz,
8660        ColumnTypeName::Json => DataType::Json,
8661        ColumnTypeName::Jsonb => DataType::Jsonb,
8662    }
8663}
8664
8665/// Convert an INSERT VALUES expression to a storage Value. Supports literal
8666/// expressions, unary-minus over numeric literals, and pgvector-style
8667/// `'[..]'::vector` cast (v1.2). Anything more complex returns `Unsupported`.
8668fn literal_expr_to_value(expr: Expr) -> Result<Value, EngineError> {
8669    match expr {
8670        Expr::Literal(l) => Ok(literal_to_value(l)),
8671        Expr::Cast { expr, target } => {
8672            let inner_value = literal_expr_to_value(*expr)?;
8673            crate::eval::cast_value(inner_value, target).map_err(EngineError::Eval)
8674        }
8675        Expr::Unary {
8676            op: UnOp::Neg,
8677            expr,
8678        } => match *expr {
8679            Expr::Literal(Literal::Integer(n)) => {
8680                // Fold to i32 if it fits, else BigInt. Parser emits Integer(i64)
8681                // — overflow on negate of i64::MIN is the one edge case.
8682                let neg = n.checked_neg().ok_or_else(|| {
8683                    EngineError::Unsupported("integer literal overflow on negation".into())
8684                })?;
8685                Ok(int_value_for(neg))
8686            }
8687            Expr::Literal(Literal::Float(x)) => Ok(Value::Float(-x)),
8688            other => Err(EngineError::Unsupported(alloc::format!(
8689                "unary minus over non-literal expression: {other:?}"
8690            ))),
8691        },
8692        other => Err(EngineError::Unsupported(alloc::format!(
8693            "non-literal INSERT value expression: {other:?}"
8694        ))),
8695    }
8696}
8697
8698fn literal_to_value(l: Literal) -> Value {
8699    match l {
8700        Literal::Integer(n) => int_value_for(n),
8701        Literal::Float(x) => Value::Float(x),
8702        Literal::String(s) => Value::Text(s),
8703        Literal::Bool(b) => Value::Bool(b),
8704        Literal::Null => Value::Null,
8705        Literal::Vector(v) => Value::Vector(v),
8706        Literal::Interval { months, micros, .. } => Value::Interval { months, micros },
8707    }
8708}
8709
8710/// Pick `Int` (`i32`) when the literal fits, else `BigInt`. `INT` vs `BIGINT`
8711/// columns will still enforce the right tag downstream — this is just the
8712/// default we synthesise from an unannotated integer literal.
8713fn int_value_for(n: i64) -> Value {
8714    if let Ok(small) = i32::try_from(n) {
8715        Value::Int(small)
8716    } else {
8717        Value::BigInt(n)
8718    }
8719}
8720
8721/// Widen / narrow `v` to fit `expected`. Numerics permit safe widening
8722/// (`Int → BigInt`, `Int/BigInt → Float`) and best-effort narrowing
8723/// (`BigInt → Int` succeeds only when the value fits in `i32`). Everything
8724/// else returns `TypeMismatch` carrying the column name for caller diagnostics.
8725/// `NULL` is always permitted; the nullability check happens later in storage.
8726#[allow(clippy::too_many_lines)]
8727fn coerce_value(
8728    v: Value,
8729    expected: DataType,
8730    col_name: &str,
8731    position: usize,
8732) -> Result<Value, EngineError> {
8733    if v.is_null() {
8734        return Ok(Value::Null);
8735    }
8736    let actual = v.data_type().expect("non-null");
8737    if actual == expected {
8738        return Ok(v);
8739    }
8740    let coerced =
8741        match (v, expected) {
8742            (Value::Int(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
8743            (Value::Int(n), DataType::Float) => Some(Value::Float(f64::from(n))),
8744            (Value::Int(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
8745            (Value::Int(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
8746                i128::from(n),
8747                precision,
8748                scale,
8749                col_name,
8750            )?),
8751            (Value::SmallInt(n), DataType::Int) => Some(Value::Int(i32::from(n))),
8752            (Value::SmallInt(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
8753            (Value::SmallInt(n), DataType::Float) => Some(Value::Float(f64::from(n))),
8754            (Value::SmallInt(n), DataType::Numeric { precision, scale }) => Some(
8755                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
8756            ),
8757            (Value::BigInt(n), DataType::Int) => i32::try_from(n).ok().map(Value::Int),
8758            (Value::BigInt(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
8759            #[allow(clippy::cast_precision_loss)]
8760            (Value::BigInt(n), DataType::Float) => Some(Value::Float(n as f64)),
8761            (Value::BigInt(n), DataType::Numeric { precision, scale }) => Some(
8762                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
8763            ),
8764            (Value::Float(x), DataType::Numeric { precision, scale }) => {
8765                Some(numeric_from_float(x, precision, scale, col_name)?)
8766            }
8767            // Text → DATE / TIMESTAMP: parse canonical text forms.
8768            (Value::Text(s), DataType::Date) => {
8769                let d = eval::parse_date_literal(&s).ok_or_else(|| {
8770                    EngineError::Eval(EvalError::TypeMismatch {
8771                        detail: alloc::format!(
8772                            "cannot parse {s:?} as DATE for column `{col_name}`"
8773                        ),
8774                    })
8775                })?;
8776                Some(Value::Date(d))
8777            }
8778            // v4.9: Text ↔ JSON coercion. No structural validation —
8779            // any text literal is accepted; the responsibility for
8780            // valid JSON lies with the producer.
8781            (Value::Text(s), DataType::Json | DataType::Jsonb) => Some(Value::Json(s)),
8782            (Value::Json(s), DataType::Text) => Some(Value::Text(s)),
8783            (Value::Text(s), DataType::Timestamp | DataType::Timestamptz) => {
8784                let t = eval::parse_timestamp_literal(&s).ok_or_else(|| {
8785                    EngineError::Eval(EvalError::TypeMismatch {
8786                        detail: alloc::format!(
8787                            "cannot parse {s:?} as TIMESTAMP for column `{col_name}`"
8788                        ),
8789                    })
8790                })?;
8791                Some(Value::Timestamp(t))
8792            }
8793            // DATE ↔ TIMESTAMP convertibility (DATE → midnight,
8794            // TIMESTAMP → day truncation).
8795            (Value::Date(d), DataType::Timestamp | DataType::Timestamptz) => {
8796                Some(Value::Timestamp(i64::from(d) * 86_400_000_000))
8797            }
8798            // v7.9.21 — Value::Timestamp lands in either Timestamp
8799            // or Timestamptz columns; the on-disk layout is the
8800            // same i64 microseconds UTC.
8801            (Value::Timestamp(t), DataType::Timestamptz) => Some(Value::Timestamp(t)),
8802            (Value::Timestamp(t), DataType::Date) => {
8803                let days = t.div_euclid(86_400_000_000);
8804                i32::try_from(days).ok().map(Value::Date)
8805            }
8806            (
8807                Value::Numeric {
8808                    scaled,
8809                    scale: src_scale,
8810                },
8811                DataType::Numeric { precision, scale },
8812            ) => Some(numeric_rescale(
8813                scaled, src_scale, precision, scale, col_name,
8814            )?),
8815            #[allow(clippy::cast_precision_loss)]
8816            (Value::Numeric { scaled, scale }, DataType::Float) => {
8817                let mut div = 1.0_f64;
8818                for _ in 0..scale {
8819                    div *= 10.0;
8820                }
8821                Some(Value::Float((scaled as f64) / div))
8822            }
8823            (Value::Numeric { scaled, scale }, DataType::Int) => {
8824                let truncated = numeric_truncate_to_integer(scaled, scale);
8825                i32::try_from(truncated).ok().map(Value::Int)
8826            }
8827            (Value::Numeric { scaled, scale }, DataType::BigInt) => {
8828                let truncated = numeric_truncate_to_integer(scaled, scale);
8829                i64::try_from(truncated).ok().map(Value::BigInt)
8830            }
8831            (Value::Numeric { scaled, scale }, DataType::SmallInt) => {
8832                let truncated = numeric_truncate_to_integer(scaled, scale);
8833                i16::try_from(truncated).ok().map(Value::SmallInt)
8834            }
8835            // VARCHAR(n) enforces an upper bound on character count.
8836            (Value::Text(s), DataType::Varchar(max)) => {
8837                if u32::try_from(s.chars().count()).unwrap_or(u32::MAX) <= max {
8838                    Some(Value::Text(s))
8839                } else {
8840                    return Err(EngineError::Unsupported(alloc::format!(
8841                        "value for VARCHAR({max}) column `{col_name}` exceeds length: \
8842                     {} chars",
8843                        s.chars().count()
8844                    )));
8845                }
8846            }
8847            // v6.0.1: f32 → SQ8 INSERT-time quantisation. Triggered
8848            // when the column declares `VECTOR(N) USING SQ8` and
8849            // the INSERT VALUES expression yields a raw f32 vector
8850            // (the normal pgvector-shape literal). Dim mismatch
8851            // falls through the `_ => None` arm and surfaces as
8852            // `TypeMismatch` with the expected SQ8 column type —
8853            // matching the F32 path's existing error.
8854            (
8855                Value::Vector(v),
8856                DataType::Vector {
8857                    dim,
8858                    encoding: VecEncoding::Sq8,
8859                },
8860            ) if v.len() == dim as usize => {
8861                Some(Value::Sq8Vector(spg_storage::quantize::quantize(&v)))
8862            }
8863            // v6.0.3: f32 → f16 INSERT-time conversion for HALF
8864            // columns. Bit-exact at the storage layer (modulo
8865            // half-precision rounding); no rerank pass needed at
8866            // search time.
8867            (
8868                Value::Vector(v),
8869                DataType::Vector {
8870                    dim,
8871                    encoding: VecEncoding::F16,
8872                },
8873            ) if v.len() == dim as usize => Some(Value::HalfVector(
8874                spg_storage::halfvec::HalfVector::from_f32_slice(&v),
8875            )),
8876            // CHAR(n) right-pads with U+0020 to exactly n chars; if the input
8877            // is already longer we reject (PG truncates trailing-space-only;
8878            // staying strict for v1).
8879            (Value::Text(s), DataType::Char(size)) => {
8880                let len = u32::try_from(s.chars().count()).unwrap_or(u32::MAX);
8881                if len > size {
8882                    return Err(EngineError::Unsupported(alloc::format!(
8883                        "value for CHAR({size}) column `{col_name}` exceeds length: \
8884                     {len} chars"
8885                    )));
8886                }
8887                let need = (size - len) as usize;
8888                let mut padded = s;
8889                padded.reserve(need);
8890                for _ in 0..need {
8891                    padded.push(' ');
8892                }
8893                Some(Value::Text(padded))
8894            }
8895            _ => None,
8896        };
8897    coerced.ok_or(EngineError::Storage(StorageError::TypeMismatch {
8898        column: col_name.into(),
8899        expected,
8900        actual,
8901        position,
8902    }))
8903}
8904
8905#[cfg(test)]
8906mod tests {
8907    use super::*;
8908    use alloc::vec;
8909
8910    fn unwrap_command_ok(r: &QueryResult) -> usize {
8911        match r {
8912            QueryResult::CommandOk { affected, .. } => *affected,
8913            QueryResult::Rows { .. } => panic!("expected CommandOk, got Rows"),
8914        }
8915    }
8916
8917    #[test]
8918    fn create_table_registers_schema() {
8919        let mut e = Engine::new();
8920        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT)")
8921            .unwrap();
8922        assert_eq!(e.catalog().table_count(), 1);
8923        let t = e.catalog().get("foo").unwrap();
8924        assert_eq!(t.schema().columns.len(), 2);
8925        assert_eq!(t.schema().columns[0].ty, DataType::Int);
8926        assert!(!t.schema().columns[0].nullable);
8927        assert_eq!(t.schema().columns[1].ty, DataType::Text);
8928    }
8929
8930    #[test]
8931    fn create_table_vector_default_is_f32_encoded() {
8932        let mut e = Engine::new();
8933        e.execute("CREATE TABLE t (v VECTOR(8))").unwrap();
8934        let t = e.catalog().get("t").unwrap();
8935        assert_eq!(
8936            t.schema().columns[0].ty,
8937            DataType::Vector {
8938                dim: 8,
8939                encoding: VecEncoding::F32,
8940            },
8941        );
8942    }
8943
8944    #[test]
8945    fn create_table_vector_using_sq8_succeeds() {
8946        // v6.0.1 step 3: the step-1 fence in `column_def_to_schema`
8947        // is lifted. CREATE TABLE persists an SQ8 column type in
8948        // the catalog; INSERT (next test) quantises raw f32 input.
8949        let mut e = Engine::new();
8950        e.execute("CREATE TABLE t (v VECTOR(8) USING SQ8)").unwrap();
8951        let t = e.catalog().get("t").unwrap();
8952        assert_eq!(
8953            t.schema().columns[0].ty,
8954            DataType::Vector {
8955                dim: 8,
8956                encoding: VecEncoding::Sq8,
8957            },
8958        );
8959    }
8960
8961    #[test]
8962    fn insert_into_sq8_column_quantises_f32_payload() {
8963        // v6.0.1 step 3: INSERT-time `coerce_value` rewrites a raw
8964        // `Value::Vector(Vec<f32>)` literal into the column's
8965        // quantised representation. The row that lands in the
8966        // catalog must therefore hold a `Value::Sq8Vector`, not the
8967        // original f32 buffer — that's the bit that delivers the
8968        // 4× compression target.
8969        let mut e = Engine::new();
8970        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
8971        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
8972            .unwrap();
8973        let t = e.catalog().get("t").unwrap();
8974        assert_eq!(t.rows().len(), 1);
8975        match &t.rows()[0].values[0] {
8976            Value::Sq8Vector(q) => {
8977                assert_eq!(q.bytes.len(), 4);
8978                // min/max are derived from the payload: min=0.0, max=1.0.
8979                assert!((q.min - 0.0).abs() < 1e-6);
8980                assert!((q.max - 1.0).abs() < 1e-6);
8981            }
8982            other => panic!("expected Sq8Vector cell, got {other:?}"),
8983        }
8984    }
8985
8986    #[test]
8987    fn create_table_vector_using_half_succeeds_and_insert_converts_to_f16() {
8988        // v6.0.3: CREATE TABLE accepts USING HALF; INSERT path
8989        // converts the incoming `Value::Vector(Vec<f32>)` cell
8990        // into `Value::HalfVector(HalfVector)` via the new
8991        // `coerce_value` arm. The dequantised round-trip is
8992        // bit-exact for f16-representable values, so 0.0 / 0.25
8993        // / 0.5 / 1.0 hit their grid points exactly.
8994        let mut e = Engine::new();
8995        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
8996            .unwrap();
8997        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
8998            .unwrap();
8999        let t = e.catalog().get("t").unwrap();
9000        assert_eq!(t.rows().len(), 1);
9001        match &t.rows()[0].values[0] {
9002            Value::HalfVector(h) => {
9003                assert_eq!(h.dim(), 4);
9004                let back = h.to_f32_vec();
9005                let expected = alloc::vec![0.0_f32, 0.25, 0.5, 1.0];
9006                for (g, e) in back.iter().zip(expected.iter()) {
9007                    assert!(
9008                        (g - e).abs() < 1e-6,
9009                        "{g} vs {e} should be exact on f16 grid"
9010                    );
9011                }
9012            }
9013            other => panic!("expected HalfVector cell, got {other:?}"),
9014        }
9015    }
9016
9017    #[test]
9018    fn alter_index_rebuild_in_place_succeeds() {
9019        // v6.0.4: bare REBUILD (no encoding switch) walks every
9020        // row again to rebuild the NSW graph. Verifies the engine
9021        // dispatch + storage helper plumbing without changing any
9022        // cell encoding.
9023        let mut e = Engine::new();
9024        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
9025            .unwrap();
9026        for i in 0..8_i32 {
9027            #[allow(clippy::cast_precision_loss)]
9028            let base = (i as f32) * 0.1;
9029            e.execute(&alloc::format!(
9030                "INSERT INTO t VALUES ({i}, [{base}, {b1}, {b2}])",
9031                b1 = base + 0.01,
9032                b2 = base + 0.02,
9033            ))
9034            .unwrap();
9035        }
9036        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
9037        e.execute("ALTER INDEX t_idx REBUILD").unwrap();
9038        // Schema encoding stays F32 (no encoding clause).
9039        assert_eq!(
9040            e.catalog().get("t").unwrap().schema().columns[1].ty,
9041            DataType::Vector {
9042                dim: 3,
9043                encoding: VecEncoding::F32,
9044            },
9045        );
9046    }
9047
9048    #[test]
9049    fn alter_index_rebuild_with_encoding_switches_cell_type() {
9050        // v6.0.4: REBUILD WITH (encoding = SQ8) recodes every
9051        // stored cell from F32 → SQ8 + rebuilds the graph atop the
9052        // new encoding. Post-rebuild, cells must be Sq8Vector and
9053        // the schema must report encoding = Sq8.
9054        let mut e = Engine::new();
9055        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(4) NOT NULL)")
9056            .unwrap();
9057        e.execute("INSERT INTO t VALUES (1, [0.0, 0.25, 0.5, 1.0])")
9058            .unwrap();
9059        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
9060        e.execute("ALTER INDEX t_idx REBUILD WITH (encoding = SQ8)")
9061            .unwrap();
9062        let t = e.catalog().get("t").unwrap();
9063        assert_eq!(
9064            t.schema().columns[1].ty,
9065            DataType::Vector {
9066                dim: 4,
9067                encoding: VecEncoding::Sq8,
9068            },
9069        );
9070        assert!(matches!(t.rows()[0].values[1], Value::Sq8Vector(_)));
9071    }
9072
9073    #[test]
9074    fn alter_index_rebuild_unknown_index_errors() {
9075        let mut e = Engine::new();
9076        let err = e.execute("ALTER INDEX nope REBUILD").unwrap_err();
9077        assert!(
9078            matches!(
9079                &err,
9080                EngineError::Storage(StorageError::IndexNotFound { name }) if name == "nope"
9081            ),
9082            "got: {err}"
9083        );
9084    }
9085
9086    #[test]
9087    fn alter_index_rebuild_on_btree_index_errors() {
9088        // REBUILD on a B-tree index has no semantic meaning in
9089        // v6.0.4 — rejected at the storage layer with `Unsupported`.
9090        let mut e = Engine::new();
9091        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9092        e.execute("INSERT INTO t VALUES (1)").unwrap();
9093        e.execute("CREATE INDEX t_idx ON t (id)").unwrap();
9094        let err = e.execute("ALTER INDEX t_idx REBUILD").unwrap_err();
9095        assert!(
9096            matches!(&err, EngineError::Storage(StorageError::Unsupported(_))),
9097            "got: {err}"
9098        );
9099    }
9100
9101    #[test]
9102    fn prepared_insert_substitutes_placeholders() {
9103        // v6.1.1: prepare() parses once; execute_prepared() walks the
9104        // AST and replaces $1/$2 with the param Values BEFORE the
9105        // dispatch sees them. Same logical result as a simple-query
9106        // INSERT, but parse happens once per *statement*, not per
9107        // execution.
9108        let mut e = Engine::new();
9109        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT NOT NULL)")
9110            .unwrap();
9111        let stmt = e.prepare("INSERT INTO t VALUES ($1, $2)").unwrap();
9112        for (id, name) in [(1, "alice"), (2, "bob"), (3, "carol")] {
9113            e.execute_prepared(
9114                stmt.clone(),
9115                &[Value::Int(id), Value::Text(name.into())],
9116            )
9117            .unwrap();
9118        }
9119        // Read back via simple-query SELECT.
9120        let rows_result = e.execute("SELECT id, name FROM t").unwrap();
9121        let QueryResult::Rows { rows, .. } = rows_result else {
9122            panic!("expected Rows")
9123        };
9124        assert_eq!(rows.len(), 3);
9125    }
9126
9127    #[test]
9128    fn prepared_select_with_placeholder_filters_rows() {
9129        let mut e = Engine::new();
9130        e.execute("CREATE TABLE t (id INT NOT NULL, v INT NOT NULL)")
9131            .unwrap();
9132        for i in 0..10_i32 {
9133            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, {})", i * 7))
9134                .unwrap();
9135        }
9136        let stmt = e
9137            .prepare("SELECT id FROM t WHERE v = $1")
9138            .unwrap();
9139        let QueryResult::Rows { rows, .. } = e
9140            .execute_prepared(stmt, &[Value::Int(35)])
9141            .unwrap()
9142        else {
9143            panic!("expected Rows")
9144        };
9145        // v = 35 means i*7 = 35 → i = 5.
9146        assert_eq!(rows.len(), 1);
9147        assert_eq!(rows[0].values[0], Value::Int(5));
9148    }
9149
9150    #[test]
9151    fn prepared_too_few_params_errors() {
9152        let mut e = Engine::new();
9153        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9154        let stmt = e.prepare("INSERT INTO t VALUES ($1)").unwrap();
9155        let err = e.execute_prepared(stmt, &[]).unwrap_err();
9156        assert!(
9157            matches!(
9158                &err,
9159                EngineError::Eval(EvalError::PlaceholderOutOfRange { n: 1, bound: 0 })
9160            ),
9161            "got: {err}"
9162        );
9163    }
9164
9165    #[test]
9166    fn insert_into_half_column_dim_mismatch_errors() {
9167        let mut e = Engine::new();
9168        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
9169            .unwrap();
9170        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
9171        assert!(matches!(
9172            &err,
9173            EngineError::Storage(StorageError::TypeMismatch { .. })
9174        ));
9175    }
9176
9177    #[test]
9178    fn insert_into_sq8_column_dim_mismatch_errors() {
9179        // Dim mismatch falls through the `coerce_value` Vector→Sq8
9180        // arm's guard and surfaces as `TypeMismatch` — the same
9181        // error the F32 path produces today, so client error
9182        // handling stays uniform across encodings.
9183        let mut e = Engine::new();
9184        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
9185        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
9186        assert!(
9187            matches!(
9188                &err,
9189                EngineError::Storage(StorageError::TypeMismatch { .. })
9190            ),
9191            "got: {err}",
9192        );
9193    }
9194
9195    #[test]
9196    fn create_table_duplicate_errors() {
9197        let mut e = Engine::new();
9198        e.execute("CREATE TABLE foo (a INT)").unwrap();
9199        let err = e.execute("CREATE TABLE foo (a INT)").unwrap_err();
9200        assert!(matches!(
9201            err,
9202            EngineError::Storage(StorageError::DuplicateTable { ref name }) if name == "foo"
9203        ));
9204    }
9205
9206    #[test]
9207    fn insert_into_unknown_table_errors() {
9208        let mut e = Engine::new();
9209        let err = e.execute("INSERT INTO ghost VALUES (1)").unwrap_err();
9210        assert!(matches!(
9211            err,
9212            EngineError::Storage(StorageError::TableNotFound { ref name }) if name == "ghost"
9213        ));
9214    }
9215
9216    #[test]
9217    fn insert_happy_path_reports_one_affected() {
9218        let mut e = Engine::new();
9219        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9220        let r = e.execute("INSERT INTO foo VALUES (42)").unwrap();
9221        assert_eq!(unwrap_command_ok(&r), 1);
9222        assert_eq!(e.catalog().get("foo").unwrap().row_count(), 1);
9223    }
9224
9225    #[test]
9226    fn insert_arity_mismatch_propagates() {
9227        let mut e = Engine::new();
9228        e.execute("CREATE TABLE foo (a INT, b TEXT)").unwrap();
9229        let err = e.execute("INSERT INTO foo VALUES (1)").unwrap_err();
9230        assert!(matches!(
9231            err,
9232            EngineError::Storage(StorageError::ArityMismatch { .. })
9233        ));
9234    }
9235
9236    #[test]
9237    fn insert_negative_integer_via_unary_minus() {
9238        let mut e = Engine::new();
9239        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9240        e.execute("INSERT INTO foo VALUES (-7)").unwrap();
9241        let rows = e.catalog().get("foo").unwrap().rows();
9242        assert_eq!(rows[0].values[0], Value::Int(-7));
9243    }
9244
9245    #[test]
9246    fn insert_non_literal_expr_unsupported() {
9247        let mut e = Engine::new();
9248        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9249        let err = e.execute("INSERT INTO foo VALUES (1 + 2)").unwrap_err();
9250        assert!(matches!(err, EngineError::Unsupported(_)));
9251    }
9252
9253    #[test]
9254    fn select_star_returns_all_rows_in_insertion_order() {
9255        let mut e = Engine::new();
9256        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT NOT NULL)")
9257            .unwrap();
9258        e.execute("INSERT INTO foo VALUES (1, 'one')").unwrap();
9259        e.execute("INSERT INTO foo VALUES (2, 'two')").unwrap();
9260        e.execute("INSERT INTO foo VALUES (3, 'three')").unwrap();
9261
9262        let r = e.execute("SELECT * FROM foo").unwrap();
9263        let QueryResult::Rows { columns, rows } = r else {
9264            panic!("expected Rows")
9265        };
9266        assert_eq!(columns.len(), 2);
9267        assert_eq!(columns[0].name, "a");
9268        assert_eq!(rows.len(), 3);
9269        assert_eq!(
9270            rows[1].values,
9271            vec![Value::Int(2), Value::Text("two".into())]
9272        );
9273    }
9274
9275    #[test]
9276    fn select_star_on_empty_table_returns_zero_rows() {
9277        let mut e = Engine::new();
9278        e.execute("CREATE TABLE foo (a INT)").unwrap();
9279        let r = e.execute("SELECT * FROM foo").unwrap();
9280        match r {
9281            QueryResult::Rows { rows, .. } => assert!(rows.is_empty()),
9282            QueryResult::CommandOk { .. } => panic!("expected Rows"),
9283        }
9284    }
9285
9286    // --- v0.4: WHERE + projection ------------------------------------------
9287
9288    fn make_three_row_users(e: &mut Engine) {
9289        e.execute("CREATE TABLE users (id INT NOT NULL, name TEXT NOT NULL, score INT)")
9290            .unwrap();
9291        e.execute("INSERT INTO users VALUES (1, 'alice', 90)")
9292            .unwrap();
9293        e.execute("INSERT INTO users VALUES (2, 'bob', NULL)")
9294            .unwrap();
9295        e.execute("INSERT INTO users VALUES (3, 'cara', 70)")
9296            .unwrap();
9297    }
9298
9299    fn unwrap_rows(r: QueryResult) -> (Vec<ColumnSchema>, Vec<Row>) {
9300        match r {
9301            QueryResult::Rows { columns, rows } => (columns, rows),
9302            QueryResult::CommandOk { .. } => panic!("expected Rows"),
9303        }
9304    }
9305
9306    #[test]
9307    fn where_filter_passes_only_true_rows() {
9308        let mut e = Engine::new();
9309        make_three_row_users(&mut e);
9310        let r = e.execute("SELECT * FROM users WHERE id > 1").unwrap();
9311        let (_, rows) = unwrap_rows(r);
9312        assert_eq!(rows.len(), 2);
9313        assert_eq!(rows[0].values[0], Value::Int(2));
9314        assert_eq!(rows[1].values[0], Value::Int(3));
9315    }
9316
9317    #[test]
9318    fn where_with_null_result_filters_out_row() {
9319        let mut e = Engine::new();
9320        make_three_row_users(&mut e);
9321        // score is NULL for bob → score > 80 is NULL → row excluded
9322        let r = e.execute("SELECT * FROM users WHERE score > 80").unwrap();
9323        let (_, rows) = unwrap_rows(r);
9324        assert_eq!(rows.len(), 1);
9325        assert_eq!(rows[0].values[1], Value::Text("alice".into()));
9326    }
9327
9328    #[test]
9329    fn projection_named_columns() {
9330        let mut e = Engine::new();
9331        make_three_row_users(&mut e);
9332        let r = e.execute("SELECT name, score FROM users").unwrap();
9333        let (cols, rows) = unwrap_rows(r);
9334        assert_eq!(cols.len(), 2);
9335        assert_eq!(cols[0].name, "name");
9336        assert_eq!(cols[1].name, "score");
9337        assert_eq!(rows.len(), 3);
9338        assert_eq!(
9339            rows[0].values,
9340            vec![Value::Text("alice".into()), Value::Int(90)]
9341        );
9342    }
9343
9344    #[test]
9345    fn projection_with_column_alias() {
9346        let mut e = Engine::new();
9347        make_three_row_users(&mut e);
9348        let r = e
9349            .execute("SELECT name AS who FROM users WHERE id = 1")
9350            .unwrap();
9351        let (cols, rows) = unwrap_rows(r);
9352        assert_eq!(cols[0].name, "who");
9353        assert_eq!(rows.len(), 1);
9354        assert_eq!(rows[0].values[0], Value::Text("alice".into()));
9355    }
9356
9357    #[test]
9358    fn qualified_column_with_table_alias_resolves() {
9359        let mut e = Engine::new();
9360        make_three_row_users(&mut e);
9361        let r = e
9362            .execute("SELECT u.id, u.name FROM users AS u WHERE u.id < 3")
9363            .unwrap();
9364        let (cols, rows) = unwrap_rows(r);
9365        assert_eq!(cols.len(), 2);
9366        assert_eq!(rows.len(), 2);
9367    }
9368
9369    #[test]
9370    fn qualified_column_with_wrong_alias_errors() {
9371        let mut e = Engine::new();
9372        make_three_row_users(&mut e);
9373        let err = e.execute("SELECT x.id FROM users AS u").unwrap_err();
9374        assert!(matches!(
9375            err,
9376            EngineError::Eval(EvalError::UnknownQualifier { ref qualifier }) if qualifier == "x"
9377        ));
9378    }
9379
9380    #[test]
9381    fn select_unknown_column_errors_in_projection() {
9382        let mut e = Engine::new();
9383        make_three_row_users(&mut e);
9384        let err = e.execute("SELECT ghost FROM users").unwrap_err();
9385        assert!(matches!(
9386            err,
9387            EngineError::Eval(EvalError::ColumnNotFound { ref name }) if name == "ghost"
9388        ));
9389    }
9390
9391    #[test]
9392    fn where_unknown_column_errors() {
9393        let mut e = Engine::new();
9394        make_three_row_users(&mut e);
9395        let err = e
9396            .execute("SELECT * FROM users WHERE ghost = 1")
9397            .unwrap_err();
9398        assert!(matches!(
9399            err,
9400            EngineError::Eval(EvalError::ColumnNotFound { .. })
9401        ));
9402    }
9403
9404    #[test]
9405    fn expression_projection_evaluates_and_renders() {
9406        // Compound expressions in the SELECT list are evaluated per row;
9407        // the output column is typed TEXT, name defaults to the expression.
9408        let mut e = Engine::new();
9409        e.execute("CREATE TABLE t (a INT NOT NULL)").unwrap();
9410        e.execute("INSERT INTO t VALUES (3)").unwrap();
9411        let (_, rows) = unwrap_rows(e.execute("SELECT 1 + 2 FROM t").unwrap());
9412        assert_eq!(rows.len(), 1);
9413        // The expression evaluates to integer 3; rendered as the cell value
9414        // (storage::Value::Int(3) since arithmetic kept ints).
9415        assert_eq!(rows[0].values[0], Value::Int(3));
9416    }
9417
9418    #[test]
9419    fn select_unknown_table_errors() {
9420        let mut e = Engine::new();
9421        let err = e.execute("SELECT * FROM ghost").unwrap_err();
9422        assert!(matches!(
9423            err,
9424            EngineError::Storage(StorageError::TableNotFound { .. })
9425        ));
9426    }
9427
9428    #[test]
9429    fn invalid_sql_returns_parse_error() {
9430        // v4.4: UPDATE is now real SQL, so use a true syntactic
9431        // garbage payload for the parse-error path.
9432        let mut e = Engine::new();
9433        let err = e.execute("THIS_IS_NOT_A_KEYWORD foo bar baz").unwrap_err();
9434        assert!(matches!(err, EngineError::Parse(_)));
9435    }
9436
9437    // --- v0.8 CREATE INDEX + index seek ------------------------------------
9438
9439    #[test]
9440    fn create_index_registers_on_table() {
9441        let mut e = Engine::new();
9442        make_three_row_users(&mut e);
9443        e.execute("CREATE INDEX by_name ON users (name)").unwrap();
9444        let t = e.catalog().get("users").unwrap();
9445        assert_eq!(t.indices().len(), 1);
9446        assert_eq!(t.indices()[0].name, "by_name");
9447    }
9448
9449    #[test]
9450    fn create_index_on_unknown_table_errors() {
9451        let mut e = Engine::new();
9452        let err = e.execute("CREATE INDEX i ON ghost (a)").unwrap_err();
9453        assert!(matches!(
9454            err,
9455            EngineError::Storage(StorageError::TableNotFound { .. })
9456        ));
9457    }
9458
9459    #[test]
9460    fn create_index_on_unknown_column_errors() {
9461        let mut e = Engine::new();
9462        make_three_row_users(&mut e);
9463        let err = e.execute("CREATE INDEX i ON users (ghost)").unwrap_err();
9464        assert!(matches!(
9465            err,
9466            EngineError::Storage(StorageError::ColumnNotFound { .. })
9467        ));
9468    }
9469
9470    #[test]
9471    fn select_eq_uses_index_returns_same_rows_as_scan() {
9472        // Build two engines: one with an index, one without. Same query →
9473        // same row set (index is a planner optimisation, not a semantic
9474        // change).
9475        let mut without = Engine::new();
9476        make_three_row_users(&mut without);
9477        let mut with = Engine::new();
9478        make_three_row_users(&mut with);
9479        with.execute("CREATE INDEX by_id ON users (id)").unwrap();
9480
9481        let q = "SELECT * FROM users WHERE id = 2";
9482        let (_, no_idx_rows) = unwrap_rows(without.execute(q).unwrap());
9483        let (_, idx_rows) = unwrap_rows(with.execute(q).unwrap());
9484        assert_eq!(no_idx_rows, idx_rows);
9485        assert_eq!(idx_rows.len(), 1);
9486    }
9487
9488    #[test]
9489    fn select_eq_with_no_matching_index_value_returns_empty() {
9490        let mut e = Engine::new();
9491        make_three_row_users(&mut e);
9492        e.execute("CREATE INDEX by_id ON users (id)").unwrap();
9493        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM users WHERE id = 999").unwrap());
9494        assert_eq!(rows.len(), 0);
9495    }
9496
9497    // --- v0.9 transactions -------------------------------------------------
9498
9499    #[test]
9500    fn begin_sets_in_transaction_flag() {
9501        let mut e = Engine::new();
9502        assert!(!e.in_transaction());
9503        e.execute("BEGIN").unwrap();
9504        assert!(e.in_transaction());
9505    }
9506
9507    #[test]
9508    fn double_begin_errors() {
9509        let mut e = Engine::new();
9510        e.execute("BEGIN").unwrap();
9511        let err = e.execute("BEGIN").unwrap_err();
9512        assert_eq!(err, EngineError::TransactionAlreadyOpen);
9513    }
9514
9515    #[test]
9516    fn commit_without_begin_errors() {
9517        let mut e = Engine::new();
9518        let err = e.execute("COMMIT").unwrap_err();
9519        assert_eq!(err, EngineError::NoActiveTransaction);
9520    }
9521
9522    #[test]
9523    fn rollback_without_begin_errors() {
9524        let mut e = Engine::new();
9525        let err = e.execute("ROLLBACK").unwrap_err();
9526        assert_eq!(err, EngineError::NoActiveTransaction);
9527    }
9528
9529    #[test]
9530    fn commit_applies_shadow_to_committed_catalog() {
9531        let mut e = Engine::new();
9532        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9533        e.execute("BEGIN").unwrap();
9534        e.execute("INSERT INTO t VALUES (1)").unwrap();
9535        e.execute("INSERT INTO t VALUES (2)").unwrap();
9536        e.execute("COMMIT").unwrap();
9537        assert!(!e.in_transaction());
9538        assert_eq!(e.catalog().get("t").unwrap().row_count(), 2);
9539    }
9540
9541    #[test]
9542    fn rollback_discards_shadow() {
9543        let mut e = Engine::new();
9544        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9545        e.execute("BEGIN").unwrap();
9546        e.execute("INSERT INTO t VALUES (1)").unwrap();
9547        e.execute("INSERT INTO t VALUES (2)").unwrap();
9548        e.execute("ROLLBACK").unwrap();
9549        assert!(!e.in_transaction());
9550        assert_eq!(e.catalog().get("t").unwrap().row_count(), 0);
9551    }
9552
9553    #[test]
9554    fn select_during_tx_sees_uncommitted_writes_own_session() {
9555        // The shadow catalog is read by SELECTs while a TX is open — the
9556        // session can see its own pending writes.
9557        let mut e = Engine::new();
9558        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9559        e.execute("BEGIN").unwrap();
9560        e.execute("INSERT INTO t VALUES (42)").unwrap();
9561        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM t").unwrap());
9562        assert_eq!(rows.len(), 1);
9563        assert_eq!(rows[0].values[0], Value::Int(42));
9564    }
9565
9566    #[test]
9567    fn snapshot_with_no_users_is_bare_catalog_format() {
9568        let mut e = Engine::new();
9569        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9570        let bytes = e.snapshot();
9571        assert_eq!(
9572            &bytes[..8],
9573            b"SPGDB001",
9574            "must be the bare v3.x catalog magic"
9575        );
9576        let e2 = Engine::restore_envelope(&bytes).unwrap();
9577        assert!(e2.users().is_empty());
9578        assert_eq!(e2.catalog().table_count(), 1);
9579    }
9580
9581    #[test]
9582    fn snapshot_with_users_round_trips_both_via_envelope() {
9583        let mut e = Engine::new();
9584        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9585        e.create_user("alice", "pw1", Role::Admin, [9; 16]).unwrap();
9586        e.create_user("bob", "pw2", Role::ReadOnly, [5; 16])
9587            .unwrap();
9588        let bytes = e.snapshot();
9589        assert_eq!(&bytes[..8], b"SPGENV01", "must be the v4.1 envelope magic");
9590        let e2 = Engine::restore_envelope(&bytes).unwrap();
9591        assert_eq!(e2.users().len(), 2);
9592        assert_eq!(e2.verify_user("alice", "pw1"), Some(Role::Admin));
9593        assert_eq!(e2.verify_user("bob", "pw2"), Some(Role::ReadOnly));
9594        assert_eq!(e2.verify_user("alice", "wrong"), None);
9595        assert_eq!(e2.catalog().table_count(), 1);
9596    }
9597
9598    #[test]
9599    fn ddl_inside_tx_also_rolled_back() {
9600        let mut e = Engine::new();
9601        e.execute("BEGIN").unwrap();
9602        e.execute("CREATE TABLE t (v INT)").unwrap();
9603        // Visible inside the TX.
9604        e.execute("SELECT * FROM t").unwrap();
9605        e.execute("ROLLBACK").unwrap();
9606        // Gone after rollback.
9607        let err = e.execute("SELECT * FROM t").unwrap_err();
9608        assert!(matches!(
9609            err,
9610            EngineError::Storage(StorageError::TableNotFound { .. })
9611        ));
9612    }
9613
9614    // ── v6.1.2: CREATE / DROP PUBLICATION (engine-side) ──────
9615
9616    #[test]
9617    fn create_publication_lands_in_catalog() {
9618        let mut e = Engine::new();
9619        assert!(e.publications().is_empty());
9620        e.execute("CREATE PUBLICATION pub_a").unwrap();
9621        assert_eq!(e.publications().len(), 1);
9622        assert!(e.publications().contains("pub_a"));
9623    }
9624
9625    #[test]
9626    fn create_publication_duplicate_errors() {
9627        let mut e = Engine::new();
9628        e.execute("CREATE PUBLICATION pub_a").unwrap();
9629        let err = e.execute("CREATE PUBLICATION pub_a").unwrap_err();
9630        assert!(
9631            alloc::format!("{err:?}").contains("DuplicateName"),
9632            "got {err:?}"
9633        );
9634    }
9635
9636    #[test]
9637    fn drop_publication_silent_when_absent() {
9638        let mut e = Engine::new();
9639        // PG-compatible: DROP a publication that doesn't exist
9640        // succeeds (no-op) but reports zero affected.
9641        let r = e.execute("DROP PUBLICATION nope").unwrap();
9642        match r {
9643            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
9644            other => panic!("expected CommandOk, got {other:?}"),
9645        }
9646    }
9647
9648    #[test]
9649    fn drop_publication_present_reports_one_affected() {
9650        let mut e = Engine::new();
9651        e.execute("CREATE PUBLICATION pub_a").unwrap();
9652        let r = e.execute("DROP PUBLICATION pub_a").unwrap();
9653        match r {
9654            QueryResult::CommandOk {
9655                affected,
9656                modified_catalog,
9657            } => {
9658                assert_eq!(affected, 1);
9659                assert!(modified_catalog);
9660            }
9661            other => panic!("expected CommandOk, got {other:?}"),
9662        }
9663        assert!(e.publications().is_empty());
9664    }
9665
9666    #[test]
9667    fn publications_persist_across_snapshot_restore() {
9668        // The persist-across-restart ship-gate at the engine layer —
9669        // snapshot → restore_envelope round trip must preserve the
9670        // publication catalog. The spg-server e2e covers the
9671        // process-restart variant.
9672        let mut e = Engine::new();
9673        e.execute("CREATE PUBLICATION pub_a").unwrap();
9674        e.execute("CREATE PUBLICATION pub_b FOR ALL TABLES").unwrap();
9675        let snap = e.snapshot();
9676        let e2 = Engine::restore_envelope(&snap).unwrap();
9677        assert_eq!(e2.publications().len(), 2);
9678        assert!(e2.publications().contains("pub_a"));
9679        assert!(e2.publications().contains("pub_b"));
9680    }
9681
9682    #[test]
9683    fn create_publication_allowed_inside_transaction() {
9684        // v6.1.4 dropped the v6.1.2 in-TX guard — PG allows
9685        // CREATE PUBLICATION inside a TX and the auto-commit
9686        // wrap path needs the same allowance.
9687        let mut e = Engine::new();
9688        e.execute("BEGIN").unwrap();
9689        e.execute("CREATE PUBLICATION pub_a").unwrap();
9690        e.execute("COMMIT").unwrap();
9691        assert!(e.publications().contains("pub_a"));
9692    }
9693
9694    // ── v6.1.3: SHOW PUBLICATIONS + FOR-list variants ───────
9695
9696    #[test]
9697    fn create_publication_for_table_list_lands_with_scope() {
9698        let mut e = Engine::new();
9699        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
9700        e.execute("CREATE TABLE t2 (id INT NOT NULL)").unwrap();
9701        e.execute("CREATE PUBLICATION pub_a FOR TABLE t1, t2")
9702            .unwrap();
9703        let scope = e.publications().get("pub_a").cloned();
9704        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = scope else {
9705            panic!("expected ForTables scope, got {scope:?}")
9706        };
9707        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
9708    }
9709
9710    #[test]
9711    fn create_publication_all_tables_except_lands_with_scope() {
9712        let mut e = Engine::new();
9713        e.execute("CREATE PUBLICATION pub_a FOR ALL TABLES EXCEPT t3")
9714            .unwrap();
9715        let scope = e.publications().get("pub_a").cloned();
9716        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = scope else {
9717            panic!("expected AllTablesExcept scope, got {scope:?}")
9718        };
9719        assert_eq!(ts, alloc::vec!["t3".to_string()]);
9720    }
9721
9722    #[test]
9723    fn show_publications_empty_returns_zero_rows() {
9724        let e = Engine::new();
9725        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
9726        let QueryResult::Rows { rows, columns } = r else {
9727            panic!()
9728        };
9729        assert!(rows.is_empty());
9730        assert_eq!(columns.len(), 3);
9731        assert_eq!(columns[0].name, "name");
9732        assert_eq!(columns[1].name, "scope");
9733        assert_eq!(columns[2].name, "table_count");
9734    }
9735
9736    #[test]
9737    fn show_publications_returns_one_row_per_publication_ordered_by_name() {
9738        let mut e = Engine::new();
9739        e.execute("CREATE PUBLICATION z_pub").unwrap();
9740        e.execute("CREATE PUBLICATION a_pub FOR TABLE t1, t2")
9741            .unwrap();
9742        e.execute("CREATE PUBLICATION m_pub FOR ALL TABLES EXCEPT bad")
9743            .unwrap();
9744        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
9745        let QueryResult::Rows { rows, .. } = r else {
9746            panic!()
9747        };
9748        assert_eq!(rows.len(), 3);
9749        // Alphabetical order: a_pub, m_pub, z_pub.
9750        let names: Vec<&str> = rows
9751            .iter()
9752            .map(|r| {
9753                if let Value::Text(s) = &r.values[0] {
9754                    s.as_str()
9755                } else {
9756                    panic!()
9757                }
9758            })
9759            .collect();
9760        assert_eq!(names, alloc::vec!["a_pub", "m_pub", "z_pub"]);
9761        // Row 0 — a_pub scope summary + table_count = 2.
9762        match &rows[0].values[1] {
9763            Value::Text(s) => assert_eq!(s, "FOR TABLE t1, t2"),
9764            other => panic!("expected Text, got {other:?}"),
9765        }
9766        assert_eq!(rows[0].values[2], Value::Int(2));
9767        // Row 1 — m_pub.
9768        match &rows[1].values[1] {
9769            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES EXCEPT bad"),
9770            other => panic!("expected Text, got {other:?}"),
9771        }
9772        assert_eq!(rows[1].values[2], Value::Int(1));
9773        // Row 2 — z_pub (AllTables → NULL count).
9774        match &rows[2].values[1] {
9775            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES"),
9776            other => panic!("expected Text, got {other:?}"),
9777        }
9778        assert_eq!(rows[2].values[2], Value::Null);
9779    }
9780
9781    #[test]
9782    fn for_list_scopes_persist_across_snapshot() {
9783        // The v6.1.2 envelope-v3 round-trip exercised AllTables;
9784        // v6.1.3 needs the scope-1 / scope-2 tags to survive too.
9785        let mut e = Engine::new();
9786        e.execute("CREATE PUBLICATION p1 FOR TABLE t1, t2").unwrap();
9787        e.execute("CREATE PUBLICATION p2 FOR ALL TABLES EXCEPT bad, worse")
9788            .unwrap();
9789        let snap = e.snapshot();
9790        let e2 = Engine::restore_envelope(&snap).unwrap();
9791        assert_eq!(e2.publications().len(), 2);
9792        let p1 = e2.publications().get("p1").cloned();
9793        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = p1 else {
9794            panic!("p1 scope lost: {p1:?}")
9795        };
9796        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
9797        let p2 = e2.publications().get("p2").cloned();
9798        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = p2 else {
9799            panic!("p2 scope lost: {p2:?}")
9800        };
9801        assert_eq!(ts, alloc::vec!["bad".to_string(), "worse".to_string()]);
9802    }
9803
9804    // ── v6.1.4: CREATE / DROP SUBSCRIPTION + SHOW + envelope v4 ─
9805
9806    #[test]
9807    fn create_subscription_lands_in_catalog_with_defaults() {
9808        let mut e = Engine::new();
9809        e.execute(
9810            "CREATE SUBSCRIPTION sub_a CONNECTION 'host=127.0.0.1 port=20002' PUBLICATION pub_a",
9811        )
9812        .unwrap();
9813        let s = e.subscriptions().get("sub_a").cloned().expect("present");
9814        assert_eq!(s.conn_str, "host=127.0.0.1 port=20002");
9815        assert_eq!(s.publications, alloc::vec!["pub_a".to_string()]);
9816        assert!(s.enabled);
9817        assert_eq!(s.last_received_pos, 0);
9818    }
9819
9820    #[test]
9821    fn create_subscription_duplicate_name_errors() {
9822        let mut e = Engine::new();
9823        e.execute("CREATE SUBSCRIPTION s CONNECTION 'host=x' PUBLICATION p")
9824            .unwrap();
9825        let err = e
9826            .execute("CREATE SUBSCRIPTION s CONNECTION 'host=y' PUBLICATION p")
9827            .unwrap_err();
9828        assert!(
9829            alloc::format!("{err:?}").contains("DuplicateName"),
9830            "got {err:?}"
9831        );
9832    }
9833
9834    #[test]
9835    fn drop_subscription_silent_when_absent() {
9836        let mut e = Engine::new();
9837        let r = e.execute("DROP SUBSCRIPTION never").unwrap();
9838        match r {
9839            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
9840            other => panic!("expected CommandOk, got {other:?}"),
9841        }
9842    }
9843
9844    #[test]
9845    fn subscription_advance_updates_last_pos_monotone() {
9846        let mut e = Engine::new();
9847        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
9848            .unwrap();
9849        assert!(e.subscription_advance("s", 100));
9850        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
9851        assert!(e.subscription_advance("s", 50)); // stale → ignored
9852        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
9853        assert!(e.subscription_advance("s", 200));
9854        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 200);
9855        assert!(!e.subscription_advance("missing", 1));
9856    }
9857
9858    #[test]
9859    fn show_subscriptions_returns_rows_ordered_by_name() {
9860        let mut e = Engine::new();
9861        e.execute("CREATE SUBSCRIPTION z_sub CONNECTION 'h=x' PUBLICATION p1, p2")
9862            .unwrap();
9863        e.execute("CREATE SUBSCRIPTION a_sub CONNECTION 'h=y' PUBLICATION p3")
9864            .unwrap();
9865        let r = e.execute_readonly("SHOW SUBSCRIPTIONS").unwrap();
9866        let QueryResult::Rows { rows, columns } = r else {
9867            panic!()
9868        };
9869        assert_eq!(rows.len(), 2);
9870        assert_eq!(columns.len(), 5);
9871        assert_eq!(columns[0].name, "name");
9872        assert_eq!(columns[4].name, "last_received_pos");
9873        // Alphabetical: a_sub, z_sub.
9874        let names: Vec<&str> = rows
9875            .iter()
9876            .map(|r| {
9877                if let Value::Text(s) = &r.values[0] {
9878                    s.as_str()
9879                } else {
9880                    panic!()
9881                }
9882            })
9883            .collect();
9884        assert_eq!(names, alloc::vec!["a_sub", "z_sub"]);
9885        // Row 0: a_sub
9886        assert_eq!(rows[0].values[1], Value::Text("h=y".to_string()));
9887        assert_eq!(rows[0].values[2], Value::Text("p3".to_string()));
9888        assert_eq!(rows[0].values[3], Value::Bool(true));
9889        assert_eq!(rows[0].values[4], Value::BigInt(0));
9890        // Row 1: z_sub — publications join with ", "
9891        assert_eq!(rows[1].values[2], Value::Text("p1, p2".to_string()));
9892    }
9893
9894    #[test]
9895    fn subscriptions_persist_across_snapshot_envelope_v4() {
9896        let mut e = Engine::new();
9897        e.execute("CREATE SUBSCRIPTION s1 CONNECTION 'h=A' PUBLICATION p1, p2")
9898            .unwrap();
9899        e.execute("CREATE SUBSCRIPTION s2 CONNECTION 'h=B' PUBLICATION p3")
9900            .unwrap();
9901        e.subscription_advance("s2", 42);
9902        let snap = e.snapshot();
9903        let e2 = Engine::restore_envelope(&snap).unwrap();
9904        assert_eq!(e2.subscriptions().len(), 2);
9905        let s1 = e2.subscriptions().get("s1").unwrap();
9906        assert_eq!(s1.conn_str, "h=A");
9907        assert_eq!(s1.publications, alloc::vec!["p1".to_string(), "p2".to_string()]);
9908        assert_eq!(s1.last_received_pos, 0);
9909        let s2 = e2.subscriptions().get("s2").unwrap();
9910        assert_eq!(s2.last_received_pos, 42);
9911    }
9912
9913    #[test]
9914    fn v3_envelope_loads_with_empty_subscriptions() {
9915        // v3 snapshot (publications-only). Forge it by hand so we
9916        // verify v6.1.4 readers don't panic — they must surface
9917        // empty subscriptions and a populated publication table.
9918        let mut e = Engine::new();
9919        e.execute("CREATE PUBLICATION pub_legacy").unwrap();
9920        let catalog = e.catalog.serialize();
9921        let users = crate::users::serialize_users(&e.users);
9922        let pubs = e.publications.serialize();
9923        let mut buf = Vec::new();
9924        buf.extend_from_slice(b"SPGENV01");
9925        buf.push(3u8); // v3
9926        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
9927        buf.extend_from_slice(&catalog);
9928        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
9929        buf.extend_from_slice(&users);
9930        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
9931        buf.extend_from_slice(&pubs);
9932        let crc = spg_crypto::crc32::crc32(&buf);
9933        buf.extend_from_slice(&crc.to_le_bytes());
9934
9935        let e2 = Engine::restore_envelope(&buf).expect("v3 envelope restores under v4 reader");
9936        assert!(e2.subscriptions().is_empty());
9937        assert!(e2.publications().contains("pub_legacy"));
9938    }
9939
9940    #[test]
9941    fn create_subscription_allowed_inside_transaction() {
9942        let mut e = Engine::new();
9943        e.execute("BEGIN").unwrap();
9944        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
9945            .unwrap();
9946        e.execute("COMMIT").unwrap();
9947        assert!(e.subscriptions().contains("s"));
9948    }
9949
9950    #[test]
9951    // ── v6.2.0: ANALYZE + spg_statistic + envelope v5 ──────────
9952
9953    #[test]
9954    fn analyze_populates_histogram_bounds() {
9955        let mut e = Engine::new();
9956        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT)").unwrap();
9957        for i in 0..50 {
9958            e.execute(&alloc::format!(
9959                "INSERT INTO t VALUES ({i}, 'name{i}')"
9960            ))
9961            .unwrap();
9962        }
9963        e.execute("ANALYZE t").unwrap();
9964        let stats = e.statistics();
9965        let id_stats = stats.get("t", "id").unwrap();
9966        assert!(id_stats.histogram_bounds.len() >= 2);
9967        assert_eq!(id_stats.histogram_bounds.first().unwrap(), "0");
9968        assert_eq!(id_stats.histogram_bounds.last().unwrap(), "49");
9969        assert!((id_stats.null_frac - 0.0).abs() < 1e-6);
9970        assert_eq!(id_stats.n_distinct, 50);
9971    }
9972
9973    #[test]
9974    fn reanalyze_overwrites_prior_stats() {
9975        let mut e = Engine::new();
9976        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9977        for i in 0..10 {
9978            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9979        }
9980        e.execute("ANALYZE t").unwrap();
9981        let n1 = e.statistics().get("t", "id").unwrap().n_distinct;
9982        assert_eq!(n1, 10);
9983        for i in 10..30 {
9984            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
9985        }
9986        e.execute("ANALYZE t").unwrap();
9987        let n2 = e.statistics().get("t", "id").unwrap().n_distinct;
9988        assert_eq!(n2, 30);
9989    }
9990
9991    #[test]
9992    fn analyze_unknown_table_errors() {
9993        let mut e = Engine::new();
9994        let err = e.execute("ANALYZE nonexistent").unwrap_err();
9995        assert!(matches!(err, EngineError::Storage(StorageError::TableNotFound { .. })));
9996    }
9997
9998    #[test]
9999    fn bare_analyze_covers_all_user_tables() {
10000        let mut e = Engine::new();
10001        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
10002        e.execute("CREATE TABLE t2 (name TEXT NOT NULL)").unwrap();
10003        e.execute("INSERT INTO t1 VALUES (1)").unwrap();
10004        e.execute("INSERT INTO t2 VALUES ('alice')").unwrap();
10005        let r = e.execute("ANALYZE").unwrap();
10006        match r {
10007            QueryResult::CommandOk { affected, modified_catalog } => {
10008                assert_eq!(affected, 2);
10009                assert!(modified_catalog);
10010            }
10011            other => panic!("expected CommandOk, got {other:?}"),
10012        }
10013        assert!(e.statistics().get("t1", "id").is_some());
10014        assert!(e.statistics().get("t2", "name").is_some());
10015    }
10016
10017    #[test]
10018    fn select_from_spg_statistic_returns_rows_per_column() {
10019        let mut e = Engine::new();
10020        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
10021            .unwrap();
10022        e.execute("INSERT INTO t VALUES (1, 'a')").unwrap();
10023        e.execute("INSERT INTO t VALUES (2, 'b')").unwrap();
10024        e.execute("ANALYZE t").unwrap();
10025        let r = e.execute_readonly("SELECT * FROM spg_statistic").unwrap();
10026        let QueryResult::Rows { rows, columns } = r else {
10027            panic!()
10028        };
10029        // v6.7.0 — spg_statistic gained a `cold_row_count` column.
10030        assert_eq!(columns.len(), 6);
10031        assert_eq!(columns[0].name, "table_name");
10032        assert_eq!(columns[4].name, "histogram_bounds");
10033        assert_eq!(columns[5].name, "cold_row_count");
10034        assert_eq!(rows.len(), 2, "one row per column of t");
10035        // Sorted by (table_name, column_name).
10036        match (&rows[0].values[0], &rows[0].values[1]) {
10037            (Value::Text(t), Value::Text(c)) => {
10038                assert_eq!(t, "t");
10039                // BTreeMap orders (table, column); columns "id" < "label".
10040                assert_eq!(c, "id");
10041            }
10042            _ => panic!(),
10043        }
10044    }
10045
#[test]
fn analyze_skips_vector_columns() {
    // ANALYZE must record statistics for the scalar `id` column and
    // leave the VECTOR column `v` out. Vector columns have their own
    // stats shape (HNSW graph), so they do not appear in spg_statistic.
    let mut engine = Engine::new();
    let setup = [
        "CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)",
        "INSERT INTO t VALUES (1, [1, 2, 3])",
        "ANALYZE t",
    ];
    for sql in setup {
        engine.execute(sql).unwrap();
    }
    let stats = engine.statistics();
    assert!(stats.get("t", "id").is_some());
    assert!(stats.get("t", "v").is_none());
}
10058
#[test]
fn statistics_persist_across_envelope_v5_round_trip() {
    // Statistics produced by ANALYZE must still be present after the
    // engine is snapshotted and rebuilt via `restore_envelope`.
    let mut source = Engine::new();
    source.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    (0..20).for_each(|i| {
        let insert = alloc::format!("INSERT INTO t VALUES ({i})");
        source.execute(&insert).unwrap();
    });
    source.execute("ANALYZE").unwrap();

    let envelope = source.snapshot();
    let restored = Engine::restore_envelope(&envelope).unwrap();

    // Twenty distinct ids were inserted before ANALYZE ran.
    let id_stats = restored.statistics().get("t", "id").unwrap();
    assert_eq!(id_stats.n_distinct, 20);
}
10072
10073    // ── v6.2.1 auto-analyze threshold ───────────────────────────
10074
#[test]
fn auto_analyze_threshold_fires_after_10pct_of_min_rows_on_small_table() {
    // Starting from an empty table, ten inserts yield modified = 10 and
    // row_count = 10. The threshold is 0.1 × max(10, 100) = 10, so the
    // table must be flagged only once the 10th INSERT has run.
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    for stmt in (0..9).map(|i| alloc::format!("INSERT INTO t VALUES ({i})")) {
        engine.execute(&stmt).unwrap();
    }
    let before_tenth = engine.tables_needing_analyze();
    assert!(before_tenth.is_empty(), "9 < threshold");

    engine.execute("INSERT INTO t VALUES (9)").unwrap();
    let after_tenth = engine.tables_needing_analyze();
    assert_eq!(after_tenth, alloc::vec!["t".to_string()]);
}
10090
#[test]
fn auto_analyze_threshold_uses_10pct_of_row_count_for_large_tables() {
    /// Inserts one row into `t` for every id in `ids`, unwrapping each result.
    fn insert_ids(engine: &mut Engine, ids: core::ops::Range<i32>) {
        for id in ids {
            engine
                .execute(&alloc::format!("INSERT INTO t VALUES ({id})"))
                .unwrap();
        }
    }

    // Once ANALYZE has run on 1000 rows the threshold is 0.1 × row_count.
    // Every further INSERT raises both the modification count and the
    // row count, so firing from N = 1000 needs M ≥ 0.1 × (1000 + M),
    // i.e. M ≥ 112. We add 50 rows (must not fire), then 150 more
    // (200 modifications, row_count = 1200, threshold = 120 → fires).
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    insert_ids(&mut engine, 0..1000);
    engine.execute("ANALYZE t").unwrap();
    let right_after_analyze = engine.tables_needing_analyze();
    assert!(right_after_analyze.is_empty(), "fresh ANALYZE");

    insert_ids(&mut engine, 1000..1050);
    let after_fifty = engine.tables_needing_analyze();
    assert!(after_fifty.is_empty(), "50 inserts < threshold of ~105");

    insert_ids(&mut engine, 1050..1200);
    let after_two_hundred = engine.tables_needing_analyze();
    assert_eq!(
        after_two_hundred,
        alloc::vec!["t".to_string()],
        "200 inserts > 0.1 × 1200 threshold"
    );
}
10121
#[test]
fn auto_analyze_threshold_resets_after_analyze() {
    // 200 inserts into a fresh table flag it for analysis; running a
    // bare ANALYZE afterwards must clear that flag again.
    let mut engine = Engine::new();
    engine.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
    (0..200)
        .map(|i| alloc::format!("INSERT INTO t VALUES ({i})"))
        .for_each(|stmt| {
            engine.execute(&stmt).unwrap();
        });
    let pending = engine.tables_needing_analyze();
    assert!(!pending.is_empty());

    engine.execute("ANALYZE").unwrap();
    let remaining = engine.tables_needing_analyze();
    assert!(remaining.is_empty(), "ANALYZE must reset the counter");
}
10136
#[test]
fn auto_analyze_threshold_tracks_updates_and_deletes() {
    // Modifications made by UPDATE and DELETE after an ANALYZE must
    // count towards the auto-analyze threshold.
    let mut engine = Engine::new();
    engine
        .execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
        .unwrap();
    for stmt in (0..50).map(|i| alloc::format!("INSERT INTO t VALUES ({i}, 'x')")) {
        engine.execute(&stmt).unwrap();
    }
    engine.execute("ANALYZE t").unwrap();

    // The UPDATE touches 20 rows and the DELETE removes 5, giving
    // modified = 25. Threshold = 0.1 × max(50, 100) = 10, and
    // 25 >= 10, so the table must be flagged.
    let changes = [
        "UPDATE t SET label = 'y' WHERE id < 20",
        "DELETE FROM t WHERE id >= 45",
    ];
    for sql in changes {
        engine.execute(sql).unwrap();
    }
    let flagged = engine.tables_needing_analyze();
    assert_eq!(flagged, alloc::vec!["t".to_string()]);
}
10155
#[test]
fn v4_envelope_loads_with_empty_statistics() {
    /// Appends `bytes` to `out`, preceded by its length as a
    /// little-endian `u32`.
    fn push_section(out: &mut Vec<u8>, bytes: &[u8]) {
        let len = u32::try_from(bytes.len()).unwrap();
        out.extend_from_slice(&len.to_le_bytes());
        out.extend_from_slice(bytes);
    }

    // Hand-build a v4 envelope: magic, version byte, then the catalog,
    // users, publications and subscriptions sections, with no
    // statistics section. A v6.2.0 reader must accept it and expose an
    // empty Statistics.
    let mut source = Engine::new();
    source
        .create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
        .unwrap();
    let catalog = source.catalog.serialize();
    let users = crate::users::serialize_users(&source.users);
    let pubs = source.publications.serialize();
    let subs = source.subscriptions.serialize();

    let mut envelope = Vec::new();
    envelope.extend_from_slice(b"SPGENV01");
    envelope.push(4u8);
    push_section(&mut envelope, &catalog);
    push_section(&mut envelope, &users);
    push_section(&mut envelope, &pubs);
    push_section(&mut envelope, &subs);
    // The CRC32 of everything written so far closes the envelope.
    let checksum = spg_crypto::crc32::crc32(&envelope);
    envelope.extend_from_slice(&checksum.to_le_bytes());

    let restored = Engine::restore_envelope(&envelope).expect("v4 envelope restores");
    assert!(restored.statistics().is_empty());
}
10184
#[test]
fn v1_v2_envelope_loads_with_empty_publications() {
    /// Appends `bytes` to `out`, preceded by its length as a
    /// little-endian `u32`.
    fn push_section(out: &mut Vec<u8>, bytes: &[u8]) {
        let len = u32::try_from(bytes.len()).unwrap();
        out.extend_from_slice(&len.to_le_bytes());
        out.extend_from_slice(bytes);
    }

    // Back-compat lock: a snapshot taken before v6.1.2 (envelope v2,
    // no publication trailer) must still deserialise, and the restored
    // engine must report zero publications. The v2 envelope is forged
    // by hand below.
    let mut source = Engine::new();
    // A user is created so the serialized users section is non-empty.
    source
        .create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
        .unwrap();

    // Envelope v2 layout: same shape as v3 but with no pubs trailer.
    let catalog = source.catalog.serialize();
    let users = crate::users::serialize_users(&source.users);
    let mut envelope = Vec::new();
    envelope.extend_from_slice(b"SPGENV01");
    envelope.push(2u8); // v2
    push_section(&mut envelope, &catalog);
    push_section(&mut envelope, &users);
    // The CRC32 of everything written so far closes the envelope.
    let checksum = spg_crypto::crc32::crc32(&envelope);
    envelope.extend_from_slice(&checksum.to_le_bytes());

    let restored = Engine::restore_envelope(&envelope).expect("v2 envelope restores");
    assert!(restored.publications().is_empty());
}
10224}