Skip to main content

spg_engine/
lib.rs

1//! SPG execution engine — v0.3 wires the SQL front-end to the in-memory
2//! storage layer. Implements `CREATE TABLE`, single-row `INSERT VALUES`, and
3//! `SELECT * FROM <table>` (no WHERE yet — that lands in v0.4 alongside
4//! expression evaluation against rows).
5#![no_std]
6
7extern crate alloc;
8
9pub mod aggregate;
10pub mod describe;
11pub mod eval;
12pub mod json;
13pub mod memoize;
14pub mod plan_cache;
15pub mod publications;
16pub mod query_stats;
17pub mod reorder;
18pub mod selectivity;
19pub mod statistics;
20pub mod subscriptions;
21pub mod users;
22
23pub use crate::users::{Role, ScramSecrets, UserError, UserStore};
24
25use alloc::borrow::Cow;
26use alloc::boxed::Box;
27use alloc::collections::BTreeMap;
28use alloc::string::{String, ToString};
29use alloc::vec::Vec;
30use core::fmt;
31
32use spg_sql::ast::{
33    BinOp, ColumnDef, ColumnName, ColumnTypeName, CreateIndexStatement,
34    CreatePublicationStatement, CreateSubscriptionStatement, CreateTableStatement,
35    CreateUserStatement, Expr, FrameBound, FrameKind, FromClause, IndexMethod, InsertStatement,
36    JoinKind, Literal, OrderBy, SelectItem, SelectStatement, Statement, UnOp, UnionKind,
37    VecEncoding as SqlVecEncoding, WindowFrame,
38};
39use spg_sql::parser::{self, ParseError};
40use spg_storage::{
41    Catalog, ColumnSchema, CompactReport, DataType, IndexKey, IndexKind, Row, StorageError, Table,
42    TableSchema, Value, VecEncoding,
43};
44
45use crate::eval::{EvalContext, EvalError};
46
47/// Result of executing one statement.
48#[derive(Debug, Clone, PartialEq)]
49#[non_exhaustive]
50pub enum QueryResult {
51    /// DDL or DML succeeded.
52    ///
53    /// `affected` is the row count for `INSERT` and 0 elsewhere.
54    /// `modified_catalog` tells the server whether this statement
55    /// caused the *committed* catalog to change — it's the signal to
56    /// snapshot/audit. False for `BEGIN`/`ROLLBACK`, false for writeful
57    /// statements executed inside a transaction (those only touch the
58    /// shadow), and true for `COMMIT` and for writes outside a TX.
59    CommandOk {
60        affected: usize,
61        modified_catalog: bool,
62    },
63    /// `SELECT` returned a (possibly empty) row set.
64    Rows {
65        columns: Vec<ColumnSchema>,
66        rows: Vec<Row>,
67    },
68}
69
70/// All errors the engine can return.
71///
72/// Marked `#[non_exhaustive]` from v7.5.0 onward: external `match`
73/// must include a `_` arm so new variants in subsequent v7.x releases
74/// are not breaking changes.
75#[derive(Debug, Clone, PartialEq)]
76#[non_exhaustive]
77pub enum EngineError {
78    Parse(ParseError),
79    Storage(StorageError),
80    Eval(EvalError),
81    /// Front-end accepted a construct that the v0.x executor doesn't support.
82    Unsupported(String),
83    /// `BEGIN` while another transaction is already open.
84    TransactionAlreadyOpen,
85    /// `COMMIT` / `ROLLBACK` with no active transaction.
86    NoActiveTransaction,
87    /// v4.0 sentinel: `execute_readonly` got a statement that
88    /// mutates engine state (INSERT / CREATE / BEGIN / COMMIT / …).
89    /// The caller should retake the write lock and dispatch through
90    /// `execute(&mut self)` instead.
91    WriteRequired,
92    /// v4.2: a SELECT would have returned more rows than the
93    /// configured `max_query_rows` cap. Carries the cap.
94    RowLimitExceeded(usize),
95    /// v4.5: cooperative cancellation — the host (server's
96    /// per-query watchdog) set the cancel flag while a long-running
97    /// SELECT / UPDATE / DELETE was scanning rows. The partial work
98    /// is discarded; the caller should surface this as a timeout
99    /// to the client.
100    Cancelled,
101}
102
103impl fmt::Display for EngineError {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        match self {
106            Self::Parse(e) => write!(f, "parse: {e}"),
107            Self::Storage(e) => write!(f, "storage: {e}"),
108            Self::Eval(e) => write!(f, "eval: {e}"),
109            Self::Unsupported(s) => write!(f, "unsupported: {s}"),
110            Self::TransactionAlreadyOpen => f.write_str("a transaction is already open"),
111            Self::NoActiveTransaction => f.write_str("no active transaction"),
112            Self::WriteRequired => {
113                f.write_str("statement requires a write lock (use execute, not execute_readonly)")
114            }
115            Self::RowLimitExceeded(n) => {
116                write!(f, "query exceeded max_query_rows={n}")
117            }
118            Self::Cancelled => f.write_str("query cancelled (timeout or client request)"),
119        }
120    }
121}
122
123impl From<ParseError> for EngineError {
124    fn from(e: ParseError) -> Self {
125        Self::Parse(e)
126    }
127}
128impl From<StorageError> for EngineError {
129    fn from(e: StorageError) -> Self {
130        Self::Storage(e)
131    }
132}
133impl From<EvalError> for EngineError {
134    fn from(e: EvalError) -> Self {
135        Self::Eval(e)
136    }
137}
138
// NOTE: the next paragraph describes `Engine` (declared further down in this
// file), not `ClockFn`. It is kept as a plain comment so that rustdoc does
// not attach it to the type alias below.
//
// The execution engine. Holds the catalog and (later) other server-scope
// state. `Engine::new()` is intentionally cheap so callers can construct one
// per database, per test.

/// Function pointer that returns "now" as microseconds since the Unix epoch.
///
/// The engine is `no_std`, so it can't reach for `std::time` itself —
/// callers (`spg-server`, the sqllogictest runner) inject a concrete
/// implementation. When no clock is installed (`None`), `NOW()` /
/// `CURRENT_*` raise `Unsupported`.
pub type ClockFn = fn() -> i64;
148
/// Function pointer yielding 16 cryptographically random bytes.
///
/// As with `ClockFn`, the `no_std` engine cannot read `/dev/urandom` on its
/// own, so the host (`spg-server`) injects an OS-backed source. When none is
/// installed (`None`), SQL-driven `CREATE USER` falls back to a deterministic
/// salt derived from the username. That is acceptable in tests; the server
/// always installs a real RNG, so production paths never see the fallback.
pub type SaltFn = fn() -> [u8; 16];
156
157/// v4.5 cooperative cancellation token. A long-running SELECT /
158/// UPDATE / DELETE checks `is_cancelled` at row-loop checkpoints
159/// and bails with `EngineError::Cancelled`. The host
160/// (`spg-server`) creates an `AtomicBool` per query, spawns a
161/// watchdog thread that sets it after `SPG_QUERY_TIMEOUT_MS`,
162/// and passes it via `execute_with_cancel` / `execute_readonly_with_cancel`.
163///
164/// `CancelToken::none()` is a no-op — used by the legacy `execute`
165/// and `execute_readonly` entry points so existing callers don't
166/// change.
167#[derive(Debug, Clone, Copy)]
168pub struct CancelToken<'a> {
169    flag: Option<&'a core::sync::atomic::AtomicBool>,
170}
171
172impl<'a> CancelToken<'a> {
173    #[must_use]
174    pub const fn none() -> Self {
175        Self { flag: None }
176    }
177
178    #[must_use]
179    pub const fn from_flag(f: &'a core::sync::atomic::AtomicBool) -> Self {
180        Self { flag: Some(f) }
181    }
182
183    #[must_use]
184    pub fn is_cancelled(self) -> bool {
185        self.flag
186            .is_some_and(|f| f.load(core::sync::atomic::Ordering::Relaxed))
187    }
188
189    /// Returns `Err(Cancelled)` if the token has been tripped.
190    /// Used at row-loop checkpoints to bail cooperatively without
191    /// scattering raw `is_cancelled` checks across the executor.
192    #[inline]
193    pub fn check(self) -> Result<(), EngineError> {
194        if self.is_cancelled() {
195            Err(EngineError::Cancelled)
196        } else {
197            Ok(())
198        }
199    }
200}
201
202// ---- snapshot envelope (v4.1, extended with CRC32 in v4.37,  ----
203// ----   publications in v6.1.2 v3, subscriptions in v6.1.4 v4) ----
204//
205// Wraps a catalog blob + a user blob behind a small header so the
206// server can persist both atomically without inventing a new file.
207// Bare catalog blobs (v3.x) still load via `restore_envelope` since
208// the magic check fails fast and the function falls back to
209// `Catalog::deserialize`.
210//
211// Layout — v1 (v4.1, no CRC):
212//   [8 bytes magic "SPGENV01"]
213//   [u8 version = 1]
214//   [u32 catalog_len][catalog bytes]
215//   [u32 users_len][users bytes]
216//
217// Layout — v2 (v4.37, CRC32 of body):
218//   [8 bytes magic "SPGENV01"]
219//   [u8 version = 2]
220//   [u32 catalog_len][catalog bytes]
221//   [u32 users_len][users bytes]
222//   [u32 crc32]                      ← CRC32 of every byte before it.
223//
224// Layout — v3 (v6.1.2, publications trailer):
225//   [8 bytes magic "SPGENV01"]
226//   [u8 version = 3]
227//   [u32 catalog_len][catalog bytes]
228//   [u32 users_len][users bytes]
229//   [u32 pubs_len][publications bytes]
230//   [u32 crc32]
231//
232// Layout — v4 (v6.1.4, subscriptions trailer):
233//   [8 bytes magic "SPGENV01"]
234//   [u8 version = 4]
235//   [u32 catalog_len][catalog bytes]
236//   [u32 users_len][users bytes]
237//   [u32 pubs_len][publications bytes]
238//   [u32 subs_len][subscriptions bytes]
239//   [u32 crc32]
240//
241// Layout — v5 (v6.2.0, statistics trailer):
242//   [8 bytes magic "SPGENV01"]
243//   [u8 version = 5]
244//   [u32 catalog_len][catalog bytes]
245//   [u32 users_len][users bytes]
246//   [u32 pubs_len][publications bytes]
247//   [u32 subs_len][subscriptions bytes]
248//   [u32 stats_len][statistics bytes]      ← NEW
249//   [u32 crc32]
250//
251// Writers emit v5 from v6.2.0 on. Readers accept all of {v1, v2,
252// v3, v4, v5}: v1/v2 load with empty publications / subscriptions /
253// statistics; v3 loads with empty subscriptions + statistics; v4
254// loads with empty statistics; v5 deserialises all three. Older
255// SPG versions reading a v5 envelope fall through the version
256// match to `EnvelopeParse::Bare` — pre-v6.2.0 binaries cannot
257// open v6.2.0+ snapshots (matches the v6.1.2 / v6.1.4 breaks).
258
/// Eight-byte magic that opens every snapshot envelope.
const ENVELOPE_MAGIC: &[u8; 8] = b"SPGENV01";
/// Envelope v1: catalog + users, no CRC.
const ENVELOPE_VERSION_V1: u8 = 1;
/// Envelope v2: v1 plus a trailing CRC32.
const ENVELOPE_VERSION_V2: u8 = 2;
/// Envelope v3: v2 plus a publications section.
const ENVELOPE_VERSION_V3: u8 = 3;
/// Envelope v4: v3 plus a subscriptions section.
const ENVELOPE_VERSION_V4: u8 = 4;
/// Envelope v5: v4 plus a statistics section.
const ENVELOPE_VERSION_V5: u8 = 5;
265
266fn build_envelope(
267    catalog: &[u8],
268    users: &[u8],
269    pubs: &[u8],
270    subs: &[u8],
271    stats: &[u8],
272) -> Vec<u8> {
273    let mut out = Vec::with_capacity(
274        8 + 1
275            + 4
276            + catalog.len()
277            + 4
278            + users.len()
279            + 4
280            + pubs.len()
281            + 4
282            + subs.len()
283            + 4
284            + stats.len()
285            + 4,
286    );
287    out.extend_from_slice(ENVELOPE_MAGIC);
288    out.push(ENVELOPE_VERSION_V5);
289    out.extend_from_slice(
290        &u32::try_from(catalog.len())
291            .expect("≤ 4G catalog")
292            .to_le_bytes(),
293    );
294    out.extend_from_slice(catalog);
295    out.extend_from_slice(
296        &u32::try_from(users.len())
297            .expect("≤ 4G users")
298            .to_le_bytes(),
299    );
300    out.extend_from_slice(users);
301    out.extend_from_slice(
302        &u32::try_from(pubs.len())
303            .expect("≤ 4G publications")
304            .to_le_bytes(),
305    );
306    out.extend_from_slice(pubs);
307    out.extend_from_slice(
308        &u32::try_from(subs.len())
309            .expect("≤ 4G subscriptions")
310            .to_le_bytes(),
311    );
312    out.extend_from_slice(subs);
313    out.extend_from_slice(
314        &u32::try_from(stats.len())
315            .expect("≤ 4G statistics")
316            .to_le_bytes(),
317    );
318    out.extend_from_slice(stats);
319    let crc = spg_crypto::crc32::crc32(&out);
320    out.extend_from_slice(&crc.to_le_bytes());
321    out
322}
323
/// Result of trying to split a snapshot buffer into its sections.
enum EnvelopeParse<'a> {
    /// The buffer was not accepted as an envelope. The caller falls back to
    /// treating it as a bare catalog blob, which keeps v3.x snapshots
    /// readable.
    Bare,
    /// A well-formed envelope, borrowed section by section from the input.
    ///
    /// `catalog` and `users` are present in every envelope version. The
    /// optional sections are `None` when the envelope version predates
    /// them: `publications` appears from v3, `subscriptions` from v4 and
    /// `statistics` from v5.
    Pair {
        catalog: &'a [u8],
        users: &'a [u8],
        publications: Option<&'a [u8]>,
        subscriptions: Option<&'a [u8]>,
        statistics: Option<&'a [u8]>,
    },
    /// A CRC-carrying envelope (v2 and later) whose stored checksum
    /// (`expected`) differs from the one recomputed over the body
    /// (`computed`).
    CrcMismatch {
        expected: u32,
        computed: u32,
    },
}
344
345/// Returns `EnvelopeParse::Pair` for a valid v1 / v2 / v3 envelope,
346/// `Bare` for a buffer that doesn't look like an envelope (v3.x
347/// bare catalog fallback), and `CrcMismatch` for a v2/v3 envelope
348/// whose trailing CRC32 doesn't match the body.
349fn split_envelope(buf: &[u8]) -> EnvelopeParse<'_> {
350    if buf.len() < 8 + 1 + 4 || &buf[..8] != ENVELOPE_MAGIC {
351        return EnvelopeParse::Bare;
352    }
353    let version = buf[8];
354    if !matches!(
355        version,
356        ENVELOPE_VERSION_V1
357            | ENVELOPE_VERSION_V2
358            | ENVELOPE_VERSION_V3
359            | ENVELOPE_VERSION_V4
360            | ENVELOPE_VERSION_V5
361    ) {
362        return EnvelopeParse::Bare;
363    }
364    let mut p = 9usize;
365    let Some(cat_len_bytes) = buf.get(p..p + 4) else {
366        return EnvelopeParse::Bare;
367    };
368    let Ok(cat_len_arr) = cat_len_bytes.try_into() else {
369        return EnvelopeParse::Bare;
370    };
371    let cat_len = u32::from_le_bytes(cat_len_arr) as usize;
372    p += 4;
373    if p + cat_len + 4 > buf.len() {
374        return EnvelopeParse::Bare;
375    }
376    let catalog = &buf[p..p + cat_len];
377    p += cat_len;
378    let Some(user_len_bytes) = buf.get(p..p + 4) else {
379        return EnvelopeParse::Bare;
380    };
381    let Ok(user_len_arr) = user_len_bytes.try_into() else {
382        return EnvelopeParse::Bare;
383    };
384    let user_len = u32::from_le_bytes(user_len_arr) as usize;
385    p += 4;
386    if p + user_len > buf.len() {
387        return EnvelopeParse::Bare;
388    }
389    let users = &buf[p..p + user_len];
390    p += user_len;
391    let publications = if matches!(
392        version,
393        ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
394    ) {
395        // [u32 pubs_len][publications bytes]
396        let Some(pubs_len_bytes) = buf.get(p..p + 4) else {
397            return EnvelopeParse::Bare;
398        };
399        let Ok(pubs_len_arr) = pubs_len_bytes.try_into() else {
400            return EnvelopeParse::Bare;
401        };
402        let pubs_len = u32::from_le_bytes(pubs_len_arr) as usize;
403        p += 4;
404        if p + pubs_len > buf.len() {
405            return EnvelopeParse::Bare;
406        }
407        let pubs_slice = &buf[p..p + pubs_len];
408        p += pubs_len;
409        Some(pubs_slice)
410    } else {
411        None
412    };
413    let subscriptions = if matches!(version, ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5) {
414        // [u32 subs_len][subscriptions bytes]
415        let Some(subs_len_bytes) = buf.get(p..p + 4) else {
416            return EnvelopeParse::Bare;
417        };
418        let Ok(subs_len_arr) = subs_len_bytes.try_into() else {
419            return EnvelopeParse::Bare;
420        };
421        let subs_len = u32::from_le_bytes(subs_len_arr) as usize;
422        p += 4;
423        if p + subs_len > buf.len() {
424            return EnvelopeParse::Bare;
425        }
426        let subs_slice = &buf[p..p + subs_len];
427        p += subs_len;
428        Some(subs_slice)
429    } else {
430        None
431    };
432    let statistics = if version == ENVELOPE_VERSION_V5 {
433        // [u32 stats_len][statistics bytes]
434        let Some(stats_len_bytes) = buf.get(p..p + 4) else {
435            return EnvelopeParse::Bare;
436        };
437        let Ok(stats_len_arr) = stats_len_bytes.try_into() else {
438            return EnvelopeParse::Bare;
439        };
440        let stats_len = u32::from_le_bytes(stats_len_arr) as usize;
441        p += 4;
442        if p + stats_len > buf.len() {
443            return EnvelopeParse::Bare;
444        }
445        let stats_slice = &buf[p..p + stats_len];
446        p += stats_len;
447        Some(stats_slice)
448    } else {
449        None
450    };
451    if matches!(
452        version,
453        ENVELOPE_VERSION_V2 | ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
454    ) {
455        if p + 4 != buf.len() {
456            return EnvelopeParse::Bare;
457        }
458        let Ok(crc_arr) = buf[p..p + 4].try_into() else {
459            return EnvelopeParse::Bare;
460        };
461        let expected = u32::from_le_bytes(crc_arr);
462        let computed = spg_crypto::crc32::crc32(&buf[..p]);
463        if expected != computed {
464            return EnvelopeParse::CrcMismatch { expected, computed };
465        }
466    } else if p != buf.len() {
467        // v1: must end exactly at the users section.
468        return EnvelopeParse::Bare;
469    }
470    EnvelopeParse::Pair {
471        catalog,
472        users,
473        publications,
474        subscriptions,
475        statistics,
476    }
477}
478
/// v4.41.1 opaque transaction handle.
///
/// Handed out by `Engine::alloc_tx_id` and threaded through
/// `Engine::execute_in` so dispatch can tell which in-flight TX a statement
/// belongs to. Every legacy caller — engine self-tests, spg-cli,
/// spg-embedded, startup replay — implicitly uses the reserved
/// [`IMPLICIT_TX`] slot through the unchanged `Engine::execute(sql)` API.
///
/// v4.41.1 keeps at most one active slot at runtime (dispatch holds
/// `engine.write()` across the wrap, same as v4.34). The map shape exists so
/// v4.42 can turn on N in-flight implicit TXs without reshuffling the engine
/// internals.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TxId(pub u64);

/// Slot reserved for `Engine::execute(sql)`, the legacy single-global-shadow
/// path. Handles produced by `alloc_tx_id` start at 1.
pub const IMPLICIT_TX: TxId = TxId(0);
494
/// v6.7.3 — segment-size threshold `COMPACT COLD SEGMENTS` uses when no
/// explicit target is given (4 MiB).
///
/// A segment is eligible to merge when its `OwnedSegment::bytes().len()` is
/// **strictly** less than this value. spg-server reads
/// `SPG_COMPACTION_TARGET_SEGMENT_BYTES` to override it.
pub const COMPACTION_TARGET_DEFAULT_BYTES: u64 = 4 << 20;
501
502/// Per-slot transaction state. Held inside `tx_catalogs[tx_id]` for the
503/// lifetime of a BEGIN..COMMIT (or BEGIN..ROLLBACK) window. Drops when
504/// the TX commits (its `catalog` is moved over `Engine.catalog`) or
505/// rolls back (slot removed, catalog discarded).
506#[derive(Debug, Default, Clone)]
507struct TxState {
508    /// The TX's shadow copy of the catalog. Started as a clone of
509    /// `Engine.catalog` at BEGIN time; writes flow into it; COMMIT
510    /// installs it over `Engine.catalog`. `Catalog::clone()` is O(1)
511    /// since v4.40 (`PersistentVec` rows + `PersistentBTreeMap` indices).
512    catalog: Catalog,
513    /// Per-TX savepoint stack. Each entry pairs the savepoint name with
514    /// a clone of `catalog` at the moment `SAVEPOINT <name>` fired.
515    /// `ROLLBACK TO <name>` restores from the entry and pops everything
516    /// after it; `RELEASE <name>` discards the entry and everything
517    /// after; COMMIT/ROLLBACK clears the whole stack.
518    savepoints: Vec<(String, Catalog)>,
519}
520
521#[derive(Debug, Default)]
522pub struct Engine {
523    /// Committed catalog — what survives `Engine::snapshot()` and what
524    /// outside-TX `SELECT`s read.
525    catalog: Catalog,
526    /// Active TX slots, keyed by `TxId`. Empty when no TX is in flight.
527    /// v4.41.1 runtime invariant: at most one entry (single-writer
528    /// model unchanged). v4.42 will let dispatch hold multiple entries
529    /// concurrently for group commit + engine MVCC.
530    tx_catalogs: BTreeMap<TxId, TxState>,
531    /// Which slot the next exec_* call should mutate. Set by
532    /// `execute_in(sql, tx_id)` at the entry point; legacy `execute(sql)`
533    /// sets it to `IMPLICIT_TX`. None when no TX is in flight (read /
534    /// write goes straight against `catalog`).
535    current_tx: Option<TxId>,
536    /// Monotonic counter for `alloc_tx_id`. Starts at 1 — slot 0 is
537    /// reserved for `IMPLICIT_TX`.
538    next_tx_id: u64,
539    /// Optional wall clock used to satisfy `NOW()` / `CURRENT_TIMESTAMP`
540    /// / `CURRENT_DATE`. Set by the host environment.
541    clock: Option<ClockFn>,
542    /// v4.1 cryptographic RNG for per-user password salt. Set by the
543    /// host. `None` means SQL-driven `CREATE USER` uses a
544    /// deterministic fallback — see `SaltFn`.
545    salt_fn: Option<SaltFn>,
546    /// v4.2 per-query row cap. `None` = unlimited. When set, a
547    /// SELECT that materialises more than `n` rows returns
548    /// `EngineError::RowLimitExceeded`. Enforced before the result
549    /// is shaped into wire frames so a runaway scan can't blow the
550    /// server's heap.
551    max_query_rows: Option<usize>,
552    /// v4.1 RBAC user table. Empty means "no RBAC configured yet" —
553    /// the server decides what that means at the auth boundary
554    /// (open mode vs legacy single-password mode). User CRUD goes
555    /// through `create_user`/`drop_user`/`verify_user`; persistence
556    /// rides the snapshot envelope alongside the catalog.
557    users: UserStore,
558    /// v6.1.2 logical-replication publication catalog. Empty until
559    /// `CREATE PUBLICATION` runs. Persistence rides the v3 envelope
560    /// trailer (see `build_envelope`).
561    publications: publications::Publications,
562    /// v6.1.4 logical-replication subscription catalog. Empty until
563    /// `CREATE SUBSCRIPTION` runs. Persistence rides the v4 envelope
564    /// trailer.
565    subscriptions: subscriptions::Subscriptions,
566    /// v6.2.0 — per-column statistics for the cost-based optimizer.
567    /// Populated by `ANALYZE`; queried via `spg_statistic` virtual
568    /// table. Persistence rides the v5 envelope trailer.
569    statistics: statistics::Statistics,
570    /// v6.3.0 — engine-level plan cache. Caches the post-`prepare()`
571    /// `Statement` keyed on SQL text. In-memory only — does NOT ride
572    /// the snapshot envelope (rebuilt on demand after restart).
573    plan_cache: plan_cache::PlanCache,
574    /// v6.5.1 — per-distinct-SQL execution stats. In-memory only,
575    /// surfaced via `spg_stat_query` virtual table. Updated by the
576    /// `execute_*` paths after a successful execute.
577    query_stats: query_stats::QueryStats,
578    /// v6.5.2 — connection-state provider callback. spg-server
579    /// registers a function at startup that snapshots its
580    /// per-pgwire-connection registry into `ActivityRow`s; engine
581    /// reads through it on every `SELECT * FROM spg_stat_activity`.
582    /// `None` ⇒ no-data (returns empty rows; matches the no_std
583    /// embedded callers that don't run pgwire).
584    activity_provider: Option<ActivityProvider>,
585    /// v6.5.3 — audit-chain provider + verifier. Same pattern as
586    /// activity_provider: spg-server registers both at startup;
587    /// engine reads through on `SELECT * FROM spg_audit_chain` and
588    /// `SELECT * FROM spg_audit_verify`. `None` ⇒ no-data.
589    audit_chain_provider: Option<AuditChainProvider>,
590    audit_verifier: Option<AuditVerifier>,
591    /// v6.5.6 — slow-query log threshold in microseconds. When set,
592    /// every successful execute whose elapsed exceeds the threshold
593    /// gets fed to the registered slow-query log callback (so
594    /// spg-server can emit a structured log line). Default `None`
595    /// = no slow-query logging.
596    slow_query_threshold_us: Option<u64>,
597    slow_query_logger: Option<SlowQueryLogger>,
598}
599
/// v6.5.6 — slow-query log callback.
///
/// Invoked as `(sql, elapsed_us)`, once for each successful execute whose
/// elapsed time crosses the configured threshold.
pub type SlowQueryLogger = fn(&str, u64);
604
605/// v6.5.4 — synthesise a `CREATE TABLE` statement from catalog
606/// state. Round-trips through `Engine::execute` to recreate the
607/// same schema (sans data + indexes — indexes are emitted as a
608/// separate `CREATE INDEX` chain in `spg_database_ddl`).
609fn render_create_table(name: &str, columns: &[ColumnSchema]) -> String {
610    let mut out = alloc::format!("CREATE TABLE {name} (");
611    for (i, col) in columns.iter().enumerate() {
612        if i > 0 {
613            out.push_str(", ");
614        }
615        out.push_str(&col.name);
616        out.push(' ');
617        out.push_str(&render_data_type(col.ty));
618        if !col.nullable {
619            out.push_str(" NOT NULL");
620        }
621        if col.auto_increment {
622            out.push_str(" AUTO_INCREMENT");
623        }
624    }
625    out.push(')');
626    out
627}
628
629fn render_data_type(ty: DataType) -> String {
630    match ty {
631        DataType::SmallInt => "SMALLINT".into(),
632        DataType::Int => "INT".into(),
633        DataType::BigInt => "BIGINT".into(),
634        DataType::Float => "FLOAT".into(),
635        DataType::Text => "TEXT".into(),
636        DataType::Varchar(n) => alloc::format!("VARCHAR({n})"),
637        DataType::Char(n) => alloc::format!("CHAR({n})"),
638        DataType::Bool => "BOOL".into(),
639        DataType::Vector { dim, encoding } => match encoding {
640            spg_storage::VecEncoding::F32 => alloc::format!("VECTOR({dim})"),
641            spg_storage::VecEncoding::Sq8 => alloc::format!("VECTOR({dim}) USING SQ8"),
642            spg_storage::VecEncoding::F16 => alloc::format!("VECTOR({dim}) USING HALF"),
643        },
644        DataType::Numeric { precision, scale } => {
645            alloc::format!("NUMERIC({precision},{scale})")
646        }
647        DataType::Date => "DATE".into(),
648        DataType::Timestamp => "TIMESTAMP".into(),
649        DataType::Interval => "INTERVAL".into(),
650        DataType::Json => "JSON".into(),
651        DataType::Jsonb => "JSONB".into(),
652        DataType::Timestamptz => "TIMESTAMPTZ".into(),
653        DataType::Bytes => "BYTEA".into(),
654        DataType::TextArray => "TEXT[]".into(),
655    }
656}
657
/// v6.5.2 — a single row of the `spg_stat_activity` virtual table.
///
/// Public at the engine level so spg-server can build rows without
/// re-exporting internal dispatch types.
#[derive(Debug, Clone)]
pub struct ActivityRow {
    pub pid: u32,
    pub user: String,
    pub started_at_us: i64,
    pub current_sql: String,
    pub wait_event: String,
    pub elapsed_us: i64,
    pub in_transaction: bool,
}

/// v6.5.2 — callback that supplies `spg_stat_activity` rows.
///
/// Each call returns a fresh snapshot; the engine does not cache it.
pub type ActivityProvider = fn() -> Vec<ActivityRow>;
675
/// v6.5.3 — a single row of the `spg_audit_chain` virtual table.
///
/// Public at the engine level so spg-server can build rows directly from
/// its `AuditEntry`.
#[derive(Debug, Clone)]
pub struct AuditRow {
    pub seq: i64,
    pub ts_ms: i64,
    pub prev_hash_hex: String,
    pub entry_hash_hex: String,
    pub sql: String,
}

/// v6.5.3 — callback that snapshots the audit log as `spg_audit_chain`
/// rows. Registered by spg-server as a plain function pointer.
pub type AuditChainProvider = fn() -> Vec<AuditRow>;

/// v6.5.3 — callback that verifies the audit log. Registered by spg-server
/// as a plain function pointer.
///
/// Returns `(verified_count, broken_at_seq)`; `broken_at_seq` is `-1` when
/// the chain is clean.
pub type AuditVerifier = fn() -> (i64, i64);
693
694impl Engine {
695    pub fn new() -> Self {
696        Self {
697            catalog: Catalog::new(),
698            tx_catalogs: BTreeMap::new(),
699            current_tx: None,
700            next_tx_id: 1,
701            clock: None,
702            salt_fn: None,
703            max_query_rows: None,
704            users: UserStore::new(),
705            publications: publications::Publications::new(),
706            subscriptions: subscriptions::Subscriptions::new(),
707            statistics: statistics::Statistics::new(),
708            plan_cache: plan_cache::PlanCache::new(),
709            query_stats: query_stats::QueryStats::new(),
710            activity_provider: None,
711            audit_chain_provider: None,
712            audit_verifier: None,
713            slow_query_threshold_us: None,
714            slow_query_logger: None,
715        }
716    }
717
718    /// Construct an engine restored from a previously-snapshotted catalog
719    /// (see `snapshot()`).
720    pub fn restore(catalog: Catalog) -> Self {
721        Self {
722            catalog,
723            tx_catalogs: BTreeMap::new(),
724            current_tx: None,
725            next_tx_id: 1,
726            clock: None,
727            salt_fn: None,
728            max_query_rows: None,
729            users: UserStore::new(),
730            publications: publications::Publications::new(),
731            subscriptions: subscriptions::Subscriptions::new(),
732            statistics: statistics::Statistics::new(),
733            plan_cache: plan_cache::PlanCache::new(),
734            query_stats: query_stats::QueryStats::new(),
735            activity_provider: None,
736            audit_chain_provider: None,
737            audit_verifier: None,
738            slow_query_threshold_us: None,
739            slow_query_logger: None,
740        }
741    }
742
743    /// Restore an engine + user table from a v4.1 envelope produced
744    /// by `snapshot_with_users()`. Falls back to plain catalog-only
745    /// restore if the envelope magic isn't present (so v3.x snapshot
746    /// files still load). v6.1.2 adds the optional publications
747    /// trailer (envelope v3); a v1/v2 envelope deserialises to an
748    /// empty publication table.
749    pub fn restore_envelope(buf: &[u8]) -> Result<Self, EngineError> {
750        match split_envelope(buf) {
751            EnvelopeParse::Pair {
752                catalog: catalog_bytes,
753                users: user_bytes,
754                publications: pub_bytes,
755                subscriptions: sub_bytes,
756                statistics: stats_bytes,
757            } => {
758                let catalog = Catalog::deserialize(catalog_bytes).map_err(EngineError::Storage)?;
759                let users = users::deserialize_users(user_bytes)
760                    .map_err(|e| EngineError::Unsupported(alloc::format!("users restore: {e}")))?;
761                let publications = match pub_bytes {
762                    Some(b) => publications::Publications::deserialize(b).map_err(|e| {
763                        EngineError::Unsupported(alloc::format!("publications restore: {e:?}"))
764                    })?,
765                    None => publications::Publications::new(),
766                };
767                let subscriptions = match sub_bytes {
768                    Some(b) => subscriptions::Subscriptions::deserialize(b).map_err(|e| {
769                        EngineError::Unsupported(alloc::format!("subscriptions restore: {e:?}"))
770                    })?,
771                    None => subscriptions::Subscriptions::new(),
772                };
773                let statistics = match stats_bytes {
774                    Some(b) => statistics::Statistics::deserialize(b).map_err(|e| {
775                        EngineError::Unsupported(alloc::format!("statistics restore: {e:?}"))
776                    })?,
777                    None => statistics::Statistics::new(),
778                };
779                Ok(Self {
780                    catalog,
781                    tx_catalogs: BTreeMap::new(),
782                    current_tx: None,
783                    next_tx_id: 1,
784                    clock: None,
785                    salt_fn: None,
786                    max_query_rows: None,
787                    users,
788                    publications,
789                    subscriptions,
790                    statistics,
791                    plan_cache: plan_cache::PlanCache::new(),
792                    query_stats: query_stats::QueryStats::new(),
793                    activity_provider: None,
794                    audit_chain_provider: None,
795                    audit_verifier: None,
796                    slow_query_threshold_us: None,
797                    slow_query_logger: None,
798                })
799            }
800            EnvelopeParse::CrcMismatch { expected, computed } => {
801                Err(EngineError::Storage(StorageError::Corrupt(alloc::format!(
802                    "snapshot envelope CRC32 mismatch (expected={expected:#010x}, computed={computed:#010x})"
803                ))))
804            }
805            EnvelopeParse::Bare => {
806                let catalog = Catalog::deserialize(buf).map_err(EngineError::Storage)?;
807                Ok(Self::restore(catalog))
808            }
809        }
810    }
811
812    pub const fn users(&self) -> &UserStore {
813        &self.users
814    }
815
816    /// `salt` is supplied by the caller (the host has a random
817    /// source; the engine is `no_std`). Caller should pass a fresh
818    /// 16-byte random value per user.
819    pub fn create_user(
820        &mut self,
821        name: &str,
822        password: &str,
823        role: Role,
824        salt: [u8; 16],
825    ) -> Result<(), UserError> {
826        self.users.create(name, password, role, salt)?;
827        // v4.8: also derive SCRAM-SHA-256 secrets so PG-wire SASL
828        // auth can verify without re-running PBKDF2 per attempt.
829        // Uses a fresh salt from the host RNG (falls back to a
830        // deterministic per-username salt when no RNG is wired, same
831        // as the legacy hash path).
832        let scram_salt = self.salt_fn.map_or_else(
833            || {
834                let mut s = [0u8; users::SCRAM_SALT_LEN];
835                let digest = spg_crypto::hash(name.as_bytes());
836                // Use bytes 16..32 of BLAKE3 so we don't reuse the
837                // exact same fallback salt as the BLAKE3 hash path.
838                s.copy_from_slice(&digest[16..32]);
839                s
840            },
841            |f| f(),
842        );
843        self.users
844            .enable_scram(name, password, scram_salt, users::SCRAM_DEFAULT_ITERS)?;
845        Ok(())
846    }
847
848    pub fn drop_user(&mut self, name: &str) -> Result<(), UserError> {
849        self.users.drop(name)
850    }
851
852    pub fn verify_user(&self, name: &str, password: &str) -> Option<Role> {
853        self.users.verify(name, password)
854    }
855
856    /// Builder: attach a wall clock so `NOW()` / `CURRENT_TIMESTAMP` /
857    /// `CURRENT_DATE` evaluate to a real value instead of erroring out.
858    #[must_use]
859    pub const fn with_clock(mut self, clock: ClockFn) -> Self {
860        self.clock = Some(clock);
861        self
862    }
863
864    /// Builder: attach an OS-backed RNG for per-user password salts.
865    /// The host (`spg-server`) typically wires this to `/dev/urandom`.
866    #[must_use]
867    pub const fn with_salt_fn(mut self, f: SaltFn) -> Self {
868        self.salt_fn = Some(f);
869        self
870    }
871
872    /// Builder: cap the number of rows a single SELECT may return.
873    /// Exceeding the cap raises `EngineError::RowLimitExceeded` —
874    /// the bound is checked inside the executor so a runaway
875    /// catalog scan can't allocate millions of rows before the
876    /// server gets a chance to reject the result.
877    #[must_use]
878    pub const fn with_max_query_rows(mut self, n: usize) -> Self {
879        self.max_query_rows = Some(n);
880        self
881    }
882
883    /// The *committed* catalog. Note: during a transaction this returns the
884    /// pre-TX state — `SELECT` inside a TX goes through `execute()` and reads
885    /// the shadow. Tests that inspect outside-TX state should use this.
886    pub const fn catalog(&self) -> &Catalog {
887        &self.catalog
888    }
889
890    /// Serialize the *committed* catalog to bytes. v0.6 was full-snapshot; v0.9
891    /// adds the rule that an open TX's shadow is never snapshotted — only the
892    /// post-COMMIT state is persisted. v4.1 wraps the catalog in an envelope
893    /// when there are users to persist; an empty user table snapshots as the
894    /// bare catalog format (backwards-compat with v3.x readers). v6.1.2
895    /// adds publications to the envelope condition: either non-empty
896    /// users OR non-empty publications now triggers the envelope path.
897    pub fn snapshot(&self) -> Vec<u8> {
898        if self.users.is_empty()
899            && self.publications.is_empty()
900            && self.subscriptions.is_empty()
901            && self.statistics.is_empty()
902        {
903            self.catalog.serialize()
904        } else {
905            build_envelope(
906                &self.catalog.serialize(),
907                &users::serialize_users(&self.users),
908                &self.publications.serialize(),
909                &self.subscriptions.serialize(),
910                &self.statistics.serialize(),
911            )
912        }
913    }
914
915    /// True when at least one TX slot is in flight. v4.41.1 runtime
916    /// invariant: at most one slot active at a time (dispatch holds
917    /// `engine.write()` across the entire wrap). v4.42 will let this
918    /// return true with multiple slots concurrently.
919    pub fn in_transaction(&self) -> bool {
920        !self.tx_catalogs.is_empty()
921    }
922
923    /// v4.41.1 allocate a fresh TX handle. Used by spg-server dispatch
924    /// to scope each implicit-wrap BEGIN..stmt..COMMIT to its own slot
925    /// in `tx_catalogs`. v4.42 — the commit-barrier leader allocates
926    /// one of these per task in its group, runs `BEGIN`+sql+`COMMIT`
927    /// sequentially under a single `engine.write()` so each task's
928    /// mutations accumulate into shared state, then either keeps the
929    /// accumulated state (fsync OK) or restores the pre-image via
930    /// `replace_catalog` (fsync err).
931    pub fn alloc_tx_id(&mut self) -> TxId {
932        let id = TxId(self.next_tx_id);
933        self.next_tx_id = self.next_tx_id.saturating_add(1);
934        id
935    }
936
937    /// v4.42 — atomically replace the live catalog. Used by the
938    /// commit-barrier leader to roll back a group whose batched
939    /// fsync failed: the leader snapshots `engine.catalog().clone()`
940    /// (O(1) Arc bump after the v4.39/v4.40 persistent migration)
941    /// at group start, sequentially applies each task's BEGIN+sql+
942    /// COMMIT under the same write lock to accumulate mutations
943    /// into shared state, batches the WAL bytes, fsyncs once, and
944    /// on failure calls this with the pre-image to undo every
945    /// task in the group at once.
946    ///
947    /// **Does NOT touch `tx_catalogs` / `current_tx`.** Any
948    /// explicit-TX slot from a concurrent client (created via the
949    /// legacy `IMPLICIT_TX`-less dispatch path or via the future
950    /// MVCC-readers v5+ work) has its own snapshot baked into the
951    /// slot — restoring `self.catalog` to the pre-image leaves
952    /// those slots untouched, exactly as they were when the leader
953    /// took the lock. The leader's own implicit-TX slots are all
954    /// already discarded (`exec_commit` removed them as each
955    /// task's COMMIT ran) by the time this is reached.
956    pub fn replace_catalog(&mut self, catalog: Catalog) {
957        self.catalog = catalog;
958    }
959
960    /// v6.7.0 — public shim around `Catalog::freeze_oldest_to_cold`
961    /// so tests + the spg-server freezer can drive a freeze without
962    /// reaching into the private `active_catalog_mut`. v6.7.4
963    /// parallel freezer will build on this surface.
964    ///
965    /// Marks the table's cached `cold_row_count` stale because the
966    /// freeze added cold locators that ANALYZE hasn't yet refreshed.
967    pub fn freeze_oldest_to_cold(
968        &mut self,
969        table_name: &str,
970        index_name: &str,
971        max_rows: usize,
972    ) -> Result<spg_storage::FreezeReport, EngineError> {
973        let report = self
974            .active_catalog_mut()
975            .freeze_oldest_to_cold(table_name, index_name, max_rows)
976            .map_err(EngineError::Storage)?;
977        if let Some(t) = self.active_catalog_mut().get_mut(table_name) {
978            t.mark_cold_row_count_stale();
979        }
980        Ok(report)
981    }
982
983    /// v6.7.5 — public shim used by the spg-server follower's
984    /// segment-forwarding receiver. Registers a cold-tier segment
985    /// at a specific id (the master's id, as transmitted on the
986    /// wire) so the follower's BTree-Cold locators stay byte-
987    /// identical with the master's. Wraps
988    /// `Catalog::load_segment_bytes_at` under the standard
989    /// clone-mutate-replace pattern.
990    ///
991    /// Returns `Ok(())` on success **and** on the "slot already
992    /// occupied" case — a follower mid-reconnect may receive a
993    /// segment chunk for a segment_id it already has on disk
994    /// (forwarded last session); the caller should treat that
995    /// path as a no-op rather than a fatal error.
996    pub fn receive_cold_segment(
997        &mut self,
998        segment_id: u32,
999        bytes: Vec<u8>,
1000    ) -> Result<(), EngineError> {
1001        let mut new_cat = self.catalog.clone();
1002        match new_cat.load_segment_bytes_at(segment_id, bytes) {
1003            Ok(()) => {
1004                self.replace_catalog(new_cat);
1005                Ok(())
1006            }
1007            Err(StorageError::Corrupt(msg)) if msg.contains("already occupied") => Ok(()),
1008            Err(e) => Err(EngineError::Storage(e)),
1009        }
1010    }
1011
1012    /// v6.7.3 — public shim around `Catalog::compact_cold_segments`
1013    /// driving every BTree index on every user table. Returns one
1014    /// `(table, index, report)` triple for each merge that
1015    /// actually happened (no-op (table, index) pairs are filtered
1016    /// out so callers can size persist-side work to the live
1017    /// merges). Caller is responsible for persisting each
1018    /// `report.merged_segment_bytes` and updating the on-disk
1019    /// segment registry; engine layer is no_std and never
1020    /// touches disk.
1021    ///
1022    /// Marks every touched table's cached `cold_row_count` stale
1023    /// — compaction GC'd some shadowed rows, so the count must be
1024    /// re-derived on the next ANALYZE.
1025    pub fn compact_cold_segments_with_target(
1026        &mut self,
1027        target_segment_bytes: u64,
1028    ) -> Result<Vec<(String, String, CompactReport)>, EngineError> {
1029        let table_names = self.active_catalog().table_names();
1030        let mut reports: Vec<(String, String, CompactReport)> = Vec::new();
1031        for tname in table_names {
1032            if is_internal_table_name(&tname) {
1033                continue;
1034            }
1035            let idx_names: Vec<String> = {
1036                let Some(t) = self.active_catalog().get(&tname) else {
1037                    continue;
1038                };
1039                t.indices()
1040                    .iter()
1041                    .filter(|i| matches!(i.kind, IndexKind::BTree(_)))
1042                    .map(|i| i.name.clone())
1043                    .collect()
1044            };
1045            for iname in idx_names {
1046                let report = self
1047                    .active_catalog_mut()
1048                    .compact_cold_segments(&tname, &iname, target_segment_bytes)
1049                    .map_err(EngineError::Storage)?;
1050                if report.merged_segment_id.is_some() {
1051                    if let Some(t) = self.active_catalog_mut().get_mut(&tname) {
1052                        t.mark_cold_row_count_stale();
1053                    }
1054                    reports.push((tname.clone(), iname, report));
1055                }
1056            }
1057        }
1058        Ok(reports)
1059    }
1060
1061    fn active_catalog(&self) -> &Catalog {
1062        match self.current_tx {
1063            Some(t) => self
1064                .tx_catalogs
1065                .get(&t)
1066                .map_or(&self.catalog, |s| &s.catalog),
1067            None => &self.catalog,
1068        }
1069    }
1070
1071    fn active_catalog_mut(&mut self) -> &mut Catalog {
1072        let tx = self.current_tx;
1073        match tx {
1074            Some(t) => match self.tx_catalogs.get_mut(&t) {
1075                Some(s) => &mut s.catalog,
1076                None => &mut self.catalog,
1077            },
1078            None => &mut self.catalog,
1079        }
1080    }
1081
1082    /// Read-only execute path. Succeeds for `SELECT` / `SHOW TABLES`
1083    /// / `SHOW COLUMNS`; returns `EngineError::WriteRequired` for
1084    /// every other statement, so the caller can fall through to the
1085    /// `&mut self` `execute` path under a write lock. Engine state is
1086    /// not mutated even on the success path (`rewrite_clock_calls`
1087    /// and `resolve_order_by_position` both mutate the locally-owned
1088    /// AST, not `self`).
1089    ///
1090    /// **v4.0 concurrency**: this is the entry point the server takes
1091    /// under an `RwLock::read()` so multiple `SELECT` clients run in
1092    /// parallel without serialising on a single mutex.
1093    pub fn execute_readonly(&self, sql: &str) -> Result<QueryResult, EngineError> {
1094        self.execute_readonly_with_cancel(sql, CancelToken::none())
1095    }
1096
1097    /// v4.5 — read path with cooperative cancellation. Token's
1098    /// `is_cancelled` is checked at the start (so a watchdog that
1099    /// already fired returns Cancelled immediately) and at row-loop
1100    /// checkpoints inside `exec_select`. SHOW paths are O(small) and
1101    /// don't bother checking.
1102    pub fn execute_readonly_with_cancel(
1103        &self,
1104        sql: &str,
1105        cancel: CancelToken<'_>,
1106    ) -> Result<QueryResult, EngineError> {
1107        cancel.check()?;
1108        let mut stmt = parser::parse_statement(sql)?;
1109        let now_micros = self.clock.map(|f| f());
1110        rewrite_clock_calls(&mut stmt, now_micros);
1111        if let Statement::Select(s) = &mut stmt {
1112            resolve_order_by_position(s);
1113            // v6.2.3 — cost-based JOIN reorder (read path).
1114            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1115        }
1116        let result = match stmt {
1117            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1118            Statement::ShowTables => Ok(self.exec_show_tables()),
1119            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1120            Statement::ShowUsers => Ok(self.exec_show_users()),
1121            Statement::ShowPublications => Ok(self.exec_show_publications()),
1122            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1123            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1124                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1125            )),
1126            Statement::Explain(e) => self.exec_explain(&e, cancel),
1127            _ => Err(EngineError::WriteRequired),
1128        };
1129        self.enforce_row_limit(result)
1130    }
1131
1132    /// v4.2: cap result-set size. Applied after the executor
1133    /// materialises rows but before they leave the engine — wrapping
1134    /// every Rows-returning exec_* function would scatter the check.
1135    fn enforce_row_limit(
1136        &self,
1137        result: Result<QueryResult, EngineError>,
1138    ) -> Result<QueryResult, EngineError> {
1139        if let (Ok(QueryResult::Rows { rows, .. }), Some(cap)) = (&result, self.max_query_rows)
1140            && rows.len() > cap
1141        {
1142            return Err(EngineError::RowLimitExceeded(cap));
1143        }
1144        result
1145    }
1146
1147    pub fn execute(&mut self, sql: &str) -> Result<QueryResult, EngineError> {
1148        self.execute_in_with_cancel(sql, IMPLICIT_TX, CancelToken::none())
1149    }
1150
1151    /// v4.5 — write path with cooperative cancellation. Same dispatch
1152    /// as `execute_in_with_cancel(sql, IMPLICIT_TX, cancel)`. Kept as
1153    /// a separate entry point for backward-compat with the v4.5
1154    /// public API.
1155    pub fn execute_with_cancel(
1156        &mut self,
1157        sql: &str,
1158        cancel: CancelToken<'_>,
1159    ) -> Result<QueryResult, EngineError> {
1160        self.execute_in_with_cancel(sql, IMPLICIT_TX, cancel)
1161    }
1162
1163    /// v4.41.1 multi-slot write entry. Routes `sql` through the TX
1164    /// slot identified by `tx_id` so spg-server dispatch can scope
1165    /// each implicit-wrap BEGIN..stmt..COMMIT to its own slot in
1166    /// `tx_catalogs`. `IMPLICIT_TX` is the legacy single-slot path
1167    /// every other caller (engine self-tests, replay, spg-embedded)
1168    /// implicitly takes via `execute()` / `execute_with_cancel()`.
1169    pub fn execute_in(&mut self, sql: &str, tx_id: TxId) -> Result<QueryResult, EngineError> {
1170        self.execute_in_with_cancel(sql, tx_id, CancelToken::none())
1171    }
1172
1173    /// v4.41.1 write path with cooperative cancellation + explicit TX
1174    /// scope. Sets `self.current_tx` for the duration of the call so
1175    /// every `exec_*` helper transparently sees its TX's shadow
1176    /// catalog and savepoint stack; restores on exit so the field is
1177    /// only valid mid-call (no leakage across calls).
1178    pub fn execute_in_with_cancel(
1179        &mut self,
1180        sql: &str,
1181        tx_id: TxId,
1182        cancel: CancelToken<'_>,
1183    ) -> Result<QueryResult, EngineError> {
1184        let saved = self.current_tx;
1185        self.current_tx = Some(tx_id);
1186        let result = self.execute_inner_with_cancel(sql, cancel);
1187        self.current_tx = saved;
1188        result
1189    }
1190
1191    /// v6.1.1 — parse and pre-process a SQL string ONCE so the
1192    /// resulting [`Statement`] can be cached and re-executed via
1193    /// [`Engine::execute_prepared`]. Returns the same `Statement`
1194    /// the simple-query path would synthesise internally (clock
1195    /// rewrites + ORDER BY position-ref resolution applied at
1196    /// prepare time, since both are session-independent). The
1197    /// `$N` placeholders in the SQL stay as `Expr::Placeholder(n)`
1198    /// nodes; they're resolved to concrete values per-call by
1199    /// `execute_prepared`'s substitution walk.
1200    ///
1201    /// Pgwire's `Parse` (P) message lands here.
1202    pub fn prepare(&self, sql: &str) -> Result<Statement, ParseError> {
1203        let mut stmt = parser::parse_statement(sql)?;
1204        let now_micros = self.clock.map(|f| f());
1205        rewrite_clock_calls(&mut stmt, now_micros);
1206        if let Statement::Select(s) = &mut stmt {
1207            // v6.4.1 — expand `GROUP BY ALL` to every non-aggregate
1208            // SELECT-list item BEFORE position / alias resolution so
1209            // downstream passes see the explicit list.
1210            expand_group_by_all(s);
1211            resolve_order_by_position(s);
1212            // v6.2.3 — cost-based JOIN reorder. No-op for
1213            // single-table FROMs or any non-INNER join shape.
1214            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1215        }
1216        Ok(stmt)
1217    }
1218
1219    /// v6.3.0 — cached prepare. Returns a cloned `Statement` from
1220    /// the plan cache on hit, runs the full `prepare()` path on miss
1221    /// and inserts the resulting plan before returning. Skipping the
1222    /// parse + JOIN-reorder pipeline on hit is the dominant win for
1223    /// JDBC / sqlx / pgx clients that reuse the same SQL string.
1224    ///
1225    /// Returns a cloned `Statement` (not a borrow) because the
1226    /// pgwire layer owns its `PreparedStmt` map per-session and the
1227    /// engine-level cache must stay available for other sessions.
1228    /// Clone cost on a 5-table JOIN AST is well under the parse cost
1229    /// it replaces.
1230    pub fn prepare_cached(&mut self, sql: &str) -> Result<Statement, ParseError> {
1231        // v6.3.1 — version-aware lookup. If the cached plan was
1232        // prepared before the most recent ANALYZE, evict and replan.
1233        let current_version = self.statistics.version();
1234        if let Some(plan) = self.plan_cache.get(sql) {
1235            if plan.statistics_version == current_version {
1236                return Ok(plan.stmt.clone());
1237            }
1238            // Stale entry — fall through to evict + re-prepare.
1239        }
1240        self.plan_cache.evict(sql);
1241        let stmt = self.prepare(sql)?;
1242        let source_tables = plan_cache::collect_source_tables(&stmt);
1243        let plan = plan_cache::PreparedPlan {
1244            stmt: stmt.clone(),
1245            statistics_version: current_version,
1246            source_tables,
1247            describe_columns: alloc::vec::Vec::new(),
1248        };
1249        self.plan_cache.insert(String::from(sql), plan);
1250        Ok(stmt)
1251    }
1252
1253    /// v6.3.0 — read-only accessor for tests and v6.3.1 invalidation.
1254    pub fn plan_cache(&self) -> &plan_cache::PlanCache {
1255        &self.plan_cache
1256    }
1257
1258    /// v6.3.0 — mutable accessor for v6.3.1 invalidation hooks.
1259    pub fn plan_cache_mut(&mut self) -> &mut plan_cache::PlanCache {
1260        &mut self.plan_cache
1261    }
1262
1263    /// v6.3.3 — Describe a prepared `Statement` without executing.
1264    /// Returns `(parameter_oids, output_columns)`. Empty
1265    /// `output_columns` means the statement has no row-producing
1266    /// shape we could resolve here (JOIN, subquery, non-SELECT, …)
1267    /// — pgwire layer maps that to a `NoData` reply.
1268    pub fn describe_prepared(
1269        &self,
1270        stmt: &Statement,
1271    ) -> (Vec<u32>, Vec<ColumnSchema>) {
1272        describe::describe_prepared(stmt, self.active_catalog())
1273    }
1274
1275    /// v6.1.1 — execute a [`Statement`] previously returned by
1276    /// [`Engine::prepare`], substituting `Expr::Placeholder(n)`
1277    /// nodes for the corresponding [`Value`] in `params` (1-based
1278    /// per PG: `$1` → `params[0]`). Bind-time string parameters
1279    /// are decoded into typed `Value`s by the pgwire layer before
1280    /// this call so the resulting AST hits the same execution
1281    /// path as a simple query — no SQL re-parse.
1282    ///
1283    /// Pgwire's `Execute` (E) message after a `Bind` (B) lands here.
1284    pub fn execute_prepared(
1285        &mut self,
1286        mut stmt: Statement,
1287        params: &[Value],
1288    ) -> Result<QueryResult, EngineError> {
1289        substitute_placeholders(&mut stmt, params)?;
1290        self.execute_stmt_with_cancel(stmt, CancelToken::none())
1291    }
1292
1293    fn execute_inner_with_cancel(
1294        &mut self,
1295        sql: &str,
1296        cancel: CancelToken<'_>,
1297    ) -> Result<QueryResult, EngineError> {
1298        cancel.check()?;
1299        let stmt = self.prepare(sql)?;
1300        // v6.5.1 — wrap the executor with a wall-clock window so we
1301        // can record into spg_stat_query. Skip when the engine has
1302        // no clock attached (no_std embedded callers).
1303        let start_us = self.clock.map(|f| f());
1304        let result = self.execute_stmt_with_cancel(stmt, cancel);
1305        if let (Some(t0), Ok(_)) = (start_us, &result) {
1306            let now = self.clock.map_or(t0, |f| f());
1307            let elapsed = now.saturating_sub(t0).max(0) as u64;
1308            self.query_stats.record(sql, elapsed, now as u64);
1309            // v6.5.6 — slow-query log: fire callback when elapsed
1310            // exceeds the configured floor.
1311            if let (Some(threshold), Some(logger)) =
1312                (self.slow_query_threshold_us, self.slow_query_logger)
1313                && elapsed >= threshold
1314            {
1315                logger(sql, elapsed);
1316            }
1317        }
1318        result
1319    }
1320
1321    fn execute_stmt_with_cancel(
1322        &mut self,
1323        stmt: Statement,
1324        cancel: CancelToken<'_>,
1325    ) -> Result<QueryResult, EngineError> {
1326        cancel.check()?;
1327        let result = match stmt {
1328            Statement::CreateTable(s) => self.exec_create_table(s),
1329            // v7.9.15 — CREATE EXTENSION is a no-op on SPG. Returns
1330            // CommandOk with affected=0; modified_catalog=false so
1331            // the WAL doesn't grow a useless entry. mailrs F3.
1332            Statement::CreateExtension(_) => Ok(QueryResult::CommandOk {
1333                affected: 0,
1334                modified_catalog: false,
1335            }),
1336            // v7.9.27 — DO $$ ... $$ is also a no-op (SPG has no
1337            // PL/pgSQL). mailrs H1 + pg_dump compat.
1338            Statement::DoBlock => Ok(QueryResult::CommandOk {
1339                affected: 0,
1340                modified_catalog: false,
1341            }),
1342            Statement::CreateIndex(s) => self.exec_create_index(s),
1343            Statement::Insert(s) => self.exec_insert(s),
1344            Statement::Update(s) => self.exec_update_cancel(&s, cancel),
1345            Statement::Delete(s) => self.exec_delete_cancel(&s, cancel),
1346            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1347            Statement::Begin => self.exec_begin(),
1348            Statement::Commit => self.exec_commit(),
1349            Statement::Rollback => self.exec_rollback(),
1350            Statement::Savepoint(name) => self.exec_savepoint(name),
1351            Statement::RollbackToSavepoint(name) => self.exec_rollback_to_savepoint(&name),
1352            Statement::ReleaseSavepoint(name) => self.exec_release_savepoint(&name),
1353            Statement::ShowTables => Ok(self.exec_show_tables()),
1354            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1355            Statement::ShowUsers => Ok(self.exec_show_users()),
1356            Statement::ShowPublications => Ok(self.exec_show_publications()),
1357            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1358            Statement::CreateUser(s) => self.exec_create_user(&s),
1359            Statement::DropUser(name) => self.exec_drop_user(&name),
1360            Statement::Explain(e) => self.exec_explain(&e, cancel),
1361            Statement::AlterIndex(s) => self.exec_alter_index(s),
1362            Statement::AlterTable(s) => self.exec_alter_table(s),
1363            Statement::CreatePublication(s) => self.exec_create_publication(s),
1364            Statement::DropPublication(name) => self.exec_drop_publication(&name),
1365            Statement::CreateSubscription(s) => self.exec_create_subscription(s),
1366            Statement::DropSubscription(name) => self.exec_drop_subscription(&name),
1367            // v6.1.7 — WAIT FOR WAL POSITION needs `lag_state`,
1368            // which lives in spg-server's ServerState. The engine
1369            // surfaces a clear error; the server-layer dispatch
1370            // intercepts the SQL before it reaches the engine on
1371            // a server build, so this arm only fires for
1372            // engine-only callers (spg-embedded, lib tests).
1373            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1374                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1375            )),
1376            // v6.2.0 — ANALYZE recomputes per-column histograms.
1377            Statement::Analyze(target) => self.exec_analyze(target.as_deref()),
1378            // v6.7.3 — COMPACT COLD SEGMENTS.
1379            Statement::CompactColdSegments => self.exec_compact_cold_segments(),
1380        };
1381        self.enforce_row_limit(result)
1382    }
1383
1384    /// v6.1.2 — `CREATE PUBLICATION` runtime path. Duplicate names
1385    /// surface as `EngineError::Unsupported` so the existing PG-wire
1386    /// error mapping stays uniform; the message carries the name so
1387    /// operators can grep replication-log noise. Inside-transaction
1388    /// invocation is rejected (matches `CREATE USER` / `DROP USER`
1389    /// stance) — replication-catalog mutation is a connection-level
1390    /// administrative op, not a transactional one.
1391    fn exec_create_publication(
1392        &mut self,
1393        s: CreatePublicationStatement,
1394    ) -> Result<QueryResult, EngineError> {
1395        // v6.1.4 — the v6.1.2 "no DDL inside a transaction" guard
1396        // was over-cautious: it also blocked the auto-commit wrap
1397        // path (which begins an internal TX around every WAL-
1398        // logged statement). PG itself allows CREATE PUBLICATION
1399        // inside a transaction (it rolls back with the TX).
1400        self.publications
1401            .create(s.name, s.scope)
1402            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE PUBLICATION: {e:?}")))?;
1403        Ok(QueryResult::CommandOk {
1404            affected: 1,
1405            modified_catalog: true,
1406        })
1407    }
1408
1409    /// v6.1.2 — `DROP PUBLICATION` runtime path. PG-compatible silent
1410    /// no-op when the publication doesn't exist (returns `affected=0`
1411    /// in that case so the wire-level command tag distinguishes
1412    /// "dropped" from "no-op", though both succeed).
1413    fn exec_drop_publication(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1414        let removed = self.publications.drop(name);
1415        Ok(QueryResult::CommandOk {
1416            affected: usize::from(removed),
1417            modified_catalog: removed,
1418        })
1419    }
1420
1421    /// v6.1.2 — read access to the publication catalog. Used by
1422    /// the v6.1.5 publisher-side WAL filter, by `SHOW PUBLICATIONS`
1423    /// (v6.1.3+), and by e2e tests that need to assert state without
1424    /// going through the wire.
1425    pub const fn publications(&self) -> &publications::Publications {
1426        &self.publications
1427    }
1428
1429    /// v6.1.4 — `CREATE SUBSCRIPTION` runtime path. Defaults
1430    /// `enabled = true` and `last_received_pos = 0` for a freshly-
1431    /// created subscription. The actual worker thread is spawned
1432    /// by spg-server once the engine returns success.
1433    fn exec_create_subscription(
1434        &mut self,
1435        s: CreateSubscriptionStatement,
1436    ) -> Result<QueryResult, EngineError> {
1437        // See exec_create_publication — the in_transaction gate
1438        // was over-cautious; the auto-commit wrap path holds an
1439        // internal TX that this check was incorrectly blocking.
1440        let sub = subscriptions::Subscription {
1441            conn_str: s.conn_str,
1442            publications: s.publications,
1443            enabled: true,
1444            last_received_pos: 0,
1445        };
1446        self.subscriptions
1447            .create(s.name, sub)
1448            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE SUBSCRIPTION: {e:?}")))?;
1449        Ok(QueryResult::CommandOk {
1450            affected: 1,
1451            modified_catalog: true,
1452        })
1453    }
1454
1455    /// v6.1.4 — `DROP SUBSCRIPTION`. Silent no-op when the name
1456    /// doesn't exist (PG-compatible). The associated worker is
1457    /// torn down by spg-server when it observes the catalog
1458    /// change at the next snapshot or via the engine's
1459    /// subscriptions accessor (the worker polls the catalog on
1460    /// reconnect; v6.1.5's filter-side will tighten this to an
1461    /// explicit signal).
1462    fn exec_drop_subscription(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1463        let removed = self.subscriptions.drop(name);
1464        Ok(QueryResult::CommandOk {
1465            affected: usize::from(removed),
1466            modified_catalog: removed,
1467        })
1468    }
1469
    /// v6.1.4 — read access to the subscription catalog. Used by
    /// the subscription worker (read its own row to find its
    /// publications + last applied position), by SHOW SUBSCRIPTIONS,
    /// and by e2e tests asserting state directly.
    ///
    /// Returns a shared borrow of the engine's `subscriptions`
    /// field; nothing is cloned.
    pub const fn subscriptions(&self) -> &subscriptions::Subscriptions {
        &self.subscriptions
    }
1477
    /// v6.1.4 — write access to `last_received_pos`. Worker
    /// calls this after each apply batch (under the engine's
    /// write-lock). Returns `false` when the subscription was
    /// dropped between when the worker received the record and
    /// when this call landed.
    ///
    /// Thin wrapper: the update and the returned flag both come
    /// straight from the subscription catalog's
    /// `update_last_received_pos(name, pos)`.
    pub fn subscription_advance(&mut self, name: &str, pos: u64) -> bool {
        self.subscriptions.update_last_received_pos(name, pos)
    }
1486
1487    /// v6.1.4 — `SHOW SUBSCRIPTIONS` row materialisation. Returns
1488    /// `(name, conn_str, publications, enabled, last_received_pos)`
1489    /// ordered by subscription name. The `publications` column is
1490    /// the comma-joined list ("p1, p2") for ergonomic SHOW output;
1491    /// callers wanting structured access read `Engine::subscriptions`.
1492    fn exec_show_subscriptions(&self) -> QueryResult {
1493        let columns = alloc::vec![
1494            ColumnSchema::new("name", DataType::Text, false),
1495            ColumnSchema::new("conn_str", DataType::Text, false),
1496            ColumnSchema::new("publications", DataType::Text, false),
1497            ColumnSchema::new("enabled", DataType::Bool, false),
1498            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1499        ];
1500        let rows: Vec<Row> = self
1501            .subscriptions
1502            .iter()
1503            .map(|(name, sub)| {
1504                Row::new(alloc::vec![
1505                    Value::Text(name.clone()),
1506                    Value::Text(sub.conn_str.clone()),
1507                    Value::Text(sub.publications.join(", ")),
1508                    Value::Bool(sub.enabled),
1509                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1510                ])
1511            })
1512            .collect();
1513        QueryResult::Rows { columns, rows }
1514    }
1515
1516    /// v6.2.0 — materialise `spg_statistic` rows. One row per
1517    /// `(table, column)` pair tracked in `Statistics`, with
1518    /// `histogram_bounds` rendered as a `[v0, v1, ...]` string —
1519    /// the same canonical form vector literals use for round-trip.
1520    fn exec_spg_statistic(&self) -> QueryResult {
1521        let columns = alloc::vec![
1522            ColumnSchema::new("table_name", DataType::Text, false),
1523            ColumnSchema::new("column_name", DataType::Text, false),
1524            ColumnSchema::new("null_frac", DataType::Float, false),
1525            ColumnSchema::new("n_distinct", DataType::BigInt, false),
1526            ColumnSchema::new("histogram_bounds", DataType::Text, false),
1527            // v6.7.0 — appended column (v6.2.0 stability contract
1528            // allows APPEND to spg_statistic, not reorder/rename).
1529            // Reports the cached per-table cold-row count; same
1530            // value across every column row of the same table.
1531            ColumnSchema::new("cold_row_count", DataType::BigInt, false),
1532        ];
1533        let rows: Vec<Row> = self
1534            .statistics
1535            .iter()
1536            .map(|((t, c), s)| {
1537                let cold = self
1538                    .catalog
1539                    .get(t)
1540                    .map_or(0, |table| table.cold_row_count());
1541                Row::new(alloc::vec![
1542                    Value::Text(t.clone()),
1543                    Value::Text(c.clone()),
1544                    Value::Float(f64::from(s.null_frac)),
1545                    Value::BigInt(i64::try_from(s.n_distinct).unwrap_or(i64::MAX)),
1546                    Value::Text(render_histogram_bounds(&s.histogram_bounds)),
1547                    Value::BigInt(i64::try_from(cold).unwrap_or(i64::MAX)),
1548                ])
1549            })
1550            .collect();
1551        QueryResult::Rows { columns, rows }
1552    }
1553
1554    /// v6.5.0 — materialise `spg_stat_replication` rows. One row
1555    /// per subscription with `(name, conn_str, publications,
1556    /// last_received_pos, enabled)`. Surface mirrors
1557    /// `SHOW SUBSCRIPTIONS` but follows the virtual-table dispatch
1558    /// shape so it composes with SELECT clauses (WHERE, projection
1559    /// onto specific columns, etc).
1560    fn exec_spg_stat_replication(&self) -> QueryResult {
1561        let columns = alloc::vec![
1562            ColumnSchema::new("name", DataType::Text, false),
1563            ColumnSchema::new("conn_str", DataType::Text, false),
1564            ColumnSchema::new("publications", DataType::Text, false),
1565            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1566            ColumnSchema::new("enabled", DataType::Bool, false),
1567        ];
1568        let rows: Vec<Row> = self
1569            .subscriptions
1570            .iter()
1571            .map(|(name, sub)| {
1572                Row::new(alloc::vec![
1573                    Value::Text(name.clone()),
1574                    Value::Text(sub.conn_str.clone()),
1575                    Value::Text(sub.publications.join(",")),
1576                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1577                    Value::Bool(sub.enabled),
1578                ])
1579            })
1580            .collect();
1581        QueryResult::Rows { columns, rows }
1582    }
1583
1584    /// v6.5.0 — materialise `spg_stat_segment` rows. One row per
1585    /// cold-tier segment with `(segment_id, num_rows, num_pages,
1586    /// total_bytes)`.
1587    ///
1588    /// v6.7.0 — appended `table_name` column resolves the v6.5.0
1589    /// carve-out. Walks every user table's BTree indices to find
1590    /// which table's Cold locators point at each segment. Empty
1591    /// string for orphan segments (loaded via SPG_PRELOAD_COLD_SEGMENT
1592    /// before any index registered a locator). The walk is
1593    /// O(tables × indices × keys); cached per call, not across
1594    /// calls — re-walked on every `SELECT * FROM spg_stat_segment`.
1595    fn exec_spg_stat_segment(&self) -> QueryResult {
1596        let columns = alloc::vec![
1597            ColumnSchema::new("segment_id", DataType::BigInt, false),
1598            ColumnSchema::new("table_name", DataType::Text, false),
1599            ColumnSchema::new("num_rows", DataType::BigInt, false),
1600            ColumnSchema::new("num_pages", DataType::BigInt, false),
1601            ColumnSchema::new("total_bytes", DataType::BigInt, false),
1602        ];
1603        // v6.7.0 — build a segment_id → table_name map by walking
1604        // every user table's BTree indices once. O(tables × indices
1605        // × keys) for the v6.5.0 carve-out resolution; acceptable
1606        // because spg_stat_segment is operator-facing (not on a
1607        // hot-loop path).
1608        let mut segment_owners: alloc::collections::BTreeMap<u32, String> = BTreeMap::new();
1609        for tname in self.catalog.table_names() {
1610            if is_internal_table_name(&tname) {
1611                continue;
1612            }
1613            let Some(t) = self.catalog.get(&tname) else {
1614                continue;
1615            };
1616            for idx in t.indices() {
1617                if let spg_storage::IndexKind::BTree(map) = &idx.kind {
1618                    for (_, locs) in map.iter() {
1619                        for loc in locs {
1620                            if let spg_storage::RowLocator::Cold { segment_id, .. } = loc {
1621                                segment_owners.entry(*segment_id).or_insert_with(|| tname.clone());
1622                            }
1623                        }
1624                    }
1625                }
1626            }
1627        }
1628        let rows: Vec<Row> = self
1629            .catalog
1630            .cold_segment_ids_global()
1631            .iter()
1632            .filter_map(|&id| {
1633                let seg = self.catalog.cold_segment(id)?;
1634                let meta = seg.meta();
1635                let owner = segment_owners
1636                    .get(&id)
1637                    .cloned()
1638                    .unwrap_or_default();
1639                Some(Row::new(alloc::vec![
1640                    Value::BigInt(i64::from(id)),
1641                    Value::Text(owner),
1642                    Value::BigInt(i64::try_from(meta.num_rows).unwrap_or(i64::MAX)),
1643                    Value::BigInt(i64::from(meta.num_pages)),
1644                    Value::BigInt(i64::try_from(meta.total_bytes).unwrap_or(i64::MAX)),
1645                ]))
1646            })
1647            .collect();
1648        QueryResult::Rows { columns, rows }
1649    }
1650
1651    /// v6.5.1 — materialise `spg_stat_query` rows. One row per
1652    /// distinct SQL text recorded since the engine booted, capped
1653    /// at `QUERY_STATS_MAX` (1024). Columns:
1654    ///   sql, exec_count, total_us, mean_us, max_us, last_seen_us
1655    /// mean_us = total_us / exec_count (saturating).
1656    fn exec_spg_stat_query(&self) -> QueryResult {
1657        let columns = alloc::vec![
1658            ColumnSchema::new("sql", DataType::Text, false),
1659            ColumnSchema::new("exec_count", DataType::BigInt, false),
1660            ColumnSchema::new("total_us", DataType::BigInt, false),
1661            ColumnSchema::new("mean_us", DataType::BigInt, false),
1662            ColumnSchema::new("max_us", DataType::BigInt, false),
1663            ColumnSchema::new("last_seen_us", DataType::BigInt, false),
1664        ];
1665        let rows: Vec<Row> = self
1666            .query_stats
1667            .snapshot()
1668            .into_iter()
1669            .map(|(sql, s)| {
1670                let mean = if s.exec_count == 0 {
1671                    0
1672                } else {
1673                    s.total_us / s.exec_count
1674                };
1675                Row::new(alloc::vec![
1676                    Value::Text(sql),
1677                    Value::BigInt(i64::try_from(s.exec_count).unwrap_or(i64::MAX)),
1678                    Value::BigInt(i64::try_from(s.total_us).unwrap_or(i64::MAX)),
1679                    Value::BigInt(i64::try_from(mean).unwrap_or(i64::MAX)),
1680                    Value::BigInt(i64::try_from(s.max_us).unwrap_or(i64::MAX)),
1681                    Value::BigInt(i64::try_from(s.last_seen_us).unwrap_or(i64::MAX)),
1682                ])
1683            })
1684            .collect();
1685        QueryResult::Rows { columns, rows }
1686    }
1687
    /// v6.5.2 — register a connection-state provider. spg-server
    /// calls this at startup with a function that snapshots its
    /// per-pgwire-connection registry. Engine reads through the
    /// callback on `SELECT * FROM spg_stat_activity`.
    ///
    /// Builder-style: consumes the engine, stores `f` (replacing
    /// any previously registered provider) and returns the engine.
    #[must_use]
    pub const fn with_activity_provider(mut self, f: ActivityProvider) -> Self {
        self.activity_provider = Some(f);
        self
    }
1697
    /// v6.5.3 — register audit chain provider + verifier.
    ///
    /// Builder-style: consumes the engine, stores `chain` as the
    /// audit-chain provider and `verify` as the audit verifier
    /// (both are always set together, replacing any previous
    /// values) and returns the engine.
    #[must_use]
    pub const fn with_audit_providers(
        mut self,
        chain: AuditChainProvider,
        verify: AuditVerifier,
    ) -> Self {
        self.audit_chain_provider = Some(chain);
        self.audit_verifier = Some(verify);
        self
    }
1709
    /// v6.5.6 — register a slow-query log callback. `threshold_us`
    /// is the floor (in microseconds); only executions that run
    /// above the floor fire the callback. spg-server wires this from
    /// `SPG_SLOW_QUERY_THRESHOLD_MS` (default 100 ms).
    ///
    /// Builder-style: consumes the engine, stores the threshold and
    /// the logger together (replacing any previous values) and
    /// returns the engine. The threshold comparison itself happens
    /// elsewhere, not in this method.
    #[must_use]
    pub const fn with_slow_query_log(
        mut self,
        threshold_us: u64,
        logger: SlowQueryLogger,
    ) -> Self {
        self.slow_query_threshold_us = Some(threshold_us);
        self.slow_query_logger = Some(logger);
        self
    }
1724
    /// v6.5.6 — operator knob for plan cache cap. spg-server reads
    /// `SPG_PLAN_CACHE_MAX` env at startup; uses this to override
    /// the compile-time default of 256.
    ///
    /// Thin wrapper: forwards `n` to the plan cache's
    /// `set_max_entries`.
    pub fn set_plan_cache_max(&mut self, n: usize) {
        self.plan_cache.set_max_entries(n);
    }
1731
1732    /// v6.5.2 — materialise `spg_stat_activity` rows. Pulls a fresh
1733    /// snapshot from the registered `ActivityProvider`. Returns an
1734    /// empty result set when no provider is registered (the no_std
1735    /// embedded path with no pgwire layer).
1736    fn exec_spg_stat_activity(&self) -> QueryResult {
1737        let columns = alloc::vec![
1738            ColumnSchema::new("pid", DataType::Int, false),
1739            ColumnSchema::new("user", DataType::Text, false),
1740            ColumnSchema::new("started_at_us", DataType::BigInt, false),
1741            ColumnSchema::new("current_sql", DataType::Text, false),
1742            ColumnSchema::new("wait_event", DataType::Text, false),
1743            ColumnSchema::new("elapsed_us", DataType::BigInt, false),
1744            ColumnSchema::new("in_transaction", DataType::Bool, false),
1745        ];
1746        let rows: Vec<Row> = self
1747            .activity_provider
1748            .map(|f| f())
1749            .unwrap_or_default()
1750            .into_iter()
1751            .map(|r| {
1752                Row::new(alloc::vec![
1753                    Value::Int(i32::try_from(r.pid).unwrap_or(i32::MAX)),
1754                    Value::Text(r.user),
1755                    Value::BigInt(r.started_at_us),
1756                    Value::Text(r.current_sql),
1757                    Value::Text(r.wait_event),
1758                    Value::BigInt(r.elapsed_us),
1759                    Value::Bool(r.in_transaction),
1760                ])
1761            })
1762            .collect();
1763        QueryResult::Rows { columns, rows }
1764    }
1765
1766    /// v6.5.4 — materialise `spg_table_ddl` rows. One row per user
1767    /// table with `(table_name, ddl)`. Reconstructed from catalog
1768    /// state on demand.
1769    fn exec_spg_table_ddl(&self) -> QueryResult {
1770        let columns = alloc::vec![
1771            ColumnSchema::new("table_name", DataType::Text, false),
1772            ColumnSchema::new("ddl", DataType::Text, false),
1773        ];
1774        let rows: Vec<Row> = self
1775            .catalog
1776            .table_names()
1777            .into_iter()
1778            .filter(|n| !is_internal_table_name(n))
1779            .filter_map(|name| {
1780                let table = self.catalog.get(&name)?;
1781                let ddl = render_create_table(&name, &table.schema().columns);
1782                Some(Row::new(alloc::vec![
1783                    Value::Text(name),
1784                    Value::Text(ddl),
1785                ]))
1786            })
1787            .collect();
1788        QueryResult::Rows { columns, rows }
1789    }
1790
1791    /// v6.5.4 — materialise `spg_role_ddl` rows. One row per user
1792    /// with `(role_name, ddl)`. Password is redacted (matches the
1793    /// `Statement::CreateUser` Display which prints `'<redacted>'`).
1794    fn exec_spg_role_ddl(&self) -> QueryResult {
1795        let columns = alloc::vec![
1796            ColumnSchema::new("role_name", DataType::Text, false),
1797            ColumnSchema::new("ddl", DataType::Text, false),
1798        ];
1799        let rows: Vec<Row> = self
1800            .users
1801            .iter()
1802            .map(|(name, rec)| {
1803                let ddl = alloc::format!(
1804                    "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}'",
1805                    rec.role.as_str(),
1806                );
1807                Row::new(alloc::vec![Value::Text(String::from(name)), Value::Text(ddl)])
1808            })
1809            .collect();
1810        QueryResult::Rows { columns, rows }
1811    }
1812
1813    /// v6.5.4 — materialise `spg_database_ddl`: single row whose
1814    /// `ddl` column concatenates every user table's CREATE +
1815    /// every role's CREATE in deterministic catalog order. Suitable
1816    /// for piping back through `Engine::execute` to recreate a
1817    /// schema-equivalent database.
1818    fn exec_spg_database_ddl(&self) -> QueryResult {
1819        let columns = alloc::vec![ColumnSchema::new("ddl", DataType::Text, false)];
1820        let mut out = String::new();
1821        for (name, rec) in self.users.iter() {
1822            out.push_str(&alloc::format!(
1823                "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}';\n",
1824                rec.role.as_str(),
1825            ));
1826        }
1827        for name in self.catalog.table_names() {
1828            if is_internal_table_name(&name) {
1829                continue;
1830            }
1831            if let Some(table) = self.catalog.get(&name) {
1832                out.push_str(&render_create_table(&name, &table.schema().columns));
1833                out.push_str(";\n");
1834            }
1835        }
1836        QueryResult::Rows {
1837            columns,
1838            rows: alloc::vec![Row::new(alloc::vec![Value::Text(out)])],
1839        }
1840    }
1841
1842    /// v6.5.3 — materialise `spg_audit_chain` rows. Pulls a fresh
1843    /// snapshot from the registered provider; empty when no
1844    /// provider is set.
1845    fn exec_spg_audit_chain(&self) -> QueryResult {
1846        let columns = alloc::vec![
1847            ColumnSchema::new("seq", DataType::BigInt, false),
1848            ColumnSchema::new("ts_ms", DataType::BigInt, false),
1849            ColumnSchema::new("prev_hash", DataType::Text, false),
1850            ColumnSchema::new("entry_hash", DataType::Text, false),
1851            ColumnSchema::new("sql", DataType::Text, false),
1852        ];
1853        let rows: Vec<Row> = self
1854            .audit_chain_provider
1855            .map(|f| f())
1856            .unwrap_or_default()
1857            .into_iter()
1858            .map(|r| {
1859                Row::new(alloc::vec![
1860                    Value::BigInt(r.seq),
1861                    Value::BigInt(r.ts_ms),
1862                    Value::Text(r.prev_hash_hex),
1863                    Value::Text(r.entry_hash_hex),
1864                    Value::Text(r.sql),
1865                ])
1866            })
1867            .collect();
1868        QueryResult::Rows { columns, rows }
1869    }
1870
1871    /// v6.5.3 — materialise `spg_audit_verify` single-row result.
1872    /// `(verified_count, broken_at_seq)` — broken_at_seq is `-1`
1873    /// on a clean chain. Returns one row with both values 0 when
1874    /// no verifier is registered (no-data fallback for embedded
1875    /// callers).
1876    fn exec_spg_audit_verify(&self) -> QueryResult {
1877        let columns = alloc::vec![
1878            ColumnSchema::new("verified_count", DataType::BigInt, false),
1879            ColumnSchema::new("broken_at_seq", DataType::BigInt, false),
1880        ];
1881        let (verified, broken) = self.audit_verifier.map(|f| f()).unwrap_or((0, -1));
1882        let row = Row::new(alloc::vec![
1883            Value::BigInt(verified),
1884            Value::BigInt(broken),
1885        ]);
1886        QueryResult::Rows {
1887            columns,
1888            rows: alloc::vec![row],
1889        }
1890    }
1891
    /// v6.5.1 — read-only accessor for tests + v6.5.6 ops resets.
    ///
    /// Returns a shared borrow of the engine's `query_stats` field.
    pub fn query_stats(&self) -> &query_stats::QueryStats {
        &self.query_stats
    }
1896
    /// v6.5.1 — mutable accessor (clear, etc).
    ///
    /// Returns an exclusive borrow of the engine's `query_stats`
    /// field, so it needs `&mut self`.
    pub fn query_stats_mut(&mut self) -> &mut query_stats::QueryStats {
        &mut self.query_stats
    }
1901
    /// v6.2.0 — read access to the per-column statistics table.
    /// Used by the planner (v6.2.2 selectivity functions read this),
    /// by `SELECT * FROM spg_statistic`, and by e2e tests.
    ///
    /// Returns a shared borrow of the engine's `statistics` field;
    /// nothing is cloned.
    pub const fn statistics(&self) -> &statistics::Statistics {
        &self.statistics
    }
1908
1909    /// v6.2.1 — return tables whose modified-row count crossed the
1910    /// auto-analyze threshold since the last ANALYZE on that table.
1911    /// The threshold is `0.1 × max(row_count, MIN_ROWS_FOR_AUTO_
1912    /// ANALYZE)` — combines PG-style fractional + absolute lower
1913    /// bound so a fresh / tiny table doesn't get hammered on every
1914    /// INSERT.
1915    ///
1916    /// Designed to be cheap: walks every user table's
1917    /// `Catalog::table_names()` + reads `statistics::modified_
1918    /// since_last_analyze()` (BTreeMap lookup). The background
1919    /// worker calls this under `engine.read()` then drops the lock
1920    /// before re-acquiring `engine.write()` for the actual ANALYZE.
1921    pub fn tables_needing_analyze(&self) -> Vec<String> {
1922        const MIN_ROWS: u64 = 100;
1923        let mut out = Vec::new();
1924        for name in self.catalog.table_names() {
1925            if is_internal_table_name(&name) {
1926                continue;
1927            }
1928            let Some(table) = self.catalog.get(&name) else {
1929                continue;
1930            };
1931            let row_count = table.rows().len() as u64;
1932            let modified = self.statistics.modified_since_last_analyze(&name);
1933            // Threshold: ceil(0.1 × max(row_count, MIN_ROWS)),
1934            // computed in integer arithmetic so spg-engine stays
1935            // no_std without pulling in libm. `(n + 9) / 10` is
1936            // `ceil(n / 10)` for non-negative `n`.
1937            let base = row_count.max(MIN_ROWS);
1938            let threshold = base.saturating_add(9) / 10;
1939            if modified >= threshold {
1940                out.push(name);
1941            }
1942        }
1943        out
1944    }
1945
1946    /// v6.2.0 — `ANALYZE [<table>]` runtime. Bare `ANALYZE` walks
1947    /// every user table; `ANALYZE <name>` re-stats one. For each
1948    /// target table, single-pass scan + per-column histogram +
1949    /// `null_frac` + `n_distinct`. Replaces the table's prior
1950    /// stats; resets the modified-row counter.
1951    ///
1952    /// v6.2.0 doesn't sample — it scans the full table. v6.2.x
1953    /// can add reservoir sampling at the > 100 K-row mark; not a
1954    /// scope blocker for the current commit since rows ≤ 100 K
1955    /// analyse in milliseconds.
1956    fn exec_analyze(&mut self, target: Option<&str>) -> Result<QueryResult, EngineError> {
1957        let names: Vec<String> = if let Some(name) = target {
1958            // Verify the table exists; surface a clear error if not.
1959            if self.catalog.get(name).is_none() {
1960                return Err(EngineError::Storage(StorageError::TableNotFound {
1961                    name: name.to_string(),
1962                }));
1963            }
1964            alloc::vec![name.to_string()]
1965        } else {
1966            self.catalog
1967                .table_names()
1968                .into_iter()
1969                .filter(|n| !is_internal_table_name(n))
1970                .collect()
1971        };
1972        let mut analysed = 0usize;
1973        for table_name in &names {
1974            self.analyze_one_table(table_name)?;
1975            analysed += 1;
1976        }
1977        // v6.3.1 — plan cache invalidation. Bump stats version so
1978        // future lookups see the new generation, and selectively
1979        // evict every plan whose `source_tables` overlap with the
1980        // ANALYZE target set. Bare ANALYZE (all tables) clears the
1981        // whole cache.
1982        if analysed > 0 {
1983            self.statistics.bump_version();
1984            if target.is_some() {
1985                for t in &names {
1986                    self.plan_cache.evict_referencing(t);
1987                }
1988            } else {
1989                self.plan_cache.clear();
1990            }
1991        }
1992        Ok(QueryResult::CommandOk {
1993            affected: analysed,
1994            modified_catalog: true,
1995        })
1996    }
1997
1998    /// v6.7.3 — `COMPACT COLD SEGMENTS` runtime path. Drives the
1999    /// engine-layer compaction shim with the default
2000    /// 4 MiB segment-size threshold. spg-server intercepts the
2001    /// SQL before it reaches the engine on a server build —
2002    /// it reads `SPG_COMPACTION_TARGET_SEGMENT_BYTES`, calls
2003    /// `Engine::compact_cold_segments_with_target` directly with
2004    /// the env value, and persists every merged segment to
2005    /// `<db>.spg/segments/`. This arm only fires for engine-only
2006    /// callers (spg-embedded, lib tests); in that mode merged
2007    /// segments live in memory and are dropped at process exit.
2008    fn exec_compact_cold_segments(&mut self) -> Result<QueryResult, EngineError> {
2009        let target = COMPACTION_TARGET_DEFAULT_BYTES;
2010        let reports = self.compact_cold_segments_with_target(target)?;
2011        let columns = alloc::vec![
2012            ColumnSchema::new("table_name", DataType::Text, false),
2013            ColumnSchema::new("index_name", DataType::Text, false),
2014            ColumnSchema::new("sources_merged", DataType::BigInt, false),
2015            ColumnSchema::new("merged_segment_id", DataType::BigInt, false),
2016            ColumnSchema::new("merged_rows", DataType::BigInt, false),
2017            ColumnSchema::new("deleted_rows_pruned", DataType::BigInt, false),
2018            ColumnSchema::new("bytes_reclaimed_estimate", DataType::BigInt, false),
2019        ];
2020        let rows: Vec<Row> = reports
2021            .into_iter()
2022            .map(|(tname, iname, report)| {
2023                Row::new(alloc::vec![
2024                    Value::Text(tname),
2025                    Value::Text(iname),
2026                    Value::BigInt(i64::try_from(report.sources.len()).unwrap_or(i64::MAX)),
2027                    Value::BigInt(i64::from(report.merged_segment_id.unwrap_or(0))),
2028                    Value::BigInt(i64::try_from(report.merged_rows).unwrap_or(i64::MAX)),
2029                    Value::BigInt(
2030                        i64::try_from(report.deleted_rows_pruned).unwrap_or(i64::MAX),
2031                    ),
2032                    Value::BigInt(
2033                        i64::try_from(report.bytes_reclaimed_estimate).unwrap_or(i64::MAX),
2034                    ),
2035                ])
2036            })
2037            .collect();
2038        Ok(QueryResult::Rows { columns, rows })
2039    }
2040
2041    /// Walk a single table's rows once and (re-)populate per-column
2042    /// stats. Drops the existing stats for `table` first so columns
2043    /// that have been DROP-ed between ANALYZEs don't leave stale
2044    /// rows.
2045    fn analyze_one_table(&mut self, table_name: &str) -> Result<(), EngineError> {
2046        let table = self.catalog.get(table_name).ok_or_else(|| {
2047            EngineError::Storage(StorageError::TableNotFound {
2048                name: table_name.to_string(),
2049            })
2050        })?;
2051        let schema = table.schema().clone();
2052        let row_count = table.rows().len();
2053        // For each column, collect (sorted) non-NULL textual values
2054        // + count NULLs; then ask `statistics::build_histogram` to
2055        // produce the 101 bounds and `estimate_n_distinct` the
2056        // distinct count.
2057        self.statistics.clear_table(table_name);
2058        for (col_pos, col_schema) in schema.columns.iter().enumerate() {
2059            // v6.2.0 skip: vector columns have their own stats
2060            // shape (HNSW graph topology). v6.2 deliberation #1.
2061            if matches!(col_schema.ty, DataType::Vector { .. }) {
2062                continue;
2063            }
2064            let mut non_null_values: Vec<Value> = Vec::with_capacity(row_count);
2065            let mut nulls: u64 = 0;
2066            for row in table.rows() {
2067                match row.values.get(col_pos) {
2068                    Some(Value::Null) | None => nulls += 1,
2069                    Some(v) => non_null_values.push(v.clone()),
2070                }
2071            }
2072            // Sort by type-aware ordering (Int as int, Text as
2073            // lex, etc.) so histogram bounds reflect the column's
2074            // natural order — not lexicographic on the string
2075            // representation, which would put "9" after "49".
2076            non_null_values.sort_by(|a, b| sort_values_for_histogram(a, b));
2077            let non_null: Vec<String> = non_null_values
2078                .iter()
2079                .map(canonical_value_repr)
2080                .collect();
2081            let null_frac = if row_count == 0 {
2082                0.0
2083            } else {
2084                #[allow(clippy::cast_precision_loss)]
2085                let f = nulls as f32 / row_count as f32;
2086                f
2087            };
2088            let n_distinct = statistics::estimate_n_distinct(&non_null);
2089            let histogram_bounds = statistics::build_histogram(&non_null);
2090            self.statistics.set(
2091                table_name.to_string(),
2092                col_schema.name.clone(),
2093                statistics::ColumnStats {
2094                    null_frac,
2095                    n_distinct,
2096                    histogram_bounds,
2097                },
2098            );
2099        }
2100        self.statistics.reset_modified(table_name);
2101        // v6.7.0 — refresh the per-table cold_rows cache. Walk the
2102        // BTree indices and count Cold locators (MAX across
2103        // indices); store the result on the table. Surfaced via
2104        // `spg_statistic.cold_row_count` (new column) and
2105        // `spg_stat_segment.table_name` (new column).
2106        let cold_count = {
2107            let table = self
2108                .active_catalog()
2109                .get(table_name)
2110                .expect("table still present");
2111            table.count_cold_locators()
2112        };
2113        let table_mut = self
2114            .active_catalog_mut()
2115            .get_mut(table_name)
2116            .expect("table still present");
2117        table_mut.set_cold_row_count(cold_count);
2118        Ok(())
2119    }
2120
2121    /// v6.1.3 — `SHOW PUBLICATIONS` row materialisation. Returns
2122    /// `(name, scope, table_count)` ordered by publication name.
2123    ///   - `scope` is the human-readable string:
2124    ///       `"FOR ALL TABLES"` /
2125    ///       `"FOR TABLE t1, t2"` /
2126    ///       `"FOR ALL TABLES EXCEPT t1, t2"`.
2127    ///   - `table_count` is NULL for `AllTables`, the list length
2128    ///     otherwise. NULLability lets clients distinguish "publish
2129    ///     everything" from "publish exactly 0 tables" (the v6.1.3
2130    ///     parser forbids the empty list, but the column shape is
2131    ///     ready for the v6.1.5 publisher-side semantics).
2132    fn exec_show_publications(&self) -> QueryResult {
2133        let columns = alloc::vec![
2134            ColumnSchema::new("name", DataType::Text, false),
2135            ColumnSchema::new("scope", DataType::Text, false),
2136            ColumnSchema::new("table_count", DataType::Int, true),
2137        ];
2138        let rows: Vec<Row> = self
2139            .publications
2140            .iter()
2141            .map(|(name, scope)| {
2142                let (scope_str, count_val) = match scope {
2143                    spg_sql::ast::PublicationScope::AllTables => {
2144                        ("FOR ALL TABLES".to_string(), Value::Null)
2145                    }
2146                    spg_sql::ast::PublicationScope::ForTables(ts) => (
2147                        alloc::format!("FOR TABLE {}", ts.join(", ")),
2148                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2149                    ),
2150                    spg_sql::ast::PublicationScope::AllTablesExcept(ts) => (
2151                        alloc::format!("FOR ALL TABLES EXCEPT {}", ts.join(", ")),
2152                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2153                    ),
2154                };
2155                Row::new(alloc::vec![
2156                    Value::Text(name.clone()),
2157                    Value::Text(scope_str),
2158                    count_val,
2159                ])
2160            })
2161            .collect();
2162        QueryResult::Rows { columns, rows }
2163    }
2164
2165    /// v4.1 `SHOW USERS` — `(name, role)` per row, ordered by name.
2166    fn exec_show_users(&self) -> QueryResult {
2167        let columns = alloc::vec![
2168            ColumnSchema::new("name", DataType::Text, false),
2169            ColumnSchema::new("role", DataType::Text, false),
2170        ];
2171        let rows: Vec<Row> = self
2172            .users
2173            .iter()
2174            .map(|(name, rec)| {
2175                Row::new(alloc::vec![
2176                    Value::Text(name.to_string()),
2177                    Value::Text(rec.role.as_str().to_string()),
2178                ])
2179            })
2180            .collect();
2181        QueryResult::Rows { columns, rows }
2182    }
2183
2184    fn exec_create_user(&mut self, s: &CreateUserStatement) -> Result<QueryResult, EngineError> {
2185        if self.in_transaction() {
2186            return Err(EngineError::Unsupported(
2187                "CREATE USER is not allowed inside a transaction".into(),
2188            ));
2189        }
2190        let role = users::Role::parse(&s.role).ok_or_else(|| {
2191            EngineError::Unsupported(alloc::format!("invalid role: {:?}", s.role))
2192        })?;
2193        // Prefer the host-injected RNG. Falls back to a deterministic
2194        // salt derived from the username only when no RNG is wired —
2195        // acceptable for tests; the server always installs one.
2196        let salt = self.salt_fn.map_or_else(
2197            || {
2198                let mut s_bytes = [0u8; 16];
2199                let digest = spg_crypto::hash(s.name.as_bytes());
2200                s_bytes.copy_from_slice(&digest[..16]);
2201                s_bytes
2202            },
2203            |f| f(),
2204        );
2205        self.users
2206            .create(&s.name, &s.password, role, salt)
2207            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE USER: {e}")))?;
2208        Ok(QueryResult::CommandOk {
2209            affected: 1,
2210            modified_catalog: true,
2211        })
2212    }
2213
2214    fn exec_drop_user(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2215        if self.in_transaction() {
2216            return Err(EngineError::Unsupported(
2217                "DROP USER is not allowed inside a transaction".into(),
2218            ));
2219        }
2220        self.users
2221            .drop(name)
2222            .map_err(|e| EngineError::Unsupported(alloc::format!("DROP USER: {e}")))?;
2223        Ok(QueryResult::CommandOk {
2224            affected: 1,
2225            modified_catalog: true,
2226        })
2227    }
2228
2229    /// v4.4 `UPDATE <table> SET col = expr [, ...] [WHERE cond]`.
2230    /// Filter pass uses the same WHERE eval as `exec_select`. Per
2231    /// matched row, evaluate each RHS expression against the *old*
2232    /// row, then call `Table::update_row` which rebuilds indices.
2233    /// Indexed columns are correctly reflected because rebuild
2234    /// happens after the cell rewrite.
2235    fn exec_update_cancel(
2236        &mut self,
2237        stmt: &spg_sql::ast::UpdateStatement,
2238        cancel: CancelToken<'_>,
2239    ) -> Result<QueryResult, EngineError> {
2240        // v5.2.3: if the WHERE is a PK equality and matches a cold-
2241        // tier row, promote it back to the hot tier *before* the
2242        // hot-row walk. The promote pushes the row to the end of
2243        // `table.rows`, where the upcoming SET-evaluation loop will
2244        // pick it up and apply the assignments. Lookups for the key
2245        // never observe a gap because `promote_cold_row` inserts the
2246        // hot row before retiring the cold locator.
2247        if let Some(w) = &stmt.where_ {
2248            let schema_cols = self
2249                .active_catalog()
2250                .get(&stmt.table)
2251                .ok_or_else(|| {
2252                    EngineError::Storage(StorageError::TableNotFound {
2253                        name: stmt.table.clone(),
2254                    })
2255                })?
2256                .schema()
2257                .columns
2258                .clone();
2259            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
2260                && let Some(idx_name) = self
2261                    .active_catalog()
2262                    .get(&stmt.table)
2263                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
2264            {
2265                // Promote may be a no-op (key is hot-only or absent);
2266                // we don't care about the return value here — the
2267                // subsequent hot walk will either match or not.
2268                let _ = self
2269                    .active_catalog_mut()
2270                    .promote_cold_row(&stmt.table, &idx_name, &key);
2271            }
2272        }
2273
2274        let table = self
2275            .active_catalog_mut()
2276            .get_mut(&stmt.table)
2277            .ok_or_else(|| {
2278                EngineError::Storage(StorageError::TableNotFound {
2279                    name: stmt.table.clone(),
2280                })
2281            })?;
2282        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
2283        // Resolve each SET target to a column position once, validate
2284        // up front so a typo'd column doesn't leave a partial mutation
2285        // behind.
2286        let mut targets: Vec<(usize, &Expr)> = Vec::with_capacity(stmt.assignments.len());
2287        for (col, expr) in &stmt.assignments {
2288            let pos = schema_cols
2289                .iter()
2290                .position(|c| c.name == *col)
2291                .ok_or_else(|| {
2292                    EngineError::Eval(EvalError::ColumnNotFound { name: col.clone() })
2293                })?;
2294            targets.push((pos, expr));
2295        }
2296        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
2297        // Walk every row, evaluate WHERE then SET expressions. We
2298        // gather (position, new_values) tuples first and apply them
2299        // afterwards so the WHERE/RHS evaluation reads the original
2300        // row state — matches PG semantics (UPDATE doesn't see its
2301        // own writes).
2302        let mut planned: Vec<(usize, Vec<Value>)> = Vec::new();
2303        for (i, row) in table.rows().iter().enumerate() {
2304            // v4.5: cooperative cancel checkpoint every 256 rows so
2305            // a runaway UPDATE without WHERE doesn't drag past the
2306            // server's query-timeout watchdog.
2307            if i.is_multiple_of(256) {
2308                cancel.check()?;
2309            }
2310            if let Some(w) = &stmt.where_ {
2311                let cond = eval::eval_expr(w, row, &ctx)?;
2312                if !matches!(cond, Value::Bool(true)) {
2313                    continue;
2314                }
2315            }
2316            let mut new_vals = row.values.clone();
2317            for (pos, expr) in &targets {
2318                let v = eval::eval_expr(expr, row, &ctx)?;
2319                new_vals[*pos] =
2320                    coerce_value(v, schema_cols[*pos].ty, &schema_cols[*pos].name, *pos)?;
2321            }
2322            planned.push((i, new_vals));
2323        }
2324        // v7.6.6 — capture pre-update row values for the FK
2325        // enforcement passes below. `planned` carries new values
2326        // only; pair them with the old row.
2327        let plan_with_old: Vec<(usize, Vec<Value>, Vec<Value>)> = planned
2328            .iter()
2329            .map(|(pos, new_vals)| (*pos, table.rows()[*pos].values.clone(), new_vals.clone()))
2330            .collect();
2331        let self_fks = table.schema().foreign_keys.clone();
2332        let affected = planned.len();
2333        // Release mutable borrow on `table` for the FK passes.
2334        let _ = table;
2335        // v7.6.6 — Stage 2a: outbound FK check. For every row whose
2336        // local FK columns changed, the new value must exist in the
2337        // parent.
2338        if !self_fks.is_empty() {
2339            let new_rows: Vec<Vec<Value>> = planned
2340                .iter()
2341                .map(|(_pos, new_vals)| new_vals.clone())
2342                .collect();
2343            enforce_fk_inserts(self.active_catalog(), &stmt.table, &self_fks, &new_rows)?;
2344        }
2345        // v7.6.6 — Stage 2b: inbound FK check. For every row that
2346        // changed value in a column that *some other table* uses as
2347        // a FK parent column, react per `on_update` action.
2348        let child_plan = plan_fk_parent_updates(self.active_catalog(), &stmt.table, &plan_with_old)?;
2349        // Stage 3a — apply each child-side action.
2350        for step in &child_plan {
2351            apply_fk_child_step(self.active_catalog_mut(), step)?;
2352        }
2353        // Stage 3b — apply the original UPDATE.
2354        let table = self
2355            .active_catalog_mut()
2356            .get_mut(&stmt.table)
2357            .ok_or_else(|| {
2358                EngineError::Storage(StorageError::TableNotFound {
2359                    name: stmt.table.clone(),
2360                })
2361            })?;
2362        // v7.9.4 — snapshot post-update values for RETURNING.
2363        let updated_for_returning: Vec<Vec<Value>> =
2364            if stmt.returning.is_some() {
2365                planned.iter().map(|(_pos, vals)| vals.clone()).collect()
2366            } else {
2367                Vec::new()
2368            };
2369        for (pos, vals) in planned {
2370            table.update_row(pos, vals)?;
2371        }
2372        let _ = table;
2373        // v6.2.1 — auto-analyze modified-row tracking for UPDATE.
2374        if !self.in_transaction() && affected > 0 {
2375            self.statistics
2376                .record_modifications(&stmt.table, affected as u64);
2377        }
2378        // v7.9.4 — RETURNING projection.
2379        if let Some(items) = &stmt.returning {
2380            return self.build_returning_rows(
2381                &stmt.table,
2382                items,
2383                updated_for_returning,
2384            );
2385        }
2386        Ok(QueryResult::CommandOk {
2387            affected,
2388            modified_catalog: !self.in_transaction(),
2389        })
2390    }
2391
2392    /// v4.4 `DELETE FROM <table> [WHERE cond]`. Collects matching
2393    /// positions then delegates to `Table::delete_rows` (single index
2394    /// rebuild for the batch).
2395    fn exec_delete_cancel(
2396        &mut self,
2397        stmt: &spg_sql::ast::DeleteStatement,
2398        cancel: CancelToken<'_>,
2399    ) -> Result<QueryResult, EngineError> {
2400        // v5.2.3: PK-targeted DELETE → first retire any cold-tier
2401        // locator for the key. The cold row body stays in the
2402        // segment (becoming shadowed garbage that a future
2403        // compaction pass reclaims) but the index no longer
2404        // resolves it. The shadow count contributes to the
2405        // affected total; the subsequent hot walk handles any hot
2406        // rows for the same key.
2407        let mut cold_shadow_count: usize = 0;
2408        if let Some(w) = &stmt.where_ {
2409            let schema_cols = self
2410                .active_catalog()
2411                .get(&stmt.table)
2412                .ok_or_else(|| {
2413                    EngineError::Storage(StorageError::TableNotFound {
2414                        name: stmt.table.clone(),
2415                    })
2416                })?
2417                .schema()
2418                .columns
2419                .clone();
2420            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
2421                && let Some(idx_name) = self
2422                    .active_catalog()
2423                    .get(&stmt.table)
2424                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
2425            {
2426                cold_shadow_count = self
2427                    .active_catalog_mut()
2428                    .shadow_cold_row(&stmt.table, &idx_name, &key)
2429                    .unwrap_or(0);
2430            }
2431        }
2432
2433        let table = self
2434            .active_catalog_mut()
2435            .get_mut(&stmt.table)
2436            .ok_or_else(|| {
2437                EngineError::Storage(StorageError::TableNotFound {
2438                    name: stmt.table.clone(),
2439                })
2440            })?;
2441        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
2442        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
2443        let mut positions: Vec<usize> = Vec::new();
2444        // v7.6.3 — collect every to-delete row's full Value tuple
2445        // alongside its position, so the FK enforcement pass can
2446        // run after the mut borrow drops.
2447        let mut to_delete_rows: Vec<Vec<Value>> = Vec::new();
2448        for (i, row) in table.rows().iter().enumerate() {
2449            if i.is_multiple_of(256) {
2450                cancel.check()?;
2451            }
2452            let keep = if let Some(w) = &stmt.where_ {
2453                let cond = eval::eval_expr(w, row, &ctx)?;
2454                !matches!(cond, Value::Bool(true))
2455            } else {
2456                false
2457            };
2458            if !keep {
2459                positions.push(i);
2460                to_delete_rows.push(row.values.clone());
2461            }
2462        }
2463        // v7.6.3 / v7.6.4 — Stage 2: FK enforcement on the immutable
2464        // catalog. Release the mut borrow and run reverse-scan
2465        // against every child table whose FK targets this table.
2466        // RESTRICT / NoAction raise an error; CASCADE returns a
2467        // cascade plan that stage 3 applies after the primary delete.
2468        // SET NULL / SET DEFAULT remain Unsupported until v7.6.5.
2469        let _ = table;
2470        let cascade_plan = plan_fk_parent_deletions(
2471            self.active_catalog(),
2472            &stmt.table,
2473            &positions,
2474            &to_delete_rows,
2475        )?;
2476        // Stage 3a — apply each FK child step (SET NULL / SET
2477        // DEFAULT / CASCADE delete) before deleting the parent.
2478        // The plan is already ordered: nulls/defaults first, then
2479        // cascade deletes (so a row mutated and later deleted
2480        // surfaces as deleted — though v7.6.5 doesn't produce
2481        // that overlap today).
2482        for step in &cascade_plan {
2483            apply_fk_child_step(self.active_catalog_mut(), step)?;
2484        }
2485        // Stage 3b — actually delete the original target rows.
2486        let table = self
2487            .active_catalog_mut()
2488            .get_mut(&stmt.table)
2489            .ok_or_else(|| {
2490                EngineError::Storage(StorageError::TableNotFound {
2491                    name: stmt.table.clone(),
2492                })
2493            })?;
2494        let affected = table.delete_rows(&positions) + cold_shadow_count;
2495        let _ = table;
2496        // v6.2.1 — auto-analyze modified-row tracking for DELETE.
2497        if !self.in_transaction() && affected > 0 {
2498            self.statistics
2499                .record_modifications(&stmt.table, affected as u64);
2500        }
2501        // v7.9.4 — RETURNING projection over the soon-to-be-gone
2502        // rows. `to_delete_rows` was snapshotted in stage 1 before
2503        // mutation, so the projection sees the pre-delete state
2504        // (matches PG semantics: DELETE RETURNING returns the row
2505        // as it was just before removal).
2506        if let Some(items) = &stmt.returning {
2507            return self.build_returning_rows(
2508                &stmt.table,
2509                items,
2510                to_delete_rows,
2511            );
2512        }
2513        Ok(QueryResult::CommandOk {
2514            affected,
2515            modified_catalog: !self.in_transaction(),
2516        })
2517    }
2518
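    // NOTE: the next three lines describe `exec_show_tables` (defined
    // after `exec_explain`), not the function that follows here.
    // `SHOW TABLES` — one row per table in the active catalog.
    // Column name is `name` so result-set consumers can downstream
    // `SELECT name FROM ...` style logic if needed.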
2522    /// v4.26: `EXPLAIN [ANALYZE] <select>`. Returns a single-column
2523    /// `QUERY PLAN` text table — first line names the top operator
2524    /// (Scan / Aggregate / Window / etc.), indented children list
2525    /// FROM joins, WHERE filters, ORDER BY / LIMIT, projection
2526    /// shape, and any active index hits. `ANALYZE` execs the inner
2527    /// SELECT and appends actual-row + elapsed-micros annotations.
2528    #[allow(clippy::format_push_string)]
2529    fn exec_explain(
2530        &self,
2531        e: &spg_sql::ast::ExplainStatement,
2532        cancel: CancelToken<'_>,
2533    ) -> Result<QueryResult, EngineError> {
2534        let mut lines = Vec::<String>::new();
2535        explain_select(&e.inner, self, 0, &mut lines);
2536        if e.suggest {
2537            // v6.8.3 — index advisor. Walks the SELECT's FROM
2538            // tables + WHERE column refs; for each (table, column)
2539            // pair that lacks an index, append a SUGGEST line with
2540            // a copy-pastable `CREATE INDEX` statement. This is a
2541            // pure-syntax heuristic — no cardinality estimation —
2542            // matching the v6.8.3 design intent of "tell the
2543            // operator where indexes are missing", not "give the
2544            // mathematically optimal index set".
2545            let suggestions = build_index_suggestions(&e.inner, self);
2546            for s in suggestions {
2547                lines.push(s);
2548            }
2549        } else if e.analyze {
2550            // v6.2.4 — EXPLAIN ANALYZE annotates each operator line
2551            // with `(rows=N)` where the row count is computable
2552            // without re-executing the full query:
2553            //   - Top-level operator (first non-indented line):
2554            //     rows = final result.len()
2555            //   - "From: <table> [full scan]" lines: rows =
2556            //     table.rows().len() (catalog read; no execution)
2557            //   - "From: <table> [index seek]": indeterminate —
2558            //     the index step would need re-execution; v6.2.5
2559            //     adds per-operator wall-clock + hot/cold rows
2560            //     instrumentation that makes this concrete.
2561            //   - Everything else: marked `(—)` so the surface
2562            //     stays well-defined without silently dropping
2563            //     stats. v6.2.5 fills in via inline executor
2564            //     instrumentation.
2565            // Total elapsed lands on a trailing `Total: …` line.
2566            let started = self.clock.map(|f| f());
2567            let exec = self.exec_select_cancel(&e.inner, cancel)?;
2568            let elapsed_micros = match (self.clock, started) {
2569                (Some(f), Some(s)) => Some(f().saturating_sub(s)),
2570                _ => None,
2571            };
2572            let row_count = if let QueryResult::Rows { rows, .. } = &exec {
2573                rows.len()
2574            } else {
2575                0
2576            };
2577            annotate_explain_lines(&mut lines, row_count, self);
2578            let mut total = alloc::format!("Total: rows={row_count}");
2579            if let Some(us) = elapsed_micros {
2580                total.push_str(&alloc::format!(" elapsed={us}us"));
2581            }
2582            lines.push(total);
2583        }
2584        let columns = alloc::vec![ColumnSchema::new("QUERY PLAN", DataType::Text, false)];
2585        let rows: Vec<Row> = lines
2586            .into_iter()
2587            .map(|l| Row::new(alloc::vec![Value::Text(l)]))
2588            .collect();
2589        Ok(QueryResult::Rows { columns, rows })
2590    }
2591
2592    fn exec_show_tables(&self) -> QueryResult {
2593        let columns = alloc::vec![ColumnSchema::new("name", DataType::Text, false)];
2594        let rows: Vec<Row> = self
2595            .active_catalog()
2596            .table_names()
2597            .into_iter()
2598            .map(|n| Row::new(alloc::vec![Value::Text(n)]))
2599            .collect();
2600        QueryResult::Rows { columns, rows }
2601    }
2602
2603    /// `SHOW COLUMNS FROM <table>` — one row per column with the
2604    /// declared name, SQL type rendering, and nullability flag.
2605    fn exec_show_columns(&self, table_name: &str) -> Result<QueryResult, EngineError> {
2606        let table =
2607            self.active_catalog()
2608                .get(table_name)
2609                .ok_or_else(|| StorageError::TableNotFound {
2610                    name: table_name.into(),
2611                })?;
2612        let columns = alloc::vec![
2613            ColumnSchema::new("name", DataType::Text, false),
2614            ColumnSchema::new("type", DataType::Text, false),
2615            ColumnSchema::new("nullable", DataType::Bool, false),
2616        ];
2617        let rows: Vec<Row> = table
2618            .schema()
2619            .columns
2620            .iter()
2621            .map(|c| {
2622                Row::new(alloc::vec![
2623                    Value::Text(c.name.clone()),
2624                    Value::Text(alloc::format!("{}", c.ty)),
2625                    Value::Bool(c.nullable),
2626                ])
2627            })
2628            .collect();
2629        Ok(QueryResult::Rows { columns, rows })
2630    }
2631
2632    fn exec_begin(&mut self) -> Result<QueryResult, EngineError> {
2633        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2634        if self.tx_catalogs.contains_key(&tx_id) {
2635            return Err(EngineError::TransactionAlreadyOpen);
2636        }
2637        self.tx_catalogs.insert(
2638            tx_id,
2639            TxState {
2640                catalog: self.catalog.clone(),
2641                savepoints: Vec::new(),
2642            },
2643        );
2644        Ok(QueryResult::CommandOk {
2645            affected: 0,
2646            modified_catalog: false,
2647        })
2648    }
2649
2650    fn exec_commit(&mut self) -> Result<QueryResult, EngineError> {
2651        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2652        let state = self
2653            .tx_catalogs
2654            .remove(&tx_id)
2655            .ok_or(EngineError::NoActiveTransaction)?;
2656        self.catalog = state.catalog;
2657        // All savepoints become permanent at COMMIT and the stack
2658        // resets for the next TX (`state.savepoints` is discarded with
2659        // `state`).
2660        Ok(QueryResult::CommandOk {
2661            affected: 0,
2662            modified_catalog: true,
2663        })
2664    }
2665
2666    fn exec_rollback(&mut self) -> Result<QueryResult, EngineError> {
2667        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2668        if self.tx_catalogs.remove(&tx_id).is_none() {
2669            return Err(EngineError::NoActiveTransaction);
2670        }
2671        // savepoints discarded with the TxState
2672        Ok(QueryResult::CommandOk {
2673            affected: 0,
2674            modified_catalog: false,
2675        })
2676    }
2677
2678    fn exec_savepoint(&mut self, name: String) -> Result<QueryResult, EngineError> {
2679        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2680        let state = self
2681            .tx_catalogs
2682            .get_mut(&tx_id)
2683            .ok_or(EngineError::NoActiveTransaction)?;
2684        // PG re-uses an existing savepoint name by dropping the older
2685        // entry and pushing a fresh one — match that behaviour so
2686        // application code can `SAVEPOINT sp; ...; SAVEPOINT sp` freely.
2687        state.savepoints.retain(|(n, _)| n != &name);
2688        let snapshot = state.catalog.clone();
2689        state.savepoints.push((name, snapshot));
2690        Ok(QueryResult::CommandOk {
2691            affected: 0,
2692            modified_catalog: false,
2693        })
2694    }
2695
2696    fn exec_rollback_to_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2697        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2698        let state = self
2699            .tx_catalogs
2700            .get_mut(&tx_id)
2701            .ok_or(EngineError::NoActiveTransaction)?;
2702        let pos = state
2703            .savepoints
2704            .iter()
2705            .rposition(|(n, _)| n == name)
2706            .ok_or_else(|| {
2707                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2708            })?;
2709        // The savepoint stays on the stack (PG semantics): a later
2710        // `RELEASE` or further `ROLLBACK TO` is still allowed. Everything
2711        // after it is discarded.
2712        let snapshot = state.savepoints[pos].1.clone();
2713        state.savepoints.truncate(pos + 1);
2714        state.catalog = snapshot;
2715        Ok(QueryResult::CommandOk {
2716            affected: 0,
2717            modified_catalog: false,
2718        })
2719    }
2720
2721    fn exec_release_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2722        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2723        let state = self
2724            .tx_catalogs
2725            .get_mut(&tx_id)
2726            .ok_or(EngineError::NoActiveTransaction)?;
2727        let pos = state
2728            .savepoints
2729            .iter()
2730            .rposition(|(n, _)| n == name)
2731            .ok_or_else(|| {
2732                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2733            })?;
2734        // RELEASE keeps the work since the savepoint, just discards the
2735        // bookmark plus everything nested under it.
2736        state.savepoints.truncate(pos);
2737        Ok(QueryResult::CommandOk {
2738            affected: 0,
2739            modified_catalog: false,
2740        })
2741    }
2742
    // NOTE: this paragraph describes `exec_alter_index` (defined
    // after `exec_alter_table`), not the function that follows here.
    // v6.0.4 — synchronous `ALTER INDEX <name> REBUILD [WITH
    // (encoding = …)]`. Walks every table in the active catalog
    // looking for an index matching `stmt.name`, then delegates the
    // rebuild (including any encoding switch) to
    // `Table::rebuild_nsw_index`. The "live" non-blocking
    // optimisation is v6.0.4.1 / v6.1.x territory.
2749    /// v6.7.2 — `ALTER TABLE t SET hot_tier_bytes = X`. Dispatch
2750    /// arm. Currently the only setting is `hot_tier_bytes`; later
2751    /// v6.7.x can extend `AlterTableTarget` without touching this
2752    /// arm structure.
2753    fn exec_alter_table(
2754        &mut self,
2755        s: spg_sql::ast::AlterTableStatement,
2756    ) -> Result<QueryResult, EngineError> {
2757        match s.target {
2758            spg_sql::ast::AlterTableTarget::SetHotTierBytes(n) => {
2759                let table = self
2760                    .active_catalog_mut()
2761                    .get_mut(&s.name)
2762                    .ok_or_else(|| {
2763                        EngineError::Storage(StorageError::TableNotFound {
2764                            name: s.name.clone(),
2765                        })
2766                    })?;
2767                table.schema_mut().hot_tier_bytes = Some(n);
2768            }
2769            spg_sql::ast::AlterTableTarget::AddForeignKey(fk) => {
2770                // v7.6.8 — resolve FK against the live catalog first
2771                // (validates parent table, columns, indices). Then
2772                // verify every existing row in the child table
2773                // satisfies the new constraint. Then install it.
2774                let cols_snapshot = self
2775                    .active_catalog()
2776                    .get(&s.name)
2777                    .ok_or_else(|| {
2778                        EngineError::Storage(StorageError::TableNotFound {
2779                            name: s.name.clone(),
2780                        })
2781                    })?
2782                    .schema()
2783                    .columns
2784                    .clone();
2785                let storage_fk = resolve_foreign_key(
2786                    &s.name,
2787                    &cols_snapshot,
2788                    fk,
2789                    self.active_catalog(),
2790                )?;
2791                // Verify existing rows. Treat them as a virtual
2792                // INSERT batch — reusing the v7.6.2 enforce helper.
2793                let existing_rows: Vec<Vec<Value>> = self
2794                    .active_catalog()
2795                    .get(&s.name)
2796                    .expect("checked above")
2797                    .rows()
2798                    .iter()
2799                    .map(|r| r.values.clone())
2800                    .collect();
2801                enforce_fk_inserts(
2802                    self.active_catalog(),
2803                    &s.name,
2804                    core::slice::from_ref(&storage_fk),
2805                    &existing_rows,
2806                )?;
2807                // Reject duplicate constraint name.
2808                let table = self
2809                    .active_catalog_mut()
2810                    .get_mut(&s.name)
2811                    .expect("checked above");
2812                if let Some(name) = &storage_fk.name
2813                    && table
2814                        .schema()
2815                        .foreign_keys
2816                        .iter()
2817                        .any(|f| f.name.as_ref() == Some(name))
2818                {
2819                    return Err(EngineError::Unsupported(alloc::format!(
2820                        "ALTER TABLE ADD CONSTRAINT: a constraint named {name:?} already exists"
2821                    )));
2822                }
2823                table.schema_mut().foreign_keys.push(storage_fk);
2824            }
2825            spg_sql::ast::AlterTableTarget::DropForeignKey(name) => {
2826                let table = self
2827                    .active_catalog_mut()
2828                    .get_mut(&s.name)
2829                    .ok_or_else(|| {
2830                        EngineError::Storage(StorageError::TableNotFound {
2831                            name: s.name.clone(),
2832                        })
2833                    })?;
2834                let fks = &mut table.schema_mut().foreign_keys;
2835                let before = fks.len();
2836                fks.retain(|f| f.name.as_ref() != Some(&name));
2837                if fks.len() == before {
2838                    return Err(EngineError::Unsupported(alloc::format!(
2839                        "ALTER TABLE DROP CONSTRAINT: no FK named {name:?} on {:?}",
2840                        s.name
2841                    )));
2842                }
2843            }
2844        }
2845        Ok(QueryResult::CommandOk {
2846            affected: 0,
2847            modified_catalog: !self.in_transaction(),
2848        })
2849    }
2850
2851    fn exec_alter_index(
2852        &mut self,
2853        stmt: spg_sql::ast::AlterIndexStatement,
2854    ) -> Result<QueryResult, EngineError> {
2855        // Translate the optional SQL-side encoding choice into the
2856        // storage-side enum; the same SqlVecEncoding -> VecEncoding
2857        // bridge `column_type_to_data_type` uses.
2858        let spg_sql::ast::AlterIndexStatement {
2859            name: idx_name,
2860            target,
2861        } = stmt;
2862        let spg_sql::ast::AlterIndexTarget::Rebuild { encoding } = target;
2863        let target = encoding.map(|e| match e {
2864            SqlVecEncoding::F32 => VecEncoding::F32,
2865            SqlVecEncoding::Sq8 => VecEncoding::Sq8,
2866            SqlVecEncoding::F16 => VecEncoding::F16,
2867        });
2868        // Linear scan: index names are globally unique within a
2869        // catalog (enforced by add_nsw_index_inner) so the first
2870        // match is the only one. Save the table name to avoid
2871        // borrowing while we then take a mut borrow.
2872        let table_name = {
2873            let cat = self.active_catalog();
2874            let mut found: Option<String> = None;
2875            for tname in cat.table_names() {
2876                if let Some(t) = cat.get(&tname)
2877                    && t.indices().iter().any(|i| i.name == idx_name)
2878                {
2879                    found = Some(tname);
2880                    break;
2881                }
2882            }
2883            found.ok_or_else(|| {
2884                EngineError::Storage(StorageError::IndexNotFound {
2885                    name: idx_name.clone(),
2886                })
2887            })?
2888        };
2889        let table = self
2890            .active_catalog_mut()
2891            .get_mut(&table_name)
2892            .expect("table found above");
2893        table.rebuild_nsw_index(&idx_name, target)?;
2894        // v6.3.1 — ALTER INDEX REBUILD potentially with new encoding
2895        // changes cost characteristics; evict any cached plans.
2896        self.plan_cache.evict_referencing(&table_name);
2897        Ok(QueryResult::CommandOk {
2898            affected: 0,
2899            modified_catalog: !self.in_transaction(),
2900        })
2901    }
2902
2903    fn exec_create_index(
2904        &mut self,
2905        stmt: CreateIndexStatement,
2906    ) -> Result<QueryResult, EngineError> {
2907        let table = self
2908            .active_catalog_mut()
2909            .get_mut(&stmt.table)
2910            .ok_or_else(|| {
2911                EngineError::Storage(StorageError::TableNotFound {
2912                    name: stmt.table.clone(),
2913                })
2914            })?;
2915        // `IF NOT EXISTS` reduces DuplicateIndex to a no-op CommandOk.
2916        if stmt.if_not_exists && table.indices().iter().any(|i| i.name == stmt.name) {
2917            return Ok(QueryResult::CommandOk {
2918                affected: 0,
2919                modified_catalog: false,
2920            });
2921        }
2922        // v7.9.14 — multi-column index parses through; engine
2923        // builds a single-column BTree on the leading column only.
2924        // The extras live on the AST so spg-server's dispatcher
2925        // can emit a PG-wire NoticeResponse / log line. Composite
2926        // BTree keys land in v7.10.
2927        let _ = &stmt.extra_columns; // intentional drop on engine side
2928        let table_name = stmt.table.clone();
2929        // v6.8.0 — resolve INCLUDE column names to positions. Done
2930        // before `add_index` so a typo error surfaces before any
2931        // catalog mutation lands.
2932        let included_positions: Vec<usize> = if stmt.included_columns.is_empty() {
2933            Vec::new()
2934        } else {
2935            let schema = table.schema();
2936            stmt.included_columns
2937                .iter()
2938                .map(|c| {
2939                    schema.column_position(c).ok_or_else(|| {
2940                        EngineError::Storage(StorageError::ColumnNotFound {
2941                            column: c.clone(),
2942                        })
2943                    })
2944                })
2945                .collect::<Result<Vec<_>, _>>()?
2946        };
2947        match stmt.method {
2948            IndexMethod::BTree => table.add_index(stmt.name.clone(), &stmt.column)?,
2949            IndexMethod::Hnsw => {
2950                if !included_positions.is_empty() {
2951                    return Err(EngineError::Unsupported(
2952                        "INCLUDE columns are not supported on HNSW indexes".into(),
2953                    ));
2954                }
2955                table.add_nsw_index(stmt.name.clone(), &stmt.column, spg_storage::NSW_DEFAULT_M)?;
2956            }
2957            // v6.7.1 — BRIN. Pure metadata; no in-memory data.
2958            IndexMethod::Brin => {
2959                if !included_positions.is_empty() {
2960                    return Err(EngineError::Unsupported(
2961                        "INCLUDE columns are not supported on BRIN indexes".into(),
2962                    ));
2963                }
2964                table.add_brin_index(stmt.name.clone(), &stmt.column)?;
2965            }
2966        }
2967        if !included_positions.is_empty()
2968            && let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name)
2969        {
2970            idx.included_columns = included_positions;
2971        }
2972        // v6.8.1 — persist partial-index predicate. Stored as the
2973        // expression's Display form so the catalog snapshot stays
2974        // pure (storage has no spg-sql dependency). The runtime
2975        // maintenance path treats partial indexes identically to
2976        // full indexes for v6.8.1 (over-maintenance is safe; the
2977        // planner-side "use partial when query WHERE implies the
2978        // predicate" pass is STABILITY carve-out).
2979        if let Some(pred_expr) = &stmt.partial_predicate {
2980            let canonical = pred_expr.to_string();
2981            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2982                return Err(EngineError::Unsupported(
2983                    "WHERE predicates are not supported on HNSW or BRIN indexes".into(),
2984                ));
2985            }
2986            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
2987                idx.partial_predicate = Some(canonical);
2988            }
2989        }
2990        // v6.8.2 — persist expression index key. Same Display-form
2991        // storage; the runtime maintenance pass evaluates each
2992        // row's expression to derive the index key, but for v6.8.2
2993        // the engine falls through to the bare-column-reference
2994        // path and the expression is preserved for format-layer
2995        // round-trip + future planner work. Carved-out in
2996        // STABILITY § "Out of v6.8".
2997        if let Some(key_expr) = &stmt.expression {
2998            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2999                return Err(EngineError::Unsupported(
3000                    "Expression keys are not supported on HNSW or BRIN indexes".into(),
3001                ));
3002            }
3003            let canonical = key_expr.to_string();
3004            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3005                idx.expression = Some(canonical);
3006            }
3007        }
3008        // v7.9.29 — persist `is_unique` flag on the storage Index.
3009        // Combined with `partial_predicate`, INSERT enforcement
3010        // checks that no other row whose predicate evaluates true
3011        // shares the same indexed key. Parser already rejected
3012        // `UNIQUE` on HNSW / BRIN, so plain BTree here.
3013        // For multi-column UNIQUE INDEX the extras matter (the
3014        // full tuple is the uniqueness key), so resolve them to
3015        // column positions and persist on the index too.
3016        if stmt.is_unique {
3017            let mut extra_positions: alloc::vec::Vec<usize> = alloc::vec::Vec::new();
3018            for col_name in &stmt.extra_columns {
3019                let pos = table
3020                    .schema()
3021                    .columns
3022                    .iter()
3023                    .position(|c| c.name.eq_ignore_ascii_case(col_name))
3024                    .ok_or_else(|| {
3025                        EngineError::Unsupported(alloc::format!(
3026                            "UNIQUE INDEX {:?}: extra column {col_name:?} not in table {:?}",
3027                            stmt.name, stmt.table
3028                        ))
3029                    })?;
3030                extra_positions.push(pos);
3031            }
3032            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3033                idx.is_unique = true;
3034                idx.extra_column_positions = extra_positions;
3035            }
3036            // At index-creation time, check the existing rows for
3037            // pre-existing duplicates that would have violated the
3038            // new constraint — otherwise CREATE UNIQUE INDEX would
3039            // silently leave duplicates in place.
3040            let snapshot_indices = table.indices().to_vec();
3041            let snapshot_rows: alloc::vec::Vec<spg_storage::Row> =
3042                table.rows().iter().cloned().collect();
3043            let snapshot_schema = table.schema().clone();
3044            let idx_ref = snapshot_indices
3045                .iter()
3046                .find(|i| i.name == stmt.name)
3047                .expect("just-added index");
3048            check_existing_unique_violation(idx_ref, &snapshot_schema, &snapshot_rows)?;
3049        }
3050        // v6.3.1 — adding an index can change the optimal plan for
3051        // any cached query that references this table.
3052        self.plan_cache.evict_referencing(&table_name);
3053        Ok(QueryResult::CommandOk {
3054            affected: 0,
3055            modified_catalog: !self.in_transaction(),
3056        })
3057    }
3058
3059    fn exec_create_table(
3060        &mut self,
3061        stmt: CreateTableStatement,
3062    ) -> Result<QueryResult, EngineError> {
3063        if stmt.if_not_exists && self.active_catalog().get(&stmt.name).is_some() {
3064            return Ok(QueryResult::CommandOk {
3065                affected: 0,
3066                modified_catalog: false,
3067            });
3068        }
3069        let table_name = stmt.name.clone();
3070        // v7.9.13 — pluck the names of any columns marked
3071        // `PRIMARY KEY` inline so the post-create-table pass can
3072        // build an implicit BTree index. mailrs F1.
3073        let inline_pk_columns: Vec<String> = stmt
3074            .columns
3075            .iter()
3076            .filter(|c| c.is_primary_key)
3077            .map(|c| c.name.clone())
3078            .collect();
3079        // v7.9.19 — table-level constraints: PRIMARY KEY (a, b, ...)
3080        // and UNIQUE (a, b, ...). Each builds a BTree index on the
3081        // leading column (the existing single-column storage tier)
3082        // and registers a UniquenessConstraint on the schema for
3083        // INSERT-time enforcement of the full tuple. mailrs G1/G6.
3084        let cols = stmt
3085            .columns
3086            .into_iter()
3087            .map(column_def_to_schema)
3088            .collect::<Result<Vec<_>, _>>()?;
3089        // Composite NOT-NULL implication for PRIMARY KEY columns.
3090        let mut cols = cols;
3091        for tc in &stmt.table_constraints {
3092            if let spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } = tc {
3093                for col_name in columns {
3094                    if let Some(col) = cols.iter_mut().find(|c| c.name == *col_name) {
3095                        col.nullable = false;
3096                    }
3097                }
3098            }
3099        }
3100        // v7.6.1 — resolve every FK in the statement against the
3101        // already-known catalog. Validates: parent table exists,
3102        // parent column names exist, arity matches, parent columns
3103        // have a PK / UNIQUE index. Self-referencing FKs (parent
3104        // table == this table) resolve against the column list we
3105        // just built — they don't need the catalog yet.
3106        let mut fks: Vec<spg_storage::ForeignKeyConstraint> =
3107            Vec::with_capacity(stmt.foreign_keys.len());
3108        for fk in stmt.foreign_keys {
3109            fks.push(resolve_foreign_key(
3110                &table_name,
3111                &cols,
3112                fk,
3113                self.active_catalog(),
3114            )?);
3115        }
3116        let mut schema = TableSchema::new(table_name.clone(), cols);
3117        schema.foreign_keys = fks;
3118        // v7.9.19 — translate AST table_constraints to storage
3119        // UniquenessConstraints (column name → position) so the
3120        // INSERT enforcement helper sees positions directly.
3121        let mut uc_storage: Vec<spg_storage::UniquenessConstraint> = Vec::new();
3122        for tc in &stmt.table_constraints {
3123            let (is_pk, names) = match tc {
3124                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3125                    (true, columns.clone())
3126                }
3127                spg_sql::ast::TableConstraint::Unique { columns, .. } => {
3128                    (false, columns.clone())
3129                }
3130            };
3131            let mut positions = Vec::with_capacity(names.len());
3132            for n in &names {
3133                let pos = schema
3134                    .columns
3135                    .iter()
3136                    .position(|c| c.name == *n)
3137                    .ok_or_else(|| {
3138                        EngineError::Unsupported(alloc::format!(
3139                            "table constraint references unknown column {n:?}"
3140                        ))
3141                    })?;
3142                positions.push(pos);
3143            }
3144            uc_storage.push(spg_storage::UniquenessConstraint {
3145                is_primary_key: is_pk,
3146                columns: positions,
3147            });
3148        }
3149        schema.uniqueness_constraints = uc_storage.clone();
3150        self.active_catalog_mut().create_table(schema)?;
3151        // v7.9.13 — implicit BTree per inline PK column +
3152        // v7.9.19 — implicit BTree on the leading column of every
3153        // table-level PRIMARY KEY / UNIQUE constraint.
3154        let table = self
3155            .active_catalog_mut()
3156            .get_mut(&table_name)
3157            .expect("just created");
3158        for (i, col_name) in inline_pk_columns.iter().enumerate() {
3159            let idx_name = if inline_pk_columns.len() == 1 {
3160                alloc::format!("{table_name}_pkey")
3161            } else {
3162                alloc::format!("{table_name}_pkey_{i}")
3163            };
3164            if let Err(e) = table.add_index(idx_name, col_name) {
3165                return Err(EngineError::Storage(e));
3166            }
3167        }
3168        for (i, tc) in stmt.table_constraints.iter().enumerate() {
3169            let (is_pk, names) = match tc {
3170                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3171                    (true, columns)
3172                }
3173                spg_sql::ast::TableConstraint::Unique { columns, .. } => {
3174                    (false, columns)
3175                }
3176            };
3177            let leading = &names[0];
3178            // Skip if a same-column BTree already exists (e.g.
3179            // inline PK on the leading column).
3180            let already = table
3181                .indices()
3182                .iter()
3183                .any(|idx| {
3184                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
3185                        && table.schema().columns[idx.column_position].name == *leading
3186                });
3187            if already {
3188                continue;
3189            }
3190            let suffix = if is_pk { "pkey" } else { "key" };
3191            let idx_name = if names.len() == 1 {
3192                alloc::format!("{table_name}_{leading}_{suffix}")
3193            } else {
3194                alloc::format!("{table_name}_{leading}_{suffix}_{i}")
3195            };
3196            if let Err(e) = table.add_index(idx_name, leading) {
3197                return Err(EngineError::Storage(e));
3198            }
3199        }
3200        Ok(QueryResult::CommandOk {
3201            affected: 0,
3202            modified_catalog: !self.in_transaction(),
3203        })
3204    }
3205
3206    fn exec_insert(&mut self, stmt: InsertStatement) -> Result<QueryResult, EngineError> {
3207        // v7.9.21 — snapshot the clock fn pointer before the mut
3208        // borrow on the catalog opens; runtime DEFAULT eval needs
3209        // it inside the row hot loop.
3210        let clock = self.clock;
3211        let table = self
3212            .active_catalog_mut()
3213            .get_mut(&stmt.table)
3214            .ok_or_else(|| {
3215                EngineError::Storage(StorageError::TableNotFound {
3216                    name: stmt.table.clone(),
3217                })
3218            })?;
3219        // v3.1.5: clone the columns vector only (not the whole
3220        // TableSchema — saves one String alloc for the table name).
3221        // We need an owned snapshot because we'll call `table.insert`
3222        // (mutable borrow on `table`) inside the row loop while
3223        // reading schema fields.
3224        let column_meta: Vec<ColumnSchema> = table.schema().columns.clone();
3225        let schema_cols_len = column_meta.len();
3226        // Build a permutation `tuple_pos[c] = Some(j)` meaning schema
3227        // column `c` is filled from the `j`-th tuple slot; `None` means
3228        // "fill with NULL". Validated once and reused for every row.
3229        let tuple_pos: Option<Vec<Option<usize>>> = match &stmt.columns {
3230            None => None, // 1-1 mapping, fast path
3231            Some(cols) => {
3232                let mut map = alloc::vec![None; schema_cols_len];
3233                for (j, name) in cols.iter().enumerate() {
3234                    let idx = column_meta
3235                        .iter()
3236                        .position(|c| c.name == *name)
3237                        .ok_or_else(|| {
3238                            EngineError::Eval(EvalError::ColumnNotFound { name: name.clone() })
3239                        })?;
3240                    if map[idx].is_some() {
3241                        return Err(EngineError::Storage(StorageError::ArityMismatch {
3242                            expected: schema_cols_len,
3243                            actual: cols.len(),
3244                        }));
3245                    }
3246                    map[idx] = Some(j);
3247                }
3248                // Omitted columns must either be nullable, carry a
3249                // DEFAULT, or be AUTO_INCREMENT. Catch NOT NULL
3250                // omissions up front so the WAL stays clean.
3251                for (i, col) in column_meta.iter().enumerate() {
3252                    if map[i].is_none()
3253                        && !col.nullable
3254                        && col.default.is_none()
3255                        && col.runtime_default.is_none()
3256                        && !col.auto_increment
3257                    {
3258                        return Err(EngineError::Storage(StorageError::NullInNotNull {
3259                            column: col.name.clone(),
3260                        }));
3261                    }
3262                }
3263                Some(map)
3264            }
3265        };
3266        let expected_tuple_len = stmt.columns.as_ref().map_or(schema_cols_len, Vec::len);
3267        // v7.6.2 — snapshot this table's FK list before the
3268        // mutable-borrow window so we can run parent lookups
3269        // against the immutable catalog after parsing. Empty vec is
3270        // the no-FK fast path; clone cost is O(fks * arity) which
3271        // is < 100 ns for typical schemas.
3272        let fks = table.schema().foreign_keys.clone();
3273        let mut affected = 0usize;
3274        // Stage 1 — parse + AUTO_INC + coerce all rows under the
3275        // single mutable borrow.
3276        let mut all_values: Vec<Vec<Value>> = Vec::with_capacity(stmt.rows.len());
3277        for tuple in stmt.rows {
3278            if tuple.len() != expected_tuple_len {
3279                return Err(EngineError::Storage(StorageError::ArityMismatch {
3280                    expected: expected_tuple_len,
3281                    actual: tuple.len(),
3282                }));
3283            }
3284            // Fast path: no column-list permutation → tuple slot j
3285            // maps to schema column j. We can zip schema with tuple
3286            // and skip the `raw_tuple` staging allocation entirely.
3287            let values: Vec<Value> = if let Some(map) = &tuple_pos {
3288                // Permuted path: still need raw_tuple to index by `map[i]`.
3289                let raw_tuple: Vec<Value> = tuple
3290                    .into_iter()
3291                    .map(literal_expr_to_value)
3292                    .collect::<Result<_, _>>()?;
3293                let mut out = Vec::with_capacity(schema_cols_len);
3294                for (i, col) in column_meta.iter().enumerate() {
3295                    let mut raw = match map[i] {
3296                        Some(j) => raw_tuple[j].clone(),
3297                        None => resolve_column_default_free(col, clock)?,
3298                    };
3299                    if col.auto_increment && raw.is_null() {
3300                        let next = table.next_auto_value(i).ok_or_else(|| {
3301                            EngineError::Unsupported(alloc::format!(
3302                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3303                                col.name
3304                            ))
3305                        })?;
3306                        raw = Value::BigInt(next);
3307                    }
3308                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3309                }
3310                out
3311            } else {
3312                // 1-1 mapping fast path: single Vec alloc, no raw_tuple.
3313                let mut out = Vec::with_capacity(schema_cols_len);
3314                for (i, (col, expr)) in column_meta.iter().zip(tuple).enumerate() {
3315                    let mut raw = literal_expr_to_value(expr)?;
3316                    if col.auto_increment && raw.is_null() {
3317                        let next = table.next_auto_value(i).ok_or_else(|| {
3318                            EngineError::Unsupported(alloc::format!(
3319                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3320                                col.name
3321                            ))
3322                        })?;
3323                        raw = Value::BigInt(next);
3324                    }
3325                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3326                }
3327                out
3328            };
3329            all_values.push(values);
3330        }
3331        // Stage 2 — FK enforcement on the immutable catalog.
3332        // Non-lexical lifetimes release the mutable borrow on
3333        // `table` here since stage 1 was the last use. The
3334        // parent-table lookup runs before any row is committed.
3335        let uniqueness = table.schema().uniqueness_constraints.clone();
3336        let _ = table;
3337        if !fks.is_empty() {
3338            enforce_fk_inserts(self.active_catalog(), &stmt.table, &fks, &all_values)?;
3339        }
3340        // v7.9.19 — composite UNIQUE / PRIMARY KEY enforcement.
3341        enforce_uniqueness_inserts(
3342            self.active_catalog(),
3343            &stmt.table,
3344            &uniqueness,
3345            &all_values,
3346        )?;
3347        // v7.9.29 — CREATE UNIQUE INDEX [WHERE pred] enforcement.
3348        // Independent of table-level UniquenessConstraint (which
3349        // can't carry a predicate). Walks the table's indexes;
3350        // for each `is_unique` index, only rows whose
3351        // partial_predicate evaluates truthy are checked for
3352        // collision. mailrs K1.
3353        enforce_unique_index_inserts(
3354            self.active_catalog(),
3355            &stmt.table,
3356            &all_values,
3357        )?;
3358        // v7.9.8 / v7.9.9 — ON CONFLICT handling.
3359        //   - `DO NOTHING` filters `all_values` to non-conflicting
3360        //     rows + drops within-batch duplicates.
3361        //   - `DO UPDATE SET …` ALSO filters, but for each
3362        //     conflicting row it queues an UPDATE on the existing
3363        //     row using the incoming row's values as `EXCLUDED.*`.
3364        let mut pending_updates: Vec<(usize, Vec<Value>)> = Vec::new();
3365        let mut skipped_count = 0usize;
3366        if let Some(clause) = &stmt.on_conflict {
3367            let conflict_cols = resolve_on_conflict_columns(
3368                self.active_catalog(),
3369                &stmt.table,
3370                clause.target_columns.as_slice(),
3371            )?;
3372            let mut kept: Vec<Vec<Value>> = Vec::with_capacity(all_values.len());
3373            let mut seen_keys: Vec<Vec<Value>> = Vec::new();
3374            for values in all_values {
3375                let key_tuple: Vec<&Value> =
3376                    conflict_cols.iter().map(|&c| &values[c]).collect();
3377                // SQL spec: NULL in any conflict column means "no
3378                // conflict possible" (NULL ≠ NULL for uniqueness).
3379                let has_null_key = key_tuple.iter().any(|v| matches!(v, Value::Null));
3380                let collides_with_table = !has_null_key
3381                    && on_conflict_keys_exist(
3382                        self.active_catalog(),
3383                        &stmt.table,
3384                        &conflict_cols,
3385                        &key_tuple,
3386                    );
3387                let key_tuple_owned: Vec<Value> =
3388                    key_tuple.iter().map(|v| (*v).clone()).collect();
3389                let collides_with_batch = !has_null_key
3390                    && seen_keys.iter().any(|k| k == &key_tuple_owned);
3391                let collides = collides_with_table || collides_with_batch;
3392                match (&clause.action, collides) {
3393                    (_, false) => {
3394                        seen_keys.push(key_tuple_owned);
3395                        kept.push(values);
3396                    }
3397                    (spg_sql::ast::OnConflictAction::Nothing, true) => {
3398                        skipped_count += 1;
3399                    }
3400                    (
3401                        spg_sql::ast::OnConflictAction::Update {
3402                            assignments,
3403                            where_,
3404                        },
3405                        true,
3406                    ) => {
3407                        if !collides_with_table {
3408                            skipped_count += 1;
3409                            continue;
3410                        }
3411                        let target_pos = lookup_row_position_by_keys(
3412                            self.active_catalog(),
3413                            &stmt.table,
3414                            &conflict_cols,
3415                            &key_tuple,
3416                        )
3417                        .ok_or_else(|| {
3418                            EngineError::Unsupported(
3419                                "ON CONFLICT DO UPDATE: conflict detected but row \
3420                                 position could not be resolved (cold-tier row?)"
3421                                    .into(),
3422                            )
3423                        })?;
3424                        let updated = apply_on_conflict_assignments(
3425                            self.active_catalog(),
3426                            &stmt.table,
3427                            target_pos,
3428                            &values,
3429                            assignments,
3430                            where_.as_ref(),
3431                        )?;
3432                        if let Some(new_row) = updated {
3433                            pending_updates.push((target_pos, new_row));
3434                        } else {
3435                            skipped_count += 1;
3436                        }
3437                    }
3438                }
3439            }
3440            all_values = kept;
3441        }
3442        // Stage 3 — insert all rows under a fresh mutable borrow.
3443        let table = self
3444            .active_catalog_mut()
3445            .get_mut(&stmt.table)
3446            .ok_or_else(|| {
3447                EngineError::Storage(StorageError::TableNotFound {
3448                    name: stmt.table.clone(),
3449                })
3450            })?;
3451        // v7.9.4 — keep RETURNING projection rows separate per
3452        // INSERT and per UPDATE branch so DO UPDATE pushes the new
3453        // post-update state, not the incoming-only values.
3454        let mut returning_rows: Vec<Vec<Value>> = Vec::new();
3455        for values in all_values {
3456            if stmt.returning.is_some() {
3457                returning_rows.push(values.clone());
3458            }
3459            table.insert(Row::new(values))?;
3460            affected += 1;
3461        }
3462        // v7.9.9 — apply ON CONFLICT DO UPDATE rewrites collected
3463        // in the conflict-resolution pass. update_row handles
3464        // index maintenance + body re-encoding.
3465        for (pos, new_row) in pending_updates {
3466            if stmt.returning.is_some() {
3467                returning_rows.push(new_row.clone());
3468            }
3469            table.update_row(pos, new_row)?;
3470            affected += 1;
3471        }
3472        let _ = skipped_count;
3473        // v7.9.4/v7.9.9 — RETURNING streams the rows that ended
3474        // up in the table after this statement (insert or
3475        // post-update on conflict).
3476        if let Some(items) = &stmt.returning {
3477            let _ = table;
3478            return self.build_returning_rows(
3479                &stmt.table,
3480                items,
3481                returning_rows,
3482            );
3483        }
3484        // v6.2.1 — auto-analyze: track per-table modified-row
3485        // counter so the background sweep can decide when to
3486        // re-ANALYZE. Cheap path on the autocommit-wrap hot loop
3487        // — one BTreeMap entry update per INSERT batch.
3488        if !self.in_transaction() && affected > 0 {
3489            self.statistics
3490                .record_modifications(&stmt.table, affected as u64);
3491        }
3492        Ok(QueryResult::CommandOk {
3493            affected,
3494            modified_catalog: !self.in_transaction(),
3495        })
3496    }
3497
3498    /// v4.5: SELECT with cooperative cancellation. The token is
3499    /// honoured between UNION peers and inside the bare-SELECT row
3500    /// loop; HNSW kNN graph walks and the aggregate executor don't
3501    /// honour it yet (deferred — those paths bound their work
3502    /// internally by `LIMIT k` and `GROUP BY` cardinality).
3503    /// v6.10.2 — cold-tier time-travel scan. Resolves the segment
3504    /// by id, decodes each row body against the table's current
3505    /// schema, applies the SELECT's projection + optional WHERE +
3506    /// optional LIMIT, returns a `Rows` result. JOINs / aggregates
3507    /// / ORDER BY are unsupported on this path (STABILITY carve-
3508    /// out); operators wanting them should restore the segment
3509    /// into a regular table first.
3510    fn exec_select_as_of_segment(
3511        &self,
3512        stmt: &SelectStatement,
3513        from: &spg_sql::ast::FromClause,
3514        segment_id: u32,
3515    ) -> Result<QueryResult, EngineError> {
3516        // v6.10.2 scope: no joins, no aggregates, no ORDER BY,
3517        // no GROUP BY / HAVING / UNION / OFFSET / DISTINCT.
3518        if !from.joins.is_empty()
3519            || stmt.group_by.is_some()
3520            || stmt.having.is_some()
3521            || !stmt.unions.is_empty()
3522            || !stmt.order_by.is_empty()
3523            || stmt.offset.is_some()
3524            || stmt.distinct
3525            || aggregate::uses_aggregate(stmt)
3526        {
3527            return Err(EngineError::Unsupported(
3528                "AS OF SEGMENT supports SELECT projection + WHERE + LIMIT only \
3529                 (joins / aggregates / ORDER BY are STABILITY § \"Out of v6.10\")"
3530                    .into(),
3531            ));
3532        }
3533        let table = self
3534            .active_catalog()
3535            .get(&from.primary.name)
3536            .ok_or_else(|| StorageError::TableNotFound {
3537                name: from.primary.name.clone(),
3538            })?;
3539        let schema = table.schema().clone();
3540        let schema_cols = &schema.columns;
3541        let alias = from
3542            .primary
3543            .alias
3544            .as_deref()
3545            .unwrap_or(from.primary.name.as_str());
3546        let ctx = EvalContext::new(schema_cols, Some(alias));
3547        let seg = self
3548            .active_catalog()
3549            .cold_segment(segment_id)
3550            .ok_or_else(|| {
3551                EngineError::Unsupported(alloc::format!(
3552                    "AS OF SEGMENT: cold segment {segment_id} not registered"
3553                ))
3554            })?;
3555        let mut out_rows: Vec<Row> = Vec::new();
3556        let mut limit_remaining: Option<usize> =
3557            stmt.limit_literal().and_then(|n| usize::try_from(n).ok());
3558        for (_key, body) in seg.scan() {
3559            let (row, _consumed) = spg_storage::decode_row_body_dense(&body, &schema)
3560                .map_err(EngineError::Storage)?;
3561            if let Some(where_expr) = &stmt.where_ {
3562                let cond = self.eval_expr_simple(where_expr, &row, &ctx)?;
3563                if !matches!(cond, Value::Bool(true)) {
3564                    continue;
3565                }
3566            }
3567            // Projection.
3568            let projected = self.project_row_simple(&row, &stmt.items, schema_cols, alias)?;
3569            out_rows.push(projected);
3570            if let Some(rem) = limit_remaining.as_mut() {
3571                if *rem == 0 {
3572                    out_rows.pop();
3573                    break;
3574                }
3575                *rem -= 1;
3576            }
3577        }
3578        // Output column schema: derive from SELECT items.
3579        let columns = self.derive_output_columns(&stmt.items, schema_cols, alias);
3580        Ok(QueryResult::Rows {
3581            columns,
3582            rows: out_rows,
3583        })
3584    }
3585
3586    /// v6.10.2 — simple-path WHERE eval that doesn't go through
3587    /// the correlated-subquery / Memoize machinery. AS OF SEGMENT
3588    /// scan paths predicate against a snapshot frozen segment, no
3589    /// cross-row state.
3590    fn eval_expr_simple(
3591        &self,
3592        expr: &Expr,
3593        row: &Row,
3594        ctx: &EvalContext,
3595    ) -> Result<Value, EngineError> {
3596        let cancel = CancelToken::none();
3597        self.eval_expr_with_correlated(expr, row, ctx, cancel, None)
3598    }
3599
3600    /// v7.9.4 — INSERT / UPDATE / DELETE RETURNING projector.
3601    /// Given the table name, the user-supplied projection items,
3602    /// and the mutated rows (post-insert / post-update values, or
3603    /// pre-delete snapshot), build a `QueryResult::Rows` whose
3604    /// schema describes the projected columns. Mailrs migration
3605    /// blocker #1.
3606    fn build_returning_rows(
3607        &self,
3608        table_name: &str,
3609        items: &[SelectItem],
3610        mutated_rows: Vec<Vec<Value>>,
3611    ) -> Result<QueryResult, EngineError> {
3612        let table = self.active_catalog().get(table_name).ok_or_else(|| {
3613            EngineError::Storage(StorageError::TableNotFound {
3614                name: table_name.into(),
3615            })
3616        })?;
3617        let schema_cols = table.schema().columns.clone();
3618        let columns = self.derive_output_columns(items, &schema_cols, table_name);
3619        let mut out_rows: Vec<Row> = Vec::with_capacity(mutated_rows.len());
3620        for values in mutated_rows {
3621            let row = Row::new(values);
3622            let projected = self.project_row_simple(&row, items, &schema_cols, table_name)?;
3623            out_rows.push(projected);
3624        }
3625        Ok(QueryResult::Rows {
3626            columns,
3627            rows: out_rows,
3628        })
3629    }
3630
3631    /// v6.10.2 — projection for AS OF SEGMENT. Resolves
3632    /// `SelectItem::Wildcard` to all schema columns and
3633    /// `SelectItem::Expr` via the regular eval path.
3634    fn project_row_simple(
3635        &self,
3636        row: &Row,
3637        items: &[SelectItem],
3638        schema_cols: &[ColumnSchema],
3639        alias: &str,
3640    ) -> Result<Row, EngineError> {
3641        let ctx = EvalContext::new(schema_cols, Some(alias));
3642        let cancel = CancelToken::none();
3643        let mut out_vals = Vec::new();
3644        for item in items {
3645            match item {
3646                SelectItem::Wildcard => {
3647                    out_vals.extend(row.values.iter().cloned());
3648                }
3649                SelectItem::Expr { expr, .. } => {
3650                    let v = self.eval_expr_with_correlated(expr, row, &ctx, cancel, None)?;
3651                    out_vals.push(v);
3652                }
3653            }
3654        }
3655        Ok(Row::new(out_vals))
3656    }
3657
3658    /// v6.10.2 — derive the output `ColumnSchema` list for an
3659    /// AS OF SEGMENT projection. Wildcards take the full schema;
3660    /// expressions take the alias if present or a synthetic
3661    /// `?column?` (PG convention) otherwise.
3662    fn derive_output_columns(
3663        &self,
3664        items: &[SelectItem],
3665        schema_cols: &[ColumnSchema],
3666        _alias: &str,
3667    ) -> Vec<ColumnSchema> {
3668        let mut out = Vec::new();
3669        for item in items {
3670            match item {
3671                SelectItem::Wildcard => {
3672                    out.extend(schema_cols.iter().cloned());
3673                }
3674                SelectItem::Expr { alias, .. } => {
3675                    let name = alias
3676                        .clone()
3677                        .unwrap_or_else(|| "?column?".to_string());
3678                    // Default to Text; the caller's row values
3679                    // carry the actual type. v6.10.2 scope.
3680                    out.push(ColumnSchema::new(name, DataType::Text, true));
3681                }
3682            }
3683        }
3684        out
3685    }
3686
3687    fn exec_select_cancel(
3688        &self,
3689        stmt: &SelectStatement,
3690        cancel: CancelToken<'_>,
3691    ) -> Result<QueryResult, EngineError> {
3692        cancel.check()?;
3693        // v6.10.2 — cold-tier time-travel short-circuit. When the
3694        // primary TableRef carries `AS OF SEGMENT '<id>'`, run a
3695        // dedicated cold-segment scan instead of the regular
3696        // hot+index path. The scope is intentionally narrow for
3697        // v6.10.2 — bare `SELECT * FROM <t> AS OF SEGMENT 'id'`,
3698        // optionally with a single-column-equality WHERE. JOINs /
3699        // aggregates / ORDER BY / subqueries on top of a time-
3700        // travelled scan are STABILITY § "Out of v6.10".
3701        if let Some(from) = &stmt.from
3702            && let Some(seg_id) = from.primary.as_of_segment
3703        {
3704            return self.exec_select_as_of_segment(stmt, from, seg_id);
3705        }
3706        // v6.2.0 / v6.5.0 — virtual-table short-circuits. Detected
3707        // pre-CTE because they don't read from the catalog and
3708        // shouldn't participate in regular FROM resolution.
3709        if let Some(from) = &stmt.from
3710            && from.joins.is_empty()
3711            && stmt.where_.is_none()
3712            && stmt.group_by.is_none()
3713            && stmt.having.is_none()
3714            && stmt.unions.is_empty()
3715            && stmt.order_by.is_empty()
3716            && stmt.limit.is_none()
3717            && stmt.offset.is_none()
3718            && !stmt.distinct
3719            && stmt.items.iter().all(|i| matches!(i, SelectItem::Wildcard))
3720        {
3721            let lower = from.primary.name.to_ascii_lowercase();
3722            match lower.as_str() {
3723                "spg_statistic" => return Ok(self.exec_spg_statistic()),
3724                // v6.5.0 — observability v2 virtual tables.
3725                "spg_stat_replication" => return Ok(self.exec_spg_stat_replication()),
3726                "spg_stat_segment" => return Ok(self.exec_spg_stat_segment()),
3727                "spg_stat_query" => return Ok(self.exec_spg_stat_query()),
3728                "spg_stat_activity" => return Ok(self.exec_spg_stat_activity()),
3729                "spg_audit_chain" => return Ok(self.exec_spg_audit_chain()),
3730                "spg_audit_verify" => return Ok(self.exec_spg_audit_verify()),
3731                "spg_table_ddl" => return Ok(self.exec_spg_table_ddl()),
3732                "spg_role_ddl" => return Ok(self.exec_spg_role_ddl()),
3733                "spg_database_ddl" => return Ok(self.exec_spg_database_ddl()),
3734                _ => {}
3735            }
3736        }
3737        // v4.11: CTEs materialise into a temporary enriched catalog
3738        // *before* anything else — the body SELECT can then refer
3739        // to CTE names via the regular FROM-clause resolution.
3740        // Uncorrelated only: each CTE body runs once against the
3741        // current catalog, not against later CTEs' results (left-
3742        // to-right materialisation would relax this, but we keep
3743        // it simple for v4.11 MVP).
3744        if !stmt.ctes.is_empty() {
3745            return self.exec_with_ctes(stmt, cancel);
3746        }
3747        // v4.10: subqueries (uncorrelated) are resolved here, before
3748        // the executor sees the row loop. We clone the statement so
3749        // we can mutate without disturbing the caller's AST — most
3750        // queries pass through with no subquery nodes and the clone
3751        // is cheap; with subqueries the materialisation cost
3752        // dominates anyway.
3753        let mut stmt_owned;
3754        let stmt_ref: &SelectStatement = if expr_tree_has_subquery(stmt) {
3755            stmt_owned = stmt.clone();
3756            self.resolve_select_subqueries(&mut stmt_owned, cancel)?;
3757            &stmt_owned
3758        } else {
3759            stmt
3760        };
3761        if stmt_ref.unions.is_empty() {
3762            return self.exec_bare_select_cancel(stmt_ref, cancel);
3763        }
3764        // UNION path: clone-strip the head into a bare block (its own
3765        // DISTINCT and any inner ORDER BY are dropped by parser rule —
3766        // the wrapper SelectStatement carries them), execute, then chain
3767        // peers with left-associative dedup semantics.
3768        let mut head = stmt_ref.clone();
3769        head.unions = Vec::new();
3770        head.order_by = Vec::new();
3771        head.limit = None;
3772        let QueryResult::Rows { columns, mut rows } =
3773            self.exec_bare_select_cancel(&head, cancel)?
3774        else {
3775            unreachable!("bare SELECT cannot return CommandOk")
3776        };
3777        for (kind, peer) in &stmt_ref.unions {
3778            let QueryResult::Rows {
3779                columns: peer_cols,
3780                rows: peer_rows,
3781            } = self.exec_bare_select_cancel(peer, cancel)?
3782            else {
3783                unreachable!("bare SELECT cannot return CommandOk")
3784            };
3785            if peer_cols.len() != columns.len() {
3786                return Err(EngineError::Unsupported(alloc::format!(
3787                    "UNION arity mismatch: head has {} columns, peer has {}",
3788                    columns.len(),
3789                    peer_cols.len()
3790                )));
3791            }
3792            rows.extend(peer_rows);
3793            if matches!(kind, UnionKind::Distinct) {
3794                rows = dedup_rows(rows);
3795            }
3796        }
3797        // ORDER BY at the top of a UNION applies to the combined result.
3798        // Eval against the projected schema (NOT the source table).
3799        if !stmt.order_by.is_empty() {
3800            let synth_ctx = EvalContext::new(&columns, None);
3801            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3802            let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(rows.len());
3803            for r in rows {
3804                let keys = build_order_keys(&stmt.order_by, &r, &synth_ctx)?;
3805                tagged.push((keys, r));
3806            }
3807            sort_by_keys(&mut tagged, &descs);
3808            rows = tagged.into_iter().map(|(_, r)| r).collect();
3809        }
3810        apply_offset_and_limit(&mut rows, stmt.offset_literal(), stmt.limit_literal());
3811        Ok(QueryResult::Rows { columns, rows })
3812    }
3813
3814    #[allow(clippy::too_many_lines)]
3815    #[allow(clippy::too_many_lines)] // huge match — splitting fragments the planner
3816    fn exec_bare_select_cancel(
3817        &self,
3818        stmt: &SelectStatement,
3819        cancel: CancelToken<'_>,
3820    ) -> Result<QueryResult, EngineError> {
3821        // v4.12: window-function path. When the projection contains
3822        // any `name(args) OVER (...)` we route to the dedicated
3823        // executor — partition + sort + per-row window value before
3824        // the regular projection.
3825        if select_has_window(stmt) {
3826            return self.exec_select_with_window(stmt, cancel);
3827        }
3828        // Constant SELECT (no FROM) — evaluate each item once against an
3829        // empty dummy row. Useful for `SELECT 1`, `SELECT coalesce(...)`,
3830        // `SELECT '7'::INT`. Column references will surface as
3831        // ColumnNotFound on eval since the schema is empty.
3832        let Some(from) = &stmt.from else {
3833            let empty_schema: Vec<ColumnSchema> = Vec::new();
3834            let ctx = EvalContext::new(&empty_schema, None);
3835            let projection = build_projection(&stmt.items, &empty_schema, "")?;
3836            let dummy_row = Row::new(Vec::new());
3837            let mut values = Vec::with_capacity(projection.len());
3838            for p in &projection {
3839                values.push(eval::eval_expr(&p.expr, &dummy_row, &ctx)?);
3840            }
3841            let columns: Vec<ColumnSchema> = projection
3842                .into_iter()
3843                .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3844                .collect();
3845            return Ok(QueryResult::Rows {
3846                columns,
3847                rows: alloc::vec![Row::new(values)],
3848            });
3849        };
3850        // Multi-table FROM (one or more joined peers) goes through the
3851        // nested-loop join executor. Single-table FROM stays on the
3852        // existing scan + index-seek path.
3853        if !from.joins.is_empty() {
3854            return self.exec_joined_select(stmt, from);
3855        }
3856        let primary = &from.primary;
3857        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
3858            StorageError::TableNotFound {
3859                name: primary.name.clone(),
3860            }
3861        })?;
3862        let schema_cols = &table.schema().columns;
3863        // The qualifier accepted on column refs is the alias (if any) else the
3864        // bare table name.
3865        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
3866        let ctx = EvalContext::new(schema_cols, Some(alias));
3867
3868        // NSW kNN planner: `ORDER BY col <-> literal LIMIT k` with no
3869        // WHERE and an NSW index on `col` skips the full scan. The
3870        // walk returns rows already in ascending-distance order, so
3871        // ORDER BY / LIMIT are honoured implicitly.
3872        if let Some(nsw_rows) = try_nsw_knn(stmt, table, schema_cols, alias) {
3873            return materialise_in_order(stmt, table, schema_cols, alias, &nsw_rows);
3874        }
3875
3876        // Index seek: if WHERE is `col = literal` (or commuted) and the
3877        // referenced column has an index, dispatch each locator through
3878        // the catalog (hot tier → borrow, cold tier → page-read +
3879        // decode) and iterate just those rows. Otherwise fall back to a
3880        // full scan over the hot tier (cold-tier rows are only reached
3881        // via index seek in v5.1 — full table scans against cold-tier
3882        // data ship in v5.2 with the freezer's per-segment scan API).
3883        let indexed_rows: Option<Vec<Cow<'_, Row>>> = stmt
3884            .where_
3885            .as_ref()
3886            .and_then(|w| try_index_seek(w, schema_cols, self.active_catalog(), table, alias));
3887
3888        // Aggregate path: filter rows first, then hand off to the
3889        // aggregate executor which does its own projection + ORDER BY.
3890        if aggregate::uses_aggregate(stmt) {
3891            let mut filtered: Vec<&Row> = Vec::new();
3892            // v6.2.6 — Memoize: per-query LRU cache for correlated
3893            // scalar subqueries. Fresh per row-loop entry so each
3894            // SELECT execution gets an isolated cache.
3895            let mut memo = memoize::MemoizeCache::new();
3896            if let Some(rows) = &indexed_rows {
3897                for cow in rows {
3898                    let row = cow.as_ref();
3899                    if let Some(where_expr) = &stmt.where_ {
3900                        let cond = self.eval_expr_with_correlated(
3901                            where_expr,
3902                            row,
3903                            &ctx,
3904                            cancel,
3905                            Some(&mut memo),
3906                        )?;
3907                        if !matches!(cond, Value::Bool(true)) {
3908                            continue;
3909                        }
3910                    }
3911                    filtered.push(row);
3912                }
3913            } else {
3914                for i in 0..table.row_count() {
3915                    let row = &table.rows()[i];
3916                    if let Some(where_expr) = &stmt.where_ {
3917                        let cond = self.eval_expr_with_correlated(
3918                            where_expr,
3919                            row,
3920                            &ctx,
3921                            cancel,
3922                            Some(&mut memo),
3923                        )?;
3924                        if !matches!(cond, Value::Bool(true)) {
3925                            continue;
3926                        }
3927                    }
3928                    filtered.push(row);
3929                }
3930            }
3931            let mut agg = aggregate::run(stmt, &filtered, schema_cols, Some(alias))?;
3932            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
3933            return Ok(QueryResult::Rows {
3934                columns: agg.columns,
3935                rows: agg.rows,
3936            });
3937        }
3938
3939        let projection = build_projection(&stmt.items, schema_cols, alias)?;
3940
3941        // Materialise the filter pass into `(order_key, projected_row)`
3942        // tuples. The order key is `None` when there's no ORDER BY clause.
3943        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
3944        // v6.2.6 — Memoize per-row WHERE eval shares one cache.
3945        let mut memo = memoize::MemoizeCache::new();
3946        // Inline the per-row work in a closure so the indexed and full-
3947        // scan branches share the body.
3948        let mut process_row = |row: &Row, loop_idx: usize| -> Result<(), EngineError> {
3949            if loop_idx.is_multiple_of(256) {
3950                cancel.check()?;
3951            }
3952            if let Some(where_expr) = &stmt.where_ {
3953                let cond = self.eval_expr_with_correlated(
3954                    where_expr,
3955                    row,
3956                    &ctx,
3957                    cancel,
3958                    Some(&mut memo),
3959                )?;
3960                if !matches!(cond, Value::Bool(true)) {
3961                    return Ok(());
3962                }
3963            }
3964            let mut values = Vec::with_capacity(projection.len());
3965            for p in &projection {
3966                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
3967            }
3968            let order_keys = if stmt.order_by.is_empty() {
3969                Vec::new()
3970            } else {
3971                build_order_keys(&stmt.order_by, row, &ctx)?
3972            };
3973            tagged.push((order_keys, Row::new(values)));
3974            Ok(())
3975        };
3976        if let Some(rows) = &indexed_rows {
3977            for (loop_idx, cow) in rows.iter().enumerate() {
3978                process_row(cow.as_ref(), loop_idx)?;
3979            }
3980        } else {
3981            for i in 0..table.row_count() {
3982                process_row(&table.rows()[i], i)?;
3983            }
3984        }
3985
3986        if !stmt.order_by.is_empty() {
3987            // Partial-sort fast path: when LIMIT is small relative to
3988            // the row count, select_nth_unstable + sort just the
3989            // prefix is O(n + k log k) instead of O(n log n). DISTINCT
3990            // requires the full sort because de-dup happens after.
3991            let keep = if stmt.distinct {
3992                None
3993            } else {
3994                stmt.limit_literal()
3995                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
3996            };
3997            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3998            partial_sort_tagged(&mut tagged, keep, &descs);
3999        }
4000
4001        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4002        if stmt.distinct {
4003            output_rows = dedup_rows(output_rows);
4004        }
4005        apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4006
4007        let columns: Vec<ColumnSchema> = projection
4008            .into_iter()
4009            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4010            .collect();
4011
4012        Ok(QueryResult::Rows {
4013            columns,
4014            rows: output_rows,
4015        })
4016    }
4017
4018    /// Multi-table SELECT executor (one or more JOIN peers).
4019    ///
4020    /// v1.10 builds the joined row set up-front via nested-loop joins,
4021    /// then runs WHERE + projection + ORDER BY against the combined
4022    /// rows. No index seek. Aggregates and DISTINCT still work because
4023    /// the executor delegates projection through the same shared paths.
4024    #[allow(clippy::too_many_lines)]
4025    fn exec_joined_select(
4026        &self,
4027        stmt: &SelectStatement,
4028        from: &FromClause,
4029    ) -> Result<QueryResult, EngineError> {
4030        // Resolve every table reference up front so we surface
4031        // TableNotFound before we start the cartesian work.
4032        let primary_table = self
4033            .active_catalog()
4034            .get(&from.primary.name)
4035            .ok_or_else(|| StorageError::TableNotFound {
4036                name: from.primary.name.clone(),
4037            })?;
4038        let primary_alias = from
4039            .primary
4040            .alias
4041            .as_deref()
4042            .unwrap_or(from.primary.name.as_str())
4043            .to_string();
4044        let mut joined_tables: Vec<(&Table, String, JoinKind, Option<&Expr>)> = Vec::new();
4045        for j in &from.joins {
4046            let t = self.active_catalog().get(&j.table.name).ok_or_else(|| {
4047                StorageError::TableNotFound {
4048                    name: j.table.name.clone(),
4049                }
4050            })?;
4051            let a = j
4052                .table
4053                .alias
4054                .as_deref()
4055                .unwrap_or(j.table.name.as_str())
4056                .to_string();
4057            joined_tables.push((t, a, j.kind, j.on.as_ref()));
4058        }
4059
4060        // Build the combined schema: composite "alias.col" names so the
4061        // qualified-column resolver can find anything by exact match.
4062        let mut combined_schema: Vec<ColumnSchema> = Vec::new();
4063        for col in &primary_table.schema().columns {
4064            combined_schema.push(ColumnSchema::new(
4065                alloc::format!("{primary_alias}.{}", col.name),
4066                col.ty,
4067                col.nullable,
4068            ));
4069        }
4070        for (t, a, _, _) in &joined_tables {
4071            for col in &t.schema().columns {
4072                combined_schema.push(ColumnSchema::new(
4073                    alloc::format!("{a}.{}", col.name),
4074                    col.ty,
4075                    col.nullable,
4076                ));
4077            }
4078        }
4079        let ctx = EvalContext::new(&combined_schema, None);
4080
4081        // Nested-loop join. Starting set: every primary row, padded with
4082        // (no joined columns yet).
4083        let mut working: Vec<Row> = primary_table.rows().iter().cloned().collect();
4084        let mut produced_len = primary_table.schema().columns.len();
4085        for (t, _, kind, on) in &joined_tables {
4086            let right_arity = t.schema().columns.len();
4087            let mut next: Vec<Row> = Vec::new();
4088            for left in &working {
4089                let mut left_matched = false;
4090                for right in t.rows() {
4091                    let mut combined_vals = left.values.clone();
4092                    combined_vals.extend(right.values.iter().cloned());
4093                    // Pad combined to the eventual full width so the
4094                    // partial schema still matches positions used by ON.
4095                    let combined = Row::new(combined_vals);
4096                    let keep = if let Some(on_expr) = on {
4097                        let cond = eval::eval_expr(on_expr, &combined, &ctx)?;
4098                        matches!(cond, Value::Bool(true))
4099                    } else {
4100                        // CROSS / comma-list: every pair survives.
4101                        true
4102                    };
4103                    if keep {
4104                        next.push(combined);
4105                        left_matched = true;
4106                    }
4107                }
4108                if !left_matched && matches!(kind, JoinKind::Left) {
4109                    // LEFT OUTER JOIN: emit the left row with NULLs on
4110                    // the right side when no peer matched.
4111                    let mut combined_vals = left.values.clone();
4112                    for _ in 0..right_arity {
4113                        combined_vals.push(Value::Null);
4114                    }
4115                    next.push(Row::new(combined_vals));
4116                }
4117            }
4118            working = next;
4119            produced_len += right_arity;
4120            debug_assert!(produced_len <= combined_schema.len());
4121        }
4122
4123        // WHERE filter against combined rows.
4124        let mut filtered: Vec<Row> = Vec::new();
4125        for row in working {
4126            if let Some(where_expr) = &stmt.where_ {
4127                let cond = eval::eval_expr(where_expr, &row, &ctx)?;
4128                if !matches!(cond, Value::Bool(true)) {
4129                    continue;
4130                }
4131            }
4132            filtered.push(row);
4133        }
4134
4135        // Aggregate path: handle GROUP BY / aggregate calls over the
4136        // joined+filtered rows.
4137        if aggregate::uses_aggregate(stmt) {
4138            let refs: Vec<&Row> = filtered.iter().collect();
4139            let mut agg = aggregate::run(stmt, &refs, &combined_schema, None)?;
4140            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
4141            return Ok(QueryResult::Rows {
4142                columns: agg.columns,
4143                rows: agg.rows,
4144            });
4145        }
4146
4147        let projection = build_projection(&stmt.items, &combined_schema, "")?;
4148        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
4149        for row in &filtered {
4150            let mut values = Vec::with_capacity(projection.len());
4151            for p in &projection {
4152                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4153            }
4154            let order_keys = if stmt.order_by.is_empty() {
4155                Vec::new()
4156            } else {
4157                build_order_keys(&stmt.order_by, row, &ctx)?
4158            };
4159            tagged.push((order_keys, Row::new(values)));
4160        }
4161        if !stmt.order_by.is_empty() {
4162            let keep = if stmt.distinct {
4163                None
4164            } else {
4165                stmt.limit_literal()
4166                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
4167            };
4168            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4169            partial_sort_tagged(&mut tagged, keep, &descs);
4170        }
4171        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4172        if stmt.distinct {
4173            output_rows = dedup_rows(output_rows);
4174        }
4175        apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4176        let columns: Vec<ColumnSchema> = projection
4177            .into_iter()
4178            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4179            .collect();
4180        Ok(QueryResult::Rows {
4181            columns,
4182            rows: output_rows,
4183        })
4184    }
4185}
4186
4187/// One row-producing projection: an expression to evaluate, the resulting
4188/// column's user-visible name, its inferred type, and nullability.
4189#[derive(Debug, Clone)]
4190struct ProjectedItem {
4191    expr: Expr,
4192    output_name: String,
4193    ty: DataType,
4194    nullable: bool,
4195}
4196
4197/// Dedupe a row set, preserving first-seen order. `Row`'s `PartialEq` is
4198/// structural (`Vec<Value>` ⇒ pairwise `Value` equality), which gives SQL
4199/// `NULL = NULL → TRUE` and `NaN = NaN → FALSE`. The first agrees with
4200/// the spec's "two NULLs are not distinct"; the second is a tolerated
4201/// quirk for v1 (no NaN literals are reachable from the SQL surface).
4202fn dedup_rows(rows: Vec<Row>) -> Vec<Row> {
4203    let mut out: Vec<Row> = Vec::with_capacity(rows.len());
4204    for r in rows {
4205        if !out.iter().any(|seen| seen == &r) {
4206            out.push(r);
4207        }
4208    }
4209    out
4210}
4211
4212/// Coerce a `Value` to an `f64` sort key for ORDER BY. Numbers map directly;
4213/// NULL sorts last (treated as `+∞`); booleans are 0.0 / 1.0; text uses lex
4214/// order via the byte values; vectors are not sortable.
4215fn value_to_order_key(v: &Value) -> Result<f64, EngineError> {
4216    match v {
4217        Value::Null => Ok(f64::INFINITY),
4218        Value::SmallInt(n) => Ok(f64::from(*n)),
4219        Value::Int(n) => Ok(f64::from(*n)),
4220        Value::Date(d) => Ok(f64::from(*d)),
4221        #[allow(clippy::cast_precision_loss)]
4222        Value::Timestamp(t) => Ok(*t as f64),
4223        #[allow(clippy::cast_precision_loss)]
4224        Value::Numeric { scaled, scale } => {
4225            // Scaled integer / 10^scale, computed via f64 for sort
4226            // ordering only. Precision losses here only matter for
4227            // ORDER BY tie-breaks well past 15 significant digits.
4228            // `f64::powi` lives in std; we hand-roll the loop so the
4229            // no_std engine crate doesn't need it.
4230            let mut divisor = 1.0_f64;
4231            for _ in 0..*scale {
4232                divisor *= 10.0;
4233            }
4234            Ok((*scaled as f64) / divisor)
4235        }
4236        #[allow(clippy::cast_precision_loss)]
4237        Value::BigInt(n) => Ok(*n as f64),
4238        Value::Float(x) => Ok(*x),
4239        Value::Bool(b) => Ok(if *b { 1.0 } else { 0.0 }),
4240        Value::Text(s) => {
4241            // Lex order by codepoints — good enough for ORDER BY name.
4242            // Map first 8 bytes packed into u64 as a coarse key; ties fall to
4243            // partial_cmp Equal. v1.x can swap in a real string comparator.
4244            let mut key: u64 = 0;
4245            for &b in s.as_bytes().iter().take(8) {
4246                key = (key << 8) | u64::from(b);
4247            }
4248            #[allow(clippy::cast_precision_loss)]
4249            Ok(key as f64)
4250        }
4251        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
4252            Err(EngineError::Unsupported(
4253                "ORDER BY of a raw vector column is not meaningful — use `<->`".into(),
4254            ))
4255        }
4256        Value::Interval { .. } => Err(EngineError::Unsupported(
4257            "ORDER BY of an INTERVAL is not supported in v2.11 \
4258             (months vs micros has no single canonical ordering)"
4259                .into(),
4260        )),
4261        Value::Json(_) => Err(EngineError::Unsupported(
4262            "ORDER BY of a JSON value is not supported — cast the document to text first".into(),
4263        )),
4264        // v7.5.0 — Value is #[non_exhaustive]; future variants need
4265        // an explicit ORDER BY mapping. Surface as Unsupported until
4266        // engine support is added.
4267        _ => Err(EngineError::Unsupported(
4268            "ORDER BY of this value type is not supported".into(),
4269        )),
4270    }
4271}
4272
4273/// Try to plan a WHERE clause as an equality lookup against an existing
4274/// index. Returns the candidate row indices on success; `None` means the
4275/// caller should fall back to a full scan.
4276///
4277/// v0.8 recognises a single top-level `col = literal` (in either operand
4278/// order). AND chains and range scans land in later milestones.
4279/// Look for `ORDER BY col <dist-op> literal LIMIT k` against an
4280/// NSW-indexed vector column. Recognised distance ops: `<->` (L2),
4281/// `<#>` (inner product), `<=>` (cosine). When a WHERE clause is
4282/// present, the planner does an "over-fetch and filter" pass — it
4283/// asks the graph for `k * over_fetch` candidates, evaluates WHERE
4284/// against each, and trims back to `k`. Returns the row indices in
4285/// ascending-distance order when the plan applies.
4286fn try_nsw_knn(
4287    stmt: &SelectStatement,
4288    table: &Table,
4289    schema_cols: &[ColumnSchema],
4290    table_alias: &str,
4291) -> Option<Vec<usize>> {
4292    if stmt.distinct {
4293        return None;
4294    }
4295    let limit = usize::try_from(stmt.limit_literal()?).ok()?;
4296    if limit == 0 {
4297        return None;
4298    }
4299    // v6.4.0 — NSW kNN dispatch needs a single ORDER BY key on the
4300    // distance metric. Multi-key ORDER BY falls through to the
4301    // generic sort path.
4302    if stmt.order_by.len() != 1 {
4303        return None;
4304    }
4305    let order = &stmt.order_by[0];
4306    // NSW kNN returns rows ascending by distance — DESC inverts the
4307    // natural order, so the planner can't handle it without a sort
4308    // pass. Fall back to the generic ORDER BY path.
4309    if order.desc {
4310        return None;
4311    }
4312    let Expr::Binary { lhs, op, rhs } = &order.expr else {
4313        return None;
4314    };
4315    let metric = match op {
4316        BinOp::L2Distance => spg_storage::NswMetric::L2,
4317        BinOp::InnerProduct => spg_storage::NswMetric::InnerProduct,
4318        BinOp::CosineDistance => spg_storage::NswMetric::Cosine,
4319        _ => return None,
4320    };
4321    // Accept both `col <op> literal` and `literal <op> col`.
4322    let ((Expr::Column(col), literal) | (literal, Expr::Column(col))) =
4323        (lhs.as_ref(), rhs.as_ref())
4324    else {
4325        return None;
4326    };
4327    if let Some(q) = &col.qualifier
4328        && q != table_alias
4329    {
4330        return None;
4331    }
4332    let col_pos = schema_cols.iter().position(|s| s.name == col.name)?;
4333    let query = literal_to_vector(literal)?;
4334    let idx = spg_storage::nsw_index_on(table, col_pos)?;
4335    if let Some(where_expr) = &stmt.where_ {
4336        // Over-fetch and filter. The factor (10×) is a heuristic that
4337        // covers typical selectivity for the corpus tests; v2.x will
4338        // make it configurable.
4339        let over_fetch = limit.saturating_mul(10).max(NSW_OVER_FETCH_FLOOR);
4340        let candidates = spg_storage::nsw_query(table, &idx.name, &query, over_fetch, metric);
4341        let ctx = EvalContext::new(schema_cols, Some(table_alias));
4342        let mut kept: Vec<usize> = Vec::with_capacity(limit);
4343        for i in candidates {
4344            let row = &table.rows()[i];
4345            let cond = eval::eval_expr(where_expr, row, &ctx).ok()?;
4346            if matches!(cond, Value::Bool(true)) {
4347                kept.push(i);
4348                if kept.len() >= limit {
4349                    break;
4350                }
4351            }
4352        }
4353        Some(kept)
4354    } else {
4355        Some(spg_storage::nsw_query(
4356            table, &idx.name, &query, limit, metric,
4357        ))
4358    }
4359}
4360
4361/// Lower bound on the over-fetch pool when WHERE is present — even
4362/// for tiny `LIMIT 1` queries we keep enough candidates to absorb a
4363/// few WHERE rejections.
4364const NSW_OVER_FETCH_FLOOR: usize = 32;
4365
4366/// Pull a `Vec<f32>` out of a literal-or-cast expression. Returns
4367/// `None` for anything we can't fold at plan time.
4368fn literal_to_vector(e: &Expr) -> Option<Vec<f32>> {
4369    match e {
4370        Expr::Literal(Literal::Vector(v)) => Some(v.clone()),
4371        Expr::Cast { expr, .. } => literal_to_vector(expr),
4372        _ => None,
4373    }
4374}
4375
4376/// Materialise rows in a planner-supplied order (used by the NSW path)
4377/// without re-running ORDER BY. The projection + LIMIT slot mirror the
4378/// equivalent block in `exec_bare_select`.
4379fn materialise_in_order(
4380    stmt: &SelectStatement,
4381    table: &Table,
4382    schema_cols: &[ColumnSchema],
4383    table_alias: &str,
4384    ordered_rows: &[usize],
4385) -> Result<QueryResult, EngineError> {
4386    let ctx = EvalContext::new(schema_cols, Some(table_alias));
4387    let projection = build_projection(&stmt.items, schema_cols, table_alias)?;
4388    let mut output_rows: Vec<Row> = Vec::with_capacity(ordered_rows.len());
4389    for &i in ordered_rows {
4390        let row = &table.rows()[i];
4391        let mut values = Vec::with_capacity(projection.len());
4392        for p in &projection {
4393            values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4394        }
4395        output_rows.push(Row::new(values));
4396    }
4397    apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4398    let columns: Vec<ColumnSchema> = projection
4399        .into_iter()
4400        .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4401        .collect();
4402    Ok(QueryResult::Rows {
4403        columns,
4404        rows: output_rows,
4405    })
4406}
4407
4408fn try_index_seek<'a>(
4409    where_expr: &Expr,
4410    schema_cols: &[ColumnSchema],
4411    catalog: &'a Catalog,
4412    table: &'a Table,
4413    table_alias: &str,
4414) -> Option<Vec<Cow<'a, Row>>> {
4415    let Expr::Binary {
4416        lhs,
4417        op: BinOp::Eq,
4418        rhs,
4419    } = where_expr
4420    else {
4421        return None;
4422    };
4423    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4424        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4425    let idx = table.index_on(col_pos)?;
4426    let key = IndexKey::from_value(&value)?;
4427    let locators = idx.lookup_eq(&key);
4428    let table_name = table.schema().name.as_str();
4429    // v5.1: each locator dispatches to either the hot tier (zero-
4430    // copy borrow of `table.rows()[i]`) or a cold-tier segment
4431    // (one page read + dense row decode, ~µs scale). Cold rows are
4432    // returned as `Cow::Owned` so the caller's `&Row` iteration
4433    // doesn't see a tier distinction; pre-freezer (no cold
4434    // segments loaded) every locator is `Hot` and every entry is
4435    // `Cow::Borrowed` — identical cost to the pre-v5.1 path.
4436    let mut out: Vec<Cow<'a, Row>> = Vec::with_capacity(locators.len());
4437    for loc in locators {
4438        match *loc {
4439            spg_storage::RowLocator::Hot(i) => {
4440                if let Some(row) = table.rows().get(i) {
4441                    out.push(Cow::Borrowed(row));
4442                }
4443            }
4444            spg_storage::RowLocator::Cold { segment_id, .. } => {
4445                if let Some(row) = catalog.resolve_cold_locator(table_name, segment_id, &key) {
4446                    out.push(Cow::Owned(row));
4447                }
4448            }
4449        }
4450    }
4451    Some(out)
4452}
4453
4454/// v5.2.3: extract `(column_position, IndexKey)` when `where_expr`
4455/// is a simple `col = literal` predicate suitable for a `BTree` index
4456/// seek. Used by `exec_update_cancel` / `exec_delete_cancel` to
4457/// decide whether a write touches a cold-tier row (which requires
4458/// promote-on-write / shadow-on-delete) before falling through to
4459/// the hot-tier row walk.
4460///
4461/// Returns `None` for any predicate shape the planner can't push
4462/// down to an index seek — complex WHERE clauses always take the
4463/// hot-only path (cold rows are immutable to non-indexed writes
4464/// until a future scan-fanout sub-version).
4465fn try_pk_predicate(
4466    where_expr: &Expr,
4467    schema_cols: &[ColumnSchema],
4468    table_alias: &str,
4469) -> Option<(usize, IndexKey)> {
4470    let Expr::Binary {
4471        lhs,
4472        op: BinOp::Eq,
4473        rhs,
4474    } = where_expr
4475    else {
4476        return None;
4477    };
4478    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4479        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4480    let key = IndexKey::from_value(&value)?;
4481    Some((col_pos, key))
4482}
4483
4484fn resolve_col_literal_pair(
4485    col_side: &Expr,
4486    lit_side: &Expr,
4487    schema_cols: &[ColumnSchema],
4488    table_alias: &str,
4489) -> Option<(usize, Value)> {
4490    let Expr::Column(c) = col_side else {
4491        return None;
4492    };
4493    if let Some(q) = &c.qualifier
4494        && q != table_alias
4495    {
4496        return None;
4497    }
4498    let pos = schema_cols.iter().position(|s| s.name == c.name)?;
4499    let Expr::Literal(l) = lit_side else {
4500        return None;
4501    };
4502    let v = match l {
4503        Literal::Integer(n) => {
4504            if let Ok(small) = i32::try_from(*n) {
4505                Value::Int(small)
4506            } else {
4507                Value::BigInt(*n)
4508            }
4509        }
4510        Literal::Float(x) => Value::Float(*x),
4511        Literal::String(s) => Value::Text(s.clone()),
4512        Literal::Bool(b) => Value::Bool(*b),
4513        Literal::Null => Value::Null,
4514        // Vector and Interval literals can't be used as B-tree index keys.
4515        // Tell the planner to fall back to full-scan.
4516        Literal::Vector(_) | Literal::Interval { .. } => return None,
4517    };
4518    Some((pos, v))
4519}
4520
4521/// Find the schema entry that a SELECT-list `Expr::Column` refers to.
4522/// Mirrors `resolve_column` in `eval.rs`, but returns a proper
4523/// `EngineError` so the projection-build path keeps `UnknownQualifier`
4524/// vs `ColumnNotFound` distinct.
4525fn resolve_projection_column<'a>(
4526    c: &ColumnName,
4527    schema_cols: &'a [ColumnSchema],
4528    table_alias: &str,
4529) -> Result<&'a ColumnSchema, EngineError> {
4530    if let Some(q) = &c.qualifier {
4531        let composite = alloc::format!("{q}.{name}", name = c.name);
4532        if let Some(s) = schema_cols.iter().find(|s| s.name == composite) {
4533            return Ok(s);
4534        }
4535        // Single-table case: the qualifier may equal the active alias —
4536        // then look for the bare column name.
4537        if q == table_alias
4538            && let Some(s) = schema_cols.iter().find(|s| s.name == c.name)
4539        {
4540            return Ok(s);
4541        }
4542        // For multi-table schemas the qualifier is unknown only if no
4543        // column bears the "<q>." prefix. For single-table, the alias
4544        // mismatch alone is enough.
4545        let prefix = alloc::format!("{q}.");
4546        let qualifier_known =
4547            q == table_alias || schema_cols.iter().any(|s| s.name.starts_with(&prefix));
4548        if !qualifier_known {
4549            return Err(EngineError::Eval(EvalError::UnknownQualifier {
4550                qualifier: q.clone(),
4551            }));
4552        }
4553        return Err(EngineError::Eval(EvalError::ColumnNotFound {
4554            name: c.name.clone(),
4555        }));
4556    }
4557    if let Some(s) = schema_cols.iter().find(|s| s.name == c.name) {
4558        return Ok(s);
4559    }
4560    let suffix = alloc::format!(".{name}", name = c.name);
4561    let mut matches = schema_cols.iter().filter(|s| s.name.ends_with(&suffix));
4562    let first = matches.next();
4563    let extra = matches.next();
4564    match (first, extra) {
4565        (Some(s), None) => Ok(s),
4566        (Some(_), Some(_)) => Err(EngineError::Eval(EvalError::TypeMismatch {
4567            detail: alloc::format!("ambiguous column reference: {}", c.name),
4568        })),
4569        _ => Err(EngineError::Eval(EvalError::ColumnNotFound {
4570            name: c.name.clone(),
4571        })),
4572    }
4573}
4574
4575fn build_projection(
4576    items: &[SelectItem],
4577    schema_cols: &[ColumnSchema],
4578    table_alias: &str,
4579) -> Result<Vec<ProjectedItem>, EngineError> {
4580    let mut out = Vec::new();
4581    for item in items {
4582        match item {
4583            SelectItem::Wildcard => {
4584                for col in schema_cols {
4585                    out.push(ProjectedItem {
4586                        expr: Expr::Column(ColumnName {
4587                            qualifier: None,
4588                            name: col.name.clone(),
4589                        }),
4590                        output_name: col.name.clone(),
4591                        ty: col.ty,
4592                        nullable: col.nullable,
4593                    });
4594                }
4595            }
4596            SelectItem::Expr { expr, alias } => {
4597                // Plain column ref keeps full schema info (real type +
4598                // nullability). Compound expressions evaluate fine but have
4599                // no static type — surface them as nullable TEXT, which is
4600                // what most clients render anyway.
4601                if let Expr::Column(c) = expr {
4602                    let sch = resolve_projection_column(c, schema_cols, table_alias)?;
4603                    let output_name = alias.clone().unwrap_or_else(|| c.name.clone());
4604                    out.push(ProjectedItem {
4605                        expr: expr.clone(),
4606                        output_name,
4607                        ty: sch.ty,
4608                        nullable: sch.nullable,
4609                    });
4610                } else {
4611                    let output_name = alias.clone().unwrap_or_else(|| expr.to_string());
4612                    out.push(ProjectedItem {
4613                        expr: expr.clone(),
4614                        output_name,
4615                        ty: DataType::Text,
4616                        nullable: true,
4617                    });
4618                }
4619            }
4620        }
4621    }
4622    Ok(out)
4623}
4624
4625/// Promote an integer to a NUMERIC value at the requested scale.
4626/// Rejects values that, after scaling, would overflow the column's
4627/// precision budget.
4628fn numeric_from_integer(
4629    n: i128,
4630    precision: u8,
4631    scale: u8,
4632    col_name: &str,
4633) -> Result<Value, EngineError> {
4634    let factor = pow10_i128(scale);
4635    let scaled = n.checked_mul(factor).ok_or_else(|| {
4636        EngineError::Unsupported(alloc::format!(
4637            "integer overflow scaling value for column `{col_name}` to scale {scale}"
4638        ))
4639    })?;
4640    check_precision(scaled, precision, col_name)?;
4641    Ok(Value::Numeric { scaled, scale })
4642}
4643
4644/// Float → NUMERIC. Uses round-half-away-from-zero on `x * 10^scale`,
4645/// then verifies the result fits the column's precision.
4646#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
4647fn numeric_from_float(
4648    x: f64,
4649    precision: u8,
4650    scale: u8,
4651    col_name: &str,
4652) -> Result<Value, EngineError> {
4653    if !x.is_finite() {
4654        return Err(EngineError::Unsupported(alloc::format!(
4655            "cannot store non-finite float in NUMERIC column `{col_name}`"
4656        )));
4657    }
4658    let mut factor = 1.0_f64;
4659    for _ in 0..scale {
4660        factor *= 10.0;
4661    }
4662    // Round half-away-from-zero by biasing then casting (`as i128`
4663    // truncates toward zero, so the bias + truncation gives the
4664    // desired rounding). `f64::floor` / `ceil` live in std; we don't
4665    // need them — the cast handles the truncation step.
4666    let shifted = x * factor;
4667    let biased = if shifted >= 0.0 {
4668        shifted + 0.5
4669    } else {
4670        shifted - 0.5
4671    };
4672    // Range-check before casting back to i128 — the cast itself is
4673    // saturating in Rust, which would silently truncate huge inputs.
4674    if !(-1e38..=1e38).contains(&biased) {
4675        return Err(EngineError::Unsupported(alloc::format!(
4676            "value {x} overflows NUMERIC range for column `{col_name}`"
4677        )));
4678    }
4679    let scaled = biased as i128;
4680    check_precision(scaled, precision, col_name)?;
4681    Ok(Value::Numeric { scaled, scale })
4682}
4683
4684/// Move a Numeric value from `src_scale` to `dst_scale`. Going up
4685/// multiplies by 10; going down rounds half-away-from-zero.
4686fn numeric_rescale(
4687    scaled: i128,
4688    src_scale: u8,
4689    precision: u8,
4690    dst_scale: u8,
4691    col_name: &str,
4692) -> Result<Value, EngineError> {
4693    let new_scaled = if dst_scale >= src_scale {
4694        let bump = pow10_i128(dst_scale - src_scale);
4695        scaled.checked_mul(bump).ok_or_else(|| {
4696            EngineError::Unsupported(alloc::format!(
4697                "overflow rescaling NUMERIC for column `{col_name}`"
4698            ))
4699        })?
4700    } else {
4701        let drop = pow10_i128(src_scale - dst_scale);
4702        let half = drop / 2;
4703        if scaled >= 0 {
4704            (scaled + half) / drop
4705        } else {
4706            (scaled - half) / drop
4707        }
4708    };
4709    check_precision(new_scaled, precision, col_name)?;
4710    Ok(Value::Numeric {
4711        scaled: new_scaled,
4712        scale: dst_scale,
4713    })
4714}
4715
4716/// Drop the fractional part of a scaled integer, returning the integer
4717/// portion (toward zero). Used for NUMERIC → INT casts.
4718const fn numeric_truncate_to_integer(scaled: i128, scale: u8) -> i128 {
4719    if scale == 0 {
4720        return scaled;
4721    }
4722    let factor = pow10_i128_const(scale);
4723    scaled / factor
4724}
4725
4726/// Verify a scaled NUMERIC value fits the column's declared precision.
4727/// `precision == 0` is the "unconstrained" form (bare `NUMERIC`); we
4728/// skip the check there.
4729fn check_precision(scaled: i128, precision: u8, col_name: &str) -> Result<(), EngineError> {
4730    if precision == 0 {
4731        return Ok(());
4732    }
4733    let limit = pow10_i128(precision);
4734    if scaled.unsigned_abs() >= limit.unsigned_abs() {
4735        return Err(EngineError::Unsupported(alloc::format!(
4736            "NUMERIC value exceeds precision {precision} for column `{col_name}`"
4737        )));
4738    }
4739    Ok(())
4740}
4741
/// `10^p` as an `i128`, usable in `const` contexts.
///
/// The result is exact for `p <= 38`. `10^39` no longer fits in an
/// `i128`, so for larger exponents the result saturates at `i128::MAX`
/// instead of overflowing (which would panic in builds with overflow
/// checks and wrap to a meaningless value otherwise).
const fn pow10_i128_const(p: u8) -> i128 {
    let mut acc: i128 = 1;
    let mut i = 0;
    while i < p {
        acc = acc.saturating_mul(10);
        i += 1;
    }
    acc
}

/// Non-`const` entry point for [`pow10_i128_const`]; returns the same
/// value for every `p`.
fn pow10_i128(p: u8) -> i128 {
    pow10_i128_const(p)
}
4755
4756/// Walk a parsed `Statement`, swapping any `NOW()` /
4757/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()` function calls for a
4758/// literal cast that wraps the engine's per-statement clock reading.
4759/// When `now_micros` is `None`, calls stay as-is and surface as
4760/// `unknown function` at eval time — keeps the error path explicit.
4761/// v4.10: pre-walk the WHERE / projection / etc. of a SELECT and
4762/// replace every subquery node with a materialised literal. SPG
4763/// only supports uncorrelated subqueries — the inner SELECT does
4764/// not see outer-row columns, so the result is the same for every
4765/// outer row and can be evaluated once.
4766///
4767/// Returns the rewritten statement; the caller passes this to the
4768/// regular row-loop executor which no longer sees Subquery nodes
4769/// in its tree.
4770impl Engine {
4771    /// v4.12 window executor. Implements `ROW_NUMBER` / `RANK` /
4772    /// `DENSE_RANK` and the partition-aware aggregates `SUM` /
4773    /// `AVG` / `COUNT` / `MIN` / `MAX`. The plan is:
4774    /// 1. Apply the WHERE filter.
4775    /// 2. For each unique `WindowFunction` node in the projection,
4776    ///    partition + sort, compute the per-row value.
4777    /// 3. Append the window values as synthetic columns (`__win_N`)
4778    ///    to the row schema.
4779    /// 4. Rewrite the projection to read those columns.
4780    /// 5. Hand off to the regular project / ORDER BY / LIMIT pipe.
4781    #[allow(
4782        clippy::too_many_lines,
4783        clippy::type_complexity,
4784        clippy::needless_range_loop
4785    )] // window-eval is one cohesive pipe; splitting fragments
4786    fn exec_select_with_window(
4787        &self,
4788        stmt: &SelectStatement,
4789        cancel: CancelToken<'_>,
4790    ) -> Result<QueryResult, EngineError> {
4791        let from = stmt.from.as_ref().ok_or_else(|| {
4792            EngineError::Unsupported("window functions require a FROM clause".into())
4793        })?;
4794        // For v4.12 we only support a single-table FROM. Joins +
4795        // windows is queued for v5.x.
4796        if !from.joins.is_empty() {
4797            return Err(EngineError::Unsupported(
4798                "JOIN with window functions not yet supported".into(),
4799            ));
4800        }
4801        let primary = &from.primary;
4802        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
4803            StorageError::TableNotFound {
4804                name: primary.name.clone(),
4805            }
4806        })?;
4807        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
4808        let schema_cols = &table.schema().columns;
4809        let ctx = EvalContext::new(schema_cols, Some(alias));
4810
4811        // 1) Filter pass.
4812        let mut filtered: Vec<&Row> = Vec::new();
4813        for (i, row) in table.rows().iter().enumerate() {
4814            if i.is_multiple_of(256) {
4815                cancel.check()?;
4816            }
4817            if let Some(w) = &stmt.where_ {
4818                let cond = eval::eval_expr(w, row, &ctx)?;
4819                if !matches!(cond, Value::Bool(true)) {
4820                    continue;
4821                }
4822            }
4823            filtered.push(row);
4824        }
4825        let n_rows = filtered.len();
4826
4827        // 2) Collect unique window function nodes from projection.
4828        let mut window_nodes: Vec<Expr> = Vec::new();
4829        for item in &stmt.items {
4830            if let SelectItem::Expr { expr, .. } = item {
4831                collect_window_nodes(expr, &mut window_nodes);
4832            }
4833        }
4834
4835        // 3) For each window, compute per-row value.
4836        // Index: same order as window_nodes; for row i, win_vals[w][i].
4837        let mut win_vals: Vec<Vec<Value>> = Vec::with_capacity(window_nodes.len());
4838        for wnode in &window_nodes {
4839            let Expr::WindowFunction {
4840                name,
4841                args,
4842                partition_by,
4843                order_by,
4844                frame,
4845                null_treatment,
4846            } = wnode
4847            else {
4848                unreachable!("collect_window_nodes pushes only WindowFunction");
4849            };
4850            // Compute (partition_key, order_key, original_index) for each row.
4851            let mut indexed: Vec<(Vec<Value>, Vec<(Value, bool)>, usize)> =
4852                Vec::with_capacity(n_rows);
4853            for (i, row) in filtered.iter().enumerate() {
4854                let pkey: Vec<Value> = partition_by
4855                    .iter()
4856                    .map(|p| eval::eval_expr(p, row, &ctx))
4857                    .collect::<Result<_, _>>()?;
4858                let okey: Vec<(Value, bool)> = order_by
4859                    .iter()
4860                    .map(|(e, desc)| eval::eval_expr(e, row, &ctx).map(|v| (v, *desc)))
4861                    .collect::<Result<_, _>>()?;
4862                indexed.push((pkey, okey, i));
4863            }
4864            // Sort by (partition_key, order_key). Partition key uses
4865            // a stable encoded form; order key respects ASC/DESC.
4866            indexed.sort_by(|a, b| {
4867                let p_cmp = partition_key_cmp(&a.0, &b.0);
4868                if p_cmp != core::cmp::Ordering::Equal {
4869                    return p_cmp;
4870                }
4871                order_key_cmp(&a.1, &b.1)
4872            });
4873            // Per-partition compute.
4874            let mut out_vals: Vec<Value> = alloc::vec![Value::Null; n_rows];
4875            let mut p_start = 0;
4876            while p_start < indexed.len() {
4877                let mut p_end = p_start + 1;
4878                while p_end < indexed.len()
4879                    && partition_key_cmp(&indexed[p_start].0, &indexed[p_end].0)
4880                        == core::cmp::Ordering::Equal
4881                {
4882                    p_end += 1;
4883                }
4884                // Compute the function within this partition slice.
4885                compute_window_partition(
4886                    name,
4887                    args,
4888                    !order_by.is_empty(),
4889                    frame.as_ref(),
4890                    *null_treatment,
4891                    &indexed[p_start..p_end],
4892                    &filtered,
4893                    &ctx,
4894                    &mut out_vals,
4895                )?;
4896                p_start = p_end;
4897            }
4898            win_vals.push(out_vals);
4899        }
4900
4901        // 4) Build extended schema: original columns + synthetic.
4902        let mut ext_cols = schema_cols.clone();
4903        for i in 0..window_nodes.len() {
4904            ext_cols.push(ColumnSchema::new(
4905                alloc::format!("__win_{i}"),
4906                DataType::Text, // type doesn't matter for projection eval
4907                true,
4908            ));
4909        }
4910        // 5) Build extended rows: each row gets its window values appended.
4911        let mut ext_rows: Vec<Row> = Vec::with_capacity(n_rows);
4912        for i in 0..n_rows {
4913            let mut values = filtered[i].values.clone();
4914            for w in 0..window_nodes.len() {
4915                values.push(win_vals[w][i].clone());
4916            }
4917            ext_rows.push(Row::new(values));
4918        }
4919        // 6) Rewrite the projection: WindowFunction nodes → Column(__win_N).
4920        let mut rewritten_items: Vec<SelectItem> = Vec::with_capacity(stmt.items.len());
4921        for item in &stmt.items {
4922            let new_item = match item {
4923                SelectItem::Wildcard => SelectItem::Wildcard,
4924                SelectItem::Expr { expr, alias } => {
4925                    let mut e = expr.clone();
4926                    rewrite_window_to_columns(&mut e, &window_nodes);
4927                    SelectItem::Expr {
4928                        expr: e,
4929                        alias: alias.clone(),
4930                    }
4931                }
4932            };
4933            rewritten_items.push(new_item);
4934        }
4935
4936        // 7) Project into final rows.
4937        let ext_ctx = EvalContext::new(&ext_cols, Some(alias));
4938        let projection = build_projection(&rewritten_items, &ext_cols, alias)?;
4939        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(n_rows);
4940        for (i, row) in ext_rows.iter().enumerate() {
4941            if i.is_multiple_of(256) {
4942                cancel.check()?;
4943            }
4944            let mut values = Vec::with_capacity(projection.len());
4945            for p in &projection {
4946                values.push(eval::eval_expr(&p.expr, row, &ext_ctx)?);
4947            }
4948            let order_keys = if stmt.order_by.is_empty() {
4949                Vec::new()
4950            } else {
4951                let mut keys = Vec::with_capacity(stmt.order_by.len());
4952                for o in &stmt.order_by {
4953                    let mut e = o.expr.clone();
4954                    rewrite_window_to_columns(&mut e, &window_nodes);
4955                    let key = eval::eval_expr(&e, row, &ext_ctx)?;
4956                    keys.push(value_to_order_key(&key)?);
4957                }
4958                keys
4959            };
4960            tagged.push((order_keys, Row::new(values)));
4961        }
4962        // ORDER BY + LIMIT/OFFSET on the projected rows.
4963        if !stmt.order_by.is_empty() {
4964            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4965            sort_by_keys(&mut tagged, &descs);
4966        }
4967        let mut out_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4968        apply_offset_and_limit(&mut out_rows, stmt.offset_literal(), stmt.limit_literal());
4969        let final_cols: Vec<ColumnSchema> = projection
4970            .into_iter()
4971            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4972            .collect();
4973        Ok(QueryResult::Rows {
4974            columns: final_cols,
4975            rows: out_rows,
4976        })
4977    }
4978
4979    /// v4.11: materialise each CTE into a temp table inside a
4980    /// cloned catalog, then run the body SELECT against a fresh
4981    /// engine instance that owns the enriched catalog. The clone
4982    /// is moderately expensive — only paid by CTE-bearing queries.
4983    /// Subqueries inside CTE bodies / the main body resolve as
4984    /// usual; `clock_fn` is propagated so `NOW()` lines up.
4985    fn exec_with_ctes(
4986        &self,
4987        stmt: &SelectStatement,
4988        cancel: CancelToken<'_>,
4989    ) -> Result<QueryResult, EngineError> {
4990        cancel.check()?;
4991        let mut catalog = self.active_catalog().clone();
4992        for cte in &stmt.ctes {
4993            if catalog.get(&cte.name).is_some() {
4994                return Err(EngineError::Unsupported(alloc::format!(
4995                    "CTE name {:?} shadows an existing table; rename the CTE",
4996                    cte.name
4997                )));
4998            }
4999            let (columns, rows) = if cte.recursive {
5000                self.materialise_recursive_cte(cte, &catalog, cancel)?
5001            } else {
5002                let body_result = self.exec_select_cancel(&cte.body, cancel)?;
5003                let QueryResult::Rows { columns, rows } = body_result else {
5004                    return Err(EngineError::Unsupported(alloc::format!(
5005                        "CTE {:?} body did not return rows",
5006                        cte.name
5007                    )));
5008                };
5009                (columns, rows)
5010            };
5011            // v4.22: the projection builder labels any non-column
5012            // expression as Text — including literal SELECT 1.
5013            // Promote each column's type to whatever the rows
5014            // actually carry so the CTE storage table accepts them.
5015            let inferred = infer_column_types(&columns, &rows);
5016            let mut columns = inferred;
5017            // v4.22: apply optional `WITH name(a, b, c)` overrides.
5018            if !cte.column_overrides.is_empty() {
5019                if cte.column_overrides.len() != columns.len() {
5020                    return Err(EngineError::Unsupported(alloc::format!(
5021                        "CTE {:?} column list has {} names but body returns {} columns",
5022                        cte.name,
5023                        cte.column_overrides.len(),
5024                        columns.len()
5025                    )));
5026                }
5027                for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
5028                    col.name.clone_from(name);
5029                }
5030            }
5031            let schema = TableSchema::new(cte.name.clone(), columns);
5032            catalog.create_table(schema).map_err(EngineError::Storage)?;
5033            let table = catalog
5034                .get_mut(&cte.name)
5035                .expect("just-created CTE table must exist");
5036            for row in rows {
5037                table.insert(row).map_err(EngineError::Storage)?;
5038            }
5039        }
5040        // Strip CTEs from the body before running on the temp engine
5041        // so we don't recurse forever.
5042        let mut body = stmt.clone();
5043        body.ctes = Vec::new();
5044        let mut temp = Engine::restore(catalog);
5045        if let Some(c) = self.clock {
5046            temp = temp.with_clock(c);
5047        }
5048        if let Some(f) = self.salt_fn {
5049            temp = temp.with_salt_fn(f);
5050        }
5051        temp.exec_select_cancel(&body, cancel)
5052    }
5053
5054    /// v4.22: materialise a WITH RECURSIVE CTE. The body must be a
5055    /// UNION (or UNION ALL) of an anchor that does not reference
5056    /// the CTE name, and one or more recursive terms that do. The
5057    /// anchor runs first; each subsequent iteration runs the
5058    /// recursive term against a temp catalog where the CTE name is
5059    /// bound to the *previous* iteration's output. Iteration stops
5060    /// when the recursive term yields no rows; UNION (DISTINCT)
5061    /// deduplicates against the accumulated result, UNION ALL does
5062    /// not. A hard cap on total rows prevents runaway queries.
5063    #[allow(clippy::too_many_lines)]
5064    fn materialise_recursive_cte(
5065        &self,
5066        cte: &spg_sql::ast::Cte,
5067        base_catalog: &Catalog,
5068        cancel: CancelToken<'_>,
5069    ) -> Result<(Vec<ColumnSchema>, Vec<Row>), EngineError> {
5070        const MAX_TOTAL_ROWS: usize = 1_000_000;
5071        const MAX_ITERATIONS: usize = 100_000;
5072        cancel.check()?;
5073        if cte.body.unions.is_empty() {
5074            return Err(EngineError::Unsupported(alloc::format!(
5075                "WITH RECURSIVE {:?} body must be a UNION of an anchor and a recursive term",
5076                cte.name
5077            )));
5078        }
5079        // Anchor: the body's leading SELECT, with unions stripped.
5080        let mut anchor = cte.body.clone();
5081        let union_terms = core::mem::take(&mut anchor.unions);
5082        anchor.ctes = Vec::new();
5083        // Anchor must not reference the CTE name.
5084        if select_refers_to(&anchor, &cte.name) {
5085            return Err(EngineError::Unsupported(alloc::format!(
5086                "WITH RECURSIVE {:?}: the anchor must not reference the CTE itself",
5087                cte.name
5088            )));
5089        }
5090        let anchor_result = self.exec_select_cancel(&anchor, cancel)?;
5091        let QueryResult::Rows {
5092            columns: anchor_cols,
5093            rows: anchor_rows,
5094        } = anchor_result
5095        else {
5096            return Err(EngineError::Unsupported(alloc::format!(
5097                "WITH RECURSIVE {:?}: anchor did not return rows",
5098                cte.name
5099            )));
5100        };
5101        // The projection builder labels non-column expressions Text;
5102        // refine column types from the anchor's actual values so the
5103        // intermediate iter-catalog tables accept them.
5104        let mut columns = infer_column_types(&anchor_cols, &anchor_rows);
5105        if !cte.column_overrides.is_empty() {
5106            if cte.column_overrides.len() != columns.len() {
5107                return Err(EngineError::Unsupported(alloc::format!(
5108                    "CTE {:?} column list has {} names but anchor returns {} columns",
5109                    cte.name,
5110                    cte.column_overrides.len(),
5111                    columns.len()
5112                )));
5113            }
5114            for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
5115                col.name.clone_from(name);
5116            }
5117        }
5118        let mut all_rows: Vec<Row> = anchor_rows.clone();
5119        let mut working_set: Vec<Row> = anchor_rows;
5120        let mut seen: alloc::collections::BTreeSet<Vec<u8>> = alloc::collections::BTreeSet::new();
5121        // Track at least one "all UNION ALL" flag — if every union
5122        // kind is ALL we skip the dedup step (faster + matches PG).
5123        let all_union_all = union_terms.iter().all(|(k, _)| matches!(k, UnionKind::All));
5124        if !all_union_all {
5125            for r in &all_rows {
5126                seen.insert(encode_row_key(r));
5127            }
5128        }
5129        for iter in 0..MAX_ITERATIONS {
5130            cancel.check()?;
5131            if working_set.is_empty() {
5132                break;
5133            }
5134            // Build a fresh catalog: base + CTE bound to working_set.
5135            let mut iter_catalog = base_catalog.clone();
5136            let schema = TableSchema::new(cte.name.clone(), columns.clone());
5137            iter_catalog
5138                .create_table(schema)
5139                .map_err(EngineError::Storage)?;
5140            {
5141                let table = iter_catalog.get_mut(&cte.name).expect("just-created");
5142                for row in &working_set {
5143                    table.insert(row.clone()).map_err(EngineError::Storage)?;
5144                }
5145            }
5146            let mut iter_engine = Engine::restore(iter_catalog);
5147            if let Some(c) = self.clock {
5148                iter_engine = iter_engine.with_clock(c);
5149            }
5150            if let Some(f) = self.salt_fn {
5151                iter_engine = iter_engine.with_salt_fn(f);
5152            }
5153            // Run each recursive term in sequence and collect new rows.
5154            let mut next_set: Vec<Row> = Vec::new();
5155            for (_, term) in &union_terms {
5156                let mut term = term.clone();
5157                term.ctes = Vec::new();
5158                let r = iter_engine.exec_select_cancel(&term, cancel)?;
5159                let QueryResult::Rows {
5160                    columns: rc,
5161                    rows: rs,
5162                } = r
5163                else {
5164                    return Err(EngineError::Unsupported(alloc::format!(
5165                        "WITH RECURSIVE {:?}: recursive term did not return rows",
5166                        cte.name
5167                    )));
5168                };
5169                if rc.len() != columns.len() {
5170                    return Err(EngineError::Unsupported(alloc::format!(
5171                        "WITH RECURSIVE {:?}: column count of recursive term ({}) does not match anchor ({})",
5172                        cte.name,
5173                        rc.len(),
5174                        columns.len()
5175                    )));
5176                }
5177                for row in rs {
5178                    if !all_union_all {
5179                        let key = encode_row_key(&row);
5180                        if !seen.insert(key) {
5181                            continue;
5182                        }
5183                    }
5184                    next_set.push(row);
5185                }
5186            }
5187            if next_set.is_empty() {
5188                break;
5189            }
5190            all_rows.extend(next_set.iter().cloned());
5191            working_set = next_set;
5192            if all_rows.len() > MAX_TOTAL_ROWS {
5193                return Err(EngineError::Unsupported(alloc::format!(
5194                    "WITH RECURSIVE {:?}: produced more than {MAX_TOTAL_ROWS} rows — likely runaway recursion",
5195                    cte.name
5196                )));
5197            }
5198            if iter + 1 == MAX_ITERATIONS {
5199                return Err(EngineError::Unsupported(alloc::format!(
5200                    "WITH RECURSIVE {:?}: exceeded {MAX_ITERATIONS} iterations",
5201                    cte.name
5202                )));
5203            }
5204        }
5205        Ok((columns, all_rows))
5206    }
5207
5208    fn resolve_select_subqueries(
5209        &self,
5210        stmt: &mut SelectStatement,
5211        cancel: CancelToken<'_>,
5212    ) -> Result<(), EngineError> {
5213        for item in &mut stmt.items {
5214            if let SelectItem::Expr { expr, .. } = item {
5215                self.resolve_expr_subqueries(expr, cancel)?;
5216            }
5217        }
5218        if let Some(w) = &mut stmt.where_ {
5219            self.resolve_expr_subqueries(w, cancel)?;
5220        }
5221        if let Some(gs) = &mut stmt.group_by {
5222            for g in gs {
5223                self.resolve_expr_subqueries(g, cancel)?;
5224            }
5225        }
5226        if let Some(h) = &mut stmt.having {
5227            self.resolve_expr_subqueries(h, cancel)?;
5228        }
5229        for o in &mut stmt.order_by {
5230            self.resolve_expr_subqueries(&mut o.expr, cancel)?;
5231        }
5232        for (_, peer) in &mut stmt.unions {
5233            self.resolve_select_subqueries(peer, cancel)?;
5234        }
5235        Ok(())
5236    }
5237
5238    #[allow(clippy::only_used_in_recursion)] // engine handle reads aren't really pure
5239    fn resolve_expr_subqueries(
5240        &self,
5241        e: &mut Expr,
5242        cancel: CancelToken<'_>,
5243    ) -> Result<(), EngineError> {
5244        // Replace-on-this-node cases first.
5245        if let Some(replacement) = self.subquery_replacement(e, cancel)? {
5246            *e = replacement;
5247            return Ok(());
5248        }
5249        match e {
5250            Expr::Binary { lhs, rhs, .. } => {
5251                self.resolve_expr_subqueries(lhs, cancel)?;
5252                self.resolve_expr_subqueries(rhs, cancel)?;
5253            }
5254            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5255                self.resolve_expr_subqueries(expr, cancel)?;
5256            }
5257            Expr::FunctionCall { args, .. } => {
5258                for a in args {
5259                    self.resolve_expr_subqueries(a, cancel)?;
5260                }
5261            }
5262            Expr::Like { expr, pattern, .. } => {
5263                self.resolve_expr_subqueries(expr, cancel)?;
5264                self.resolve_expr_subqueries(pattern, cancel)?;
5265            }
5266            Expr::Extract { source, .. } => self.resolve_expr_subqueries(source, cancel)?,
5267            // v4.12 window functions — recurse into args + ORDER BY
5268            // + PARTITION BY in case they carry inner subqueries.
5269            Expr::WindowFunction {
5270                args,
5271                partition_by,
5272                order_by,
5273                ..
5274            } => {
5275                for a in args {
5276                    self.resolve_expr_subqueries(a, cancel)?;
5277                }
5278                for p in partition_by {
5279                    self.resolve_expr_subqueries(p, cancel)?;
5280                }
5281                for (e, _) in order_by {
5282                    self.resolve_expr_subqueries(e, cancel)?;
5283                }
5284            }
5285            // Subquery nodes are handled in subquery_replacement
5286            // (which returned None — defensive no-op); Literal /
5287            // Column are leaves.
5288            Expr::ScalarSubquery(_)
5289            | Expr::Exists { .. }
5290            | Expr::InSubquery { .. }
5291            | Expr::Literal(_)
5292            | Expr::Placeholder(_)
5293            | Expr::Column(_) => {}
5294            // v7.10.10 — recurse children.
5295            Expr::Array(items) => {
5296                for elem in items {
5297                    self.resolve_expr_subqueries(elem, cancel)?;
5298                }
5299            }
5300            Expr::ArraySubscript { target, index } => {
5301                self.resolve_expr_subqueries(target, cancel)?;
5302                self.resolve_expr_subqueries(index, cancel)?;
5303            }
5304            Expr::AnyAll { expr, array, .. } => {
5305                self.resolve_expr_subqueries(expr, cancel)?;
5306                self.resolve_expr_subqueries(array, cancel)?;
5307            }
5308        }
5309        Ok(())
5310    }
5311
5312    /// v4.23: per-row eval that handles correlated subqueries.
5313    /// Equivalent to `eval::eval_expr` when the expression has no
5314    /// subqueries; otherwise clones the expression, substitutes
5315    /// outer-row columns into each surviving subquery node, runs
5316    /// the inner SELECT, and replaces the node with the literal
5317    /// result. Only the WHERE-filter call sites use this path so
5318    /// the uncorrelated fast path is preserved everywhere else.
5319    fn eval_expr_with_correlated(
5320        &self,
5321        expr: &Expr,
5322        row: &Row,
5323        ctx: &EvalContext<'_>,
5324        cancel: CancelToken<'_>,
5325        memo: Option<&mut memoize::MemoizeCache>,
5326    ) -> Result<Value, EngineError> {
5327        if !expr_has_subquery(expr) {
5328            return eval::eval_expr(expr, row, ctx).map_err(EngineError::Eval);
5329        }
5330        let mut e = expr.clone();
5331        self.resolve_correlated_in_expr(&mut e, row, ctx, cancel, memo)?;
5332        eval::eval_expr(&e, row, ctx).map_err(EngineError::Eval)
5333    }
5334
5335    fn resolve_correlated_in_expr(
5336        &self,
5337        e: &mut Expr,
5338        row: &Row,
5339        ctx: &EvalContext<'_>,
5340        cancel: CancelToken<'_>,
5341        mut memo: Option<&mut memoize::MemoizeCache>,
5342    ) -> Result<(), EngineError> {
5343        match e {
5344            Expr::ScalarSubquery(inner) => {
5345                // v6.2.6 — Memoize: build the cache key from the
5346                // pre-substitution subquery repr + the outer row's
5347                // values. Two outer rows with identical correlated
5348                // values hit the same entry.
5349                let cache_key = memo.as_ref().map(|_| memoize::CacheKey {
5350                    subquery_repr: alloc::format!("{}", **inner),
5351                    outer_values: row.values.clone(),
5352                });
5353                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key.as_ref())
5354                    && let Some(cached) = cache.get(k)
5355                {
5356                    *e = value_to_literal_expr(cached)?;
5357                    return Ok(());
5358                }
5359                let mut s = (**inner).clone();
5360                substitute_outer_columns(&mut s, row, ctx);
5361                let r = self.exec_select_cancel(&s, cancel)?;
5362                let QueryResult::Rows { rows, .. } = r else {
5363                    return Err(EngineError::Unsupported(
5364                        "scalar subquery: inner did not return rows".into(),
5365                    ));
5366                };
5367                let value = match rows.as_slice() {
5368                    [] => Value::Null,
5369                    [r0] => r0.values.first().cloned().unwrap_or(Value::Null),
5370                    _ => {
5371                        return Err(EngineError::Unsupported(alloc::format!(
5372                            "scalar subquery returned {} rows; expected 0 or 1",
5373                            rows.len()
5374                        )));
5375                    }
5376                };
5377                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key) {
5378                    cache.insert(k, value.clone());
5379                }
5380                *e = value_to_literal_expr(value)?;
5381            }
5382            Expr::Exists { subquery, negated } => {
5383                let mut s = (**subquery).clone();
5384                substitute_outer_columns(&mut s, row, ctx);
5385                let r = self.exec_select_cancel(&s, cancel)?;
5386                let exists = matches!(r, QueryResult::Rows { rows, .. } if !rows.is_empty());
5387                let bit = if *negated { !exists } else { exists };
5388                *e = Expr::Literal(Literal::Bool(bit));
5389            }
5390            Expr::InSubquery {
5391                expr: lhs,
5392                subquery,
5393                negated,
5394            } => {
5395                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5396                let lhs_val = eval::eval_expr(lhs, row, ctx).map_err(EngineError::Eval)?;
5397                let mut s = (**subquery).clone();
5398                substitute_outer_columns(&mut s, row, ctx);
5399                let r = self.exec_select_cancel(&s, cancel)?;
5400                let QueryResult::Rows { columns, rows, .. } = r else {
5401                    return Err(EngineError::Unsupported(
5402                        "IN-subquery: inner did not return rows".into(),
5403                    ));
5404                };
5405                if columns.len() != 1 {
5406                    return Err(EngineError::Unsupported(alloc::format!(
5407                        "IN-subquery must project exactly one column; got {}",
5408                        columns.len()
5409                    )));
5410                }
5411                let mut found = false;
5412                let mut any_null = false;
5413                for r0 in rows {
5414                    let v = r0.values.into_iter().next().unwrap_or(Value::Null);
5415                    if v.is_null() {
5416                        any_null = true;
5417                        continue;
5418                    }
5419                    if value_cmp(&v, &lhs_val) == core::cmp::Ordering::Equal {
5420                        found = true;
5421                        break;
5422                    }
5423                }
5424                let bit = if found {
5425                    !*negated
5426                } else if any_null {
5427                    return Err(EngineError::Unsupported(
5428                        "IN-subquery with NULL in result and no match: NULL semantics not yet implemented".into(),
5429                    ));
5430                } else {
5431                    *negated
5432                };
5433                *e = Expr::Literal(Literal::Bool(bit));
5434            }
5435            Expr::Binary { lhs, rhs, .. } => {
5436                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5437                self.resolve_correlated_in_expr(rhs, row, ctx, cancel, memo.as_deref_mut())?;
5438            }
5439            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5440                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5441            }
5442            Expr::Like { expr, pattern, .. } => {
5443                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5444                self.resolve_correlated_in_expr(pattern, row, ctx, cancel, memo.as_deref_mut())?;
5445            }
5446            Expr::FunctionCall { args, .. } => {
5447                for a in args {
5448                    self.resolve_correlated_in_expr(a, row, ctx, cancel, memo.as_deref_mut())?;
5449                }
5450            }
5451            Expr::Extract { source, .. } => {
5452                self.resolve_correlated_in_expr(source, row, ctx, cancel, memo.as_deref_mut())?;
5453            }
5454            Expr::WindowFunction { .. } | Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
5455            // v7.10.10 — recurse children.
5456            Expr::Array(items) => {
5457                for elem in items {
5458                    self.resolve_correlated_in_expr(elem, row, ctx, cancel, memo.as_deref_mut())?;
5459                }
5460            }
5461            Expr::ArraySubscript { target, index } => {
5462                self.resolve_correlated_in_expr(target, row, ctx, cancel, memo.as_deref_mut())?;
5463                self.resolve_correlated_in_expr(index, row, ctx, cancel, memo.as_deref_mut())?;
5464            }
5465            Expr::AnyAll { expr, array, .. } => {
5466                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5467                self.resolve_correlated_in_expr(array, row, ctx, cancel, memo.as_deref_mut())?;
5468            }
5469        }
5470        Ok(())
5471    }
5472
5473    fn subquery_replacement(
5474        &self,
5475        e: &Expr,
5476        cancel: CancelToken<'_>,
5477    ) -> Result<Option<Expr>, EngineError> {
5478        match e {
5479            Expr::ScalarSubquery(inner) => {
5480                let mut s = (**inner).clone();
5481                // Recurse into the inner SELECT first so nested
5482                // subqueries materialise bottom-up.
5483                self.resolve_select_subqueries(&mut s, cancel)?;
5484                let r = match self.exec_bare_select_cancel(&s, cancel) {
5485                    Ok(r) => r,
5486                    Err(e) if is_correlation_error(&e) => return Ok(None),
5487                    Err(e) => return Err(e),
5488                };
5489                let QueryResult::Rows { rows, .. } = r else {
5490                    return Err(EngineError::Unsupported(
5491                        "scalar subquery: inner statement did not return rows".into(),
5492                    ));
5493                };
5494                let value = match rows.as_slice() {
5495                    [] => Value::Null,
5496                    [row] => row.values.first().cloned().unwrap_or(Value::Null),
5497                    _ => {
5498                        return Err(EngineError::Unsupported(alloc::format!(
5499                            "scalar subquery returned {} rows; expected 0 or 1",
5500                            rows.len()
5501                        )));
5502                    }
5503                };
5504                Ok(Some(value_to_literal_expr(value)?))
5505            }
5506            Expr::Exists { subquery, negated } => {
5507                let mut s = (**subquery).clone();
5508                self.resolve_select_subqueries(&mut s, cancel)?;
5509                let r = match self.exec_bare_select_cancel(&s, cancel) {
5510                    Ok(r) => r,
5511                    Err(e) if is_correlation_error(&e) => return Ok(None),
5512                    Err(e) => return Err(e),
5513                };
5514                let exists = match r {
5515                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
5516                    QueryResult::CommandOk { .. } => false,
5517                };
5518                let bit = if *negated { !exists } else { exists };
5519                Ok(Some(Expr::Literal(Literal::Bool(bit))))
5520            }
5521            Expr::InSubquery {
5522                expr,
5523                subquery,
5524                negated,
5525            } => {
5526                let mut s = (**subquery).clone();
5527                self.resolve_select_subqueries(&mut s, cancel)?;
5528                let r = match self.exec_bare_select_cancel(&s, cancel) {
5529                    Ok(r) => r,
5530                    Err(e) if is_correlation_error(&e) => return Ok(None),
5531                    Err(e) => return Err(e),
5532                };
5533                let QueryResult::Rows { columns, rows, .. } = r else {
5534                    return Err(EngineError::Unsupported(
5535                        "IN-subquery: inner statement did not return rows".into(),
5536                    ));
5537                };
5538                if columns.len() != 1 {
5539                    return Err(EngineError::Unsupported(alloc::format!(
5540                        "IN-subquery must project exactly one column; got {}",
5541                        columns.len()
5542                    )));
5543                }
5544                // Build the same OR-Eq chain the parse-time literal-list
5545                // path constructs, with each value lifted into a Literal.
5546                let mut acc: Option<Expr> = None;
5547                for row in rows {
5548                    let v = row.values.into_iter().next().unwrap_or(Value::Null);
5549                    let lit = value_to_literal_expr(v)?;
5550                    let cmp = Expr::Binary {
5551                        lhs: expr.clone(),
5552                        op: BinOp::Eq,
5553                        rhs: Box::new(lit),
5554                    };
5555                    acc = Some(match acc {
5556                        None => cmp,
5557                        Some(prev) => Expr::Binary {
5558                            lhs: Box::new(prev),
5559                            op: BinOp::Or,
5560                            rhs: Box::new(cmp),
5561                        },
5562                    });
5563                }
5564                let combined = acc.unwrap_or(Expr::Literal(Literal::Bool(false)));
5565                let final_expr = if *negated {
5566                    Expr::Unary {
5567                        op: UnOp::Not,
5568                        expr: Box::new(combined),
5569                    }
5570                } else {
5571                    combined
5572                };
5573                Ok(Some(final_expr))
5574            }
5575            _ => Ok(None),
5576        }
5577    }
5578}
5579
5580// ---- v4.12 window-function helpers ----
5581// The (partition-key, order-key, original-index) tuple shape used
5582// across these helpers is intrinsic to the planner. Factoring it
5583// into a typedef adds indirection without making the code clearer,
5584// so several lints are allowed inline on the affected functions
5585// rather than module-wide.
5586
5587/// v4.22: cheap structural scan for `FROM <name>` (qualified or
5588/// not) inside a SELECT — used to verify the anchor of a WITH
5589/// RECURSIVE CTE doesn't recurse into itself. Conservative: walks
5590/// FROM joins, subqueries, and unions.
5591fn select_refers_to(stmt: &SelectStatement, target: &str) -> bool {
5592    if let Some(from) = &stmt.from
5593        && from_refers_to(from, target)
5594    {
5595        return true;
5596    }
5597    for (_, peer) in &stmt.unions {
5598        if select_refers_to(peer, target) {
5599            return true;
5600        }
5601    }
5602    for item in &stmt.items {
5603        if let SelectItem::Expr { expr, .. } = item
5604            && expr_refers_to(expr, target)
5605        {
5606            return true;
5607        }
5608    }
5609    if let Some(w) = &stmt.where_
5610        && expr_refers_to(w, target)
5611    {
5612        return true;
5613    }
5614    false
5615}
5616
5617fn from_refers_to(from: &FromClause, target: &str) -> bool {
5618    if from.primary.name.eq_ignore_ascii_case(target) {
5619        return true;
5620    }
5621    from.joins
5622        .iter()
5623        .any(|j| j.table.name.eq_ignore_ascii_case(target))
5624}
5625
5626fn expr_refers_to(e: &Expr, target: &str) -> bool {
5627    match e {
5628        Expr::ScalarSubquery(s) => select_refers_to(s, target),
5629        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
5630            select_refers_to(subquery, target)
5631        }
5632        Expr::Binary { lhs, rhs, .. } => expr_refers_to(lhs, target) || expr_refers_to(rhs, target),
5633        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5634            expr_refers_to(expr, target)
5635        }
5636        Expr::Like { expr, pattern, .. } => {
5637            expr_refers_to(expr, target) || expr_refers_to(pattern, target)
5638        }
5639        Expr::FunctionCall { args, .. } => args.iter().any(|a| expr_refers_to(a, target)),
5640        Expr::Extract { source, .. } => expr_refers_to(source, target),
5641        Expr::WindowFunction {
5642            args,
5643            partition_by,
5644            order_by,
5645            ..
5646        } => {
5647            args.iter().any(|a| expr_refers_to(a, target))
5648                || partition_by.iter().any(|p| expr_refers_to(p, target))
5649                || order_by.iter().any(|(o, _)| expr_refers_to(o, target))
5650        }
5651        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
5652        Expr::Array(items) => items.iter().any(|e| expr_refers_to(e, target)),
5653        Expr::ArraySubscript { target: t, index } => {
5654            expr_refers_to(t, target) || expr_refers_to(index, target)
5655        }
5656        Expr::AnyAll { expr, array, .. } => {
5657            expr_refers_to(expr, target) || expr_refers_to(array, target)
5658        }
5659    }
5660}
5661
5662/// v4.22: pick more specific column types from observed rows when
5663/// the projection builder defaulted to Text (the v1.x behavior for
5664/// non-column expressions). Lets `WITH t(n) AS (SELECT 1 ...)`
5665/// land an Int column in the CTE storage table rather than failing
5666/// the insert with "expected TEXT, got INT".
5667fn infer_column_types(columns: &[ColumnSchema], rows: &[Row]) -> Vec<ColumnSchema> {
5668    let mut out = columns.to_vec();
5669    for (col_idx, col) in out.iter_mut().enumerate() {
5670        if col.ty != DataType::Text {
5671            continue;
5672        }
5673        let mut inferred: Option<DataType> = None;
5674        let mut all_null = true;
5675        for row in rows {
5676            let Some(v) = row.values.get(col_idx) else {
5677                continue;
5678            };
5679            let ty = match v {
5680                Value::Null => continue,
5681                Value::SmallInt(_) => DataType::SmallInt,
5682                Value::Int(_) => DataType::Int,
5683                Value::BigInt(_) => DataType::BigInt,
5684                Value::Float(_) => DataType::Float,
5685                Value::Bool(_) => DataType::Bool,
5686                Value::Vector(_) => DataType::Vector {
5687                    dim: 0,
5688                    encoding: VecEncoding::F32,
5689                },
5690                _ => DataType::Text,
5691            };
5692            all_null = false;
5693            inferred = Some(match inferred {
5694                None => ty,
5695                Some(prev) if prev == ty => prev,
5696                Some(_) => DataType::Text,
5697            });
5698        }
5699        if let Some(t) = inferred {
5700            col.ty = t;
5701            col.nullable = true;
5702        } else if all_null {
5703            col.nullable = true;
5704        }
5705    }
5706    out
5707}
5708
5709/// v4.26: render a human-readable plan tree for `EXPLAIN <select>`.
5710/// Lines are pushed into `out`; `depth` controls indentation. We
5711/// describe the rewritten SELECT — what the executor *would* do —
5712/// using the engine handle to spot indexed lookups and table shapes.
5713#[allow(clippy::too_many_lines, clippy::format_push_string)]
5714/// v6.2.4 — Walk every line of the rendered plan tree and append
5715/// per-operator stats. Lines that name a known operator get
5716/// `(rows=N)` (`actual_rows` of the top-level operator equals the
5717/// final result row count; scans report their catalog row count
5718/// as the rows-considered metric). Other lines — Filter / Join /
5719/// GroupBy / OrderBy etc. — are marked `(—)` so the surface is
5720/// complete-by-construction; v6.2.5 fills these in via inline
5721/// executor counters.
5722/// v6.8.3 — surface "CREATE INDEX …" suggestions for every
5723/// `(table, column)` pair the query touches via WHERE / JOIN
5724/// that doesn't already have an index on the owning table.
5725/// Walks the SELECT's FROM clauses + WHERE expression tree;
5726/// returns one line per missing index. Deterministic order:
5727/// FROM-clause iteration order, then column-reference walk
5728/// order inside each WHERE. Each suggestion is a copy-pastable
5729/// DDL string.
5730fn build_index_suggestions(stmt: &SelectStatement, engine: &Engine) -> Vec<String> {
5731    use alloc::collections::BTreeSet;
5732    let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
5733    let mut out: Vec<String> = Vec::new();
5734    let cat = engine.active_catalog();
5735    // Build a (table, qualifier-or-alias) list from the FROM clause
5736    // so unqualified column refs in WHERE resolve to the correct
5737    // table.
5738    let Some(from) = &stmt.from else {
5739        return out;
5740    };
5741    let mut tables: Vec<String> = Vec::new();
5742    tables.push(from.primary.name.clone());
5743    for j in &from.joins {
5744        tables.push(j.table.name.clone());
5745    }
5746    // Collect column refs from the WHERE expression. JOIN ON
5747    // predicates also feed in.
5748    let mut col_refs: Vec<spg_sql::ast::ColumnName> = Vec::new();
5749    if let Some(w) = &stmt.where_ {
5750        collect_column_refs(w, &mut col_refs);
5751    }
5752    for j in &from.joins {
5753        if let Some(on) = &j.on {
5754            collect_column_refs(on, &mut col_refs);
5755        }
5756    }
5757    for cn in &col_refs {
5758        // Resolve owner table: explicit qualifier first, else
5759        // first table in FROM that has a column of this name.
5760        let owner: Option<String> = if let Some(q) = &cn.qualifier {
5761            tables.iter().find(|t| t == &q).cloned()
5762        } else {
5763            tables.iter().find_map(|t| {
5764                cat.get(t).and_then(|tbl| {
5765                    if tbl.schema().column_position(&cn.name).is_some() {
5766                        Some(t.clone())
5767                    } else {
5768                        None
5769                    }
5770                })
5771            })
5772        };
5773        let Some(owner) = owner else {
5774            continue;
5775        };
5776        let Some(tbl) = cat.get(&owner) else {
5777            continue;
5778        };
5779        let Some(col_pos) = tbl.schema().column_position(&cn.name) else {
5780            continue;
5781        };
5782        // Skip if any BTree index already covers this column as
5783        // its key.
5784        let already_indexed = tbl.indices().iter().any(|i| {
5785            matches!(i.kind, spg_storage::IndexKind::BTree(_))
5786                && i.column_position == col_pos
5787                && i.expression.is_none()
5788                && i.partial_predicate.is_none()
5789        });
5790        if already_indexed {
5791            continue;
5792        }
5793        if seen.insert((owner.clone(), cn.name.clone())) {
5794            out.push(alloc::format!(
5795                "SUGGEST: CREATE INDEX ix_{}_{} ON {} ({})",
5796                owner,
5797                cn.name,
5798                owner,
5799                cn.name
5800            ));
5801        }
5802    }
5803    out
5804}
5805
5806/// Walks an `Expr` and pushes every `ColumnName` it references.
5807/// Order is depth-first, left-to-right.
5808fn collect_column_refs(expr: &Expr, out: &mut Vec<spg_sql::ast::ColumnName>) {
5809    match expr {
5810        Expr::Column(cn) => out.push(cn.clone()),
5811        Expr::FunctionCall { args, .. } => {
5812            for a in args {
5813                collect_column_refs(a, out);
5814            }
5815        }
5816        Expr::Binary { lhs, rhs, .. } => {
5817            collect_column_refs(lhs, out);
5818            collect_column_refs(rhs, out);
5819        }
5820        Expr::Unary { expr: e, .. } => collect_column_refs(e, out),
5821        _ => {}
5822    }
5823}
5824
5825fn annotate_explain_lines(lines: &mut [String], total_rows: usize, engine: &Engine) {
5826    let catalog = engine.active_catalog();
5827    let cold_ids = catalog.cold_segment_ids_global();
5828    let any_cold = !cold_ids.is_empty();
5829    let cold_ids_repr = if any_cold {
5830        let mut s = alloc::string::String::from("[");
5831        for (i, id) in cold_ids.iter().enumerate() {
5832            if i > 0 {
5833                s.push(',');
5834            }
5835            s.push_str(&alloc::format!("{id}"));
5836        }
5837        s.push(']');
5838        s
5839    } else {
5840        alloc::string::String::new()
5841    };
5842    for (idx, line) in lines.iter_mut().enumerate() {
5843        let trimmed = line.trim_start();
5844        let is_top_level = idx == 0;
5845        if is_top_level {
5846            line.push_str(&alloc::format!(" (rows={total_rows})"));
5847            continue;
5848        }
5849        if let Some(rest) = trimmed.strip_prefix("From: ") {
5850            let (name, scan_kind) = match rest.split_once(" [") {
5851                Some((n, k)) => (n.trim(), k.trim_end_matches(']')),
5852                None => (rest.trim(), ""),
5853            };
5854            let bare = name.split_whitespace().next().unwrap_or(name);
5855            let hot = catalog.get(bare).map(|t| t.rows().len());
5856            // v6.2.7 — `cold_segments=[id0,id1,…]` enumerates every
5857            // cold-tier segment the scan COULD have walked. v6.2.x
5858            // can tighten to per-table by walking the table's
5859            // BTree-index cold locators.
5860            let annot = match (hot, scan_kind) {
5861                (Some(h), "full scan") => {
5862                    let mut s = alloc::format!(" (hot_rows={h}");
5863                    if any_cold {
5864                        s.push_str(&alloc::format!(
5865                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5866                        ));
5867                    }
5868                    s.push(')');
5869                    s
5870                }
5871                (Some(h), "index seek") => {
5872                    let mut s = alloc::format!(" (hot_rows≤{h}");
5873                    if any_cold {
5874                        s.push_str(&alloc::format!(
5875                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5876                        ));
5877                    }
5878                    s.push(')');
5879                    s
5880                }
5881                _ => " (rows=—)".to_string(),
5882            };
5883            line.push_str(&annot);
5884            continue;
5885        }
5886        // Filter / GroupBy / Having / OrderBy / Limit / Join etc.
5887        line.push_str(" (rows=—)");
5888    }
5889}
5890
5891fn explain_select(stmt: &SelectStatement, engine: &Engine, depth: usize, out: &mut Vec<String>) {
5892    let pad = "  ".repeat(depth);
5893    // 1) Top-level operator label.
5894    let top = if !stmt.ctes.is_empty() {
5895        if stmt.ctes.iter().any(|c| c.recursive) {
5896            "CTEScan (WITH RECURSIVE)"
5897        } else {
5898            "CTEScan (WITH)"
5899        }
5900    } else if !stmt.unions.is_empty() {
5901        "UnionScan"
5902    } else if select_has_window(stmt) {
5903        "WindowAgg"
5904    } else if aggregate::uses_aggregate(stmt) {
5905        "Aggregate"
5906    } else if stmt.distinct {
5907        "Distinct"
5908    } else if stmt.from.is_some() {
5909        "TableScan"
5910    } else {
5911        "Result"
5912    };
5913    out.push(alloc::format!("{pad}{top}"));
5914    let child = "  ".repeat(depth + 1);
5915    // 2) CTE bodies.
5916    for cte in &stmt.ctes {
5917        let head = if cte.recursive {
5918            alloc::format!("{child}CTE (recursive): {}", cte.name)
5919        } else {
5920            alloc::format!("{child}CTE: {}", cte.name)
5921        };
5922        out.push(head);
5923        explain_select(&cte.body, engine, depth + 2, out);
5924    }
5925    // 3) FROM details — primary table + joins, index hits.
5926    if let Some(from) = &stmt.from {
5927        let mut tag = alloc::format!("{child}From: {}", from.primary.name);
5928        if let Some(alias) = &from.primary.alias {
5929            tag.push_str(&alloc::format!(" AS {alias}"));
5930        }
5931        // Try to detect an index-seek opportunity on WHERE against
5932        // the primary table — same heuristic the executor uses.
5933        if let Some(w) = &stmt.where_
5934            && let Some(table) = engine.active_catalog().get(&from.primary.name)
5935        {
5936            let alias = from.primary.alias.as_deref().unwrap_or(&from.primary.name);
5937            let cols = &table.schema().columns;
5938            if try_index_seek(w, cols, engine.active_catalog(), table, alias).is_some() {
5939                tag.push_str(" [index seek]");
5940            } else {
5941                tag.push_str(" [full scan]");
5942            }
5943        } else {
5944            tag.push_str(" [full scan]");
5945        }
5946        out.push(tag);
5947        for j in &from.joins {
5948            let kind = match j.kind {
5949                spg_sql::ast::JoinKind::Inner => "INNER JOIN",
5950                spg_sql::ast::JoinKind::Left => "LEFT JOIN",
5951                spg_sql::ast::JoinKind::Cross => "CROSS JOIN",
5952            };
5953            let mut s = alloc::format!("{child}{kind}: {}", j.table.name);
5954            if let Some(alias) = &j.table.alias {
5955                s.push_str(&alloc::format!(" AS {alias}"));
5956            }
5957            if j.on.is_some() {
5958                s.push_str(" (ON …)");
5959            }
5960            out.push(s);
5961        }
5962    }
5963    // 4) WHERE / GROUP BY / HAVING / ORDER BY / LIMIT / OFFSET.
5964    if let Some(w) = &stmt.where_ {
5965        let mut s = alloc::format!("{child}Filter: {w}");
5966        if expr_has_subquery(w) {
5967            s.push_str(" [subquery]");
5968        }
5969        out.push(s);
5970    }
5971    if let Some(gs) = &stmt.group_by {
5972        let mut parts = Vec::new();
5973        for g in gs {
5974            parts.push(alloc::format!("{g}"));
5975        }
5976        out.push(alloc::format!("{child}GroupBy: {}", parts.join(", ")));
5977    }
5978    if let Some(h) = &stmt.having {
5979        out.push(alloc::format!("{child}Having: {h}"));
5980    }
5981    for o in &stmt.order_by {
5982        let dir = if o.desc { "DESC" } else { "ASC" };
5983        out.push(alloc::format!("{child}OrderBy: {} {dir}", o.expr));
5984    }
5985    if let Some(lim) = stmt.limit {
5986        out.push(alloc::format!("{child}Limit: {lim}"));
5987    }
5988    if let Some(off) = stmt.offset {
5989        out.push(alloc::format!("{child}Offset: {off}"));
5990    }
5991    // 5) Projection — collapse Wildcard or render N items.
5992    if stmt
5993        .items
5994        .iter()
5995        .any(|it| matches!(it, SelectItem::Wildcard))
5996    {
5997        out.push(alloc::format!("{child}Project: *"));
5998    } else {
5999        out.push(alloc::format!(
6000            "{child}Project: {} item(s)",
6001            stmt.items.len()
6002        ));
6003    }
6004    // 6) Recurse into UNION peers.
6005    for (kind, peer) in &stmt.unions {
6006        let label = match kind {
6007            UnionKind::All => "UNION ALL",
6008            UnionKind::Distinct => "UNION",
6009        };
6010        out.push(alloc::format!("{child}{label}"));
6011        explain_select(peer, engine, depth + 2, out);
6012    }
6013}
6014
6015/// v4.23: recognise the engine errors that indicate the inner
6016/// SELECT couldn't be evaluated in isolation because it references
6017/// an outer column — used by `subquery_replacement` to skip
6018/// materialisation and let row-eval handle it instead.
6019fn is_correlation_error(e: &EngineError) -> bool {
6020    matches!(
6021        e,
6022        EngineError::Eval(
6023            eval::EvalError::ColumnNotFound { .. } | eval::EvalError::UnknownQualifier { .. }
6024        )
6025    )
6026}
6027
6028/// v4.23: walk every Expr in `stmt` and replace each Column ref
6029/// that targets the outer scope (qualifier matches the outer
6030/// table alias) with a Literal carrying the outer row's value.
6031/// Conservative: only qualified refs are substituted, so the user
6032/// must write `outer_alias.col` to reference an outer column. This
6033/// matches PG's lexical scoping for correlated subqueries and
6034/// avoids accidentally rebinding inner columns of the same name.
6035fn substitute_outer_columns(stmt: &mut SelectStatement, row: &Row, ctx: &EvalContext<'_>) {
6036    let Some(outer_alias) = ctx.table_alias else {
6037        return;
6038    };
6039    substitute_in_select(stmt, row, ctx, outer_alias);
6040}
6041
6042fn substitute_in_select(
6043    stmt: &mut SelectStatement,
6044    row: &Row,
6045    ctx: &EvalContext<'_>,
6046    outer_alias: &str,
6047) {
6048    for item in &mut stmt.items {
6049        if let SelectItem::Expr { expr, .. } = item {
6050            substitute_in_expr(expr, row, ctx, outer_alias);
6051        }
6052    }
6053    if let Some(w) = &mut stmt.where_ {
6054        substitute_in_expr(w, row, ctx, outer_alias);
6055    }
6056    if let Some(gs) = &mut stmt.group_by {
6057        for g in gs {
6058            substitute_in_expr(g, row, ctx, outer_alias);
6059        }
6060    }
6061    if let Some(h) = &mut stmt.having {
6062        substitute_in_expr(h, row, ctx, outer_alias);
6063    }
6064    for o in &mut stmt.order_by {
6065        substitute_in_expr(&mut o.expr, row, ctx, outer_alias);
6066    }
6067    for (_, peer) in &mut stmt.unions {
6068        substitute_in_select(peer, row, ctx, outer_alias);
6069    }
6070}
6071
6072fn substitute_in_expr(e: &mut Expr, row: &Row, ctx: &EvalContext<'_>, outer_alias: &str) {
6073    if let Expr::Column(c) = e
6074        && let Some(qual) = &c.qualifier
6075        && qual.eq_ignore_ascii_case(outer_alias)
6076    {
6077        // Look up the column's index in the outer schema.
6078        if let Some(idx) = ctx
6079            .columns
6080            .iter()
6081            .position(|sc| sc.name.eq_ignore_ascii_case(&c.name))
6082        {
6083            let v = row.values.get(idx).cloned().unwrap_or(Value::Null);
6084            if let Ok(lit) = value_to_literal_expr(v) {
6085                *e = lit;
6086                return;
6087            }
6088        }
6089    }
6090    match e {
6091        Expr::Binary { lhs, rhs, .. } => {
6092            substitute_in_expr(lhs, row, ctx, outer_alias);
6093            substitute_in_expr(rhs, row, ctx, outer_alias);
6094        }
6095        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6096            substitute_in_expr(expr, row, ctx, outer_alias);
6097        }
6098        Expr::Like { expr, pattern, .. } => {
6099            substitute_in_expr(expr, row, ctx, outer_alias);
6100            substitute_in_expr(pattern, row, ctx, outer_alias);
6101        }
6102        Expr::FunctionCall { args, .. } => {
6103            for a in args {
6104                substitute_in_expr(a, row, ctx, outer_alias);
6105            }
6106        }
6107        Expr::Extract { source, .. } => substitute_in_expr(source, row, ctx, outer_alias),
6108        Expr::WindowFunction {
6109            args,
6110            partition_by,
6111            order_by,
6112            ..
6113        } => {
6114            for a in args {
6115                substitute_in_expr(a, row, ctx, outer_alias);
6116            }
6117            for p in partition_by {
6118                substitute_in_expr(p, row, ctx, outer_alias);
6119            }
6120            for (o, _) in order_by {
6121                substitute_in_expr(o, row, ctx, outer_alias);
6122            }
6123        }
6124        Expr::ScalarSubquery(s) => substitute_in_select(s, row, ctx, outer_alias),
6125        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
6126            substitute_in_select(subquery, row, ctx, outer_alias);
6127        }
6128        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
6129        Expr::Array(items) => {
6130            for elem in items {
6131                substitute_in_expr(elem, row, ctx, outer_alias);
6132            }
6133        }
6134        Expr::ArraySubscript { target, index } => {
6135            substitute_in_expr(target, row, ctx, outer_alias);
6136            substitute_in_expr(index, row, ctx, outer_alias);
6137        }
6138        Expr::AnyAll { expr, array, .. } => {
6139            substitute_in_expr(expr, row, ctx, outer_alias);
6140            substitute_in_expr(array, row, ctx, outer_alias);
6141        }
6142    }
6143}
6144
6145/// v4.22: encode a Row to a comparable byte key for UNION-DISTINCT
6146/// dedup inside the recursive iteration. Crude but deterministic
6147/// — Debug prints embed type discriminants so NULL ≠ "" ≠ 0.
6148fn encode_row_key(row: &Row) -> Vec<u8> {
6149    let mut out = Vec::new();
6150    for v in &row.values {
6151        let s = alloc::format!("{v:?}|");
6152        out.extend_from_slice(s.as_bytes());
6153    }
6154    out
6155}
6156
6157fn select_has_window(stmt: &SelectStatement) -> bool {
6158    for item in &stmt.items {
6159        if let SelectItem::Expr { expr, .. } = item
6160            && expr_has_window(expr)
6161        {
6162            return true;
6163        }
6164    }
6165    false
6166}
6167
6168fn expr_has_window(e: &Expr) -> bool {
6169    match e {
6170        Expr::WindowFunction { .. } => true,
6171        Expr::Binary { lhs, rhs, .. } => expr_has_window(lhs) || expr_has_window(rhs),
6172        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6173            expr_has_window(expr)
6174        }
6175        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_window),
6176        Expr::Like { expr, pattern, .. } => expr_has_window(expr) || expr_has_window(pattern),
6177        Expr::Extract { source, .. } => expr_has_window(source),
6178        Expr::ScalarSubquery(_)
6179        | Expr::Exists { .. }
6180        | Expr::InSubquery { .. }
6181        | Expr::Literal(_)
6182        | Expr::Placeholder(_)
6183        | Expr::Column(_) => false,
6184        Expr::Array(items) => items.iter().any(expr_has_window),
6185        Expr::ArraySubscript { target, index } => {
6186            expr_has_window(target) || expr_has_window(index)
6187        }
6188        Expr::AnyAll { expr, array, .. } => {
6189            expr_has_window(expr) || expr_has_window(array)
6190        }
6191    }
6192}
6193
6194fn collect_window_nodes(e: &Expr, out: &mut Vec<Expr>) {
6195    if let Expr::WindowFunction { .. } = e {
6196        // Deduplicate by structural equality on the expression
6197        // (cheap because window args + partition + order are
6198        // small). Without dedup we'd recompute identical windows
6199        // once per occurrence in the projection.
6200        if !out.iter().any(|x| x == e) {
6201            out.push(e.clone());
6202        }
6203        return;
6204    }
6205    match e {
6206        // Already handled by the early-return at the top.
6207        Expr::WindowFunction { .. } => unreachable!(),
6208        Expr::Binary { lhs, rhs, .. } => {
6209            collect_window_nodes(lhs, out);
6210            collect_window_nodes(rhs, out);
6211        }
6212        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6213            collect_window_nodes(expr, out);
6214        }
6215        Expr::FunctionCall { args, .. } => {
6216            for a in args {
6217                collect_window_nodes(a, out);
6218            }
6219        }
6220        Expr::Like { expr, pattern, .. } => {
6221            collect_window_nodes(expr, out);
6222            collect_window_nodes(pattern, out);
6223        }
6224        Expr::Extract { source, .. } => collect_window_nodes(source, out),
6225        _ => {}
6226    }
6227}
6228
6229fn rewrite_window_to_columns(e: &mut Expr, window_nodes: &[Expr]) {
6230    if let Expr::WindowFunction { .. } = e
6231        && let Some(idx) = window_nodes.iter().position(|w| w == e)
6232    {
6233        *e = Expr::Column(spg_sql::ast::ColumnName {
6234            qualifier: None,
6235            name: alloc::format!("__win_{idx}"),
6236        });
6237        return;
6238    }
6239    match e {
6240        Expr::Binary { lhs, rhs, .. } => {
6241            rewrite_window_to_columns(lhs, window_nodes);
6242            rewrite_window_to_columns(rhs, window_nodes);
6243        }
6244        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6245            rewrite_window_to_columns(expr, window_nodes);
6246        }
6247        Expr::FunctionCall { args, .. } => {
6248            for a in args {
6249                rewrite_window_to_columns(a, window_nodes);
6250            }
6251        }
6252        Expr::Like { expr, pattern, .. } => {
6253            rewrite_window_to_columns(expr, window_nodes);
6254            rewrite_window_to_columns(pattern, window_nodes);
6255        }
6256        Expr::Extract { source, .. } => rewrite_window_to_columns(source, window_nodes),
6257        _ => {}
6258    }
6259}
6260
6261/// Total order over partition-key tuples. NULL sorts as the
6262/// lowest value (matches the `<` partial order's NULL-last
6263/// behaviour with `INFINITY` flipped).
6264fn partition_key_cmp(a: &[Value], b: &[Value]) -> core::cmp::Ordering {
6265    for (x, y) in a.iter().zip(b.iter()) {
6266        let c = value_cmp(x, y);
6267        if c != core::cmp::Ordering::Equal {
6268            return c;
6269        }
6270    }
6271    a.len().cmp(&b.len())
6272}
6273
6274fn order_key_cmp(a: &[(Value, bool)], b: &[(Value, bool)]) -> core::cmp::Ordering {
6275    for ((va, desc), (vb, _)) in a.iter().zip(b.iter()) {
6276        let c = value_cmp(va, vb);
6277        let c = if *desc { c.reverse() } else { c };
6278        if c != core::cmp::Ordering::Equal {
6279            return c;
6280        }
6281    }
6282    a.len().cmp(&b.len())
6283}
6284
6285#[allow(clippy::match_same_arms)] // explicit arms per type document the supported pairs
6286fn value_cmp(a: &Value, b: &Value) -> core::cmp::Ordering {
6287    use core::cmp::Ordering;
6288    match (a, b) {
6289        (Value::Null, Value::Null) => Ordering::Equal,
6290        (Value::Null, _) => Ordering::Less,
6291        (_, Value::Null) => Ordering::Greater,
6292        (Value::Int(x), Value::Int(y)) => x.cmp(y),
6293        (Value::BigInt(x), Value::BigInt(y)) => x.cmp(y),
6294        (Value::SmallInt(x), Value::SmallInt(y)) => x.cmp(y),
6295        (Value::Text(x), Value::Text(y)) => x.cmp(y),
6296        (Value::Bool(x), Value::Bool(y)) => x.cmp(y),
6297        (Value::Float(x), Value::Float(y)) => x.partial_cmp(y).unwrap_or(Ordering::Equal),
6298        (Value::Date(x), Value::Date(y)) => x.cmp(y),
6299        (Value::Timestamp(x), Value::Timestamp(y)) => x.cmp(y),
6300        // Cross-type compare: fall back to the debug rendering —
6301        // same-partition is the goal, exact order is irrelevant.
6302        _ => alloc::format!("{a:?}").cmp(&alloc::format!("{b:?}")),
6303    }
6304}
6305
6306/// Compute the window function's per-row output for one partition.
6307/// `slice` has (partition key, order key, original-row-index)
6308/// tuples already sorted by order key. `filtered_rows` is the
6309/// full row list indexed by original-row-index. `out_vals` is
6310/// the destination, also indexed by original-row-index.
6311#[allow(
6312    clippy::too_many_arguments,
6313    clippy::cast_possible_truncation,
6314    clippy::cast_possible_wrap,
6315    clippy::cast_precision_loss,
6316    clippy::cast_sign_loss,
6317    clippy::doc_markdown,
6318    clippy::too_many_lines,
6319    clippy::type_complexity,
6320    clippy::match_same_arms
6321)]
6322fn compute_window_partition(
6323    name: &str,
6324    args: &[Expr],
6325    ordered: bool,
6326    frame: Option<&WindowFrame>,
6327    null_treatment: spg_sql::ast::NullTreatment,
6328    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6329    filtered_rows: &[&Row],
6330    ctx: &EvalContext<'_>,
6331    out_vals: &mut [Value],
6332) -> Result<(), EngineError> {
6333    let ignore_nulls = matches!(null_treatment, spg_sql::ast::NullTreatment::Ignore);
6334    let lower = name.to_ascii_lowercase();
6335    match lower.as_str() {
6336        "row_number" => {
6337            for (rank, (_, _, idx)) in slice.iter().enumerate() {
6338                out_vals[*idx] = Value::BigInt((rank + 1) as i64);
6339            }
6340            Ok(())
6341        }
6342        "rank" => {
6343            let mut prev_key: Option<&[(Value, bool)]> = None;
6344            let mut current_rank: i64 = 1;
6345            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6346                if let Some(p) = prev_key
6347                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6348                {
6349                    current_rank = (i + 1) as i64;
6350                }
6351                if prev_key.is_none() {
6352                    current_rank = 1;
6353                }
6354                out_vals[*idx] = Value::BigInt(current_rank);
6355                prev_key = Some(okey.as_slice());
6356            }
6357            Ok(())
6358        }
6359        "dense_rank" => {
6360            let mut prev_key: Option<&[(Value, bool)]> = None;
6361            let mut current_rank: i64 = 0;
6362            for (_, okey, idx) in slice {
6363                if prev_key.is_none_or(|p| order_key_cmp(p, okey) != core::cmp::Ordering::Equal) {
6364                    current_rank += 1;
6365                }
6366                out_vals[*idx] = Value::BigInt(current_rank);
6367                prev_key = Some(okey.as_slice());
6368            }
6369            Ok(())
6370        }
6371        "sum" | "avg" | "min" | "max" | "count" | "count_star" => {
6372            // Pre-evaluate the function arg per row in the slice
6373            // (count_star has no arg).
6374            let arg_values: Vec<Value> = if lower == "count_star" || args.is_empty() {
6375                slice.iter().map(|_| Value::Null).collect()
6376            } else {
6377                slice
6378                    .iter()
6379                    .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6380                    .collect::<Result<_, _>>()
6381                    .map_err(EngineError::Eval)?
6382            };
6383            // v4.20: pick the effective frame. Explicit frame
6384            // overrides the implicit default (running for ordered,
6385            // whole-partition for unordered).
6386            let eff = effective_frame(frame, ordered)?;
6387            #[allow(clippy::needless_range_loop)]
6388            for i in 0..slice.len() {
6389                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6390                let mut sum: f64 = 0.0;
6391                let mut count: i64 = 0;
6392                let mut min_v: Option<f64> = None;
6393                let mut max_v: Option<f64> = None;
6394                let mut row_count: i64 = 0;
6395                if lo <= hi {
6396                    for j in lo..=hi {
6397                        let v = &arg_values[j];
6398                        match lower.as_str() {
6399                            "count_star" => row_count += 1,
6400                            "count" => {
6401                                if !v.is_null() {
6402                                    count += 1;
6403                                }
6404                            }
6405                            _ => {
6406                                if let Some(x) = value_to_f64(v) {
6407                                    sum += x;
6408                                    count += 1;
6409                                    min_v = Some(min_v.map_or(x, |m| m.min(x)));
6410                                    max_v = Some(max_v.map_or(x, |m| m.max(x)));
6411                                }
6412                            }
6413                        }
6414                    }
6415                }
6416                let value = match lower.as_str() {
6417                    "count_star" => Value::BigInt(row_count),
6418                    "count" => Value::BigInt(count),
6419                    "sum" => Value::Float(sum),
6420                    "avg" => {
6421                        if count == 0 {
6422                            Value::Null
6423                        } else {
6424                            Value::Float(sum / count as f64)
6425                        }
6426                    }
6427                    "min" => min_v.map_or(Value::Null, Value::Float),
6428                    "max" => max_v.map_or(Value::Null, Value::Float),
6429                    _ => unreachable!(),
6430                };
6431                let (_, _, idx) = &slice[i];
6432                out_vals[*idx] = value;
6433            }
6434            Ok(())
6435        }
6436        "lag" | "lead" => {
6437            // lag(expr [, offset [, default]])
6438            // lead(expr [, offset [, default]])
6439            if args.is_empty() {
6440                return Err(EngineError::Unsupported(alloc::format!(
6441                    "{lower}() requires at least one argument"
6442                )));
6443            }
6444            let offset: i64 = if args.len() >= 2 {
6445                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6446                    .map_err(EngineError::Eval)?;
6447                match v {
6448                    Value::SmallInt(n) => i64::from(n),
6449                    Value::Int(n) => i64::from(n),
6450                    Value::BigInt(n) => n,
6451                    _ => {
6452                        return Err(EngineError::Unsupported(alloc::format!(
6453                            "{lower}() offset must be integer"
6454                        )));
6455                    }
6456                }
6457            } else {
6458                1
6459            };
6460            let default: Value = if args.len() >= 3 {
6461                eval::eval_expr(&args[2], filtered_rows[slice[0].2], ctx)
6462                    .map_err(EngineError::Eval)?
6463            } else {
6464                Value::Null
6465            };
6466            let values: Vec<Value> = slice
6467                .iter()
6468                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6469                .collect::<Result<_, _>>()
6470                .map_err(EngineError::Eval)?;
6471            let n = slice.len();
6472            for (i, (_, _, idx)) in slice.iter().enumerate() {
6473                let signed_offset = if lower == "lag" { -offset } else { offset };
6474                let v = if ignore_nulls {
6475                    // v6.4.2 — IGNORE NULLS: walk in the offset direction
6476                    // skipping NULL values; the `offset`-th non-NULL
6477                    // encountered is the result.
6478                    let step: i64 = if signed_offset >= 0 { 1 } else { -1 };
6479                    let needed: i64 = signed_offset.abs();
6480                    if needed == 0 {
6481                        values[i].clone()
6482                    } else {
6483                        let mut j: i64 = i as i64;
6484                        let mut hits: i64 = 0;
6485                        let mut found: Option<Value> = None;
6486                        loop {
6487                            j += step;
6488                            if j < 0 || j >= n as i64 {
6489                                break;
6490                            }
6491                            #[allow(clippy::cast_sign_loss)]
6492                            let v = &values[j as usize];
6493                            if !v.is_null() {
6494                                hits += 1;
6495                                if hits == needed {
6496                                    found = Some(v.clone());
6497                                    break;
6498                                }
6499                            }
6500                        }
6501                        found.unwrap_or_else(|| default.clone())
6502                    }
6503                } else {
6504                    let target_signed = i64::try_from(i).unwrap_or(i64::MAX) + signed_offset;
6505                    if target_signed < 0
6506                        || target_signed >= i64::try_from(n).unwrap_or(i64::MAX)
6507                    {
6508                        default.clone()
6509                    } else {
6510                        #[allow(clippy::cast_sign_loss)]
6511                        {
6512                            values[target_signed as usize].clone()
6513                        }
6514                    }
6515                };
6516                out_vals[*idx] = v;
6517            }
6518            Ok(())
6519        }
6520        "first_value" | "last_value" | "nth_value" => {
6521            if args.is_empty() {
6522                return Err(EngineError::Unsupported(alloc::format!(
6523                    "{lower}() requires at least one argument"
6524                )));
6525            }
6526            let values: Vec<Value> = slice
6527                .iter()
6528                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6529                .collect::<Result<_, _>>()
6530                .map_err(EngineError::Eval)?;
6531            let nth: usize = if lower == "nth_value" {
6532                if args.len() < 2 {
6533                    return Err(EngineError::Unsupported(
6534                        "nth_value() requires (expr, n)".into(),
6535                    ));
6536                }
6537                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6538                    .map_err(EngineError::Eval)?;
6539                let raw = match v {
6540                    Value::SmallInt(n) => i64::from(n),
6541                    Value::Int(n) => i64::from(n),
6542                    Value::BigInt(n) => n,
6543                    _ => {
6544                        return Err(EngineError::Unsupported(
6545                            "nth_value() n must be integer".into(),
6546                        ));
6547                    }
6548                };
6549                if raw < 1 {
6550                    return Err(EngineError::Unsupported(
6551                        "nth_value() n must be >= 1".into(),
6552                    ));
6553                }
6554                #[allow(clippy::cast_sign_loss)]
6555                {
6556                    raw as usize
6557                }
6558            } else {
6559                0
6560            };
6561            let eff = effective_frame(frame, ordered)?;
6562            for i in 0..slice.len() {
6563                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6564                let (_, _, idx) = &slice[i];
6565                let v = if lo > hi {
6566                    Value::Null
6567                } else if ignore_nulls && matches!(lower.as_str(), "first_value" | "last_value") {
6568                    // v6.4.2 — IGNORE NULLS: skip NULL cells when
6569                    // selecting the boundary value within the frame.
6570                    if lower == "first_value" {
6571                        (lo..=hi)
6572                            .find_map(|j| {
6573                                let v = &values[j];
6574                                (!v.is_null()).then(|| v.clone())
6575                            })
6576                            .unwrap_or(Value::Null)
6577                    } else {
6578                        (lo..=hi)
6579                            .rev()
6580                            .find_map(|j| {
6581                                let v = &values[j];
6582                                (!v.is_null()).then(|| v.clone())
6583                            })
6584                            .unwrap_or(Value::Null)
6585                    }
6586                } else {
6587                    match lower.as_str() {
6588                        "first_value" => values[lo].clone(),
6589                        "last_value" => values[hi].clone(),
6590                        "nth_value" => {
6591                            let pos = lo + nth - 1;
6592                            if pos > hi {
6593                                Value::Null
6594                            } else {
6595                                values[pos].clone()
6596                            }
6597                        }
6598                        _ => unreachable!(),
6599                    }
6600                };
6601                out_vals[*idx] = v;
6602            }
6603            Ok(())
6604        }
6605        "ntile" => {
6606            if args.is_empty() {
6607                return Err(EngineError::Unsupported(
6608                    "ntile(n) requires an integer argument".into(),
6609                ));
6610            }
6611            let v = eval::eval_expr(&args[0], filtered_rows[slice[0].2], ctx)
6612                .map_err(EngineError::Eval)?;
6613            let bucket_count: i64 = match v {
6614                Value::SmallInt(n) => i64::from(n),
6615                Value::Int(n) => i64::from(n),
6616                Value::BigInt(n) => n,
6617                _ => {
6618                    return Err(EngineError::Unsupported(
6619                        "ntile() argument must be integer".into(),
6620                    ));
6621                }
6622            };
6623            if bucket_count < 1 {
6624                return Err(EngineError::Unsupported(
6625                    "ntile() argument must be >= 1".into(),
6626                ));
6627            }
6628            #[allow(clippy::cast_sign_loss)]
6629            let buckets = bucket_count as usize;
6630            let n = slice.len();
6631            // Each bucket gets `base` rows; the first `extras` buckets
6632            // get one extra. PG semantics.
6633            let base = n / buckets;
6634            let extras = n % buckets;
6635            let mut bucket: usize = 1;
6636            let mut remaining_in_bucket = if extras > 0 { base + 1 } else { base };
6637            let mut buckets_with_extra_remaining = extras;
6638            for (_, _, idx) in slice {
6639                if remaining_in_bucket == 0 {
6640                    bucket += 1;
6641                    buckets_with_extra_remaining = buckets_with_extra_remaining.saturating_sub(1);
6642                    remaining_in_bucket = if buckets_with_extra_remaining > 0 {
6643                        base + 1
6644                    } else {
6645                        base
6646                    };
6647                    // Edge: if base==0 and extras==0, all rows fit;
6648                    // shouldn't reach here, but guard anyway.
6649                    if remaining_in_bucket == 0 {
6650                        remaining_in_bucket = 1;
6651                    }
6652                }
6653                out_vals[*idx] = Value::BigInt(i64::try_from(bucket).unwrap_or(i64::MAX));
6654                remaining_in_bucket -= 1;
6655            }
6656            Ok(())
6657        }
6658        "percent_rank" => {
6659            // (rank - 1) / (n - 1) where rank is the standard RANK().
6660            // Single-row partitions get 0.
6661            let n = slice.len();
6662            let mut prev_key: Option<&[(Value, bool)]> = None;
6663            let mut current_rank: i64 = 1;
6664            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6665                if let Some(p) = prev_key
6666                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6667                {
6668                    current_rank = i64::try_from(i + 1).unwrap_or(i64::MAX);
6669                }
6670                if prev_key.is_none() {
6671                    current_rank = 1;
6672                }
6673                #[allow(clippy::cast_precision_loss)]
6674                let pr = if n <= 1 {
6675                    0.0
6676                } else {
6677                    (current_rank - 1) as f64 / (n - 1) as f64
6678                };
6679                out_vals[*idx] = Value::Float(pr);
6680                prev_key = Some(okey.as_slice());
6681            }
6682            Ok(())
6683        }
6684        "cume_dist" => {
6685            // # rows up to and including this row's peer group / n.
6686            let n = slice.len();
6687            // First pass: find peer-group-end rank for each row.
6688            for i in 0..slice.len() {
6689                let peer_end = peer_group_end(slice, i);
6690                #[allow(clippy::cast_precision_loss)]
6691                let cd = (peer_end + 1) as f64 / n as f64;
6692                let (_, _, idx) = &slice[i];
6693                out_vals[*idx] = Value::Float(cd);
6694            }
6695            Ok(())
6696        }
6697        other => Err(EngineError::Unsupported(alloc::format!(
6698            "window function {other:?} not supported (v4.21: row_number/rank/dense_rank/sum/avg/count/min/max/lag/lead/first_value/last_value/nth_value/ntile/percent_rank/cume_dist)"
6699        ))),
6700    }
6701}
6702
6703/// v4.20: resolve the user-provided frame down to a normalised
6704/// `(kind, start, end)`. `None` means default — derive from
6705/// `ordered`: ordered ⇒ RANGE UNBOUNDED PRECEDING AND CURRENT ROW,
6706/// unordered ⇒ ROWS UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
6707/// Single-bound shorthand (e.g. `ROWS 5 PRECEDING`) normalises
6708/// end → CURRENT ROW per the PG spec.
6709fn effective_frame(
6710    frame: Option<&WindowFrame>,
6711    ordered: bool,
6712) -> Result<(FrameKind, FrameBound, FrameBound), EngineError> {
6713    match frame {
6714        None => {
6715            if ordered {
6716                Ok((
6717                    FrameKind::Range,
6718                    FrameBound::UnboundedPreceding,
6719                    FrameBound::CurrentRow,
6720                ))
6721            } else {
6722                Ok((
6723                    FrameKind::Rows,
6724                    FrameBound::UnboundedPreceding,
6725                    FrameBound::UnboundedFollowing,
6726                ))
6727            }
6728        }
6729        Some(fr) => {
6730            let end = fr.end.clone().unwrap_or(FrameBound::CurrentRow);
6731            // Reject start > end (a few impossible combinations).
6732            if matches!(fr.start, FrameBound::UnboundedFollowing)
6733                || matches!(end, FrameBound::UnboundedPreceding)
6734            {
6735                return Err(EngineError::Unsupported(alloc::format!(
6736                    "invalid frame: start={:?} end={:?}",
6737                    fr.start,
6738                    end
6739                )));
6740            }
6741            // RANGE OFFSET PRECEDING / FOLLOWING needs value-typed
6742            // arithmetic on the ORDER BY key (e.g. `RANGE BETWEEN
6743            // INTERVAL '1 day' PRECEDING AND CURRENT ROW`). Not
6744            // implemented in v4.20.
6745            if fr.kind == FrameKind::Range
6746                && (matches!(
6747                    fr.start,
6748                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6749                ) || matches!(
6750                    end,
6751                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6752                ))
6753            {
6754                return Err(EngineError::Unsupported(
6755                    "RANGE with explicit offset bounds is not supported (v4.20: only UNBOUNDED / CURRENT ROW for RANGE)".into(),
6756                ));
6757            }
6758            Ok((fr.kind, fr.start.clone(), end))
6759        }
6760    }
6761}
6762
6763/// Compute `(lo, hi)` row-index bounds inside the partition slice
6764/// for the row at position `i`. Inclusive, clamped to
6765/// `[0, slice.len()-1]`. Empty result if `lo > hi`.
6766#[allow(clippy::type_complexity)]
6767fn frame_bounds_for_row(
6768    eff: &(FrameKind, FrameBound, FrameBound),
6769    i: usize,
6770    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6771) -> (usize, usize) {
6772    let (kind, start, end) = eff;
6773    let n = slice.len();
6774    let last = n.saturating_sub(1);
6775    let (mut lo, mut hi) = match kind {
6776        FrameKind::Rows => {
6777            let lo = match start {
6778                FrameBound::UnboundedPreceding => 0,
6779                FrameBound::OffsetPreceding(k) => {
6780                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6781                    i.saturating_sub(k)
6782                }
6783                FrameBound::CurrentRow => i,
6784                FrameBound::OffsetFollowing(k) => {
6785                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6786                    i.saturating_add(k).min(last)
6787                }
6788                FrameBound::UnboundedFollowing => last,
6789            };
6790            let hi = match end {
6791                FrameBound::UnboundedPreceding => 0,
6792                FrameBound::OffsetPreceding(k) => {
6793                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6794                    i.saturating_sub(k)
6795                }
6796                FrameBound::CurrentRow => i,
6797                FrameBound::OffsetFollowing(k) => {
6798                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6799                    i.saturating_add(k).min(last)
6800                }
6801                FrameBound::UnboundedFollowing => last,
6802            };
6803            (lo, hi)
6804        }
6805        FrameKind::Range => {
6806            // RANGE bounds are peer-aware. With only UNBOUNDED and
6807            // CURRENT ROW supported (rejected at effective_frame for
6808            // explicit offsets), the start/end map to the
6809            // partition's full extent at the same-order-key peer
6810            // group boundary.
6811            let lo = match start {
6812                FrameBound::UnboundedPreceding => 0,
6813                FrameBound::CurrentRow => peer_group_start(slice, i),
6814                FrameBound::UnboundedFollowing => last,
6815                _ => unreachable!("offset bounds rejected for RANGE"),
6816            };
6817            let hi = match end {
6818                FrameBound::UnboundedPreceding => 0,
6819                FrameBound::CurrentRow => peer_group_end(slice, i),
6820                FrameBound::UnboundedFollowing => last,
6821                _ => unreachable!("offset bounds rejected for RANGE"),
6822            };
6823            (lo, hi)
6824        }
6825    };
6826    if hi >= n {
6827        hi = last;
6828    }
6829    if lo >= n {
6830        lo = last;
6831    }
6832    (lo, hi)
6833}
6834
6835/// Find the inclusive index of the first row with the same ORDER
6836/// BY key as `slice[i]`. Slice is already sorted by partition then
6837/// order, so peers are contiguous.
6838#[allow(clippy::type_complexity)]
6839fn peer_group_start(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6840    let key = &slice[i].1;
6841    let mut j = i;
6842    while j > 0 && order_key_cmp(&slice[j - 1].1, key) == core::cmp::Ordering::Equal {
6843        j -= 1;
6844    }
6845    j
6846}
6847
6848/// Find the inclusive index of the last row with the same ORDER
6849/// BY key as `slice[i]`.
6850#[allow(clippy::type_complexity)]
6851fn peer_group_end(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6852    let key = &slice[i].1;
6853    let mut j = i;
6854    while j + 1 < slice.len() && order_key_cmp(&slice[j + 1].1, key) == core::cmp::Ordering::Equal {
6855        j += 1;
6856    }
6857    j
6858}
6859
6860fn value_to_f64(v: &Value) -> Option<f64> {
6861    match v {
6862        Value::SmallInt(n) => Some(f64::from(*n)),
6863        Value::Int(n) => Some(f64::from(*n)),
6864        #[allow(clippy::cast_precision_loss)]
6865        Value::BigInt(n) => Some(*n as f64),
6866        Value::Float(x) => Some(*x),
6867        _ => None,
6868    }
6869}
6870
6871/// Quick scan for any subquery-bearing node in a SELECT's WHERE /
6872/// projection / `order_by` — saves cloning the AST when there are
6873/// none (the common case).
6874fn expr_tree_has_subquery(stmt: &SelectStatement) -> bool {
6875    let mut any = false;
6876    for item in &stmt.items {
6877        if let SelectItem::Expr { expr, .. } = item {
6878            any = any || expr_has_subquery(expr);
6879        }
6880    }
6881    if let Some(w) = &stmt.where_ {
6882        any = any || expr_has_subquery(w);
6883    }
6884    if let Some(h) = &stmt.having {
6885        any = any || expr_has_subquery(h);
6886    }
6887    for o in &stmt.order_by {
6888        any = any || expr_has_subquery(&o.expr);
6889    }
6890    for (_, peer) in &stmt.unions {
6891        any = any || expr_tree_has_subquery(peer);
6892    }
6893    any
6894}
6895
6896fn expr_has_subquery(e: &Expr) -> bool {
6897    match e {
6898        Expr::ScalarSubquery(_) | Expr::Exists { .. } | Expr::InSubquery { .. } => true,
6899        Expr::Binary { lhs, rhs, .. } => expr_has_subquery(lhs) || expr_has_subquery(rhs),
6900        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6901            expr_has_subquery(expr)
6902        }
6903        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_subquery),
6904        Expr::Like { expr, pattern, .. } => expr_has_subquery(expr) || expr_has_subquery(pattern),
6905        Expr::Extract { source, .. } => expr_has_subquery(source),
6906        Expr::WindowFunction {
6907            args,
6908            partition_by,
6909            order_by,
6910            ..
6911        } => {
6912            args.iter().any(expr_has_subquery)
6913                || partition_by.iter().any(expr_has_subquery)
6914                || order_by.iter().any(|(e, _)| expr_has_subquery(e))
6915        }
6916        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
6917        Expr::Array(items) => items.iter().any(expr_has_subquery),
6918        Expr::ArraySubscript { target, index } => {
6919            expr_has_subquery(target) || expr_has_subquery(index)
6920        }
6921        Expr::AnyAll { expr, array, .. } => {
6922            expr_has_subquery(expr) || expr_has_subquery(array)
6923        }
6924    }
6925}
6926
6927/// v4.10 helper: materialise a runtime `Value` back into an AST
6928/// `Expr::Literal` for the subquery-rewrite path. Supports the
6929/// types `Literal` can represent (Integer / Float / Text / Bool /
6930/// Null). Date / Timestamp / Numeric / Vector / Interval / JSON
6931/// would lose precision through Literal and aren't supported in
6932/// uncorrelated-subquery results; they error with a clear hint.
6933fn value_to_literal_expr(v: Value) -> Result<Expr, EngineError> {
6934    let lit = match v {
6935        Value::Null => Literal::Null,
6936        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
6937        Value::Int(n) => Literal::Integer(i64::from(n)),
6938        Value::BigInt(n) => Literal::Integer(n),
6939        Value::Float(x) => Literal::Float(x),
6940        Value::Text(s) | Value::Json(s) => Literal::String(s),
6941        Value::Bool(b) => Literal::Bool(b),
6942        other => {
6943            return Err(EngineError::Unsupported(alloc::format!(
6944                "subquery result type {:?} not yet materialisable; cast to text or integer in the inner SELECT",
6945                other.data_type()
6946            )));
6947        }
6948    };
6949    Ok(Expr::Literal(lit))
6950}
6951
6952/// v6.1.1 — walk the prepared `Statement` AST and replace every
6953/// `Expr::Placeholder(n)` with `Expr::Literal(value_to_literal(
6954/// params[n-1]))`. The dispatch downstream sees a `Statement`
6955/// indistinguishable from a simple-query parse, so the exec path
6956/// stays unchanged.
6957///
6958/// Errors fall into one shape: a `$N` references past the bound
6959/// `params.len()`. Out-of-range happens when the Bind didn't
6960/// supply enough values; pgwire surfaces this as a protocol error
6961/// to the client.
6962fn substitute_placeholders(stmt: &mut Statement, params: &[Value]) -> Result<(), EngineError> {
6963    match stmt {
6964        Statement::Select(s) => substitute_select(s, params)?,
6965        Statement::Insert(ins) => {
6966            for row in &mut ins.rows {
6967                for e in row {
6968                    substitute_expr(e, params)?;
6969                }
6970            }
6971        }
6972        Statement::Update(u) => {
6973            for (_, e) in &mut u.assignments {
6974                substitute_expr(e, params)?;
6975            }
6976            if let Some(w) = &mut u.where_ {
6977                substitute_expr(w, params)?;
6978            }
6979        }
6980        Statement::Delete(d) => {
6981            if let Some(w) = &mut d.where_ {
6982                substitute_expr(w, params)?;
6983            }
6984        }
6985        Statement::Explain(e) => substitute_select(&mut e.inner, params)?,
6986        // Other statements (CREATE / BEGIN / SHOW / …) have no
6987        // expression slots; no walk needed.
6988        _ => {}
6989    }
6990    Ok(())
6991}
6992
6993fn substitute_select(
6994    s: &mut SelectStatement,
6995    params: &[Value],
6996) -> Result<(), EngineError> {
6997    for item in &mut s.items {
6998        if let SelectItem::Expr { expr, .. } = item {
6999            substitute_expr(expr, params)?;
7000        }
7001    }
7002    if let Some(w) = &mut s.where_ {
7003        substitute_expr(w, params)?;
7004    }
7005    if let Some(gs) = &mut s.group_by {
7006        for g in gs {
7007            substitute_expr(g, params)?;
7008        }
7009    }
7010    if let Some(h) = &mut s.having {
7011        substitute_expr(h, params)?;
7012    }
7013    for o in &mut s.order_by {
7014        substitute_expr(&mut o.expr, params)?;
7015    }
7016    for (_, peer) in &mut s.unions {
7017        substitute_select(peer, params)?;
7018    }
7019    // v7.9.24 — LIMIT $N / OFFSET $N placeholder resolution.
7020    // mailrs H2. After this pass each LIMIT/OFFSET that was a
7021    // Placeholder is rewritten to Literal so the existing
7022    // `LimitExpr::as_literal` path consumes a concrete u32.
7023    if let Some(le) = s.limit {
7024        s.limit = Some(resolve_limit_placeholder(le, params)?);
7025    }
7026    if let Some(le) = s.offset {
7027        s.offset = Some(resolve_limit_placeholder(le, params)?);
7028    }
7029    Ok(())
7030}
7031
7032fn resolve_limit_placeholder(
7033    le: spg_sql::ast::LimitExpr,
7034    params: &[Value],
7035) -> Result<spg_sql::ast::LimitExpr, EngineError> {
7036    use spg_sql::ast::LimitExpr;
7037    match le {
7038        LimitExpr::Literal(_) => Ok(le),
7039        LimitExpr::Placeholder(n) => {
7040            let idx = usize::from(n).saturating_sub(1);
7041            let v = params.get(idx).ok_or_else(|| {
7042                EngineError::Eval(EvalError::PlaceholderOutOfRange {
7043                    n,
7044                    bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
7045                })
7046            })?;
7047            let int = match v {
7048                Value::SmallInt(x) => Some(i64::from(*x)),
7049                Value::Int(x) => Some(i64::from(*x)),
7050                Value::BigInt(x) => Some(*x),
7051                _ => None,
7052            }
7053            .ok_or_else(|| {
7054                EngineError::Unsupported(alloc::format!(
7055                    "LIMIT/OFFSET ${n} bound to non-integer {v:?}"
7056                ))
7057            })?;
7058            if int < 0 {
7059                return Err(EngineError::Unsupported(alloc::format!(
7060                    "LIMIT/OFFSET ${n} bound to negative value {int}"
7061                )));
7062            }
7063            let bounded = u32::try_from(int).map_err(|_| {
7064                EngineError::Unsupported(alloc::format!(
7065                    "LIMIT/OFFSET ${n} value {int} exceeds u32 range"
7066                ))
7067            })?;
7068            Ok(LimitExpr::Literal(bounded))
7069        }
7070    }
7071}
7072
7073fn substitute_expr(e: &mut Expr, params: &[Value]) -> Result<(), EngineError> {
7074    if let Expr::Placeholder(n) = e {
7075        let idx = usize::from(*n).saturating_sub(1);
7076        let v = params.get(idx).ok_or_else(|| {
7077            EngineError::Eval(EvalError::PlaceholderOutOfRange {
7078                n: *n,
7079                bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
7080            })
7081        })?;
7082        *e = Expr::Literal(value_to_literal(v.clone()));
7083        return Ok(());
7084    }
7085    match e {
7086        Expr::Binary { lhs, rhs, .. } => {
7087            substitute_expr(lhs, params)?;
7088            substitute_expr(rhs, params)?;
7089        }
7090        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7091            substitute_expr(expr, params)?;
7092        }
7093        Expr::FunctionCall { args, .. } => {
7094            for a in args {
7095                substitute_expr(a, params)?;
7096            }
7097        }
7098        Expr::Like { expr, pattern, .. } => {
7099            substitute_expr(expr, params)?;
7100            substitute_expr(pattern, params)?;
7101        }
7102        Expr::Extract { source, .. } => substitute_expr(source, params)?,
7103        Expr::ScalarSubquery(s) => substitute_select(s, params)?,
7104        Expr::Exists { subquery, .. } => substitute_select(subquery, params)?,
7105        Expr::InSubquery { expr, subquery, .. } => {
7106            substitute_expr(expr, params)?;
7107            substitute_select(subquery, params)?;
7108        }
7109        Expr::WindowFunction {
7110            args,
7111            partition_by,
7112            order_by,
7113            ..
7114        } => {
7115            for a in args {
7116                substitute_expr(a, params)?;
7117            }
7118            for p in partition_by {
7119                substitute_expr(p, params)?;
7120            }
7121            for (e, _) in order_by {
7122                substitute_expr(e, params)?;
7123            }
7124        }
7125        Expr::Literal(_) | Expr::Column(_) => {}
7126        // Already handled above.
7127        Expr::Placeholder(_) => unreachable!("Placeholder handled at top of fn"),
7128        Expr::Array(items) => {
7129            for elem in items {
7130                substitute_expr(elem, params)?;
7131            }
7132        }
7133        Expr::ArraySubscript { target, index } => {
7134            substitute_expr(target, params)?;
7135            substitute_expr(index, params)?;
7136        }
7137        Expr::AnyAll { expr, array, .. } => {
7138            substitute_expr(expr, params)?;
7139            substitute_expr(array, params)?;
7140        }
7141    }
7142    Ok(())
7143}
7144
// NOTE: this paragraph documents `value_to_literal` (defined further
// down in this file). It is kept as a plain comment so it does not
// attach to the histogram comparator that follows.
//
// v6.1.1 — convert a runtime `Value` into the closest matching
// `Literal` for the substitute walker. Lossless for the simple
// scalars (Int / Float / Text / Bool); Numeric / Date / Timestamp
// / Json render as their canonical text form so the downstream
// coerce_value can re-parse against the target column type, and
// Interval keeps its `months` / `micros` alongside the rendered
// text. SQ8 / HalfVector cells are NOT expected as bind params;
// pgwire's Bind decodes vector params to the f32 representation
// before they reach this helper.
7153/// v6.2.0 — total ordering on `Value`s used by ANALYZE to sort a
7154/// column's non-NULL sample before histogram building. Cross-type
7155/// pairs (Int vs Float, Date vs Timestamp, …) compare via the
7156/// same widening the eval-side `compare` operator uses; everything
7157/// else (the genuinely-incompatible pairs) falls back to ordering
7158/// by canonical string form so the sort is still total + stable.
7159/// Vector / SQ8 / Half / Json / Numeric / Interval values reach
7160/// here only via the string-fallback path because vector columns
7161/// are filtered out upstream.
7162fn sort_values_for_histogram(a: &Value, b: &Value) -> core::cmp::Ordering {
7163    use core::cmp::Ordering;
7164    match (a, b) {
7165        (Value::SmallInt(a), Value::SmallInt(b)) => a.cmp(b),
7166        (Value::Int(a), Value::Int(b)) => a.cmp(b),
7167        (Value::BigInt(a), Value::BigInt(b)) => a.cmp(b),
7168        (Value::SmallInt(a), Value::Int(b)) => i32::from(*a).cmp(b),
7169        (Value::Int(a), Value::SmallInt(b)) => a.cmp(&i32::from(*b)),
7170        (Value::Int(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
7171        (Value::BigInt(a), Value::Int(b)) => a.cmp(&i64::from(*b)),
7172        (Value::SmallInt(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
7173        (Value::BigInt(a), Value::SmallInt(b)) => a.cmp(&i64::from(*b)),
7174        (Value::Float(a), Value::Float(b)) => a.partial_cmp(b).unwrap_or(Ordering::Equal),
7175        (Value::Text(a), Value::Text(b)) | (Value::Json(a), Value::Json(b)) => a.cmp(b),
7176        (Value::Bool(a), Value::Bool(b)) => a.cmp(b),
7177        (Value::Date(a), Value::Date(b)) => a.cmp(b),
7178        (Value::Timestamp(a), Value::Timestamp(b)) => a.cmp(b),
7179        // Mixed numeric/float — widen to f64 and compare.
7180        (Value::SmallInt(n), Value::Float(x)) => {
7181            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
7182        }
7183        (Value::Float(x), Value::SmallInt(n)) => {
7184            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
7185        }
7186        (Value::Int(n), Value::Float(x)) => {
7187            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
7188        }
7189        (Value::Float(x), Value::Int(n)) => {
7190            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
7191        }
7192        (Value::BigInt(n), Value::Float(x)) => {
7193            #[allow(clippy::cast_precision_loss)]
7194            let nf = *n as f64;
7195            nf.partial_cmp(x).unwrap_or(Ordering::Equal)
7196        }
7197        (Value::Float(x), Value::BigInt(n)) => {
7198            #[allow(clippy::cast_precision_loss)]
7199            let nf = *n as f64;
7200            x.partial_cmp(&nf).unwrap_or(Ordering::Equal)
7201        }
7202        // Cross-type fallback: lexicographic on canonical form.
7203        // Total + stable so the sort is well-defined.
7204        _ => canonical_value_repr(a).cmp(&canonical_value_repr(b)),
7205    }
7206}
7207
/// v6.2.0 — format histogram bounds as a `[v0, v1, ...]` string for the
/// `spg_statistic.histogram_bounds` column.
///
/// Entries are joined with `", "`. An entry is wrapped in double quotes
/// when it is empty or contains `,`, `[`, `]` or `"`; inside the quotes
/// every `"` and `\` is prefixed with a backslash. All other entries are
/// emitted verbatim.
fn render_histogram_bounds(bounds: &[alloc::string::String]) -> alloc::string::String {
    let mut rendered = alloc::string::String::with_capacity(bounds.len() * 8 + 2);
    rendered.push('[');
    // Empty before the first entry, ", " before every later one.
    let mut separator = "";
    for bound in bounds {
        rendered.push_str(separator);
        separator = ", ";

        let must_quote =
            bound.is_empty() || bound.chars().any(|c| matches!(c, ',' | '[' | ']' | '"'));
        if !must_quote {
            rendered.push_str(bound);
            continue;
        }

        rendered.push('"');
        for c in bound.chars() {
            if matches!(c, '"' | '\\') {
                rendered.push('\\');
            }
            rendered.push(c);
        }
        rendered.push('"');
    }
    rendered.push(']');
    rendered
}
7238
7239/// v6.2.0 — canonical textual form of a `Value` for histogram
7240/// bound storage. Strings used by ANALYZE for sort + bound output.
7241/// INT / BIGINT → decimal; FLOAT → shortest-round-trip via
7242/// `{:?}`; TEXT pass-through; BOOL → `t` / `f`; DATE / TIMESTAMP →
7243/// the same form `format_date` / `format_timestamp` produce for
7244/// SQL Display. Vector / SQ8 / Half / Json / Numeric / Interval
7245/// reach this only via a non-Vector column (vector columns are
7246/// skipped upstream); they fall back to a Debug-derived form so
7247/// stats still serialise without crashing.
7248pub(crate) fn canonical_value_repr(v: &Value) -> alloc::string::String {
7249    match v {
7250        Value::Null => "NULL".to_string(),
7251        Value::SmallInt(n) => alloc::format!("{n}"),
7252        Value::Int(n) => alloc::format!("{n}"),
7253        Value::BigInt(n) => alloc::format!("{n}"),
7254        Value::Float(x) => alloc::format!("{x:?}"),
7255        Value::Text(s) | Value::Json(s) => s.clone(),
7256        Value::Bool(b) => if *b { "t" } else { "f" }.to_string(),
7257        Value::Date(d) => eval::format_date(*d),
7258        Value::Timestamp(t) => eval::format_timestamp(*t),
7259        Value::Interval { months, micros } => eval::format_interval(*months, *micros),
7260        Value::Numeric { scaled, scale } => eval::format_numeric(*scaled, *scale),
7261        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
7262            // Unreachable in practice (vector columns are filtered
7263            // out before this). Defensive fallback so a future
7264            // vector-stats path doesn't crash.
7265            alloc::format!("{v:?}")
7266        }
7267        // v7.5.0 — Value is #[non_exhaustive] for downstream
7268        // forward-compat. Future variants fall through to Debug
7269        // form here (same shape as the vector fallback above).
7270        _ => alloc::format!("{v:?}"),
7271    }
7272}
7273
/// v6.2.0 — reports whether `_name` is an engine-managed catalog table
/// that a bare `ANALYZE` (no target) should skip.
///
/// Always `false` for now: publications, subscriptions, users and
/// statistics live as engine fields rather than catalog tables, so
/// there is nothing to exclude. The hook is reserved for when internal
/// tables are introduced.
const fn is_internal_table_name(_name: &str) -> bool {
    // No internal tables exist yet, so every table is analysed.
    false
}
7283
7284fn value_to_literal(v: Value) -> Literal {
7285    match v {
7286        Value::Null => Literal::Null,
7287        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
7288        Value::Int(n) => Literal::Integer(i64::from(n)),
7289        Value::BigInt(n) => Literal::Integer(n),
7290        Value::Float(x) => Literal::Float(x),
7291        Value::Text(s) | Value::Json(s) => Literal::String(s),
7292        Value::Bool(b) => Literal::Bool(b),
7293        Value::Vector(v) => Literal::Vector(v),
7294        Value::Numeric { scaled, scale } => {
7295            Literal::String(eval::format_numeric(scaled, scale))
7296        }
7297        Value::Date(d) => Literal::String(eval::format_date(d)),
7298        Value::Timestamp(t) => Literal::String(eval::format_timestamp(t)),
7299        Value::Interval { months, micros } => Literal::Interval {
7300            months,
7301            micros,
7302            text: eval::format_interval(months, micros),
7303        },
7304        // SQ8 / halfvec cells dequantise to f32 before reaching the
7305        // substitute walker; pgwire's Bind path handles that.
7306        Value::Sq8Vector(q) => Literal::Vector(spg_storage::quantize::dequantize(&q)),
7307        Value::HalfVector(h) => Literal::Vector(h.to_f32_vec()),
7308        // v7.5.0 — Value is #[non_exhaustive]; future variants
7309        // render as Debug-form String literal until explicit
7310        // mapping is added.
7311        v => Literal::String(alloc::format!("{v:?}")),
7312    }
7313}
7314
7315fn rewrite_clock_calls(stmt: &mut Statement, now_micros: Option<i64>) {
7316    let Some(now) = now_micros else {
7317        return;
7318    };
7319    match stmt {
7320        Statement::Select(s) => rewrite_select_clock(s, now),
7321        Statement::Insert(ins) => {
7322            for row in &mut ins.rows {
7323                for e in row {
7324                    rewrite_expr_clock(e, now);
7325                }
7326            }
7327        }
7328        _ => {}
7329    }
7330}
7331
7332fn rewrite_select_clock(s: &mut SelectStatement, now: i64) {
7333    for item in &mut s.items {
7334        if let SelectItem::Expr { expr, .. } = item {
7335            rewrite_expr_clock(expr, now);
7336        }
7337    }
7338    if let Some(w) = &mut s.where_ {
7339        rewrite_expr_clock(w, now);
7340    }
7341    if let Some(gs) = &mut s.group_by {
7342        for g in gs {
7343            rewrite_expr_clock(g, now);
7344        }
7345    }
7346    if let Some(h) = &mut s.having {
7347        rewrite_expr_clock(h, now);
7348    }
7349    for o in &mut s.order_by {
7350        rewrite_expr_clock(&mut o.expr, now);
7351    }
7352    for (_, peer) in &mut s.unions {
7353        rewrite_select_clock(peer, now);
7354    }
7355}
7356
7357/// v3.0.3 hot path: every recursion lands in exactly one `match` arm.
7358/// Literal / Column-with-qualifier (the dominant cases on a typical
7359/// AST) take a single pattern dispatch and exit. The clock-rewrite
7360/// targets (zero-arg `NOW` / `CURRENT_TIMESTAMP` / `CURRENT_DATE`
7361/// functions, and bare `CURRENT_TIMESTAMP` / `CURRENT_DATE` column
7362/// refs) sit on their own arms with match guards so the fall-through
7363/// to the recursive arms is unambiguous.
7364fn rewrite_expr_clock(e: &mut Expr, now: i64) {
7365    // Fast-path test on the no-recursion shapes first. We can't fold
7366    // them into the big match below because they need to *replace* `e`
7367    // outright; the recursive arms below match on its sub-fields.
7368    if let Some(replacement) = clock_replacement_for(e, now) {
7369        *e = replacement;
7370        return;
7371    }
7372    match e {
7373        Expr::Binary { lhs, rhs, .. } => {
7374            rewrite_expr_clock(lhs, now);
7375            rewrite_expr_clock(rhs, now);
7376        }
7377        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7378            rewrite_expr_clock(expr, now);
7379        }
7380        Expr::FunctionCall { args, .. } => {
7381            for a in args {
7382                rewrite_expr_clock(a, now);
7383            }
7384        }
7385        Expr::Like { expr, pattern, .. } => {
7386            rewrite_expr_clock(expr, now);
7387            rewrite_expr_clock(pattern, now);
7388        }
7389        Expr::Extract { source, .. } => rewrite_expr_clock(source, now),
7390        // v4.10 subquery nodes — recurse into the inner SELECT's
7391        // expression slots so e.g. SELECT NOW() in a scalar
7392        // subquery picks up the same instant as the outer query.
7393        Expr::ScalarSubquery(s) => rewrite_select_clock(s, now),
7394        Expr::Exists { subquery, .. } => rewrite_select_clock(subquery, now),
7395        Expr::InSubquery { expr, subquery, .. } => {
7396            rewrite_expr_clock(expr, now);
7397            rewrite_select_clock(subquery, now);
7398        }
7399        // v4.12 window functions — args + PARTITION BY + ORDER BY
7400        // may all reference clock literals.
7401        Expr::WindowFunction {
7402            args,
7403            partition_by,
7404            order_by,
7405            ..
7406        } => {
7407            for a in args {
7408                rewrite_expr_clock(a, now);
7409            }
7410            for p in partition_by {
7411                rewrite_expr_clock(p, now);
7412            }
7413            for (e, _) in order_by {
7414                rewrite_expr_clock(e, now);
7415            }
7416        }
7417        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
7418        Expr::Array(items) => {
7419            for elem in items {
7420                rewrite_expr_clock(elem, now);
7421            }
7422        }
7423        Expr::ArraySubscript { target, index } => {
7424            rewrite_expr_clock(target, now);
7425            rewrite_expr_clock(index, now);
7426        }
7427        Expr::AnyAll { expr, array, .. } => {
7428            rewrite_expr_clock(expr, now);
7429            rewrite_expr_clock(array, now);
7430        }
7431    }
7432}
7433
7434/// Returns `Some(Expr)` when `e` is one of the clock-call shapes that
7435/// must be rewritten; otherwise `None` so the caller falls through to
7436/// the recursive walk. Identifies both function-call forms (`NOW()` /
7437/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()`) and bare-identifier forms
7438/// (`CURRENT_TIMESTAMP` / `CURRENT_DATE` as unqualified column refs,
7439/// which is how PG accepts them without parens).
7440fn clock_replacement_for(e: &Expr, now: i64) -> Option<Expr> {
7441    let (kind, name) = match e {
7442        Expr::FunctionCall { name, args } if args.is_empty() => (ClockSite::Fn, name.as_str()),
7443        Expr::Column(c) if c.qualifier.is_none() => (ClockSite::BareIdent, c.name.as_str()),
7444        _ => return None,
7445    };
7446    // ASCII case-insensitive name match. Limited to the three keywords
7447    // that actually need rewriting.
7448    let matched = match name.len() {
7449        3 if kind == ClockSite::Fn && name.eq_ignore_ascii_case("now") => Some(true),
7450        12 if name.eq_ignore_ascii_case("current_date") => Some(false),
7451        17 if name.eq_ignore_ascii_case("current_timestamp") => Some(true),
7452        _ => None,
7453    };
7454    let is_timestamp = matched?;
7455    let payload = if is_timestamp {
7456        now
7457    } else {
7458        now.div_euclid(86_400_000_000)
7459    };
7460    let target = if is_timestamp {
7461        spg_sql::ast::CastTarget::Timestamp
7462    } else {
7463        spg_sql::ast::CastTarget::Date
7464    };
7465    Some(Expr::Cast {
7466        expr: alloc::boxed::Box::new(Expr::Literal(spg_sql::ast::Literal::Integer(payload))),
7467        target,
7468    })
7469}
7470
/// Syntactic position in which a candidate clock keyword was found.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ClockSite {
    /// Zero-argument function call, e.g. `NOW()`.
    Fn,
    /// Unqualified identifier written without parentheses, e.g.
    /// `CURRENT_DATE`.
    BareIdent,
}
7476
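// NOTE: this paragraph documents `resolve_order_by_position` (defined
// right after `expand_group_by_all`). It is kept as a plain comment so
// it does not attach to the wrong function.
//
// `ORDER BY <integer>` references the N-th SELECT item (1-based).
// Swap the integer literal for the matching item's expression so the
// executor doesn't need a special-case branch. Recurses into UNION
// peers because each peer keeps its own SELECT list.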
7481/// v6.4.1 — expand `GROUP BY ALL` to every non-aggregate SELECT-list
7482/// item. Mirrors DuckDB / PG 19 semantics. Wildcards (`SELECT * …`)
7483/// are NOT expanded by GROUP BY ALL (PG 19 leaves the wildcard intact
7484/// and groups by whatever explicit non-aggregates remain — none in
7485/// the wildcard-only case, which still works for non-aggregate
7486/// queries).
7487fn expand_group_by_all(s: &mut SelectStatement) {
7488    if !s.group_by_all {
7489        for (_, peer) in &mut s.unions {
7490            expand_group_by_all(peer);
7491        }
7492        return;
7493    }
7494    let mut groups: Vec<Expr> = Vec::new();
7495    for item in &s.items {
7496        if let SelectItem::Expr { expr, .. } = item
7497            && !aggregate::contains_aggregate(expr)
7498        {
7499            groups.push(expr.clone());
7500        }
7501    }
7502    s.group_by = Some(groups);
7503    s.group_by_all = false;
7504    for (_, peer) in &mut s.unions {
7505        expand_group_by_all(peer);
7506    }
7507}
7508
7509fn resolve_order_by_position(s: &mut SelectStatement) {
7510    // v6.4.0 — iterate every ORDER BY key. Position references
7511    // (`ORDER BY 2`) bind to the 1-based projection index;
7512    // identifier references that match a SELECT-list alias bind to
7513    // the projected expression (Step 4 of L3a).
7514    for order in &mut s.order_by {
7515        match &order.expr {
7516            Expr::Literal(Literal::Integer(n)) if *n >= 1 => {
7517                if let Ok(idx_one_based) = usize::try_from(*n) {
7518                    let idx = idx_one_based - 1;
7519                    if idx < s.items.len()
7520                        && let SelectItem::Expr { expr, .. } = &s.items[idx]
7521                    {
7522                        order.expr = expr.clone();
7523                    }
7524                }
7525            }
7526            Expr::Column(c) if c.qualifier.is_none() => {
7527                // Alias-in-ORDER-BY lookup.
7528                for item in &s.items {
7529                    if let SelectItem::Expr {
7530                        expr,
7531                        alias: Some(a),
7532                    } = item
7533                        && a == &c.name
7534                    {
7535                        order.expr = expr.clone();
7536                        break;
7537                    }
7538                }
7539            }
7540            _ => {}
7541        }
7542    }
7543    for (_, peer) in &mut s.unions {
7544        resolve_order_by_position(peer);
7545    }
7546}
7547
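// NOTE: this paragraph documents `sort_by_keys` (defined right after
// `partial_sort_tagged`). It is kept as a plain comment so it does not
// attach to the wrong function.
//
// Sort `tagged` by `f64` key, reversing the comparator under DESC.
// Used by the UNION ORDER BY path; per-block paths inline the same
// comparator because they already hold `&OrderBy` directly.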
7551/// v3.1.1: partial-sort helper. When `keep` (= offset + limit) is
7552/// strictly less than `tagged.len()`, run `select_nth_unstable_by` to
7553/// partition the prefix in O(n), then sort just that prefix in O(k
7554/// log k). Total O(n + k log k), vs O(n log n) for a full sort. The
7555/// caller decides what `keep` is; passing `None` (no LIMIT) keeps the
7556/// full-sort behaviour.
7557///
7558/// `tagged` holds `(Option<f64>, Row)` (the SELECT path) — `None` keys
7559/// sort last in ascending order, mirroring NULL-sorts-last in SQL.
7560fn partial_sort_tagged(
7561    tagged: &mut Vec<(Vec<f64>, Row)>,
7562    keep: Option<usize>,
7563    descs: &[bool],
7564) {
7565    let cmp = |a: &(Vec<f64>, Row), b: &(Vec<f64>, Row)| cmp_multi_key(&a.0, &b.0, descs);
7566    match keep {
7567        Some(k) if k < tagged.len() && k > 0 => {
7568            let pivot = k - 1;
7569            tagged.select_nth_unstable_by(pivot, cmp);
7570            tagged[..k].sort_by(cmp);
7571            tagged.truncate(k);
7572        }
7573        _ => {
7574            tagged.sort_by(cmp);
7575        }
7576    }
7577}
7578
7579fn sort_by_keys(tagged: &mut [(Vec<f64>, Row)], descs: &[bool]) {
7580    tagged.sort_by(|a, b| cmp_multi_key(&a.0, &b.0, descs));
7581}
7582
/// v6.4.0 — multi-key ORDER BY comparator.
///
/// Keys are compared pairwise, left to right; the first pair that is
/// not equal decides the result. `descs[i]` reverses the comparison of
/// key `i`; a missing flag means ascending. Only the common prefix of
/// `a` and `b` is compared.
///
/// NULL is encoded by the caller as `f64::INFINITY`, so it sorts last
/// in ASC and first in DESC (matches the PG default).
///
/// NaN keys are treated as equal to each other and greater than every
/// other key, including the NULL sentinel. The previous "incomparable
/// means Equal" fallback made NaN equal to everything, which is not a
/// consistent ordering: `sort_by` / `select_nth_unstable_by` may panic
/// or produce an unspecified order on such comparators.
fn cmp_multi_key(a: &[f64], b: &[f64], descs: &[bool]) -> core::cmp::Ordering {
    use core::cmp::Ordering;
    for (i, (ka, kb)) in a.iter().zip(b.iter()).enumerate() {
        // `partial_cmp` is `None` only when a NaN is involved; rank by
        // "is NaN" then, so NaN == NaN and NaN > any number.
        let ascending = ka
            .partial_cmp(kb)
            .unwrap_or_else(|| ka.is_nan().cmp(&kb.is_nan()));
        let ord = if descs.get(i).copied().unwrap_or(false) {
            ascending.reverse()
        } else {
            ascending
        };
        if ord != Ordering::Equal {
            return ord;
        }
    }
    Ordering::Equal
}
7601
7602/// v6.4.0 — eval every ORDER BY expression for a row and pack the
7603/// resulting keys into a `Vec<f64>`. NULL → `f64::INFINITY`.
7604fn build_order_keys(
7605    order_by: &[OrderBy],
7606    row: &Row,
7607    ctx: &EvalContext,
7608) -> Result<Vec<f64>, EngineError> {
7609    let mut keys = Vec::with_capacity(order_by.len());
7610    for o in order_by {
7611        let v = eval::eval_expr(&o.expr, row, ctx)?;
7612        keys.push(value_to_order_key(&v)?);
7613    }
7614    Ok(keys)
7615}
7616
7617/// Drop the first `offset` rows then truncate to `limit`. PG / `MySQL`
7618/// agree: OFFSET applies *after* ORDER BY but *before* LIMIT (so
7619/// `LIMIT 10 OFFSET 5` keeps rows 6..=15).
7620fn apply_offset_and_limit(rows: &mut Vec<Row>, offset: Option<u32>, limit: Option<u32>) {
7621    if let Some(off) = offset {
7622        let off = off as usize;
7623        if off >= rows.len() {
7624            rows.clear();
7625        } else {
7626            rows.drain(..off);
7627        }
7628    }
7629    if let Some(n) = limit {
7630        rows.truncate(n as usize);
7631    }
7632}
7633
7634/// v7.6.1 — resolve a parser-level `ForeignKeyConstraint` (column
7635/// names + parent table name) into the storage-layer shape (column
7636/// indices + same parent table). Validates everything the engine
7637/// needs to know about the FK at CREATE TABLE time:
7638///
7639///   - parent table exists (catalog lookup, unless self-referencing)
7640///   - parent columns exist on the parent table
7641///   - parent column list matches the local arity (defaults to the
7642///     parent's primary index column when omitted)
7643///   - parent columns are covered by a `BTree` UNIQUE-class index
7644///     (SPG's stand-in for `PRIMARY KEY`/`UNIQUE`) — required so
7645///     the v7.6.2 INSERT path can do an O(log n) parent lookup
7646///   - local columns exist on the table being created
7647fn resolve_foreign_key(
7648    local_table_name: &str,
7649    local_cols: &[ColumnSchema],
7650    fk: spg_sql::ast::ForeignKeyConstraint,
7651    catalog: &Catalog,
7652) -> Result<spg_storage::ForeignKeyConstraint, EngineError> {
7653    // Resolve local columns.
7654    let mut local_columns = Vec::with_capacity(fk.columns.len());
7655    for name in &fk.columns {
7656        let pos = local_cols
7657            .iter()
7658            .position(|c| c.name == *name)
7659            .ok_or_else(|| {
7660                EngineError::Unsupported(alloc::format!(
7661                    "FOREIGN KEY references unknown local column {name:?}"
7662                ))
7663            })?;
7664        local_columns.push(pos);
7665    }
7666    // Self-referencing FK: parent table is the one we're creating.
7667    // The parent column resolution uses the local column list since
7668    // the catalog doesn't have this table yet.
7669    let is_self_ref = fk.parent_table == local_table_name;
7670    let (parent_cols_for_lookup, parent_table_str): (&[ColumnSchema], &str) = if is_self_ref {
7671        (local_cols, local_table_name)
7672    } else {
7673        let parent_table = catalog.get(&fk.parent_table).ok_or_else(|| {
7674            EngineError::Storage(StorageError::TableNotFound {
7675                name: fk.parent_table.clone(),
7676            })
7677        })?;
7678        (parent_table.schema().columns.as_slice(), fk.parent_table.as_str())
7679    };
7680    // Resolve parent column names → positions. If the FK omitted the
7681    // parent column list, fall back to the parent's primary index
7682    // column (single-column only — composite default is rejected
7683    // because there's no unambiguous "PK" in SPG's index list).
7684    let parent_columns: Vec<usize> = if fk.parent_columns.is_empty() {
7685        if fk.columns.len() != 1 {
7686            return Err(EngineError::Unsupported(
7687                "composite FOREIGN KEY without explicit parent column list is not supported \
7688                 — list the parent columns explicitly"
7689                    .into(),
7690            ));
7691        }
7692        // Find a single BTree index on the parent and use its column.
7693        let pos = pick_pk_index_column(catalog, parent_table_str, is_self_ref, local_cols)
7694            .ok_or_else(|| {
7695                EngineError::Unsupported(alloc::format!(
7696                    "parent table {parent_table_str:?} has no PRIMARY-key / UNIQUE BTree index \
7697                     to default the FOREIGN KEY against"
7698                ))
7699            })?;
7700        alloc::vec![pos]
7701    } else {
7702        let mut out = Vec::with_capacity(fk.parent_columns.len());
7703        for name in &fk.parent_columns {
7704            let pos = parent_cols_for_lookup
7705                .iter()
7706                .position(|c| c.name == *name)
7707                .ok_or_else(|| {
7708                    EngineError::Unsupported(alloc::format!(
7709                        "FOREIGN KEY references unknown parent column \
7710                         {name:?} on table {parent_table_str:?}"
7711                    ))
7712                })?;
7713            out.push(pos);
7714        }
7715        out
7716    };
7717    if parent_columns.len() != local_columns.len() {
7718        return Err(EngineError::Unsupported(alloc::format!(
7719            "FOREIGN KEY arity mismatch: {} local columns vs {} parent columns",
7720            local_columns.len(),
7721            parent_columns.len()
7722        )));
7723    }
7724    // For non-self-referencing FKs, verify the parent column set is
7725    // covered by a BTree index. SPG doesn't have a `PRIMARY KEY`
7726    // declaration; the convention is "the parent column for FK
7727    // purposes must have a BTree index" — which the user creates via
7728    // `CREATE INDEX ... USING btree (col)` (the default). We accept
7729    // any single-column BTree index that covers a parent column;
7730    // composite parent column lists require an index whose `column_position`
7731    // matches the first parent column (multi-column BTree indices
7732    // are not in the v7.x roadmap).
7733    if !is_self_ref {
7734        let parent_table = catalog
7735            .get(&fk.parent_table)
7736            .expect("checked above");
7737        let primary_parent_col = parent_columns[0];
7738        let has_btree = parent_table.schema().columns.get(primary_parent_col).is_some()
7739            && parent_table
7740                .indices()
7741                .iter()
7742                .any(|idx| {
7743                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7744                        && idx.column_position == primary_parent_col
7745                        && idx.partial_predicate.is_none()
7746                });
7747        if !has_btree {
7748            return Err(EngineError::Unsupported(alloc::format!(
7749                "FOREIGN KEY parent column on {:?} is not covered by an unconditional BTree \
7750                 index — create one with `CREATE INDEX ... ON {} ({})` first",
7751                parent_table_str,
7752                parent_table_str,
7753                parent_table.schema().columns[primary_parent_col].name,
7754            )));
7755        }
7756    }
7757    let on_delete = fk_action_sql_to_storage(fk.on_delete);
7758    let on_update = fk_action_sql_to_storage(fk.on_update);
7759    Ok(spg_storage::ForeignKeyConstraint {
7760        name: fk.name,
7761        local_columns,
7762        parent_table: fk.parent_table,
7763        parent_columns,
7764        on_delete,
7765        on_update,
7766    })
7767}
7768
7769/// v7.6.1 — pick a sentinel "primary key" column from the parent
7770/// table when the FK didn't name parent columns. Picks the first
7771/// single-column unconditional BTree index — that's the closest
7772/// thing SPG has to a PRIMARY KEY today. Self-referencing FKs use
7773/// `local_cols` as the column source.
7774fn pick_pk_index_column(
7775    catalog: &Catalog,
7776    parent_name: &str,
7777    is_self_ref: bool,
7778    local_cols: &[ColumnSchema],
7779) -> Option<usize> {
7780    if is_self_ref {
7781        // Self-ref FK omitted parent columns: pick column 0 by
7782        // convention (no catalog entry yet). Engine will widen this
7783        // when v7.6.7 lands; v7.6.1 only handles the explicit form.
7784        let _ = local_cols;
7785        return Some(0);
7786    }
7787    let parent = catalog.get(parent_name)?;
7788    parent.indices().iter().find_map(|idx| {
7789        if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7790            && idx.partial_predicate.is_none()
7791            && idx.included_columns.is_empty()
7792            && idx.expression.is_none()
7793        {
7794            Some(idx.column_position)
7795        } else {
7796            None
7797        }
7798    })
7799}
7800
7801/// v7.9.8 / v7.9.10 — resolve the column positions that
7802/// identify a conflict for ON CONFLICT. Returns a Vec of
7803/// column positions (1 element for single-column form, N for
7804/// composite). When the user wrote bare `ON CONFLICT DO …`,
7805/// falls back to the table's first unconditional BTree index
7806/// (always single-column today).
7807fn resolve_on_conflict_columns(
7808    catalog: &Catalog,
7809    table_name: &str,
7810    target: &[String],
7811) -> Result<Vec<usize>, EngineError> {
7812    let table = catalog.get(table_name).ok_or_else(|| {
7813        EngineError::Storage(StorageError::TableNotFound {
7814            name: table_name.into(),
7815        })
7816    })?;
7817    if target.is_empty() {
7818        let pos = table
7819            .indices()
7820            .iter()
7821            .find_map(|idx| {
7822                if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7823                    && idx.partial_predicate.is_none()
7824                    && idx.included_columns.is_empty()
7825                    && idx.expression.is_none()
7826                {
7827                    Some(idx.column_position)
7828                } else {
7829                    None
7830                }
7831            })
7832            .ok_or_else(|| {
7833                EngineError::Unsupported(alloc::format!(
7834                    "ON CONFLICT without target requires a UNIQUE BTree index on {table_name:?}"
7835                ))
7836            })?;
7837        return Ok(alloc::vec![pos]);
7838    }
7839    let mut out = Vec::with_capacity(target.len());
7840    for name in target {
7841        let pos = table
7842            .schema()
7843            .columns
7844            .iter()
7845            .position(|c| c.name == *name)
7846            .ok_or_else(|| {
7847                EngineError::Unsupported(alloc::format!(
7848                    "ON CONFLICT target column {name:?} not found on {table_name:?}"
7849                ))
7850            })?;
7851        out.push(pos);
7852    }
7853    Ok(out)
7854}
7855
7856/// v7.9.8 — check whether the BTree index on `column_pos` of
7857/// `table_name` already has a row with this key.
7858fn on_conflict_key_exists(
7859    catalog: &Catalog,
7860    table_name: &str,
7861    column_pos: usize,
7862    key: &Value,
7863) -> bool {
7864    let Some(table) = catalog.get(table_name) else {
7865        return false;
7866    };
7867    let Some(idx_key) = spg_storage::IndexKey::from_value(key) else {
7868        return false;
7869    };
7870    table.indices().iter().any(|idx| {
7871        matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7872            && idx.column_position == column_pos
7873            && idx.partial_predicate.is_none()
7874            && !idx.lookup_eq(&idx_key).is_empty()
7875    })
7876}
7877
7878/// v7.9.9 / v7.9.10 — look up an existing row's position by
7879/// matching all `column_positions` against the incoming `key`
7880/// tuple. Single-column shape (one column) reduces to the
7881/// canonical PK lookup; composite shapes scan linearly until
7882/// every position matches.
7883fn lookup_row_position_by_keys(
7884    catalog: &Catalog,
7885    table_name: &str,
7886    column_positions: &[usize],
7887    key: &[&Value],
7888) -> Option<usize> {
7889    let table = catalog.get(table_name)?;
7890    table.rows().iter().position(|r| {
7891        column_positions
7892            .iter()
7893            .enumerate()
7894            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7895    })
7896}
7897
7898/// v7.9.10 — does the table already contain a row whose
7899/// `column_positions` tuple equals `key`? Single-column shape
7900/// uses the existing BTree fast path; composite shapes fall
7901/// back to a row scan.
7902fn on_conflict_keys_exist(
7903    catalog: &Catalog,
7904    table_name: &str,
7905    column_positions: &[usize],
7906    key: &[&Value],
7907) -> bool {
7908    if column_positions.len() == 1 {
7909        return on_conflict_key_exists(
7910            catalog,
7911            table_name,
7912            column_positions[0],
7913            key[0],
7914        );
7915    }
7916    let Some(table) = catalog.get(table_name) else {
7917        return false;
7918    };
7919    table.rows().iter().any(|r| {
7920        column_positions
7921            .iter()
7922            .enumerate()
7923            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7924    })
7925}
7926
7927/// v7.9.9 — apply ON CONFLICT DO UPDATE SET assignments to an
7928/// existing row.
7929///
7930/// `incoming` is the rejected INSERT row (used to resolve
7931/// `EXCLUDED.col` references in the assignment exprs);
7932/// `target_pos` is the position of the existing row in the table.
7933/// Each assignment substitutes `EXCLUDED.col` with the matching
7934/// incoming value, evaluates the resulting expression against
7935/// the existing row, and writes the new value into the
7936/// corresponding column of the returned `Vec<Value>`. If
7937/// `where_` evaluates falsy, returns Ok(None) — PG behaviour:
7938/// the conflicting row is silently kept unchanged.
7939fn apply_on_conflict_assignments(
7940    catalog: &Catalog,
7941    table_name: &str,
7942    target_pos: usize,
7943    incoming: &[Value],
7944    assignments: &[(String, Expr)],
7945    where_: Option<&Expr>,
7946) -> Result<Option<Vec<Value>>, EngineError> {
7947    let table = catalog.get(table_name).ok_or_else(|| {
7948        EngineError::Storage(StorageError::TableNotFound {
7949            name: table_name.into(),
7950        })
7951    })?;
7952    let schema_cols = table.schema().columns.clone();
7953    let existing = table
7954        .rows()
7955        .get(target_pos)
7956        .ok_or_else(|| {
7957            EngineError::Unsupported(alloc::format!(
7958                "ON CONFLICT DO UPDATE: row position {target_pos} out of bounds on {table_name:?}"
7959            ))
7960        })?
7961        .clone();
7962    let ctx = eval::EvalContext::new(&schema_cols, Some(table_name));
7963    // Optional WHERE filter on the conflict row.
7964    if let Some(w) = where_ {
7965        let pred = w.clone();
7966        let pred = substitute_excluded_refs(pred, &schema_cols, incoming);
7967        let v = eval::eval_expr(&pred, &existing, &ctx)?;
7968        if !matches!(v, Value::Bool(true)) {
7969            return Ok(None);
7970        }
7971    }
7972    let mut new_values = existing.values.clone();
7973    for (col_name, expr) in assignments {
7974        let target_idx = schema_cols
7975            .iter()
7976            .position(|c| c.name == *col_name)
7977            .ok_or_else(|| {
7978                EngineError::Eval(EvalError::ColumnNotFound {
7979                    name: col_name.clone(),
7980                })
7981            })?;
7982        let sub = substitute_excluded_refs(expr.clone(), &schema_cols, incoming);
7983        let v = eval::eval_expr(&sub, &existing, &ctx)?;
7984        new_values[target_idx] =
7985            coerce_value(v, schema_cols[target_idx].ty, col_name, target_idx)?;
7986    }
7987    Ok(Some(new_values))
7988}
7989
7990/// v7.9.9 — walk an `Expr` tree replacing any `Column { qualifier:
7991/// "EXCLUDED", name }` reference with a `Literal` of the matching
7992/// value from the incoming-row vec. Resolution against the
7993/// child-table column list (by name).
7994fn substitute_excluded_refs(
7995    expr: Expr,
7996    schema_cols: &[ColumnSchema],
7997    incoming: &[Value],
7998) -> Expr {
7999    use spg_sql::ast::ColumnName;
8000    match expr {
8001        Expr::Column(ColumnName { qualifier, name })
8002            if qualifier
8003                .as_deref()
8004                .is_some_and(|q| q.eq_ignore_ascii_case("excluded")) =>
8005        {
8006            let pos = schema_cols.iter().position(|c| c.name == name);
8007            match pos {
8008                Some(p) => {
8009                    let v = incoming.get(p).cloned().unwrap_or(Value::Null);
8010                    value_to_literal_expr(v).unwrap_or_else(|_| {
8011                        Expr::Literal(spg_sql::ast::Literal::Null)
8012                    })
8013                }
8014                None => Expr::Column(ColumnName { qualifier, name }),
8015            }
8016        }
8017        Expr::Binary { op, lhs, rhs } => Expr::Binary {
8018            op,
8019            lhs: Box::new(substitute_excluded_refs(*lhs, schema_cols, incoming)),
8020            rhs: Box::new(substitute_excluded_refs(*rhs, schema_cols, incoming)),
8021        },
8022        Expr::Unary { op, expr } => Expr::Unary {
8023            op,
8024            expr: Box::new(substitute_excluded_refs(*expr, schema_cols, incoming)),
8025        },
8026        Expr::FunctionCall { name, args } => Expr::FunctionCall {
8027            name,
8028            args: args
8029                .into_iter()
8030                .map(|a| substitute_excluded_refs(a, schema_cols, incoming))
8031                .collect(),
8032        },
8033        other => other,
8034    }
8035}
8036
8037/// v7.6.2 / v7.6.7 — INSERT-side FK enforcement. For every row
8038/// about to be inserted into `child_table`, every FK declared on
8039/// that table is checked: the row's FK columns must either be
8040/// NULL (SQL spec skip) or match an existing parent row via the
8041/// parent's BTree PK / UNIQUE index.
8042///
8043/// Returns `EngineError::Unsupported` with a `FOREIGN KEY violation`
8044/// payload on first failure.
8045///
8046/// **Self-referencing FKs (v7.6.7 widening):** when `fk.parent_table
8047/// == child_table`, the parent rows visible to this check are
8048///  (a) rows already committed to the table, plus
8049///  (b) earlier rows from the *same* `rows` batch.
8050/// This makes `INSERT INTO tree VALUES (1, NULL), (2, 1), (3, 2)`
8051/// work in a single statement — common pattern for bulk-loading
8052/// hierarchies.
8053/// v7.9.19 — enforce table-level UNIQUE / PRIMARY KEY tuple
8054/// constraints at INSERT time. For each constraint declared on
8055/// the target table, check that no existing row + no earlier row
8056/// in the same batch has the same full-column tuple. NULL in
8057/// any column lifts the row out of the check (SQL spec: NULL
8058/// ≠ NULL for uniqueness). mailrs G1 + G6.
8059fn enforce_uniqueness_inserts(
8060    catalog: &Catalog,
8061    child_table: &str,
8062    constraints: &[spg_storage::UniquenessConstraint],
8063    rows: &[Vec<Value>],
8064) -> Result<(), EngineError> {
8065    if constraints.is_empty() {
8066        return Ok(());
8067    }
8068    let table = catalog.get(child_table).ok_or_else(|| {
8069        EngineError::Storage(StorageError::TableNotFound {
8070            name: child_table.into(),
8071        })
8072    })?;
8073    for uc in constraints {
8074        for (batch_idx, row_values) in rows.iter().enumerate() {
8075            let key: Vec<&Value> = uc.columns.iter().map(|&i| &row_values[i]).collect();
8076            let has_null = key.iter().any(|v| matches!(v, Value::Null));
8077            if has_null {
8078                continue;
8079            }
8080            // Table-side collision: scan existing rows.
8081            let collides_in_table = table.rows().iter().any(|prow| {
8082                uc.columns
8083                    .iter()
8084                    .enumerate()
8085                    .all(|(i, &p)| prow.values.get(p) == Some(key[i]))
8086            });
8087            // Batch-side collision: earlier rows in the same INSERT.
8088            let collides_in_batch = rows[..batch_idx].iter().any(|earlier| {
8089                uc.columns
8090                    .iter()
8091                    .enumerate()
8092                    .all(|(i, &p)| earlier.get(p) == Some(key[i]))
8093            });
8094            if collides_in_table || collides_in_batch {
8095                let kind = if uc.is_primary_key { "PRIMARY KEY" } else { "UNIQUE" };
8096                let col_names: Vec<String> = uc
8097                    .columns
8098                    .iter()
8099                    .map(|&i| table.schema().columns[i].name.clone())
8100                    .collect();
8101                return Err(EngineError::Unsupported(alloc::format!(
8102                    "{kind} violation on {child_table:?} columns {col_names:?}: \
8103                     row #{batch_idx} duplicates an existing key"
8104                )));
8105            }
8106        }
8107    }
8108    Ok(())
8109}
8110
8111/// v7.9.29 — `true` iff `v` counts as a truthy SQL value for a
8112/// WHERE-style predicate. NULL → false (three-valued logic
8113/// collapses to "skip this row" for index inclusion). Numeric
8114/// non-zero, BIGINT non-zero, TINYINT non-zero, BOOLEAN true → true.
8115/// Everything else (strings, vectors, JSON, …) is not a valid
8116/// predicate result and surfaces as `false` so a malformed
8117/// predicate degrades to "row not in index" rather than panicking.
8118fn predicate_truthy(v: &spg_storage::Value) -> bool {
8119    use spg_storage::Value as V;
8120    match v {
8121        V::Bool(b) => *b,
8122        V::Int(n) => *n != 0,
8123        V::BigInt(n) => *n != 0,
8124        V::SmallInt(n) => *n != 0,
8125        _ => false,
8126    }
8127}
8128
8129/// v7.9.29 — at CREATE UNIQUE INDEX time, scan the table's
8130/// committed rows for pre-existing duplicates. If any pair of rows
8131/// matches the predicate AND has the same index key, refuse to
8132/// create the index so the user fixes the data before retrying.
8133fn check_existing_unique_violation(
8134    idx: &spg_storage::Index,
8135    schema: &spg_storage::TableSchema,
8136    rows: &[spg_storage::Row],
8137) -> Result<(), EngineError> {
8138    let predicate_expr = match idx.partial_predicate.as_deref() {
8139        Some(s) => Some(spg_sql::parser::parse_expression(s).map_err(|e| {
8140            EngineError::Unsupported(alloc::format!(
8141                "stored partial predicate {s:?} failed to re-parse: {e:?}"
8142            ))
8143        })?),
8144        None => None,
8145    };
8146    let ctx = eval::EvalContext::new(&schema.columns, None);
8147    let key_positions = unique_key_positions(idx);
8148    let mut seen: alloc::vec::Vec<alloc::vec::Vec<spg_storage::Value>> = alloc::vec::Vec::new();
8149    for row in rows {
8150        if let Some(expr) = &predicate_expr {
8151            let v = eval::eval_expr(expr, row, &ctx).map_err(|e| {
8152                EngineError::Unsupported(alloc::format!(
8153                    "evaluating UNIQUE INDEX predicate against existing row: {e:?}"
8154                ))
8155            })?;
8156            if !predicate_truthy(&v) {
8157                continue;
8158            }
8159        }
8160        let key: alloc::vec::Vec<spg_storage::Value> = key_positions
8161            .iter()
8162            .map(|&p| {
8163                row.values
8164                    .get(p)
8165                    .cloned()
8166                    .unwrap_or(spg_storage::Value::Null)
8167            })
8168            .collect();
8169        if key.iter().any(|v| matches!(v, spg_storage::Value::Null)) {
8170            continue;
8171        }
8172        if seen.iter().any(|other| *other == key) {
8173            return Err(EngineError::Unsupported(alloc::format!(
8174                "CREATE UNIQUE INDEX {:?}: existing rows already violate the constraint",
8175                idx.name
8176            )));
8177        }
8178        seen.push(key);
8179    }
8180    Ok(())
8181}
8182
8183/// v7.9.29 — full key tuple for a UNIQUE INDEX (leading +
8184/// extra positions). For single-column indexes this is just
8185/// `[column_position]`.
8186fn unique_key_positions(idx: &spg_storage::Index) -> alloc::vec::Vec<usize> {
8187    let mut out = alloc::vec::Vec::with_capacity(1 + idx.extra_column_positions.len());
8188    out.push(idx.column_position);
8189    out.extend_from_slice(&idx.extra_column_positions);
8190    out
8191}
8192
8193/// v7.9.29 — at INSERT time, walk every `is_unique` index on the
8194/// target table. For each, eval the index's optional predicate
8195/// against (a) the candidate row and (b) every committed row plus
8196/// earlier batch rows; only rows where the predicate is truthy
8197/// participate. A duplicate key among predicate-matching rows is a
8198/// uniqueness violation. NULL keys lift the row out of the check
8199/// (matching PG's "UNIQUE allows multiple NULLs" semantics).
8200fn enforce_unique_index_inserts(
8201    catalog: &Catalog,
8202    table_name: &str,
8203    rows: &[alloc::vec::Vec<spg_storage::Value>],
8204) -> Result<(), EngineError> {
8205    let table = catalog.get(table_name).ok_or_else(|| {
8206        EngineError::Storage(StorageError::TableNotFound {
8207            name: table_name.into(),
8208        })
8209    })?;
8210    let schema = table.schema();
8211    let ctx = eval::EvalContext::new(&schema.columns, None);
8212    for idx in table.indices() {
8213        if !idx.is_unique {
8214            continue;
8215        }
8216        // Re-parse the predicate once per index per batch.
8217        let predicate_expr = match idx.partial_predicate.as_deref() {
8218            Some(s) => Some(spg_sql::parser::parse_expression(s).map_err(|e| {
8219                EngineError::Unsupported(alloc::format!(
8220                    "UNIQUE INDEX {:?} predicate {s:?} failed to re-parse: {e:?}",
8221                    idx.name
8222                ))
8223            })?),
8224            None => None,
8225        };
8226        let key_positions = unique_key_positions(idx);
8227        let key_of = |values: &[spg_storage::Value]| -> alloc::vec::Vec<spg_storage::Value> {
8228            key_positions
8229                .iter()
8230                .map(|&p| {
8231                    values
8232                        .get(p)
8233                        .cloned()
8234                        .unwrap_or(spg_storage::Value::Null)
8235                })
8236                .collect()
8237        };
8238        // Helper: does `values` participate in this index? (predicate
8239        // truthy when present.) Wraps `values` into a transient Row
8240        // because eval_expr requires &Row.
8241        let participates = |values: &[spg_storage::Value]| -> Result<bool, EngineError> {
8242            let Some(expr) = &predicate_expr else {
8243                return Ok(true);
8244            };
8245            let tmp_row = spg_storage::Row {
8246                values: values.to_vec(),
8247            };
8248            let v = eval::eval_expr(expr, &tmp_row, &ctx).map_err(|e| {
8249                EngineError::Unsupported(alloc::format!(
8250                    "UNIQUE INDEX {:?} predicate eval: {e:?}",
8251                    idx.name
8252                ))
8253            })?;
8254            Ok(predicate_truthy(&v))
8255        };
8256        for (batch_idx, row_values) in rows.iter().enumerate() {
8257            if !participates(row_values)? {
8258                continue;
8259            }
8260            let key = key_of(row_values);
8261            if key.iter().any(|v| matches!(v, spg_storage::Value::Null)) {
8262                continue;
8263            }
8264            // Committed-table collision.
8265            for prow in table.rows() {
8266                if !participates(&prow.values)? {
8267                    continue;
8268                }
8269                if key_of(&prow.values) == key {
8270                    return Err(EngineError::Unsupported(alloc::format!(
8271                        "UNIQUE INDEX {:?} violation on {table_name:?}: \
8272                         row #{batch_idx} duplicates an existing key",
8273                        idx.name
8274                    )));
8275                }
8276            }
8277            // Within-batch collision: earlier rows in the same INSERT.
8278            for earlier in &rows[..batch_idx] {
8279                if !participates(earlier)? {
8280                    continue;
8281                }
8282                if key_of(earlier) == key {
8283                    return Err(EngineError::Unsupported(alloc::format!(
8284                        "UNIQUE INDEX {:?} violation on {table_name:?}: \
8285                         row #{batch_idx} duplicates an earlier row in the same batch",
8286                        idx.name
8287                    )));
8288                }
8289            }
8290        }
8291    }
8292    Ok(())
8293}
8294
8295fn enforce_fk_inserts(
8296    catalog: &Catalog,
8297    child_table: &str,
8298    fks: &[spg_storage::ForeignKeyConstraint],
8299    rows: &[Vec<Value>],
8300) -> Result<(), EngineError> {
8301    for fk in fks {
8302        let parent_is_self = fk.parent_table == child_table;
8303        let parent = if parent_is_self {
8304            // Self-ref: read the current state of the same table.
8305            // The mut borrow on child has been dropped by the caller.
8306            catalog.get(child_table).ok_or_else(|| {
8307                EngineError::Storage(StorageError::TableNotFound {
8308                    name: child_table.into(),
8309                })
8310            })?
8311        } else {
8312            catalog.get(&fk.parent_table).ok_or_else(|| {
8313                EngineError::Storage(StorageError::TableNotFound {
8314                    name: fk.parent_table.clone(),
8315                })
8316            })?
8317        };
8318        for (batch_idx, row_values) in rows.iter().enumerate() {
8319            // Single-column FK fast path: try the parent's BTree
8320            // index for an O(log n) lookup. Composite FKs fall back
8321            // to a parent-row scan.
8322            if fk.local_columns.len() == 1 {
8323                let v = &row_values[fk.local_columns[0]];
8324                if matches!(v, Value::Null) {
8325                    continue;
8326                }
8327                let parent_col = fk.parent_columns[0];
8328                let key = spg_storage::IndexKey::from_value(v).ok_or_else(|| {
8329                    EngineError::Unsupported(alloc::format!(
8330                        "FOREIGN KEY column value of type {:?} is not index-eligible",
8331                        v.data_type()
8332                    ))
8333                })?;
8334                let present_committed = parent.indices().iter().any(|idx| {
8335                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
8336                        && idx.column_position == parent_col
8337                        && idx.partial_predicate.is_none()
8338                        && !idx.lookup_eq(&key).is_empty()
8339                });
8340                // v7.6.7 self-ref widening: also accept a match
8341                // against earlier rows in this same batch when the
8342                // FK points at the table being inserted into.
8343                let present_in_batch = parent_is_self
8344                    && rows[..batch_idx].iter().any(|earlier| {
8345                        earlier.get(parent_col) == Some(v)
8346                    });
8347                if !(present_committed || present_in_batch) {
8348                    return Err(EngineError::Unsupported(alloc::format!(
8349                        "FOREIGN KEY violation: no parent row in {:?} where {} = {:?}",
8350                        fk.parent_table,
8351                        parent
8352                            .schema()
8353                            .columns
8354                            .get(parent_col)
8355                            .map_or("?", |c| c.name.as_str()),
8356                        v,
8357                    )));
8358                }
8359            } else {
8360                // Composite FK: scan parent rows. v7.6.7 also
8361                // accepts a match against earlier rows in the same
8362                // batch (self-ref bulk-loading of hierarchies).
8363                if fk.local_columns
8364                    .iter()
8365                    .all(|&i| matches!(row_values.get(i), Some(Value::Null)))
8366                {
8367                    continue;
8368                }
8369                let local: Vec<&Value> = fk.local_columns.iter().map(|&i| &row_values[i]).collect();
8370                let parent_match_committed = parent.rows().iter().any(|prow| {
8371                    fk.parent_columns
8372                        .iter()
8373                        .enumerate()
8374                        .all(|(i, &pi)| prow.values.get(pi) == Some(local[i]))
8375                });
8376                let parent_match_in_batch = parent_is_self
8377                    && rows[..batch_idx].iter().any(|earlier| {
8378                        fk.parent_columns
8379                            .iter()
8380                            .enumerate()
8381                            .all(|(i, &pi)| earlier.get(pi) == Some(local[i]))
8382                    });
8383                if !(parent_match_committed || parent_match_in_batch) {
8384                    return Err(EngineError::Unsupported(alloc::format!(
8385                        "FOREIGN KEY violation: no parent row in {:?} matching composite key",
8386                        fk.parent_table,
8387                    )));
8388                }
8389            }
8390        }
8391    }
8392    Ok(())
8393}
8394
8395/// v7.6.4 / v7.6.5 — one step of the FK action plan computed for a
8396/// DELETE on a parent. The plan is a list of these steps, stacked
8397/// across the FK graph by `plan_fk_parent_deletions`.
8398#[derive(Debug, Clone)]
8399struct FkChildStep {
8400    child_table: String,
8401    action: FkChildAction,
8402}
8403
8404#[derive(Debug, Clone)]
8405enum FkChildAction {
8406    /// CASCADE — remove these rows. Sorted, deduplicated positions.
8407    Delete { positions: Vec<usize> },
8408    /// SET NULL — for each (row, column) in the flat list, write
8409    /// NULL into that child cell. Multiple FKs on the same row may
8410    /// produce overlapping entries (deduped at plan time).
8411    SetNull {
8412        positions: Vec<usize>,
8413        columns: Vec<usize>,
8414    },
8415    /// SET DEFAULT — same shape as SetNull but writes the column's
8416    /// declared DEFAULT value (resolved at plan time). Columns
8417    /// without a DEFAULT raise an error during planning.
8418    SetDefault {
8419        positions: Vec<usize>,
8420        columns: Vec<usize>,
8421        defaults: Vec<Value>,
8422    },
8423}
8424
8425/// v7.6.3 → v7.6.5 — plan FK fallout for a DELETE on a parent table.
8426///
8427/// Walks every table in the catalog looking for FKs whose
8428/// `parent_table` is `parent_table_name`. For each such FK + each
8429/// to-be-deleted parent row:
8430///
8431///   - RESTRICT / NoAction → error, no plan returned
8432///   - CASCADE → child rows get scheduled for deletion; recursive
8433///   - SetNull → child FK column(s) scheduled to be NULL-ed.
8434///     Verified NULL-able at plan time.
8435///   - SetDefault → child FK column(s) scheduled to be reset to
8436///     their declared DEFAULT. Columns without a DEFAULT raise.
8437///
8438/// SET NULL / SET DEFAULT do NOT cascade further — the child row
8439/// stays; only one of its columns mutates.
8440fn plan_fk_parent_deletions(
8441    catalog: &Catalog,
8442    parent_table_name: &str,
8443    to_delete_positions: &[usize],
8444    to_delete_rows: &[Vec<Value>],
8445) -> Result<Vec<FkChildStep>, EngineError> {
8446    use alloc::collections::{BTreeMap, BTreeSet};
8447    if to_delete_rows.is_empty() {
8448        return Ok(Vec::new());
8449    }
8450    let mut delete_plan: BTreeMap<String, BTreeSet<usize>> = BTreeMap::new();
8451    // setnull / setdefault keyed by child_table → (row_idx, col_idx) → optional default
8452    let mut setnull_plan: BTreeMap<String, BTreeSet<(usize, usize)>> = BTreeMap::new();
8453    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8454        BTreeMap::new();
8455    let mut visited: BTreeSet<(String, usize)> = BTreeSet::new();
8456    for &p in to_delete_positions {
8457        visited.insert((parent_table_name.to_string(), p));
8458    }
8459    let mut work: Vec<(String, Vec<Value>)> = to_delete_rows
8460        .iter()
8461        .map(|r| (parent_table_name.to_string(), r.clone()))
8462        .collect();
8463    while let Some((cur_parent, parent_row)) = work.pop() {
8464        for child_name in catalog.table_names() {
8465            let child = catalog
8466                .get(&child_name)
8467                .expect("table_names → catalog.get round-trip is total");
8468            for fk in &child.schema().foreign_keys {
8469                if fk.parent_table != cur_parent {
8470                    continue;
8471                }
8472                let parent_key: Vec<&Value> = fk
8473                    .parent_columns
8474                    .iter()
8475                    .map(|&pi| &parent_row[pi])
8476                    .collect();
8477                if parent_key.iter().any(|v| matches!(v, Value::Null)) {
8478                    continue;
8479                }
8480                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8481                    if child_name == cur_parent
8482                        && visited.contains(&(child_name.clone(), child_row_idx))
8483                    {
8484                        continue;
8485                    }
8486                    let matches_key = fk
8487                        .local_columns
8488                        .iter()
8489                        .enumerate()
8490                        .all(|(i, &li)| child_row.values.get(li) == Some(parent_key[i]));
8491                    if !matches_key {
8492                        continue;
8493                    }
8494                    match fk.on_delete {
8495                        spg_storage::FkAction::Restrict
8496                        | spg_storage::FkAction::NoAction => {
8497                            return Err(EngineError::Unsupported(alloc::format!(
8498                                "FOREIGN KEY violation: DELETE on {cur_parent:?} is \
8499                                 restricted by FK from {child_name:?}.{:?}",
8500                                fk.local_columns,
8501                            )));
8502                        }
8503                        spg_storage::FkAction::Cascade => {
8504                            if visited.insert((child_name.clone(), child_row_idx)) {
8505                                delete_plan
8506                                    .entry(child_name.clone())
8507                                    .or_default()
8508                                    .insert(child_row_idx);
8509                                work.push((child_name.clone(), child_row.values.clone()));
8510                            }
8511                        }
8512                        spg_storage::FkAction::SetNull => {
8513                            // Verify every local FK column is NULL-able.
8514                            for &li in &fk.local_columns {
8515                                let col = child.schema().columns.get(li).ok_or_else(|| {
8516                                    EngineError::Unsupported(alloc::format!(
8517                                        "FK local column {li} missing in {child_name:?}"
8518                                    ))
8519                                })?;
8520                                if !col.nullable {
8521                                    return Err(EngineError::Unsupported(alloc::format!(
8522                                        "FOREIGN KEY ON DELETE SET NULL: column \
8523                                         {child_name:?}.{:?} is NOT NULL — cannot SET NULL",
8524                                        col.name,
8525                                    )));
8526                                }
8527                            }
8528                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8529                            for &li in &fk.local_columns {
8530                                entry.insert((child_row_idx, li));
8531                            }
8532                        }
8533                        spg_storage::FkAction::SetDefault => {
8534                            // Resolve the DEFAULT for every local FK col.
8535                            let entry =
8536                                setdefault_plan.entry(child_name.clone()).or_default();
8537                            for &li in &fk.local_columns {
8538                                let col = child.schema().columns.get(li).ok_or_else(|| {
8539                                    EngineError::Unsupported(alloc::format!(
8540                                        "FK local column {li} missing in {child_name:?}"
8541                                    ))
8542                                })?;
8543                                let default = col.default.clone().ok_or_else(|| {
8544                                    EngineError::Unsupported(alloc::format!(
8545                                        "FOREIGN KEY ON DELETE SET DEFAULT: column \
8546                                         {child_name:?}.{:?} has no DEFAULT declared",
8547                                        col.name,
8548                                    ))
8549                                })?;
8550                                entry.insert((child_row_idx, li), default);
8551                            }
8552                        }
8553                    }
8554                }
8555            }
8556        }
8557    }
8558    // Flatten the three plans into the ordered `FkChildStep` list.
8559    // Deletes are applied last per child (after any null/default
8560    // re-writes on the same child) so a child row that's both
8561    // re-written and then cascade-deleted only ends up deleted —
8562    // but in v7.6.5 SetNull/Cascade never overlap on the same row
8563    // (a single FK chooses exactly one action), so the order is
8564    // mostly a precaution.
8565    let mut steps: Vec<FkChildStep> = Vec::new();
8566    for (child_table, entries) in setnull_plan {
8567        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8568        steps.push(FkChildStep {
8569            child_table,
8570            action: FkChildAction::SetNull { positions, columns },
8571        });
8572    }
8573    for (child_table, entries) in setdefault_plan {
8574        let mut positions = Vec::with_capacity(entries.len());
8575        let mut columns = Vec::with_capacity(entries.len());
8576        let mut defaults = Vec::with_capacity(entries.len());
8577        for ((p, c), v) in entries {
8578            positions.push(p);
8579            columns.push(c);
8580            defaults.push(v);
8581        }
8582        steps.push(FkChildStep {
8583            child_table,
8584            action: FkChildAction::SetDefault {
8585                positions,
8586                columns,
8587                defaults,
8588            },
8589        });
8590    }
8591    for (child_table, positions) in delete_plan {
8592        steps.push(FkChildStep {
8593            child_table,
8594            action: FkChildAction::Delete {
8595                positions: positions.into_iter().collect(),
8596            },
8597        });
8598    }
8599    Ok(steps)
8600}
8601
8602/// v7.6.6 — plan FK fallout for an UPDATE that mutates parent-side
8603/// PK/UNIQUE columns. Walks every other table whose FK references
8604/// `parent_table_name`; for each FK whose parent_columns overlap a
8605/// mutated column, decides the action by `fk.on_update`.
8606///
8607///   - RESTRICT / NoAction → error if any child references the OLD
8608///     value
8609///   - CASCADE → child FK columns get rewritten to the NEW parent
8610///     value (a SetNull-style update step with the new value)
8611///   - SetNull → child FK columns set to NULL
8612///   - SetDefault → child FK columns set to declared default
8613///
8614/// `plan_with_old` is `(row_position, old_values, new_values)` so
8615/// the planner can detect "did this row's parent key actually
8616/// change?" — only rows where at least one referenced parent
8617/// column moved trigger inbound work.
8618fn plan_fk_parent_updates(
8619    catalog: &Catalog,
8620    parent_table_name: &str,
8621    plan_with_old: &[(usize, Vec<Value>, Vec<Value>)],
8622) -> Result<Vec<FkChildStep>, EngineError> {
8623    use alloc::collections::BTreeMap;
8624    if plan_with_old.is_empty() {
8625        return Ok(Vec::new());
8626    }
8627    // For each child table we may touch, build per-child step
8628    // lists. UPDATE never deletes children — `delete_plan` stays
8629    // empty here but is kept structurally aligned with
8630    // `plan_fk_parent_deletions` for future use.
8631    let delete_plan: BTreeMap<String, alloc::collections::BTreeSet<usize>> = BTreeMap::new();
8632    let mut setnull_plan: BTreeMap<
8633        String,
8634        alloc::collections::BTreeSet<(usize, usize)>,
8635    > = BTreeMap::new();
8636    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8637        BTreeMap::new();
8638    // Cascade-update plan: child_table → row_idx → col_idx → new_value
8639    let mut cascade_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
8640
8641    for child_name in catalog.table_names() {
8642        let child = catalog
8643            .get(&child_name)
8644            .expect("table_names → catalog.get total");
8645        for fk in &child.schema().foreign_keys {
8646            if fk.parent_table != parent_table_name {
8647                continue;
8648            }
8649            for (_pos, old_row, new_row) in plan_with_old {
8650                // Did any parent FK column change?
8651                let key_changed = fk
8652                    .parent_columns
8653                    .iter()
8654                    .any(|&pi| old_row.get(pi) != new_row.get(pi));
8655                if !key_changed {
8656                    continue;
8657                }
8658                // The OLD parent key — used to find referring children.
8659                let old_key: Vec<&Value> = fk
8660                    .parent_columns
8661                    .iter()
8662                    .map(|&pi| &old_row[pi])
8663                    .collect();
8664                if old_key.iter().any(|v| matches!(v, Value::Null)) {
8665                    // NULL parent has no children — skip.
8666                    continue;
8667                }
8668                let new_key: Vec<&Value> = fk
8669                    .parent_columns
8670                    .iter()
8671                    .map(|&pi| &new_row[pi])
8672                    .collect();
8673                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8674                    // Self-ref same-row updates: a row updating its
8675                    // own PK doesn't restrict itself.
8676                    if child_name == parent_table_name
8677                        && plan_with_old
8678                            .iter()
8679                            .any(|(p, _, _)| *p == child_row_idx)
8680                    {
8681                        continue;
8682                    }
8683                    let matches_key = fk
8684                        .local_columns
8685                        .iter()
8686                        .enumerate()
8687                        .all(|(i, &li)| child_row.values.get(li) == Some(old_key[i]));
8688                    if !matches_key {
8689                        continue;
8690                    }
8691                    match fk.on_update {
8692                        spg_storage::FkAction::Restrict
8693                        | spg_storage::FkAction::NoAction => {
8694                            return Err(EngineError::Unsupported(alloc::format!(
8695                                "FOREIGN KEY violation: UPDATE on {parent_table_name:?} PK is \
8696                                 restricted by FK from {child_name:?}.{:?}",
8697                                fk.local_columns,
8698                            )));
8699                        }
8700                        spg_storage::FkAction::Cascade => {
8701                            // Rewrite child FK columns to new key.
8702                            let entry = cascade_plan.entry(child_name.clone()).or_default();
8703                            for (i, &li) in fk.local_columns.iter().enumerate() {
8704                                entry.insert((child_row_idx, li), new_key[i].clone());
8705                            }
8706                        }
8707                        spg_storage::FkAction::SetNull => {
8708                            for &li in &fk.local_columns {
8709                                let col = child.schema().columns.get(li).ok_or_else(|| {
8710                                    EngineError::Unsupported(alloc::format!(
8711                                        "FK local column {li} missing in {child_name:?}"
8712                                    ))
8713                                })?;
8714                                if !col.nullable {
8715                                    return Err(EngineError::Unsupported(alloc::format!(
8716                                        "FOREIGN KEY ON UPDATE SET NULL: column \
8717                                         {child_name:?}.{:?} is NOT NULL",
8718                                        col.name,
8719                                    )));
8720                                }
8721                            }
8722                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8723                            for &li in &fk.local_columns {
8724                                entry.insert((child_row_idx, li));
8725                            }
8726                        }
8727                        spg_storage::FkAction::SetDefault => {
8728                            let entry =
8729                                setdefault_plan.entry(child_name.clone()).or_default();
8730                            for &li in &fk.local_columns {
8731                                let col = child.schema().columns.get(li).ok_or_else(|| {
8732                                    EngineError::Unsupported(alloc::format!(
8733                                        "FK local column {li} missing in {child_name:?}"
8734                                    ))
8735                                })?;
8736                                let default = col.default.clone().ok_or_else(|| {
8737                                    EngineError::Unsupported(alloc::format!(
8738                                        "FOREIGN KEY ON UPDATE SET DEFAULT: column \
8739                                         {child_name:?}.{:?} has no DEFAULT",
8740                                        col.name,
8741                                    ))
8742                                })?;
8743                                entry.insert((child_row_idx, li), default);
8744                            }
8745                        }
8746                    }
8747                }
8748            }
8749        }
8750    }
8751    // Flatten into FkChildStep list. UPDATE doesn't produce
8752    // DeleteSteps (CASCADE on UPDATE just rewrites FK values).
8753    let mut steps: Vec<FkChildStep> = Vec::new();
8754    for (child_table, entries) in cascade_plan {
8755        let mut positions = Vec::with_capacity(entries.len());
8756        let mut columns = Vec::with_capacity(entries.len());
8757        let mut defaults = Vec::with_capacity(entries.len());
8758        for ((p, c), v) in entries {
8759            positions.push(p);
8760            columns.push(c);
8761            defaults.push(v);
8762        }
8763        // We reuse `FkChildAction::SetDefault` for cascade-update:
8764        // both shapes are "write a known value into specific cells"
8765        // — `apply_per_cell_writes` doesn't care whether the value
8766        // came from a DEFAULT declaration or a new parent key.
8767        steps.push(FkChildStep {
8768            child_table,
8769            action: FkChildAction::SetDefault {
8770                positions,
8771                columns,
8772                defaults,
8773            },
8774        });
8775    }
8776    for (child_table, entries) in setnull_plan {
8777        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8778        steps.push(FkChildStep {
8779            child_table,
8780            action: FkChildAction::SetNull { positions, columns },
8781        });
8782    }
8783    for (child_table, entries) in setdefault_plan {
8784        let mut positions = Vec::with_capacity(entries.len());
8785        let mut columns = Vec::with_capacity(entries.len());
8786        let mut defaults = Vec::with_capacity(entries.len());
8787        for ((p, c), v) in entries {
8788            positions.push(p);
8789            columns.push(c);
8790            defaults.push(v);
8791        }
8792        steps.push(FkChildStep {
8793            child_table,
8794            action: FkChildAction::SetDefault {
8795                positions,
8796                columns,
8797                defaults,
8798            },
8799        });
8800    }
8801    let _ = delete_plan; // UPDATE never deletes children.
8802    Ok(steps)
8803}
8804
8805/// v7.6.5 — apply one FK child step to the catalog. Encapsulates
8806/// the three action variants so the DELETE executor stays a
8807/// simple loop over the planned steps.
8808fn apply_fk_child_step(
8809    catalog: &mut Catalog,
8810    step: &FkChildStep,
8811) -> Result<(), EngineError> {
8812    let child = catalog.get_mut(&step.child_table).ok_or_else(|| {
8813        EngineError::Storage(StorageError::TableNotFound {
8814            name: step.child_table.clone(),
8815        })
8816    })?;
8817    match &step.action {
8818        FkChildAction::Delete { positions } => {
8819            let _ = child.delete_rows(positions);
8820        }
8821        FkChildAction::SetNull { positions, columns } => {
8822            apply_per_cell_writes(child, positions, columns, |_| Value::Null)?;
8823        }
8824        FkChildAction::SetDefault {
8825            positions,
8826            columns,
8827            defaults,
8828        } => {
8829            apply_per_cell_writes(child, positions, columns, |i| defaults[i].clone())?;
8830        }
8831    }
8832    Ok(())
8833}
8834
8835/// v7.6.5 — write new values into selected child cells via
8836/// `Table::update_row` (the catalog's existing UPDATE entry).
8837/// Groups writes by row position so multi-column updates on the
8838/// same row only call `update_row` once. `value_for(i)` produces
8839/// the new value for the i-th (position, column) entry.
8840fn apply_per_cell_writes(
8841    child: &mut spg_storage::Table,
8842    positions: &[usize],
8843    columns: &[usize],
8844    mut value_for: impl FnMut(usize) -> Value,
8845) -> Result<(), EngineError> {
8846    use alloc::collections::BTreeMap;
8847    let mut by_row: BTreeMap<usize, Vec<(usize, Value)>> = BTreeMap::new();
8848    for i in 0..positions.len() {
8849        by_row
8850            .entry(positions[i])
8851            .or_default()
8852            .push((columns[i], value_for(i)));
8853    }
8854    for (pos, mutations) in by_row {
8855        let mut new_values = child.rows()[pos].values.clone();
8856        for (col, v) in mutations {
8857            if let Some(slot) = new_values.get_mut(col) {
8858                *slot = v;
8859            }
8860        }
8861        child
8862            .update_row(pos, new_values)
8863            .map_err(EngineError::Storage)?;
8864    }
8865    Ok(())
8866}
8867
8868fn fk_action_sql_to_storage(a: spg_sql::ast::FkAction) -> spg_storage::FkAction {
8869    match a {
8870        spg_sql::ast::FkAction::Restrict => spg_storage::FkAction::Restrict,
8871        spg_sql::ast::FkAction::Cascade => spg_storage::FkAction::Cascade,
8872        spg_sql::ast::FkAction::SetNull => spg_storage::FkAction::SetNull,
8873        spg_sql::ast::FkAction::SetDefault => spg_storage::FkAction::SetDefault,
8874        spg_sql::ast::FkAction::NoAction => spg_storage::FkAction::NoAction,
8875    }
8876}
8877
8878/// v7.9.21 — resolve a column's DEFAULT for INSERT-time
8879/// default-fill. Free fn (rather than `&self`) so callers
8880/// with an active `&mut Table` borrow can still use it.
8881/// Literal defaults take the cached path (`col.default`);
8882/// runtime defaults hit `clock_fn` at each call. mailrs G4.
8883fn resolve_column_default_free(
8884    col: &ColumnSchema,
8885    clock_fn: Option<ClockFn>,
8886) -> Result<Value, EngineError> {
8887    if let Some(rt) = &col.runtime_default {
8888        return eval_runtime_default_free(rt, col.ty, clock_fn);
8889    }
8890    Ok(col.default.clone().unwrap_or(Value::Null))
8891}
8892
8893fn eval_runtime_default_free(
8894    rt: &str,
8895    ty: DataType,
8896    clock_fn: Option<ClockFn>,
8897) -> Result<Value, EngineError> {
8898    let s = rt.trim().to_ascii_lowercase();
8899    let canonical = s.trim_end_matches("()");
8900    let now_us = match clock_fn {
8901        Some(f) => f(),
8902        None => 0,
8903    };
8904    let v = match canonical {
8905        "now" | "current_timestamp" | "localtimestamp" => {
8906            Value::Timestamp(now_us)
8907        }
8908        "current_date" => Value::Date((now_us / 86_400_000_000) as i32),
8909        "current_time" | "localtime" => Value::Timestamp(now_us),
8910        other => {
8911            return Err(EngineError::Unsupported(alloc::format!(
8912                "runtime DEFAULT expression {other:?} not supported \
8913                 (v7.9.21 whitelist: now() / current_timestamp / \
8914                 current_date / current_time / localtimestamp / \
8915                 localtime)"
8916            )));
8917        }
8918    };
8919    coerce_value(v, ty, "DEFAULT", 0)
8920}
8921
8922/// v7.9.21 — true when a DEFAULT expression needs INSERT-time
8923/// evaluation rather than being cacheable as a literal Value.
8924/// FunctionCall is the immediate case (`now()`,
8925/// `current_timestamp`). Literal expressions and simple sign-
8926/// flipped numerics still take the static-cache path.
8927fn is_runtime_default_expr(expr: &Expr) -> bool {
8928    match expr {
8929        Expr::FunctionCall { .. } => true,
8930        Expr::Unary { expr, .. } => is_runtime_default_expr(expr),
8931        _ => false,
8932    }
8933}
8934
8935fn column_def_to_schema(c: ColumnDef) -> Result<ColumnSchema, EngineError> {
8936    let ty = column_type_to_data_type(c.ty);
8937    let mut schema = ColumnSchema::new(c.name.clone(), ty, c.nullable);
8938    if let Some(default_expr) = c.default {
8939        // v7.9.21 — distinguish literal defaults (evaluated once
8940        // at CREATE TABLE) from expression defaults (deferred to
8941        // INSERT). Function calls (`now()`, `current_timestamp`
8942        // — see v7.9.20 keyword promotion) take the runtime path.
8943        // Literals continue to cache. mailrs G4.
8944        if is_runtime_default_expr(&default_expr) {
8945            let display = alloc::format!("{default_expr}");
8946            schema = schema.with_runtime_default(display);
8947        } else {
8948            let raw = literal_expr_to_value(default_expr)?;
8949            let coerced = coerce_value(raw, ty, &c.name, 0)?;
8950            schema = schema.with_default(coerced);
8951        }
8952    }
8953    if c.auto_increment {
8954        // AUTO_INCREMENT only makes sense on integer-shaped columns.
8955        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
8956            return Err(EngineError::Unsupported(alloc::format!(
8957                "AUTO_INCREMENT requires an integer column type, got {ty:?}"
8958            )));
8959        }
8960        schema = schema.with_auto_increment();
8961    }
8962    Ok(schema)
8963}
8964
/// v7.10.4 — decode a BYTEA literal. Accepts:
///   * `\xDEADBEEF` (case-insensitive hex; whitespace stripped)
///   * `Hello\000world` (backslash escape form; `\\` for literal backslash)
///   * Anything else → raw UTF-8 bytes of the input (PG accepts this too).
///
/// The input is trimmed before decoding. In the escape form only
/// `\\` and a backslash followed by exactly three octal digits in
/// the range `000`–`377` are decoded; any other backslash sequence
/// (including ones containing the digits 8 or 9, or a value above
/// `377`) is copied through byte-for-byte.
///
/// # Errors
///
/// Hex form only: `"odd-length hex literal"` when the digit count is
/// odd, `"invalid hex digit"` for a non-hex character.
fn decode_bytea_literal(s: &str) -> Result<alloc::vec::Vec<u8>, &'static str> {
    let s = s.trim();
    if let Some(hex) = s.strip_prefix("\\x").or_else(|| s.strip_prefix("\\X")) {
        // Hex form. Each pair of hex digits → one byte.
        let cleaned: alloc::string::String = hex.chars().filter(|c| !c.is_whitespace()).collect();
        if cleaned.len() % 2 != 0 {
            return Err("odd-length hex literal");
        }
        return cleaned
            .as_bytes()
            .chunks_exact(2)
            .map(|pair| -> Result<u8, &'static str> {
                Ok((hex_nibble(pair[0])? << 4) | hex_nibble(pair[1])?)
            })
            .collect();
    }
    // Escape form or raw. Walk byte-by-byte; `\\` and `\NNN` octal
    // sequences decode; anything else is a literal byte.
    let bytes = s.as_bytes();
    let mut out = alloc::vec::Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        match &bytes[i..] {
            [b'\\', b'\\', ..] => {
                out.push(b'\\');
                i += 2;
            }
            // Octal digits are 0–7 only; capping the first digit at 3
            // keeps the value within one byte (max `\377` = 0xFF).
            [b'\\', d1 @ b'0'..=b'3', d2 @ b'0'..=b'7', d3 @ b'0'..=b'7', ..] => {
                out.push(((d1 - b'0') << 6) | ((d2 - b'0') << 3) | (d3 - b'0'));
                i += 4;
            }
            [b, ..] => {
                out.push(*b);
                i += 1;
            }
            [] => break,
        }
    }
    Ok(out)
}

/// Value of one ASCII hex digit (`0-9`, `a-f`, `A-F`), or
/// `"invalid hex digit"` for any other byte.
fn hex_nibble(b: u8) -> Result<u8, &'static str> {
    match b {
        b'0'..=b'9' => Ok(b - b'0'),
        b'a'..=b'f' => Ok(b - b'a' + 10),
        b'A'..=b'F' => Ok(b - b'A' + 10),
        _ => Err("invalid hex digit"),
    }
}
9026
/// v7.10.11 — decode a PG TEXT[] external array form
/// (`{a,b,NULL}` with optional double-quoted elements). The
/// engine takes a leading/trailing `{`/`}` and splits at commas.
/// Quoted elements (`"hello, world"`) preserve embedded commas;
/// inside quotes a backslash makes the following byte literal, so
/// `\\` and `\"` decode to backslash / quote. Plain unquoted `NULL`
/// (case-insensitive) maps to `None`; unquoted elements are
/// trimmed. An empty (or all-whitespace) body yields an empty array.
///
/// Quoted elements are assembled as bytes and decoded as UTF-8, so
/// multi-byte characters survive intact.
///
/// # Errors
///
/// Returns a static message when the braces are missing, a quoted
/// element is unterminated, or something other than `,` follows a
/// quoted element.
fn decode_text_array_literal(
    s: &str,
) -> Result<alloc::vec::Vec<Option<alloc::string::String>>, &'static str> {
    let trimmed = s.trim();
    let inner = trimmed
        .strip_prefix('{')
        .and_then(|x| x.strip_suffix('}'))
        .ok_or("TEXT[] literal must be enclosed in '{...}'")?;
    let mut out: alloc::vec::Vec<Option<alloc::string::String>> = alloc::vec::Vec::new();
    if inner.trim().is_empty() {
        return Ok(out);
    }
    let bytes = inner.as_bytes();
    let mut i = 0;
    // `<=` so a trailing comma still produces a final (empty) element.
    while i <= bytes.len() {
        // Skip leading whitespace.
        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
            i += 1;
        }
        // Quoted element.
        if i < bytes.len() && bytes[i] == b'"' {
            i += 1; // open quote
            // Collect raw bytes: pushing `byte as char` would turn each
            // byte of a multi-byte UTF-8 sequence into its own char.
            let mut buf: alloc::vec::Vec<u8> = alloc::vec::Vec::new();
            while i < bytes.len() && bytes[i] != b'"' {
                if bytes[i] == b'\\' && i + 1 < bytes.len() {
                    buf.push(bytes[i + 1]);
                    i += 2;
                } else {
                    buf.push(bytes[i]);
                    i += 1;
                }
            }
            if i >= bytes.len() {
                return Err("unterminated quoted element");
            }
            i += 1; // close quote
            // Only ASCII backslashes were dropped from valid UTF-8
            // input, so this conversion is not expected to fail.
            let text = alloc::string::String::from_utf8(buf)
                .map_err(|_| "invalid UTF-8 in quoted TEXT[] element")?;
            out.push(Some(text));
        } else {
            // Unquoted element — read until next comma or end.
            let start = i;
            while i < bytes.len() && bytes[i] != b',' {
                i += 1;
            }
            let raw = inner[start..i].trim();
            if raw.eq_ignore_ascii_case("NULL") {
                out.push(None);
            } else {
                out.push(Some(alloc::string::ToString::to_string(raw)));
            }
        }
        // Skip whitespace, expect comma or end.
        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
            i += 1;
        }
        if i >= bytes.len() {
            break;
        }
        if bytes[i] != b',' {
            return Err("expected ',' between TEXT[] elements");
        }
        i += 1;
    }
    Ok(out)
}
9097
/// v7.10.11 — encode a TEXT[] back into the PG external array
/// form. NULL elements become the literal `NULL`. An element is
/// double-quoted (with `\\` / `\"` escapes) when it is empty,
/// spells `NULL` in any case, or contains a comma, brace, quote,
/// backslash, or any whitespace character. Quoting on every
/// whitespace character — not only space and tab — keeps elements
/// with leading/trailing newlines intact when the text is parsed
/// back, since unquoted elements are trimmed on decode.
fn encode_text_array(items: &[Option<alloc::string::String>]) -> alloc::string::String {
    let mut out = alloc::string::String::with_capacity(2 + items.len() * 8);
    out.push('{');
    for (i, item) in items.iter().enumerate() {
        if i > 0 {
            out.push(',');
        }
        match item {
            None => out.push_str("NULL"),
            Some(s) => {
                let needs_quote = s.is_empty()
                    || s.eq_ignore_ascii_case("NULL")
                    || s.chars().any(|c| {
                        matches!(c, ',' | '{' | '}' | '"' | '\\') || c.is_whitespace()
                    });
                if needs_quote {
                    out.push('"');
                    for c in s.chars() {
                        // Only quote and backslash need escaping inside quotes.
                        if c == '"' || c == '\\' {
                            out.push('\\');
                        }
                        out.push(c);
                    }
                    out.push('"');
                } else {
                    out.push_str(s);
                }
            }
        }
    }
    out.push('}');
    out
}
9135
/// v7.10.4 — render BYTEA bytes in PG hex output format: a `\x`
/// prefix followed by two lowercase hex digits per byte (high
/// nibble first). Used by Text-side round-trip + the wire layer's
/// text-mode encoder.
fn encode_bytea_hex(b: &[u8]) -> alloc::string::String {
    let mut text = alloc::string::String::with_capacity(2 + 2 * b.len());
    text.push('\\');
    text.push('x');
    for &octet in b {
        text.push(hex_digit(octet >> 4));
        text.push(hex_digit(octet & 0x0F));
    }
    text
}

/// Lowercase hex character for a nibble value `0..=15`; any larger
/// input maps to `'?'`.
const fn hex_digit(n: u8) -> char {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    if n < 16 {
        DIGITS[n as usize] as char
    } else {
        '?'
    }
}
9158
9159const fn column_type_to_data_type(t: ColumnTypeName) -> DataType {
9160    match t {
9161        ColumnTypeName::SmallInt => DataType::SmallInt,
9162        ColumnTypeName::Int => DataType::Int,
9163        ColumnTypeName::BigInt => DataType::BigInt,
9164        ColumnTypeName::Float => DataType::Float,
9165        ColumnTypeName::Text => DataType::Text,
9166        ColumnTypeName::Varchar(n) => DataType::Varchar(n),
9167        ColumnTypeName::Char(n) => DataType::Char(n),
9168        ColumnTypeName::Bool => DataType::Bool,
9169        ColumnTypeName::Vector { dim, encoding } => DataType::Vector {
9170            dim,
9171            encoding: match encoding {
9172                SqlVecEncoding::F32 => VecEncoding::F32,
9173                SqlVecEncoding::Sq8 => VecEncoding::Sq8,
9174                SqlVecEncoding::F16 => VecEncoding::F16,
9175            },
9176        },
9177        ColumnTypeName::Numeric(precision, scale) => DataType::Numeric { precision, scale },
9178        ColumnTypeName::Date => DataType::Date,
9179        ColumnTypeName::Timestamp => DataType::Timestamp,
9180        ColumnTypeName::Timestamptz => DataType::Timestamptz,
9181        ColumnTypeName::Json => DataType::Json,
9182        ColumnTypeName::Jsonb => DataType::Jsonb,
9183        ColumnTypeName::Bytes => DataType::Bytes,
9184        ColumnTypeName::TextArray => DataType::TextArray,
9185    }
9186}
9187
9188/// Convert an INSERT VALUES expression to a storage Value. Supports literal
9189/// expressions, unary-minus over numeric literals, and pgvector-style
9190/// `'[..]'::vector` cast (v1.2). Anything more complex returns `Unsupported`.
9191fn literal_expr_to_value(expr: Expr) -> Result<Value, EngineError> {
9192    match expr {
9193        Expr::Literal(l) => Ok(literal_to_value(l)),
9194        Expr::Cast { expr, target } => {
9195            let inner_value = literal_expr_to_value(*expr)?;
9196            crate::eval::cast_value(inner_value, target).map_err(EngineError::Eval)
9197        }
9198        Expr::Unary {
9199            op: UnOp::Neg,
9200            expr,
9201        } => match *expr {
9202            Expr::Literal(Literal::Integer(n)) => {
9203                // Fold to i32 if it fits, else BigInt. Parser emits Integer(i64)
9204                // — overflow on negate of i64::MIN is the one edge case.
9205                let neg = n.checked_neg().ok_or_else(|| {
9206                    EngineError::Unsupported("integer literal overflow on negation".into())
9207                })?;
9208                Ok(int_value_for(neg))
9209            }
9210            Expr::Literal(Literal::Float(x)) => Ok(Value::Float(-x)),
9211            other => Err(EngineError::Unsupported(alloc::format!(
9212                "unary minus over non-literal expression: {other:?}"
9213            ))),
9214        },
9215        // v7.10.10 — `ARRAY[lit, lit, …]` constructor accepted at
9216        // INSERT-time. Each element must reduce to a Value through
9217        // `literal_expr_to_value`; NULL elements become `None`.
9218        // Casts (e.g. `ARRAY[]::TEXT[]`) flow through the outer
9219        // Cast arm before reaching here.
9220        Expr::Array(items) => {
9221            let mut out: alloc::vec::Vec<Option<alloc::string::String>> =
9222                alloc::vec::Vec::with_capacity(items.len());
9223            for elem in items {
9224                match literal_expr_to_value(elem)? {
9225                    Value::Null => out.push(None),
9226                    Value::Text(s) => out.push(Some(s)),
9227                    other => out.push(Some(alloc::format!("{other:?}"))),
9228                }
9229            }
9230            Ok(Value::TextArray(out))
9231        }
9232        other => Err(EngineError::Unsupported(alloc::format!(
9233            "non-literal INSERT value expression: {other:?}"
9234        ))),
9235    }
9236}
9237
9238fn literal_to_value(l: Literal) -> Value {
9239    match l {
9240        Literal::Integer(n) => int_value_for(n),
9241        Literal::Float(x) => Value::Float(x),
9242        Literal::String(s) => Value::Text(s),
9243        Literal::Bool(b) => Value::Bool(b),
9244        Literal::Null => Value::Null,
9245        Literal::Vector(v) => Value::Vector(v),
9246        Literal::Interval { months, micros, .. } => Value::Interval { months, micros },
9247    }
9248}
9249
9250/// Pick `Int` (`i32`) when the literal fits, else `BigInt`. `INT` vs `BIGINT`
9251/// columns will still enforce the right tag downstream — this is just the
9252/// default we synthesise from an unannotated integer literal.
9253fn int_value_for(n: i64) -> Value {
9254    if let Ok(small) = i32::try_from(n) {
9255        Value::Int(small)
9256    } else {
9257        Value::BigInt(n)
9258    }
9259}
9260
9261/// Widen / narrow `v` to fit `expected`. Numerics permit safe widening
9262/// (`Int → BigInt`, `Int/BigInt → Float`) and best-effort narrowing
9263/// (`BigInt → Int` succeeds only when the value fits in `i32`). Everything
9264/// else returns `TypeMismatch` carrying the column name for caller diagnostics.
9265/// `NULL` is always permitted; the nullability check happens later in storage.
9266#[allow(clippy::too_many_lines)]
9267fn coerce_value(
9268    v: Value,
9269    expected: DataType,
9270    col_name: &str,
9271    position: usize,
9272) -> Result<Value, EngineError> {
9273    if v.is_null() {
9274        return Ok(Value::Null);
9275    }
9276    let actual = v.data_type().expect("non-null");
9277    if actual == expected {
9278        return Ok(v);
9279    }
9280    let coerced =
9281        match (v, expected) {
9282            (Value::Int(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
9283            (Value::Int(n), DataType::Float) => Some(Value::Float(f64::from(n))),
9284            (Value::Int(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
9285            (Value::Int(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
9286                i128::from(n),
9287                precision,
9288                scale,
9289                col_name,
9290            )?),
9291            (Value::SmallInt(n), DataType::Int) => Some(Value::Int(i32::from(n))),
9292            (Value::SmallInt(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
9293            (Value::SmallInt(n), DataType::Float) => Some(Value::Float(f64::from(n))),
9294            (Value::SmallInt(n), DataType::Numeric { precision, scale }) => Some(
9295                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
9296            ),
9297            (Value::BigInt(n), DataType::Int) => i32::try_from(n).ok().map(Value::Int),
9298            (Value::BigInt(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
9299            #[allow(clippy::cast_precision_loss)]
9300            (Value::BigInt(n), DataType::Float) => Some(Value::Float(n as f64)),
9301            (Value::BigInt(n), DataType::Numeric { precision, scale }) => Some(
9302                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
9303            ),
9304            (Value::Float(x), DataType::Numeric { precision, scale }) => {
9305                Some(numeric_from_float(x, precision, scale, col_name)?)
9306            }
9307            // Text → DATE / TIMESTAMP: parse canonical text forms.
9308            (Value::Text(s), DataType::Date) => {
9309                let d = eval::parse_date_literal(&s).ok_or_else(|| {
9310                    EngineError::Eval(EvalError::TypeMismatch {
9311                        detail: alloc::format!(
9312                            "cannot parse {s:?} as DATE for column `{col_name}`"
9313                        ),
9314                    })
9315                })?;
9316                Some(Value::Date(d))
9317            }
9318            // v4.9: Text ↔ JSON coercion. No structural validation —
9319            // any text literal is accepted; the responsibility for
9320            // valid JSON lies with the producer.
9321            (Value::Text(s), DataType::Json | DataType::Jsonb) => Some(Value::Json(s)),
9322            (Value::Json(s), DataType::Text) => Some(Value::Text(s)),
9323            // v7.10.4 — Text → BYTEA. Decode PG-style literal forms:
9324            //   - Hex:    `\x48656c6c6f`  (case-insensitive hex pairs)
9325            //   - Escape: `Hello\\000world`  (backslash + octal triples)
9326            //   - Plain:  any string → raw UTF-8 bytes (PG also accepts)
9327            // Errors surface as TypeMismatch so the operator gets a
9328            // clear "this literal isn't a bytea literal" hint.
9329            (Value::Text(s), DataType::Bytes) => {
9330                let bytes = decode_bytea_literal(&s).map_err(|e| {
9331                    EngineError::Eval(EvalError::TypeMismatch {
9332                        detail: alloc::format!(
9333                            "cannot parse {s:?} as BYTEA for column `{col_name}`: {e}"
9334                        ),
9335                    })
9336                })?;
9337                Some(Value::Bytes(bytes))
9338            }
9339            // v7.10.4 — BYTEA → Text round-trip uses the PG hex
9340            // output (lowercase, `\x` prefix). Important when a
9341            // SELECT pulls a bytea cell through a Text column path.
9342            (Value::Bytes(b), DataType::Text) => Some(Value::Text(encode_bytea_hex(&b))),
9343            // v7.10.11 — Text → TEXT[]. Decode PG's external array
9344            // form `'{a,b,NULL}'`. NULL element token (case-insensitive)
9345            // is the literal `NULL`; everything else is a quoted or
9346            // unquoted text element. mailrs `'{label1,label2}'::TEXT[]`.
9347            (Value::Text(s), DataType::TextArray) => {
9348                let arr = decode_text_array_literal(&s).map_err(|e| {
9349                    EngineError::Eval(EvalError::TypeMismatch {
9350                        detail: alloc::format!(
9351                            "cannot parse {s:?} as TEXT[] for column `{col_name}`: {e}"
9352                        ),
9353                    })
9354                })?;
9355                Some(Value::TextArray(arr))
9356            }
9357            // v7.10.11 — TEXT[] → Text round-trip uses PG's
9358            // external array form (`{a,b,NULL}`). Lets a SELECT
9359            // pull an array column through any Text-side codepath.
9360            (Value::TextArray(items), DataType::Text) => {
9361                Some(Value::Text(encode_text_array(&items)))
9362            }
9363            (Value::Text(s), DataType::Timestamp | DataType::Timestamptz) => {
9364                let t = eval::parse_timestamp_literal(&s).ok_or_else(|| {
9365                    EngineError::Eval(EvalError::TypeMismatch {
9366                        detail: alloc::format!(
9367                            "cannot parse {s:?} as TIMESTAMP for column `{col_name}`"
9368                        ),
9369                    })
9370                })?;
9371                Some(Value::Timestamp(t))
9372            }
9373            // DATE ↔ TIMESTAMP convertibility (DATE → midnight,
9374            // TIMESTAMP → day truncation).
9375            (Value::Date(d), DataType::Timestamp | DataType::Timestamptz) => {
9376                Some(Value::Timestamp(i64::from(d) * 86_400_000_000))
9377            }
9378            // v7.9.21 — Value::Timestamp lands in either Timestamp
9379            // or Timestamptz columns; the on-disk layout is the
9380            // same i64 microseconds UTC.
9381            (Value::Timestamp(t), DataType::Timestamptz) => Some(Value::Timestamp(t)),
9382            (Value::Timestamp(t), DataType::Date) => {
9383                let days = t.div_euclid(86_400_000_000);
9384                i32::try_from(days).ok().map(Value::Date)
9385            }
9386            (
9387                Value::Numeric {
9388                    scaled,
9389                    scale: src_scale,
9390                },
9391                DataType::Numeric { precision, scale },
9392            ) => Some(numeric_rescale(
9393                scaled, src_scale, precision, scale, col_name,
9394            )?),
9395            #[allow(clippy::cast_precision_loss)]
9396            (Value::Numeric { scaled, scale }, DataType::Float) => {
9397                let mut div = 1.0_f64;
9398                for _ in 0..scale {
9399                    div *= 10.0;
9400                }
9401                Some(Value::Float((scaled as f64) / div))
9402            }
9403            (Value::Numeric { scaled, scale }, DataType::Int) => {
9404                let truncated = numeric_truncate_to_integer(scaled, scale);
9405                i32::try_from(truncated).ok().map(Value::Int)
9406            }
9407            (Value::Numeric { scaled, scale }, DataType::BigInt) => {
9408                let truncated = numeric_truncate_to_integer(scaled, scale);
9409                i64::try_from(truncated).ok().map(Value::BigInt)
9410            }
9411            (Value::Numeric { scaled, scale }, DataType::SmallInt) => {
9412                let truncated = numeric_truncate_to_integer(scaled, scale);
9413                i16::try_from(truncated).ok().map(Value::SmallInt)
9414            }
9415            // VARCHAR(n) enforces an upper bound on character count.
9416            (Value::Text(s), DataType::Varchar(max)) => {
9417                if u32::try_from(s.chars().count()).unwrap_or(u32::MAX) <= max {
9418                    Some(Value::Text(s))
9419                } else {
9420                    return Err(EngineError::Unsupported(alloc::format!(
9421                        "value for VARCHAR({max}) column `{col_name}` exceeds length: \
9422                     {} chars",
9423                        s.chars().count()
9424                    )));
9425                }
9426            }
9427            // v6.0.1: f32 → SQ8 INSERT-time quantisation. Triggered
9428            // when the column declares `VECTOR(N) USING SQ8` and
9429            // the INSERT VALUES expression yields a raw f32 vector
9430            // (the normal pgvector-shape literal). Dim mismatch
9431            // falls through the `_ => None` arm and surfaces as
9432            // `TypeMismatch` with the expected SQ8 column type —
9433            // matching the F32 path's existing error.
9434            (
9435                Value::Vector(v),
9436                DataType::Vector {
9437                    dim,
9438                    encoding: VecEncoding::Sq8,
9439                },
9440            ) if v.len() == dim as usize => {
9441                Some(Value::Sq8Vector(spg_storage::quantize::quantize(&v)))
9442            }
9443            // v6.0.3: f32 → f16 INSERT-time conversion for HALF
9444            // columns. Bit-exact at the storage layer (modulo
9445            // half-precision rounding); no rerank pass needed at
9446            // search time.
9447            (
9448                Value::Vector(v),
9449                DataType::Vector {
9450                    dim,
9451                    encoding: VecEncoding::F16,
9452                },
9453            ) if v.len() == dim as usize => Some(Value::HalfVector(
9454                spg_storage::halfvec::HalfVector::from_f32_slice(&v),
9455            )),
9456            // CHAR(n) right-pads with U+0020 to exactly n chars; if the input
9457            // is already longer we reject (PG truncates trailing-space-only;
9458            // staying strict for v1).
9459            (Value::Text(s), DataType::Char(size)) => {
9460                let len = u32::try_from(s.chars().count()).unwrap_or(u32::MAX);
9461                if len > size {
9462                    return Err(EngineError::Unsupported(alloc::format!(
9463                        "value for CHAR({size}) column `{col_name}` exceeds length: \
9464                     {len} chars"
9465                    )));
9466                }
9467                let need = (size - len) as usize;
9468                let mut padded = s;
9469                padded.reserve(need);
9470                for _ in 0..need {
9471                    padded.push(' ');
9472                }
9473                Some(Value::Text(padded))
9474            }
9475            _ => None,
9476        };
9477    coerced.ok_or(EngineError::Storage(StorageError::TypeMismatch {
9478        column: col_name.into(),
9479        expected,
9480        actual,
9481        position,
9482    }))
9483}
9484
9485#[cfg(test)]
9486mod tests {
9487    use super::*;
9488    use alloc::vec;
9489
9490    fn unwrap_command_ok(r: &QueryResult) -> usize {
9491        match r {
9492            QueryResult::CommandOk { affected, .. } => *affected,
9493            QueryResult::Rows { .. } => panic!("expected CommandOk, got Rows"),
9494        }
9495    }
9496
9497    #[test]
9498    fn create_table_registers_schema() {
9499        let mut e = Engine::new();
9500        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT)")
9501            .unwrap();
9502        assert_eq!(e.catalog().table_count(), 1);
9503        let t = e.catalog().get("foo").unwrap();
9504        assert_eq!(t.schema().columns.len(), 2);
9505        assert_eq!(t.schema().columns[0].ty, DataType::Int);
9506        assert!(!t.schema().columns[0].nullable);
9507        assert_eq!(t.schema().columns[1].ty, DataType::Text);
9508    }
9509
9510    #[test]
9511    fn create_table_vector_default_is_f32_encoded() {
9512        let mut e = Engine::new();
9513        e.execute("CREATE TABLE t (v VECTOR(8))").unwrap();
9514        let t = e.catalog().get("t").unwrap();
9515        assert_eq!(
9516            t.schema().columns[0].ty,
9517            DataType::Vector {
9518                dim: 8,
9519                encoding: VecEncoding::F32,
9520            },
9521        );
9522    }
9523
9524    #[test]
9525    fn create_table_vector_using_sq8_succeeds() {
9526        // v6.0.1 step 3: the step-1 fence in `column_def_to_schema`
9527        // is lifted. CREATE TABLE persists an SQ8 column type in
9528        // the catalog; INSERT (next test) quantises raw f32 input.
9529        let mut e = Engine::new();
9530        e.execute("CREATE TABLE t (v VECTOR(8) USING SQ8)").unwrap();
9531        let t = e.catalog().get("t").unwrap();
9532        assert_eq!(
9533            t.schema().columns[0].ty,
9534            DataType::Vector {
9535                dim: 8,
9536                encoding: VecEncoding::Sq8,
9537            },
9538        );
9539    }
9540
9541    #[test]
9542    fn insert_into_sq8_column_quantises_f32_payload() {
9543        // v6.0.1 step 3: INSERT-time `coerce_value` rewrites a raw
9544        // `Value::Vector(Vec<f32>)` literal into the column's
9545        // quantised representation. The row that lands in the
9546        // catalog must therefore hold a `Value::Sq8Vector`, not the
9547        // original f32 buffer — that's the bit that delivers the
9548        // 4× compression target.
9549        let mut e = Engine::new();
9550        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
9551        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
9552            .unwrap();
9553        let t = e.catalog().get("t").unwrap();
9554        assert_eq!(t.rows().len(), 1);
9555        match &t.rows()[0].values[0] {
9556            Value::Sq8Vector(q) => {
9557                assert_eq!(q.bytes.len(), 4);
9558                // min/max are derived from the payload: min=0.0, max=1.0.
9559                assert!((q.min - 0.0).abs() < 1e-6);
9560                assert!((q.max - 1.0).abs() < 1e-6);
9561            }
9562            other => panic!("expected Sq8Vector cell, got {other:?}"),
9563        }
9564    }
9565
9566    #[test]
9567    fn create_table_vector_using_half_succeeds_and_insert_converts_to_f16() {
9568        // v6.0.3: CREATE TABLE accepts USING HALF; INSERT path
9569        // converts the incoming `Value::Vector(Vec<f32>)` cell
9570        // into `Value::HalfVector(HalfVector)` via the new
9571        // `coerce_value` arm. The dequantised round-trip is
9572        // bit-exact for f16-representable values, so 0.0 / 0.25
9573        // / 0.5 / 1.0 hit their grid points exactly.
9574        let mut e = Engine::new();
9575        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
9576            .unwrap();
9577        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
9578            .unwrap();
9579        let t = e.catalog().get("t").unwrap();
9580        assert_eq!(t.rows().len(), 1);
9581        match &t.rows()[0].values[0] {
9582            Value::HalfVector(h) => {
9583                assert_eq!(h.dim(), 4);
9584                let back = h.to_f32_vec();
9585                let expected = alloc::vec![0.0_f32, 0.25, 0.5, 1.0];
9586                for (g, e) in back.iter().zip(expected.iter()) {
9587                    assert!(
9588                        (g - e).abs() < 1e-6,
9589                        "{g} vs {e} should be exact on f16 grid"
9590                    );
9591                }
9592            }
9593            other => panic!("expected HalfVector cell, got {other:?}"),
9594        }
9595    }
9596
9597    #[test]
9598    fn alter_index_rebuild_in_place_succeeds() {
9599        // v6.0.4: bare REBUILD (no encoding switch) walks every
9600        // row again to rebuild the NSW graph. Verifies the engine
9601        // dispatch + storage helper plumbing without changing any
9602        // cell encoding.
9603        let mut e = Engine::new();
9604        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
9605            .unwrap();
9606        for i in 0..8_i32 {
9607            #[allow(clippy::cast_precision_loss)]
9608            let base = (i as f32) * 0.1;
9609            e.execute(&alloc::format!(
9610                "INSERT INTO t VALUES ({i}, [{base}, {b1}, {b2}])",
9611                b1 = base + 0.01,
9612                b2 = base + 0.02,
9613            ))
9614            .unwrap();
9615        }
9616        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
9617        e.execute("ALTER INDEX t_idx REBUILD").unwrap();
9618        // Schema encoding stays F32 (no encoding clause).
9619        assert_eq!(
9620            e.catalog().get("t").unwrap().schema().columns[1].ty,
9621            DataType::Vector {
9622                dim: 3,
9623                encoding: VecEncoding::F32,
9624            },
9625        );
9626    }
9627
9628    #[test]
9629    fn alter_index_rebuild_with_encoding_switches_cell_type() {
9630        // v6.0.4: REBUILD WITH (encoding = SQ8) recodes every
9631        // stored cell from F32 → SQ8 + rebuilds the graph atop the
9632        // new encoding. Post-rebuild, cells must be Sq8Vector and
9633        // the schema must report encoding = Sq8.
9634        let mut e = Engine::new();
9635        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(4) NOT NULL)")
9636            .unwrap();
9637        e.execute("INSERT INTO t VALUES (1, [0.0, 0.25, 0.5, 1.0])")
9638            .unwrap();
9639        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
9640        e.execute("ALTER INDEX t_idx REBUILD WITH (encoding = SQ8)")
9641            .unwrap();
9642        let t = e.catalog().get("t").unwrap();
9643        assert_eq!(
9644            t.schema().columns[1].ty,
9645            DataType::Vector {
9646                dim: 4,
9647                encoding: VecEncoding::Sq8,
9648            },
9649        );
9650        assert!(matches!(t.rows()[0].values[1], Value::Sq8Vector(_)));
9651    }
9652
9653    #[test]
9654    fn alter_index_rebuild_unknown_index_errors() {
9655        let mut e = Engine::new();
9656        let err = e.execute("ALTER INDEX nope REBUILD").unwrap_err();
9657        assert!(
9658            matches!(
9659                &err,
9660                EngineError::Storage(StorageError::IndexNotFound { name }) if name == "nope"
9661            ),
9662            "got: {err}"
9663        );
9664    }
9665
9666    #[test]
9667    fn alter_index_rebuild_on_btree_index_errors() {
9668        // REBUILD on a B-tree index has no semantic meaning in
9669        // v6.0.4 — rejected at the storage layer with `Unsupported`.
9670        let mut e = Engine::new();
9671        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9672        e.execute("INSERT INTO t VALUES (1)").unwrap();
9673        e.execute("CREATE INDEX t_idx ON t (id)").unwrap();
9674        let err = e.execute("ALTER INDEX t_idx REBUILD").unwrap_err();
9675        assert!(
9676            matches!(&err, EngineError::Storage(StorageError::Unsupported(_))),
9677            "got: {err}"
9678        );
9679    }
9680
9681    #[test]
9682    fn prepared_insert_substitutes_placeholders() {
9683        // v6.1.1: prepare() parses once; execute_prepared() walks the
9684        // AST and replaces $1/$2 with the param Values BEFORE the
9685        // dispatch sees them. Same logical result as a simple-query
9686        // INSERT, but parse happens once per *statement*, not per
9687        // execution.
9688        let mut e = Engine::new();
9689        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT NOT NULL)")
9690            .unwrap();
9691        let stmt = e.prepare("INSERT INTO t VALUES ($1, $2)").unwrap();
9692        for (id, name) in [(1, "alice"), (2, "bob"), (3, "carol")] {
9693            e.execute_prepared(
9694                stmt.clone(),
9695                &[Value::Int(id), Value::Text(name.into())],
9696            )
9697            .unwrap();
9698        }
9699        // Read back via simple-query SELECT.
9700        let rows_result = e.execute("SELECT id, name FROM t").unwrap();
9701        let QueryResult::Rows { rows, .. } = rows_result else {
9702            panic!("expected Rows")
9703        };
9704        assert_eq!(rows.len(), 3);
9705    }
9706
9707    #[test]
9708    fn prepared_select_with_placeholder_filters_rows() {
9709        let mut e = Engine::new();
9710        e.execute("CREATE TABLE t (id INT NOT NULL, v INT NOT NULL)")
9711            .unwrap();
9712        for i in 0..10_i32 {
9713            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, {})", i * 7))
9714                .unwrap();
9715        }
9716        let stmt = e
9717            .prepare("SELECT id FROM t WHERE v = $1")
9718            .unwrap();
9719        let QueryResult::Rows { rows, .. } = e
9720            .execute_prepared(stmt, &[Value::Int(35)])
9721            .unwrap()
9722        else {
9723            panic!("expected Rows")
9724        };
9725        // v = 35 means i*7 = 35 → i = 5.
9726        assert_eq!(rows.len(), 1);
9727        assert_eq!(rows[0].values[0], Value::Int(5));
9728    }
9729
9730    #[test]
9731    fn prepared_too_few_params_errors() {
9732        let mut e = Engine::new();
9733        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9734        let stmt = e.prepare("INSERT INTO t VALUES ($1)").unwrap();
9735        let err = e.execute_prepared(stmt, &[]).unwrap_err();
9736        assert!(
9737            matches!(
9738                &err,
9739                EngineError::Eval(EvalError::PlaceholderOutOfRange { n: 1, bound: 0 })
9740            ),
9741            "got: {err}"
9742        );
9743    }
9744
9745    #[test]
9746    fn insert_into_half_column_dim_mismatch_errors() {
9747        let mut e = Engine::new();
9748        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
9749            .unwrap();
9750        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
9751        assert!(matches!(
9752            &err,
9753            EngineError::Storage(StorageError::TypeMismatch { .. })
9754        ));
9755    }
9756
9757    #[test]
9758    fn insert_into_sq8_column_dim_mismatch_errors() {
9759        // Dim mismatch falls through the `coerce_value` Vector→Sq8
9760        // arm's guard and surfaces as `TypeMismatch` — the same
9761        // error the F32 path produces today, so client error
9762        // handling stays uniform across encodings.
9763        let mut e = Engine::new();
9764        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
9765        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
9766        assert!(
9767            matches!(
9768                &err,
9769                EngineError::Storage(StorageError::TypeMismatch { .. })
9770            ),
9771            "got: {err}",
9772        );
9773    }
9774
9775    #[test]
9776    fn create_table_duplicate_errors() {
9777        let mut e = Engine::new();
9778        e.execute("CREATE TABLE foo (a INT)").unwrap();
9779        let err = e.execute("CREATE TABLE foo (a INT)").unwrap_err();
9780        assert!(matches!(
9781            err,
9782            EngineError::Storage(StorageError::DuplicateTable { ref name }) if name == "foo"
9783        ));
9784    }
9785
9786    #[test]
9787    fn insert_into_unknown_table_errors() {
9788        let mut e = Engine::new();
9789        let err = e.execute("INSERT INTO ghost VALUES (1)").unwrap_err();
9790        assert!(matches!(
9791            err,
9792            EngineError::Storage(StorageError::TableNotFound { ref name }) if name == "ghost"
9793        ));
9794    }
9795
9796    #[test]
9797    fn insert_happy_path_reports_one_affected() {
9798        let mut e = Engine::new();
9799        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9800        let r = e.execute("INSERT INTO foo VALUES (42)").unwrap();
9801        assert_eq!(unwrap_command_ok(&r), 1);
9802        assert_eq!(e.catalog().get("foo").unwrap().row_count(), 1);
9803    }
9804
9805    #[test]
9806    fn insert_arity_mismatch_propagates() {
9807        let mut e = Engine::new();
9808        e.execute("CREATE TABLE foo (a INT, b TEXT)").unwrap();
9809        let err = e.execute("INSERT INTO foo VALUES (1)").unwrap_err();
9810        assert!(matches!(
9811            err,
9812            EngineError::Storage(StorageError::ArityMismatch { .. })
9813        ));
9814    }
9815
9816    #[test]
9817    fn insert_negative_integer_via_unary_minus() {
9818        let mut e = Engine::new();
9819        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9820        e.execute("INSERT INTO foo VALUES (-7)").unwrap();
9821        let rows = e.catalog().get("foo").unwrap().rows();
9822        assert_eq!(rows[0].values[0], Value::Int(-7));
9823    }
9824
9825    #[test]
9826    fn insert_non_literal_expr_unsupported() {
9827        let mut e = Engine::new();
9828        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9829        let err = e.execute("INSERT INTO foo VALUES (1 + 2)").unwrap_err();
9830        assert!(matches!(err, EngineError::Unsupported(_)));
9831    }
9832
9833    #[test]
9834    fn select_star_returns_all_rows_in_insertion_order() {
9835        let mut e = Engine::new();
9836        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT NOT NULL)")
9837            .unwrap();
9838        e.execute("INSERT INTO foo VALUES (1, 'one')").unwrap();
9839        e.execute("INSERT INTO foo VALUES (2, 'two')").unwrap();
9840        e.execute("INSERT INTO foo VALUES (3, 'three')").unwrap();
9841
9842        let r = e.execute("SELECT * FROM foo").unwrap();
9843        let QueryResult::Rows { columns, rows } = r else {
9844            panic!("expected Rows")
9845        };
9846        assert_eq!(columns.len(), 2);
9847        assert_eq!(columns[0].name, "a");
9848        assert_eq!(rows.len(), 3);
9849        assert_eq!(
9850            rows[1].values,
9851            vec![Value::Int(2), Value::Text("two".into())]
9852        );
9853    }
9854
9855    #[test]
9856    fn select_star_on_empty_table_returns_zero_rows() {
9857        let mut e = Engine::new();
9858        e.execute("CREATE TABLE foo (a INT)").unwrap();
9859        let r = e.execute("SELECT * FROM foo").unwrap();
9860        match r {
9861            QueryResult::Rows { rows, .. } => assert!(rows.is_empty()),
9862            QueryResult::CommandOk { .. } => panic!("expected Rows"),
9863        }
9864    }
9865
9866    // --- v0.4: WHERE + projection ------------------------------------------
9867
9868    fn make_three_row_users(e: &mut Engine) {
9869        e.execute("CREATE TABLE users (id INT NOT NULL, name TEXT NOT NULL, score INT)")
9870            .unwrap();
9871        e.execute("INSERT INTO users VALUES (1, 'alice', 90)")
9872            .unwrap();
9873        e.execute("INSERT INTO users VALUES (2, 'bob', NULL)")
9874            .unwrap();
9875        e.execute("INSERT INTO users VALUES (3, 'cara', 70)")
9876            .unwrap();
9877    }
9878
9879    fn unwrap_rows(r: QueryResult) -> (Vec<ColumnSchema>, Vec<Row>) {
9880        match r {
9881            QueryResult::Rows { columns, rows } => (columns, rows),
9882            QueryResult::CommandOk { .. } => panic!("expected Rows"),
9883        }
9884    }
9885
9886    #[test]
9887    fn where_filter_passes_only_true_rows() {
9888        let mut e = Engine::new();
9889        make_three_row_users(&mut e);
9890        let r = e.execute("SELECT * FROM users WHERE id > 1").unwrap();
9891        let (_, rows) = unwrap_rows(r);
9892        assert_eq!(rows.len(), 2);
9893        assert_eq!(rows[0].values[0], Value::Int(2));
9894        assert_eq!(rows[1].values[0], Value::Int(3));
9895    }
9896
9897    #[test]
9898    fn where_with_null_result_filters_out_row() {
9899        let mut e = Engine::new();
9900        make_three_row_users(&mut e);
9901        // score is NULL for bob → score > 80 is NULL → row excluded
9902        let r = e.execute("SELECT * FROM users WHERE score > 80").unwrap();
9903        let (_, rows) = unwrap_rows(r);
9904        assert_eq!(rows.len(), 1);
9905        assert_eq!(rows[0].values[1], Value::Text("alice".into()));
9906    }
9907
9908    #[test]
9909    fn projection_named_columns() {
9910        let mut e = Engine::new();
9911        make_three_row_users(&mut e);
9912        let r = e.execute("SELECT name, score FROM users").unwrap();
9913        let (cols, rows) = unwrap_rows(r);
9914        assert_eq!(cols.len(), 2);
9915        assert_eq!(cols[0].name, "name");
9916        assert_eq!(cols[1].name, "score");
9917        assert_eq!(rows.len(), 3);
9918        assert_eq!(
9919            rows[0].values,
9920            vec![Value::Text("alice".into()), Value::Int(90)]
9921        );
9922    }
9923
9924    #[test]
9925    fn projection_with_column_alias() {
9926        let mut e = Engine::new();
9927        make_three_row_users(&mut e);
9928        let r = e
9929            .execute("SELECT name AS who FROM users WHERE id = 1")
9930            .unwrap();
9931        let (cols, rows) = unwrap_rows(r);
9932        assert_eq!(cols[0].name, "who");
9933        assert_eq!(rows.len(), 1);
9934        assert_eq!(rows[0].values[0], Value::Text("alice".into()));
9935    }
9936
9937    #[test]
9938    fn qualified_column_with_table_alias_resolves() {
9939        let mut e = Engine::new();
9940        make_three_row_users(&mut e);
9941        let r = e
9942            .execute("SELECT u.id, u.name FROM users AS u WHERE u.id < 3")
9943            .unwrap();
9944        let (cols, rows) = unwrap_rows(r);
9945        assert_eq!(cols.len(), 2);
9946        assert_eq!(rows.len(), 2);
9947    }
9948
9949    #[test]
9950    fn qualified_column_with_wrong_alias_errors() {
9951        let mut e = Engine::new();
9952        make_three_row_users(&mut e);
9953        let err = e.execute("SELECT x.id FROM users AS u").unwrap_err();
9954        assert!(matches!(
9955            err,
9956            EngineError::Eval(EvalError::UnknownQualifier { ref qualifier }) if qualifier == "x"
9957        ));
9958    }
9959
9960    #[test]
9961    fn select_unknown_column_errors_in_projection() {
9962        let mut e = Engine::new();
9963        make_three_row_users(&mut e);
9964        let err = e.execute("SELECT ghost FROM users").unwrap_err();
9965        assert!(matches!(
9966            err,
9967            EngineError::Eval(EvalError::ColumnNotFound { ref name }) if name == "ghost"
9968        ));
9969    }
9970
9971    #[test]
9972    fn where_unknown_column_errors() {
9973        let mut e = Engine::new();
9974        make_three_row_users(&mut e);
9975        let err = e
9976            .execute("SELECT * FROM users WHERE ghost = 1")
9977            .unwrap_err();
9978        assert!(matches!(
9979            err,
9980            EngineError::Eval(EvalError::ColumnNotFound { .. })
9981        ));
9982    }
9983
9984    #[test]
9985    fn expression_projection_evaluates_and_renders() {
9986        // Compound expressions in the SELECT list are evaluated per row;
9987        // the output column is typed TEXT, name defaults to the expression.
9988        let mut e = Engine::new();
9989        e.execute("CREATE TABLE t (a INT NOT NULL)").unwrap();
9990        e.execute("INSERT INTO t VALUES (3)").unwrap();
9991        let (_, rows) = unwrap_rows(e.execute("SELECT 1 + 2 FROM t").unwrap());
9992        assert_eq!(rows.len(), 1);
9993        // The expression evaluates to integer 3; rendered as the cell value
9994        // (storage::Value::Int(3) since arithmetic kept ints).
9995        assert_eq!(rows[0].values[0], Value::Int(3));
9996    }
9997
9998    #[test]
9999    fn select_unknown_table_errors() {
10000        let mut e = Engine::new();
10001        let err = e.execute("SELECT * FROM ghost").unwrap_err();
10002        assert!(matches!(
10003            err,
10004            EngineError::Storage(StorageError::TableNotFound { .. })
10005        ));
10006    }
10007
10008    #[test]
10009    fn invalid_sql_returns_parse_error() {
10010        // v4.4: UPDATE is now real SQL, so use a true syntactic
10011        // garbage payload for the parse-error path.
10012        let mut e = Engine::new();
10013        let err = e.execute("THIS_IS_NOT_A_KEYWORD foo bar baz").unwrap_err();
10014        assert!(matches!(err, EngineError::Parse(_)));
10015    }
10016
10017    // --- v0.8 CREATE INDEX + index seek ------------------------------------
10018
10019    #[test]
10020    fn create_index_registers_on_table() {
10021        let mut e = Engine::new();
10022        make_three_row_users(&mut e);
10023        e.execute("CREATE INDEX by_name ON users (name)").unwrap();
10024        let t = e.catalog().get("users").unwrap();
10025        assert_eq!(t.indices().len(), 1);
10026        assert_eq!(t.indices()[0].name, "by_name");
10027    }
10028
10029    #[test]
10030    fn create_index_on_unknown_table_errors() {
10031        let mut e = Engine::new();
10032        let err = e.execute("CREATE INDEX i ON ghost (a)").unwrap_err();
10033        assert!(matches!(
10034            err,
10035            EngineError::Storage(StorageError::TableNotFound { .. })
10036        ));
10037    }
10038
10039    #[test]
10040    fn create_index_on_unknown_column_errors() {
10041        let mut e = Engine::new();
10042        make_three_row_users(&mut e);
10043        let err = e.execute("CREATE INDEX i ON users (ghost)").unwrap_err();
10044        assert!(matches!(
10045            err,
10046            EngineError::Storage(StorageError::ColumnNotFound { .. })
10047        ));
10048    }
10049
10050    #[test]
10051    fn select_eq_uses_index_returns_same_rows_as_scan() {
10052        // Build two engines: one with an index, one without. Same query →
10053        // same row set (index is a planner optimisation, not a semantic
10054        // change).
10055        let mut without = Engine::new();
10056        make_three_row_users(&mut without);
10057        let mut with = Engine::new();
10058        make_three_row_users(&mut with);
10059        with.execute("CREATE INDEX by_id ON users (id)").unwrap();
10060
10061        let q = "SELECT * FROM users WHERE id = 2";
10062        let (_, no_idx_rows) = unwrap_rows(without.execute(q).unwrap());
10063        let (_, idx_rows) = unwrap_rows(with.execute(q).unwrap());
10064        assert_eq!(no_idx_rows, idx_rows);
10065        assert_eq!(idx_rows.len(), 1);
10066    }
10067
10068    #[test]
10069    fn select_eq_with_no_matching_index_value_returns_empty() {
10070        let mut e = Engine::new();
10071        make_three_row_users(&mut e);
10072        e.execute("CREATE INDEX by_id ON users (id)").unwrap();
10073        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM users WHERE id = 999").unwrap());
10074        assert_eq!(rows.len(), 0);
10075    }
10076
10077    // --- v0.9 transactions -------------------------------------------------
10078
10079    #[test]
10080    fn begin_sets_in_transaction_flag() {
10081        let mut e = Engine::new();
10082        assert!(!e.in_transaction());
10083        e.execute("BEGIN").unwrap();
10084        assert!(e.in_transaction());
10085    }
10086
10087    #[test]
10088    fn double_begin_errors() {
10089        let mut e = Engine::new();
10090        e.execute("BEGIN").unwrap();
10091        let err = e.execute("BEGIN").unwrap_err();
10092        assert_eq!(err, EngineError::TransactionAlreadyOpen);
10093    }
10094
10095    #[test]
10096    fn commit_without_begin_errors() {
10097        let mut e = Engine::new();
10098        let err = e.execute("COMMIT").unwrap_err();
10099        assert_eq!(err, EngineError::NoActiveTransaction);
10100    }
10101
10102    #[test]
10103    fn rollback_without_begin_errors() {
10104        let mut e = Engine::new();
10105        let err = e.execute("ROLLBACK").unwrap_err();
10106        assert_eq!(err, EngineError::NoActiveTransaction);
10107    }
10108
10109    #[test]
10110    fn commit_applies_shadow_to_committed_catalog() {
10111        let mut e = Engine::new();
10112        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
10113        e.execute("BEGIN").unwrap();
10114        e.execute("INSERT INTO t VALUES (1)").unwrap();
10115        e.execute("INSERT INTO t VALUES (2)").unwrap();
10116        e.execute("COMMIT").unwrap();
10117        assert!(!e.in_transaction());
10118        assert_eq!(e.catalog().get("t").unwrap().row_count(), 2);
10119    }
10120
10121    #[test]
10122    fn rollback_discards_shadow() {
10123        let mut e = Engine::new();
10124        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
10125        e.execute("BEGIN").unwrap();
10126        e.execute("INSERT INTO t VALUES (1)").unwrap();
10127        e.execute("INSERT INTO t VALUES (2)").unwrap();
10128        e.execute("ROLLBACK").unwrap();
10129        assert!(!e.in_transaction());
10130        assert_eq!(e.catalog().get("t").unwrap().row_count(), 0);
10131    }
10132
10133    #[test]
10134    fn select_during_tx_sees_uncommitted_writes_own_session() {
10135        // The shadow catalog is read by SELECTs while a TX is open — the
10136        // session can see its own pending writes.
10137        let mut e = Engine::new();
10138        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
10139        e.execute("BEGIN").unwrap();
10140        e.execute("INSERT INTO t VALUES (42)").unwrap();
10141        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM t").unwrap());
10142        assert_eq!(rows.len(), 1);
10143        assert_eq!(rows[0].values[0], Value::Int(42));
10144    }
10145
10146    #[test]
10147    fn snapshot_with_no_users_is_bare_catalog_format() {
10148        let mut e = Engine::new();
10149        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10150        let bytes = e.snapshot();
10151        assert_eq!(
10152            &bytes[..8],
10153            b"SPGDB001",
10154            "must be the bare v3.x catalog magic"
10155        );
10156        let e2 = Engine::restore_envelope(&bytes).unwrap();
10157        assert!(e2.users().is_empty());
10158        assert_eq!(e2.catalog().table_count(), 1);
10159    }
10160
10161    #[test]
10162    fn snapshot_with_users_round_trips_both_via_envelope() {
10163        let mut e = Engine::new();
10164        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10165        e.create_user("alice", "pw1", Role::Admin, [9; 16]).unwrap();
10166        e.create_user("bob", "pw2", Role::ReadOnly, [5; 16])
10167            .unwrap();
10168        let bytes = e.snapshot();
10169        assert_eq!(&bytes[..8], b"SPGENV01", "must be the v4.1 envelope magic");
10170        let e2 = Engine::restore_envelope(&bytes).unwrap();
10171        assert_eq!(e2.users().len(), 2);
10172        assert_eq!(e2.verify_user("alice", "pw1"), Some(Role::Admin));
10173        assert_eq!(e2.verify_user("bob", "pw2"), Some(Role::ReadOnly));
10174        assert_eq!(e2.verify_user("alice", "wrong"), None);
10175        assert_eq!(e2.catalog().table_count(), 1);
10176    }
10177
10178    #[test]
10179    fn ddl_inside_tx_also_rolled_back() {
10180        let mut e = Engine::new();
10181        e.execute("BEGIN").unwrap();
10182        e.execute("CREATE TABLE t (v INT)").unwrap();
10183        // Visible inside the TX.
10184        e.execute("SELECT * FROM t").unwrap();
10185        e.execute("ROLLBACK").unwrap();
10186        // Gone after rollback.
10187        let err = e.execute("SELECT * FROM t").unwrap_err();
10188        assert!(matches!(
10189            err,
10190            EngineError::Storage(StorageError::TableNotFound { .. })
10191        ));
10192    }
10193
10194    // ── v6.1.2: CREATE / DROP PUBLICATION (engine-side) ──────
10195
10196    #[test]
10197    fn create_publication_lands_in_catalog() {
10198        let mut e = Engine::new();
10199        assert!(e.publications().is_empty());
10200        e.execute("CREATE PUBLICATION pub_a").unwrap();
10201        assert_eq!(e.publications().len(), 1);
10202        assert!(e.publications().contains("pub_a"));
10203    }
10204
10205    #[test]
10206    fn create_publication_duplicate_errors() {
10207        let mut e = Engine::new();
10208        e.execute("CREATE PUBLICATION pub_a").unwrap();
10209        let err = e.execute("CREATE PUBLICATION pub_a").unwrap_err();
10210        assert!(
10211            alloc::format!("{err:?}").contains("DuplicateName"),
10212            "got {err:?}"
10213        );
10214    }
10215
10216    #[test]
10217    fn drop_publication_silent_when_absent() {
10218        let mut e = Engine::new();
10219        // PG-compatible: DROP a publication that doesn't exist
10220        // succeeds (no-op) but reports zero affected.
10221        let r = e.execute("DROP PUBLICATION nope").unwrap();
10222        match r {
10223            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
10224            other => panic!("expected CommandOk, got {other:?}"),
10225        }
10226    }
10227
10228    #[test]
10229    fn drop_publication_present_reports_one_affected() {
10230        let mut e = Engine::new();
10231        e.execute("CREATE PUBLICATION pub_a").unwrap();
10232        let r = e.execute("DROP PUBLICATION pub_a").unwrap();
10233        match r {
10234            QueryResult::CommandOk {
10235                affected,
10236                modified_catalog,
10237            } => {
10238                assert_eq!(affected, 1);
10239                assert!(modified_catalog);
10240            }
10241            other => panic!("expected CommandOk, got {other:?}"),
10242        }
10243        assert!(e.publications().is_empty());
10244    }
10245
10246    #[test]
10247    fn publications_persist_across_snapshot_restore() {
10248        // The persist-across-restart ship-gate at the engine layer —
10249        // snapshot → restore_envelope round trip must preserve the
10250        // publication catalog. The spg-server e2e covers the
10251        // process-restart variant.
10252        let mut e = Engine::new();
10253        e.execute("CREATE PUBLICATION pub_a").unwrap();
10254        e.execute("CREATE PUBLICATION pub_b FOR ALL TABLES").unwrap();
10255        let snap = e.snapshot();
10256        let e2 = Engine::restore_envelope(&snap).unwrap();
10257        assert_eq!(e2.publications().len(), 2);
10258        assert!(e2.publications().contains("pub_a"));
10259        assert!(e2.publications().contains("pub_b"));
10260    }
10261
10262    #[test]
10263    fn create_publication_allowed_inside_transaction() {
10264        // v6.1.4 dropped the v6.1.2 in-TX guard — PG allows
10265        // CREATE PUBLICATION inside a TX and the auto-commit
10266        // wrap path needs the same allowance.
10267        let mut e = Engine::new();
10268        e.execute("BEGIN").unwrap();
10269        e.execute("CREATE PUBLICATION pub_a").unwrap();
10270        e.execute("COMMIT").unwrap();
10271        assert!(e.publications().contains("pub_a"));
10272    }
10273
10274    // ── v6.1.3: SHOW PUBLICATIONS + FOR-list variants ───────
10275
10276    #[test]
10277    fn create_publication_for_table_list_lands_with_scope() {
10278        let mut e = Engine::new();
10279        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
10280        e.execute("CREATE TABLE t2 (id INT NOT NULL)").unwrap();
10281        e.execute("CREATE PUBLICATION pub_a FOR TABLE t1, t2")
10282            .unwrap();
10283        let scope = e.publications().get("pub_a").cloned();
10284        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = scope else {
10285            panic!("expected ForTables scope, got {scope:?}")
10286        };
10287        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
10288    }
10289
10290    #[test]
10291    fn create_publication_all_tables_except_lands_with_scope() {
10292        let mut e = Engine::new();
10293        e.execute("CREATE PUBLICATION pub_a FOR ALL TABLES EXCEPT t3")
10294            .unwrap();
10295        let scope = e.publications().get("pub_a").cloned();
10296        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = scope else {
10297            panic!("expected AllTablesExcept scope, got {scope:?}")
10298        };
10299        assert_eq!(ts, alloc::vec!["t3".to_string()]);
10300    }
10301
10302    #[test]
10303    fn show_publications_empty_returns_zero_rows() {
10304        let e = Engine::new();
10305        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
10306        let QueryResult::Rows { rows, columns } = r else {
10307            panic!()
10308        };
10309        assert!(rows.is_empty());
10310        assert_eq!(columns.len(), 3);
10311        assert_eq!(columns[0].name, "name");
10312        assert_eq!(columns[1].name, "scope");
10313        assert_eq!(columns[2].name, "table_count");
10314    }
10315
10316    #[test]
10317    fn show_publications_returns_one_row_per_publication_ordered_by_name() {
10318        let mut e = Engine::new();
10319        e.execute("CREATE PUBLICATION z_pub").unwrap();
10320        e.execute("CREATE PUBLICATION a_pub FOR TABLE t1, t2")
10321            .unwrap();
10322        e.execute("CREATE PUBLICATION m_pub FOR ALL TABLES EXCEPT bad")
10323            .unwrap();
10324        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
10325        let QueryResult::Rows { rows, .. } = r else {
10326            panic!()
10327        };
10328        assert_eq!(rows.len(), 3);
10329        // Alphabetical order: a_pub, m_pub, z_pub.
10330        let names: Vec<&str> = rows
10331            .iter()
10332            .map(|r| {
10333                if let Value::Text(s) = &r.values[0] {
10334                    s.as_str()
10335                } else {
10336                    panic!()
10337                }
10338            })
10339            .collect();
10340        assert_eq!(names, alloc::vec!["a_pub", "m_pub", "z_pub"]);
10341        // Row 0 — a_pub scope summary + table_count = 2.
10342        match &rows[0].values[1] {
10343            Value::Text(s) => assert_eq!(s, "FOR TABLE t1, t2"),
10344            other => panic!("expected Text, got {other:?}"),
10345        }
10346        assert_eq!(rows[0].values[2], Value::Int(2));
10347        // Row 1 — m_pub.
10348        match &rows[1].values[1] {
10349            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES EXCEPT bad"),
10350            other => panic!("expected Text, got {other:?}"),
10351        }
10352        assert_eq!(rows[1].values[2], Value::Int(1));
10353        // Row 2 — z_pub (AllTables → NULL count).
10354        match &rows[2].values[1] {
10355            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES"),
10356            other => panic!("expected Text, got {other:?}"),
10357        }
10358        assert_eq!(rows[2].values[2], Value::Null);
10359    }
10360
10361    #[test]
10362    fn for_list_scopes_persist_across_snapshot() {
10363        // The v6.1.2 envelope-v3 round-trip exercised AllTables;
10364        // v6.1.3 needs the scope-1 / scope-2 tags to survive too.
10365        let mut e = Engine::new();
10366        e.execute("CREATE PUBLICATION p1 FOR TABLE t1, t2").unwrap();
10367        e.execute("CREATE PUBLICATION p2 FOR ALL TABLES EXCEPT bad, worse")
10368            .unwrap();
10369        let snap = e.snapshot();
10370        let e2 = Engine::restore_envelope(&snap).unwrap();
10371        assert_eq!(e2.publications().len(), 2);
10372        let p1 = e2.publications().get("p1").cloned();
10373        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = p1 else {
10374            panic!("p1 scope lost: {p1:?}")
10375        };
10376        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
10377        let p2 = e2.publications().get("p2").cloned();
10378        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = p2 else {
10379            panic!("p2 scope lost: {p2:?}")
10380        };
10381        assert_eq!(ts, alloc::vec!["bad".to_string(), "worse".to_string()]);
10382    }
10383
10384    // ── v6.1.4: CREATE / DROP SUBSCRIPTION + SHOW + envelope v4 ─
10385
10386    #[test]
10387    fn create_subscription_lands_in_catalog_with_defaults() {
10388        let mut e = Engine::new();
10389        e.execute(
10390            "CREATE SUBSCRIPTION sub_a CONNECTION 'host=127.0.0.1 port=20002' PUBLICATION pub_a",
10391        )
10392        .unwrap();
10393        let s = e.subscriptions().get("sub_a").cloned().expect("present");
10394        assert_eq!(s.conn_str, "host=127.0.0.1 port=20002");
10395        assert_eq!(s.publications, alloc::vec!["pub_a".to_string()]);
10396        assert!(s.enabled);
10397        assert_eq!(s.last_received_pos, 0);
10398    }
10399
10400    #[test]
10401    fn create_subscription_duplicate_name_errors() {
10402        let mut e = Engine::new();
10403        e.execute("CREATE SUBSCRIPTION s CONNECTION 'host=x' PUBLICATION p")
10404            .unwrap();
10405        let err = e
10406            .execute("CREATE SUBSCRIPTION s CONNECTION 'host=y' PUBLICATION p")
10407            .unwrap_err();
10408        assert!(
10409            alloc::format!("{err:?}").contains("DuplicateName"),
10410            "got {err:?}"
10411        );
10412    }
10413
10414    #[test]
10415    fn drop_subscription_silent_when_absent() {
10416        let mut e = Engine::new();
10417        let r = e.execute("DROP SUBSCRIPTION never").unwrap();
10418        match r {
10419            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
10420            other => panic!("expected CommandOk, got {other:?}"),
10421        }
10422    }
10423
10424    #[test]
10425    fn subscription_advance_updates_last_pos_monotone() {
10426        let mut e = Engine::new();
10427        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
10428            .unwrap();
10429        assert!(e.subscription_advance("s", 100));
10430        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
10431        assert!(e.subscription_advance("s", 50)); // stale → ignored
10432        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
10433        assert!(e.subscription_advance("s", 200));
10434        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 200);
10435        assert!(!e.subscription_advance("missing", 1));
10436    }
10437
10438    #[test]
10439    fn show_subscriptions_returns_rows_ordered_by_name() {
10440        let mut e = Engine::new();
10441        e.execute("CREATE SUBSCRIPTION z_sub CONNECTION 'h=x' PUBLICATION p1, p2")
10442            .unwrap();
10443        e.execute("CREATE SUBSCRIPTION a_sub CONNECTION 'h=y' PUBLICATION p3")
10444            .unwrap();
10445        let r = e.execute_readonly("SHOW SUBSCRIPTIONS").unwrap();
10446        let QueryResult::Rows { rows, columns } = r else {
10447            panic!()
10448        };
10449        assert_eq!(rows.len(), 2);
10450        assert_eq!(columns.len(), 5);
10451        assert_eq!(columns[0].name, "name");
10452        assert_eq!(columns[4].name, "last_received_pos");
10453        // Alphabetical: a_sub, z_sub.
10454        let names: Vec<&str> = rows
10455            .iter()
10456            .map(|r| {
10457                if let Value::Text(s) = &r.values[0] {
10458                    s.as_str()
10459                } else {
10460                    panic!()
10461                }
10462            })
10463            .collect();
10464        assert_eq!(names, alloc::vec!["a_sub", "z_sub"]);
10465        // Row 0: a_sub
10466        assert_eq!(rows[0].values[1], Value::Text("h=y".to_string()));
10467        assert_eq!(rows[0].values[2], Value::Text("p3".to_string()));
10468        assert_eq!(rows[0].values[3], Value::Bool(true));
10469        assert_eq!(rows[0].values[4], Value::BigInt(0));
10470        // Row 1: z_sub — publications join with ", "
10471        assert_eq!(rows[1].values[2], Value::Text("p1, p2".to_string()));
10472    }
10473
10474    #[test]
10475    fn subscriptions_persist_across_snapshot_envelope_v4() {
10476        let mut e = Engine::new();
10477        e.execute("CREATE SUBSCRIPTION s1 CONNECTION 'h=A' PUBLICATION p1, p2")
10478            .unwrap();
10479        e.execute("CREATE SUBSCRIPTION s2 CONNECTION 'h=B' PUBLICATION p3")
10480            .unwrap();
10481        e.subscription_advance("s2", 42);
10482        let snap = e.snapshot();
10483        let e2 = Engine::restore_envelope(&snap).unwrap();
10484        assert_eq!(e2.subscriptions().len(), 2);
10485        let s1 = e2.subscriptions().get("s1").unwrap();
10486        assert_eq!(s1.conn_str, "h=A");
10487        assert_eq!(s1.publications, alloc::vec!["p1".to_string(), "p2".to_string()]);
10488        assert_eq!(s1.last_received_pos, 0);
10489        let s2 = e2.subscriptions().get("s2").unwrap();
10490        assert_eq!(s2.last_received_pos, 42);
10491    }
10492
10493    #[test]
10494    fn v3_envelope_loads_with_empty_subscriptions() {
10495        // v3 snapshot (publications-only). Forge it by hand so we
10496        // verify v6.1.4 readers don't panic — they must surface
10497        // empty subscriptions and a populated publication table.
10498        let mut e = Engine::new();
10499        e.execute("CREATE PUBLICATION pub_legacy").unwrap();
10500        let catalog = e.catalog.serialize();
10501        let users = crate::users::serialize_users(&e.users);
10502        let pubs = e.publications.serialize();
10503        let mut buf = Vec::new();
10504        buf.extend_from_slice(b"SPGENV01");
10505        buf.push(3u8); // v3
10506        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
10507        buf.extend_from_slice(&catalog);
10508        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
10509        buf.extend_from_slice(&users);
10510        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
10511        buf.extend_from_slice(&pubs);
10512        let crc = spg_crypto::crc32::crc32(&buf);
10513        buf.extend_from_slice(&crc.to_le_bytes());
10514
10515        let e2 = Engine::restore_envelope(&buf).expect("v3 envelope restores under v4 reader");
10516        assert!(e2.subscriptions().is_empty());
10517        assert!(e2.publications().contains("pub_legacy"));
10518    }
10519
10520    #[test]
10521    fn create_subscription_allowed_inside_transaction() {
10522        let mut e = Engine::new();
10523        e.execute("BEGIN").unwrap();
10524        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
10525            .unwrap();
10526        e.execute("COMMIT").unwrap();
10527        assert!(e.subscriptions().contains("s"));
10528    }
10529
10530    #[test]
10531    // ── v6.2.0: ANALYZE + spg_statistic + envelope v5 ──────────
10532
10533    #[test]
10534    fn analyze_populates_histogram_bounds() {
10535        let mut e = Engine::new();
10536        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT)").unwrap();
10537        for i in 0..50 {
10538            e.execute(&alloc::format!(
10539                "INSERT INTO t VALUES ({i}, 'name{i}')"
10540            ))
10541            .unwrap();
10542        }
10543        e.execute("ANALYZE t").unwrap();
10544        let stats = e.statistics();
10545        let id_stats = stats.get("t", "id").unwrap();
10546        assert!(id_stats.histogram_bounds.len() >= 2);
10547        assert_eq!(id_stats.histogram_bounds.first().unwrap(), "0");
10548        assert_eq!(id_stats.histogram_bounds.last().unwrap(), "49");
10549        assert!((id_stats.null_frac - 0.0).abs() < 1e-6);
10550        assert_eq!(id_stats.n_distinct, 50);
10551    }
10552
10553    #[test]
10554    fn reanalyze_overwrites_prior_stats() {
10555        let mut e = Engine::new();
10556        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10557        for i in 0..10 {
10558            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10559        }
10560        e.execute("ANALYZE t").unwrap();
10561        let n1 = e.statistics().get("t", "id").unwrap().n_distinct;
10562        assert_eq!(n1, 10);
10563        for i in 10..30 {
10564            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10565        }
10566        e.execute("ANALYZE t").unwrap();
10567        let n2 = e.statistics().get("t", "id").unwrap().n_distinct;
10568        assert_eq!(n2, 30);
10569    }
10570
10571    #[test]
10572    fn analyze_unknown_table_errors() {
10573        let mut e = Engine::new();
10574        let err = e.execute("ANALYZE nonexistent").unwrap_err();
10575        assert!(matches!(err, EngineError::Storage(StorageError::TableNotFound { .. })));
10576    }
10577
10578    #[test]
10579    fn bare_analyze_covers_all_user_tables() {
10580        let mut e = Engine::new();
10581        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
10582        e.execute("CREATE TABLE t2 (name TEXT NOT NULL)").unwrap();
10583        e.execute("INSERT INTO t1 VALUES (1)").unwrap();
10584        e.execute("INSERT INTO t2 VALUES ('alice')").unwrap();
10585        let r = e.execute("ANALYZE").unwrap();
10586        match r {
10587            QueryResult::CommandOk { affected, modified_catalog } => {
10588                assert_eq!(affected, 2);
10589                assert!(modified_catalog);
10590            }
10591            other => panic!("expected CommandOk, got {other:?}"),
10592        }
10593        assert!(e.statistics().get("t1", "id").is_some());
10594        assert!(e.statistics().get("t2", "name").is_some());
10595    }
10596
10597    #[test]
10598    fn select_from_spg_statistic_returns_rows_per_column() {
10599        let mut e = Engine::new();
10600        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
10601            .unwrap();
10602        e.execute("INSERT INTO t VALUES (1, 'a')").unwrap();
10603        e.execute("INSERT INTO t VALUES (2, 'b')").unwrap();
10604        e.execute("ANALYZE t").unwrap();
10605        let r = e.execute_readonly("SELECT * FROM spg_statistic").unwrap();
10606        let QueryResult::Rows { rows, columns } = r else {
10607            panic!()
10608        };
10609        // v6.7.0 — spg_statistic gained a `cold_row_count` column.
10610        assert_eq!(columns.len(), 6);
10611        assert_eq!(columns[0].name, "table_name");
10612        assert_eq!(columns[4].name, "histogram_bounds");
10613        assert_eq!(columns[5].name, "cold_row_count");
10614        assert_eq!(rows.len(), 2, "one row per column of t");
10615        // Sorted by (table_name, column_name).
10616        match (&rows[0].values[0], &rows[0].values[1]) {
10617            (Value::Text(t), Value::Text(c)) => {
10618                assert_eq!(t, "t");
10619                // BTreeMap orders (table, column); columns "id" < "label".
10620                assert_eq!(c, "id");
10621            }
10622            _ => panic!(),
10623        }
10624    }
10625
10626    #[test]
10627    fn analyze_skips_vector_columns() {
10628        // Vector columns have their own stats shape (HNSW graph);
10629        // ANALYZE leaves them out of spg_statistic.
10630        let mut e = Engine::new();
10631        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
10632            .unwrap();
10633        e.execute("INSERT INTO t VALUES (1, [1, 2, 3])").unwrap();
10634        e.execute("ANALYZE t").unwrap();
10635        assert!(e.statistics().get("t", "id").is_some());
10636        assert!(e.statistics().get("t", "v").is_none());
10637    }
10638
10639    #[test]
10640    fn statistics_persist_across_envelope_v5_round_trip() {
10641        let mut e = Engine::new();
10642        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10643        for i in 0..20 {
10644            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10645        }
10646        e.execute("ANALYZE").unwrap();
10647        let snap = e.snapshot();
10648        let e2 = Engine::restore_envelope(&snap).unwrap();
10649        let s = e2.statistics().get("t", "id").unwrap();
10650        assert_eq!(s.n_distinct, 20);
10651    }
10652
10653    // ── v6.2.1 auto-analyze threshold ───────────────────────────
10654
10655    #[test]
10656    fn auto_analyze_threshold_fires_after_10pct_of_min_rows_on_small_table() {
10657        // For a table with 0 rows then 10 inserts → modified=10,
10658        // row_count=10. Threshold = 0.1 × max(10, 100) = 10. So
10659        // after the 10th INSERT the threshold is met.
10660        let mut e = Engine::new();
10661        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10662        for i in 0..9 {
10663            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10664        }
10665        assert!(e.tables_needing_analyze().is_empty(), "9 < threshold");
10666        e.execute("INSERT INTO t VALUES (9)").unwrap();
10667        let needs = e.tables_needing_analyze();
10668        assert_eq!(needs, alloc::vec!["t".to_string()]);
10669    }
10670
10671    #[test]
10672    fn auto_analyze_threshold_uses_10pct_of_row_count_for_large_tables() {
10673        // After ANALYZE on 1000 rows, threshold = 0.1 × row_count.
10674        // Each new INSERT bumps both modified and row_count, so to
10675        // trigger from N=1000 we need modifications ≥ 0.1 × (1000+M),
10676        // i.e. M ≥ 112. The test inserts 50 (no fire), then 150
10677        // more (200 total mods, row_count=1200, threshold=120 → fire).
10678        let mut e = Engine::new();
10679        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10680        for i in 0..1000 {
10681            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10682        }
10683        e.execute("ANALYZE t").unwrap();
10684        assert!(e.tables_needing_analyze().is_empty(), "fresh ANALYZE");
10685        for i in 1000..1050 {
10686            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10687        }
10688        assert!(
10689            e.tables_needing_analyze().is_empty(),
10690            "50 inserts < threshold of ~105"
10691        );
10692        for i in 1050..1200 {
10693            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10694        }
10695        assert_eq!(
10696            e.tables_needing_analyze(),
10697            alloc::vec!["t".to_string()],
10698            "200 inserts > 0.1 × 1200 threshold"
10699        );
10700    }
10701
10702    #[test]
10703    fn auto_analyze_threshold_resets_after_analyze() {
10704        let mut e = Engine::new();
10705        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10706        for i in 0..200 {
10707            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10708        }
10709        assert!(!e.tables_needing_analyze().is_empty());
10710        e.execute("ANALYZE").unwrap();
10711        assert!(
10712            e.tables_needing_analyze().is_empty(),
10713            "ANALYZE must reset the counter"
10714        );
10715    }
10716
10717    #[test]
10718    fn auto_analyze_threshold_tracks_updates_and_deletes() {
10719        let mut e = Engine::new();
10720        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)").unwrap();
10721        for i in 0..50 {
10722            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, 'x')"))
10723                .unwrap();
10724        }
10725        e.execute("ANALYZE t").unwrap();
10726        // UPDATE 20 rows + DELETE 5 → modified=25. Threshold = 0.1
10727        // × max(50, 100) = 10. So 25 >= 10 → trigger.
10728        e.execute("UPDATE t SET label = 'y' WHERE id < 20").unwrap();
10729        e.execute("DELETE FROM t WHERE id >= 45").unwrap();
10730        assert_eq!(
10731            e.tables_needing_analyze(),
10732            alloc::vec!["t".to_string()]
10733        );
10734    }
10735
10736    #[test]
10737    fn v4_envelope_loads_with_empty_statistics() {
10738        // Forge a v4 envelope by hand: catalog + users + pubs +
10739        // subs trailer, no statistics. A v6.2.0 reader must accept
10740        // it and surface an empty Statistics.
10741        let mut e = Engine::new();
10742        e.create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
10743            .unwrap();
10744        let catalog = e.catalog.serialize();
10745        let users = crate::users::serialize_users(&e.users);
10746        let pubs = e.publications.serialize();
10747        let subs = e.subscriptions.serialize();
10748        let mut buf = Vec::new();
10749        buf.extend_from_slice(b"SPGENV01");
10750        buf.push(4u8);
10751        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
10752        buf.extend_from_slice(&catalog);
10753        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
10754        buf.extend_from_slice(&users);
10755        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
10756        buf.extend_from_slice(&pubs);
10757        buf.extend_from_slice(&u32::try_from(subs.len()).unwrap().to_le_bytes());
10758        buf.extend_from_slice(&subs);
10759        let crc = spg_crypto::crc32::crc32(&buf);
10760        buf.extend_from_slice(&crc.to_le_bytes());
10761        let e2 = Engine::restore_envelope(&buf).expect("v4 envelope restores");
10762        assert!(e2.statistics().is_empty());
10763    }
10764
10765    #[test]
10766    fn v1_v2_envelope_loads_with_empty_publications() {
10767        // A snapshot taken before v6.1.2 (no publication trailer,
10768        // envelope v2) must still deserialise — and the resulting
10769        // engine must report zero publications. Use the engine's own
10770        // round-trip with no publications: that emits v3 but with an
10771        // empty pubs block. Then forge a v2 envelope by hand to lock
10772        // the back-compat path.
10773        let mut e = Engine::new();
10774        // Force users to be non-empty so the snapshot takes the
10775        // envelope path rather than the bare-catalog fallback.
10776        e.create_user(
10777            "alice",
10778            "secret",
10779            crate::users::Role::ReadOnly,
10780            [0u8; 16],
10781        )
10782        .unwrap();
10783
10784        // Forge an envelope v2: same shape as v3 but no pubs trailer.
10785        let catalog = e.catalog.serialize();
10786        let users = crate::users::serialize_users(&e.users);
10787        let mut buf = Vec::new();
10788        buf.extend_from_slice(b"SPGENV01");
10789        buf.push(2u8); // v2
10790        buf.extend_from_slice(
10791            &u32::try_from(catalog.len()).unwrap().to_le_bytes(),
10792        );
10793        buf.extend_from_slice(&catalog);
10794        buf.extend_from_slice(
10795            &u32::try_from(users.len()).unwrap().to_le_bytes(),
10796        );
10797        buf.extend_from_slice(&users);
10798        let crc = spg_crypto::crc32::crc32(&buf);
10799        buf.extend_from_slice(&crc.to_le_bytes());
10800
10801        let e2 = Engine::restore_envelope(&buf).expect("v2 envelope restores");
10802        assert!(e2.publications().is_empty());
10803    }
10804}