Skip to main content

spg_engine/
lib.rs

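//! SPG execution engine — wires the SQL front-end to the in-memory
//! storage layer.
//!
//! The crate started out (v0.3) implementing `CREATE TABLE`, single-row
//! `INSERT VALUES`, and `SELECT * FROM <table>`. It has since grown well
//! beyond that: this file alone also carries transaction slots and
//! savepoints, the RBAC user store, publications / subscriptions,
//! optimizer statistics, a plan cache, and the versioned snapshot envelope.
//!
//! NOTE(review): the original header said "no WHERE yet — that lands in
//! v0.4 alongside expression evaluation against rows". That remark looks
//! stale next to the v4.x–v6.x features documented below — confirm and drop.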
5#![no_std]
6
7extern crate alloc;
8
9pub mod aggregate;
10pub mod describe;
11pub mod eval;
12pub mod json;
13pub mod memoize;
14pub mod plan_cache;
15pub mod publications;
16pub mod query_stats;
17pub mod reorder;
18pub mod selectivity;
19pub mod statistics;
20pub mod subscriptions;
21pub mod users;
22
23pub use crate::users::{Role, ScramSecrets, UserError, UserStore};
24
25use alloc::borrow::Cow;
26use alloc::boxed::Box;
27use alloc::collections::BTreeMap;
28use alloc::string::{String, ToString};
29use alloc::vec::Vec;
30use core::fmt;
31
32use spg_sql::ast::{
33    BinOp, ColumnDef, ColumnName, ColumnTypeName, CreateIndexStatement,
34    CreatePublicationStatement, CreateSubscriptionStatement, CreateTableStatement,
35    CreateUserStatement, Expr, FrameBound, FrameKind, FromClause, IndexMethod, InsertStatement,
36    JoinKind, Literal, OrderBy, SelectItem, SelectStatement, Statement, UnOp, UnionKind,
37    VecEncoding as SqlVecEncoding, WindowFrame,
38};
39use spg_sql::parser::{self, ParseError};
40use spg_storage::{
41    Catalog, ColumnSchema, CompactReport, DataType, IndexKey, IndexKind, Row, StorageError, Table,
42    TableSchema, Value, VecEncoding,
43};
44
45use crate::eval::{EvalContext, EvalError};
46
/// Result of executing one statement.
///
/// Two shapes exist today: an acknowledgement (`CommandOk`) and a row set
/// (`Rows`). The enum is `#[non_exhaustive]`, so matches outside this crate
/// need a wildcard arm.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum QueryResult {
    /// DDL or DML succeeded.
    ///
    /// `affected` is the row count for `INSERT` and 0 elsewhere.
    /// `modified_catalog` tells the server whether this statement
    /// caused the *committed* catalog to change — it's the signal to
    /// snapshot/audit. False for `BEGIN`/`ROLLBACK`, false for writeful
    /// statements executed inside a transaction (those only touch the
    /// shadow), and true for `COMMIT` and for writes outside a TX.
    CommandOk {
        /// Number of rows the statement touched (see the variant docs).
        affected: usize,
        /// Whether the committed catalog changed as a result.
        modified_catalog: bool,
    },
    /// `SELECT` returned a (possibly empty) row set.
    Rows {
        /// Schema of the result columns.
        columns: Vec<ColumnSchema>,
        /// The result rows themselves.
        rows: Vec<Row>,
    },
}
69
/// All errors the engine can return.
///
/// Marked `#[non_exhaustive]` from v7.5.0 onward: external `match`
/// must include a `_` arm so new variants in subsequent v7.x releases
/// are not breaking changes.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum EngineError {
    /// Error reported by the SQL parser (`spg_sql::parser::ParseError`).
    Parse(ParseError),
    /// Error reported by the storage layer (`spg_storage::StorageError`).
    Storage(StorageError),
    /// Error reported by the expression evaluator (`crate::eval::EvalError`).
    Eval(EvalError),
    /// Front-end accepted a construct that the v0.x executor doesn't support.
    Unsupported(String),
    /// `BEGIN` while another transaction is already open.
    TransactionAlreadyOpen,
    /// `COMMIT` / `ROLLBACK` with no active transaction.
    NoActiveTransaction,
    /// v4.0 sentinel: `execute_readonly` got a statement that
    /// mutates engine state (INSERT / CREATE / BEGIN / COMMIT / …).
    /// The caller should retake the write lock and dispatch through
    /// `execute(&mut self)` instead.
    WriteRequired,
    /// v4.2: a SELECT would have returned more rows than the
    /// configured `max_query_rows` cap. Carries the cap.
    RowLimitExceeded(usize),
    /// v4.5: cooperative cancellation — the host (server's
    /// per-query watchdog) set the cancel flag while a long-running
    /// SELECT / UPDATE / DELETE was scanning rows. The partial work
    /// is discarded; the caller should surface this as a timeout
    /// to the client.
    Cancelled,
}
102
103impl fmt::Display for EngineError {
104    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
105        match self {
106            Self::Parse(e) => write!(f, "parse: {e}"),
107            Self::Storage(e) => write!(f, "storage: {e}"),
108            Self::Eval(e) => write!(f, "eval: {e}"),
109            Self::Unsupported(s) => write!(f, "unsupported: {s}"),
110            Self::TransactionAlreadyOpen => f.write_str("a transaction is already open"),
111            Self::NoActiveTransaction => f.write_str("no active transaction"),
112            Self::WriteRequired => {
113                f.write_str("statement requires a write lock (use execute, not execute_readonly)")
114            }
115            Self::RowLimitExceeded(n) => {
116                write!(f, "query exceeded max_query_rows={n}")
117            }
118            Self::Cancelled => f.write_str("query cancelled (timeout or client request)"),
119        }
120    }
121}
122
123impl From<ParseError> for EngineError {
124    fn from(e: ParseError) -> Self {
125        Self::Parse(e)
126    }
127}
128impl From<StorageError> for EngineError {
129    fn from(e: StorageError) -> Self {
130        Self::Storage(e)
131    }
132}
133impl From<EvalError> for EngineError {
134    fn from(e: EvalError) -> Self {
135        Self::Eval(e)
136    }
137}
138
// About `Engine` (declared further down in this file): the execution
// engine holds the catalog and (later) other server-scope state.
// `Engine::new()` is intentionally cheap so callers can construct one
// per database, per test.
// NOTE(review): this paragraph used to be a `///` comment at this spot and
// therefore rendered as part of `ClockFn`'s documentation; it is kept here
// as a plain comment so rustdoc no longer attributes it to the alias.

/// Function pointer that returns "now" as microseconds since Unix
/// epoch. The engine is `no_std`, so it can't reach for `std::time`
/// itself — callers (`spg-server`, the sqllogictest runner) inject a
/// concrete implementation. The engine stores it as `Option<ClockFn>`;
/// `None` means `NOW()` / `CURRENT_*` raise `Unsupported`.
pub type ClockFn = fn() -> i64;
148
/// Function pointer that produces 16 cryptographically random bytes.
///
/// Like `ClockFn`, the engine is `no_std` and can't reach for /dev/urandom
/// itself — the host (`spg-server`) injects an OS-backed source. The engine
/// stores it as `Option<SaltFn>`; `None` means SQL-driven `CREATE USER`
/// falls back to a deterministic salt derived from the username
/// (acceptable in tests; the server always installs a real RNG so
/// production paths never see this).
pub type SaltFn = fn() -> [u8; 16];
156
/// v4.5 cooperative cancellation token. A long-running SELECT /
/// UPDATE / DELETE checks `is_cancelled` at row-loop checkpoints
/// and bails with `EngineError::Cancelled`. The host
/// (`spg-server`) creates an `AtomicBool` per query, spawns a
/// watchdog thread that sets it after `SPG_QUERY_TIMEOUT_MS`,
/// and passes it via `execute_with_cancel` / `execute_readonly_with_cancel`.
///
/// `CancelToken::none()` is a no-op — used by the legacy `execute`
/// and `execute_readonly` entry points so existing callers don't
/// change.
///
/// The token is `Copy`: it is just an optional shared reference, so it can
/// be handed down the executor by value.
#[derive(Debug, Clone, Copy)]
pub struct CancelToken<'a> {
    /// Borrowed cancel flag; `None` for the no-op token that never cancels.
    flag: Option<&'a core::sync::atomic::AtomicBool>,
}
171
172impl<'a> CancelToken<'a> {
173    #[must_use]
174    pub const fn none() -> Self {
175        Self { flag: None }
176    }
177
178    #[must_use]
179    pub const fn from_flag(f: &'a core::sync::atomic::AtomicBool) -> Self {
180        Self { flag: Some(f) }
181    }
182
183    #[must_use]
184    pub fn is_cancelled(self) -> bool {
185        self.flag
186            .is_some_and(|f| f.load(core::sync::atomic::Ordering::Relaxed))
187    }
188
189    /// Returns `Err(Cancelled)` if the token has been tripped.
190    /// Used at row-loop checkpoints to bail cooperatively without
191    /// scattering raw `is_cancelled` checks across the executor.
192    #[inline]
193    pub fn check(self) -> Result<(), EngineError> {
194        if self.is_cancelled() {
195            Err(EngineError::Cancelled)
196        } else {
197            Ok(())
198        }
199    }
200}
201
// ---- snapshot envelope (v4.1, extended with CRC32 in v4.37,   ----
// ----   publications in v6.1.2 v3, subscriptions in v6.1.4 v4, ----
// ----   statistics in v6.2.0 v5)                               ----
204//
205// Wraps a catalog blob + a user blob behind a small header so the
206// server can persist both atomically without inventing a new file.
207// Bare catalog blobs (v3.x) still load via `restore_envelope` since
208// the magic check fails fast and the function falls back to
209// `Catalog::deserialize`.
210//
211// Layout — v1 (v4.1, no CRC):
212//   [8 bytes magic "SPGENV01"]
213//   [u8 version = 1]
214//   [u32 catalog_len][catalog bytes]
215//   [u32 users_len][users bytes]
216//
217// Layout — v2 (v4.37, CRC32 of body):
218//   [8 bytes magic "SPGENV01"]
219//   [u8 version = 2]
220//   [u32 catalog_len][catalog bytes]
221//   [u32 users_len][users bytes]
222//   [u32 crc32]                      ← CRC32 of every byte before it.
223//
224// Layout — v3 (v6.1.2, publications trailer):
225//   [8 bytes magic "SPGENV01"]
226//   [u8 version = 3]
227//   [u32 catalog_len][catalog bytes]
228//   [u32 users_len][users bytes]
229//   [u32 pubs_len][publications bytes]
230//   [u32 crc32]
231//
232// Layout — v4 (v6.1.4, subscriptions trailer):
233//   [8 bytes magic "SPGENV01"]
234//   [u8 version = 4]
235//   [u32 catalog_len][catalog bytes]
236//   [u32 users_len][users bytes]
237//   [u32 pubs_len][publications bytes]
238//   [u32 subs_len][subscriptions bytes]
239//   [u32 crc32]
240//
241// Layout — v5 (v6.2.0, statistics trailer):
242//   [8 bytes magic "SPGENV01"]
243//   [u8 version = 5]
244//   [u32 catalog_len][catalog bytes]
245//   [u32 users_len][users bytes]
246//   [u32 pubs_len][publications bytes]
247//   [u32 subs_len][subscriptions bytes]
248//   [u32 stats_len][statistics bytes]      ← NEW
249//   [u32 crc32]
250//
251// Writers emit v5 from v6.2.0 on. Readers accept all of {v1, v2,
252// v3, v4, v5}: v1/v2 load with empty publications / subscriptions /
253// statistics; v3 loads with empty subscriptions + statistics; v4
254// loads with empty statistics; v5 deserialises all three. Older
255// SPG versions reading a v5 envelope fall through the version
256// match to `EnvelopeParse::Bare` — pre-v6.2.0 binaries cannot
257// open v6.2.0+ snapshots (matches the v6.1.2 / v6.1.4 breaks).
258
/// Eight-byte magic prefix that marks a buffer as a snapshot envelope.
const ENVELOPE_MAGIC: &[u8; 8] = b"SPGENV01";
/// Version byte of the v1 layout: catalog + users, no CRC.
const ENVELOPE_VERSION_V1: u8 = 1;
/// Version byte of the v2 layout: v1 plus a trailing CRC32.
const ENVELOPE_VERSION_V2: u8 = 2;
/// Version byte of the v3 layout: adds the publications section.
const ENVELOPE_VERSION_V3: u8 = 3;
/// Version byte of the v4 layout: adds the subscriptions section.
const ENVELOPE_VERSION_V4: u8 = 4;
/// Version byte of the v5 layout: adds the statistics section. This is the
/// version `build_envelope` writes.
const ENVELOPE_VERSION_V5: u8 = 5;
265
266fn build_envelope(
267    catalog: &[u8],
268    users: &[u8],
269    pubs: &[u8],
270    subs: &[u8],
271    stats: &[u8],
272) -> Vec<u8> {
273    let mut out = Vec::with_capacity(
274        8 + 1
275            + 4
276            + catalog.len()
277            + 4
278            + users.len()
279            + 4
280            + pubs.len()
281            + 4
282            + subs.len()
283            + 4
284            + stats.len()
285            + 4,
286    );
287    out.extend_from_slice(ENVELOPE_MAGIC);
288    out.push(ENVELOPE_VERSION_V5);
289    out.extend_from_slice(
290        &u32::try_from(catalog.len())
291            .expect("≤ 4G catalog")
292            .to_le_bytes(),
293    );
294    out.extend_from_slice(catalog);
295    out.extend_from_slice(
296        &u32::try_from(users.len())
297            .expect("≤ 4G users")
298            .to_le_bytes(),
299    );
300    out.extend_from_slice(users);
301    out.extend_from_slice(
302        &u32::try_from(pubs.len())
303            .expect("≤ 4G publications")
304            .to_le_bytes(),
305    );
306    out.extend_from_slice(pubs);
307    out.extend_from_slice(
308        &u32::try_from(subs.len())
309            .expect("≤ 4G subscriptions")
310            .to_le_bytes(),
311    );
312    out.extend_from_slice(subs);
313    out.extend_from_slice(
314        &u32::try_from(stats.len())
315            .expect("≤ 4G statistics")
316            .to_le_bytes(),
317    );
318    out.extend_from_slice(stats);
319    let crc = spg_crypto::crc32::crc32(&out);
320    out.extend_from_slice(&crc.to_le_bytes());
321    out
322}
323
/// Outcome of envelope parsing.
///
/// * `Bare` — the buffer was not recognised as an envelope; the caller
///   falls back to treating it as a bare catalog blob, which preserves
///   v3.x readability.
/// * `Pair` — the sections of a valid v1–v5 envelope. Catalog and users
///   are always present; the three trailers are `None` when the envelope
///   version predates them.
/// * `CrcMismatch` — a v2-or-later envelope whose trailing CRC32 does not
///   match the body; an explicit corruption signal.
enum EnvelopeParse<'a> {
    Bare,
    Pair {
        /// Serialised catalog section.
        catalog: &'a [u8],
        /// Serialised user-store section.
        users: &'a [u8],
        /// Publications section; `Some` for v3 and later.
        publications: Option<&'a [u8]>,
        /// Subscriptions section; `Some` for v4 and later.
        subscriptions: Option<&'a [u8]>,
        /// Statistics section; `Some` for v5.
        statistics: Option<&'a [u8]>,
    },
    CrcMismatch {
        /// CRC32 stored in the envelope trailer.
        expected: u32,
        /// CRC32 recomputed over the envelope body.
        computed: u32,
    },
}
344
345/// Returns `EnvelopeParse::Pair` for a valid v1 / v2 / v3 envelope,
346/// `Bare` for a buffer that doesn't look like an envelope (v3.x
347/// bare catalog fallback), and `CrcMismatch` for a v2/v3 envelope
348/// whose trailing CRC32 doesn't match the body.
349fn split_envelope(buf: &[u8]) -> EnvelopeParse<'_> {
350    if buf.len() < 8 + 1 + 4 || &buf[..8] != ENVELOPE_MAGIC {
351        return EnvelopeParse::Bare;
352    }
353    let version = buf[8];
354    if !matches!(
355        version,
356        ENVELOPE_VERSION_V1
357            | ENVELOPE_VERSION_V2
358            | ENVELOPE_VERSION_V3
359            | ENVELOPE_VERSION_V4
360            | ENVELOPE_VERSION_V5
361    ) {
362        return EnvelopeParse::Bare;
363    }
364    let mut p = 9usize;
365    let Some(cat_len_bytes) = buf.get(p..p + 4) else {
366        return EnvelopeParse::Bare;
367    };
368    let Ok(cat_len_arr) = cat_len_bytes.try_into() else {
369        return EnvelopeParse::Bare;
370    };
371    let cat_len = u32::from_le_bytes(cat_len_arr) as usize;
372    p += 4;
373    if p + cat_len + 4 > buf.len() {
374        return EnvelopeParse::Bare;
375    }
376    let catalog = &buf[p..p + cat_len];
377    p += cat_len;
378    let Some(user_len_bytes) = buf.get(p..p + 4) else {
379        return EnvelopeParse::Bare;
380    };
381    let Ok(user_len_arr) = user_len_bytes.try_into() else {
382        return EnvelopeParse::Bare;
383    };
384    let user_len = u32::from_le_bytes(user_len_arr) as usize;
385    p += 4;
386    if p + user_len > buf.len() {
387        return EnvelopeParse::Bare;
388    }
389    let users = &buf[p..p + user_len];
390    p += user_len;
391    let publications = if matches!(
392        version,
393        ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
394    ) {
395        // [u32 pubs_len][publications bytes]
396        let Some(pubs_len_bytes) = buf.get(p..p + 4) else {
397            return EnvelopeParse::Bare;
398        };
399        let Ok(pubs_len_arr) = pubs_len_bytes.try_into() else {
400            return EnvelopeParse::Bare;
401        };
402        let pubs_len = u32::from_le_bytes(pubs_len_arr) as usize;
403        p += 4;
404        if p + pubs_len > buf.len() {
405            return EnvelopeParse::Bare;
406        }
407        let pubs_slice = &buf[p..p + pubs_len];
408        p += pubs_len;
409        Some(pubs_slice)
410    } else {
411        None
412    };
413    let subscriptions = if matches!(version, ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5) {
414        // [u32 subs_len][subscriptions bytes]
415        let Some(subs_len_bytes) = buf.get(p..p + 4) else {
416            return EnvelopeParse::Bare;
417        };
418        let Ok(subs_len_arr) = subs_len_bytes.try_into() else {
419            return EnvelopeParse::Bare;
420        };
421        let subs_len = u32::from_le_bytes(subs_len_arr) as usize;
422        p += 4;
423        if p + subs_len > buf.len() {
424            return EnvelopeParse::Bare;
425        }
426        let subs_slice = &buf[p..p + subs_len];
427        p += subs_len;
428        Some(subs_slice)
429    } else {
430        None
431    };
432    let statistics = if version == ENVELOPE_VERSION_V5 {
433        // [u32 stats_len][statistics bytes]
434        let Some(stats_len_bytes) = buf.get(p..p + 4) else {
435            return EnvelopeParse::Bare;
436        };
437        let Ok(stats_len_arr) = stats_len_bytes.try_into() else {
438            return EnvelopeParse::Bare;
439        };
440        let stats_len = u32::from_le_bytes(stats_len_arr) as usize;
441        p += 4;
442        if p + stats_len > buf.len() {
443            return EnvelopeParse::Bare;
444        }
445        let stats_slice = &buf[p..p + stats_len];
446        p += stats_len;
447        Some(stats_slice)
448    } else {
449        None
450    };
451    if matches!(
452        version,
453        ENVELOPE_VERSION_V2 | ENVELOPE_VERSION_V3 | ENVELOPE_VERSION_V4 | ENVELOPE_VERSION_V5
454    ) {
455        if p + 4 != buf.len() {
456            return EnvelopeParse::Bare;
457        }
458        let Ok(crc_arr) = buf[p..p + 4].try_into() else {
459            return EnvelopeParse::Bare;
460        };
461        let expected = u32::from_le_bytes(crc_arr);
462        let computed = spg_crypto::crc32::crc32(&buf[..p]);
463        if expected != computed {
464            return EnvelopeParse::CrcMismatch { expected, computed };
465        }
466    } else if p != buf.len() {
467        // v1: must end exactly at the users section.
468        return EnvelopeParse::Bare;
469    }
470    EnvelopeParse::Pair {
471        catalog,
472        users,
473        publications,
474        subscriptions,
475        statistics,
476    }
477}
478
/// v4.41.1 opaque transaction handle. Returned by `Engine::alloc_tx_id`,
/// threaded through `Engine::execute_in` so dispatch can identify which
/// in-flight TX a statement belongs to. `IMPLICIT_TX` is the reserved
/// slot every legacy caller — engine self-tests, spg-cli, spg-embedded,
/// startup replay — implicitly uses through the unchanged
/// `Engine::execute(sql)` API. v4.41.1 keeps at most one active slot at
/// runtime (dispatch holds `engine.write()` across the wrap, same as
/// v4.34); the map shape is here to let v4.42 turn on N in-flight
/// implicit TXs without reshuffling the engine internals.
///
/// Derives `Ord` so it can serve as the key of the engine's
/// `BTreeMap<TxId, TxState>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TxId(pub u64);
490
/// Reserved slot (`TxId(0)`) used by `Engine::execute(sql)` — the legacy
/// single-global-shadow path. New `alloc_tx_id` handles start at 1, so
/// they never collide with this slot.
pub const IMPLICIT_TX: TxId = TxId(0);
494
/// v6.7.3 — default segment-size threshold (4 MiB) used by `COMPACT COLD
/// SEGMENTS` when no explicit target is supplied. Segments whose
/// `OwnedSegment::bytes().len()` is **strictly** less than this
/// value are eligible to merge. spg-server reads
/// `SPG_COMPACTION_TARGET_SEGMENT_BYTES` to override.
pub const COMPACTION_TARGET_DEFAULT_BYTES: u64 = 4 * 1024 * 1024;
501
/// Per-slot transaction state. Held inside `tx_catalogs[tx_id]` for the
/// lifetime of a BEGIN..COMMIT (or BEGIN..ROLLBACK) window. Drops when
/// the TX commits (its `catalog` is moved over `Engine.catalog`) or
/// rolls back (slot removed, catalog discarded).
///
/// Private to the engine: callers only ever see the `TxId` that keys it.
#[derive(Debug, Default, Clone)]
struct TxState {
    /// The TX's shadow copy of the catalog. Started as a clone of
    /// `Engine.catalog` at BEGIN time; writes flow into it; COMMIT
    /// installs it over `Engine.catalog`. `Catalog::clone()` is O(1)
    /// since v4.40 (`PersistentVec` rows + `PersistentBTreeMap` indices).
    catalog: Catalog,
    /// Per-TX savepoint stack. Each entry pairs the savepoint name with
    /// a clone of `catalog` at the moment `SAVEPOINT <name>` fired.
    /// `ROLLBACK TO <name>` restores from the entry and pops everything
    /// after it; `RELEASE <name>` discards the entry and everything
    /// after; COMMIT/ROLLBACK clears the whole stack.
    savepoints: Vec<(String, Catalog)>,
}
520
521#[derive(Debug, Default)]
522pub struct Engine {
523    /// Committed catalog — what survives `Engine::snapshot()` and what
524    /// outside-TX `SELECT`s read.
525    catalog: Catalog,
526    /// Active TX slots, keyed by `TxId`. Empty when no TX is in flight.
527    /// v4.41.1 runtime invariant: at most one entry (single-writer
528    /// model unchanged). v4.42 will let dispatch hold multiple entries
529    /// concurrently for group commit + engine MVCC.
530    tx_catalogs: BTreeMap<TxId, TxState>,
531    /// Which slot the next exec_* call should mutate. Set by
532    /// `execute_in(sql, tx_id)` at the entry point; legacy `execute(sql)`
533    /// sets it to `IMPLICIT_TX`. None when no TX is in flight (read /
534    /// write goes straight against `catalog`).
535    current_tx: Option<TxId>,
536    /// Monotonic counter for `alloc_tx_id`. Starts at 1 — slot 0 is
537    /// reserved for `IMPLICIT_TX`.
538    next_tx_id: u64,
539    /// Optional wall clock used to satisfy `NOW()` / `CURRENT_TIMESTAMP`
540    /// / `CURRENT_DATE`. Set by the host environment.
541    clock: Option<ClockFn>,
542    /// v4.1 cryptographic RNG for per-user password salt. Set by the
543    /// host. `None` means SQL-driven `CREATE USER` uses a
544    /// deterministic fallback — see `SaltFn`.
545    salt_fn: Option<SaltFn>,
546    /// v4.2 per-query row cap. `None` = unlimited. When set, a
547    /// SELECT that materialises more than `n` rows returns
548    /// `EngineError::RowLimitExceeded`. Enforced before the result
549    /// is shaped into wire frames so a runaway scan can't blow the
550    /// server's heap.
551    max_query_rows: Option<usize>,
552    /// v4.1 RBAC user table. Empty means "no RBAC configured yet" —
553    /// the server decides what that means at the auth boundary
554    /// (open mode vs legacy single-password mode). User CRUD goes
555    /// through `create_user`/`drop_user`/`verify_user`; persistence
556    /// rides the snapshot envelope alongside the catalog.
557    users: UserStore,
558    /// v6.1.2 logical-replication publication catalog. Empty until
559    /// `CREATE PUBLICATION` runs. Persistence rides the v3 envelope
560    /// trailer (see `build_envelope`).
561    publications: publications::Publications,
562    /// v6.1.4 logical-replication subscription catalog. Empty until
563    /// `CREATE SUBSCRIPTION` runs. Persistence rides the v4 envelope
564    /// trailer.
565    subscriptions: subscriptions::Subscriptions,
566    /// v6.2.0 — per-column statistics for the cost-based optimizer.
567    /// Populated by `ANALYZE`; queried via `spg_statistic` virtual
568    /// table. Persistence rides the v5 envelope trailer.
569    statistics: statistics::Statistics,
570    /// v6.3.0 — engine-level plan cache. Caches the post-`prepare()`
571    /// `Statement` keyed on SQL text. In-memory only — does NOT ride
572    /// the snapshot envelope (rebuilt on demand after restart).
573    plan_cache: plan_cache::PlanCache,
574    /// v6.5.1 — per-distinct-SQL execution stats. In-memory only,
575    /// surfaced via `spg_stat_query` virtual table. Updated by the
576    /// `execute_*` paths after a successful execute.
577    query_stats: query_stats::QueryStats,
578    /// v6.5.2 — connection-state provider callback. spg-server
579    /// registers a function at startup that snapshots its
580    /// per-pgwire-connection registry into `ActivityRow`s; engine
581    /// reads through it on every `SELECT * FROM spg_stat_activity`.
582    /// `None` ⇒ no-data (returns empty rows; matches the no_std
583    /// embedded callers that don't run pgwire).
584    activity_provider: Option<ActivityProvider>,
585    /// v6.5.3 — audit-chain provider + verifier. Same pattern as
586    /// activity_provider: spg-server registers both at startup;
587    /// engine reads through on `SELECT * FROM spg_audit_chain` and
588    /// `SELECT * FROM spg_audit_verify`. `None` ⇒ no-data.
589    audit_chain_provider: Option<AuditChainProvider>,
590    audit_verifier: Option<AuditVerifier>,
591    /// v6.5.6 — slow-query log threshold in microseconds. When set,
592    /// every successful execute whose elapsed exceeds the threshold
593    /// gets fed to the registered slow-query log callback (so
594    /// spg-server can emit a structured log line). Default `None`
595    /// = no slow-query logging.
596    slow_query_threshold_us: Option<u64>,
597    slow_query_logger: Option<SlowQueryLogger>,
598}
599
/// v6.5.6 — callback signature for slow-query log emission. Called
/// with `(sql, elapsed_us)` once per successful execute that crosses
/// the threshold. A plain `fn` pointer, so the callback cannot capture
/// state.
pub type SlowQueryLogger = fn(&str, u64);
604
605/// v6.5.4 — synthesise a `CREATE TABLE` statement from catalog
606/// state. Round-trips through `Engine::execute` to recreate the
607/// same schema (sans data + indexes — indexes are emitted as a
608/// separate `CREATE INDEX` chain in `spg_database_ddl`).
609fn render_create_table(name: &str, columns: &[ColumnSchema]) -> String {
610    let mut out = alloc::format!("CREATE TABLE {name} (");
611    for (i, col) in columns.iter().enumerate() {
612        if i > 0 {
613            out.push_str(", ");
614        }
615        out.push_str(&col.name);
616        out.push(' ');
617        out.push_str(&render_data_type(col.ty));
618        if !col.nullable {
619            out.push_str(" NOT NULL");
620        }
621        if col.auto_increment {
622            out.push_str(" AUTO_INCREMENT");
623        }
624    }
625    out.push(')');
626    out
627}
628
629fn render_data_type(ty: DataType) -> String {
630    match ty {
631        DataType::SmallInt => "SMALLINT".into(),
632        DataType::Int => "INT".into(),
633        DataType::BigInt => "BIGINT".into(),
634        DataType::Float => "FLOAT".into(),
635        DataType::Text => "TEXT".into(),
636        DataType::Varchar(n) => alloc::format!("VARCHAR({n})"),
637        DataType::Char(n) => alloc::format!("CHAR({n})"),
638        DataType::Bool => "BOOL".into(),
639        DataType::Vector { dim, encoding } => match encoding {
640            spg_storage::VecEncoding::F32 => alloc::format!("VECTOR({dim})"),
641            spg_storage::VecEncoding::Sq8 => alloc::format!("VECTOR({dim}) USING SQ8"),
642            spg_storage::VecEncoding::F16 => alloc::format!("VECTOR({dim}) USING HALF"),
643        },
644        DataType::Numeric { precision, scale } => {
645            alloc::format!("NUMERIC({precision},{scale})")
646        }
647        DataType::Date => "DATE".into(),
648        DataType::Timestamp => "TIMESTAMP".into(),
649        DataType::Interval => "INTERVAL".into(),
650        DataType::Json => "JSON".into(),
651        DataType::Jsonb => "JSONB".into(),
652        DataType::Timestamptz => "TIMESTAMPTZ".into(),
653        DataType::Bytes => "BYTEA".into(),
654    }
655}
656
/// v6.5.2 — one row of `spg_stat_activity`. Engine-public so
/// spg-server can construct rows without re-exporting internal
/// dispatch types.
#[derive(Debug, Clone)]
pub struct ActivityRow {
    /// Identifier of the connection the row describes.
    /// NOTE(review): presumably the pgwire backend pid — confirm in spg-server.
    pub pid: u32,
    /// User name associated with the connection.
    pub user: String,
    /// Start timestamp; the `_us` suffix suggests microseconds.
    /// NOTE(review): epoch and exact meaning (connection vs query start)
    /// are not visible here — confirm.
    pub started_at_us: i64,
    /// SQL text currently attributed to the connection.
    pub current_sql: String,
    /// What the connection is waiting on, as free-form text.
    pub wait_event: String,
    /// Elapsed time; the `_us` suffix suggests microseconds.
    pub elapsed_us: i64,
    /// Whether the connection currently has a transaction open.
    pub in_transaction: bool,
}
670
/// v6.5.2 — provider callback type for `spg_stat_activity`. Returns a
/// fresh, owned snapshot on each call; the engine doesn't cache the
/// result.
pub type ActivityProvider = fn() -> Vec<ActivityRow>;
674
/// v6.5.3 — one row of `spg_audit_chain`. Engine-public so
/// spg-server can construct rows directly from `AuditEntry`.
#[derive(Debug, Clone)]
pub struct AuditRow {
    /// Sequence number of the entry within the audit chain.
    pub seq: i64,
    /// Entry timestamp; the `_ms` suffix suggests milliseconds.
    /// NOTE(review): epoch not visible here — confirm.
    pub ts_ms: i64,
    /// Hash of the previous chain entry, hex-encoded.
    pub prev_hash_hex: String,
    /// Hash of this entry, hex-encoded.
    pub entry_hash_hex: String,
    /// SQL text recorded for the entry.
    pub sql: String,
}
685
/// v6.5.3 — chain-table provider. spg-server registers a fn pointer that
/// snapshots the audit log into `AuditRow`s.
pub type AuditChainProvider = fn() -> Vec<AuditRow>;
/// v6.5.3 — chain verifier. spg-server registers a fn pointer that
/// verifies the audit log and returns `(verified_count, broken_at_seq)` —
/// `broken_at_seq` is `-1` on a clean chain.
pub type AuditVerifier = fn() -> (i64, i64);
692
693impl Engine {
    /// Creates an empty engine: fresh catalog, no open transaction, no
    /// users / publications / subscriptions / statistics, empty caches,
    /// and none of the host-supplied hooks (clock, salt source, providers,
    /// slow-query logger) installed.
    pub fn new() -> Self {
        Self {
            catalog: Catalog::new(),
            tx_catalogs: BTreeMap::new(),
            current_tx: None,
            // Slot 0 is `IMPLICIT_TX`; allocated ids start at 1.
            next_tx_id: 1,
            clock: None,
            salt_fn: None,
            max_query_rows: None,
            users: UserStore::new(),
            publications: publications::Publications::new(),
            subscriptions: subscriptions::Subscriptions::new(),
            statistics: statistics::Statistics::new(),
            plan_cache: plan_cache::PlanCache::new(),
            query_stats: query_stats::QueryStats::new(),
            activity_provider: None,
            audit_chain_provider: None,
            audit_verifier: None,
            slow_query_threshold_us: None,
            slow_query_logger: None,
        }
    }
716
717    /// Construct an engine restored from a previously-snapshotted catalog
718    /// (see `snapshot()`).
719    pub fn restore(catalog: Catalog) -> Self {
720        Self {
721            catalog,
722            tx_catalogs: BTreeMap::new(),
723            current_tx: None,
724            next_tx_id: 1,
725            clock: None,
726            salt_fn: None,
727            max_query_rows: None,
728            users: UserStore::new(),
729            publications: publications::Publications::new(),
730            subscriptions: subscriptions::Subscriptions::new(),
731            statistics: statistics::Statistics::new(),
732            plan_cache: plan_cache::PlanCache::new(),
733            query_stats: query_stats::QueryStats::new(),
734            activity_provider: None,
735            audit_chain_provider: None,
736            audit_verifier: None,
737            slow_query_threshold_us: None,
738            slow_query_logger: None,
739        }
740    }
741
742    /// Restore an engine + user table from a v4.1 envelope produced
743    /// by `snapshot_with_users()`. Falls back to plain catalog-only
744    /// restore if the envelope magic isn't present (so v3.x snapshot
745    /// files still load). v6.1.2 adds the optional publications
746    /// trailer (envelope v3); a v1/v2 envelope deserialises to an
747    /// empty publication table.
748    pub fn restore_envelope(buf: &[u8]) -> Result<Self, EngineError> {
749        match split_envelope(buf) {
750            EnvelopeParse::Pair {
751                catalog: catalog_bytes,
752                users: user_bytes,
753                publications: pub_bytes,
754                subscriptions: sub_bytes,
755                statistics: stats_bytes,
756            } => {
757                let catalog = Catalog::deserialize(catalog_bytes).map_err(EngineError::Storage)?;
758                let users = users::deserialize_users(user_bytes)
759                    .map_err(|e| EngineError::Unsupported(alloc::format!("users restore: {e}")))?;
760                let publications = match pub_bytes {
761                    Some(b) => publications::Publications::deserialize(b).map_err(|e| {
762                        EngineError::Unsupported(alloc::format!("publications restore: {e:?}"))
763                    })?,
764                    None => publications::Publications::new(),
765                };
766                let subscriptions = match sub_bytes {
767                    Some(b) => subscriptions::Subscriptions::deserialize(b).map_err(|e| {
768                        EngineError::Unsupported(alloc::format!("subscriptions restore: {e:?}"))
769                    })?,
770                    None => subscriptions::Subscriptions::new(),
771                };
772                let statistics = match stats_bytes {
773                    Some(b) => statistics::Statistics::deserialize(b).map_err(|e| {
774                        EngineError::Unsupported(alloc::format!("statistics restore: {e:?}"))
775                    })?,
776                    None => statistics::Statistics::new(),
777                };
778                Ok(Self {
779                    catalog,
780                    tx_catalogs: BTreeMap::new(),
781                    current_tx: None,
782                    next_tx_id: 1,
783                    clock: None,
784                    salt_fn: None,
785                    max_query_rows: None,
786                    users,
787                    publications,
788                    subscriptions,
789                    statistics,
790                    plan_cache: plan_cache::PlanCache::new(),
791                    query_stats: query_stats::QueryStats::new(),
792                    activity_provider: None,
793                    audit_chain_provider: None,
794                    audit_verifier: None,
795                    slow_query_threshold_us: None,
796                    slow_query_logger: None,
797                })
798            }
799            EnvelopeParse::CrcMismatch { expected, computed } => {
800                Err(EngineError::Storage(StorageError::Corrupt(alloc::format!(
801                    "snapshot envelope CRC32 mismatch (expected={expected:#010x}, computed={computed:#010x})"
802                ))))
803            }
804            EnvelopeParse::Bare => {
805                let catalog = Catalog::deserialize(buf).map_err(EngineError::Storage)?;
806                Ok(Self::restore(catalog))
807            }
808        }
809    }
810
    /// Read-only access to the engine's [`UserStore`].
    pub const fn users(&self) -> &UserStore {
        &self.users
    }
814
815    /// `salt` is supplied by the caller (the host has a random
816    /// source; the engine is `no_std`). Caller should pass a fresh
817    /// 16-byte random value per user.
818    pub fn create_user(
819        &mut self,
820        name: &str,
821        password: &str,
822        role: Role,
823        salt: [u8; 16],
824    ) -> Result<(), UserError> {
825        self.users.create(name, password, role, salt)?;
826        // v4.8: also derive SCRAM-SHA-256 secrets so PG-wire SASL
827        // auth can verify without re-running PBKDF2 per attempt.
828        // Uses a fresh salt from the host RNG (falls back to a
829        // deterministic per-username salt when no RNG is wired, same
830        // as the legacy hash path).
831        let scram_salt = self.salt_fn.map_or_else(
832            || {
833                let mut s = [0u8; users::SCRAM_SALT_LEN];
834                let digest = spg_crypto::hash(name.as_bytes());
835                // Use bytes 16..32 of BLAKE3 so we don't reuse the
836                // exact same fallback salt as the BLAKE3 hash path.
837                s.copy_from_slice(&digest[16..32]);
838                s
839            },
840            |f| f(),
841        );
842        self.users
843            .enable_scram(name, password, scram_salt, users::SCRAM_DEFAULT_ITERS)?;
844        Ok(())
845    }
846
    /// Remove the user called `name` by delegating to `UserStore::drop`.
    ///
    /// # Errors
    ///
    /// Returns the store's [`UserError`] unchanged.
    pub fn drop_user(&mut self, name: &str) -> Result<(), UserError> {
        self.users.drop(name)
    }
850
    /// Check `name` / `password` against the user store and return the
    /// result of `UserStore::verify` unchanged.
    ///
    /// NOTE(review): presumably `Some(role)` means the credentials matched
    /// and `None` covers both "unknown user" and "wrong password" —
    /// confirm against `UserStore::verify`.
    pub fn verify_user(&self, name: &str, password: &str) -> Option<Role> {
        self.users.verify(name, password)
    }
854
    /// Builder: attach a wall clock so `NOW()` / `CURRENT_TIMESTAMP` /
    /// `CURRENT_DATE` evaluate to a real value instead of erroring out.
    ///
    /// The same clock is read by the write path to time statements for
    /// query statistics and the slow-query log. Consumes and returns
    /// `self` so calls can be chained.
    #[must_use]
    pub const fn with_clock(mut self, clock: ClockFn) -> Self {
        self.clock = Some(clock);
        self
    }
862
    /// Builder: attach an OS-backed RNG for per-user password salts.
    /// The host (`spg-server`) typically wires this to `/dev/urandom`.
    ///
    /// `create_user` calls `f` to obtain the SCRAM salt; when no function
    /// is attached it falls back to a deterministic salt derived from the
    /// username. Consumes and returns `self` so calls can be chained.
    #[must_use]
    pub const fn with_salt_fn(mut self, f: SaltFn) -> Self {
        self.salt_fn = Some(f);
        self
    }
870
    /// Builder: cap the number of rows a single statement may return.
    /// Exceeding the cap raises `EngineError::RowLimitExceeded`.
    ///
    /// The bound is applied by `enforce_row_limit` on the finished
    /// result: a `QueryResult::Rows` holding more than `n` rows is
    /// replaced by the error before it leaves the engine. A result of
    /// exactly `n` rows is allowed.
    ///
    /// NOTE(review): earlier wording said the bound is checked inside the
    /// executor so a runaway scan cannot allocate millions of rows first;
    /// the check visible in `enforce_row_limit` runs after the rows are
    /// materialised — confirm whether `exec_select` also enforces it.
    #[must_use]
    pub const fn with_max_query_rows(mut self, n: usize) -> Self {
        self.max_query_rows = Some(n);
        self
    }
881
    /// The *committed* catalog. Note: during a transaction this returns the
    /// pre-TX state — `SELECT` inside a TX goes through `execute()` and reads
    /// the shadow. Tests that inspect outside-TX state should use this.
    ///
    /// This always borrows `self.catalog`; it never consults `tx_catalogs`.
    pub const fn catalog(&self) -> &Catalog {
        &self.catalog
    }
888
889    /// Serialize the *committed* catalog to bytes. v0.6 was full-snapshot; v0.9
890    /// adds the rule that an open TX's shadow is never snapshotted — only the
891    /// post-COMMIT state is persisted. v4.1 wraps the catalog in an envelope
892    /// when there are users to persist; an empty user table snapshots as the
893    /// bare catalog format (backwards-compat with v3.x readers). v6.1.2
894    /// adds publications to the envelope condition: either non-empty
895    /// users OR non-empty publications now triggers the envelope path.
896    pub fn snapshot(&self) -> Vec<u8> {
897        if self.users.is_empty()
898            && self.publications.is_empty()
899            && self.subscriptions.is_empty()
900            && self.statistics.is_empty()
901        {
902            self.catalog.serialize()
903        } else {
904            build_envelope(
905                &self.catalog.serialize(),
906                &users::serialize_users(&self.users),
907                &self.publications.serialize(),
908                &self.subscriptions.serialize(),
909                &self.statistics.serialize(),
910            )
911        }
912    }
913
    /// True when at least one TX slot is in flight, i.e. `tx_catalogs`
    /// is non-empty. v4.41.1 runtime
    /// invariant: at most one slot active at a time (dispatch holds
    /// `engine.write()` across the entire wrap). v4.42 will let this
    /// return true with multiple slots concurrently.
    pub fn in_transaction(&self) -> bool {
        !self.tx_catalogs.is_empty()
    }
921
922    /// v4.41.1 allocate a fresh TX handle. Used by spg-server dispatch
923    /// to scope each implicit-wrap BEGIN..stmt..COMMIT to its own slot
924    /// in `tx_catalogs`. v4.42 — the commit-barrier leader allocates
925    /// one of these per task in its group, runs `BEGIN`+sql+`COMMIT`
926    /// sequentially under a single `engine.write()` so each task's
927    /// mutations accumulate into shared state, then either keeps the
928    /// accumulated state (fsync OK) or restores the pre-image via
929    /// `replace_catalog` (fsync err).
930    pub fn alloc_tx_id(&mut self) -> TxId {
931        let id = TxId(self.next_tx_id);
932        self.next_tx_id = self.next_tx_id.saturating_add(1);
933        id
934    }
935
    /// v4.42 — atomically replace the live catalog. Used by the
    /// commit-barrier leader to roll back a group whose batched
    /// fsync failed: the leader snapshots `engine.catalog().clone()`
    /// (O(1) Arc bump after the v4.39/v4.40 persistent migration)
    /// at group start, sequentially applies each task's BEGIN+sql+
    /// COMMIT under the same write lock to accumulate mutations
    /// into shared state, batches the WAL bytes, fsyncs once, and
    /// on failure calls this with the pre-image to undo every
    /// task in the group at once.
    ///
    /// **Does NOT touch `tx_catalogs` / `current_tx`.** Any
    /// explicit-TX slot from a concurrent client (created via the
    /// legacy `IMPLICIT_TX`-less dispatch path or via the future
    /// MVCC-readers v5+ work) has its own snapshot baked into the
    /// slot — restoring `self.catalog` to the pre-image leaves
    /// those slots untouched, exactly as they were when the leader
    /// took the lock. The leader's own implicit-TX slots are all
    /// already discarded (`exec_commit` removed them as each
    /// task's COMMIT ran) by the time this is reached.
    ///
    /// The only field written is `self.catalog`; the previous catalog
    /// value is dropped by the assignment.
    pub fn replace_catalog(&mut self, catalog: Catalog) {
        self.catalog = catalog;
    }
958
959    /// v6.7.0 — public shim around `Catalog::freeze_oldest_to_cold`
960    /// so tests + the spg-server freezer can drive a freeze without
961    /// reaching into the private `active_catalog_mut`. v6.7.4
962    /// parallel freezer will build on this surface.
963    ///
964    /// Marks the table's cached `cold_row_count` stale because the
965    /// freeze added cold locators that ANALYZE hasn't yet refreshed.
966    pub fn freeze_oldest_to_cold(
967        &mut self,
968        table_name: &str,
969        index_name: &str,
970        max_rows: usize,
971    ) -> Result<spg_storage::FreezeReport, EngineError> {
972        let report = self
973            .active_catalog_mut()
974            .freeze_oldest_to_cold(table_name, index_name, max_rows)
975            .map_err(EngineError::Storage)?;
976        if let Some(t) = self.active_catalog_mut().get_mut(table_name) {
977            t.mark_cold_row_count_stale();
978        }
979        Ok(report)
980    }
981
982    /// v6.7.5 — public shim used by the spg-server follower's
983    /// segment-forwarding receiver. Registers a cold-tier segment
984    /// at a specific id (the master's id, as transmitted on the
985    /// wire) so the follower's BTree-Cold locators stay byte-
986    /// identical with the master's. Wraps
987    /// `Catalog::load_segment_bytes_at` under the standard
988    /// clone-mutate-replace pattern.
989    ///
990    /// Returns `Ok(())` on success **and** on the "slot already
991    /// occupied" case — a follower mid-reconnect may receive a
992    /// segment chunk for a segment_id it already has on disk
993    /// (forwarded last session); the caller should treat that
994    /// path as a no-op rather than a fatal error.
995    pub fn receive_cold_segment(
996        &mut self,
997        segment_id: u32,
998        bytes: Vec<u8>,
999    ) -> Result<(), EngineError> {
1000        let mut new_cat = self.catalog.clone();
1001        match new_cat.load_segment_bytes_at(segment_id, bytes) {
1002            Ok(()) => {
1003                self.replace_catalog(new_cat);
1004                Ok(())
1005            }
1006            Err(StorageError::Corrupt(msg)) if msg.contains("already occupied") => Ok(()),
1007            Err(e) => Err(EngineError::Storage(e)),
1008        }
1009    }
1010
1011    /// v6.7.3 — public shim around `Catalog::compact_cold_segments`
1012    /// driving every BTree index on every user table. Returns one
1013    /// `(table, index, report)` triple for each merge that
1014    /// actually happened (no-op (table, index) pairs are filtered
1015    /// out so callers can size persist-side work to the live
1016    /// merges). Caller is responsible for persisting each
1017    /// `report.merged_segment_bytes` and updating the on-disk
1018    /// segment registry; engine layer is no_std and never
1019    /// touches disk.
1020    ///
1021    /// Marks every touched table's cached `cold_row_count` stale
1022    /// — compaction GC'd some shadowed rows, so the count must be
1023    /// re-derived on the next ANALYZE.
1024    pub fn compact_cold_segments_with_target(
1025        &mut self,
1026        target_segment_bytes: u64,
1027    ) -> Result<Vec<(String, String, CompactReport)>, EngineError> {
1028        let table_names = self.active_catalog().table_names();
1029        let mut reports: Vec<(String, String, CompactReport)> = Vec::new();
1030        for tname in table_names {
1031            if is_internal_table_name(&tname) {
1032                continue;
1033            }
1034            let idx_names: Vec<String> = {
1035                let Some(t) = self.active_catalog().get(&tname) else {
1036                    continue;
1037                };
1038                t.indices()
1039                    .iter()
1040                    .filter(|i| matches!(i.kind, IndexKind::BTree(_)))
1041                    .map(|i| i.name.clone())
1042                    .collect()
1043            };
1044            for iname in idx_names {
1045                let report = self
1046                    .active_catalog_mut()
1047                    .compact_cold_segments(&tname, &iname, target_segment_bytes)
1048                    .map_err(EngineError::Storage)?;
1049                if report.merged_segment_id.is_some() {
1050                    if let Some(t) = self.active_catalog_mut().get_mut(&tname) {
1051                        t.mark_cold_row_count_stale();
1052                    }
1053                    reports.push((tname.clone(), iname, report));
1054                }
1055            }
1056        }
1057        Ok(reports)
1058    }
1059
1060    fn active_catalog(&self) -> &Catalog {
1061        match self.current_tx {
1062            Some(t) => self
1063                .tx_catalogs
1064                .get(&t)
1065                .map_or(&self.catalog, |s| &s.catalog),
1066            None => &self.catalog,
1067        }
1068    }
1069
1070    fn active_catalog_mut(&mut self) -> &mut Catalog {
1071        let tx = self.current_tx;
1072        match tx {
1073            Some(t) => match self.tx_catalogs.get_mut(&t) {
1074                Some(s) => &mut s.catalog,
1075                None => &mut self.catalog,
1076            },
1077            None => &mut self.catalog,
1078        }
1079    }
1080
    /// Read-only execute path. Succeeds for `SELECT`, `EXPLAIN`, and the
    /// `SHOW TABLES` / `SHOW COLUMNS` / `SHOW USERS` /
    /// `SHOW PUBLICATIONS` / `SHOW SUBSCRIPTIONS` family. `WAIT FOR WAL
    /// POSITION` yields `EngineError::Unsupported`; every other
    /// statement returns `EngineError::WriteRequired`, so the caller can
    /// fall through to the
    /// `&mut self` `execute` path under a write lock. Engine state is
    /// not mutated even on the success path (`rewrite_clock_calls`
    /// and `resolve_order_by_position` both mutate the locally-owned
    /// AST, not `self`).
    ///
    /// Equivalent to `execute_readonly_with_cancel` with a token that
    /// never cancels.
    ///
    /// **v4.0 concurrency**: this is the entry point the server takes
    /// under an `RwLock::read()` so multiple `SELECT` clients run in
    /// parallel without serialising on a single mutex.
    pub fn execute_readonly(&self, sql: &str) -> Result<QueryResult, EngineError> {
        self.execute_readonly_with_cancel(sql, CancelToken::none())
    }
1095
1096    /// v4.5 — read path with cooperative cancellation. Token's
1097    /// `is_cancelled` is checked at the start (so a watchdog that
1098    /// already fired returns Cancelled immediately) and at row-loop
1099    /// checkpoints inside `exec_select`. SHOW paths are O(small) and
1100    /// don't bother checking.
1101    pub fn execute_readonly_with_cancel(
1102        &self,
1103        sql: &str,
1104        cancel: CancelToken<'_>,
1105    ) -> Result<QueryResult, EngineError> {
1106        cancel.check()?;
1107        let mut stmt = parser::parse_statement(sql)?;
1108        let now_micros = self.clock.map(|f| f());
1109        rewrite_clock_calls(&mut stmt, now_micros);
1110        if let Statement::Select(s) = &mut stmt {
1111            resolve_order_by_position(s);
1112            // v6.2.3 — cost-based JOIN reorder (read path).
1113            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1114        }
1115        let result = match stmt {
1116            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1117            Statement::ShowTables => Ok(self.exec_show_tables()),
1118            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1119            Statement::ShowUsers => Ok(self.exec_show_users()),
1120            Statement::ShowPublications => Ok(self.exec_show_publications()),
1121            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1122            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1123                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1124            )),
1125            Statement::Explain(e) => self.exec_explain(&e, cancel),
1126            _ => Err(EngineError::WriteRequired),
1127        };
1128        self.enforce_row_limit(result)
1129    }
1130
1131    /// v4.2: cap result-set size. Applied after the executor
1132    /// materialises rows but before they leave the engine — wrapping
1133    /// every Rows-returning exec_* function would scatter the check.
1134    fn enforce_row_limit(
1135        &self,
1136        result: Result<QueryResult, EngineError>,
1137    ) -> Result<QueryResult, EngineError> {
1138        if let (Ok(QueryResult::Rows { rows, .. }), Some(cap)) = (&result, self.max_query_rows)
1139            && rows.len() > cap
1140        {
1141            return Err(EngineError::RowLimitExceeded(cap));
1142        }
1143        result
1144    }
1145
    /// Parse and run one SQL statement on the write path, scoped to the
    /// `IMPLICIT_TX` slot and without cancellation. Shorthand for
    /// `execute_in_with_cancel(sql, IMPLICIT_TX, CancelToken::none())`.
    pub fn execute(&mut self, sql: &str) -> Result<QueryResult, EngineError> {
        self.execute_in_with_cancel(sql, IMPLICIT_TX, CancelToken::none())
    }
1149
    /// v4.5 — write path with cooperative cancellation. Same dispatch
    /// as `execute_in_with_cancel(sql, IMPLICIT_TX, cancel)`. Kept as
    /// a separate entry point for backward-compat with the v4.5
    /// public API.
    ///
    /// `cancel` is forwarded unchanged; the statement runs in the
    /// `IMPLICIT_TX` slot.
    pub fn execute_with_cancel(
        &mut self,
        sql: &str,
        cancel: CancelToken<'_>,
    ) -> Result<QueryResult, EngineError> {
        self.execute_in_with_cancel(sql, IMPLICIT_TX, cancel)
    }
1161
    /// v4.41.1 multi-slot write entry. Routes `sql` through the TX
    /// slot identified by `tx_id` so spg-server dispatch can scope
    /// each implicit-wrap BEGIN..stmt..COMMIT to its own slot in
    /// `tx_catalogs`. `IMPLICIT_TX` is the legacy single-slot path
    /// every other caller (engine self-tests, replay, spg-embedded)
    /// implicitly takes via `execute()` / `execute_with_cancel()`.
    ///
    /// Runs without cancellation: shorthand for
    /// `execute_in_with_cancel(sql, tx_id, CancelToken::none())`.
    pub fn execute_in(&mut self, sql: &str, tx_id: TxId) -> Result<QueryResult, EngineError> {
        self.execute_in_with_cancel(sql, tx_id, CancelToken::none())
    }
1171
1172    /// v4.41.1 write path with cooperative cancellation + explicit TX
1173    /// scope. Sets `self.current_tx` for the duration of the call so
1174    /// every `exec_*` helper transparently sees its TX's shadow
1175    /// catalog and savepoint stack; restores on exit so the field is
1176    /// only valid mid-call (no leakage across calls).
1177    pub fn execute_in_with_cancel(
1178        &mut self,
1179        sql: &str,
1180        tx_id: TxId,
1181        cancel: CancelToken<'_>,
1182    ) -> Result<QueryResult, EngineError> {
1183        let saved = self.current_tx;
1184        self.current_tx = Some(tx_id);
1185        let result = self.execute_inner_with_cancel(sql, cancel);
1186        self.current_tx = saved;
1187        result
1188    }
1189
1190    /// v6.1.1 — parse and pre-process a SQL string ONCE so the
1191    /// resulting [`Statement`] can be cached and re-executed via
1192    /// [`Engine::execute_prepared`]. Returns the same `Statement`
1193    /// the simple-query path would synthesise internally (clock
1194    /// rewrites + ORDER BY position-ref resolution applied at
1195    /// prepare time, since both are session-independent). The
1196    /// `$N` placeholders in the SQL stay as `Expr::Placeholder(n)`
1197    /// nodes; they're resolved to concrete values per-call by
1198    /// `execute_prepared`'s substitution walk.
1199    ///
1200    /// Pgwire's `Parse` (P) message lands here.
1201    pub fn prepare(&self, sql: &str) -> Result<Statement, ParseError> {
1202        let mut stmt = parser::parse_statement(sql)?;
1203        let now_micros = self.clock.map(|f| f());
1204        rewrite_clock_calls(&mut stmt, now_micros);
1205        if let Statement::Select(s) = &mut stmt {
1206            // v6.4.1 — expand `GROUP BY ALL` to every non-aggregate
1207            // SELECT-list item BEFORE position / alias resolution so
1208            // downstream passes see the explicit list.
1209            expand_group_by_all(s);
1210            resolve_order_by_position(s);
1211            // v6.2.3 — cost-based JOIN reorder. No-op for
1212            // single-table FROMs or any non-INNER join shape.
1213            reorder::reorder_joins(s, &self.catalog, &self.statistics);
1214        }
1215        Ok(stmt)
1216    }
1217
1218    /// v6.3.0 — cached prepare. Returns a cloned `Statement` from
1219    /// the plan cache on hit, runs the full `prepare()` path on miss
1220    /// and inserts the resulting plan before returning. Skipping the
1221    /// parse + JOIN-reorder pipeline on hit is the dominant win for
1222    /// JDBC / sqlx / pgx clients that reuse the same SQL string.
1223    ///
1224    /// Returns a cloned `Statement` (not a borrow) because the
1225    /// pgwire layer owns its `PreparedStmt` map per-session and the
1226    /// engine-level cache must stay available for other sessions.
1227    /// Clone cost on a 5-table JOIN AST is well under the parse cost
1228    /// it replaces.
1229    pub fn prepare_cached(&mut self, sql: &str) -> Result<Statement, ParseError> {
1230        // v6.3.1 — version-aware lookup. If the cached plan was
1231        // prepared before the most recent ANALYZE, evict and replan.
1232        let current_version = self.statistics.version();
1233        if let Some(plan) = self.plan_cache.get(sql) {
1234            if plan.statistics_version == current_version {
1235                return Ok(plan.stmt.clone());
1236            }
1237            // Stale entry — fall through to evict + re-prepare.
1238        }
1239        self.plan_cache.evict(sql);
1240        let stmt = self.prepare(sql)?;
1241        let source_tables = plan_cache::collect_source_tables(&stmt);
1242        let plan = plan_cache::PreparedPlan {
1243            stmt: stmt.clone(),
1244            statistics_version: current_version,
1245            source_tables,
1246            describe_columns: alloc::vec::Vec::new(),
1247        };
1248        self.plan_cache.insert(String::from(sql), plan);
1249        Ok(stmt)
1250    }
1251
    /// v6.3.0 — read-only accessor for tests and v6.3.1 invalidation.
    /// Borrows the engine-level plan cache that `prepare_cached` fills.
    pub fn plan_cache(&self) -> &plan_cache::PlanCache {
        &self.plan_cache
    }
1256
    /// v6.3.0 — mutable accessor for v6.3.1 invalidation hooks.
    /// Mutably borrows the engine-level plan cache that `prepare_cached`
    /// fills.
    pub fn plan_cache_mut(&mut self) -> &mut plan_cache::PlanCache {
        &mut self.plan_cache
    }
1261
    /// v6.3.3 — Describe a prepared `Statement` without executing.
    /// Returns `(parameter_oids, output_columns)`. Empty
    /// `output_columns` means the statement has no row-producing
    /// shape we could resolve here (JOIN, subquery, non-SELECT, …)
    /// — pgwire layer maps that to a `NoData` reply.
    ///
    /// Delegates to `describe::describe_prepared`, resolving names
    /// against the active catalog (see `active_catalog`).
    pub fn describe_prepared(
        &self,
        stmt: &Statement,
    ) -> (Vec<u32>, Vec<ColumnSchema>) {
        describe::describe_prepared(stmt, self.active_catalog())
    }
1273
    /// v6.1.1 — execute a [`Statement`] previously returned by
    /// [`Engine::prepare`], substituting `Expr::Placeholder(n)`
    /// nodes for the corresponding [`Value`] in `params` (1-based
    /// per PG: `$1` → `params[0]`). Bind-time string parameters
    /// are decoded into typed `Value`s by the pgwire layer before
    /// this call so the resulting AST hits the same execution
    /// path as a simple query — no SQL re-parse.
    ///
    /// Pgwire's `Execute` (E) message after a `Bind` (B) lands here.
    ///
    /// Runs without cancellation. A substitution failure is returned
    /// before anything executes.
    ///
    /// NOTE(review): unlike `execute_in_with_cancel`, this entry point
    /// does not set `current_tx`, and unlike `execute_inner_with_cancel`
    /// it records nothing into `query_stats` and never fires the
    /// slow-query logger — confirm that is intended for prepared
    /// statements.
    pub fn execute_prepared(
        &mut self,
        mut stmt: Statement,
        params: &[Value],
    ) -> Result<QueryResult, EngineError> {
        substitute_placeholders(&mut stmt, params)?;
        self.execute_stmt_with_cancel(stmt, CancelToken::none())
    }
1291
1292    fn execute_inner_with_cancel(
1293        &mut self,
1294        sql: &str,
1295        cancel: CancelToken<'_>,
1296    ) -> Result<QueryResult, EngineError> {
1297        cancel.check()?;
1298        let stmt = self.prepare(sql)?;
1299        // v6.5.1 — wrap the executor with a wall-clock window so we
1300        // can record into spg_stat_query. Skip when the engine has
1301        // no clock attached (no_std embedded callers).
1302        let start_us = self.clock.map(|f| f());
1303        let result = self.execute_stmt_with_cancel(stmt, cancel);
1304        if let (Some(t0), Ok(_)) = (start_us, &result) {
1305            let now = self.clock.map_or(t0, |f| f());
1306            let elapsed = now.saturating_sub(t0).max(0) as u64;
1307            self.query_stats.record(sql, elapsed, now as u64);
1308            // v6.5.6 — slow-query log: fire callback when elapsed
1309            // exceeds the configured floor.
1310            if let (Some(threshold), Some(logger)) =
1311                (self.slow_query_threshold_us, self.slow_query_logger)
1312                && elapsed >= threshold
1313            {
1314                logger(sql, elapsed);
1315            }
1316        }
1317        result
1318    }
1319
1320    fn execute_stmt_with_cancel(
1321        &mut self,
1322        stmt: Statement,
1323        cancel: CancelToken<'_>,
1324    ) -> Result<QueryResult, EngineError> {
1325        cancel.check()?;
1326        let result = match stmt {
1327            Statement::CreateTable(s) => self.exec_create_table(s),
1328            // v7.9.15 — CREATE EXTENSION is a no-op on SPG. Returns
1329            // CommandOk with affected=0; modified_catalog=false so
1330            // the WAL doesn't grow a useless entry. mailrs F3.
1331            Statement::CreateExtension(_) => Ok(QueryResult::CommandOk {
1332                affected: 0,
1333                modified_catalog: false,
1334            }),
1335            // v7.9.27 — DO $$ ... $$ is also a no-op (SPG has no
1336            // PL/pgSQL). mailrs H1 + pg_dump compat.
1337            Statement::DoBlock => Ok(QueryResult::CommandOk {
1338                affected: 0,
1339                modified_catalog: false,
1340            }),
1341            Statement::CreateIndex(s) => self.exec_create_index(s),
1342            Statement::Insert(s) => self.exec_insert(s),
1343            Statement::Update(s) => self.exec_update_cancel(&s, cancel),
1344            Statement::Delete(s) => self.exec_delete_cancel(&s, cancel),
1345            Statement::Select(s) => self.exec_select_cancel(&s, cancel),
1346            Statement::Begin => self.exec_begin(),
1347            Statement::Commit => self.exec_commit(),
1348            Statement::Rollback => self.exec_rollback(),
1349            Statement::Savepoint(name) => self.exec_savepoint(name),
1350            Statement::RollbackToSavepoint(name) => self.exec_rollback_to_savepoint(&name),
1351            Statement::ReleaseSavepoint(name) => self.exec_release_savepoint(&name),
1352            Statement::ShowTables => Ok(self.exec_show_tables()),
1353            Statement::ShowColumns(table) => self.exec_show_columns(&table),
1354            Statement::ShowUsers => Ok(self.exec_show_users()),
1355            Statement::ShowPublications => Ok(self.exec_show_publications()),
1356            Statement::ShowSubscriptions => Ok(self.exec_show_subscriptions()),
1357            Statement::CreateUser(s) => self.exec_create_user(&s),
1358            Statement::DropUser(name) => self.exec_drop_user(&name),
1359            Statement::Explain(e) => self.exec_explain(&e, cancel),
1360            Statement::AlterIndex(s) => self.exec_alter_index(s),
1361            Statement::AlterTable(s) => self.exec_alter_table(s),
1362            Statement::CreatePublication(s) => self.exec_create_publication(s),
1363            Statement::DropPublication(name) => self.exec_drop_publication(&name),
1364            Statement::CreateSubscription(s) => self.exec_create_subscription(s),
1365            Statement::DropSubscription(name) => self.exec_drop_subscription(&name),
1366            // v6.1.7 — WAIT FOR WAL POSITION needs `lag_state`,
1367            // which lives in spg-server's ServerState. The engine
1368            // surfaces a clear error; the server-layer dispatch
1369            // intercepts the SQL before it reaches the engine on
1370            // a server build, so this arm only fires for
1371            // engine-only callers (spg-embedded, lib tests).
1372            Statement::WaitForWalPosition { .. } => Err(EngineError::Unsupported(
1373                "WAIT FOR WAL POSITION must be handled by the server layer".into(),
1374            )),
1375            // v6.2.0 — ANALYZE recomputes per-column histograms.
1376            Statement::Analyze(target) => self.exec_analyze(target.as_deref()),
1377            // v6.7.3 — COMPACT COLD SEGMENTS.
1378            Statement::CompactColdSegments => self.exec_compact_cold_segments(),
1379        };
1380        self.enforce_row_limit(result)
1381    }
1382
1383    /// v6.1.2 — `CREATE PUBLICATION` runtime path. Duplicate names
1384    /// surface as `EngineError::Unsupported` so the existing PG-wire
1385    /// error mapping stays uniform; the message carries the name so
1386    /// operators can grep replication-log noise. Inside-transaction
1387    /// invocation is rejected (matches `CREATE USER` / `DROP USER`
1388    /// stance) — replication-catalog mutation is a connection-level
1389    /// administrative op, not a transactional one.
1390    fn exec_create_publication(
1391        &mut self,
1392        s: CreatePublicationStatement,
1393    ) -> Result<QueryResult, EngineError> {
1394        // v6.1.4 — the v6.1.2 "no DDL inside a transaction" guard
1395        // was over-cautious: it also blocked the auto-commit wrap
1396        // path (which begins an internal TX around every WAL-
1397        // logged statement). PG itself allows CREATE PUBLICATION
1398        // inside a transaction (it rolls back with the TX).
1399        self.publications
1400            .create(s.name, s.scope)
1401            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE PUBLICATION: {e:?}")))?;
1402        Ok(QueryResult::CommandOk {
1403            affected: 1,
1404            modified_catalog: true,
1405        })
1406    }
1407
    /// v6.1.2 — `DROP PUBLICATION` runtime path. PG-compatible silent
    /// no-op when the publication doesn't exist (returns `affected=0`
    /// in that case so the wire-level command tag distinguishes
    /// "dropped" from "no-op", though both succeed).
    ///
    /// `modified_catalog` mirrors the same flag: true only when an
    /// entry was actually removed. This function never returns `Err`.
    fn exec_drop_publication(&mut self, name: &str) -> Result<QueryResult, EngineError> {
        let removed = self.publications.drop(name);
        Ok(QueryResult::CommandOk {
            // `usize::from(bool)` yields 1 for a removal, 0 for a no-op.
            affected: usize::from(removed),
            modified_catalog: removed,
        })
    }
1419
    /// v6.1.2 — read access to the publication catalog. Used by
    /// the v6.1.5 publisher-side WAL filter, by `SHOW PUBLICATIONS`
    /// (v6.1.3+), and by e2e tests that need to assert state without
    /// going through the wire.
    ///
    /// Returns a shared borrow; the catalog is changed only through
    /// `CREATE PUBLICATION` / `DROP PUBLICATION`.
    pub const fn publications(&self) -> &publications::Publications {
        &self.publications
    }
1427
1428    /// v6.1.4 — `CREATE SUBSCRIPTION` runtime path. Defaults
1429    /// `enabled = true` and `last_received_pos = 0` for a freshly-
1430    /// created subscription. The actual worker thread is spawned
1431    /// by spg-server once the engine returns success.
1432    fn exec_create_subscription(
1433        &mut self,
1434        s: CreateSubscriptionStatement,
1435    ) -> Result<QueryResult, EngineError> {
1436        // See exec_create_publication — the in_transaction gate
1437        // was over-cautious; the auto-commit wrap path holds an
1438        // internal TX that this check was incorrectly blocking.
1439        let sub = subscriptions::Subscription {
1440            conn_str: s.conn_str,
1441            publications: s.publications,
1442            enabled: true,
1443            last_received_pos: 0,
1444        };
1445        self.subscriptions
1446            .create(s.name, sub)
1447            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE SUBSCRIPTION: {e:?}")))?;
1448        Ok(QueryResult::CommandOk {
1449            affected: 1,
1450            modified_catalog: true,
1451        })
1452    }
1453
1454    /// v6.1.4 — `DROP SUBSCRIPTION`. Silent no-op when the name
1455    /// doesn't exist (PG-compatible). The associated worker is
1456    /// torn down by spg-server when it observes the catalog
1457    /// change at the next snapshot or via the engine's
1458    /// subscriptions accessor (the worker polls the catalog on
1459    /// reconnect; v6.1.5's filter-side will tighten this to an
1460    /// explicit signal).
1461    fn exec_drop_subscription(&mut self, name: &str) -> Result<QueryResult, EngineError> {
1462        let removed = self.subscriptions.drop(name);
1463        Ok(QueryResult::CommandOk {
1464            affected: usize::from(removed),
1465            modified_catalog: removed,
1466        })
1467    }
1468
1469    /// v6.1.4 — read access to the subscription catalog. Used by
1470    /// the subscription worker (read its own row to find its
1471    /// publications + last applied position), by SHOW SUBSCRIPTIONS,
1472    /// and by e2e tests asserting state directly.
1473    pub const fn subscriptions(&self) -> &subscriptions::Subscriptions {
1474        &self.subscriptions
1475    }
1476
1477    /// v6.1.4 — write access to `last_received_pos`. Worker
1478    /// calls this after each apply batch (under the engine's
1479    /// write-lock). Returns `false` when the subscription was
1480    /// dropped between when the worker received the record and
1481    /// when this call landed.
1482    pub fn subscription_advance(&mut self, name: &str, pos: u64) -> bool {
1483        self.subscriptions.update_last_received_pos(name, pos)
1484    }
1485
1486    /// v6.1.4 — `SHOW SUBSCRIPTIONS` row materialisation. Returns
1487    /// `(name, conn_str, publications, enabled, last_received_pos)`
1488    /// ordered by subscription name. The `publications` column is
1489    /// the comma-joined list ("p1, p2") for ergonomic SHOW output;
1490    /// callers wanting structured access read `Engine::subscriptions`.
1491    fn exec_show_subscriptions(&self) -> QueryResult {
1492        let columns = alloc::vec![
1493            ColumnSchema::new("name", DataType::Text, false),
1494            ColumnSchema::new("conn_str", DataType::Text, false),
1495            ColumnSchema::new("publications", DataType::Text, false),
1496            ColumnSchema::new("enabled", DataType::Bool, false),
1497            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1498        ];
1499        let rows: Vec<Row> = self
1500            .subscriptions
1501            .iter()
1502            .map(|(name, sub)| {
1503                Row::new(alloc::vec![
1504                    Value::Text(name.clone()),
1505                    Value::Text(sub.conn_str.clone()),
1506                    Value::Text(sub.publications.join(", ")),
1507                    Value::Bool(sub.enabled),
1508                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1509                ])
1510            })
1511            .collect();
1512        QueryResult::Rows { columns, rows }
1513    }
1514
1515    /// v6.2.0 — materialise `spg_statistic` rows. One row per
1516    /// `(table, column)` pair tracked in `Statistics`, with
1517    /// `histogram_bounds` rendered as a `[v0, v1, ...]` string —
1518    /// the same canonical form vector literals use for round-trip.
1519    fn exec_spg_statistic(&self) -> QueryResult {
1520        let columns = alloc::vec![
1521            ColumnSchema::new("table_name", DataType::Text, false),
1522            ColumnSchema::new("column_name", DataType::Text, false),
1523            ColumnSchema::new("null_frac", DataType::Float, false),
1524            ColumnSchema::new("n_distinct", DataType::BigInt, false),
1525            ColumnSchema::new("histogram_bounds", DataType::Text, false),
1526            // v6.7.0 — appended column (v6.2.0 stability contract
1527            // allows APPEND to spg_statistic, not reorder/rename).
1528            // Reports the cached per-table cold-row count; same
1529            // value across every column row of the same table.
1530            ColumnSchema::new("cold_row_count", DataType::BigInt, false),
1531        ];
1532        let rows: Vec<Row> = self
1533            .statistics
1534            .iter()
1535            .map(|((t, c), s)| {
1536                let cold = self
1537                    .catalog
1538                    .get(t)
1539                    .map_or(0, |table| table.cold_row_count());
1540                Row::new(alloc::vec![
1541                    Value::Text(t.clone()),
1542                    Value::Text(c.clone()),
1543                    Value::Float(f64::from(s.null_frac)),
1544                    Value::BigInt(i64::try_from(s.n_distinct).unwrap_or(i64::MAX)),
1545                    Value::Text(render_histogram_bounds(&s.histogram_bounds)),
1546                    Value::BigInt(i64::try_from(cold).unwrap_or(i64::MAX)),
1547                ])
1548            })
1549            .collect();
1550        QueryResult::Rows { columns, rows }
1551    }
1552
1553    /// v6.5.0 — materialise `spg_stat_replication` rows. One row
1554    /// per subscription with `(name, conn_str, publications,
1555    /// last_received_pos, enabled)`. Surface mirrors
1556    /// `SHOW SUBSCRIPTIONS` but follows the virtual-table dispatch
1557    /// shape so it composes with SELECT clauses (WHERE, projection
1558    /// onto specific columns, etc).
1559    fn exec_spg_stat_replication(&self) -> QueryResult {
1560        let columns = alloc::vec![
1561            ColumnSchema::new("name", DataType::Text, false),
1562            ColumnSchema::new("conn_str", DataType::Text, false),
1563            ColumnSchema::new("publications", DataType::Text, false),
1564            ColumnSchema::new("last_received_pos", DataType::BigInt, false),
1565            ColumnSchema::new("enabled", DataType::Bool, false),
1566        ];
1567        let rows: Vec<Row> = self
1568            .subscriptions
1569            .iter()
1570            .map(|(name, sub)| {
1571                Row::new(alloc::vec![
1572                    Value::Text(name.clone()),
1573                    Value::Text(sub.conn_str.clone()),
1574                    Value::Text(sub.publications.join(",")),
1575                    Value::BigInt(i64::try_from(sub.last_received_pos).unwrap_or(i64::MAX)),
1576                    Value::Bool(sub.enabled),
1577                ])
1578            })
1579            .collect();
1580        QueryResult::Rows { columns, rows }
1581    }
1582
1583    /// v6.5.0 — materialise `spg_stat_segment` rows. One row per
1584    /// cold-tier segment with `(segment_id, num_rows, num_pages,
1585    /// total_bytes)`.
1586    ///
1587    /// v6.7.0 — appended `table_name` column resolves the v6.5.0
1588    /// carve-out. Walks every user table's BTree indices to find
1589    /// which table's Cold locators point at each segment. Empty
1590    /// string for orphan segments (loaded via SPG_PRELOAD_COLD_SEGMENT
1591    /// before any index registered a locator). The walk is
1592    /// O(tables × indices × keys); cached per call, not across
1593    /// calls — re-walked on every `SELECT * FROM spg_stat_segment`.
1594    fn exec_spg_stat_segment(&self) -> QueryResult {
1595        let columns = alloc::vec![
1596            ColumnSchema::new("segment_id", DataType::BigInt, false),
1597            ColumnSchema::new("table_name", DataType::Text, false),
1598            ColumnSchema::new("num_rows", DataType::BigInt, false),
1599            ColumnSchema::new("num_pages", DataType::BigInt, false),
1600            ColumnSchema::new("total_bytes", DataType::BigInt, false),
1601        ];
1602        // v6.7.0 — build a segment_id → table_name map by walking
1603        // every user table's BTree indices once. O(tables × indices
1604        // × keys) for the v6.5.0 carve-out resolution; acceptable
1605        // because spg_stat_segment is operator-facing (not on a
1606        // hot-loop path).
1607        let mut segment_owners: alloc::collections::BTreeMap<u32, String> = BTreeMap::new();
1608        for tname in self.catalog.table_names() {
1609            if is_internal_table_name(&tname) {
1610                continue;
1611            }
1612            let Some(t) = self.catalog.get(&tname) else {
1613                continue;
1614            };
1615            for idx in t.indices() {
1616                if let spg_storage::IndexKind::BTree(map) = &idx.kind {
1617                    for (_, locs) in map.iter() {
1618                        for loc in locs {
1619                            if let spg_storage::RowLocator::Cold { segment_id, .. } = loc {
1620                                segment_owners.entry(*segment_id).or_insert_with(|| tname.clone());
1621                            }
1622                        }
1623                    }
1624                }
1625            }
1626        }
1627        let rows: Vec<Row> = self
1628            .catalog
1629            .cold_segment_ids_global()
1630            .iter()
1631            .filter_map(|&id| {
1632                let seg = self.catalog.cold_segment(id)?;
1633                let meta = seg.meta();
1634                let owner = segment_owners
1635                    .get(&id)
1636                    .cloned()
1637                    .unwrap_or_default();
1638                Some(Row::new(alloc::vec![
1639                    Value::BigInt(i64::from(id)),
1640                    Value::Text(owner),
1641                    Value::BigInt(i64::try_from(meta.num_rows).unwrap_or(i64::MAX)),
1642                    Value::BigInt(i64::from(meta.num_pages)),
1643                    Value::BigInt(i64::try_from(meta.total_bytes).unwrap_or(i64::MAX)),
1644                ]))
1645            })
1646            .collect();
1647        QueryResult::Rows { columns, rows }
1648    }
1649
1650    /// v6.5.1 — materialise `spg_stat_query` rows. One row per
1651    /// distinct SQL text recorded since the engine booted, capped
1652    /// at `QUERY_STATS_MAX` (1024). Columns:
1653    ///   sql, exec_count, total_us, mean_us, max_us, last_seen_us
1654    /// mean_us = total_us / exec_count (saturating).
1655    fn exec_spg_stat_query(&self) -> QueryResult {
1656        let columns = alloc::vec![
1657            ColumnSchema::new("sql", DataType::Text, false),
1658            ColumnSchema::new("exec_count", DataType::BigInt, false),
1659            ColumnSchema::new("total_us", DataType::BigInt, false),
1660            ColumnSchema::new("mean_us", DataType::BigInt, false),
1661            ColumnSchema::new("max_us", DataType::BigInt, false),
1662            ColumnSchema::new("last_seen_us", DataType::BigInt, false),
1663        ];
1664        let rows: Vec<Row> = self
1665            .query_stats
1666            .snapshot()
1667            .into_iter()
1668            .map(|(sql, s)| {
1669                let mean = if s.exec_count == 0 {
1670                    0
1671                } else {
1672                    s.total_us / s.exec_count
1673                };
1674                Row::new(alloc::vec![
1675                    Value::Text(sql),
1676                    Value::BigInt(i64::try_from(s.exec_count).unwrap_or(i64::MAX)),
1677                    Value::BigInt(i64::try_from(s.total_us).unwrap_or(i64::MAX)),
1678                    Value::BigInt(i64::try_from(mean).unwrap_or(i64::MAX)),
1679                    Value::BigInt(i64::try_from(s.max_us).unwrap_or(i64::MAX)),
1680                    Value::BigInt(i64::try_from(s.last_seen_us).unwrap_or(i64::MAX)),
1681                ])
1682            })
1683            .collect();
1684        QueryResult::Rows { columns, rows }
1685    }
1686
1687    /// v6.5.2 — register a connection-state provider. spg-server
1688    /// calls this at startup with a function that snapshots its
1689    /// per-pgwire-connection registry. Engine reads through the
1690    /// callback on `SELECT * FROM spg_stat_activity`.
1691    #[must_use]
1692    pub const fn with_activity_provider(mut self, f: ActivityProvider) -> Self {
1693        self.activity_provider = Some(f);
1694        self
1695    }
1696
1697    /// v6.5.3 — register audit chain provider + verifier.
1698    #[must_use]
1699    pub const fn with_audit_providers(
1700        mut self,
1701        chain: AuditChainProvider,
1702        verify: AuditVerifier,
1703    ) -> Self {
1704        self.audit_chain_provider = Some(chain);
1705        self.audit_verifier = Some(verify);
1706        self
1707    }
1708
1709    /// v6.5.6 — register a slow-query log callback. `threshold_us`
1710    /// is the floor (in microseconds); only executes above the floor
1711    /// fire the callback. spg-server wires this from
1712    /// `SPG_SLOW_QUERY_THRESHOLD_MS` (default 100 ms).
1713    #[must_use]
1714    pub const fn with_slow_query_log(
1715        mut self,
1716        threshold_us: u64,
1717        logger: SlowQueryLogger,
1718    ) -> Self {
1719        self.slow_query_threshold_us = Some(threshold_us);
1720        self.slow_query_logger = Some(logger);
1721        self
1722    }
1723
1724    /// v6.5.6 — operator knob for plan cache cap. spg-server reads
1725    /// `SPG_PLAN_CACHE_MAX` env at startup; uses this to override
1726    /// the compile-time default of 256.
1727    pub fn set_plan_cache_max(&mut self, n: usize) {
1728        self.plan_cache.set_max_entries(n);
1729    }
1730
1731    /// v6.5.2 — materialise `spg_stat_activity` rows. Pulls a fresh
1732    /// snapshot from the registered `ActivityProvider`. Returns an
1733    /// empty result set when no provider is registered (the no_std
1734    /// embedded path with no pgwire layer).
1735    fn exec_spg_stat_activity(&self) -> QueryResult {
1736        let columns = alloc::vec![
1737            ColumnSchema::new("pid", DataType::Int, false),
1738            ColumnSchema::new("user", DataType::Text, false),
1739            ColumnSchema::new("started_at_us", DataType::BigInt, false),
1740            ColumnSchema::new("current_sql", DataType::Text, false),
1741            ColumnSchema::new("wait_event", DataType::Text, false),
1742            ColumnSchema::new("elapsed_us", DataType::BigInt, false),
1743            ColumnSchema::new("in_transaction", DataType::Bool, false),
1744        ];
1745        let rows: Vec<Row> = self
1746            .activity_provider
1747            .map(|f| f())
1748            .unwrap_or_default()
1749            .into_iter()
1750            .map(|r| {
1751                Row::new(alloc::vec![
1752                    Value::Int(i32::try_from(r.pid).unwrap_or(i32::MAX)),
1753                    Value::Text(r.user),
1754                    Value::BigInt(r.started_at_us),
1755                    Value::Text(r.current_sql),
1756                    Value::Text(r.wait_event),
1757                    Value::BigInt(r.elapsed_us),
1758                    Value::Bool(r.in_transaction),
1759                ])
1760            })
1761            .collect();
1762        QueryResult::Rows { columns, rows }
1763    }
1764
1765    /// v6.5.4 — materialise `spg_table_ddl` rows. One row per user
1766    /// table with `(table_name, ddl)`. Reconstructed from catalog
1767    /// state on demand.
1768    fn exec_spg_table_ddl(&self) -> QueryResult {
1769        let columns = alloc::vec![
1770            ColumnSchema::new("table_name", DataType::Text, false),
1771            ColumnSchema::new("ddl", DataType::Text, false),
1772        ];
1773        let rows: Vec<Row> = self
1774            .catalog
1775            .table_names()
1776            .into_iter()
1777            .filter(|n| !is_internal_table_name(n))
1778            .filter_map(|name| {
1779                let table = self.catalog.get(&name)?;
1780                let ddl = render_create_table(&name, &table.schema().columns);
1781                Some(Row::new(alloc::vec![
1782                    Value::Text(name),
1783                    Value::Text(ddl),
1784                ]))
1785            })
1786            .collect();
1787        QueryResult::Rows { columns, rows }
1788    }
1789
1790    /// v6.5.4 — materialise `spg_role_ddl` rows. One row per user
1791    /// with `(role_name, ddl)`. Password is redacted (matches the
1792    /// `Statement::CreateUser` Display which prints `'<redacted>'`).
1793    fn exec_spg_role_ddl(&self) -> QueryResult {
1794        let columns = alloc::vec![
1795            ColumnSchema::new("role_name", DataType::Text, false),
1796            ColumnSchema::new("ddl", DataType::Text, false),
1797        ];
1798        let rows: Vec<Row> = self
1799            .users
1800            .iter()
1801            .map(|(name, rec)| {
1802                let ddl = alloc::format!(
1803                    "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}'",
1804                    rec.role.as_str(),
1805                );
1806                Row::new(alloc::vec![Value::Text(String::from(name)), Value::Text(ddl)])
1807            })
1808            .collect();
1809        QueryResult::Rows { columns, rows }
1810    }
1811
1812    /// v6.5.4 — materialise `spg_database_ddl`: single row whose
1813    /// `ddl` column concatenates every user table's CREATE +
1814    /// every role's CREATE in deterministic catalog order. Suitable
1815    /// for piping back through `Engine::execute` to recreate a
1816    /// schema-equivalent database.
1817    fn exec_spg_database_ddl(&self) -> QueryResult {
1818        let columns = alloc::vec![ColumnSchema::new("ddl", DataType::Text, false)];
1819        let mut out = String::new();
1820        for (name, rec) in self.users.iter() {
1821            out.push_str(&alloc::format!(
1822                "CREATE USER {name} WITH PASSWORD '<redacted>' ROLE '{}';\n",
1823                rec.role.as_str(),
1824            ));
1825        }
1826        for name in self.catalog.table_names() {
1827            if is_internal_table_name(&name) {
1828                continue;
1829            }
1830            if let Some(table) = self.catalog.get(&name) {
1831                out.push_str(&render_create_table(&name, &table.schema().columns));
1832                out.push_str(";\n");
1833            }
1834        }
1835        QueryResult::Rows {
1836            columns,
1837            rows: alloc::vec![Row::new(alloc::vec![Value::Text(out)])],
1838        }
1839    }
1840
1841    /// v6.5.3 — materialise `spg_audit_chain` rows. Pulls a fresh
1842    /// snapshot from the registered provider; empty when no
1843    /// provider is set.
1844    fn exec_spg_audit_chain(&self) -> QueryResult {
1845        let columns = alloc::vec![
1846            ColumnSchema::new("seq", DataType::BigInt, false),
1847            ColumnSchema::new("ts_ms", DataType::BigInt, false),
1848            ColumnSchema::new("prev_hash", DataType::Text, false),
1849            ColumnSchema::new("entry_hash", DataType::Text, false),
1850            ColumnSchema::new("sql", DataType::Text, false),
1851        ];
1852        let rows: Vec<Row> = self
1853            .audit_chain_provider
1854            .map(|f| f())
1855            .unwrap_or_default()
1856            .into_iter()
1857            .map(|r| {
1858                Row::new(alloc::vec![
1859                    Value::BigInt(r.seq),
1860                    Value::BigInt(r.ts_ms),
1861                    Value::Text(r.prev_hash_hex),
1862                    Value::Text(r.entry_hash_hex),
1863                    Value::Text(r.sql),
1864                ])
1865            })
1866            .collect();
1867        QueryResult::Rows { columns, rows }
1868    }
1869
1870    /// v6.5.3 — materialise `spg_audit_verify` single-row result.
1871    /// `(verified_count, broken_at_seq)` — broken_at_seq is `-1`
1872    /// on a clean chain. Returns one row with both values 0 when
1873    /// no verifier is registered (no-data fallback for embedded
1874    /// callers).
1875    fn exec_spg_audit_verify(&self) -> QueryResult {
1876        let columns = alloc::vec![
1877            ColumnSchema::new("verified_count", DataType::BigInt, false),
1878            ColumnSchema::new("broken_at_seq", DataType::BigInt, false),
1879        ];
1880        let (verified, broken) = self.audit_verifier.map(|f| f()).unwrap_or((0, -1));
1881        let row = Row::new(alloc::vec![
1882            Value::BigInt(verified),
1883            Value::BigInt(broken),
1884        ]);
1885        QueryResult::Rows {
1886            columns,
1887            rows: alloc::vec![row],
1888        }
1889    }
1890
1891    /// v6.5.1 — read-only accessor for tests + v6.5.6 ops resets.
1892    pub fn query_stats(&self) -> &query_stats::QueryStats {
1893        &self.query_stats
1894    }
1895
1896    /// v6.5.1 — mutable accessor (clear, etc).
1897    pub fn query_stats_mut(&mut self) -> &mut query_stats::QueryStats {
1898        &mut self.query_stats
1899    }
1900
1901    /// v6.2.0 — read access to the per-column statistics table.
1902    /// Used by the planner (v6.2.2 selectivity functions read this),
1903    /// by `SELECT * FROM spg_statistic`, and by e2e tests.
1904    pub const fn statistics(&self) -> &statistics::Statistics {
1905        &self.statistics
1906    }
1907
1908    /// v6.2.1 — return tables whose modified-row count crossed the
1909    /// auto-analyze threshold since the last ANALYZE on that table.
1910    /// The threshold is `0.1 × max(row_count, MIN_ROWS_FOR_AUTO_
1911    /// ANALYZE)` — combines PG-style fractional + absolute lower
1912    /// bound so a fresh / tiny table doesn't get hammered on every
1913    /// INSERT.
1914    ///
1915    /// Designed to be cheap: walks every user table's
1916    /// `Catalog::table_names()` + reads `statistics::modified_
1917    /// since_last_analyze()` (BTreeMap lookup). The background
1918    /// worker calls this under `engine.read()` then drops the lock
1919    /// before re-acquiring `engine.write()` for the actual ANALYZE.
1920    pub fn tables_needing_analyze(&self) -> Vec<String> {
1921        const MIN_ROWS: u64 = 100;
1922        let mut out = Vec::new();
1923        for name in self.catalog.table_names() {
1924            if is_internal_table_name(&name) {
1925                continue;
1926            }
1927            let Some(table) = self.catalog.get(&name) else {
1928                continue;
1929            };
1930            let row_count = table.rows().len() as u64;
1931            let modified = self.statistics.modified_since_last_analyze(&name);
1932            // Threshold: ceil(0.1 × max(row_count, MIN_ROWS)),
1933            // computed in integer arithmetic so spg-engine stays
1934            // no_std without pulling in libm. `(n + 9) / 10` is
1935            // `ceil(n / 10)` for non-negative `n`.
1936            let base = row_count.max(MIN_ROWS);
1937            let threshold = base.saturating_add(9) / 10;
1938            if modified >= threshold {
1939                out.push(name);
1940            }
1941        }
1942        out
1943    }
1944
1945    /// v6.2.0 — `ANALYZE [<table>]` runtime. Bare `ANALYZE` walks
1946    /// every user table; `ANALYZE <name>` re-stats one. For each
1947    /// target table, single-pass scan + per-column histogram +
1948    /// `null_frac` + `n_distinct`. Replaces the table's prior
1949    /// stats; resets the modified-row counter.
1950    ///
1951    /// v6.2.0 doesn't sample — it scans the full table. v6.2.x
1952    /// can add reservoir sampling at the > 100 K-row mark; not a
1953    /// scope blocker for the current commit since rows ≤ 100 K
1954    /// analyse in milliseconds.
1955    fn exec_analyze(&mut self, target: Option<&str>) -> Result<QueryResult, EngineError> {
1956        let names: Vec<String> = if let Some(name) = target {
1957            // Verify the table exists; surface a clear error if not.
1958            if self.catalog.get(name).is_none() {
1959                return Err(EngineError::Storage(StorageError::TableNotFound {
1960                    name: name.to_string(),
1961                }));
1962            }
1963            alloc::vec![name.to_string()]
1964        } else {
1965            self.catalog
1966                .table_names()
1967                .into_iter()
1968                .filter(|n| !is_internal_table_name(n))
1969                .collect()
1970        };
1971        let mut analysed = 0usize;
1972        for table_name in &names {
1973            self.analyze_one_table(table_name)?;
1974            analysed += 1;
1975        }
1976        // v6.3.1 — plan cache invalidation. Bump stats version so
1977        // future lookups see the new generation, and selectively
1978        // evict every plan whose `source_tables` overlap with the
1979        // ANALYZE target set. Bare ANALYZE (all tables) clears the
1980        // whole cache.
1981        if analysed > 0 {
1982            self.statistics.bump_version();
1983            if target.is_some() {
1984                for t in &names {
1985                    self.plan_cache.evict_referencing(t);
1986                }
1987            } else {
1988                self.plan_cache.clear();
1989            }
1990        }
1991        Ok(QueryResult::CommandOk {
1992            affected: analysed,
1993            modified_catalog: true,
1994        })
1995    }
1996
1997    /// v6.7.3 — `COMPACT COLD SEGMENTS` runtime path. Drives the
1998    /// engine-layer compaction shim with the default
1999    /// 4 MiB segment-size threshold. spg-server intercepts the
2000    /// SQL before it reaches the engine on a server build —
2001    /// it reads `SPG_COMPACTION_TARGET_SEGMENT_BYTES`, calls
2002    /// `Engine::compact_cold_segments_with_target` directly with
2003    /// the env value, and persists every merged segment to
2004    /// `<db>.spg/segments/`. This arm only fires for engine-only
2005    /// callers (spg-embedded, lib tests); in that mode merged
2006    /// segments live in memory and are dropped at process exit.
2007    fn exec_compact_cold_segments(&mut self) -> Result<QueryResult, EngineError> {
2008        let target = COMPACTION_TARGET_DEFAULT_BYTES;
2009        let reports = self.compact_cold_segments_with_target(target)?;
2010        let columns = alloc::vec![
2011            ColumnSchema::new("table_name", DataType::Text, false),
2012            ColumnSchema::new("index_name", DataType::Text, false),
2013            ColumnSchema::new("sources_merged", DataType::BigInt, false),
2014            ColumnSchema::new("merged_segment_id", DataType::BigInt, false),
2015            ColumnSchema::new("merged_rows", DataType::BigInt, false),
2016            ColumnSchema::new("deleted_rows_pruned", DataType::BigInt, false),
2017            ColumnSchema::new("bytes_reclaimed_estimate", DataType::BigInt, false),
2018        ];
2019        let rows: Vec<Row> = reports
2020            .into_iter()
2021            .map(|(tname, iname, report)| {
2022                Row::new(alloc::vec![
2023                    Value::Text(tname),
2024                    Value::Text(iname),
2025                    Value::BigInt(i64::try_from(report.sources.len()).unwrap_or(i64::MAX)),
2026                    Value::BigInt(i64::from(report.merged_segment_id.unwrap_or(0))),
2027                    Value::BigInt(i64::try_from(report.merged_rows).unwrap_or(i64::MAX)),
2028                    Value::BigInt(
2029                        i64::try_from(report.deleted_rows_pruned).unwrap_or(i64::MAX),
2030                    ),
2031                    Value::BigInt(
2032                        i64::try_from(report.bytes_reclaimed_estimate).unwrap_or(i64::MAX),
2033                    ),
2034                ])
2035            })
2036            .collect();
2037        Ok(QueryResult::Rows { columns, rows })
2038    }
2039
2040    /// Walk a single table's rows once and (re-)populate per-column
2041    /// stats. Drops the existing stats for `table` first so columns
2042    /// that have been DROP-ed between ANALYZEs don't leave stale
2043    /// rows.
2044    fn analyze_one_table(&mut self, table_name: &str) -> Result<(), EngineError> {
2045        let table = self.catalog.get(table_name).ok_or_else(|| {
2046            EngineError::Storage(StorageError::TableNotFound {
2047                name: table_name.to_string(),
2048            })
2049        })?;
2050        let schema = table.schema().clone();
2051        let row_count = table.rows().len();
2052        // For each column, collect (sorted) non-NULL textual values
2053        // + count NULLs; then ask `statistics::build_histogram` to
2054        // produce the 101 bounds and `estimate_n_distinct` the
2055        // distinct count.
2056        self.statistics.clear_table(table_name);
2057        for (col_pos, col_schema) in schema.columns.iter().enumerate() {
2058            // v6.2.0 skip: vector columns have their own stats
2059            // shape (HNSW graph topology). v6.2 deliberation #1.
2060            if matches!(col_schema.ty, DataType::Vector { .. }) {
2061                continue;
2062            }
2063            let mut non_null_values: Vec<Value> = Vec::with_capacity(row_count);
2064            let mut nulls: u64 = 0;
2065            for row in table.rows() {
2066                match row.values.get(col_pos) {
2067                    Some(Value::Null) | None => nulls += 1,
2068                    Some(v) => non_null_values.push(v.clone()),
2069                }
2070            }
2071            // Sort by type-aware ordering (Int as int, Text as
2072            // lex, etc.) so histogram bounds reflect the column's
2073            // natural order — not lexicographic on the string
2074            // representation, which would put "9" after "49".
2075            non_null_values.sort_by(|a, b| sort_values_for_histogram(a, b));
2076            let non_null: Vec<String> = non_null_values
2077                .iter()
2078                .map(canonical_value_repr)
2079                .collect();
2080            let null_frac = if row_count == 0 {
2081                0.0
2082            } else {
2083                #[allow(clippy::cast_precision_loss)]
2084                let f = nulls as f32 / row_count as f32;
2085                f
2086            };
2087            let n_distinct = statistics::estimate_n_distinct(&non_null);
2088            let histogram_bounds = statistics::build_histogram(&non_null);
2089            self.statistics.set(
2090                table_name.to_string(),
2091                col_schema.name.clone(),
2092                statistics::ColumnStats {
2093                    null_frac,
2094                    n_distinct,
2095                    histogram_bounds,
2096                },
2097            );
2098        }
2099        self.statistics.reset_modified(table_name);
2100        // v6.7.0 — refresh the per-table cold_rows cache. Walk the
2101        // BTree indices and count Cold locators (MAX across
2102        // indices); store the result on the table. Surfaced via
2103        // `spg_statistic.cold_row_count` (new column) and
2104        // `spg_stat_segment.table_name` (new column).
2105        let cold_count = {
2106            let table = self
2107                .active_catalog()
2108                .get(table_name)
2109                .expect("table still present");
2110            table.count_cold_locators()
2111        };
2112        let table_mut = self
2113            .active_catalog_mut()
2114            .get_mut(table_name)
2115            .expect("table still present");
2116        table_mut.set_cold_row_count(cold_count);
2117        Ok(())
2118    }
2119
2120    /// v6.1.3 — `SHOW PUBLICATIONS` row materialisation. Returns
2121    /// `(name, scope, table_count)` ordered by publication name.
2122    ///   - `scope` is the human-readable string:
2123    ///       `"FOR ALL TABLES"` /
2124    ///       `"FOR TABLE t1, t2"` /
2125    ///       `"FOR ALL TABLES EXCEPT t1, t2"`.
2126    ///   - `table_count` is NULL for `AllTables`, the list length
2127    ///     otherwise. NULLability lets clients distinguish "publish
2128    ///     everything" from "publish exactly 0 tables" (the v6.1.3
2129    ///     parser forbids the empty list, but the column shape is
2130    ///     ready for the v6.1.5 publisher-side semantics).
2131    fn exec_show_publications(&self) -> QueryResult {
2132        let columns = alloc::vec![
2133            ColumnSchema::new("name", DataType::Text, false),
2134            ColumnSchema::new("scope", DataType::Text, false),
2135            ColumnSchema::new("table_count", DataType::Int, true),
2136        ];
2137        let rows: Vec<Row> = self
2138            .publications
2139            .iter()
2140            .map(|(name, scope)| {
2141                let (scope_str, count_val) = match scope {
2142                    spg_sql::ast::PublicationScope::AllTables => {
2143                        ("FOR ALL TABLES".to_string(), Value::Null)
2144                    }
2145                    spg_sql::ast::PublicationScope::ForTables(ts) => (
2146                        alloc::format!("FOR TABLE {}", ts.join(", ")),
2147                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2148                    ),
2149                    spg_sql::ast::PublicationScope::AllTablesExcept(ts) => (
2150                        alloc::format!("FOR ALL TABLES EXCEPT {}", ts.join(", ")),
2151                        Value::Int(i32::try_from(ts.len()).unwrap_or(i32::MAX)),
2152                    ),
2153                };
2154                Row::new(alloc::vec![
2155                    Value::Text(name.clone()),
2156                    Value::Text(scope_str),
2157                    count_val,
2158                ])
2159            })
2160            .collect();
2161        QueryResult::Rows { columns, rows }
2162    }
2163
2164    /// v4.1 `SHOW USERS` — `(name, role)` per row, ordered by name.
2165    fn exec_show_users(&self) -> QueryResult {
2166        let columns = alloc::vec![
2167            ColumnSchema::new("name", DataType::Text, false),
2168            ColumnSchema::new("role", DataType::Text, false),
2169        ];
2170        let rows: Vec<Row> = self
2171            .users
2172            .iter()
2173            .map(|(name, rec)| {
2174                Row::new(alloc::vec![
2175                    Value::Text(name.to_string()),
2176                    Value::Text(rec.role.as_str().to_string()),
2177                ])
2178            })
2179            .collect();
2180        QueryResult::Rows { columns, rows }
2181    }
2182
2183    fn exec_create_user(&mut self, s: &CreateUserStatement) -> Result<QueryResult, EngineError> {
2184        if self.in_transaction() {
2185            return Err(EngineError::Unsupported(
2186                "CREATE USER is not allowed inside a transaction".into(),
2187            ));
2188        }
2189        let role = users::Role::parse(&s.role).ok_or_else(|| {
2190            EngineError::Unsupported(alloc::format!("invalid role: {:?}", s.role))
2191        })?;
2192        // Prefer the host-injected RNG. Falls back to a deterministic
2193        // salt derived from the username only when no RNG is wired —
2194        // acceptable for tests; the server always installs one.
2195        let salt = self.salt_fn.map_or_else(
2196            || {
2197                let mut s_bytes = [0u8; 16];
2198                let digest = spg_crypto::hash(s.name.as_bytes());
2199                s_bytes.copy_from_slice(&digest[..16]);
2200                s_bytes
2201            },
2202            |f| f(),
2203        );
2204        self.users
2205            .create(&s.name, &s.password, role, salt)
2206            .map_err(|e| EngineError::Unsupported(alloc::format!("CREATE USER: {e}")))?;
2207        Ok(QueryResult::CommandOk {
2208            affected: 1,
2209            modified_catalog: true,
2210        })
2211    }
2212
2213    fn exec_drop_user(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2214        if self.in_transaction() {
2215            return Err(EngineError::Unsupported(
2216                "DROP USER is not allowed inside a transaction".into(),
2217            ));
2218        }
2219        self.users
2220            .drop(name)
2221            .map_err(|e| EngineError::Unsupported(alloc::format!("DROP USER: {e}")))?;
2222        Ok(QueryResult::CommandOk {
2223            affected: 1,
2224            modified_catalog: true,
2225        })
2226    }
2227
    /// v4.4 `UPDATE <table> SET col = expr [, ...] [WHERE cond]`.
    /// Filter pass uses the same WHERE eval as `exec_select`. Per
    /// matched row, evaluate each RHS expression against the *old*
    /// row, then call `Table::update_row` which rebuilds indices.
    /// Indexed columns are correctly reflected because rebuild
    /// happens after the cell rewrite.
    ///
    /// Stages, in order:
    ///   0. Optional cold-row promotion for a PK-equality WHERE.
    ///   1. Plan: resolve SET targets, walk the rows, collect
    ///      `(position, new_values)` without mutating anything.
    ///   2. FK checks against the planned values (outbound, then
    ///      inbound).
    ///   3. Apply the child-side FK steps, then the planned updates.
    ///
    /// `cancel` is polled every 256 rows during the planning walk.
    ///
    /// Returns `CommandOk` carrying the number of planned rows, or the
    /// result of `build_returning_rows` when the statement has a
    /// RETURNING clause.
    ///
    /// Errors: `TableNotFound` for a missing table, `ColumnNotFound`
    /// for an unknown SET target, plus anything propagated from the
    /// cancel check, expression evaluation, value coercion, the FK
    /// helpers, or `Table::update_row`.
    fn exec_update_cancel(
        &mut self,
        stmt: &spg_sql::ast::UpdateStatement,
        cancel: CancelToken<'_>,
    ) -> Result<QueryResult, EngineError> {
        // v5.2.3: if the WHERE is a PK equality and matches a cold-
        // tier row, promote it back to the hot tier *before* the
        // hot-row walk. The promote pushes the row to the end of
        // `table.rows`, where the upcoming SET-evaluation loop will
        // pick it up and apply the assignments. Lookups for the key
        // never observe a gap because `promote_cold_row` inserts the
        // hot row before retiring the cold locator.
        if let Some(w) = &stmt.where_ {
            let schema_cols = self
                .active_catalog()
                .get(&stmt.table)
                .ok_or_else(|| {
                    EngineError::Storage(StorageError::TableNotFound {
                        name: stmt.table.clone(),
                    })
                })?
                .schema()
                .columns
                .clone();
            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
                && let Some(idx_name) = self
                    .active_catalog()
                    .get(&stmt.table)
                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
            {
                // Promote may be a no-op (key is hot-only or absent);
                // we don't care about the return value here — the
                // subsequent hot walk will either match or not.
                let _ = self
                    .active_catalog_mut()
                    .promote_cold_row(&stmt.table, &idx_name, &key);
            }
        }

        // Stage 1 — planning. The table is fetched through the
        // mutable accessor, but nothing in this stage writes to it.
        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
        // Resolve each SET target to a column position once, validate
        // up front so a typo'd column doesn't leave a partial mutation
        // behind.
        let mut targets: Vec<(usize, &Expr)> = Vec::with_capacity(stmt.assignments.len());
        for (col, expr) in &stmt.assignments {
            let pos = schema_cols
                .iter()
                .position(|c| c.name == *col)
                .ok_or_else(|| {
                    EngineError::Eval(EvalError::ColumnNotFound { name: col.clone() })
                })?;
            targets.push((pos, expr));
        }
        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
        // Walk every row, evaluate WHERE then SET expressions. We
        // gather (position, new_values) tuples first and apply them
        // afterwards so the WHERE/RHS evaluation reads the original
        // row state — matches PG semantics (UPDATE doesn't see its
        // own writes).
        let mut planned: Vec<(usize, Vec<Value>)> = Vec::new();
        for (i, row) in table.rows().iter().enumerate() {
            // v4.5: cooperative cancel checkpoint every 256 rows so
            // a runaway UPDATE without WHERE doesn't drag past the
            // server's query-timeout watchdog.
            if i.is_multiple_of(256) {
                cancel.check()?;
            }
            if let Some(w) = &stmt.where_ {
                let cond = eval::eval_expr(w, row, &ctx)?;
                // Only a definite TRUE selects the row; every other
                // result (FALSE, NULL, non-boolean) skips it.
                if !matches!(cond, Value::Bool(true)) {
                    continue;
                }
            }
            // Start from the old values and overwrite only the SET
            // targets; every RHS is evaluated against the old `row`,
            // so assignments cannot observe each other.
            let mut new_vals = row.values.clone();
            for (pos, expr) in &targets {
                let v = eval::eval_expr(expr, row, &ctx)?;
                new_vals[*pos] =
                    coerce_value(v, schema_cols[*pos].ty, &schema_cols[*pos].name, *pos)?;
            }
            planned.push((i, new_vals));
        }
        // v7.6.6 — capture pre-update row values for the FK
        // enforcement passes below. `planned` carries new values
        // only; pair them with the old row. Tuple layout is
        // `(position, old_values, new_values)`.
        let plan_with_old: Vec<(usize, Vec<Value>, Vec<Value>)> = planned
            .iter()
            .map(|(pos, new_vals)| (*pos, table.rows()[*pos].values.clone(), new_vals.clone()))
            .collect();
        let self_fks = table.schema().foreign_keys.clone();
        let affected = planned.len();
        // Marks the last use of this `table` borrow: the FK passes
        // below need to re-borrow the catalog through `self`. The
        // statement itself does not move or drop anything.
        let _ = table;
        // v7.6.6 — Stage 2a: outbound FK check. The new values of the
        // local FK columns must exist in the parent.
        // NOTE(review): the call below passes *every* planned row,
        // not only rows whose FK columns actually changed — confirm
        // whether that stricter check is intended.
        if !self_fks.is_empty() {
            let new_rows: Vec<Vec<Value>> = planned
                .iter()
                .map(|(_pos, new_vals)| new_vals.clone())
                .collect();
            enforce_fk_inserts(self.active_catalog(), &stmt.table, &self_fks, &new_rows)?;
        }
        // v7.6.6 — Stage 2b: inbound FK check. For every row that
        // changed value in a column that *some other table* uses as
        // a FK parent column, react per `on_update` action.
        let child_plan = plan_fk_parent_updates(self.active_catalog(), &stmt.table, &plan_with_old)?;
        // Stage 3a — apply each child-side action.
        // NOTE(review): from here on, an error leaves already-applied
        // steps in place; nothing in this function rolls them back.
        for step in &child_plan {
            apply_fk_child_step(self.active_catalog_mut(), step)?;
        }
        // Stage 3b — apply the original UPDATE.
        let table = self
            .active_catalog_mut()
            .get_mut(&stmt.table)
            .ok_or_else(|| {
                EngineError::Storage(StorageError::TableNotFound {
                    name: stmt.table.clone(),
                })
            })?;
        // v7.9.4 — snapshot post-update values for RETURNING. Taken
        // before the apply loop because that loop consumes `planned`.
        let updated_for_returning: Vec<Vec<Value>> =
            if stmt.returning.is_some() {
                planned.iter().map(|(_pos, vals)| vals.clone()).collect()
            } else {
                Vec::new()
            };
        for (pos, vals) in planned {
            table.update_row(pos, vals)?;
        }
        // Last use of the second `table` borrow (see the note on the
        // first `let _ = table;`).
        let _ = table;
        // v6.2.1 — auto-analyze modified-row tracking for UPDATE.
        // Skipped inside a transaction.
        if !self.in_transaction() && affected > 0 {
            self.statistics
                .record_modifications(&stmt.table, affected as u64);
        }
        // v7.9.4 — RETURNING projection.
        if let Some(items) = &stmt.returning {
            return self.build_returning_rows(
                &stmt.table,
                items,
                updated_for_returning,
            );
        }
        Ok(QueryResult::CommandOk {
            affected,
            modified_catalog: !self.in_transaction(),
        })
    }
2390
2391    /// v4.4 `DELETE FROM <table> [WHERE cond]`. Collects matching
2392    /// positions then delegates to `Table::delete_rows` (single index
2393    /// rebuild for the batch).
2394    fn exec_delete_cancel(
2395        &mut self,
2396        stmt: &spg_sql::ast::DeleteStatement,
2397        cancel: CancelToken<'_>,
2398    ) -> Result<QueryResult, EngineError> {
2399        // v5.2.3: PK-targeted DELETE → first retire any cold-tier
2400        // locator for the key. The cold row body stays in the
2401        // segment (becoming shadowed garbage that a future
2402        // compaction pass reclaims) but the index no longer
2403        // resolves it. The shadow count contributes to the
2404        // affected total; the subsequent hot walk handles any hot
2405        // rows for the same key.
2406        let mut cold_shadow_count: usize = 0;
2407        if let Some(w) = &stmt.where_ {
2408            let schema_cols = self
2409                .active_catalog()
2410                .get(&stmt.table)
2411                .ok_or_else(|| {
2412                    EngineError::Storage(StorageError::TableNotFound {
2413                        name: stmt.table.clone(),
2414                    })
2415                })?
2416                .schema()
2417                .columns
2418                .clone();
2419            if let Some((col_pos, key)) = try_pk_predicate(w, &schema_cols, stmt.table.as_str())
2420                && let Some(idx_name) = self
2421                    .active_catalog()
2422                    .get(&stmt.table)
2423                    .and_then(|t| t.index_on(col_pos).map(|i| i.name.clone()))
2424            {
2425                cold_shadow_count = self
2426                    .active_catalog_mut()
2427                    .shadow_cold_row(&stmt.table, &idx_name, &key)
2428                    .unwrap_or(0);
2429            }
2430        }
2431
2432        let table = self
2433            .active_catalog_mut()
2434            .get_mut(&stmt.table)
2435            .ok_or_else(|| {
2436                EngineError::Storage(StorageError::TableNotFound {
2437                    name: stmt.table.clone(),
2438                })
2439            })?;
2440        let schema_cols: Vec<ColumnSchema> = table.schema().columns.clone();
2441        let ctx = EvalContext::new(&schema_cols, Some(stmt.table.as_str()));
2442        let mut positions: Vec<usize> = Vec::new();
2443        // v7.6.3 — collect every to-delete row's full Value tuple
2444        // alongside its position, so the FK enforcement pass can
2445        // run after the mut borrow drops.
2446        let mut to_delete_rows: Vec<Vec<Value>> = Vec::new();
2447        for (i, row) in table.rows().iter().enumerate() {
2448            if i.is_multiple_of(256) {
2449                cancel.check()?;
2450            }
2451            let keep = if let Some(w) = &stmt.where_ {
2452                let cond = eval::eval_expr(w, row, &ctx)?;
2453                !matches!(cond, Value::Bool(true))
2454            } else {
2455                false
2456            };
2457            if !keep {
2458                positions.push(i);
2459                to_delete_rows.push(row.values.clone());
2460            }
2461        }
2462        // v7.6.3 / v7.6.4 — Stage 2: FK enforcement on the immutable
2463        // catalog. Release the mut borrow and run reverse-scan
2464        // against every child table whose FK targets this table.
2465        // RESTRICT / NoAction raise an error; CASCADE returns a
2466        // cascade plan that stage 3 applies after the primary delete.
2467        // SET NULL / SET DEFAULT remain Unsupported until v7.6.5.
2468        let _ = table;
2469        let cascade_plan = plan_fk_parent_deletions(
2470            self.active_catalog(),
2471            &stmt.table,
2472            &positions,
2473            &to_delete_rows,
2474        )?;
2475        // Stage 3a — apply each FK child step (SET NULL / SET
2476        // DEFAULT / CASCADE delete) before deleting the parent.
2477        // The plan is already ordered: nulls/defaults first, then
2478        // cascade deletes (so a row mutated and later deleted
2479        // surfaces as deleted — though v7.6.5 doesn't produce
2480        // that overlap today).
2481        for step in &cascade_plan {
2482            apply_fk_child_step(self.active_catalog_mut(), step)?;
2483        }
2484        // Stage 3b — actually delete the original target rows.
2485        let table = self
2486            .active_catalog_mut()
2487            .get_mut(&stmt.table)
2488            .ok_or_else(|| {
2489                EngineError::Storage(StorageError::TableNotFound {
2490                    name: stmt.table.clone(),
2491                })
2492            })?;
2493        let affected = table.delete_rows(&positions) + cold_shadow_count;
2494        let _ = table;
2495        // v6.2.1 — auto-analyze modified-row tracking for DELETE.
2496        if !self.in_transaction() && affected > 0 {
2497            self.statistics
2498                .record_modifications(&stmt.table, affected as u64);
2499        }
2500        // v7.9.4 — RETURNING projection over the soon-to-be-gone
2501        // rows. `to_delete_rows` was snapshotted in stage 1 before
2502        // mutation, so the projection sees the pre-delete state
2503        // (matches PG semantics: DELETE RETURNING returns the row
2504        // as it was just before removal).
2505        if let Some(items) = &stmt.returning {
2506            return self.build_returning_rows(
2507                &stmt.table,
2508                items,
2509                to_delete_rows,
2510            );
2511        }
2512        Ok(QueryResult::CommandOk {
2513            affected,
2514            modified_catalog: !self.in_transaction(),
2515        })
2516    }
2517
2518    /// `SHOW TABLES` — one row per table in the active catalog.
2519    /// Column name is `name` so result-set consumers can downstream
2520    /// `SELECT name FROM ...` style logic if needed.
2521    /// v4.26: `EXPLAIN [ANALYZE] <select>`. Returns a single-column
2522    /// `QUERY PLAN` text table — first line names the top operator
2523    /// (Scan / Aggregate / Window / etc.), indented children list
2524    /// FROM joins, WHERE filters, ORDER BY / LIMIT, projection
2525    /// shape, and any active index hits. `ANALYZE` execs the inner
2526    /// SELECT and appends actual-row + elapsed-micros annotations.
2527    #[allow(clippy::format_push_string)]
2528    fn exec_explain(
2529        &self,
2530        e: &spg_sql::ast::ExplainStatement,
2531        cancel: CancelToken<'_>,
2532    ) -> Result<QueryResult, EngineError> {
2533        let mut lines = Vec::<String>::new();
2534        explain_select(&e.inner, self, 0, &mut lines);
2535        if e.suggest {
2536            // v6.8.3 — index advisor. Walks the SELECT's FROM
2537            // tables + WHERE column refs; for each (table, column)
2538            // pair that lacks an index, append a SUGGEST line with
2539            // a copy-pastable `CREATE INDEX` statement. This is a
2540            // pure-syntax heuristic — no cardinality estimation —
2541            // matching the v6.8.3 design intent of "tell the
2542            // operator where indexes are missing", not "give the
2543            // mathematically optimal index set".
2544            let suggestions = build_index_suggestions(&e.inner, self);
2545            for s in suggestions {
2546                lines.push(s);
2547            }
2548        } else if e.analyze {
2549            // v6.2.4 — EXPLAIN ANALYZE annotates each operator line
2550            // with `(rows=N)` where the row count is computable
2551            // without re-executing the full query:
2552            //   - Top-level operator (first non-indented line):
2553            //     rows = final result.len()
2554            //   - "From: <table> [full scan]" lines: rows =
2555            //     table.rows().len() (catalog read; no execution)
2556            //   - "From: <table> [index seek]": indeterminate —
2557            //     the index step would need re-execution; v6.2.5
2558            //     adds per-operator wall-clock + hot/cold rows
2559            //     instrumentation that makes this concrete.
2560            //   - Everything else: marked `(—)` so the surface
2561            //     stays well-defined without silently dropping
2562            //     stats. v6.2.5 fills in via inline executor
2563            //     instrumentation.
2564            // Total elapsed lands on a trailing `Total: …` line.
2565            let started = self.clock.map(|f| f());
2566            let exec = self.exec_select_cancel(&e.inner, cancel)?;
2567            let elapsed_micros = match (self.clock, started) {
2568                (Some(f), Some(s)) => Some(f().saturating_sub(s)),
2569                _ => None,
2570            };
2571            let row_count = if let QueryResult::Rows { rows, .. } = &exec {
2572                rows.len()
2573            } else {
2574                0
2575            };
2576            annotate_explain_lines(&mut lines, row_count, self);
2577            let mut total = alloc::format!("Total: rows={row_count}");
2578            if let Some(us) = elapsed_micros {
2579                total.push_str(&alloc::format!(" elapsed={us}us"));
2580            }
2581            lines.push(total);
2582        }
2583        let columns = alloc::vec![ColumnSchema::new("QUERY PLAN", DataType::Text, false)];
2584        let rows: Vec<Row> = lines
2585            .into_iter()
2586            .map(|l| Row::new(alloc::vec![Value::Text(l)]))
2587            .collect();
2588        Ok(QueryResult::Rows { columns, rows })
2589    }
2590
2591    fn exec_show_tables(&self) -> QueryResult {
2592        let columns = alloc::vec![ColumnSchema::new("name", DataType::Text, false)];
2593        let rows: Vec<Row> = self
2594            .active_catalog()
2595            .table_names()
2596            .into_iter()
2597            .map(|n| Row::new(alloc::vec![Value::Text(n)]))
2598            .collect();
2599        QueryResult::Rows { columns, rows }
2600    }
2601
2602    /// `SHOW COLUMNS FROM <table>` — one row per column with the
2603    /// declared name, SQL type rendering, and nullability flag.
2604    fn exec_show_columns(&self, table_name: &str) -> Result<QueryResult, EngineError> {
2605        let table =
2606            self.active_catalog()
2607                .get(table_name)
2608                .ok_or_else(|| StorageError::TableNotFound {
2609                    name: table_name.into(),
2610                })?;
2611        let columns = alloc::vec![
2612            ColumnSchema::new("name", DataType::Text, false),
2613            ColumnSchema::new("type", DataType::Text, false),
2614            ColumnSchema::new("nullable", DataType::Bool, false),
2615        ];
2616        let rows: Vec<Row> = table
2617            .schema()
2618            .columns
2619            .iter()
2620            .map(|c| {
2621                Row::new(alloc::vec![
2622                    Value::Text(c.name.clone()),
2623                    Value::Text(alloc::format!("{}", c.ty)),
2624                    Value::Bool(c.nullable),
2625                ])
2626            })
2627            .collect();
2628        Ok(QueryResult::Rows { columns, rows })
2629    }
2630
2631    fn exec_begin(&mut self) -> Result<QueryResult, EngineError> {
2632        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2633        if self.tx_catalogs.contains_key(&tx_id) {
2634            return Err(EngineError::TransactionAlreadyOpen);
2635        }
2636        self.tx_catalogs.insert(
2637            tx_id,
2638            TxState {
2639                catalog: self.catalog.clone(),
2640                savepoints: Vec::new(),
2641            },
2642        );
2643        Ok(QueryResult::CommandOk {
2644            affected: 0,
2645            modified_catalog: false,
2646        })
2647    }
2648
2649    fn exec_commit(&mut self) -> Result<QueryResult, EngineError> {
2650        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2651        let state = self
2652            .tx_catalogs
2653            .remove(&tx_id)
2654            .ok_or(EngineError::NoActiveTransaction)?;
2655        self.catalog = state.catalog;
2656        // All savepoints become permanent at COMMIT and the stack
2657        // resets for the next TX (`state.savepoints` is discarded with
2658        // `state`).
2659        Ok(QueryResult::CommandOk {
2660            affected: 0,
2661            modified_catalog: true,
2662        })
2663    }
2664
2665    fn exec_rollback(&mut self) -> Result<QueryResult, EngineError> {
2666        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2667        if self.tx_catalogs.remove(&tx_id).is_none() {
2668            return Err(EngineError::NoActiveTransaction);
2669        }
2670        // savepoints discarded with the TxState
2671        Ok(QueryResult::CommandOk {
2672            affected: 0,
2673            modified_catalog: false,
2674        })
2675    }
2676
2677    fn exec_savepoint(&mut self, name: String) -> Result<QueryResult, EngineError> {
2678        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2679        let state = self
2680            .tx_catalogs
2681            .get_mut(&tx_id)
2682            .ok_or(EngineError::NoActiveTransaction)?;
2683        // PG re-uses an existing savepoint name by dropping the older
2684        // entry and pushing a fresh one — match that behaviour so
2685        // application code can `SAVEPOINT sp; ...; SAVEPOINT sp` freely.
2686        state.savepoints.retain(|(n, _)| n != &name);
2687        let snapshot = state.catalog.clone();
2688        state.savepoints.push((name, snapshot));
2689        Ok(QueryResult::CommandOk {
2690            affected: 0,
2691            modified_catalog: false,
2692        })
2693    }
2694
2695    fn exec_rollback_to_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2696        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2697        let state = self
2698            .tx_catalogs
2699            .get_mut(&tx_id)
2700            .ok_or(EngineError::NoActiveTransaction)?;
2701        let pos = state
2702            .savepoints
2703            .iter()
2704            .rposition(|(n, _)| n == name)
2705            .ok_or_else(|| {
2706                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2707            })?;
2708        // The savepoint stays on the stack (PG semantics): a later
2709        // `RELEASE` or further `ROLLBACK TO` is still allowed. Everything
2710        // after it is discarded.
2711        let snapshot = state.savepoints[pos].1.clone();
2712        state.savepoints.truncate(pos + 1);
2713        state.catalog = snapshot;
2714        Ok(QueryResult::CommandOk {
2715            affected: 0,
2716            modified_catalog: false,
2717        })
2718    }
2719
2720    fn exec_release_savepoint(&mut self, name: &str) -> Result<QueryResult, EngineError> {
2721        let tx_id = self.current_tx.ok_or(EngineError::NoActiveTransaction)?;
2722        let state = self
2723            .tx_catalogs
2724            .get_mut(&tx_id)
2725            .ok_or(EngineError::NoActiveTransaction)?;
2726        let pos = state
2727            .savepoints
2728            .iter()
2729            .rposition(|(n, _)| n == name)
2730            .ok_or_else(|| {
2731                EngineError::Unsupported(alloc::format!("savepoint not found: {name}"))
2732            })?;
2733        // RELEASE keeps the work since the savepoint, just discards the
2734        // bookmark plus everything nested under it.
2735        state.savepoints.truncate(pos);
2736        Ok(QueryResult::CommandOk {
2737            affected: 0,
2738            modified_catalog: false,
2739        })
2740    }
2741
2742    /// v6.0.4 — synchronous `ALTER INDEX <name> REBUILD [WITH
2743    /// (encoding = …)]`. Walks every table in the active catalog
2744    /// looking for an index matching `stmt.name`, then delegates the
2745    /// rebuild (including any encoding switch) to
2746    /// `Table::rebuild_nsw_index`. The "live" non-blocking
2747    /// optimisation is v6.0.4.1 / v6.1.x territory.
2748    /// v6.7.2 — `ALTER TABLE t SET hot_tier_bytes = X`. Dispatch
2749    /// arm. Currently the only setting is `hot_tier_bytes`; later
2750    /// v6.7.x can extend `AlterTableTarget` without touching this
2751    /// arm structure.
2752    fn exec_alter_table(
2753        &mut self,
2754        s: spg_sql::ast::AlterTableStatement,
2755    ) -> Result<QueryResult, EngineError> {
2756        match s.target {
2757            spg_sql::ast::AlterTableTarget::SetHotTierBytes(n) => {
2758                let table = self
2759                    .active_catalog_mut()
2760                    .get_mut(&s.name)
2761                    .ok_or_else(|| {
2762                        EngineError::Storage(StorageError::TableNotFound {
2763                            name: s.name.clone(),
2764                        })
2765                    })?;
2766                table.schema_mut().hot_tier_bytes = Some(n);
2767            }
2768            spg_sql::ast::AlterTableTarget::AddForeignKey(fk) => {
2769                // v7.6.8 — resolve FK against the live catalog first
2770                // (validates parent table, columns, indices). Then
2771                // verify every existing row in the child table
2772                // satisfies the new constraint. Then install it.
2773                let cols_snapshot = self
2774                    .active_catalog()
2775                    .get(&s.name)
2776                    .ok_or_else(|| {
2777                        EngineError::Storage(StorageError::TableNotFound {
2778                            name: s.name.clone(),
2779                        })
2780                    })?
2781                    .schema()
2782                    .columns
2783                    .clone();
2784                let storage_fk = resolve_foreign_key(
2785                    &s.name,
2786                    &cols_snapshot,
2787                    fk,
2788                    self.active_catalog(),
2789                )?;
2790                // Verify existing rows. Treat them as a virtual
2791                // INSERT batch — reusing the v7.6.2 enforce helper.
2792                let existing_rows: Vec<Vec<Value>> = self
2793                    .active_catalog()
2794                    .get(&s.name)
2795                    .expect("checked above")
2796                    .rows()
2797                    .iter()
2798                    .map(|r| r.values.clone())
2799                    .collect();
2800                enforce_fk_inserts(
2801                    self.active_catalog(),
2802                    &s.name,
2803                    core::slice::from_ref(&storage_fk),
2804                    &existing_rows,
2805                )?;
2806                // Reject duplicate constraint name.
2807                let table = self
2808                    .active_catalog_mut()
2809                    .get_mut(&s.name)
2810                    .expect("checked above");
2811                if let Some(name) = &storage_fk.name
2812                    && table
2813                        .schema()
2814                        .foreign_keys
2815                        .iter()
2816                        .any(|f| f.name.as_ref() == Some(name))
2817                {
2818                    return Err(EngineError::Unsupported(alloc::format!(
2819                        "ALTER TABLE ADD CONSTRAINT: a constraint named {name:?} already exists"
2820                    )));
2821                }
2822                table.schema_mut().foreign_keys.push(storage_fk);
2823            }
2824            spg_sql::ast::AlterTableTarget::DropForeignKey(name) => {
2825                let table = self
2826                    .active_catalog_mut()
2827                    .get_mut(&s.name)
2828                    .ok_or_else(|| {
2829                        EngineError::Storage(StorageError::TableNotFound {
2830                            name: s.name.clone(),
2831                        })
2832                    })?;
2833                let fks = &mut table.schema_mut().foreign_keys;
2834                let before = fks.len();
2835                fks.retain(|f| f.name.as_ref() != Some(&name));
2836                if fks.len() == before {
2837                    return Err(EngineError::Unsupported(alloc::format!(
2838                        "ALTER TABLE DROP CONSTRAINT: no FK named {name:?} on {:?}",
2839                        s.name
2840                    )));
2841                }
2842            }
2843        }
2844        Ok(QueryResult::CommandOk {
2845            affected: 0,
2846            modified_catalog: !self.in_transaction(),
2847        })
2848    }
2849
2850    fn exec_alter_index(
2851        &mut self,
2852        stmt: spg_sql::ast::AlterIndexStatement,
2853    ) -> Result<QueryResult, EngineError> {
2854        // Translate the optional SQL-side encoding choice into the
2855        // storage-side enum; the same SqlVecEncoding -> VecEncoding
2856        // bridge `column_type_to_data_type` uses.
2857        let spg_sql::ast::AlterIndexStatement {
2858            name: idx_name,
2859            target,
2860        } = stmt;
2861        let spg_sql::ast::AlterIndexTarget::Rebuild { encoding } = target;
2862        let target = encoding.map(|e| match e {
2863            SqlVecEncoding::F32 => VecEncoding::F32,
2864            SqlVecEncoding::Sq8 => VecEncoding::Sq8,
2865            SqlVecEncoding::F16 => VecEncoding::F16,
2866        });
2867        // Linear scan: index names are globally unique within a
2868        // catalog (enforced by add_nsw_index_inner) so the first
2869        // match is the only one. Save the table name to avoid
2870        // borrowing while we then take a mut borrow.
2871        let table_name = {
2872            let cat = self.active_catalog();
2873            let mut found: Option<String> = None;
2874            for tname in cat.table_names() {
2875                if let Some(t) = cat.get(&tname)
2876                    && t.indices().iter().any(|i| i.name == idx_name)
2877                {
2878                    found = Some(tname);
2879                    break;
2880                }
2881            }
2882            found.ok_or_else(|| {
2883                EngineError::Storage(StorageError::IndexNotFound {
2884                    name: idx_name.clone(),
2885                })
2886            })?
2887        };
2888        let table = self
2889            .active_catalog_mut()
2890            .get_mut(&table_name)
2891            .expect("table found above");
2892        table.rebuild_nsw_index(&idx_name, target)?;
2893        // v6.3.1 — ALTER INDEX REBUILD potentially with new encoding
2894        // changes cost characteristics; evict any cached plans.
2895        self.plan_cache.evict_referencing(&table_name);
2896        Ok(QueryResult::CommandOk {
2897            affected: 0,
2898            modified_catalog: !self.in_transaction(),
2899        })
2900    }
2901
2902    fn exec_create_index(
2903        &mut self,
2904        stmt: CreateIndexStatement,
2905    ) -> Result<QueryResult, EngineError> {
2906        let table = self
2907            .active_catalog_mut()
2908            .get_mut(&stmt.table)
2909            .ok_or_else(|| {
2910                EngineError::Storage(StorageError::TableNotFound {
2911                    name: stmt.table.clone(),
2912                })
2913            })?;
2914        // `IF NOT EXISTS` reduces DuplicateIndex to a no-op CommandOk.
2915        if stmt.if_not_exists && table.indices().iter().any(|i| i.name == stmt.name) {
2916            return Ok(QueryResult::CommandOk {
2917                affected: 0,
2918                modified_catalog: false,
2919            });
2920        }
2921        // v7.9.14 — multi-column index parses through; engine
2922        // builds a single-column BTree on the leading column only.
2923        // The extras live on the AST so spg-server's dispatcher
2924        // can emit a PG-wire NoticeResponse / log line. Composite
2925        // BTree keys land in v7.10.
2926        let _ = &stmt.extra_columns; // intentional drop on engine side
2927        let table_name = stmt.table.clone();
2928        // v6.8.0 — resolve INCLUDE column names to positions. Done
2929        // before `add_index` so a typo error surfaces before any
2930        // catalog mutation lands.
2931        let included_positions: Vec<usize> = if stmt.included_columns.is_empty() {
2932            Vec::new()
2933        } else {
2934            let schema = table.schema();
2935            stmt.included_columns
2936                .iter()
2937                .map(|c| {
2938                    schema.column_position(c).ok_or_else(|| {
2939                        EngineError::Storage(StorageError::ColumnNotFound {
2940                            column: c.clone(),
2941                        })
2942                    })
2943                })
2944                .collect::<Result<Vec<_>, _>>()?
2945        };
2946        match stmt.method {
2947            IndexMethod::BTree => table.add_index(stmt.name.clone(), &stmt.column)?,
2948            IndexMethod::Hnsw => {
2949                if !included_positions.is_empty() {
2950                    return Err(EngineError::Unsupported(
2951                        "INCLUDE columns are not supported on HNSW indexes".into(),
2952                    ));
2953                }
2954                table.add_nsw_index(stmt.name.clone(), &stmt.column, spg_storage::NSW_DEFAULT_M)?;
2955            }
2956            // v6.7.1 — BRIN. Pure metadata; no in-memory data.
2957            IndexMethod::Brin => {
2958                if !included_positions.is_empty() {
2959                    return Err(EngineError::Unsupported(
2960                        "INCLUDE columns are not supported on BRIN indexes".into(),
2961                    ));
2962                }
2963                table.add_brin_index(stmt.name.clone(), &stmt.column)?;
2964            }
2965        }
2966        if !included_positions.is_empty()
2967            && let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name)
2968        {
2969            idx.included_columns = included_positions;
2970        }
2971        // v6.8.1 — persist partial-index predicate. Stored as the
2972        // expression's Display form so the catalog snapshot stays
2973        // pure (storage has no spg-sql dependency). The runtime
2974        // maintenance path treats partial indexes identically to
2975        // full indexes for v6.8.1 (over-maintenance is safe; the
2976        // planner-side "use partial when query WHERE implies the
2977        // predicate" pass is STABILITY carve-out).
2978        if let Some(pred_expr) = &stmt.partial_predicate {
2979            let canonical = pred_expr.to_string();
2980            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2981                return Err(EngineError::Unsupported(
2982                    "WHERE predicates are not supported on HNSW or BRIN indexes".into(),
2983                ));
2984            }
2985            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
2986                idx.partial_predicate = Some(canonical);
2987            }
2988        }
2989        // v6.8.2 — persist expression index key. Same Display-form
2990        // storage; the runtime maintenance pass evaluates each
2991        // row's expression to derive the index key, but for v6.8.2
2992        // the engine falls through to the bare-column-reference
2993        // path and the expression is preserved for format-layer
2994        // round-trip + future planner work. Carved-out in
2995        // STABILITY § "Out of v6.8".
2996        if let Some(key_expr) = &stmt.expression {
2997            if matches!(stmt.method, IndexMethod::Hnsw | IndexMethod::Brin) {
2998                return Err(EngineError::Unsupported(
2999                    "Expression keys are not supported on HNSW or BRIN indexes".into(),
3000                ));
3001            }
3002            let canonical = key_expr.to_string();
3003            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3004                idx.expression = Some(canonical);
3005            }
3006        }
3007        // v7.9.29 — persist `is_unique` flag on the storage Index.
3008        // Combined with `partial_predicate`, INSERT enforcement
3009        // checks that no other row whose predicate evaluates true
3010        // shares the same indexed key. Parser already rejected
3011        // `UNIQUE` on HNSW / BRIN, so plain BTree here.
3012        // For multi-column UNIQUE INDEX the extras matter (the
3013        // full tuple is the uniqueness key), so resolve them to
3014        // column positions and persist on the index too.
3015        if stmt.is_unique {
3016            let mut extra_positions: alloc::vec::Vec<usize> = alloc::vec::Vec::new();
3017            for col_name in &stmt.extra_columns {
3018                let pos = table
3019                    .schema()
3020                    .columns
3021                    .iter()
3022                    .position(|c| c.name.eq_ignore_ascii_case(col_name))
3023                    .ok_or_else(|| {
3024                        EngineError::Unsupported(alloc::format!(
3025                            "UNIQUE INDEX {:?}: extra column {col_name:?} not in table {:?}",
3026                            stmt.name, stmt.table
3027                        ))
3028                    })?;
3029                extra_positions.push(pos);
3030            }
3031            if let Some(idx) = table.indices_mut().iter_mut().find(|i| i.name == stmt.name) {
3032                idx.is_unique = true;
3033                idx.extra_column_positions = extra_positions;
3034            }
3035            // At index-creation time, check the existing rows for
3036            // pre-existing duplicates that would have violated the
3037            // new constraint — otherwise CREATE UNIQUE INDEX would
3038            // silently leave duplicates in place.
3039            let snapshot_indices = table.indices().to_vec();
3040            let snapshot_rows: alloc::vec::Vec<spg_storage::Row> =
3041                table.rows().iter().cloned().collect();
3042            let snapshot_schema = table.schema().clone();
3043            let idx_ref = snapshot_indices
3044                .iter()
3045                .find(|i| i.name == stmt.name)
3046                .expect("just-added index");
3047            check_existing_unique_violation(idx_ref, &snapshot_schema, &snapshot_rows)?;
3048        }
3049        // v6.3.1 — adding an index can change the optimal plan for
3050        // any cached query that references this table.
3051        self.plan_cache.evict_referencing(&table_name);
3052        Ok(QueryResult::CommandOk {
3053            affected: 0,
3054            modified_catalog: !self.in_transaction(),
3055        })
3056    }
3057
3058    fn exec_create_table(
3059        &mut self,
3060        stmt: CreateTableStatement,
3061    ) -> Result<QueryResult, EngineError> {
3062        if stmt.if_not_exists && self.active_catalog().get(&stmt.name).is_some() {
3063            return Ok(QueryResult::CommandOk {
3064                affected: 0,
3065                modified_catalog: false,
3066            });
3067        }
3068        let table_name = stmt.name.clone();
3069        // v7.9.13 — pluck the names of any columns marked
3070        // `PRIMARY KEY` inline so the post-create-table pass can
3071        // build an implicit BTree index. mailrs F1.
3072        let inline_pk_columns: Vec<String> = stmt
3073            .columns
3074            .iter()
3075            .filter(|c| c.is_primary_key)
3076            .map(|c| c.name.clone())
3077            .collect();
3078        // v7.9.19 — table-level constraints: PRIMARY KEY (a, b, ...)
3079        // and UNIQUE (a, b, ...). Each builds a BTree index on the
3080        // leading column (the existing single-column storage tier)
3081        // and registers a UniquenessConstraint on the schema for
3082        // INSERT-time enforcement of the full tuple. mailrs G1/G6.
3083        let cols = stmt
3084            .columns
3085            .into_iter()
3086            .map(column_def_to_schema)
3087            .collect::<Result<Vec<_>, _>>()?;
3088        // Composite NOT-NULL implication for PRIMARY KEY columns.
3089        let mut cols = cols;
3090        for tc in &stmt.table_constraints {
3091            if let spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } = tc {
3092                for col_name in columns {
3093                    if let Some(col) = cols.iter_mut().find(|c| c.name == *col_name) {
3094                        col.nullable = false;
3095                    }
3096                }
3097            }
3098        }
3099        // v7.6.1 — resolve every FK in the statement against the
3100        // already-known catalog. Validates: parent table exists,
3101        // parent column names exist, arity matches, parent columns
3102        // have a PK / UNIQUE index. Self-referencing FKs (parent
3103        // table == this table) resolve against the column list we
3104        // just built — they don't need the catalog yet.
3105        let mut fks: Vec<spg_storage::ForeignKeyConstraint> =
3106            Vec::with_capacity(stmt.foreign_keys.len());
3107        for fk in stmt.foreign_keys {
3108            fks.push(resolve_foreign_key(
3109                &table_name,
3110                &cols,
3111                fk,
3112                self.active_catalog(),
3113            )?);
3114        }
3115        let mut schema = TableSchema::new(table_name.clone(), cols);
3116        schema.foreign_keys = fks;
3117        // v7.9.19 — translate AST table_constraints to storage
3118        // UniquenessConstraints (column name → position) so the
3119        // INSERT enforcement helper sees positions directly.
3120        let mut uc_storage: Vec<spg_storage::UniquenessConstraint> = Vec::new();
3121        for tc in &stmt.table_constraints {
3122            let (is_pk, names) = match tc {
3123                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3124                    (true, columns.clone())
3125                }
3126                spg_sql::ast::TableConstraint::Unique { columns, .. } => {
3127                    (false, columns.clone())
3128                }
3129            };
3130            let mut positions = Vec::with_capacity(names.len());
3131            for n in &names {
3132                let pos = schema
3133                    .columns
3134                    .iter()
3135                    .position(|c| c.name == *n)
3136                    .ok_or_else(|| {
3137                        EngineError::Unsupported(alloc::format!(
3138                            "table constraint references unknown column {n:?}"
3139                        ))
3140                    })?;
3141                positions.push(pos);
3142            }
3143            uc_storage.push(spg_storage::UniquenessConstraint {
3144                is_primary_key: is_pk,
3145                columns: positions,
3146            });
3147        }
3148        schema.uniqueness_constraints = uc_storage.clone();
3149        self.active_catalog_mut().create_table(schema)?;
3150        // v7.9.13 — implicit BTree per inline PK column +
3151        // v7.9.19 — implicit BTree on the leading column of every
3152        // table-level PRIMARY KEY / UNIQUE constraint.
3153        let table = self
3154            .active_catalog_mut()
3155            .get_mut(&table_name)
3156            .expect("just created");
3157        for (i, col_name) in inline_pk_columns.iter().enumerate() {
3158            let idx_name = if inline_pk_columns.len() == 1 {
3159                alloc::format!("{table_name}_pkey")
3160            } else {
3161                alloc::format!("{table_name}_pkey_{i}")
3162            };
3163            if let Err(e) = table.add_index(idx_name, col_name) {
3164                return Err(EngineError::Storage(e));
3165            }
3166        }
3167        for (i, tc) in stmt.table_constraints.iter().enumerate() {
3168            let (is_pk, names) = match tc {
3169                spg_sql::ast::TableConstraint::PrimaryKey { columns, .. } => {
3170                    (true, columns)
3171                }
3172                spg_sql::ast::TableConstraint::Unique { columns, .. } => {
3173                    (false, columns)
3174                }
3175            };
3176            let leading = &names[0];
3177            // Skip if a same-column BTree already exists (e.g.
3178            // inline PK on the leading column).
3179            let already = table
3180                .indices()
3181                .iter()
3182                .any(|idx| {
3183                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
3184                        && table.schema().columns[idx.column_position].name == *leading
3185                });
3186            if already {
3187                continue;
3188            }
3189            let suffix = if is_pk { "pkey" } else { "key" };
3190            let idx_name = if names.len() == 1 {
3191                alloc::format!("{table_name}_{leading}_{suffix}")
3192            } else {
3193                alloc::format!("{table_name}_{leading}_{suffix}_{i}")
3194            };
3195            if let Err(e) = table.add_index(idx_name, leading) {
3196                return Err(EngineError::Storage(e));
3197            }
3198        }
3199        Ok(QueryResult::CommandOk {
3200            affected: 0,
3201            modified_catalog: !self.in_transaction(),
3202        })
3203    }
3204
3205    fn exec_insert(&mut self, stmt: InsertStatement) -> Result<QueryResult, EngineError> {
3206        // v7.9.21 — snapshot the clock fn pointer before the mut
3207        // borrow on the catalog opens; runtime DEFAULT eval needs
3208        // it inside the row hot loop.
3209        let clock = self.clock;
3210        let table = self
3211            .active_catalog_mut()
3212            .get_mut(&stmt.table)
3213            .ok_or_else(|| {
3214                EngineError::Storage(StorageError::TableNotFound {
3215                    name: stmt.table.clone(),
3216                })
3217            })?;
3218        // v3.1.5: clone the columns vector only (not the whole
3219        // TableSchema — saves one String alloc for the table name).
3220        // We need an owned snapshot because we'll call `table.insert`
3221        // (mutable borrow on `table`) inside the row loop while
3222        // reading schema fields.
3223        let column_meta: Vec<ColumnSchema> = table.schema().columns.clone();
3224        let schema_cols_len = column_meta.len();
3225        // Build a permutation `tuple_pos[c] = Some(j)` meaning schema
3226        // column `c` is filled from the `j`-th tuple slot; `None` means
3227        // "fill with NULL". Validated once and reused for every row.
3228        let tuple_pos: Option<Vec<Option<usize>>> = match &stmt.columns {
3229            None => None, // 1-1 mapping, fast path
3230            Some(cols) => {
3231                let mut map = alloc::vec![None; schema_cols_len];
3232                for (j, name) in cols.iter().enumerate() {
3233                    let idx = column_meta
3234                        .iter()
3235                        .position(|c| c.name == *name)
3236                        .ok_or_else(|| {
3237                            EngineError::Eval(EvalError::ColumnNotFound { name: name.clone() })
3238                        })?;
3239                    if map[idx].is_some() {
3240                        return Err(EngineError::Storage(StorageError::ArityMismatch {
3241                            expected: schema_cols_len,
3242                            actual: cols.len(),
3243                        }));
3244                    }
3245                    map[idx] = Some(j);
3246                }
3247                // Omitted columns must either be nullable, carry a
3248                // DEFAULT, or be AUTO_INCREMENT. Catch NOT NULL
3249                // omissions up front so the WAL stays clean.
3250                for (i, col) in column_meta.iter().enumerate() {
3251                    if map[i].is_none()
3252                        && !col.nullable
3253                        && col.default.is_none()
3254                        && col.runtime_default.is_none()
3255                        && !col.auto_increment
3256                    {
3257                        return Err(EngineError::Storage(StorageError::NullInNotNull {
3258                            column: col.name.clone(),
3259                        }));
3260                    }
3261                }
3262                Some(map)
3263            }
3264        };
3265        let expected_tuple_len = stmt.columns.as_ref().map_or(schema_cols_len, Vec::len);
3266        // v7.6.2 — snapshot this table's FK list before the
3267        // mutable-borrow window so we can run parent lookups
3268        // against the immutable catalog after parsing. Empty vec is
3269        // the no-FK fast path; clone cost is O(fks * arity) which
3270        // is < 100 ns for typical schemas.
3271        let fks = table.schema().foreign_keys.clone();
3272        let mut affected = 0usize;
3273        // Stage 1 — parse + AUTO_INC + coerce all rows under the
3274        // single mutable borrow.
3275        let mut all_values: Vec<Vec<Value>> = Vec::with_capacity(stmt.rows.len());
3276        for tuple in stmt.rows {
3277            if tuple.len() != expected_tuple_len {
3278                return Err(EngineError::Storage(StorageError::ArityMismatch {
3279                    expected: expected_tuple_len,
3280                    actual: tuple.len(),
3281                }));
3282            }
3283            // Fast path: no column-list permutation → tuple slot j
3284            // maps to schema column j. We can zip schema with tuple
3285            // and skip the `raw_tuple` staging allocation entirely.
3286            let values: Vec<Value> = if let Some(map) = &tuple_pos {
3287                // Permuted path: still need raw_tuple to index by `map[i]`.
3288                let raw_tuple: Vec<Value> = tuple
3289                    .into_iter()
3290                    .map(literal_expr_to_value)
3291                    .collect::<Result<_, _>>()?;
3292                let mut out = Vec::with_capacity(schema_cols_len);
3293                for (i, col) in column_meta.iter().enumerate() {
3294                    let mut raw = match map[i] {
3295                        Some(j) => raw_tuple[j].clone(),
3296                        None => resolve_column_default_free(col, clock)?,
3297                    };
3298                    if col.auto_increment && raw.is_null() {
3299                        let next = table.next_auto_value(i).ok_or_else(|| {
3300                            EngineError::Unsupported(alloc::format!(
3301                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3302                                col.name
3303                            ))
3304                        })?;
3305                        raw = Value::BigInt(next);
3306                    }
3307                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3308                }
3309                out
3310            } else {
3311                // 1-1 mapping fast path: single Vec alloc, no raw_tuple.
3312                let mut out = Vec::with_capacity(schema_cols_len);
3313                for (i, (col, expr)) in column_meta.iter().zip(tuple).enumerate() {
3314                    let mut raw = literal_expr_to_value(expr)?;
3315                    if col.auto_increment && raw.is_null() {
3316                        let next = table.next_auto_value(i).ok_or_else(|| {
3317                            EngineError::Unsupported(alloc::format!(
3318                                "AUTO_INCREMENT applies to integer columns only (column `{}`)",
3319                                col.name
3320                            ))
3321                        })?;
3322                        raw = Value::BigInt(next);
3323                    }
3324                    out.push(coerce_value(raw, col.ty, &col.name, i)?);
3325                }
3326                out
3327            };
3328            all_values.push(values);
3329        }
3330        // Stage 2 — FK enforcement on the immutable catalog.
3331        // Non-lexical lifetimes release the mutable borrow on
3332        // `table` here since stage 1 was the last use. The
3333        // parent-table lookup runs before any row is committed.
3334        let uniqueness = table.schema().uniqueness_constraints.clone();
3335        let _ = table;
3336        if !fks.is_empty() {
3337            enforce_fk_inserts(self.active_catalog(), &stmt.table, &fks, &all_values)?;
3338        }
3339        // v7.9.19 — composite UNIQUE / PRIMARY KEY enforcement.
3340        enforce_uniqueness_inserts(
3341            self.active_catalog(),
3342            &stmt.table,
3343            &uniqueness,
3344            &all_values,
3345        )?;
3346        // v7.9.29 — CREATE UNIQUE INDEX [WHERE pred] enforcement.
3347        // Independent of table-level UniquenessConstraint (which
3348        // can't carry a predicate). Walks the table's indexes;
3349        // for each `is_unique` index, only rows whose
3350        // partial_predicate evaluates truthy are checked for
3351        // collision. mailrs K1.
3352        enforce_unique_index_inserts(
3353            self.active_catalog(),
3354            &stmt.table,
3355            &all_values,
3356        )?;
3357        // v7.9.8 / v7.9.9 — ON CONFLICT handling.
3358        //   - `DO NOTHING` filters `all_values` to non-conflicting
3359        //     rows + drops within-batch duplicates.
3360        //   - `DO UPDATE SET …` ALSO filters, but for each
3361        //     conflicting row it queues an UPDATE on the existing
3362        //     row using the incoming row's values as `EXCLUDED.*`.
3363        let mut pending_updates: Vec<(usize, Vec<Value>)> = Vec::new();
3364        let mut skipped_count = 0usize;
3365        if let Some(clause) = &stmt.on_conflict {
3366            let conflict_cols = resolve_on_conflict_columns(
3367                self.active_catalog(),
3368                &stmt.table,
3369                clause.target_columns.as_slice(),
3370            )?;
3371            let mut kept: Vec<Vec<Value>> = Vec::with_capacity(all_values.len());
3372            let mut seen_keys: Vec<Vec<Value>> = Vec::new();
3373            for values in all_values {
3374                let key_tuple: Vec<&Value> =
3375                    conflict_cols.iter().map(|&c| &values[c]).collect();
3376                // SQL spec: NULL in any conflict column means "no
3377                // conflict possible" (NULL ≠ NULL for uniqueness).
3378                let has_null_key = key_tuple.iter().any(|v| matches!(v, Value::Null));
3379                let collides_with_table = !has_null_key
3380                    && on_conflict_keys_exist(
3381                        self.active_catalog(),
3382                        &stmt.table,
3383                        &conflict_cols,
3384                        &key_tuple,
3385                    );
3386                let key_tuple_owned: Vec<Value> =
3387                    key_tuple.iter().map(|v| (*v).clone()).collect();
3388                let collides_with_batch = !has_null_key
3389                    && seen_keys.iter().any(|k| k == &key_tuple_owned);
3390                let collides = collides_with_table || collides_with_batch;
3391                match (&clause.action, collides) {
3392                    (_, false) => {
3393                        seen_keys.push(key_tuple_owned);
3394                        kept.push(values);
3395                    }
3396                    (spg_sql::ast::OnConflictAction::Nothing, true) => {
3397                        skipped_count += 1;
3398                    }
3399                    (
3400                        spg_sql::ast::OnConflictAction::Update {
3401                            assignments,
3402                            where_,
3403                        },
3404                        true,
3405                    ) => {
3406                        if !collides_with_table {
3407                            skipped_count += 1;
3408                            continue;
3409                        }
3410                        let target_pos = lookup_row_position_by_keys(
3411                            self.active_catalog(),
3412                            &stmt.table,
3413                            &conflict_cols,
3414                            &key_tuple,
3415                        )
3416                        .ok_or_else(|| {
3417                            EngineError::Unsupported(
3418                                "ON CONFLICT DO UPDATE: conflict detected but row \
3419                                 position could not be resolved (cold-tier row?)"
3420                                    .into(),
3421                            )
3422                        })?;
3423                        let updated = apply_on_conflict_assignments(
3424                            self.active_catalog(),
3425                            &stmt.table,
3426                            target_pos,
3427                            &values,
3428                            assignments,
3429                            where_.as_ref(),
3430                        )?;
3431                        if let Some(new_row) = updated {
3432                            pending_updates.push((target_pos, new_row));
3433                        } else {
3434                            skipped_count += 1;
3435                        }
3436                    }
3437                }
3438            }
3439            all_values = kept;
3440        }
3441        // Stage 3 — insert all rows under a fresh mutable borrow.
3442        let table = self
3443            .active_catalog_mut()
3444            .get_mut(&stmt.table)
3445            .ok_or_else(|| {
3446                EngineError::Storage(StorageError::TableNotFound {
3447                    name: stmt.table.clone(),
3448                })
3449            })?;
3450        // v7.9.4 — keep RETURNING projection rows separate per
3451        // INSERT and per UPDATE branch so DO UPDATE pushes the new
3452        // post-update state, not the incoming-only values.
3453        let mut returning_rows: Vec<Vec<Value>> = Vec::new();
3454        for values in all_values {
3455            if stmt.returning.is_some() {
3456                returning_rows.push(values.clone());
3457            }
3458            table.insert(Row::new(values))?;
3459            affected += 1;
3460        }
3461        // v7.9.9 — apply ON CONFLICT DO UPDATE rewrites collected
3462        // in the conflict-resolution pass. update_row handles
3463        // index maintenance + body re-encoding.
3464        for (pos, new_row) in pending_updates {
3465            if stmt.returning.is_some() {
3466                returning_rows.push(new_row.clone());
3467            }
3468            table.update_row(pos, new_row)?;
3469            affected += 1;
3470        }
3471        let _ = skipped_count;
3472        // v7.9.4/v7.9.9 — RETURNING streams the rows that ended
3473        // up in the table after this statement (insert or
3474        // post-update on conflict).
3475        if let Some(items) = &stmt.returning {
3476            let _ = table;
3477            return self.build_returning_rows(
3478                &stmt.table,
3479                items,
3480                returning_rows,
3481            );
3482        }
3483        // v6.2.1 — auto-analyze: track per-table modified-row
3484        // counter so the background sweep can decide when to
3485        // re-ANALYZE. Cheap path on the autocommit-wrap hot loop
3486        // — one BTreeMap entry update per INSERT batch.
3487        if !self.in_transaction() && affected > 0 {
3488            self.statistics
3489                .record_modifications(&stmt.table, affected as u64);
3490        }
3491        Ok(QueryResult::CommandOk {
3492            affected,
3493            modified_catalog: !self.in_transaction(),
3494        })
3495    }
3496
3497    /// v4.5: SELECT with cooperative cancellation. The token is
3498    /// honoured between UNION peers and inside the bare-SELECT row
3499    /// loop; HNSW kNN graph walks and the aggregate executor don't
3500    /// honour it yet (deferred — those paths bound their work
3501    /// internally by `LIMIT k` and `GROUP BY` cardinality).
3502    /// v6.10.2 — cold-tier time-travel scan. Resolves the segment
3503    /// by id, decodes each row body against the table's current
3504    /// schema, applies the SELECT's projection + optional WHERE +
3505    /// optional LIMIT, returns a `Rows` result. JOINs / aggregates
3506    /// / ORDER BY are unsupported on this path (STABILITY carve-
3507    /// out); operators wanting them should restore the segment
3508    /// into a regular table first.
3509    fn exec_select_as_of_segment(
3510        &self,
3511        stmt: &SelectStatement,
3512        from: &spg_sql::ast::FromClause,
3513        segment_id: u32,
3514    ) -> Result<QueryResult, EngineError> {
3515        // v6.10.2 scope: no joins, no aggregates, no ORDER BY,
3516        // no GROUP BY / HAVING / UNION / OFFSET / DISTINCT.
3517        if !from.joins.is_empty()
3518            || stmt.group_by.is_some()
3519            || stmt.having.is_some()
3520            || !stmt.unions.is_empty()
3521            || !stmt.order_by.is_empty()
3522            || stmt.offset.is_some()
3523            || stmt.distinct
3524            || aggregate::uses_aggregate(stmt)
3525        {
3526            return Err(EngineError::Unsupported(
3527                "AS OF SEGMENT supports SELECT projection + WHERE + LIMIT only \
3528                 (joins / aggregates / ORDER BY are STABILITY § \"Out of v6.10\")"
3529                    .into(),
3530            ));
3531        }
3532        let table = self
3533            .active_catalog()
3534            .get(&from.primary.name)
3535            .ok_or_else(|| StorageError::TableNotFound {
3536                name: from.primary.name.clone(),
3537            })?;
3538        let schema = table.schema().clone();
3539        let schema_cols = &schema.columns;
3540        let alias = from
3541            .primary
3542            .alias
3543            .as_deref()
3544            .unwrap_or(from.primary.name.as_str());
3545        let ctx = EvalContext::new(schema_cols, Some(alias));
3546        let seg = self
3547            .active_catalog()
3548            .cold_segment(segment_id)
3549            .ok_or_else(|| {
3550                EngineError::Unsupported(alloc::format!(
3551                    "AS OF SEGMENT: cold segment {segment_id} not registered"
3552                ))
3553            })?;
3554        let mut out_rows: Vec<Row> = Vec::new();
3555        let mut limit_remaining: Option<usize> =
3556            stmt.limit_literal().and_then(|n| usize::try_from(n).ok());
3557        for (_key, body) in seg.scan() {
3558            let (row, _consumed) = spg_storage::decode_row_body_dense(&body, &schema)
3559                .map_err(EngineError::Storage)?;
3560            if let Some(where_expr) = &stmt.where_ {
3561                let cond = self.eval_expr_simple(where_expr, &row, &ctx)?;
3562                if !matches!(cond, Value::Bool(true)) {
3563                    continue;
3564                }
3565            }
3566            // Projection.
3567            let projected = self.project_row_simple(&row, &stmt.items, schema_cols, alias)?;
3568            out_rows.push(projected);
3569            if let Some(rem) = limit_remaining.as_mut() {
3570                if *rem == 0 {
3571                    out_rows.pop();
3572                    break;
3573                }
3574                *rem -= 1;
3575            }
3576        }
3577        // Output column schema: derive from SELECT items.
3578        let columns = self.derive_output_columns(&stmt.items, schema_cols, alias);
3579        Ok(QueryResult::Rows {
3580            columns,
3581            rows: out_rows,
3582        })
3583    }
3584
3585    /// v6.10.2 — simple-path WHERE eval that doesn't go through
3586    /// the correlated-subquery / Memoize machinery. AS OF SEGMENT
3587    /// scan paths predicate against a snapshot frozen segment, no
3588    /// cross-row state.
3589    fn eval_expr_simple(
3590        &self,
3591        expr: &Expr,
3592        row: &Row,
3593        ctx: &EvalContext,
3594    ) -> Result<Value, EngineError> {
3595        let cancel = CancelToken::none();
3596        self.eval_expr_with_correlated(expr, row, ctx, cancel, None)
3597    }
3598
3599    /// v7.9.4 — INSERT / UPDATE / DELETE RETURNING projector.
3600    /// Given the table name, the user-supplied projection items,
3601    /// and the mutated rows (post-insert / post-update values, or
3602    /// pre-delete snapshot), build a `QueryResult::Rows` whose
3603    /// schema describes the projected columns. Mailrs migration
3604    /// blocker #1.
3605    fn build_returning_rows(
3606        &self,
3607        table_name: &str,
3608        items: &[SelectItem],
3609        mutated_rows: Vec<Vec<Value>>,
3610    ) -> Result<QueryResult, EngineError> {
3611        let table = self.active_catalog().get(table_name).ok_or_else(|| {
3612            EngineError::Storage(StorageError::TableNotFound {
3613                name: table_name.into(),
3614            })
3615        })?;
3616        let schema_cols = table.schema().columns.clone();
3617        let columns = self.derive_output_columns(items, &schema_cols, table_name);
3618        let mut out_rows: Vec<Row> = Vec::with_capacity(mutated_rows.len());
3619        for values in mutated_rows {
3620            let row = Row::new(values);
3621            let projected = self.project_row_simple(&row, items, &schema_cols, table_name)?;
3622            out_rows.push(projected);
3623        }
3624        Ok(QueryResult::Rows {
3625            columns,
3626            rows: out_rows,
3627        })
3628    }
3629
3630    /// v6.10.2 — projection for AS OF SEGMENT. Resolves
3631    /// `SelectItem::Wildcard` to all schema columns and
3632    /// `SelectItem::Expr` via the regular eval path.
3633    fn project_row_simple(
3634        &self,
3635        row: &Row,
3636        items: &[SelectItem],
3637        schema_cols: &[ColumnSchema],
3638        alias: &str,
3639    ) -> Result<Row, EngineError> {
3640        let ctx = EvalContext::new(schema_cols, Some(alias));
3641        let cancel = CancelToken::none();
3642        let mut out_vals = Vec::new();
3643        for item in items {
3644            match item {
3645                SelectItem::Wildcard => {
3646                    out_vals.extend(row.values.iter().cloned());
3647                }
3648                SelectItem::Expr { expr, .. } => {
3649                    let v = self.eval_expr_with_correlated(expr, row, &ctx, cancel, None)?;
3650                    out_vals.push(v);
3651                }
3652            }
3653        }
3654        Ok(Row::new(out_vals))
3655    }
3656
3657    /// v6.10.2 — derive the output `ColumnSchema` list for an
3658    /// AS OF SEGMENT projection. Wildcards take the full schema;
3659    /// expressions take the alias if present or a synthetic
3660    /// `?column?` (PG convention) otherwise.
3661    fn derive_output_columns(
3662        &self,
3663        items: &[SelectItem],
3664        schema_cols: &[ColumnSchema],
3665        _alias: &str,
3666    ) -> Vec<ColumnSchema> {
3667        let mut out = Vec::new();
3668        for item in items {
3669            match item {
3670                SelectItem::Wildcard => {
3671                    out.extend(schema_cols.iter().cloned());
3672                }
3673                SelectItem::Expr { alias, .. } => {
3674                    let name = alias
3675                        .clone()
3676                        .unwrap_or_else(|| "?column?".to_string());
3677                    // Default to Text; the caller's row values
3678                    // carry the actual type. v6.10.2 scope.
3679                    out.push(ColumnSchema::new(name, DataType::Text, true));
3680                }
3681            }
3682        }
3683        out
3684    }
3685
3686    fn exec_select_cancel(
3687        &self,
3688        stmt: &SelectStatement,
3689        cancel: CancelToken<'_>,
3690    ) -> Result<QueryResult, EngineError> {
3691        cancel.check()?;
3692        // v6.10.2 — cold-tier time-travel short-circuit. When the
3693        // primary TableRef carries `AS OF SEGMENT '<id>'`, run a
3694        // dedicated cold-segment scan instead of the regular
3695        // hot+index path. The scope is intentionally narrow for
3696        // v6.10.2 — bare `SELECT * FROM <t> AS OF SEGMENT 'id'`,
3697        // optionally with a single-column-equality WHERE. JOINs /
3698        // aggregates / ORDER BY / subqueries on top of a time-
3699        // travelled scan are STABILITY § "Out of v6.10".
3700        if let Some(from) = &stmt.from
3701            && let Some(seg_id) = from.primary.as_of_segment
3702        {
3703            return self.exec_select_as_of_segment(stmt, from, seg_id);
3704        }
3705        // v6.2.0 / v6.5.0 — virtual-table short-circuits. Detected
3706        // pre-CTE because they don't read from the catalog and
3707        // shouldn't participate in regular FROM resolution.
3708        if let Some(from) = &stmt.from
3709            && from.joins.is_empty()
3710            && stmt.where_.is_none()
3711            && stmt.group_by.is_none()
3712            && stmt.having.is_none()
3713            && stmt.unions.is_empty()
3714            && stmt.order_by.is_empty()
3715            && stmt.limit.is_none()
3716            && stmt.offset.is_none()
3717            && !stmt.distinct
3718            && stmt.items.iter().all(|i| matches!(i, SelectItem::Wildcard))
3719        {
3720            let lower = from.primary.name.to_ascii_lowercase();
3721            match lower.as_str() {
3722                "spg_statistic" => return Ok(self.exec_spg_statistic()),
3723                // v6.5.0 — observability v2 virtual tables.
3724                "spg_stat_replication" => return Ok(self.exec_spg_stat_replication()),
3725                "spg_stat_segment" => return Ok(self.exec_spg_stat_segment()),
3726                "spg_stat_query" => return Ok(self.exec_spg_stat_query()),
3727                "spg_stat_activity" => return Ok(self.exec_spg_stat_activity()),
3728                "spg_audit_chain" => return Ok(self.exec_spg_audit_chain()),
3729                "spg_audit_verify" => return Ok(self.exec_spg_audit_verify()),
3730                "spg_table_ddl" => return Ok(self.exec_spg_table_ddl()),
3731                "spg_role_ddl" => return Ok(self.exec_spg_role_ddl()),
3732                "spg_database_ddl" => return Ok(self.exec_spg_database_ddl()),
3733                _ => {}
3734            }
3735        }
3736        // v4.11: CTEs materialise into a temporary enriched catalog
3737        // *before* anything else — the body SELECT can then refer
3738        // to CTE names via the regular FROM-clause resolution.
3739        // Uncorrelated only: each CTE body runs once against the
3740        // current catalog, not against later CTEs' results (left-
3741        // to-right materialisation would relax this, but we keep
3742        // it simple for v4.11 MVP).
3743        if !stmt.ctes.is_empty() {
3744            return self.exec_with_ctes(stmt, cancel);
3745        }
3746        // v4.10: subqueries (uncorrelated) are resolved here, before
3747        // the executor sees the row loop. We clone the statement so
3748        // we can mutate without disturbing the caller's AST — most
3749        // queries pass through with no subquery nodes and the clone
3750        // is cheap; with subqueries the materialisation cost
3751        // dominates anyway.
3752        let mut stmt_owned;
3753        let stmt_ref: &SelectStatement = if expr_tree_has_subquery(stmt) {
3754            stmt_owned = stmt.clone();
3755            self.resolve_select_subqueries(&mut stmt_owned, cancel)?;
3756            &stmt_owned
3757        } else {
3758            stmt
3759        };
3760        if stmt_ref.unions.is_empty() {
3761            return self.exec_bare_select_cancel(stmt_ref, cancel);
3762        }
3763        // UNION path: clone-strip the head into a bare block (its own
3764        // DISTINCT and any inner ORDER BY are dropped by parser rule —
3765        // the wrapper SelectStatement carries them), execute, then chain
3766        // peers with left-associative dedup semantics.
3767        let mut head = stmt_ref.clone();
3768        head.unions = Vec::new();
3769        head.order_by = Vec::new();
3770        head.limit = None;
3771        let QueryResult::Rows { columns, mut rows } =
3772            self.exec_bare_select_cancel(&head, cancel)?
3773        else {
3774            unreachable!("bare SELECT cannot return CommandOk")
3775        };
3776        for (kind, peer) in &stmt_ref.unions {
3777            let QueryResult::Rows {
3778                columns: peer_cols,
3779                rows: peer_rows,
3780            } = self.exec_bare_select_cancel(peer, cancel)?
3781            else {
3782                unreachable!("bare SELECT cannot return CommandOk")
3783            };
3784            if peer_cols.len() != columns.len() {
3785                return Err(EngineError::Unsupported(alloc::format!(
3786                    "UNION arity mismatch: head has {} columns, peer has {}",
3787                    columns.len(),
3788                    peer_cols.len()
3789                )));
3790            }
3791            rows.extend(peer_rows);
3792            if matches!(kind, UnionKind::Distinct) {
3793                rows = dedup_rows(rows);
3794            }
3795        }
3796        // ORDER BY at the top of a UNION applies to the combined result.
3797        // Eval against the projected schema (NOT the source table).
3798        if !stmt.order_by.is_empty() {
3799            let synth_ctx = EvalContext::new(&columns, None);
3800            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3801            let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(rows.len());
3802            for r in rows {
3803                let keys = build_order_keys(&stmt.order_by, &r, &synth_ctx)?;
3804                tagged.push((keys, r));
3805            }
3806            sort_by_keys(&mut tagged, &descs);
3807            rows = tagged.into_iter().map(|(_, r)| r).collect();
3808        }
3809        apply_offset_and_limit(&mut rows, stmt.offset_literal(), stmt.limit_literal());
3810        Ok(QueryResult::Rows { columns, rows })
3811    }
3812
3813    #[allow(clippy::too_many_lines)]
3814    #[allow(clippy::too_many_lines)] // huge match — splitting fragments the planner
3815    fn exec_bare_select_cancel(
3816        &self,
3817        stmt: &SelectStatement,
3818        cancel: CancelToken<'_>,
3819    ) -> Result<QueryResult, EngineError> {
3820        // v4.12: window-function path. When the projection contains
3821        // any `name(args) OVER (...)` we route to the dedicated
3822        // executor — partition + sort + per-row window value before
3823        // the regular projection.
3824        if select_has_window(stmt) {
3825            return self.exec_select_with_window(stmt, cancel);
3826        }
3827        // Constant SELECT (no FROM) — evaluate each item once against an
3828        // empty dummy row. Useful for `SELECT 1`, `SELECT coalesce(...)`,
3829        // `SELECT '7'::INT`. Column references will surface as
3830        // ColumnNotFound on eval since the schema is empty.
3831        let Some(from) = &stmt.from else {
3832            let empty_schema: Vec<ColumnSchema> = Vec::new();
3833            let ctx = EvalContext::new(&empty_schema, None);
3834            let projection = build_projection(&stmt.items, &empty_schema, "")?;
3835            let dummy_row = Row::new(Vec::new());
3836            let mut values = Vec::with_capacity(projection.len());
3837            for p in &projection {
3838                values.push(eval::eval_expr(&p.expr, &dummy_row, &ctx)?);
3839            }
3840            let columns: Vec<ColumnSchema> = projection
3841                .into_iter()
3842                .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
3843                .collect();
3844            return Ok(QueryResult::Rows {
3845                columns,
3846                rows: alloc::vec![Row::new(values)],
3847            });
3848        };
3849        // Multi-table FROM (one or more joined peers) goes through the
3850        // nested-loop join executor. Single-table FROM stays on the
3851        // existing scan + index-seek path.
3852        if !from.joins.is_empty() {
3853            return self.exec_joined_select(stmt, from);
3854        }
3855        let primary = &from.primary;
3856        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
3857            StorageError::TableNotFound {
3858                name: primary.name.clone(),
3859            }
3860        })?;
3861        let schema_cols = &table.schema().columns;
3862        // The qualifier accepted on column refs is the alias (if any) else the
3863        // bare table name.
3864        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
3865        let ctx = EvalContext::new(schema_cols, Some(alias));
3866
3867        // NSW kNN planner: `ORDER BY col <-> literal LIMIT k` with no
3868        // WHERE and an NSW index on `col` skips the full scan. The
3869        // walk returns rows already in ascending-distance order, so
3870        // ORDER BY / LIMIT are honoured implicitly.
3871        if let Some(nsw_rows) = try_nsw_knn(stmt, table, schema_cols, alias) {
3872            return materialise_in_order(stmt, table, schema_cols, alias, &nsw_rows);
3873        }
3874
3875        // Index seek: if WHERE is `col = literal` (or commuted) and the
3876        // referenced column has an index, dispatch each locator through
3877        // the catalog (hot tier → borrow, cold tier → page-read +
3878        // decode) and iterate just those rows. Otherwise fall back to a
3879        // full scan over the hot tier (cold-tier rows are only reached
3880        // via index seek in v5.1 — full table scans against cold-tier
3881        // data ship in v5.2 with the freezer's per-segment scan API).
3882        let indexed_rows: Option<Vec<Cow<'_, Row>>> = stmt
3883            .where_
3884            .as_ref()
3885            .and_then(|w| try_index_seek(w, schema_cols, self.active_catalog(), table, alias));
3886
3887        // Aggregate path: filter rows first, then hand off to the
3888        // aggregate executor which does its own projection + ORDER BY.
3889        if aggregate::uses_aggregate(stmt) {
3890            let mut filtered: Vec<&Row> = Vec::new();
3891            // v6.2.6 — Memoize: per-query LRU cache for correlated
3892            // scalar subqueries. Fresh per row-loop entry so each
3893            // SELECT execution gets an isolated cache.
3894            let mut memo = memoize::MemoizeCache::new();
3895            if let Some(rows) = &indexed_rows {
3896                for cow in rows {
3897                    let row = cow.as_ref();
3898                    if let Some(where_expr) = &stmt.where_ {
3899                        let cond = self.eval_expr_with_correlated(
3900                            where_expr,
3901                            row,
3902                            &ctx,
3903                            cancel,
3904                            Some(&mut memo),
3905                        )?;
3906                        if !matches!(cond, Value::Bool(true)) {
3907                            continue;
3908                        }
3909                    }
3910                    filtered.push(row);
3911                }
3912            } else {
3913                for i in 0..table.row_count() {
3914                    let row = &table.rows()[i];
3915                    if let Some(where_expr) = &stmt.where_ {
3916                        let cond = self.eval_expr_with_correlated(
3917                            where_expr,
3918                            row,
3919                            &ctx,
3920                            cancel,
3921                            Some(&mut memo),
3922                        )?;
3923                        if !matches!(cond, Value::Bool(true)) {
3924                            continue;
3925                        }
3926                    }
3927                    filtered.push(row);
3928                }
3929            }
3930            let mut agg = aggregate::run(stmt, &filtered, schema_cols, Some(alias))?;
3931            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
3932            return Ok(QueryResult::Rows {
3933                columns: agg.columns,
3934                rows: agg.rows,
3935            });
3936        }
3937
3938        let projection = build_projection(&stmt.items, schema_cols, alias)?;
3939
3940        // Materialise the filter pass into `(order_key, projected_row)`
3941        // tuples. The order key is `None` when there's no ORDER BY clause.
3942        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
3943        // v6.2.6 — Memoize per-row WHERE eval shares one cache.
3944        let mut memo = memoize::MemoizeCache::new();
3945        // Inline the per-row work in a closure so the indexed and full-
3946        // scan branches share the body.
3947        let mut process_row = |row: &Row, loop_idx: usize| -> Result<(), EngineError> {
3948            if loop_idx.is_multiple_of(256) {
3949                cancel.check()?;
3950            }
3951            if let Some(where_expr) = &stmt.where_ {
3952                let cond = self.eval_expr_with_correlated(
3953                    where_expr,
3954                    row,
3955                    &ctx,
3956                    cancel,
3957                    Some(&mut memo),
3958                )?;
3959                if !matches!(cond, Value::Bool(true)) {
3960                    return Ok(());
3961                }
3962            }
3963            let mut values = Vec::with_capacity(projection.len());
3964            for p in &projection {
3965                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
3966            }
3967            let order_keys = if stmt.order_by.is_empty() {
3968                Vec::new()
3969            } else {
3970                build_order_keys(&stmt.order_by, row, &ctx)?
3971            };
3972            tagged.push((order_keys, Row::new(values)));
3973            Ok(())
3974        };
3975        if let Some(rows) = &indexed_rows {
3976            for (loop_idx, cow) in rows.iter().enumerate() {
3977                process_row(cow.as_ref(), loop_idx)?;
3978            }
3979        } else {
3980            for i in 0..table.row_count() {
3981                process_row(&table.rows()[i], i)?;
3982            }
3983        }
3984
3985        if !stmt.order_by.is_empty() {
3986            // Partial-sort fast path: when LIMIT is small relative to
3987            // the row count, select_nth_unstable + sort just the
3988            // prefix is O(n + k log k) instead of O(n log n). DISTINCT
3989            // requires the full sort because de-dup happens after.
3990            let keep = if stmt.distinct {
3991                None
3992            } else {
3993                stmt.limit_literal()
3994                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
3995            };
3996            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
3997            partial_sort_tagged(&mut tagged, keep, &descs);
3998        }
3999
4000        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4001        if stmt.distinct {
4002            output_rows = dedup_rows(output_rows);
4003        }
4004        apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4005
4006        let columns: Vec<ColumnSchema> = projection
4007            .into_iter()
4008            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4009            .collect();
4010
4011        Ok(QueryResult::Rows {
4012            columns,
4013            rows: output_rows,
4014        })
4015    }
4016
4017    /// Multi-table SELECT executor (one or more JOIN peers).
4018    ///
4019    /// v1.10 builds the joined row set up-front via nested-loop joins,
4020    /// then runs WHERE + projection + ORDER BY against the combined
4021    /// rows. No index seek. Aggregates and DISTINCT still work because
4022    /// the executor delegates projection through the same shared paths.
4023    #[allow(clippy::too_many_lines)]
4024    fn exec_joined_select(
4025        &self,
4026        stmt: &SelectStatement,
4027        from: &FromClause,
4028    ) -> Result<QueryResult, EngineError> {
4029        // Resolve every table reference up front so we surface
4030        // TableNotFound before we start the cartesian work.
4031        let primary_table = self
4032            .active_catalog()
4033            .get(&from.primary.name)
4034            .ok_or_else(|| StorageError::TableNotFound {
4035                name: from.primary.name.clone(),
4036            })?;
4037        let primary_alias = from
4038            .primary
4039            .alias
4040            .as_deref()
4041            .unwrap_or(from.primary.name.as_str())
4042            .to_string();
4043        let mut joined_tables: Vec<(&Table, String, JoinKind, Option<&Expr>)> = Vec::new();
4044        for j in &from.joins {
4045            let t = self.active_catalog().get(&j.table.name).ok_or_else(|| {
4046                StorageError::TableNotFound {
4047                    name: j.table.name.clone(),
4048                }
4049            })?;
4050            let a = j
4051                .table
4052                .alias
4053                .as_deref()
4054                .unwrap_or(j.table.name.as_str())
4055                .to_string();
4056            joined_tables.push((t, a, j.kind, j.on.as_ref()));
4057        }
4058
4059        // Build the combined schema: composite "alias.col" names so the
4060        // qualified-column resolver can find anything by exact match.
4061        let mut combined_schema: Vec<ColumnSchema> = Vec::new();
4062        for col in &primary_table.schema().columns {
4063            combined_schema.push(ColumnSchema::new(
4064                alloc::format!("{primary_alias}.{}", col.name),
4065                col.ty,
4066                col.nullable,
4067            ));
4068        }
4069        for (t, a, _, _) in &joined_tables {
4070            for col in &t.schema().columns {
4071                combined_schema.push(ColumnSchema::new(
4072                    alloc::format!("{a}.{}", col.name),
4073                    col.ty,
4074                    col.nullable,
4075                ));
4076            }
4077        }
4078        let ctx = EvalContext::new(&combined_schema, None);
4079
4080        // Nested-loop join. Starting set: every primary row, padded with
4081        // (no joined columns yet).
4082        let mut working: Vec<Row> = primary_table.rows().iter().cloned().collect();
4083        let mut produced_len = primary_table.schema().columns.len();
4084        for (t, _, kind, on) in &joined_tables {
4085            let right_arity = t.schema().columns.len();
4086            let mut next: Vec<Row> = Vec::new();
4087            for left in &working {
4088                let mut left_matched = false;
4089                for right in t.rows() {
4090                    let mut combined_vals = left.values.clone();
4091                    combined_vals.extend(right.values.iter().cloned());
4092                    // Pad combined to the eventual full width so the
4093                    // partial schema still matches positions used by ON.
4094                    let combined = Row::new(combined_vals);
4095                    let keep = if let Some(on_expr) = on {
4096                        let cond = eval::eval_expr(on_expr, &combined, &ctx)?;
4097                        matches!(cond, Value::Bool(true))
4098                    } else {
4099                        // CROSS / comma-list: every pair survives.
4100                        true
4101                    };
4102                    if keep {
4103                        next.push(combined);
4104                        left_matched = true;
4105                    }
4106                }
4107                if !left_matched && matches!(kind, JoinKind::Left) {
4108                    // LEFT OUTER JOIN: emit the left row with NULLs on
4109                    // the right side when no peer matched.
4110                    let mut combined_vals = left.values.clone();
4111                    for _ in 0..right_arity {
4112                        combined_vals.push(Value::Null);
4113                    }
4114                    next.push(Row::new(combined_vals));
4115                }
4116            }
4117            working = next;
4118            produced_len += right_arity;
4119            debug_assert!(produced_len <= combined_schema.len());
4120        }
4121
4122        // WHERE filter against combined rows.
4123        let mut filtered: Vec<Row> = Vec::new();
4124        for row in working {
4125            if let Some(where_expr) = &stmt.where_ {
4126                let cond = eval::eval_expr(where_expr, &row, &ctx)?;
4127                if !matches!(cond, Value::Bool(true)) {
4128                    continue;
4129                }
4130            }
4131            filtered.push(row);
4132        }
4133
4134        // Aggregate path: handle GROUP BY / aggregate calls over the
4135        // joined+filtered rows.
4136        if aggregate::uses_aggregate(stmt) {
4137            let refs: Vec<&Row> = filtered.iter().collect();
4138            let mut agg = aggregate::run(stmt, &refs, &combined_schema, None)?;
4139            apply_offset_and_limit(&mut agg.rows, stmt.offset_literal(), stmt.limit_literal());
4140            return Ok(QueryResult::Rows {
4141                columns: agg.columns,
4142                rows: agg.rows,
4143            });
4144        }
4145
4146        let projection = build_projection(&stmt.items, &combined_schema, "")?;
4147        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::new();
4148        for row in &filtered {
4149            let mut values = Vec::with_capacity(projection.len());
4150            for p in &projection {
4151                values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4152            }
4153            let order_keys = if stmt.order_by.is_empty() {
4154                Vec::new()
4155            } else {
4156                build_order_keys(&stmt.order_by, row, &ctx)?
4157            };
4158            tagged.push((order_keys, Row::new(values)));
4159        }
4160        if !stmt.order_by.is_empty() {
4161            let keep = if stmt.distinct {
4162                None
4163            } else {
4164                stmt.limit_literal()
4165                    .map(|l| l as usize + stmt.offset_literal().map_or(0, |o| o as usize))
4166            };
4167            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4168            partial_sort_tagged(&mut tagged, keep, &descs);
4169        }
4170        let mut output_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4171        if stmt.distinct {
4172            output_rows = dedup_rows(output_rows);
4173        }
4174        apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4175        let columns: Vec<ColumnSchema> = projection
4176            .into_iter()
4177            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4178            .collect();
4179        Ok(QueryResult::Rows {
4180            columns,
4181            rows: output_rows,
4182        })
4183    }
4184}
4185
/// One row-producing projection: an expression to evaluate, the resulting
/// column's user-visible name, its inferred type, and nullability.
#[derive(Debug, Clone)]
struct ProjectedItem {
    /// Expression evaluated against each source row to produce the output value.
    expr: Expr,
    /// Column name reported in the result set.
    output_name: String,
    /// Type reported for the output column.
    ty: DataType,
    /// Whether the output column is reported as nullable.
    nullable: bool,
}
4195
4196/// Dedupe a row set, preserving first-seen order. `Row`'s `PartialEq` is
4197/// structural (`Vec<Value>` ⇒ pairwise `Value` equality), which gives SQL
4198/// `NULL = NULL → TRUE` and `NaN = NaN → FALSE`. The first agrees with
4199/// the spec's "two NULLs are not distinct"; the second is a tolerated
4200/// quirk for v1 (no NaN literals are reachable from the SQL surface).
4201fn dedup_rows(rows: Vec<Row>) -> Vec<Row> {
4202    let mut out: Vec<Row> = Vec::with_capacity(rows.len());
4203    for r in rows {
4204        if !out.iter().any(|seen| seen == &r) {
4205            out.push(r);
4206        }
4207    }
4208    out
4209}
4210
4211/// Coerce a `Value` to an `f64` sort key for ORDER BY. Numbers map directly;
4212/// NULL sorts last (treated as `+∞`); booleans are 0.0 / 1.0; text uses lex
4213/// order via the byte values; vectors are not sortable.
4214fn value_to_order_key(v: &Value) -> Result<f64, EngineError> {
4215    match v {
4216        Value::Null => Ok(f64::INFINITY),
4217        Value::SmallInt(n) => Ok(f64::from(*n)),
4218        Value::Int(n) => Ok(f64::from(*n)),
4219        Value::Date(d) => Ok(f64::from(*d)),
4220        #[allow(clippy::cast_precision_loss)]
4221        Value::Timestamp(t) => Ok(*t as f64),
4222        #[allow(clippy::cast_precision_loss)]
4223        Value::Numeric { scaled, scale } => {
4224            // Scaled integer / 10^scale, computed via f64 for sort
4225            // ordering only. Precision losses here only matter for
4226            // ORDER BY tie-breaks well past 15 significant digits.
4227            // `f64::powi` lives in std; we hand-roll the loop so the
4228            // no_std engine crate doesn't need it.
4229            let mut divisor = 1.0_f64;
4230            for _ in 0..*scale {
4231                divisor *= 10.0;
4232            }
4233            Ok((*scaled as f64) / divisor)
4234        }
4235        #[allow(clippy::cast_precision_loss)]
4236        Value::BigInt(n) => Ok(*n as f64),
4237        Value::Float(x) => Ok(*x),
4238        Value::Bool(b) => Ok(if *b { 1.0 } else { 0.0 }),
4239        Value::Text(s) => {
4240            // Lex order by codepoints — good enough for ORDER BY name.
4241            // Map first 8 bytes packed into u64 as a coarse key; ties fall to
4242            // partial_cmp Equal. v1.x can swap in a real string comparator.
4243            let mut key: u64 = 0;
4244            for &b in s.as_bytes().iter().take(8) {
4245                key = (key << 8) | u64::from(b);
4246            }
4247            #[allow(clippy::cast_precision_loss)]
4248            Ok(key as f64)
4249        }
4250        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
4251            Err(EngineError::Unsupported(
4252                "ORDER BY of a raw vector column is not meaningful — use `<->`".into(),
4253            ))
4254        }
4255        Value::Interval { .. } => Err(EngineError::Unsupported(
4256            "ORDER BY of an INTERVAL is not supported in v2.11 \
4257             (months vs micros has no single canonical ordering)"
4258                .into(),
4259        )),
4260        Value::Json(_) => Err(EngineError::Unsupported(
4261            "ORDER BY of a JSON value is not supported — cast the document to text first".into(),
4262        )),
4263        // v7.5.0 — Value is #[non_exhaustive]; future variants need
4264        // an explicit ORDER BY mapping. Surface as Unsupported until
4265        // engine support is added.
4266        _ => Err(EngineError::Unsupported(
4267            "ORDER BY of this value type is not supported".into(),
4268        )),
4269    }
4270}
4271
4272/// Try to plan a WHERE clause as an equality lookup against an existing
4273/// index. Returns the candidate row indices on success; `None` means the
4274/// caller should fall back to a full scan.
4275///
4276/// v0.8 recognises a single top-level `col = literal` (in either operand
4277/// order). AND chains and range scans land in later milestones.
4278/// Look for `ORDER BY col <dist-op> literal LIMIT k` against an
4279/// NSW-indexed vector column. Recognised distance ops: `<->` (L2),
4280/// `<#>` (inner product), `<=>` (cosine). When a WHERE clause is
4281/// present, the planner does an "over-fetch and filter" pass — it
4282/// asks the graph for `k * over_fetch` candidates, evaluates WHERE
4283/// against each, and trims back to `k`. Returns the row indices in
4284/// ascending-distance order when the plan applies.
4285fn try_nsw_knn(
4286    stmt: &SelectStatement,
4287    table: &Table,
4288    schema_cols: &[ColumnSchema],
4289    table_alias: &str,
4290) -> Option<Vec<usize>> {
4291    if stmt.distinct {
4292        return None;
4293    }
4294    let limit = usize::try_from(stmt.limit_literal()?).ok()?;
4295    if limit == 0 {
4296        return None;
4297    }
4298    // v6.4.0 — NSW kNN dispatch needs a single ORDER BY key on the
4299    // distance metric. Multi-key ORDER BY falls through to the
4300    // generic sort path.
4301    if stmt.order_by.len() != 1 {
4302        return None;
4303    }
4304    let order = &stmt.order_by[0];
4305    // NSW kNN returns rows ascending by distance — DESC inverts the
4306    // natural order, so the planner can't handle it without a sort
4307    // pass. Fall back to the generic ORDER BY path.
4308    if order.desc {
4309        return None;
4310    }
4311    let Expr::Binary { lhs, op, rhs } = &order.expr else {
4312        return None;
4313    };
4314    let metric = match op {
4315        BinOp::L2Distance => spg_storage::NswMetric::L2,
4316        BinOp::InnerProduct => spg_storage::NswMetric::InnerProduct,
4317        BinOp::CosineDistance => spg_storage::NswMetric::Cosine,
4318        _ => return None,
4319    };
4320    // Accept both `col <op> literal` and `literal <op> col`.
4321    let ((Expr::Column(col), literal) | (literal, Expr::Column(col))) =
4322        (lhs.as_ref(), rhs.as_ref())
4323    else {
4324        return None;
4325    };
4326    if let Some(q) = &col.qualifier
4327        && q != table_alias
4328    {
4329        return None;
4330    }
4331    let col_pos = schema_cols.iter().position(|s| s.name == col.name)?;
4332    let query = literal_to_vector(literal)?;
4333    let idx = spg_storage::nsw_index_on(table, col_pos)?;
4334    if let Some(where_expr) = &stmt.where_ {
4335        // Over-fetch and filter. The factor (10×) is a heuristic that
4336        // covers typical selectivity for the corpus tests; v2.x will
4337        // make it configurable.
4338        let over_fetch = limit.saturating_mul(10).max(NSW_OVER_FETCH_FLOOR);
4339        let candidates = spg_storage::nsw_query(table, &idx.name, &query, over_fetch, metric);
4340        let ctx = EvalContext::new(schema_cols, Some(table_alias));
4341        let mut kept: Vec<usize> = Vec::with_capacity(limit);
4342        for i in candidates {
4343            let row = &table.rows()[i];
4344            let cond = eval::eval_expr(where_expr, row, &ctx).ok()?;
4345            if matches!(cond, Value::Bool(true)) {
4346                kept.push(i);
4347                if kept.len() >= limit {
4348                    break;
4349                }
4350            }
4351        }
4352        Some(kept)
4353    } else {
4354        Some(spg_storage::nsw_query(
4355            table, &idx.name, &query, limit, metric,
4356        ))
4357    }
4358}
4359
/// Lower bound on the over-fetch pool when WHERE is present — even
/// for tiny `LIMIT 1` queries we keep enough candidates to absorb a
/// few WHERE rejections. `try_nsw_knn` takes the larger of this value
/// and its 10× over-fetch estimate as the candidate count it requests
/// from the NSW graph.
const NSW_OVER_FETCH_FLOOR: usize = 32;
4364
4365/// Pull a `Vec<f32>` out of a literal-or-cast expression. Returns
4366/// `None` for anything we can't fold at plan time.
4367fn literal_to_vector(e: &Expr) -> Option<Vec<f32>> {
4368    match e {
4369        Expr::Literal(Literal::Vector(v)) => Some(v.clone()),
4370        Expr::Cast { expr, .. } => literal_to_vector(expr),
4371        _ => None,
4372    }
4373}
4374
4375/// Materialise rows in a planner-supplied order (used by the NSW path)
4376/// without re-running ORDER BY. The projection + LIMIT slot mirror the
4377/// equivalent block in `exec_bare_select`.
4378fn materialise_in_order(
4379    stmt: &SelectStatement,
4380    table: &Table,
4381    schema_cols: &[ColumnSchema],
4382    table_alias: &str,
4383    ordered_rows: &[usize],
4384) -> Result<QueryResult, EngineError> {
4385    let ctx = EvalContext::new(schema_cols, Some(table_alias));
4386    let projection = build_projection(&stmt.items, schema_cols, table_alias)?;
4387    let mut output_rows: Vec<Row> = Vec::with_capacity(ordered_rows.len());
4388    for &i in ordered_rows {
4389        let row = &table.rows()[i];
4390        let mut values = Vec::with_capacity(projection.len());
4391        for p in &projection {
4392            values.push(eval::eval_expr(&p.expr, row, &ctx)?);
4393        }
4394        output_rows.push(Row::new(values));
4395    }
4396    apply_offset_and_limit(&mut output_rows, stmt.offset_literal(), stmt.limit_literal());
4397    let columns: Vec<ColumnSchema> = projection
4398        .into_iter()
4399        .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4400        .collect();
4401    Ok(QueryResult::Rows {
4402        columns,
4403        rows: output_rows,
4404    })
4405}
4406
4407fn try_index_seek<'a>(
4408    where_expr: &Expr,
4409    schema_cols: &[ColumnSchema],
4410    catalog: &'a Catalog,
4411    table: &'a Table,
4412    table_alias: &str,
4413) -> Option<Vec<Cow<'a, Row>>> {
4414    let Expr::Binary {
4415        lhs,
4416        op: BinOp::Eq,
4417        rhs,
4418    } = where_expr
4419    else {
4420        return None;
4421    };
4422    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4423        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4424    let idx = table.index_on(col_pos)?;
4425    let key = IndexKey::from_value(&value)?;
4426    let locators = idx.lookup_eq(&key);
4427    let table_name = table.schema().name.as_str();
4428    // v5.1: each locator dispatches to either the hot tier (zero-
4429    // copy borrow of `table.rows()[i]`) or a cold-tier segment
4430    // (one page read + dense row decode, ~µs scale). Cold rows are
4431    // returned as `Cow::Owned` so the caller's `&Row` iteration
4432    // doesn't see a tier distinction; pre-freezer (no cold
4433    // segments loaded) every locator is `Hot` and every entry is
4434    // `Cow::Borrowed` — identical cost to the pre-v5.1 path.
4435    let mut out: Vec<Cow<'a, Row>> = Vec::with_capacity(locators.len());
4436    for loc in locators {
4437        match *loc {
4438            spg_storage::RowLocator::Hot(i) => {
4439                if let Some(row) = table.rows().get(i) {
4440                    out.push(Cow::Borrowed(row));
4441                }
4442            }
4443            spg_storage::RowLocator::Cold { segment_id, .. } => {
4444                if let Some(row) = catalog.resolve_cold_locator(table_name, segment_id, &key) {
4445                    out.push(Cow::Owned(row));
4446                }
4447            }
4448        }
4449    }
4450    Some(out)
4451}
4452
4453/// v5.2.3: extract `(column_position, IndexKey)` when `where_expr`
4454/// is a simple `col = literal` predicate suitable for a `BTree` index
4455/// seek. Used by `exec_update_cancel` / `exec_delete_cancel` to
4456/// decide whether a write touches a cold-tier row (which requires
4457/// promote-on-write / shadow-on-delete) before falling through to
4458/// the hot-tier row walk.
4459///
4460/// Returns `None` for any predicate shape the planner can't push
4461/// down to an index seek — complex WHERE clauses always take the
4462/// hot-only path (cold rows are immutable to non-indexed writes
4463/// until a future scan-fanout sub-version).
4464fn try_pk_predicate(
4465    where_expr: &Expr,
4466    schema_cols: &[ColumnSchema],
4467    table_alias: &str,
4468) -> Option<(usize, IndexKey)> {
4469    let Expr::Binary {
4470        lhs,
4471        op: BinOp::Eq,
4472        rhs,
4473    } = where_expr
4474    else {
4475        return None;
4476    };
4477    let (col_pos, value) = resolve_col_literal_pair(lhs, rhs, schema_cols, table_alias)
4478        .or_else(|| resolve_col_literal_pair(rhs, lhs, schema_cols, table_alias))?;
4479    let key = IndexKey::from_value(&value)?;
4480    Some((col_pos, key))
4481}
4482
4483fn resolve_col_literal_pair(
4484    col_side: &Expr,
4485    lit_side: &Expr,
4486    schema_cols: &[ColumnSchema],
4487    table_alias: &str,
4488) -> Option<(usize, Value)> {
4489    let Expr::Column(c) = col_side else {
4490        return None;
4491    };
4492    if let Some(q) = &c.qualifier
4493        && q != table_alias
4494    {
4495        return None;
4496    }
4497    let pos = schema_cols.iter().position(|s| s.name == c.name)?;
4498    let Expr::Literal(l) = lit_side else {
4499        return None;
4500    };
4501    let v = match l {
4502        Literal::Integer(n) => {
4503            if let Ok(small) = i32::try_from(*n) {
4504                Value::Int(small)
4505            } else {
4506                Value::BigInt(*n)
4507            }
4508        }
4509        Literal::Float(x) => Value::Float(*x),
4510        Literal::String(s) => Value::Text(s.clone()),
4511        Literal::Bool(b) => Value::Bool(*b),
4512        Literal::Null => Value::Null,
4513        // Vector and Interval literals can't be used as B-tree index keys.
4514        // Tell the planner to fall back to full-scan.
4515        Literal::Vector(_) | Literal::Interval { .. } => return None,
4516    };
4517    Some((pos, v))
4518}
4519
4520/// Find the schema entry that a SELECT-list `Expr::Column` refers to.
4521/// Mirrors `resolve_column` in `eval.rs`, but returns a proper
4522/// `EngineError` so the projection-build path keeps `UnknownQualifier`
4523/// vs `ColumnNotFound` distinct.
4524fn resolve_projection_column<'a>(
4525    c: &ColumnName,
4526    schema_cols: &'a [ColumnSchema],
4527    table_alias: &str,
4528) -> Result<&'a ColumnSchema, EngineError> {
4529    if let Some(q) = &c.qualifier {
4530        let composite = alloc::format!("{q}.{name}", name = c.name);
4531        if let Some(s) = schema_cols.iter().find(|s| s.name == composite) {
4532            return Ok(s);
4533        }
4534        // Single-table case: the qualifier may equal the active alias —
4535        // then look for the bare column name.
4536        if q == table_alias
4537            && let Some(s) = schema_cols.iter().find(|s| s.name == c.name)
4538        {
4539            return Ok(s);
4540        }
4541        // For multi-table schemas the qualifier is unknown only if no
4542        // column bears the "<q>." prefix. For single-table, the alias
4543        // mismatch alone is enough.
4544        let prefix = alloc::format!("{q}.");
4545        let qualifier_known =
4546            q == table_alias || schema_cols.iter().any(|s| s.name.starts_with(&prefix));
4547        if !qualifier_known {
4548            return Err(EngineError::Eval(EvalError::UnknownQualifier {
4549                qualifier: q.clone(),
4550            }));
4551        }
4552        return Err(EngineError::Eval(EvalError::ColumnNotFound {
4553            name: c.name.clone(),
4554        }));
4555    }
4556    if let Some(s) = schema_cols.iter().find(|s| s.name == c.name) {
4557        return Ok(s);
4558    }
4559    let suffix = alloc::format!(".{name}", name = c.name);
4560    let mut matches = schema_cols.iter().filter(|s| s.name.ends_with(&suffix));
4561    let first = matches.next();
4562    let extra = matches.next();
4563    match (first, extra) {
4564        (Some(s), None) => Ok(s),
4565        (Some(_), Some(_)) => Err(EngineError::Eval(EvalError::TypeMismatch {
4566            detail: alloc::format!("ambiguous column reference: {}", c.name),
4567        })),
4568        _ => Err(EngineError::Eval(EvalError::ColumnNotFound {
4569            name: c.name.clone(),
4570        })),
4571    }
4572}
4573
4574fn build_projection(
4575    items: &[SelectItem],
4576    schema_cols: &[ColumnSchema],
4577    table_alias: &str,
4578) -> Result<Vec<ProjectedItem>, EngineError> {
4579    let mut out = Vec::new();
4580    for item in items {
4581        match item {
4582            SelectItem::Wildcard => {
4583                for col in schema_cols {
4584                    out.push(ProjectedItem {
4585                        expr: Expr::Column(ColumnName {
4586                            qualifier: None,
4587                            name: col.name.clone(),
4588                        }),
4589                        output_name: col.name.clone(),
4590                        ty: col.ty,
4591                        nullable: col.nullable,
4592                    });
4593                }
4594            }
4595            SelectItem::Expr { expr, alias } => {
4596                // Plain column ref keeps full schema info (real type +
4597                // nullability). Compound expressions evaluate fine but have
4598                // no static type — surface them as nullable TEXT, which is
4599                // what most clients render anyway.
4600                if let Expr::Column(c) = expr {
4601                    let sch = resolve_projection_column(c, schema_cols, table_alias)?;
4602                    let output_name = alias.clone().unwrap_or_else(|| c.name.clone());
4603                    out.push(ProjectedItem {
4604                        expr: expr.clone(),
4605                        output_name,
4606                        ty: sch.ty,
4607                        nullable: sch.nullable,
4608                    });
4609                } else {
4610                    let output_name = alias.clone().unwrap_or_else(|| expr.to_string());
4611                    out.push(ProjectedItem {
4612                        expr: expr.clone(),
4613                        output_name,
4614                        ty: DataType::Text,
4615                        nullable: true,
4616                    });
4617                }
4618            }
4619        }
4620    }
4621    Ok(out)
4622}
4623
4624/// Promote an integer to a NUMERIC value at the requested scale.
4625/// Rejects values that, after scaling, would overflow the column's
4626/// precision budget.
4627fn numeric_from_integer(
4628    n: i128,
4629    precision: u8,
4630    scale: u8,
4631    col_name: &str,
4632) -> Result<Value, EngineError> {
4633    let factor = pow10_i128(scale);
4634    let scaled = n.checked_mul(factor).ok_or_else(|| {
4635        EngineError::Unsupported(alloc::format!(
4636            "integer overflow scaling value for column `{col_name}` to scale {scale}"
4637        ))
4638    })?;
4639    check_precision(scaled, precision, col_name)?;
4640    Ok(Value::Numeric { scaled, scale })
4641}
4642
4643/// Float → NUMERIC. Uses round-half-away-from-zero on `x * 10^scale`,
4644/// then verifies the result fits the column's precision.
4645#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
4646fn numeric_from_float(
4647    x: f64,
4648    precision: u8,
4649    scale: u8,
4650    col_name: &str,
4651) -> Result<Value, EngineError> {
4652    if !x.is_finite() {
4653        return Err(EngineError::Unsupported(alloc::format!(
4654            "cannot store non-finite float in NUMERIC column `{col_name}`"
4655        )));
4656    }
4657    let mut factor = 1.0_f64;
4658    for _ in 0..scale {
4659        factor *= 10.0;
4660    }
4661    // Round half-away-from-zero by biasing then casting (`as i128`
4662    // truncates toward zero, so the bias + truncation gives the
4663    // desired rounding). `f64::floor` / `ceil` live in std; we don't
4664    // need them — the cast handles the truncation step.
4665    let shifted = x * factor;
4666    let biased = if shifted >= 0.0 {
4667        shifted + 0.5
4668    } else {
4669        shifted - 0.5
4670    };
4671    // Range-check before casting back to i128 — the cast itself is
4672    // saturating in Rust, which would silently truncate huge inputs.
4673    if !(-1e38..=1e38).contains(&biased) {
4674        return Err(EngineError::Unsupported(alloc::format!(
4675            "value {x} overflows NUMERIC range for column `{col_name}`"
4676        )));
4677    }
4678    let scaled = biased as i128;
4679    check_precision(scaled, precision, col_name)?;
4680    Ok(Value::Numeric { scaled, scale })
4681}
4682
4683/// Move a Numeric value from `src_scale` to `dst_scale`. Going up
4684/// multiplies by 10; going down rounds half-away-from-zero.
4685fn numeric_rescale(
4686    scaled: i128,
4687    src_scale: u8,
4688    precision: u8,
4689    dst_scale: u8,
4690    col_name: &str,
4691) -> Result<Value, EngineError> {
4692    let new_scaled = if dst_scale >= src_scale {
4693        let bump = pow10_i128(dst_scale - src_scale);
4694        scaled.checked_mul(bump).ok_or_else(|| {
4695            EngineError::Unsupported(alloc::format!(
4696                "overflow rescaling NUMERIC for column `{col_name}`"
4697            ))
4698        })?
4699    } else {
4700        let drop = pow10_i128(src_scale - dst_scale);
4701        let half = drop / 2;
4702        if scaled >= 0 {
4703            (scaled + half) / drop
4704        } else {
4705            (scaled - half) / drop
4706        }
4707    };
4708    check_precision(new_scaled, precision, col_name)?;
4709    Ok(Value::Numeric {
4710        scaled: new_scaled,
4711        scale: dst_scale,
4712    })
4713}
4714
/// Drop the fractional part of a scaled integer, returning the integer
/// portion (toward zero). Used for NUMERIC → INT casts.
///
/// `scaled` is the value multiplied by `10^scale`. For `scale >= 39` the
/// power of ten no longer fits in an `i128`; every representable `scaled`
/// is then smaller in magnitude than the divisor, so the result is `0`.
const fn numeric_truncate_to_integer(scaled: i128, scale: u8) -> i128 {
    // `as` is a lossless u8 → u32 widening; `u32::from` cannot be called in
    // a `const fn`.
    #[allow(clippy::cast_lossless)]
    let exponent = scale as u32;
    match 10_i128.checked_pow(exponent) {
        Some(factor) => scaled / factor,
        None => 0,
    }
}
4724
4725/// Verify a scaled NUMERIC value fits the column's declared precision.
4726/// `precision == 0` is the "unconstrained" form (bare `NUMERIC`); we
4727/// skip the check there.
4728fn check_precision(scaled: i128, precision: u8, col_name: &str) -> Result<(), EngineError> {
4729    if precision == 0 {
4730        return Ok(());
4731    }
4732    let limit = pow10_i128(precision);
4733    if scaled.unsigned_abs() >= limit.unsigned_abs() {
4734        return Err(EngineError::Unsupported(alloc::format!(
4735            "NUMERIC value exceeds precision {precision} for column `{col_name}`"
4736        )));
4737    }
4738    Ok(())
4739}
4740
/// `10^p` as an `i128`, usable in `const` contexts.
///
/// The result is only representable for `p <= 38`; larger exponents
/// overflow `i128` (a panic when overflow checks are enabled, a wrapped
/// value otherwise), so callers must keep `p` within that range.
const fn pow10_i128_const(p: u8) -> i128 {
    // `as` is a lossless u8 → u32 widening; `u32::from` cannot be called in
    // a `const fn`.
    #[allow(clippy::cast_lossless)]
    let exponent = p as u32;
    10_i128.pow(exponent)
}
4750
/// Non-`const` entry point for `10^p`; forwards directly to
/// `pow10_i128_const` and returns its result unchanged.
fn pow10_i128(p: u8) -> i128 {
    pow10_i128_const(p)
}
4754
4755/// Walk a parsed `Statement`, swapping any `NOW()` /
4756/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()` function calls for a
4757/// literal cast that wraps the engine's per-statement clock reading.
4758/// When `now_micros` is `None`, calls stay as-is and surface as
4759/// `unknown function` at eval time — keeps the error path explicit.
4760/// v4.10: pre-walk the WHERE / projection / etc. of a SELECT and
4761/// replace every subquery node with a materialised literal. SPG
4762/// only supports uncorrelated subqueries — the inner SELECT does
4763/// not see outer-row columns, so the result is the same for every
4764/// outer row and can be evaluated once.
4765///
4766/// Returns the rewritten statement; the caller passes this to the
4767/// regular row-loop executor which no longer sees Subquery nodes
4768/// in its tree.
4769impl Engine {
4770    /// v4.12 window executor. Implements `ROW_NUMBER` / `RANK` /
4771    /// `DENSE_RANK` and the partition-aware aggregates `SUM` /
4772    /// `AVG` / `COUNT` / `MIN` / `MAX`. The plan is:
4773    /// 1. Apply the WHERE filter.
4774    /// 2. For each unique `WindowFunction` node in the projection,
4775    ///    partition + sort, compute the per-row value.
4776    /// 3. Append the window values as synthetic columns (`__win_N`)
4777    ///    to the row schema.
4778    /// 4. Rewrite the projection to read those columns.
4779    /// 5. Hand off to the regular project / ORDER BY / LIMIT pipe.
4780    #[allow(
4781        clippy::too_many_lines,
4782        clippy::type_complexity,
4783        clippy::needless_range_loop
4784    )] // window-eval is one cohesive pipe; splitting fragments
4785    fn exec_select_with_window(
4786        &self,
4787        stmt: &SelectStatement,
4788        cancel: CancelToken<'_>,
4789    ) -> Result<QueryResult, EngineError> {
4790        let from = stmt.from.as_ref().ok_or_else(|| {
4791            EngineError::Unsupported("window functions require a FROM clause".into())
4792        })?;
4793        // For v4.12 we only support a single-table FROM. Joins +
4794        // windows is queued for v5.x.
4795        if !from.joins.is_empty() {
4796            return Err(EngineError::Unsupported(
4797                "JOIN with window functions not yet supported".into(),
4798            ));
4799        }
4800        let primary = &from.primary;
4801        let table = self.active_catalog().get(&primary.name).ok_or_else(|| {
4802            StorageError::TableNotFound {
4803                name: primary.name.clone(),
4804            }
4805        })?;
4806        let alias = primary.alias.as_deref().unwrap_or(primary.name.as_str());
4807        let schema_cols = &table.schema().columns;
4808        let ctx = EvalContext::new(schema_cols, Some(alias));
4809
4810        // 1) Filter pass.
4811        let mut filtered: Vec<&Row> = Vec::new();
4812        for (i, row) in table.rows().iter().enumerate() {
4813            if i.is_multiple_of(256) {
4814                cancel.check()?;
4815            }
4816            if let Some(w) = &stmt.where_ {
4817                let cond = eval::eval_expr(w, row, &ctx)?;
4818                if !matches!(cond, Value::Bool(true)) {
4819                    continue;
4820                }
4821            }
4822            filtered.push(row);
4823        }
4824        let n_rows = filtered.len();
4825
4826        // 2) Collect unique window function nodes from projection.
4827        let mut window_nodes: Vec<Expr> = Vec::new();
4828        for item in &stmt.items {
4829            if let SelectItem::Expr { expr, .. } = item {
4830                collect_window_nodes(expr, &mut window_nodes);
4831            }
4832        }
4833
4834        // 3) For each window, compute per-row value.
4835        // Index: same order as window_nodes; for row i, win_vals[w][i].
4836        let mut win_vals: Vec<Vec<Value>> = Vec::with_capacity(window_nodes.len());
4837        for wnode in &window_nodes {
4838            let Expr::WindowFunction {
4839                name,
4840                args,
4841                partition_by,
4842                order_by,
4843                frame,
4844                null_treatment,
4845            } = wnode
4846            else {
4847                unreachable!("collect_window_nodes pushes only WindowFunction");
4848            };
4849            // Compute (partition_key, order_key, original_index) for each row.
4850            let mut indexed: Vec<(Vec<Value>, Vec<(Value, bool)>, usize)> =
4851                Vec::with_capacity(n_rows);
4852            for (i, row) in filtered.iter().enumerate() {
4853                let pkey: Vec<Value> = partition_by
4854                    .iter()
4855                    .map(|p| eval::eval_expr(p, row, &ctx))
4856                    .collect::<Result<_, _>>()?;
4857                let okey: Vec<(Value, bool)> = order_by
4858                    .iter()
4859                    .map(|(e, desc)| eval::eval_expr(e, row, &ctx).map(|v| (v, *desc)))
4860                    .collect::<Result<_, _>>()?;
4861                indexed.push((pkey, okey, i));
4862            }
4863            // Sort by (partition_key, order_key). Partition key uses
4864            // a stable encoded form; order key respects ASC/DESC.
4865            indexed.sort_by(|a, b| {
4866                let p_cmp = partition_key_cmp(&a.0, &b.0);
4867                if p_cmp != core::cmp::Ordering::Equal {
4868                    return p_cmp;
4869                }
4870                order_key_cmp(&a.1, &b.1)
4871            });
4872            // Per-partition compute.
4873            let mut out_vals: Vec<Value> = alloc::vec![Value::Null; n_rows];
4874            let mut p_start = 0;
4875            while p_start < indexed.len() {
4876                let mut p_end = p_start + 1;
4877                while p_end < indexed.len()
4878                    && partition_key_cmp(&indexed[p_start].0, &indexed[p_end].0)
4879                        == core::cmp::Ordering::Equal
4880                {
4881                    p_end += 1;
4882                }
4883                // Compute the function within this partition slice.
4884                compute_window_partition(
4885                    name,
4886                    args,
4887                    !order_by.is_empty(),
4888                    frame.as_ref(),
4889                    *null_treatment,
4890                    &indexed[p_start..p_end],
4891                    &filtered,
4892                    &ctx,
4893                    &mut out_vals,
4894                )?;
4895                p_start = p_end;
4896            }
4897            win_vals.push(out_vals);
4898        }
4899
4900        // 4) Build extended schema: original columns + synthetic.
4901        let mut ext_cols = schema_cols.clone();
4902        for i in 0..window_nodes.len() {
4903            ext_cols.push(ColumnSchema::new(
4904                alloc::format!("__win_{i}"),
4905                DataType::Text, // type doesn't matter for projection eval
4906                true,
4907            ));
4908        }
4909        // 5) Build extended rows: each row gets its window values appended.
4910        let mut ext_rows: Vec<Row> = Vec::with_capacity(n_rows);
4911        for i in 0..n_rows {
4912            let mut values = filtered[i].values.clone();
4913            for w in 0..window_nodes.len() {
4914                values.push(win_vals[w][i].clone());
4915            }
4916            ext_rows.push(Row::new(values));
4917        }
4918        // 6) Rewrite the projection: WindowFunction nodes → Column(__win_N).
4919        let mut rewritten_items: Vec<SelectItem> = Vec::with_capacity(stmt.items.len());
4920        for item in &stmt.items {
4921            let new_item = match item {
4922                SelectItem::Wildcard => SelectItem::Wildcard,
4923                SelectItem::Expr { expr, alias } => {
4924                    let mut e = expr.clone();
4925                    rewrite_window_to_columns(&mut e, &window_nodes);
4926                    SelectItem::Expr {
4927                        expr: e,
4928                        alias: alias.clone(),
4929                    }
4930                }
4931            };
4932            rewritten_items.push(new_item);
4933        }
4934
4935        // 7) Project into final rows.
4936        let ext_ctx = EvalContext::new(&ext_cols, Some(alias));
4937        let projection = build_projection(&rewritten_items, &ext_cols, alias)?;
4938        let mut tagged: Vec<(Vec<f64>, Row)> = Vec::with_capacity(n_rows);
4939        for (i, row) in ext_rows.iter().enumerate() {
4940            if i.is_multiple_of(256) {
4941                cancel.check()?;
4942            }
4943            let mut values = Vec::with_capacity(projection.len());
4944            for p in &projection {
4945                values.push(eval::eval_expr(&p.expr, row, &ext_ctx)?);
4946            }
4947            let order_keys = if stmt.order_by.is_empty() {
4948                Vec::new()
4949            } else {
4950                let mut keys = Vec::with_capacity(stmt.order_by.len());
4951                for o in &stmt.order_by {
4952                    let mut e = o.expr.clone();
4953                    rewrite_window_to_columns(&mut e, &window_nodes);
4954                    let key = eval::eval_expr(&e, row, &ext_ctx)?;
4955                    keys.push(value_to_order_key(&key)?);
4956                }
4957                keys
4958            };
4959            tagged.push((order_keys, Row::new(values)));
4960        }
4961        // ORDER BY + LIMIT/OFFSET on the projected rows.
4962        if !stmt.order_by.is_empty() {
4963            let descs: Vec<bool> = stmt.order_by.iter().map(|o| o.desc).collect();
4964            sort_by_keys(&mut tagged, &descs);
4965        }
4966        let mut out_rows: Vec<Row> = tagged.into_iter().map(|(_, r)| r).collect();
4967        apply_offset_and_limit(&mut out_rows, stmt.offset_literal(), stmt.limit_literal());
4968        let final_cols: Vec<ColumnSchema> = projection
4969            .into_iter()
4970            .map(|p| ColumnSchema::new(p.output_name, p.ty, p.nullable))
4971            .collect();
4972        Ok(QueryResult::Rows {
4973            columns: final_cols,
4974            rows: out_rows,
4975        })
4976    }
4977
4978    /// v4.11: materialise each CTE into a temp table inside a
4979    /// cloned catalog, then run the body SELECT against a fresh
4980    /// engine instance that owns the enriched catalog. The clone
4981    /// is moderately expensive — only paid by CTE-bearing queries.
4982    /// Subqueries inside CTE bodies / the main body resolve as
4983    /// usual; `clock_fn` is propagated so `NOW()` lines up.
4984    fn exec_with_ctes(
4985        &self,
4986        stmt: &SelectStatement,
4987        cancel: CancelToken<'_>,
4988    ) -> Result<QueryResult, EngineError> {
4989        cancel.check()?;
4990        let mut catalog = self.active_catalog().clone();
4991        for cte in &stmt.ctes {
4992            if catalog.get(&cte.name).is_some() {
4993                return Err(EngineError::Unsupported(alloc::format!(
4994                    "CTE name {:?} shadows an existing table; rename the CTE",
4995                    cte.name
4996                )));
4997            }
4998            let (columns, rows) = if cte.recursive {
4999                self.materialise_recursive_cte(cte, &catalog, cancel)?
5000            } else {
5001                let body_result = self.exec_select_cancel(&cte.body, cancel)?;
5002                let QueryResult::Rows { columns, rows } = body_result else {
5003                    return Err(EngineError::Unsupported(alloc::format!(
5004                        "CTE {:?} body did not return rows",
5005                        cte.name
5006                    )));
5007                };
5008                (columns, rows)
5009            };
5010            // v4.22: the projection builder labels any non-column
5011            // expression as Text — including literal SELECT 1.
5012            // Promote each column's type to whatever the rows
5013            // actually carry so the CTE storage table accepts them.
5014            let inferred = infer_column_types(&columns, &rows);
5015            let mut columns = inferred;
5016            // v4.22: apply optional `WITH name(a, b, c)` overrides.
5017            if !cte.column_overrides.is_empty() {
5018                if cte.column_overrides.len() != columns.len() {
5019                    return Err(EngineError::Unsupported(alloc::format!(
5020                        "CTE {:?} column list has {} names but body returns {} columns",
5021                        cte.name,
5022                        cte.column_overrides.len(),
5023                        columns.len()
5024                    )));
5025                }
5026                for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
5027                    col.name.clone_from(name);
5028                }
5029            }
5030            let schema = TableSchema::new(cte.name.clone(), columns);
5031            catalog.create_table(schema).map_err(EngineError::Storage)?;
5032            let table = catalog
5033                .get_mut(&cte.name)
5034                .expect("just-created CTE table must exist");
5035            for row in rows {
5036                table.insert(row).map_err(EngineError::Storage)?;
5037            }
5038        }
5039        // Strip CTEs from the body before running on the temp engine
5040        // so we don't recurse forever.
5041        let mut body = stmt.clone();
5042        body.ctes = Vec::new();
5043        let mut temp = Engine::restore(catalog);
5044        if let Some(c) = self.clock {
5045            temp = temp.with_clock(c);
5046        }
5047        if let Some(f) = self.salt_fn {
5048            temp = temp.with_salt_fn(f);
5049        }
5050        temp.exec_select_cancel(&body, cancel)
5051    }
5052
5053    /// v4.22: materialise a WITH RECURSIVE CTE. The body must be a
5054    /// UNION (or UNION ALL) of an anchor that does not reference
5055    /// the CTE name, and one or more recursive terms that do. The
5056    /// anchor runs first; each subsequent iteration runs the
5057    /// recursive term against a temp catalog where the CTE name is
5058    /// bound to the *previous* iteration's output. Iteration stops
5059    /// when the recursive term yields no rows; UNION (DISTINCT)
5060    /// deduplicates against the accumulated result, UNION ALL does
5061    /// not. A hard cap on total rows prevents runaway queries.
5062    #[allow(clippy::too_many_lines)]
5063    fn materialise_recursive_cte(
5064        &self,
5065        cte: &spg_sql::ast::Cte,
5066        base_catalog: &Catalog,
5067        cancel: CancelToken<'_>,
5068    ) -> Result<(Vec<ColumnSchema>, Vec<Row>), EngineError> {
5069        const MAX_TOTAL_ROWS: usize = 1_000_000;
5070        const MAX_ITERATIONS: usize = 100_000;
5071        cancel.check()?;
5072        if cte.body.unions.is_empty() {
5073            return Err(EngineError::Unsupported(alloc::format!(
5074                "WITH RECURSIVE {:?} body must be a UNION of an anchor and a recursive term",
5075                cte.name
5076            )));
5077        }
5078        // Anchor: the body's leading SELECT, with unions stripped.
5079        let mut anchor = cte.body.clone();
5080        let union_terms = core::mem::take(&mut anchor.unions);
5081        anchor.ctes = Vec::new();
5082        // Anchor must not reference the CTE name.
5083        if select_refers_to(&anchor, &cte.name) {
5084            return Err(EngineError::Unsupported(alloc::format!(
5085                "WITH RECURSIVE {:?}: the anchor must not reference the CTE itself",
5086                cte.name
5087            )));
5088        }
5089        let anchor_result = self.exec_select_cancel(&anchor, cancel)?;
5090        let QueryResult::Rows {
5091            columns: anchor_cols,
5092            rows: anchor_rows,
5093        } = anchor_result
5094        else {
5095            return Err(EngineError::Unsupported(alloc::format!(
5096                "WITH RECURSIVE {:?}: anchor did not return rows",
5097                cte.name
5098            )));
5099        };
5100        // The projection builder labels non-column expressions Text;
5101        // refine column types from the anchor's actual values so the
5102        // intermediate iter-catalog tables accept them.
5103        let mut columns = infer_column_types(&anchor_cols, &anchor_rows);
5104        if !cte.column_overrides.is_empty() {
5105            if cte.column_overrides.len() != columns.len() {
5106                return Err(EngineError::Unsupported(alloc::format!(
5107                    "CTE {:?} column list has {} names but anchor returns {} columns",
5108                    cte.name,
5109                    cte.column_overrides.len(),
5110                    columns.len()
5111                )));
5112            }
5113            for (col, name) in columns.iter_mut().zip(cte.column_overrides.iter()) {
5114                col.name.clone_from(name);
5115            }
5116        }
5117        let mut all_rows: Vec<Row> = anchor_rows.clone();
5118        let mut working_set: Vec<Row> = anchor_rows;
5119        let mut seen: alloc::collections::BTreeSet<Vec<u8>> = alloc::collections::BTreeSet::new();
5120        // Track at least one "all UNION ALL" flag — if every union
5121        // kind is ALL we skip the dedup step (faster + matches PG).
5122        let all_union_all = union_terms.iter().all(|(k, _)| matches!(k, UnionKind::All));
5123        if !all_union_all {
5124            for r in &all_rows {
5125                seen.insert(encode_row_key(r));
5126            }
5127        }
5128        for iter in 0..MAX_ITERATIONS {
5129            cancel.check()?;
5130            if working_set.is_empty() {
5131                break;
5132            }
5133            // Build a fresh catalog: base + CTE bound to working_set.
5134            let mut iter_catalog = base_catalog.clone();
5135            let schema = TableSchema::new(cte.name.clone(), columns.clone());
5136            iter_catalog
5137                .create_table(schema)
5138                .map_err(EngineError::Storage)?;
5139            {
5140                let table = iter_catalog.get_mut(&cte.name).expect("just-created");
5141                for row in &working_set {
5142                    table.insert(row.clone()).map_err(EngineError::Storage)?;
5143                }
5144            }
5145            let mut iter_engine = Engine::restore(iter_catalog);
5146            if let Some(c) = self.clock {
5147                iter_engine = iter_engine.with_clock(c);
5148            }
5149            if let Some(f) = self.salt_fn {
5150                iter_engine = iter_engine.with_salt_fn(f);
5151            }
5152            // Run each recursive term in sequence and collect new rows.
5153            let mut next_set: Vec<Row> = Vec::new();
5154            for (_, term) in &union_terms {
5155                let mut term = term.clone();
5156                term.ctes = Vec::new();
5157                let r = iter_engine.exec_select_cancel(&term, cancel)?;
5158                let QueryResult::Rows {
5159                    columns: rc,
5160                    rows: rs,
5161                } = r
5162                else {
5163                    return Err(EngineError::Unsupported(alloc::format!(
5164                        "WITH RECURSIVE {:?}: recursive term did not return rows",
5165                        cte.name
5166                    )));
5167                };
5168                if rc.len() != columns.len() {
5169                    return Err(EngineError::Unsupported(alloc::format!(
5170                        "WITH RECURSIVE {:?}: column count of recursive term ({}) does not match anchor ({})",
5171                        cte.name,
5172                        rc.len(),
5173                        columns.len()
5174                    )));
5175                }
5176                for row in rs {
5177                    if !all_union_all {
5178                        let key = encode_row_key(&row);
5179                        if !seen.insert(key) {
5180                            continue;
5181                        }
5182                    }
5183                    next_set.push(row);
5184                }
5185            }
5186            if next_set.is_empty() {
5187                break;
5188            }
5189            all_rows.extend(next_set.iter().cloned());
5190            working_set = next_set;
5191            if all_rows.len() > MAX_TOTAL_ROWS {
5192                return Err(EngineError::Unsupported(alloc::format!(
5193                    "WITH RECURSIVE {:?}: produced more than {MAX_TOTAL_ROWS} rows — likely runaway recursion",
5194                    cte.name
5195                )));
5196            }
5197            if iter + 1 == MAX_ITERATIONS {
5198                return Err(EngineError::Unsupported(alloc::format!(
5199                    "WITH RECURSIVE {:?}: exceeded {MAX_ITERATIONS} iterations",
5200                    cte.name
5201                )));
5202            }
5203        }
5204        Ok((columns, all_rows))
5205    }
5206
5207    fn resolve_select_subqueries(
5208        &self,
5209        stmt: &mut SelectStatement,
5210        cancel: CancelToken<'_>,
5211    ) -> Result<(), EngineError> {
5212        for item in &mut stmt.items {
5213            if let SelectItem::Expr { expr, .. } = item {
5214                self.resolve_expr_subqueries(expr, cancel)?;
5215            }
5216        }
5217        if let Some(w) = &mut stmt.where_ {
5218            self.resolve_expr_subqueries(w, cancel)?;
5219        }
5220        if let Some(gs) = &mut stmt.group_by {
5221            for g in gs {
5222                self.resolve_expr_subqueries(g, cancel)?;
5223            }
5224        }
5225        if let Some(h) = &mut stmt.having {
5226            self.resolve_expr_subqueries(h, cancel)?;
5227        }
5228        for o in &mut stmt.order_by {
5229            self.resolve_expr_subqueries(&mut o.expr, cancel)?;
5230        }
5231        for (_, peer) in &mut stmt.unions {
5232            self.resolve_select_subqueries(peer, cancel)?;
5233        }
5234        Ok(())
5235    }
5236
5237    #[allow(clippy::only_used_in_recursion)] // engine handle reads aren't really pure
5238    fn resolve_expr_subqueries(
5239        &self,
5240        e: &mut Expr,
5241        cancel: CancelToken<'_>,
5242    ) -> Result<(), EngineError> {
5243        // Replace-on-this-node cases first.
5244        if let Some(replacement) = self.subquery_replacement(e, cancel)? {
5245            *e = replacement;
5246            return Ok(());
5247        }
5248        match e {
5249            Expr::Binary { lhs, rhs, .. } => {
5250                self.resolve_expr_subqueries(lhs, cancel)?;
5251                self.resolve_expr_subqueries(rhs, cancel)?;
5252            }
5253            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5254                self.resolve_expr_subqueries(expr, cancel)?;
5255            }
5256            Expr::FunctionCall { args, .. } => {
5257                for a in args {
5258                    self.resolve_expr_subqueries(a, cancel)?;
5259                }
5260            }
5261            Expr::Like { expr, pattern, .. } => {
5262                self.resolve_expr_subqueries(expr, cancel)?;
5263                self.resolve_expr_subqueries(pattern, cancel)?;
5264            }
5265            Expr::Extract { source, .. } => self.resolve_expr_subqueries(source, cancel)?,
5266            // v4.12 window functions — recurse into args + ORDER BY
5267            // + PARTITION BY in case they carry inner subqueries.
5268            Expr::WindowFunction {
5269                args,
5270                partition_by,
5271                order_by,
5272                ..
5273            } => {
5274                for a in args {
5275                    self.resolve_expr_subqueries(a, cancel)?;
5276                }
5277                for p in partition_by {
5278                    self.resolve_expr_subqueries(p, cancel)?;
5279                }
5280                for (e, _) in order_by {
5281                    self.resolve_expr_subqueries(e, cancel)?;
5282                }
5283            }
5284            // Subquery nodes are handled in subquery_replacement
5285            // (which returned None — defensive no-op); Literal /
5286            // Column are leaves.
5287            Expr::ScalarSubquery(_)
5288            | Expr::Exists { .. }
5289            | Expr::InSubquery { .. }
5290            | Expr::Literal(_)
5291            | Expr::Placeholder(_)
5292            | Expr::Column(_) => {}
5293        }
5294        Ok(())
5295    }
5296
5297    /// v4.23: per-row eval that handles correlated subqueries.
5298    /// Equivalent to `eval::eval_expr` when the expression has no
5299    /// subqueries; otherwise clones the expression, substitutes
5300    /// outer-row columns into each surviving subquery node, runs
5301    /// the inner SELECT, and replaces the node with the literal
5302    /// result. Only the WHERE-filter call sites use this path so
5303    /// the uncorrelated fast path is preserved everywhere else.
5304    fn eval_expr_with_correlated(
5305        &self,
5306        expr: &Expr,
5307        row: &Row,
5308        ctx: &EvalContext<'_>,
5309        cancel: CancelToken<'_>,
5310        memo: Option<&mut memoize::MemoizeCache>,
5311    ) -> Result<Value, EngineError> {
5312        if !expr_has_subquery(expr) {
5313            return eval::eval_expr(expr, row, ctx).map_err(EngineError::Eval);
5314        }
5315        let mut e = expr.clone();
5316        self.resolve_correlated_in_expr(&mut e, row, ctx, cancel, memo)?;
5317        eval::eval_expr(&e, row, ctx).map_err(EngineError::Eval)
5318    }
5319
5320    fn resolve_correlated_in_expr(
5321        &self,
5322        e: &mut Expr,
5323        row: &Row,
5324        ctx: &EvalContext<'_>,
5325        cancel: CancelToken<'_>,
5326        mut memo: Option<&mut memoize::MemoizeCache>,
5327    ) -> Result<(), EngineError> {
5328        match e {
5329            Expr::ScalarSubquery(inner) => {
5330                // v6.2.6 — Memoize: build the cache key from the
5331                // pre-substitution subquery repr + the outer row's
5332                // values. Two outer rows with identical correlated
5333                // values hit the same entry.
5334                let cache_key = memo.as_ref().map(|_| memoize::CacheKey {
5335                    subquery_repr: alloc::format!("{}", **inner),
5336                    outer_values: row.values.clone(),
5337                });
5338                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key.as_ref())
5339                    && let Some(cached) = cache.get(k)
5340                {
5341                    *e = value_to_literal_expr(cached)?;
5342                    return Ok(());
5343                }
5344                let mut s = (**inner).clone();
5345                substitute_outer_columns(&mut s, row, ctx);
5346                let r = self.exec_select_cancel(&s, cancel)?;
5347                let QueryResult::Rows { rows, .. } = r else {
5348                    return Err(EngineError::Unsupported(
5349                        "scalar subquery: inner did not return rows".into(),
5350                    ));
5351                };
5352                let value = match rows.as_slice() {
5353                    [] => Value::Null,
5354                    [r0] => r0.values.first().cloned().unwrap_or(Value::Null),
5355                    _ => {
5356                        return Err(EngineError::Unsupported(alloc::format!(
5357                            "scalar subquery returned {} rows; expected 0 or 1",
5358                            rows.len()
5359                        )));
5360                    }
5361                };
5362                if let (Some(cache), Some(k)) = (memo.as_deref_mut(), cache_key) {
5363                    cache.insert(k, value.clone());
5364                }
5365                *e = value_to_literal_expr(value)?;
5366            }
5367            Expr::Exists { subquery, negated } => {
5368                let mut s = (**subquery).clone();
5369                substitute_outer_columns(&mut s, row, ctx);
5370                let r = self.exec_select_cancel(&s, cancel)?;
5371                let exists = matches!(r, QueryResult::Rows { rows, .. } if !rows.is_empty());
5372                let bit = if *negated { !exists } else { exists };
5373                *e = Expr::Literal(Literal::Bool(bit));
5374            }
5375            Expr::InSubquery {
5376                expr: lhs,
5377                subquery,
5378                negated,
5379            } => {
5380                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5381                let lhs_val = eval::eval_expr(lhs, row, ctx).map_err(EngineError::Eval)?;
5382                let mut s = (**subquery).clone();
5383                substitute_outer_columns(&mut s, row, ctx);
5384                let r = self.exec_select_cancel(&s, cancel)?;
5385                let QueryResult::Rows { columns, rows, .. } = r else {
5386                    return Err(EngineError::Unsupported(
5387                        "IN-subquery: inner did not return rows".into(),
5388                    ));
5389                };
5390                if columns.len() != 1 {
5391                    return Err(EngineError::Unsupported(alloc::format!(
5392                        "IN-subquery must project exactly one column; got {}",
5393                        columns.len()
5394                    )));
5395                }
5396                let mut found = false;
5397                let mut any_null = false;
5398                for r0 in rows {
5399                    let v = r0.values.into_iter().next().unwrap_or(Value::Null);
5400                    if v.is_null() {
5401                        any_null = true;
5402                        continue;
5403                    }
5404                    if value_cmp(&v, &lhs_val) == core::cmp::Ordering::Equal {
5405                        found = true;
5406                        break;
5407                    }
5408                }
5409                let bit = if found {
5410                    !*negated
5411                } else if any_null {
5412                    return Err(EngineError::Unsupported(
5413                        "IN-subquery with NULL in result and no match: NULL semantics not yet implemented".into(),
5414                    ));
5415                } else {
5416                    *negated
5417                };
5418                *e = Expr::Literal(Literal::Bool(bit));
5419            }
5420            Expr::Binary { lhs, rhs, .. } => {
5421                self.resolve_correlated_in_expr(lhs, row, ctx, cancel, memo.as_deref_mut())?;
5422                self.resolve_correlated_in_expr(rhs, row, ctx, cancel, memo.as_deref_mut())?;
5423            }
5424            Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5425                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5426            }
5427            Expr::Like { expr, pattern, .. } => {
5428                self.resolve_correlated_in_expr(expr, row, ctx, cancel, memo.as_deref_mut())?;
5429                self.resolve_correlated_in_expr(pattern, row, ctx, cancel, memo.as_deref_mut())?;
5430            }
5431            Expr::FunctionCall { args, .. } => {
5432                for a in args {
5433                    self.resolve_correlated_in_expr(a, row, ctx, cancel, memo.as_deref_mut())?;
5434                }
5435            }
5436            Expr::Extract { source, .. } => {
5437                self.resolve_correlated_in_expr(source, row, ctx, cancel, memo.as_deref_mut())?;
5438            }
5439            Expr::WindowFunction { .. } | Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
5440        }
5441        Ok(())
5442    }
5443
5444    fn subquery_replacement(
5445        &self,
5446        e: &Expr,
5447        cancel: CancelToken<'_>,
5448    ) -> Result<Option<Expr>, EngineError> {
5449        match e {
5450            Expr::ScalarSubquery(inner) => {
5451                let mut s = (**inner).clone();
5452                // Recurse into the inner SELECT first so nested
5453                // subqueries materialise bottom-up.
5454                self.resolve_select_subqueries(&mut s, cancel)?;
5455                let r = match self.exec_bare_select_cancel(&s, cancel) {
5456                    Ok(r) => r,
5457                    Err(e) if is_correlation_error(&e) => return Ok(None),
5458                    Err(e) => return Err(e),
5459                };
5460                let QueryResult::Rows { rows, .. } = r else {
5461                    return Err(EngineError::Unsupported(
5462                        "scalar subquery: inner statement did not return rows".into(),
5463                    ));
5464                };
5465                let value = match rows.as_slice() {
5466                    [] => Value::Null,
5467                    [row] => row.values.first().cloned().unwrap_or(Value::Null),
5468                    _ => {
5469                        return Err(EngineError::Unsupported(alloc::format!(
5470                            "scalar subquery returned {} rows; expected 0 or 1",
5471                            rows.len()
5472                        )));
5473                    }
5474                };
5475                Ok(Some(value_to_literal_expr(value)?))
5476            }
5477            Expr::Exists { subquery, negated } => {
5478                let mut s = (**subquery).clone();
5479                self.resolve_select_subqueries(&mut s, cancel)?;
5480                let r = match self.exec_bare_select_cancel(&s, cancel) {
5481                    Ok(r) => r,
5482                    Err(e) if is_correlation_error(&e) => return Ok(None),
5483                    Err(e) => return Err(e),
5484                };
5485                let exists = match r {
5486                    QueryResult::Rows { rows, .. } => !rows.is_empty(),
5487                    QueryResult::CommandOk { .. } => false,
5488                };
5489                let bit = if *negated { !exists } else { exists };
5490                Ok(Some(Expr::Literal(Literal::Bool(bit))))
5491            }
5492            Expr::InSubquery {
5493                expr,
5494                subquery,
5495                negated,
5496            } => {
5497                let mut s = (**subquery).clone();
5498                self.resolve_select_subqueries(&mut s, cancel)?;
5499                let r = match self.exec_bare_select_cancel(&s, cancel) {
5500                    Ok(r) => r,
5501                    Err(e) if is_correlation_error(&e) => return Ok(None),
5502                    Err(e) => return Err(e),
5503                };
5504                let QueryResult::Rows { columns, rows, .. } = r else {
5505                    return Err(EngineError::Unsupported(
5506                        "IN-subquery: inner statement did not return rows".into(),
5507                    ));
5508                };
5509                if columns.len() != 1 {
5510                    return Err(EngineError::Unsupported(alloc::format!(
5511                        "IN-subquery must project exactly one column; got {}",
5512                        columns.len()
5513                    )));
5514                }
5515                // Build the same OR-Eq chain the parse-time literal-list
5516                // path constructs, with each value lifted into a Literal.
5517                let mut acc: Option<Expr> = None;
5518                for row in rows {
5519                    let v = row.values.into_iter().next().unwrap_or(Value::Null);
5520                    let lit = value_to_literal_expr(v)?;
5521                    let cmp = Expr::Binary {
5522                        lhs: expr.clone(),
5523                        op: BinOp::Eq,
5524                        rhs: Box::new(lit),
5525                    };
5526                    acc = Some(match acc {
5527                        None => cmp,
5528                        Some(prev) => Expr::Binary {
5529                            lhs: Box::new(prev),
5530                            op: BinOp::Or,
5531                            rhs: Box::new(cmp),
5532                        },
5533                    });
5534                }
5535                let combined = acc.unwrap_or(Expr::Literal(Literal::Bool(false)));
5536                let final_expr = if *negated {
5537                    Expr::Unary {
5538                        op: UnOp::Not,
5539                        expr: Box::new(combined),
5540                    }
5541                } else {
5542                    combined
5543                };
5544                Ok(Some(final_expr))
5545            }
5546            _ => Ok(None),
5547        }
5548    }
5549}
5550
5551// ---- v4.12 window-function helpers ----
5552// The (partition-key, order-key, original-index) tuple shape used
5553// across these helpers is intrinsic to the planner. Factoring it
5554// into a typedef adds indirection without making the code clearer,
5555// so several lints are allowed inline on the affected functions
5556// rather than module-wide.
5557
5558/// v4.22: cheap structural scan for `FROM <name>` (qualified or
5559/// not) inside a SELECT — used to verify the anchor of a WITH
5560/// RECURSIVE CTE doesn't recurse into itself. Conservative: walks
5561/// FROM joins, subqueries, and unions.
5562fn select_refers_to(stmt: &SelectStatement, target: &str) -> bool {
5563    if let Some(from) = &stmt.from
5564        && from_refers_to(from, target)
5565    {
5566        return true;
5567    }
5568    for (_, peer) in &stmt.unions {
5569        if select_refers_to(peer, target) {
5570            return true;
5571        }
5572    }
5573    for item in &stmt.items {
5574        if let SelectItem::Expr { expr, .. } = item
5575            && expr_refers_to(expr, target)
5576        {
5577            return true;
5578        }
5579    }
5580    if let Some(w) = &stmt.where_
5581        && expr_refers_to(w, target)
5582    {
5583        return true;
5584    }
5585    false
5586}
5587
5588fn from_refers_to(from: &FromClause, target: &str) -> bool {
5589    if from.primary.name.eq_ignore_ascii_case(target) {
5590        return true;
5591    }
5592    from.joins
5593        .iter()
5594        .any(|j| j.table.name.eq_ignore_ascii_case(target))
5595}
5596
5597fn expr_refers_to(e: &Expr, target: &str) -> bool {
5598    match e {
5599        Expr::ScalarSubquery(s) => select_refers_to(s, target),
5600        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
5601            select_refers_to(subquery, target)
5602        }
5603        Expr::Binary { lhs, rhs, .. } => expr_refers_to(lhs, target) || expr_refers_to(rhs, target),
5604        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
5605            expr_refers_to(expr, target)
5606        }
5607        Expr::Like { expr, pattern, .. } => {
5608            expr_refers_to(expr, target) || expr_refers_to(pattern, target)
5609        }
5610        Expr::FunctionCall { args, .. } => args.iter().any(|a| expr_refers_to(a, target)),
5611        Expr::Extract { source, .. } => expr_refers_to(source, target),
5612        Expr::WindowFunction {
5613            args,
5614            partition_by,
5615            order_by,
5616            ..
5617        } => {
5618            args.iter().any(|a| expr_refers_to(a, target))
5619                || partition_by.iter().any(|p| expr_refers_to(p, target))
5620                || order_by.iter().any(|(o, _)| expr_refers_to(o, target))
5621        }
5622        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
5623    }
5624}
5625
5626/// v4.22: pick more specific column types from observed rows when
5627/// the projection builder defaulted to Text (the v1.x behavior for
5628/// non-column expressions). Lets `WITH t(n) AS (SELECT 1 ...)`
5629/// land an Int column in the CTE storage table rather than failing
5630/// the insert with "expected TEXT, got INT".
5631fn infer_column_types(columns: &[ColumnSchema], rows: &[Row]) -> Vec<ColumnSchema> {
5632    let mut out = columns.to_vec();
5633    for (col_idx, col) in out.iter_mut().enumerate() {
5634        if col.ty != DataType::Text {
5635            continue;
5636        }
5637        let mut inferred: Option<DataType> = None;
5638        let mut all_null = true;
5639        for row in rows {
5640            let Some(v) = row.values.get(col_idx) else {
5641                continue;
5642            };
5643            let ty = match v {
5644                Value::Null => continue,
5645                Value::SmallInt(_) => DataType::SmallInt,
5646                Value::Int(_) => DataType::Int,
5647                Value::BigInt(_) => DataType::BigInt,
5648                Value::Float(_) => DataType::Float,
5649                Value::Bool(_) => DataType::Bool,
5650                Value::Vector(_) => DataType::Vector {
5651                    dim: 0,
5652                    encoding: VecEncoding::F32,
5653                },
5654                _ => DataType::Text,
5655            };
5656            all_null = false;
5657            inferred = Some(match inferred {
5658                None => ty,
5659                Some(prev) if prev == ty => prev,
5660                Some(_) => DataType::Text,
5661            });
5662        }
5663        if let Some(t) = inferred {
5664            col.ty = t;
5665            col.nullable = true;
5666        } else if all_null {
5667            col.nullable = true;
5668        }
5669    }
5670    out
5671}
5672
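// NOTE: the following four lines describe `explain_select` (defined
// further down in this file), not the function that comes next. They
// are kept as a plain comment so rustdoc does not attach them to
// `build_index_suggestions`.
// v4.26: render a human-readable plan tree for `EXPLAIN <select>`.
// Lines are pushed into `out`; `depth` controls indentation. We
// describe the rewritten SELECT — what the executor *would* do —
// using the engine handle to spot indexed lookups and table shapes.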
5677#[allow(clippy::too_many_lines, clippy::format_push_string)]
// NOTE: the following eight lines describe `annotate_explain_lines`
// (defined further down in this file), not the function that comes
// next. They are kept as a plain comment so rustdoc does not attach
// them to `build_index_suggestions`.
// v6.2.4 — Walk every line of the rendered plan tree and append
// per-operator stats. Lines that name a known operator get
// `(rows=N)` (`actual_rows` of the top-level operator equals the
// final result row count; scans report their catalog row count
// as the rows-considered metric). Other lines — Filter / Join /
// GroupBy / OrderBy etc. — are marked `(—)` so the surface is
// complete-by-construction; v6.2.5 fills these in via inline
// executor counters.
5686/// v6.8.3 — surface "CREATE INDEX …" suggestions for every
5687/// `(table, column)` pair the query touches via WHERE / JOIN
5688/// that doesn't already have an index on the owning table.
5689/// Walks the SELECT's FROM clauses + WHERE expression tree;
5690/// returns one line per missing index. Deterministic order:
5691/// FROM-clause iteration order, then column-reference walk
5692/// order inside each WHERE. Each suggestion is a copy-pastable
5693/// DDL string.
5694fn build_index_suggestions(stmt: &SelectStatement, engine: &Engine) -> Vec<String> {
5695    use alloc::collections::BTreeSet;
5696    let mut seen: BTreeSet<(String, String)> = BTreeSet::new();
5697    let mut out: Vec<String> = Vec::new();
5698    let cat = engine.active_catalog();
5699    // Build a (table, qualifier-or-alias) list from the FROM clause
5700    // so unqualified column refs in WHERE resolve to the correct
5701    // table.
5702    let Some(from) = &stmt.from else {
5703        return out;
5704    };
5705    let mut tables: Vec<String> = Vec::new();
5706    tables.push(from.primary.name.clone());
5707    for j in &from.joins {
5708        tables.push(j.table.name.clone());
5709    }
5710    // Collect column refs from the WHERE expression. JOIN ON
5711    // predicates also feed in.
5712    let mut col_refs: Vec<spg_sql::ast::ColumnName> = Vec::new();
5713    if let Some(w) = &stmt.where_ {
5714        collect_column_refs(w, &mut col_refs);
5715    }
5716    for j in &from.joins {
5717        if let Some(on) = &j.on {
5718            collect_column_refs(on, &mut col_refs);
5719        }
5720    }
5721    for cn in &col_refs {
5722        // Resolve owner table: explicit qualifier first, else
5723        // first table in FROM that has a column of this name.
5724        let owner: Option<String> = if let Some(q) = &cn.qualifier {
5725            tables.iter().find(|t| t == &q).cloned()
5726        } else {
5727            tables.iter().find_map(|t| {
5728                cat.get(t).and_then(|tbl| {
5729                    if tbl.schema().column_position(&cn.name).is_some() {
5730                        Some(t.clone())
5731                    } else {
5732                        None
5733                    }
5734                })
5735            })
5736        };
5737        let Some(owner) = owner else {
5738            continue;
5739        };
5740        let Some(tbl) = cat.get(&owner) else {
5741            continue;
5742        };
5743        let Some(col_pos) = tbl.schema().column_position(&cn.name) else {
5744            continue;
5745        };
5746        // Skip if any BTree index already covers this column as
5747        // its key.
5748        let already_indexed = tbl.indices().iter().any(|i| {
5749            matches!(i.kind, spg_storage::IndexKind::BTree(_))
5750                && i.column_position == col_pos
5751                && i.expression.is_none()
5752                && i.partial_predicate.is_none()
5753        });
5754        if already_indexed {
5755            continue;
5756        }
5757        if seen.insert((owner.clone(), cn.name.clone())) {
5758            out.push(alloc::format!(
5759                "SUGGEST: CREATE INDEX ix_{}_{} ON {} ({})",
5760                owner,
5761                cn.name,
5762                owner,
5763                cn.name
5764            ));
5765        }
5766    }
5767    out
5768}
5769
5770/// Walks an `Expr` and pushes every `ColumnName` it references.
5771/// Order is depth-first, left-to-right.
5772fn collect_column_refs(expr: &Expr, out: &mut Vec<spg_sql::ast::ColumnName>) {
5773    match expr {
5774        Expr::Column(cn) => out.push(cn.clone()),
5775        Expr::FunctionCall { args, .. } => {
5776            for a in args {
5777                collect_column_refs(a, out);
5778            }
5779        }
5780        Expr::Binary { lhs, rhs, .. } => {
5781            collect_column_refs(lhs, out);
5782            collect_column_refs(rhs, out);
5783        }
5784        Expr::Unary { expr: e, .. } => collect_column_refs(e, out),
5785        _ => {}
5786    }
5787}
5788
5789fn annotate_explain_lines(lines: &mut [String], total_rows: usize, engine: &Engine) {
5790    let catalog = engine.active_catalog();
5791    let cold_ids = catalog.cold_segment_ids_global();
5792    let any_cold = !cold_ids.is_empty();
5793    let cold_ids_repr = if any_cold {
5794        let mut s = alloc::string::String::from("[");
5795        for (i, id) in cold_ids.iter().enumerate() {
5796            if i > 0 {
5797                s.push(',');
5798            }
5799            s.push_str(&alloc::format!("{id}"));
5800        }
5801        s.push(']');
5802        s
5803    } else {
5804        alloc::string::String::new()
5805    };
5806    for (idx, line) in lines.iter_mut().enumerate() {
5807        let trimmed = line.trim_start();
5808        let is_top_level = idx == 0;
5809        if is_top_level {
5810            line.push_str(&alloc::format!(" (rows={total_rows})"));
5811            continue;
5812        }
5813        if let Some(rest) = trimmed.strip_prefix("From: ") {
5814            let (name, scan_kind) = match rest.split_once(" [") {
5815                Some((n, k)) => (n.trim(), k.trim_end_matches(']')),
5816                None => (rest.trim(), ""),
5817            };
5818            let bare = name.split_whitespace().next().unwrap_or(name);
5819            let hot = catalog.get(bare).map(|t| t.rows().len());
5820            // v6.2.7 — `cold_segments=[id0,id1,…]` enumerates every
5821            // cold-tier segment the scan COULD have walked. v6.2.x
5822            // can tighten to per-table by walking the table's
5823            // BTree-index cold locators.
5824            let annot = match (hot, scan_kind) {
5825                (Some(h), "full scan") => {
5826                    let mut s = alloc::format!(" (hot_rows={h}");
5827                    if any_cold {
5828                        s.push_str(&alloc::format!(
5829                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5830                        ));
5831                    }
5832                    s.push(')');
5833                    s
5834                }
5835                (Some(h), "index seek") => {
5836                    let mut s = alloc::format!(" (hot_rows≤{h}");
5837                    if any_cold {
5838                        s.push_str(&alloc::format!(
5839                            ", cold_tier=present, cold_segments={cold_ids_repr}"
5840                        ));
5841                    }
5842                    s.push(')');
5843                    s
5844                }
5845                _ => " (rows=—)".to_string(),
5846            };
5847            line.push_str(&annot);
5848            continue;
5849        }
5850        // Filter / GroupBy / Having / OrderBy / Limit / Join etc.
5851        line.push_str(" (rows=—)");
5852    }
5853}
5854
5855fn explain_select(stmt: &SelectStatement, engine: &Engine, depth: usize, out: &mut Vec<String>) {
5856    let pad = "  ".repeat(depth);
5857    // 1) Top-level operator label.
5858    let top = if !stmt.ctes.is_empty() {
5859        if stmt.ctes.iter().any(|c| c.recursive) {
5860            "CTEScan (WITH RECURSIVE)"
5861        } else {
5862            "CTEScan (WITH)"
5863        }
5864    } else if !stmt.unions.is_empty() {
5865        "UnionScan"
5866    } else if select_has_window(stmt) {
5867        "WindowAgg"
5868    } else if aggregate::uses_aggregate(stmt) {
5869        "Aggregate"
5870    } else if stmt.distinct {
5871        "Distinct"
5872    } else if stmt.from.is_some() {
5873        "TableScan"
5874    } else {
5875        "Result"
5876    };
5877    out.push(alloc::format!("{pad}{top}"));
5878    let child = "  ".repeat(depth + 1);
5879    // 2) CTE bodies.
5880    for cte in &stmt.ctes {
5881        let head = if cte.recursive {
5882            alloc::format!("{child}CTE (recursive): {}", cte.name)
5883        } else {
5884            alloc::format!("{child}CTE: {}", cte.name)
5885        };
5886        out.push(head);
5887        explain_select(&cte.body, engine, depth + 2, out);
5888    }
5889    // 3) FROM details — primary table + joins, index hits.
5890    if let Some(from) = &stmt.from {
5891        let mut tag = alloc::format!("{child}From: {}", from.primary.name);
5892        if let Some(alias) = &from.primary.alias {
5893            tag.push_str(&alloc::format!(" AS {alias}"));
5894        }
5895        // Try to detect an index-seek opportunity on WHERE against
5896        // the primary table — same heuristic the executor uses.
5897        if let Some(w) = &stmt.where_
5898            && let Some(table) = engine.active_catalog().get(&from.primary.name)
5899        {
5900            let alias = from.primary.alias.as_deref().unwrap_or(&from.primary.name);
5901            let cols = &table.schema().columns;
5902            if try_index_seek(w, cols, engine.active_catalog(), table, alias).is_some() {
5903                tag.push_str(" [index seek]");
5904            } else {
5905                tag.push_str(" [full scan]");
5906            }
5907        } else {
5908            tag.push_str(" [full scan]");
5909        }
5910        out.push(tag);
5911        for j in &from.joins {
5912            let kind = match j.kind {
5913                spg_sql::ast::JoinKind::Inner => "INNER JOIN",
5914                spg_sql::ast::JoinKind::Left => "LEFT JOIN",
5915                spg_sql::ast::JoinKind::Cross => "CROSS JOIN",
5916            };
5917            let mut s = alloc::format!("{child}{kind}: {}", j.table.name);
5918            if let Some(alias) = &j.table.alias {
5919                s.push_str(&alloc::format!(" AS {alias}"));
5920            }
5921            if j.on.is_some() {
5922                s.push_str(" (ON …)");
5923            }
5924            out.push(s);
5925        }
5926    }
5927    // 4) WHERE / GROUP BY / HAVING / ORDER BY / LIMIT / OFFSET.
5928    if let Some(w) = &stmt.where_ {
5929        let mut s = alloc::format!("{child}Filter: {w}");
5930        if expr_has_subquery(w) {
5931            s.push_str(" [subquery]");
5932        }
5933        out.push(s);
5934    }
5935    if let Some(gs) = &stmt.group_by {
5936        let mut parts = Vec::new();
5937        for g in gs {
5938            parts.push(alloc::format!("{g}"));
5939        }
5940        out.push(alloc::format!("{child}GroupBy: {}", parts.join(", ")));
5941    }
5942    if let Some(h) = &stmt.having {
5943        out.push(alloc::format!("{child}Having: {h}"));
5944    }
5945    for o in &stmt.order_by {
5946        let dir = if o.desc { "DESC" } else { "ASC" };
5947        out.push(alloc::format!("{child}OrderBy: {} {dir}", o.expr));
5948    }
5949    if let Some(lim) = stmt.limit {
5950        out.push(alloc::format!("{child}Limit: {lim}"));
5951    }
5952    if let Some(off) = stmt.offset {
5953        out.push(alloc::format!("{child}Offset: {off}"));
5954    }
5955    // 5) Projection — collapse Wildcard or render N items.
5956    if stmt
5957        .items
5958        .iter()
5959        .any(|it| matches!(it, SelectItem::Wildcard))
5960    {
5961        out.push(alloc::format!("{child}Project: *"));
5962    } else {
5963        out.push(alloc::format!(
5964            "{child}Project: {} item(s)",
5965            stmt.items.len()
5966        ));
5967    }
5968    // 6) Recurse into UNION peers.
5969    for (kind, peer) in &stmt.unions {
5970        let label = match kind {
5971            UnionKind::All => "UNION ALL",
5972            UnionKind::Distinct => "UNION",
5973        };
5974        out.push(alloc::format!("{child}{label}"));
5975        explain_select(peer, engine, depth + 2, out);
5976    }
5977}
5978
5979/// v4.23: recognise the engine errors that indicate the inner
5980/// SELECT couldn't be evaluated in isolation because it references
5981/// an outer column — used by `subquery_replacement` to skip
5982/// materialisation and let row-eval handle it instead.
5983fn is_correlation_error(e: &EngineError) -> bool {
5984    matches!(
5985        e,
5986        EngineError::Eval(
5987            eval::EvalError::ColumnNotFound { .. } | eval::EvalError::UnknownQualifier { .. }
5988        )
5989    )
5990}
5991
5992/// v4.23: walk every Expr in `stmt` and replace each Column ref
5993/// that targets the outer scope (qualifier matches the outer
5994/// table alias) with a Literal carrying the outer row's value.
5995/// Conservative: only qualified refs are substituted, so the user
5996/// must write `outer_alias.col` to reference an outer column. This
5997/// matches PG's lexical scoping for correlated subqueries and
5998/// avoids accidentally rebinding inner columns of the same name.
5999fn substitute_outer_columns(stmt: &mut SelectStatement, row: &Row, ctx: &EvalContext<'_>) {
6000    let Some(outer_alias) = ctx.table_alias else {
6001        return;
6002    };
6003    substitute_in_select(stmt, row, ctx, outer_alias);
6004}
6005
6006fn substitute_in_select(
6007    stmt: &mut SelectStatement,
6008    row: &Row,
6009    ctx: &EvalContext<'_>,
6010    outer_alias: &str,
6011) {
6012    for item in &mut stmt.items {
6013        if let SelectItem::Expr { expr, .. } = item {
6014            substitute_in_expr(expr, row, ctx, outer_alias);
6015        }
6016    }
6017    if let Some(w) = &mut stmt.where_ {
6018        substitute_in_expr(w, row, ctx, outer_alias);
6019    }
6020    if let Some(gs) = &mut stmt.group_by {
6021        for g in gs {
6022            substitute_in_expr(g, row, ctx, outer_alias);
6023        }
6024    }
6025    if let Some(h) = &mut stmt.having {
6026        substitute_in_expr(h, row, ctx, outer_alias);
6027    }
6028    for o in &mut stmt.order_by {
6029        substitute_in_expr(&mut o.expr, row, ctx, outer_alias);
6030    }
6031    for (_, peer) in &mut stmt.unions {
6032        substitute_in_select(peer, row, ctx, outer_alias);
6033    }
6034}
6035
6036fn substitute_in_expr(e: &mut Expr, row: &Row, ctx: &EvalContext<'_>, outer_alias: &str) {
6037    if let Expr::Column(c) = e
6038        && let Some(qual) = &c.qualifier
6039        && qual.eq_ignore_ascii_case(outer_alias)
6040    {
6041        // Look up the column's index in the outer schema.
6042        if let Some(idx) = ctx
6043            .columns
6044            .iter()
6045            .position(|sc| sc.name.eq_ignore_ascii_case(&c.name))
6046        {
6047            let v = row.values.get(idx).cloned().unwrap_or(Value::Null);
6048            if let Ok(lit) = value_to_literal_expr(v) {
6049                *e = lit;
6050                return;
6051            }
6052        }
6053    }
6054    match e {
6055        Expr::Binary { lhs, rhs, .. } => {
6056            substitute_in_expr(lhs, row, ctx, outer_alias);
6057            substitute_in_expr(rhs, row, ctx, outer_alias);
6058        }
6059        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6060            substitute_in_expr(expr, row, ctx, outer_alias);
6061        }
6062        Expr::Like { expr, pattern, .. } => {
6063            substitute_in_expr(expr, row, ctx, outer_alias);
6064            substitute_in_expr(pattern, row, ctx, outer_alias);
6065        }
6066        Expr::FunctionCall { args, .. } => {
6067            for a in args {
6068                substitute_in_expr(a, row, ctx, outer_alias);
6069            }
6070        }
6071        Expr::Extract { source, .. } => substitute_in_expr(source, row, ctx, outer_alias),
6072        Expr::WindowFunction {
6073            args,
6074            partition_by,
6075            order_by,
6076            ..
6077        } => {
6078            for a in args {
6079                substitute_in_expr(a, row, ctx, outer_alias);
6080            }
6081            for p in partition_by {
6082                substitute_in_expr(p, row, ctx, outer_alias);
6083            }
6084            for (o, _) in order_by {
6085                substitute_in_expr(o, row, ctx, outer_alias);
6086            }
6087        }
6088        Expr::ScalarSubquery(s) => substitute_in_select(s, row, ctx, outer_alias),
6089        Expr::Exists { subquery, .. } | Expr::InSubquery { subquery, .. } => {
6090            substitute_in_select(subquery, row, ctx, outer_alias);
6091        }
6092        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
6093    }
6094}
6095
6096/// v4.22: encode a Row to a comparable byte key for UNION-DISTINCT
6097/// dedup inside the recursive iteration. Crude but deterministic
6098/// — Debug prints embed type discriminants so NULL ≠ "" ≠ 0.
6099fn encode_row_key(row: &Row) -> Vec<u8> {
6100    let mut out = Vec::new();
6101    for v in &row.values {
6102        let s = alloc::format!("{v:?}|");
6103        out.extend_from_slice(s.as_bytes());
6104    }
6105    out
6106}
6107
6108fn select_has_window(stmt: &SelectStatement) -> bool {
6109    for item in &stmt.items {
6110        if let SelectItem::Expr { expr, .. } = item
6111            && expr_has_window(expr)
6112        {
6113            return true;
6114        }
6115    }
6116    false
6117}
6118
6119fn expr_has_window(e: &Expr) -> bool {
6120    match e {
6121        Expr::WindowFunction { .. } => true,
6122        Expr::Binary { lhs, rhs, .. } => expr_has_window(lhs) || expr_has_window(rhs),
6123        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6124            expr_has_window(expr)
6125        }
6126        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_window),
6127        Expr::Like { expr, pattern, .. } => expr_has_window(expr) || expr_has_window(pattern),
6128        Expr::Extract { source, .. } => expr_has_window(source),
6129        Expr::ScalarSubquery(_)
6130        | Expr::Exists { .. }
6131        | Expr::InSubquery { .. }
6132        | Expr::Literal(_)
6133        | Expr::Placeholder(_)
6134        | Expr::Column(_) => false,
6135    }
6136}
6137
6138fn collect_window_nodes(e: &Expr, out: &mut Vec<Expr>) {
6139    if let Expr::WindowFunction { .. } = e {
6140        // Deduplicate by structural equality on the expression
6141        // (cheap because window args + partition + order are
6142        // small). Without dedup we'd recompute identical windows
6143        // once per occurrence in the projection.
6144        if !out.iter().any(|x| x == e) {
6145            out.push(e.clone());
6146        }
6147        return;
6148    }
6149    match e {
6150        // Already handled by the early-return at the top.
6151        Expr::WindowFunction { .. } => unreachable!(),
6152        Expr::Binary { lhs, rhs, .. } => {
6153            collect_window_nodes(lhs, out);
6154            collect_window_nodes(rhs, out);
6155        }
6156        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6157            collect_window_nodes(expr, out);
6158        }
6159        Expr::FunctionCall { args, .. } => {
6160            for a in args {
6161                collect_window_nodes(a, out);
6162            }
6163        }
6164        Expr::Like { expr, pattern, .. } => {
6165            collect_window_nodes(expr, out);
6166            collect_window_nodes(pattern, out);
6167        }
6168        Expr::Extract { source, .. } => collect_window_nodes(source, out),
6169        _ => {}
6170    }
6171}
6172
6173fn rewrite_window_to_columns(e: &mut Expr, window_nodes: &[Expr]) {
6174    if let Expr::WindowFunction { .. } = e
6175        && let Some(idx) = window_nodes.iter().position(|w| w == e)
6176    {
6177        *e = Expr::Column(spg_sql::ast::ColumnName {
6178            qualifier: None,
6179            name: alloc::format!("__win_{idx}"),
6180        });
6181        return;
6182    }
6183    match e {
6184        Expr::Binary { lhs, rhs, .. } => {
6185            rewrite_window_to_columns(lhs, window_nodes);
6186            rewrite_window_to_columns(rhs, window_nodes);
6187        }
6188        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6189            rewrite_window_to_columns(expr, window_nodes);
6190        }
6191        Expr::FunctionCall { args, .. } => {
6192            for a in args {
6193                rewrite_window_to_columns(a, window_nodes);
6194            }
6195        }
6196        Expr::Like { expr, pattern, .. } => {
6197            rewrite_window_to_columns(expr, window_nodes);
6198            rewrite_window_to_columns(pattern, window_nodes);
6199        }
6200        Expr::Extract { source, .. } => rewrite_window_to_columns(source, window_nodes),
6201        _ => {}
6202    }
6203}
6204
6205/// Total order over partition-key tuples. NULL sorts as the
6206/// lowest value (matches the `<` partial order's NULL-last
6207/// behaviour with `INFINITY` flipped).
6208fn partition_key_cmp(a: &[Value], b: &[Value]) -> core::cmp::Ordering {
6209    for (x, y) in a.iter().zip(b.iter()) {
6210        let c = value_cmp(x, y);
6211        if c != core::cmp::Ordering::Equal {
6212            return c;
6213        }
6214    }
6215    a.len().cmp(&b.len())
6216}
6217
6218fn order_key_cmp(a: &[(Value, bool)], b: &[(Value, bool)]) -> core::cmp::Ordering {
6219    for ((va, desc), (vb, _)) in a.iter().zip(b.iter()) {
6220        let c = value_cmp(va, vb);
6221        let c = if *desc { c.reverse() } else { c };
6222        if c != core::cmp::Ordering::Equal {
6223            return c;
6224        }
6225    }
6226    a.len().cmp(&b.len())
6227}
6228
6229#[allow(clippy::match_same_arms)] // explicit arms per type document the supported pairs
6230fn value_cmp(a: &Value, b: &Value) -> core::cmp::Ordering {
6231    use core::cmp::Ordering;
6232    match (a, b) {
6233        (Value::Null, Value::Null) => Ordering::Equal,
6234        (Value::Null, _) => Ordering::Less,
6235        (_, Value::Null) => Ordering::Greater,
6236        (Value::Int(x), Value::Int(y)) => x.cmp(y),
6237        (Value::BigInt(x), Value::BigInt(y)) => x.cmp(y),
6238        (Value::SmallInt(x), Value::SmallInt(y)) => x.cmp(y),
6239        (Value::Text(x), Value::Text(y)) => x.cmp(y),
6240        (Value::Bool(x), Value::Bool(y)) => x.cmp(y),
6241        (Value::Float(x), Value::Float(y)) => x.partial_cmp(y).unwrap_or(Ordering::Equal),
6242        (Value::Date(x), Value::Date(y)) => x.cmp(y),
6243        (Value::Timestamp(x), Value::Timestamp(y)) => x.cmp(y),
6244        // Cross-type compare: fall back to the debug rendering —
6245        // same-partition is the goal, exact order is irrelevant.
6246        _ => alloc::format!("{a:?}").cmp(&alloc::format!("{b:?}")),
6247    }
6248}
6249
6250/// Compute the window function's per-row output for one partition.
6251/// `slice` has (partition key, order key, original-row-index)
6252/// tuples already sorted by order key. `filtered_rows` is the
6253/// full row list indexed by original-row-index. `out_vals` is
6254/// the destination, also indexed by original-row-index.
6255#[allow(
6256    clippy::too_many_arguments,
6257    clippy::cast_possible_truncation,
6258    clippy::cast_possible_wrap,
6259    clippy::cast_precision_loss,
6260    clippy::cast_sign_loss,
6261    clippy::doc_markdown,
6262    clippy::too_many_lines,
6263    clippy::type_complexity,
6264    clippy::match_same_arms
6265)]
6266fn compute_window_partition(
6267    name: &str,
6268    args: &[Expr],
6269    ordered: bool,
6270    frame: Option<&WindowFrame>,
6271    null_treatment: spg_sql::ast::NullTreatment,
6272    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6273    filtered_rows: &[&Row],
6274    ctx: &EvalContext<'_>,
6275    out_vals: &mut [Value],
6276) -> Result<(), EngineError> {
6277    let ignore_nulls = matches!(null_treatment, spg_sql::ast::NullTreatment::Ignore);
6278    let lower = name.to_ascii_lowercase();
6279    match lower.as_str() {
6280        "row_number" => {
6281            for (rank, (_, _, idx)) in slice.iter().enumerate() {
6282                out_vals[*idx] = Value::BigInt((rank + 1) as i64);
6283            }
6284            Ok(())
6285        }
6286        "rank" => {
6287            let mut prev_key: Option<&[(Value, bool)]> = None;
6288            let mut current_rank: i64 = 1;
6289            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6290                if let Some(p) = prev_key
6291                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6292                {
6293                    current_rank = (i + 1) as i64;
6294                }
6295                if prev_key.is_none() {
6296                    current_rank = 1;
6297                }
6298                out_vals[*idx] = Value::BigInt(current_rank);
6299                prev_key = Some(okey.as_slice());
6300            }
6301            Ok(())
6302        }
6303        "dense_rank" => {
6304            let mut prev_key: Option<&[(Value, bool)]> = None;
6305            let mut current_rank: i64 = 0;
6306            for (_, okey, idx) in slice {
6307                if prev_key.is_none_or(|p| order_key_cmp(p, okey) != core::cmp::Ordering::Equal) {
6308                    current_rank += 1;
6309                }
6310                out_vals[*idx] = Value::BigInt(current_rank);
6311                prev_key = Some(okey.as_slice());
6312            }
6313            Ok(())
6314        }
6315        "sum" | "avg" | "min" | "max" | "count" | "count_star" => {
6316            // Pre-evaluate the function arg per row in the slice
6317            // (count_star has no arg).
6318            let arg_values: Vec<Value> = if lower == "count_star" || args.is_empty() {
6319                slice.iter().map(|_| Value::Null).collect()
6320            } else {
6321                slice
6322                    .iter()
6323                    .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6324                    .collect::<Result<_, _>>()
6325                    .map_err(EngineError::Eval)?
6326            };
6327            // v4.20: pick the effective frame. Explicit frame
6328            // overrides the implicit default (running for ordered,
6329            // whole-partition for unordered).
6330            let eff = effective_frame(frame, ordered)?;
6331            #[allow(clippy::needless_range_loop)]
6332            for i in 0..slice.len() {
6333                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6334                let mut sum: f64 = 0.0;
6335                let mut count: i64 = 0;
6336                let mut min_v: Option<f64> = None;
6337                let mut max_v: Option<f64> = None;
6338                let mut row_count: i64 = 0;
6339                if lo <= hi {
6340                    for j in lo..=hi {
6341                        let v = &arg_values[j];
6342                        match lower.as_str() {
6343                            "count_star" => row_count += 1,
6344                            "count" => {
6345                                if !v.is_null() {
6346                                    count += 1;
6347                                }
6348                            }
6349                            _ => {
6350                                if let Some(x) = value_to_f64(v) {
6351                                    sum += x;
6352                                    count += 1;
6353                                    min_v = Some(min_v.map_or(x, |m| m.min(x)));
6354                                    max_v = Some(max_v.map_or(x, |m| m.max(x)));
6355                                }
6356                            }
6357                        }
6358                    }
6359                }
6360                let value = match lower.as_str() {
6361                    "count_star" => Value::BigInt(row_count),
6362                    "count" => Value::BigInt(count),
6363                    "sum" => Value::Float(sum),
6364                    "avg" => {
6365                        if count == 0 {
6366                            Value::Null
6367                        } else {
6368                            Value::Float(sum / count as f64)
6369                        }
6370                    }
6371                    "min" => min_v.map_or(Value::Null, Value::Float),
6372                    "max" => max_v.map_or(Value::Null, Value::Float),
6373                    _ => unreachable!(),
6374                };
6375                let (_, _, idx) = &slice[i];
6376                out_vals[*idx] = value;
6377            }
6378            Ok(())
6379        }
6380        "lag" | "lead" => {
6381            // lag(expr [, offset [, default]])
6382            // lead(expr [, offset [, default]])
6383            if args.is_empty() {
6384                return Err(EngineError::Unsupported(alloc::format!(
6385                    "{lower}() requires at least one argument"
6386                )));
6387            }
6388            let offset: i64 = if args.len() >= 2 {
6389                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6390                    .map_err(EngineError::Eval)?;
6391                match v {
6392                    Value::SmallInt(n) => i64::from(n),
6393                    Value::Int(n) => i64::from(n),
6394                    Value::BigInt(n) => n,
6395                    _ => {
6396                        return Err(EngineError::Unsupported(alloc::format!(
6397                            "{lower}() offset must be integer"
6398                        )));
6399                    }
6400                }
6401            } else {
6402                1
6403            };
6404            let default: Value = if args.len() >= 3 {
6405                eval::eval_expr(&args[2], filtered_rows[slice[0].2], ctx)
6406                    .map_err(EngineError::Eval)?
6407            } else {
6408                Value::Null
6409            };
6410            let values: Vec<Value> = slice
6411                .iter()
6412                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6413                .collect::<Result<_, _>>()
6414                .map_err(EngineError::Eval)?;
6415            let n = slice.len();
6416            for (i, (_, _, idx)) in slice.iter().enumerate() {
6417                let signed_offset = if lower == "lag" { -offset } else { offset };
6418                let v = if ignore_nulls {
6419                    // v6.4.2 — IGNORE NULLS: walk in the offset direction
6420                    // skipping NULL values; the `offset`-th non-NULL
6421                    // encountered is the result.
6422                    let step: i64 = if signed_offset >= 0 { 1 } else { -1 };
6423                    let needed: i64 = signed_offset.abs();
6424                    if needed == 0 {
6425                        values[i].clone()
6426                    } else {
6427                        let mut j: i64 = i as i64;
6428                        let mut hits: i64 = 0;
6429                        let mut found: Option<Value> = None;
6430                        loop {
6431                            j += step;
6432                            if j < 0 || j >= n as i64 {
6433                                break;
6434                            }
6435                            #[allow(clippy::cast_sign_loss)]
6436                            let v = &values[j as usize];
6437                            if !v.is_null() {
6438                                hits += 1;
6439                                if hits == needed {
6440                                    found = Some(v.clone());
6441                                    break;
6442                                }
6443                            }
6444                        }
6445                        found.unwrap_or_else(|| default.clone())
6446                    }
6447                } else {
6448                    let target_signed = i64::try_from(i).unwrap_or(i64::MAX) + signed_offset;
6449                    if target_signed < 0
6450                        || target_signed >= i64::try_from(n).unwrap_or(i64::MAX)
6451                    {
6452                        default.clone()
6453                    } else {
6454                        #[allow(clippy::cast_sign_loss)]
6455                        {
6456                            values[target_signed as usize].clone()
6457                        }
6458                    }
6459                };
6460                out_vals[*idx] = v;
6461            }
6462            Ok(())
6463        }
6464        "first_value" | "last_value" | "nth_value" => {
6465            if args.is_empty() {
6466                return Err(EngineError::Unsupported(alloc::format!(
6467                    "{lower}() requires at least one argument"
6468                )));
6469            }
6470            let values: Vec<Value> = slice
6471                .iter()
6472                .map(|(_, _, idx)| eval::eval_expr(&args[0], filtered_rows[*idx], ctx))
6473                .collect::<Result<_, _>>()
6474                .map_err(EngineError::Eval)?;
6475            let nth: usize = if lower == "nth_value" {
6476                if args.len() < 2 {
6477                    return Err(EngineError::Unsupported(
6478                        "nth_value() requires (expr, n)".into(),
6479                    ));
6480                }
6481                let v = eval::eval_expr(&args[1], filtered_rows[slice[0].2], ctx)
6482                    .map_err(EngineError::Eval)?;
6483                let raw = match v {
6484                    Value::SmallInt(n) => i64::from(n),
6485                    Value::Int(n) => i64::from(n),
6486                    Value::BigInt(n) => n,
6487                    _ => {
6488                        return Err(EngineError::Unsupported(
6489                            "nth_value() n must be integer".into(),
6490                        ));
6491                    }
6492                };
6493                if raw < 1 {
6494                    return Err(EngineError::Unsupported(
6495                        "nth_value() n must be >= 1".into(),
6496                    ));
6497                }
6498                #[allow(clippy::cast_sign_loss)]
6499                {
6500                    raw as usize
6501                }
6502            } else {
6503                0
6504            };
6505            let eff = effective_frame(frame, ordered)?;
6506            for i in 0..slice.len() {
6507                let (lo, hi) = frame_bounds_for_row(&eff, i, slice);
6508                let (_, _, idx) = &slice[i];
6509                let v = if lo > hi {
6510                    Value::Null
6511                } else if ignore_nulls && matches!(lower.as_str(), "first_value" | "last_value") {
6512                    // v6.4.2 — IGNORE NULLS: skip NULL cells when
6513                    // selecting the boundary value within the frame.
6514                    if lower == "first_value" {
6515                        (lo..=hi)
6516                            .find_map(|j| {
6517                                let v = &values[j];
6518                                (!v.is_null()).then(|| v.clone())
6519                            })
6520                            .unwrap_or(Value::Null)
6521                    } else {
6522                        (lo..=hi)
6523                            .rev()
6524                            .find_map(|j| {
6525                                let v = &values[j];
6526                                (!v.is_null()).then(|| v.clone())
6527                            })
6528                            .unwrap_or(Value::Null)
6529                    }
6530                } else {
6531                    match lower.as_str() {
6532                        "first_value" => values[lo].clone(),
6533                        "last_value" => values[hi].clone(),
6534                        "nth_value" => {
6535                            let pos = lo + nth - 1;
6536                            if pos > hi {
6537                                Value::Null
6538                            } else {
6539                                values[pos].clone()
6540                            }
6541                        }
6542                        _ => unreachable!(),
6543                    }
6544                };
6545                out_vals[*idx] = v;
6546            }
6547            Ok(())
6548        }
6549        "ntile" => {
6550            if args.is_empty() {
6551                return Err(EngineError::Unsupported(
6552                    "ntile(n) requires an integer argument".into(),
6553                ));
6554            }
6555            let v = eval::eval_expr(&args[0], filtered_rows[slice[0].2], ctx)
6556                .map_err(EngineError::Eval)?;
6557            let bucket_count: i64 = match v {
6558                Value::SmallInt(n) => i64::from(n),
6559                Value::Int(n) => i64::from(n),
6560                Value::BigInt(n) => n,
6561                _ => {
6562                    return Err(EngineError::Unsupported(
6563                        "ntile() argument must be integer".into(),
6564                    ));
6565                }
6566            };
6567            if bucket_count < 1 {
6568                return Err(EngineError::Unsupported(
6569                    "ntile() argument must be >= 1".into(),
6570                ));
6571            }
6572            #[allow(clippy::cast_sign_loss)]
6573            let buckets = bucket_count as usize;
6574            let n = slice.len();
6575            // Each bucket gets `base` rows; the first `extras` buckets
6576            // get one extra. PG semantics.
6577            let base = n / buckets;
6578            let extras = n % buckets;
6579            let mut bucket: usize = 1;
6580            let mut remaining_in_bucket = if extras > 0 { base + 1 } else { base };
6581            let mut buckets_with_extra_remaining = extras;
6582            for (_, _, idx) in slice {
6583                if remaining_in_bucket == 0 {
6584                    bucket += 1;
6585                    buckets_with_extra_remaining = buckets_with_extra_remaining.saturating_sub(1);
6586                    remaining_in_bucket = if buckets_with_extra_remaining > 0 {
6587                        base + 1
6588                    } else {
6589                        base
6590                    };
6591                    // Edge: if base==0 and extras==0, all rows fit;
6592                    // shouldn't reach here, but guard anyway.
6593                    if remaining_in_bucket == 0 {
6594                        remaining_in_bucket = 1;
6595                    }
6596                }
6597                out_vals[*idx] = Value::BigInt(i64::try_from(bucket).unwrap_or(i64::MAX));
6598                remaining_in_bucket -= 1;
6599            }
6600            Ok(())
6601        }
6602        "percent_rank" => {
6603            // (rank - 1) / (n - 1) where rank is the standard RANK().
6604            // Single-row partitions get 0.
6605            let n = slice.len();
6606            let mut prev_key: Option<&[(Value, bool)]> = None;
6607            let mut current_rank: i64 = 1;
6608            for (i, (_, okey, idx)) in slice.iter().enumerate() {
6609                if let Some(p) = prev_key
6610                    && order_key_cmp(p, okey) != core::cmp::Ordering::Equal
6611                {
6612                    current_rank = i64::try_from(i + 1).unwrap_or(i64::MAX);
6613                }
6614                if prev_key.is_none() {
6615                    current_rank = 1;
6616                }
6617                #[allow(clippy::cast_precision_loss)]
6618                let pr = if n <= 1 {
6619                    0.0
6620                } else {
6621                    (current_rank - 1) as f64 / (n - 1) as f64
6622                };
6623                out_vals[*idx] = Value::Float(pr);
6624                prev_key = Some(okey.as_slice());
6625            }
6626            Ok(())
6627        }
6628        "cume_dist" => {
6629            // # rows up to and including this row's peer group / n.
6630            let n = slice.len();
6631            // First pass: find peer-group-end rank for each row.
6632            for i in 0..slice.len() {
6633                let peer_end = peer_group_end(slice, i);
6634                #[allow(clippy::cast_precision_loss)]
6635                let cd = (peer_end + 1) as f64 / n as f64;
6636                let (_, _, idx) = &slice[i];
6637                out_vals[*idx] = Value::Float(cd);
6638            }
6639            Ok(())
6640        }
6641        other => Err(EngineError::Unsupported(alloc::format!(
6642            "window function {other:?} not supported (v4.21: row_number/rank/dense_rank/sum/avg/count/min/max/lag/lead/first_value/last_value/nth_value/ntile/percent_rank/cume_dist)"
6643        ))),
6644    }
6645}
6646
6647/// v4.20: resolve the user-provided frame down to a normalised
6648/// `(kind, start, end)`. `None` means default — derive from
6649/// `ordered`: ordered ⇒ RANGE UNBOUNDED PRECEDING AND CURRENT ROW,
6650/// unordered ⇒ ROWS UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.
6651/// Single-bound shorthand (e.g. `ROWS 5 PRECEDING`) normalises
6652/// end → CURRENT ROW per the PG spec.
6653fn effective_frame(
6654    frame: Option<&WindowFrame>,
6655    ordered: bool,
6656) -> Result<(FrameKind, FrameBound, FrameBound), EngineError> {
6657    match frame {
6658        None => {
6659            if ordered {
6660                Ok((
6661                    FrameKind::Range,
6662                    FrameBound::UnboundedPreceding,
6663                    FrameBound::CurrentRow,
6664                ))
6665            } else {
6666                Ok((
6667                    FrameKind::Rows,
6668                    FrameBound::UnboundedPreceding,
6669                    FrameBound::UnboundedFollowing,
6670                ))
6671            }
6672        }
6673        Some(fr) => {
6674            let end = fr.end.clone().unwrap_or(FrameBound::CurrentRow);
6675            // Reject start > end (a few impossible combinations).
6676            if matches!(fr.start, FrameBound::UnboundedFollowing)
6677                || matches!(end, FrameBound::UnboundedPreceding)
6678            {
6679                return Err(EngineError::Unsupported(alloc::format!(
6680                    "invalid frame: start={:?} end={:?}",
6681                    fr.start,
6682                    end
6683                )));
6684            }
6685            // RANGE OFFSET PRECEDING / FOLLOWING needs value-typed
6686            // arithmetic on the ORDER BY key (e.g. `RANGE BETWEEN
6687            // INTERVAL '1 day' PRECEDING AND CURRENT ROW`). Not
6688            // implemented in v4.20.
6689            if fr.kind == FrameKind::Range
6690                && (matches!(
6691                    fr.start,
6692                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6693                ) || matches!(
6694                    end,
6695                    FrameBound::OffsetPreceding(_) | FrameBound::OffsetFollowing(_)
6696                ))
6697            {
6698                return Err(EngineError::Unsupported(
6699                    "RANGE with explicit offset bounds is not supported (v4.20: only UNBOUNDED / CURRENT ROW for RANGE)".into(),
6700                ));
6701            }
6702            Ok((fr.kind, fr.start.clone(), end))
6703        }
6704    }
6705}
6706
6707/// Compute `(lo, hi)` row-index bounds inside the partition slice
6708/// for the row at position `i`. Inclusive, clamped to
6709/// `[0, slice.len()-1]`. Empty result if `lo > hi`.
6710#[allow(clippy::type_complexity)]
6711fn frame_bounds_for_row(
6712    eff: &(FrameKind, FrameBound, FrameBound),
6713    i: usize,
6714    slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)],
6715) -> (usize, usize) {
6716    let (kind, start, end) = eff;
6717    let n = slice.len();
6718    let last = n.saturating_sub(1);
6719    let (mut lo, mut hi) = match kind {
6720        FrameKind::Rows => {
6721            let lo = match start {
6722                FrameBound::UnboundedPreceding => 0,
6723                FrameBound::OffsetPreceding(k) => {
6724                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6725                    i.saturating_sub(k)
6726                }
6727                FrameBound::CurrentRow => i,
6728                FrameBound::OffsetFollowing(k) => {
6729                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6730                    i.saturating_add(k).min(last)
6731                }
6732                FrameBound::UnboundedFollowing => last,
6733            };
6734            let hi = match end {
6735                FrameBound::UnboundedPreceding => 0,
6736                FrameBound::OffsetPreceding(k) => {
6737                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6738                    i.saturating_sub(k)
6739                }
6740                FrameBound::CurrentRow => i,
6741                FrameBound::OffsetFollowing(k) => {
6742                    let k = usize::try_from(*k).unwrap_or(usize::MAX);
6743                    i.saturating_add(k).min(last)
6744                }
6745                FrameBound::UnboundedFollowing => last,
6746            };
6747            (lo, hi)
6748        }
6749        FrameKind::Range => {
6750            // RANGE bounds are peer-aware. With only UNBOUNDED and
6751            // CURRENT ROW supported (rejected at effective_frame for
6752            // explicit offsets), the start/end map to the
6753            // partition's full extent at the same-order-key peer
6754            // group boundary.
6755            let lo = match start {
6756                FrameBound::UnboundedPreceding => 0,
6757                FrameBound::CurrentRow => peer_group_start(slice, i),
6758                FrameBound::UnboundedFollowing => last,
6759                _ => unreachable!("offset bounds rejected for RANGE"),
6760            };
6761            let hi = match end {
6762                FrameBound::UnboundedPreceding => 0,
6763                FrameBound::CurrentRow => peer_group_end(slice, i),
6764                FrameBound::UnboundedFollowing => last,
6765                _ => unreachable!("offset bounds rejected for RANGE"),
6766            };
6767            (lo, hi)
6768        }
6769    };
6770    if hi >= n {
6771        hi = last;
6772    }
6773    if lo >= n {
6774        lo = last;
6775    }
6776    (lo, hi)
6777}
6778
6779/// Find the inclusive index of the first row with the same ORDER
6780/// BY key as `slice[i]`. Slice is already sorted by partition then
6781/// order, so peers are contiguous.
6782#[allow(clippy::type_complexity)]
6783fn peer_group_start(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6784    let key = &slice[i].1;
6785    let mut j = i;
6786    while j > 0 && order_key_cmp(&slice[j - 1].1, key) == core::cmp::Ordering::Equal {
6787        j -= 1;
6788    }
6789    j
6790}
6791
6792/// Find the inclusive index of the last row with the same ORDER
6793/// BY key as `slice[i]`.
6794#[allow(clippy::type_complexity)]
6795fn peer_group_end(slice: &[(Vec<Value>, Vec<(Value, bool)>, usize)], i: usize) -> usize {
6796    let key = &slice[i].1;
6797    let mut j = i;
6798    while j + 1 < slice.len() && order_key_cmp(&slice[j + 1].1, key) == core::cmp::Ordering::Equal {
6799        j += 1;
6800    }
6801    j
6802}
6803
6804fn value_to_f64(v: &Value) -> Option<f64> {
6805    match v {
6806        Value::SmallInt(n) => Some(f64::from(*n)),
6807        Value::Int(n) => Some(f64::from(*n)),
6808        #[allow(clippy::cast_precision_loss)]
6809        Value::BigInt(n) => Some(*n as f64),
6810        Value::Float(x) => Some(*x),
6811        _ => None,
6812    }
6813}
6814
6815/// Quick scan for any subquery-bearing node in a SELECT's WHERE /
6816/// projection / `order_by` — saves cloning the AST when there are
6817/// none (the common case).
6818fn expr_tree_has_subquery(stmt: &SelectStatement) -> bool {
6819    let mut any = false;
6820    for item in &stmt.items {
6821        if let SelectItem::Expr { expr, .. } = item {
6822            any = any || expr_has_subquery(expr);
6823        }
6824    }
6825    if let Some(w) = &stmt.where_ {
6826        any = any || expr_has_subquery(w);
6827    }
6828    if let Some(h) = &stmt.having {
6829        any = any || expr_has_subquery(h);
6830    }
6831    for o in &stmt.order_by {
6832        any = any || expr_has_subquery(&o.expr);
6833    }
6834    for (_, peer) in &stmt.unions {
6835        any = any || expr_tree_has_subquery(peer);
6836    }
6837    any
6838}
6839
6840fn expr_has_subquery(e: &Expr) -> bool {
6841    match e {
6842        Expr::ScalarSubquery(_) | Expr::Exists { .. } | Expr::InSubquery { .. } => true,
6843        Expr::Binary { lhs, rhs, .. } => expr_has_subquery(lhs) || expr_has_subquery(rhs),
6844        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
6845            expr_has_subquery(expr)
6846        }
6847        Expr::FunctionCall { args, .. } => args.iter().any(expr_has_subquery),
6848        Expr::Like { expr, pattern, .. } => expr_has_subquery(expr) || expr_has_subquery(pattern),
6849        Expr::Extract { source, .. } => expr_has_subquery(source),
6850        Expr::WindowFunction {
6851            args,
6852            partition_by,
6853            order_by,
6854            ..
6855        } => {
6856            args.iter().any(expr_has_subquery)
6857                || partition_by.iter().any(expr_has_subquery)
6858                || order_by.iter().any(|(e, _)| expr_has_subquery(e))
6859        }
6860        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => false,
6861    }
6862}
6863
6864/// v4.10 helper: materialise a runtime `Value` back into an AST
6865/// `Expr::Literal` for the subquery-rewrite path. Supports the
6866/// types `Literal` can represent (Integer / Float / Text / Bool /
6867/// Null). Date / Timestamp / Numeric / Vector / Interval / JSON
6868/// would lose precision through Literal and aren't supported in
6869/// uncorrelated-subquery results; they error with a clear hint.
6870fn value_to_literal_expr(v: Value) -> Result<Expr, EngineError> {
6871    let lit = match v {
6872        Value::Null => Literal::Null,
6873        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
6874        Value::Int(n) => Literal::Integer(i64::from(n)),
6875        Value::BigInt(n) => Literal::Integer(n),
6876        Value::Float(x) => Literal::Float(x),
6877        Value::Text(s) | Value::Json(s) => Literal::String(s),
6878        Value::Bool(b) => Literal::Bool(b),
6879        other => {
6880            return Err(EngineError::Unsupported(alloc::format!(
6881                "subquery result type {:?} not yet materialisable; cast to text or integer in the inner SELECT",
6882                other.data_type()
6883            )));
6884        }
6885    };
6886    Ok(Expr::Literal(lit))
6887}
6888
6889/// v6.1.1 — walk the prepared `Statement` AST and replace every
6890/// `Expr::Placeholder(n)` with `Expr::Literal(value_to_literal(
6891/// params[n-1]))`. The dispatch downstream sees a `Statement`
6892/// indistinguishable from a simple-query parse, so the exec path
6893/// stays unchanged.
6894///
6895/// Errors fall into one shape: a `$N` references past the bound
6896/// `params.len()`. Out-of-range happens when the Bind didn't
6897/// supply enough values; pgwire surfaces this as a protocol error
6898/// to the client.
6899fn substitute_placeholders(stmt: &mut Statement, params: &[Value]) -> Result<(), EngineError> {
6900    match stmt {
6901        Statement::Select(s) => substitute_select(s, params)?,
6902        Statement::Insert(ins) => {
6903            for row in &mut ins.rows {
6904                for e in row {
6905                    substitute_expr(e, params)?;
6906                }
6907            }
6908        }
6909        Statement::Update(u) => {
6910            for (_, e) in &mut u.assignments {
6911                substitute_expr(e, params)?;
6912            }
6913            if let Some(w) = &mut u.where_ {
6914                substitute_expr(w, params)?;
6915            }
6916        }
6917        Statement::Delete(d) => {
6918            if let Some(w) = &mut d.where_ {
6919                substitute_expr(w, params)?;
6920            }
6921        }
6922        Statement::Explain(e) => substitute_select(&mut e.inner, params)?,
6923        // Other statements (CREATE / BEGIN / SHOW / …) have no
6924        // expression slots; no walk needed.
6925        _ => {}
6926    }
6927    Ok(())
6928}
6929
6930fn substitute_select(
6931    s: &mut SelectStatement,
6932    params: &[Value],
6933) -> Result<(), EngineError> {
6934    for item in &mut s.items {
6935        if let SelectItem::Expr { expr, .. } = item {
6936            substitute_expr(expr, params)?;
6937        }
6938    }
6939    if let Some(w) = &mut s.where_ {
6940        substitute_expr(w, params)?;
6941    }
6942    if let Some(gs) = &mut s.group_by {
6943        for g in gs {
6944            substitute_expr(g, params)?;
6945        }
6946    }
6947    if let Some(h) = &mut s.having {
6948        substitute_expr(h, params)?;
6949    }
6950    for o in &mut s.order_by {
6951        substitute_expr(&mut o.expr, params)?;
6952    }
6953    for (_, peer) in &mut s.unions {
6954        substitute_select(peer, params)?;
6955    }
6956    // v7.9.24 — LIMIT $N / OFFSET $N placeholder resolution.
6957    // mailrs H2. After this pass each LIMIT/OFFSET that was a
6958    // Placeholder is rewritten to Literal so the existing
6959    // `LimitExpr::as_literal` path consumes a concrete u32.
6960    if let Some(le) = s.limit {
6961        s.limit = Some(resolve_limit_placeholder(le, params)?);
6962    }
6963    if let Some(le) = s.offset {
6964        s.offset = Some(resolve_limit_placeholder(le, params)?);
6965    }
6966    Ok(())
6967}
6968
6969fn resolve_limit_placeholder(
6970    le: spg_sql::ast::LimitExpr,
6971    params: &[Value],
6972) -> Result<spg_sql::ast::LimitExpr, EngineError> {
6973    use spg_sql::ast::LimitExpr;
6974    match le {
6975        LimitExpr::Literal(_) => Ok(le),
6976        LimitExpr::Placeholder(n) => {
6977            let idx = usize::from(n).saturating_sub(1);
6978            let v = params.get(idx).ok_or_else(|| {
6979                EngineError::Eval(EvalError::PlaceholderOutOfRange {
6980                    n,
6981                    bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
6982                })
6983            })?;
6984            let int = match v {
6985                Value::SmallInt(x) => Some(i64::from(*x)),
6986                Value::Int(x) => Some(i64::from(*x)),
6987                Value::BigInt(x) => Some(*x),
6988                _ => None,
6989            }
6990            .ok_or_else(|| {
6991                EngineError::Unsupported(alloc::format!(
6992                    "LIMIT/OFFSET ${n} bound to non-integer {v:?}"
6993                ))
6994            })?;
6995            if int < 0 {
6996                return Err(EngineError::Unsupported(alloc::format!(
6997                    "LIMIT/OFFSET ${n} bound to negative value {int}"
6998                )));
6999            }
7000            let bounded = u32::try_from(int).map_err(|_| {
7001                EngineError::Unsupported(alloc::format!(
7002                    "LIMIT/OFFSET ${n} value {int} exceeds u32 range"
7003                ))
7004            })?;
7005            Ok(LimitExpr::Literal(bounded))
7006        }
7007    }
7008}
7009
7010fn substitute_expr(e: &mut Expr, params: &[Value]) -> Result<(), EngineError> {
7011    if let Expr::Placeholder(n) = e {
7012        let idx = usize::from(*n).saturating_sub(1);
7013        let v = params.get(idx).ok_or_else(|| {
7014            EngineError::Eval(EvalError::PlaceholderOutOfRange {
7015                n: *n,
7016                bound: u16::try_from(params.len()).unwrap_or(u16::MAX),
7017            })
7018        })?;
7019        *e = Expr::Literal(value_to_literal(v.clone()));
7020        return Ok(());
7021    }
7022    match e {
7023        Expr::Binary { lhs, rhs, .. } => {
7024            substitute_expr(lhs, params)?;
7025            substitute_expr(rhs, params)?;
7026        }
7027        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7028            substitute_expr(expr, params)?;
7029        }
7030        Expr::FunctionCall { args, .. } => {
7031            for a in args {
7032                substitute_expr(a, params)?;
7033            }
7034        }
7035        Expr::Like { expr, pattern, .. } => {
7036            substitute_expr(expr, params)?;
7037            substitute_expr(pattern, params)?;
7038        }
7039        Expr::Extract { source, .. } => substitute_expr(source, params)?,
7040        Expr::ScalarSubquery(s) => substitute_select(s, params)?,
7041        Expr::Exists { subquery, .. } => substitute_select(subquery, params)?,
7042        Expr::InSubquery { expr, subquery, .. } => {
7043            substitute_expr(expr, params)?;
7044            substitute_select(subquery, params)?;
7045        }
7046        Expr::WindowFunction {
7047            args,
7048            partition_by,
7049            order_by,
7050            ..
7051        } => {
7052            for a in args {
7053                substitute_expr(a, params)?;
7054            }
7055            for p in partition_by {
7056                substitute_expr(p, params)?;
7057            }
7058            for (e, _) in order_by {
7059                substitute_expr(e, params)?;
7060            }
7061        }
7062        Expr::Literal(_) | Expr::Column(_) => {}
7063        // Already handled above.
7064        Expr::Placeholder(_) => unreachable!("Placeholder handled at top of fn"),
7065    }
7066    Ok(())
7067}
7068
// NOTE(review): the paragraph below appears to document
// `value_to_literal` (the helper `substitute_expr` calls), not the
// function that follows. It is kept as a plain comment so it does
// not become part of the rustdoc for `sort_values_for_histogram`.
//
// v6.1.1 — convert a runtime `Value` into the closest matching
// `Literal` for the substitute walker. Lossless for the simple
// scalars (Int / Float / Text / Bool); Numeric / Date / Timestamp
// / Json / Interval render as their canonical text form so the
// downstream `coerce_value` can re-parse against the target column
// type. SQ8 / HalfVector cells are NOT expected as bind params;
// pgwire's Bind decodes vector params to the f32 representation
// before they reach this helper.

7077/// v6.2.0 — total ordering on `Value`s used by ANALYZE to sort a
7078/// column's non-NULL sample before histogram building. Cross-type
7079/// pairs (Int vs Float, Date vs Timestamp, …) compare via the
7080/// same widening the eval-side `compare` operator uses; everything
7081/// else (the genuinely-incompatible pairs) falls back to ordering
7082/// by canonical string form so the sort is still total + stable.
7083/// Vector / SQ8 / Half / Json / Numeric / Interval values reach
7084/// here only via the string-fallback path because vector columns
7085/// are filtered out upstream.
7086fn sort_values_for_histogram(a: &Value, b: &Value) -> core::cmp::Ordering {
7087    use core::cmp::Ordering;
7088    match (a, b) {
7089        (Value::SmallInt(a), Value::SmallInt(b)) => a.cmp(b),
7090        (Value::Int(a), Value::Int(b)) => a.cmp(b),
7091        (Value::BigInt(a), Value::BigInt(b)) => a.cmp(b),
7092        (Value::SmallInt(a), Value::Int(b)) => i32::from(*a).cmp(b),
7093        (Value::Int(a), Value::SmallInt(b)) => a.cmp(&i32::from(*b)),
7094        (Value::Int(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
7095        (Value::BigInt(a), Value::Int(b)) => a.cmp(&i64::from(*b)),
7096        (Value::SmallInt(a), Value::BigInt(b)) => i64::from(*a).cmp(b),
7097        (Value::BigInt(a), Value::SmallInt(b)) => a.cmp(&i64::from(*b)),
7098        (Value::Float(a), Value::Float(b)) => a.partial_cmp(b).unwrap_or(Ordering::Equal),
7099        (Value::Text(a), Value::Text(b)) | (Value::Json(a), Value::Json(b)) => a.cmp(b),
7100        (Value::Bool(a), Value::Bool(b)) => a.cmp(b),
7101        (Value::Date(a), Value::Date(b)) => a.cmp(b),
7102        (Value::Timestamp(a), Value::Timestamp(b)) => a.cmp(b),
7103        // Mixed numeric/float — widen to f64 and compare.
7104        (Value::SmallInt(n), Value::Float(x)) => {
7105            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
7106        }
7107        (Value::Float(x), Value::SmallInt(n)) => {
7108            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
7109        }
7110        (Value::Int(n), Value::Float(x)) => {
7111            (f64::from(*n)).partial_cmp(x).unwrap_or(Ordering::Equal)
7112        }
7113        (Value::Float(x), Value::Int(n)) => {
7114            x.partial_cmp(&f64::from(*n)).unwrap_or(Ordering::Equal)
7115        }
7116        (Value::BigInt(n), Value::Float(x)) => {
7117            #[allow(clippy::cast_precision_loss)]
7118            let nf = *n as f64;
7119            nf.partial_cmp(x).unwrap_or(Ordering::Equal)
7120        }
7121        (Value::Float(x), Value::BigInt(n)) => {
7122            #[allow(clippy::cast_precision_loss)]
7123            let nf = *n as f64;
7124            x.partial_cmp(&nf).unwrap_or(Ordering::Equal)
7125        }
7126        // Cross-type fallback: lexicographic on canonical form.
7127        // Total + stable so the sort is well-defined.
7128        _ => canonical_value_repr(a).cmp(&canonical_value_repr(b)),
7129    }
7130}
7131
/// v6.2.0 — formats histogram bounds as a single `[v0, v1, ...]`
/// string for the `spg_statistic.histogram_bounds` column.
///
/// Entries are separated by `", "`. An entry is wrapped in double
/// quotes when it is empty or contains `,`, `[`, `]` or `"`; inside
/// the quotes every `"` and `\` is prefixed with a backslash. All
/// other entries are emitted verbatim. The output is meant for human
/// consumption, so the escaping is deliberately conservative.
fn render_histogram_bounds(bounds: &[alloc::string::String]) -> alloc::string::String {
    let mut rendered = alloc::string::String::with_capacity(bounds.len() * 8 + 2);
    rendered.push('[');
    // Empty before the first entry, ", " before every later one.
    let mut separator = "";
    for bound in bounds {
        rendered.push_str(separator);
        separator = ", ";

        let is_bare =
            !bound.is_empty() && !bound.chars().any(|c| matches!(c, ',' | '[' | ']' | '"'));
        if is_bare {
            rendered.push_str(bound);
            continue;
        }

        rendered.push('"');
        for c in bound.chars() {
            if matches!(c, '"' | '\\') {
                rendered.push('\\');
            }
            rendered.push(c);
        }
        rendered.push('"');
    }
    rendered.push(']');
    rendered
}
7162
7163/// v6.2.0 — canonical textual form of a `Value` for histogram
7164/// bound storage. Strings used by ANALYZE for sort + bound output.
7165/// INT / BIGINT → decimal; FLOAT → shortest-round-trip via
7166/// `{:?}`; TEXT pass-through; BOOL → `t` / `f`; DATE / TIMESTAMP →
7167/// the same form `format_date` / `format_timestamp` produce for
7168/// SQL Display. Vector / SQ8 / Half / Json / Numeric / Interval
7169/// reach this only via a non-Vector column (vector columns are
7170/// skipped upstream); they fall back to a Debug-derived form so
7171/// stats still serialise without crashing.
7172pub(crate) fn canonical_value_repr(v: &Value) -> alloc::string::String {
7173    match v {
7174        Value::Null => "NULL".to_string(),
7175        Value::SmallInt(n) => alloc::format!("{n}"),
7176        Value::Int(n) => alloc::format!("{n}"),
7177        Value::BigInt(n) => alloc::format!("{n}"),
7178        Value::Float(x) => alloc::format!("{x:?}"),
7179        Value::Text(s) | Value::Json(s) => s.clone(),
7180        Value::Bool(b) => if *b { "t" } else { "f" }.to_string(),
7181        Value::Date(d) => eval::format_date(*d),
7182        Value::Timestamp(t) => eval::format_timestamp(*t),
7183        Value::Interval { months, micros } => eval::format_interval(*months, *micros),
7184        Value::Numeric { scaled, scale } => eval::format_numeric(*scaled, *scale),
7185        Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_) => {
7186            // Unreachable in practice (vector columns are filtered
7187            // out before this). Defensive fallback so a future
7188            // vector-stats path doesn't crash.
7189            alloc::format!("{v:?}")
7190        }
7191        // v7.5.0 — Value is #[non_exhaustive] for downstream
7192        // forward-compat. Future variants fall through to Debug
7193        // form here (same shape as the vector fallback above).
7194        _ => alloc::format!("{v:?}"),
7195    }
7196}
7197
/// v6.2.0 — reports whether a table name belongs to an engine-managed
/// catalog table that a bare `ANALYZE` (no target) should skip.
///
/// Currently always returns `false`: publications, subscriptions,
/// users and statistics are engine fields rather than catalog tables,
/// so there is nothing to exclude and every user table is analysed.
/// The function exists as a hook for when internal tables appear.
const fn is_internal_table_name(_name: &str) -> bool {
    false
}
7207
7208fn value_to_literal(v: Value) -> Literal {
7209    match v {
7210        Value::Null => Literal::Null,
7211        Value::SmallInt(n) => Literal::Integer(i64::from(n)),
7212        Value::Int(n) => Literal::Integer(i64::from(n)),
7213        Value::BigInt(n) => Literal::Integer(n),
7214        Value::Float(x) => Literal::Float(x),
7215        Value::Text(s) | Value::Json(s) => Literal::String(s),
7216        Value::Bool(b) => Literal::Bool(b),
7217        Value::Vector(v) => Literal::Vector(v),
7218        Value::Numeric { scaled, scale } => {
7219            Literal::String(eval::format_numeric(scaled, scale))
7220        }
7221        Value::Date(d) => Literal::String(eval::format_date(d)),
7222        Value::Timestamp(t) => Literal::String(eval::format_timestamp(t)),
7223        Value::Interval { months, micros } => Literal::Interval {
7224            months,
7225            micros,
7226            text: eval::format_interval(months, micros),
7227        },
7228        // SQ8 / halfvec cells dequantise to f32 before reaching the
7229        // substitute walker; pgwire's Bind path handles that.
7230        Value::Sq8Vector(q) => Literal::Vector(spg_storage::quantize::dequantize(&q)),
7231        Value::HalfVector(h) => Literal::Vector(h.to_f32_vec()),
7232        // v7.5.0 — Value is #[non_exhaustive]; future variants
7233        // render as Debug-form String literal until explicit
7234        // mapping is added.
7235        v => Literal::String(alloc::format!("{v:?}")),
7236    }
7237}
7238
7239fn rewrite_clock_calls(stmt: &mut Statement, now_micros: Option<i64>) {
7240    let Some(now) = now_micros else {
7241        return;
7242    };
7243    match stmt {
7244        Statement::Select(s) => rewrite_select_clock(s, now),
7245        Statement::Insert(ins) => {
7246            for row in &mut ins.rows {
7247                for e in row {
7248                    rewrite_expr_clock(e, now);
7249                }
7250            }
7251        }
7252        _ => {}
7253    }
7254}
7255
7256fn rewrite_select_clock(s: &mut SelectStatement, now: i64) {
7257    for item in &mut s.items {
7258        if let SelectItem::Expr { expr, .. } = item {
7259            rewrite_expr_clock(expr, now);
7260        }
7261    }
7262    if let Some(w) = &mut s.where_ {
7263        rewrite_expr_clock(w, now);
7264    }
7265    if let Some(gs) = &mut s.group_by {
7266        for g in gs {
7267            rewrite_expr_clock(g, now);
7268        }
7269    }
7270    if let Some(h) = &mut s.having {
7271        rewrite_expr_clock(h, now);
7272    }
7273    for o in &mut s.order_by {
7274        rewrite_expr_clock(&mut o.expr, now);
7275    }
7276    for (_, peer) in &mut s.unions {
7277        rewrite_select_clock(peer, now);
7278    }
7279}
7280
7281/// v3.0.3 hot path: every recursion lands in exactly one `match` arm.
7282/// Literal / Column-with-qualifier (the dominant cases on a typical
7283/// AST) take a single pattern dispatch and exit. The clock-rewrite
7284/// targets (zero-arg `NOW` / `CURRENT_TIMESTAMP` / `CURRENT_DATE`
7285/// functions, and bare `CURRENT_TIMESTAMP` / `CURRENT_DATE` column
7286/// refs) sit on their own arms with match guards so the fall-through
7287/// to the recursive arms is unambiguous.
7288fn rewrite_expr_clock(e: &mut Expr, now: i64) {
7289    // Fast-path test on the no-recursion shapes first. We can't fold
7290    // them into the big match below because they need to *replace* `e`
7291    // outright; the recursive arms below match on its sub-fields.
7292    if let Some(replacement) = clock_replacement_for(e, now) {
7293        *e = replacement;
7294        return;
7295    }
7296    match e {
7297        Expr::Binary { lhs, rhs, .. } => {
7298            rewrite_expr_clock(lhs, now);
7299            rewrite_expr_clock(rhs, now);
7300        }
7301        Expr::Unary { expr, .. } | Expr::Cast { expr, .. } | Expr::IsNull { expr, .. } => {
7302            rewrite_expr_clock(expr, now);
7303        }
7304        Expr::FunctionCall { args, .. } => {
7305            for a in args {
7306                rewrite_expr_clock(a, now);
7307            }
7308        }
7309        Expr::Like { expr, pattern, .. } => {
7310            rewrite_expr_clock(expr, now);
7311            rewrite_expr_clock(pattern, now);
7312        }
7313        Expr::Extract { source, .. } => rewrite_expr_clock(source, now),
7314        // v4.10 subquery nodes — recurse into the inner SELECT's
7315        // expression slots so e.g. SELECT NOW() in a scalar
7316        // subquery picks up the same instant as the outer query.
7317        Expr::ScalarSubquery(s) => rewrite_select_clock(s, now),
7318        Expr::Exists { subquery, .. } => rewrite_select_clock(subquery, now),
7319        Expr::InSubquery { expr, subquery, .. } => {
7320            rewrite_expr_clock(expr, now);
7321            rewrite_select_clock(subquery, now);
7322        }
7323        // v4.12 window functions — args + PARTITION BY + ORDER BY
7324        // may all reference clock literals.
7325        Expr::WindowFunction {
7326            args,
7327            partition_by,
7328            order_by,
7329            ..
7330        } => {
7331            for a in args {
7332                rewrite_expr_clock(a, now);
7333            }
7334            for p in partition_by {
7335                rewrite_expr_clock(p, now);
7336            }
7337            for (e, _) in order_by {
7338                rewrite_expr_clock(e, now);
7339            }
7340        }
7341        Expr::Literal(_) | Expr::Placeholder(_) | Expr::Column(_) => {}
7342    }
7343}
7344
7345/// Returns `Some(Expr)` when `e` is one of the clock-call shapes that
7346/// must be rewritten; otherwise `None` so the caller falls through to
7347/// the recursive walk. Identifies both function-call forms (`NOW()` /
7348/// `CURRENT_TIMESTAMP()` / `CURRENT_DATE()`) and bare-identifier forms
7349/// (`CURRENT_TIMESTAMP` / `CURRENT_DATE` as unqualified column refs,
7350/// which is how PG accepts them without parens).
7351fn clock_replacement_for(e: &Expr, now: i64) -> Option<Expr> {
7352    let (kind, name) = match e {
7353        Expr::FunctionCall { name, args } if args.is_empty() => (ClockSite::Fn, name.as_str()),
7354        Expr::Column(c) if c.qualifier.is_none() => (ClockSite::BareIdent, c.name.as_str()),
7355        _ => return None,
7356    };
7357    // ASCII case-insensitive name match. Limited to the three keywords
7358    // that actually need rewriting.
7359    let matched = match name.len() {
7360        3 if kind == ClockSite::Fn && name.eq_ignore_ascii_case("now") => Some(true),
7361        12 if name.eq_ignore_ascii_case("current_date") => Some(false),
7362        17 if name.eq_ignore_ascii_case("current_timestamp") => Some(true),
7363        _ => None,
7364    };
7365    let is_timestamp = matched?;
7366    let payload = if is_timestamp {
7367        now
7368    } else {
7369        now.div_euclid(86_400_000_000)
7370    };
7371    let target = if is_timestamp {
7372        spg_sql::ast::CastTarget::Timestamp
7373    } else {
7374        spg_sql::ast::CastTarget::Date
7375    };
7376    Some(Expr::Cast {
7377        expr: alloc::boxed::Box::new(Expr::Literal(spg_sql::ast::Literal::Integer(payload))),
7378        target,
7379    })
7380}
7381
/// Syntactic position in which a candidate clock keyword was found;
/// used by `clock_replacement_for` to decide which names qualify.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ClockSite {
    /// A function call with an empty argument list, e.g. `NOW()`.
    Fn,
    /// An unqualified column reference, e.g. `CURRENT_DATE`.
    BareIdent,
}
7387
7388/// `ORDER BY <integer>` references the N-th SELECT item (1-based).
7389/// Swap the integer literal for the matching item's expression so the
7390/// executor doesn't need a special-case branch. Recurses into UNION
7391/// peers because each peer keeps its own SELECT list.
7392/// v6.4.1 — expand `GROUP BY ALL` to every non-aggregate SELECT-list
7393/// item. Mirrors DuckDB / PG 19 semantics. Wildcards (`SELECT * …`)
7394/// are NOT expanded by GROUP BY ALL (PG 19 leaves the wildcard intact
7395/// and groups by whatever explicit non-aggregates remain — none in
7396/// the wildcard-only case, which still works for non-aggregate
7397/// queries).
7398fn expand_group_by_all(s: &mut SelectStatement) {
7399    if !s.group_by_all {
7400        for (_, peer) in &mut s.unions {
7401            expand_group_by_all(peer);
7402        }
7403        return;
7404    }
7405    let mut groups: Vec<Expr> = Vec::new();
7406    for item in &s.items {
7407        if let SelectItem::Expr { expr, .. } = item
7408            && !aggregate::contains_aggregate(expr)
7409        {
7410            groups.push(expr.clone());
7411        }
7412    }
7413    s.group_by = Some(groups);
7414    s.group_by_all = false;
7415    for (_, peer) in &mut s.unions {
7416        expand_group_by_all(peer);
7417    }
7418}
7419
7420fn resolve_order_by_position(s: &mut SelectStatement) {
7421    // v6.4.0 — iterate every ORDER BY key. Position references
7422    // (`ORDER BY 2`) bind to the 1-based projection index;
7423    // identifier references that match a SELECT-list alias bind to
7424    // the projected expression (Step 4 of L3a).
7425    for order in &mut s.order_by {
7426        match &order.expr {
7427            Expr::Literal(Literal::Integer(n)) if *n >= 1 => {
7428                if let Ok(idx_one_based) = usize::try_from(*n) {
7429                    let idx = idx_one_based - 1;
7430                    if idx < s.items.len()
7431                        && let SelectItem::Expr { expr, .. } = &s.items[idx]
7432                    {
7433                        order.expr = expr.clone();
7434                    }
7435                }
7436            }
7437            Expr::Column(c) if c.qualifier.is_none() => {
7438                // Alias-in-ORDER-BY lookup.
7439                for item in &s.items {
7440                    if let SelectItem::Expr {
7441                        expr,
7442                        alias: Some(a),
7443                    } = item
7444                        && a == &c.name
7445                    {
7446                        order.expr = expr.clone();
7447                        break;
7448                    }
7449                }
7450            }
7451            _ => {}
7452        }
7453    }
7454    for (_, peer) in &mut s.unions {
7455        resolve_order_by_position(peer);
7456    }
7457}
7458
7459/// Sort `tagged` by `f64` key, reversing the comparator under DESC.
7460/// Used by the UNION ORDER BY path; per-block paths inline the same
7461/// comparator because they already hold `&OrderBy` directly.
7462/// v3.1.1: partial-sort helper. When `keep` (= offset + limit) is
7463/// strictly less than `tagged.len()`, run `select_nth_unstable_by` to
7464/// partition the prefix in O(n), then sort just that prefix in O(k
7465/// log k). Total O(n + k log k), vs O(n log n) for a full sort. The
7466/// caller decides what `keep` is; passing `None` (no LIMIT) keeps the
7467/// full-sort behaviour.
7468///
7469/// `tagged` holds `(Option<f64>, Row)` (the SELECT path) — `None` keys
7470/// sort last in ascending order, mirroring NULL-sorts-last in SQL.
7471fn partial_sort_tagged(
7472    tagged: &mut Vec<(Vec<f64>, Row)>,
7473    keep: Option<usize>,
7474    descs: &[bool],
7475) {
7476    let cmp = |a: &(Vec<f64>, Row), b: &(Vec<f64>, Row)| cmp_multi_key(&a.0, &b.0, descs);
7477    match keep {
7478        Some(k) if k < tagged.len() && k > 0 => {
7479            let pivot = k - 1;
7480            tagged.select_nth_unstable_by(pivot, cmp);
7481            tagged[..k].sort_by(cmp);
7482            tagged.truncate(k);
7483        }
7484        _ => {
7485            tagged.sort_by(cmp);
7486        }
7487    }
7488}
7489
7490fn sort_by_keys(tagged: &mut [(Vec<f64>, Row)], descs: &[bool]) {
7491    tagged.sort_by(|a, b| cmp_multi_key(&a.0, &b.0, descs));
7492}
7493
/// v6.4.0 — multi-key ORDER BY comparator.
///
/// Keys are compared pairwise, left to right; the first pair that
/// differs decides the result. Key `i` is reversed when `descs[i]` is
/// `true`; a missing flag means ascending. If one key slice is
/// shorter, only the common prefix is compared.
///
/// NULL is encoded upstream as `f64::INFINITY`, so it sorts last in
/// ASC and first in DESC (matches PG default).
///
/// NaN keys sort after every other value (including the NULL
/// sentinel) in ASC, and two NaNs compare equal. Treating NaN as
/// "equal to everything" — what a bare
/// `partial_cmp(..).unwrap_or(Equal)` does — is not a consistent
/// order and must not be handed to `sort_by`.
fn cmp_multi_key(a: &[f64], b: &[f64], descs: &[bool]) -> core::cmp::Ordering {
    use core::cmp::Ordering;
    for (i, (ka, kb)) in a.iter().zip(b.iter()).enumerate() {
        // `partial_cmp` is `None` only when at least one side is NaN;
        // fall back to "NaN is greatest, NaN == NaN".
        let ascending = ka
            .partial_cmp(kb)
            .unwrap_or_else(|| ka.is_nan().cmp(&kb.is_nan()));
        let ord = if descs.get(i).copied().unwrap_or(false) {
            ascending.reverse()
        } else {
            ascending
        };
        if ord != Ordering::Equal {
            return ord;
        }
    }
    Ordering::Equal
}
7512
7513/// v6.4.0 — eval every ORDER BY expression for a row and pack the
7514/// resulting keys into a `Vec<f64>`. NULL → `f64::INFINITY`.
7515fn build_order_keys(
7516    order_by: &[OrderBy],
7517    row: &Row,
7518    ctx: &EvalContext,
7519) -> Result<Vec<f64>, EngineError> {
7520    let mut keys = Vec::with_capacity(order_by.len());
7521    for o in order_by {
7522        let v = eval::eval_expr(&o.expr, row, ctx)?;
7523        keys.push(value_to_order_key(&v)?);
7524    }
7525    Ok(keys)
7526}
7527
7528/// Drop the first `offset` rows then truncate to `limit`. PG / `MySQL`
7529/// agree: OFFSET applies *after* ORDER BY but *before* LIMIT (so
7530/// `LIMIT 10 OFFSET 5` keeps rows 6..=15).
7531fn apply_offset_and_limit(rows: &mut Vec<Row>, offset: Option<u32>, limit: Option<u32>) {
7532    if let Some(off) = offset {
7533        let off = off as usize;
7534        if off >= rows.len() {
7535            rows.clear();
7536        } else {
7537            rows.drain(..off);
7538        }
7539    }
7540    if let Some(n) = limit {
7541        rows.truncate(n as usize);
7542    }
7543}
7544
7545/// v7.6.1 — resolve a parser-level `ForeignKeyConstraint` (column
7546/// names + parent table name) into the storage-layer shape (column
7547/// indices + same parent table). Validates everything the engine
7548/// needs to know about the FK at CREATE TABLE time:
7549///
7550///   - parent table exists (catalog lookup, unless self-referencing)
7551///   - parent columns exist on the parent table
7552///   - parent column list matches the local arity (defaults to the
7553///     parent's primary index column when omitted)
7554///   - parent columns are covered by a `BTree` UNIQUE-class index
7555///     (SPG's stand-in for `PRIMARY KEY`/`UNIQUE`) — required so
7556///     the v7.6.2 INSERT path can do an O(log n) parent lookup
7557///   - local columns exist on the table being created
7558fn resolve_foreign_key(
7559    local_table_name: &str,
7560    local_cols: &[ColumnSchema],
7561    fk: spg_sql::ast::ForeignKeyConstraint,
7562    catalog: &Catalog,
7563) -> Result<spg_storage::ForeignKeyConstraint, EngineError> {
7564    // Resolve local columns.
7565    let mut local_columns = Vec::with_capacity(fk.columns.len());
7566    for name in &fk.columns {
7567        let pos = local_cols
7568            .iter()
7569            .position(|c| c.name == *name)
7570            .ok_or_else(|| {
7571                EngineError::Unsupported(alloc::format!(
7572                    "FOREIGN KEY references unknown local column {name:?}"
7573                ))
7574            })?;
7575        local_columns.push(pos);
7576    }
7577    // Self-referencing FK: parent table is the one we're creating.
7578    // The parent column resolution uses the local column list since
7579    // the catalog doesn't have this table yet.
7580    let is_self_ref = fk.parent_table == local_table_name;
7581    let (parent_cols_for_lookup, parent_table_str): (&[ColumnSchema], &str) = if is_self_ref {
7582        (local_cols, local_table_name)
7583    } else {
7584        let parent_table = catalog.get(&fk.parent_table).ok_or_else(|| {
7585            EngineError::Storage(StorageError::TableNotFound {
7586                name: fk.parent_table.clone(),
7587            })
7588        })?;
7589        (parent_table.schema().columns.as_slice(), fk.parent_table.as_str())
7590    };
7591    // Resolve parent column names → positions. If the FK omitted the
7592    // parent column list, fall back to the parent's primary index
7593    // column (single-column only — composite default is rejected
7594    // because there's no unambiguous "PK" in SPG's index list).
7595    let parent_columns: Vec<usize> = if fk.parent_columns.is_empty() {
7596        if fk.columns.len() != 1 {
7597            return Err(EngineError::Unsupported(
7598                "composite FOREIGN KEY without explicit parent column list is not supported \
7599                 — list the parent columns explicitly"
7600                    .into(),
7601            ));
7602        }
7603        // Find a single BTree index on the parent and use its column.
7604        let pos = pick_pk_index_column(catalog, parent_table_str, is_self_ref, local_cols)
7605            .ok_or_else(|| {
7606                EngineError::Unsupported(alloc::format!(
7607                    "parent table {parent_table_str:?} has no PRIMARY-key / UNIQUE BTree index \
7608                     to default the FOREIGN KEY against"
7609                ))
7610            })?;
7611        alloc::vec![pos]
7612    } else {
7613        let mut out = Vec::with_capacity(fk.parent_columns.len());
7614        for name in &fk.parent_columns {
7615            let pos = parent_cols_for_lookup
7616                .iter()
7617                .position(|c| c.name == *name)
7618                .ok_or_else(|| {
7619                    EngineError::Unsupported(alloc::format!(
7620                        "FOREIGN KEY references unknown parent column \
7621                         {name:?} on table {parent_table_str:?}"
7622                    ))
7623                })?;
7624            out.push(pos);
7625        }
7626        out
7627    };
7628    if parent_columns.len() != local_columns.len() {
7629        return Err(EngineError::Unsupported(alloc::format!(
7630            "FOREIGN KEY arity mismatch: {} local columns vs {} parent columns",
7631            local_columns.len(),
7632            parent_columns.len()
7633        )));
7634    }
7635    // For non-self-referencing FKs, verify the parent column set is
7636    // covered by a BTree index. SPG doesn't have a `PRIMARY KEY`
7637    // declaration; the convention is "the parent column for FK
7638    // purposes must have a BTree index" — which the user creates via
7639    // `CREATE INDEX ... USING btree (col)` (the default). We accept
7640    // any single-column BTree index that covers a parent column;
7641    // composite parent column lists require an index whose `column_position`
7642    // matches the first parent column (multi-column BTree indices
7643    // are not in the v7.x roadmap).
7644    if !is_self_ref {
7645        let parent_table = catalog
7646            .get(&fk.parent_table)
7647            .expect("checked above");
7648        let primary_parent_col = parent_columns[0];
7649        let has_btree = parent_table.schema().columns.get(primary_parent_col).is_some()
7650            && parent_table
7651                .indices()
7652                .iter()
7653                .any(|idx| {
7654                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7655                        && idx.column_position == primary_parent_col
7656                        && idx.partial_predicate.is_none()
7657                });
7658        if !has_btree {
7659            return Err(EngineError::Unsupported(alloc::format!(
7660                "FOREIGN KEY parent column on {:?} is not covered by an unconditional BTree \
7661                 index — create one with `CREATE INDEX ... ON {} ({})` first",
7662                parent_table_str,
7663                parent_table_str,
7664                parent_table.schema().columns[primary_parent_col].name,
7665            )));
7666        }
7667    }
7668    let on_delete = fk_action_sql_to_storage(fk.on_delete);
7669    let on_update = fk_action_sql_to_storage(fk.on_update);
7670    Ok(spg_storage::ForeignKeyConstraint {
7671        name: fk.name,
7672        local_columns,
7673        parent_table: fk.parent_table,
7674        parent_columns,
7675        on_delete,
7676        on_update,
7677    })
7678}
7679
7680/// v7.6.1 — pick a sentinel "primary key" column from the parent
7681/// table when the FK didn't name parent columns. Picks the first
7682/// single-column unconditional BTree index — that's the closest
7683/// thing SPG has to a PRIMARY KEY today. Self-referencing FKs use
7684/// `local_cols` as the column source.
7685fn pick_pk_index_column(
7686    catalog: &Catalog,
7687    parent_name: &str,
7688    is_self_ref: bool,
7689    local_cols: &[ColumnSchema],
7690) -> Option<usize> {
7691    if is_self_ref {
7692        // Self-ref FK omitted parent columns: pick column 0 by
7693        // convention (no catalog entry yet). Engine will widen this
7694        // when v7.6.7 lands; v7.6.1 only handles the explicit form.
7695        let _ = local_cols;
7696        return Some(0);
7697    }
7698    let parent = catalog.get(parent_name)?;
7699    parent.indices().iter().find_map(|idx| {
7700        if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7701            && idx.partial_predicate.is_none()
7702            && idx.included_columns.is_empty()
7703            && idx.expression.is_none()
7704        {
7705            Some(idx.column_position)
7706        } else {
7707            None
7708        }
7709    })
7710}
7711
7712/// v7.9.8 / v7.9.10 — resolve the column positions that
7713/// identify a conflict for ON CONFLICT. Returns a Vec of
7714/// column positions (1 element for single-column form, N for
7715/// composite). When the user wrote bare `ON CONFLICT DO …`,
7716/// falls back to the table's first unconditional BTree index
7717/// (always single-column today).
7718fn resolve_on_conflict_columns(
7719    catalog: &Catalog,
7720    table_name: &str,
7721    target: &[String],
7722) -> Result<Vec<usize>, EngineError> {
7723    let table = catalog.get(table_name).ok_or_else(|| {
7724        EngineError::Storage(StorageError::TableNotFound {
7725            name: table_name.into(),
7726        })
7727    })?;
7728    if target.is_empty() {
7729        let pos = table
7730            .indices()
7731            .iter()
7732            .find_map(|idx| {
7733                if matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7734                    && idx.partial_predicate.is_none()
7735                    && idx.included_columns.is_empty()
7736                    && idx.expression.is_none()
7737                {
7738                    Some(idx.column_position)
7739                } else {
7740                    None
7741                }
7742            })
7743            .ok_or_else(|| {
7744                EngineError::Unsupported(alloc::format!(
7745                    "ON CONFLICT without target requires a UNIQUE BTree index on {table_name:?}"
7746                ))
7747            })?;
7748        return Ok(alloc::vec![pos]);
7749    }
7750    let mut out = Vec::with_capacity(target.len());
7751    for name in target {
7752        let pos = table
7753            .schema()
7754            .columns
7755            .iter()
7756            .position(|c| c.name == *name)
7757            .ok_or_else(|| {
7758                EngineError::Unsupported(alloc::format!(
7759                    "ON CONFLICT target column {name:?} not found on {table_name:?}"
7760                ))
7761            })?;
7762        out.push(pos);
7763    }
7764    Ok(out)
7765}
7766
7767/// v7.9.8 — check whether the BTree index on `column_pos` of
7768/// `table_name` already has a row with this key.
7769fn on_conflict_key_exists(
7770    catalog: &Catalog,
7771    table_name: &str,
7772    column_pos: usize,
7773    key: &Value,
7774) -> bool {
7775    let Some(table) = catalog.get(table_name) else {
7776        return false;
7777    };
7778    let Some(idx_key) = spg_storage::IndexKey::from_value(key) else {
7779        return false;
7780    };
7781    table.indices().iter().any(|idx| {
7782        matches!(idx.kind, spg_storage::IndexKind::BTree(_))
7783            && idx.column_position == column_pos
7784            && idx.partial_predicate.is_none()
7785            && !idx.lookup_eq(&idx_key).is_empty()
7786    })
7787}
7788
7789/// v7.9.9 / v7.9.10 — look up an existing row's position by
7790/// matching all `column_positions` against the incoming `key`
7791/// tuple. Single-column shape (one column) reduces to the
7792/// canonical PK lookup; composite shapes scan linearly until
7793/// every position matches.
7794fn lookup_row_position_by_keys(
7795    catalog: &Catalog,
7796    table_name: &str,
7797    column_positions: &[usize],
7798    key: &[&Value],
7799) -> Option<usize> {
7800    let table = catalog.get(table_name)?;
7801    table.rows().iter().position(|r| {
7802        column_positions
7803            .iter()
7804            .enumerate()
7805            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7806    })
7807}
7808
7809/// v7.9.10 — does the table already contain a row whose
7810/// `column_positions` tuple equals `key`? Single-column shape
7811/// uses the existing BTree fast path; composite shapes fall
7812/// back to a row scan.
7813fn on_conflict_keys_exist(
7814    catalog: &Catalog,
7815    table_name: &str,
7816    column_positions: &[usize],
7817    key: &[&Value],
7818) -> bool {
7819    if column_positions.len() == 1 {
7820        return on_conflict_key_exists(
7821            catalog,
7822            table_name,
7823            column_positions[0],
7824            key[0],
7825        );
7826    }
7827    let Some(table) = catalog.get(table_name) else {
7828        return false;
7829    };
7830    table.rows().iter().any(|r| {
7831        column_positions
7832            .iter()
7833            .enumerate()
7834            .all(|(i, &pos)| r.values.get(pos) == Some(key[i]))
7835    })
7836}
7837
7838/// v7.9.9 — apply ON CONFLICT DO UPDATE SET assignments to an
7839/// existing row.
7840///
7841/// `incoming` is the rejected INSERT row (used to resolve
7842/// `EXCLUDED.col` references in the assignment exprs);
7843/// `target_pos` is the position of the existing row in the table.
7844/// Each assignment substitutes `EXCLUDED.col` with the matching
7845/// incoming value, evaluates the resulting expression against
7846/// the existing row, and writes the new value into the
7847/// corresponding column of the returned `Vec<Value>`. If
7848/// `where_` evaluates falsy, returns Ok(None) — PG behaviour:
7849/// the conflicting row is silently kept unchanged.
7850fn apply_on_conflict_assignments(
7851    catalog: &Catalog,
7852    table_name: &str,
7853    target_pos: usize,
7854    incoming: &[Value],
7855    assignments: &[(String, Expr)],
7856    where_: Option<&Expr>,
7857) -> Result<Option<Vec<Value>>, EngineError> {
7858    let table = catalog.get(table_name).ok_or_else(|| {
7859        EngineError::Storage(StorageError::TableNotFound {
7860            name: table_name.into(),
7861        })
7862    })?;
7863    let schema_cols = table.schema().columns.clone();
7864    let existing = table
7865        .rows()
7866        .get(target_pos)
7867        .ok_or_else(|| {
7868            EngineError::Unsupported(alloc::format!(
7869                "ON CONFLICT DO UPDATE: row position {target_pos} out of bounds on {table_name:?}"
7870            ))
7871        })?
7872        .clone();
7873    let ctx = eval::EvalContext::new(&schema_cols, Some(table_name));
7874    // Optional WHERE filter on the conflict row.
7875    if let Some(w) = where_ {
7876        let pred = w.clone();
7877        let pred = substitute_excluded_refs(pred, &schema_cols, incoming);
7878        let v = eval::eval_expr(&pred, &existing, &ctx)?;
7879        if !matches!(v, Value::Bool(true)) {
7880            return Ok(None);
7881        }
7882    }
7883    let mut new_values = existing.values.clone();
7884    for (col_name, expr) in assignments {
7885        let target_idx = schema_cols
7886            .iter()
7887            .position(|c| c.name == *col_name)
7888            .ok_or_else(|| {
7889                EngineError::Eval(EvalError::ColumnNotFound {
7890                    name: col_name.clone(),
7891                })
7892            })?;
7893        let sub = substitute_excluded_refs(expr.clone(), &schema_cols, incoming);
7894        let v = eval::eval_expr(&sub, &existing, &ctx)?;
7895        new_values[target_idx] =
7896            coerce_value(v, schema_cols[target_idx].ty, col_name, target_idx)?;
7897    }
7898    Ok(Some(new_values))
7899}
7900
7901/// v7.9.9 — walk an `Expr` tree replacing any `Column { qualifier:
7902/// "EXCLUDED", name }` reference with a `Literal` of the matching
7903/// value from the incoming-row vec. Resolution against the
7904/// child-table column list (by name).
7905fn substitute_excluded_refs(
7906    expr: Expr,
7907    schema_cols: &[ColumnSchema],
7908    incoming: &[Value],
7909) -> Expr {
7910    use spg_sql::ast::ColumnName;
7911    match expr {
7912        Expr::Column(ColumnName { qualifier, name })
7913            if qualifier
7914                .as_deref()
7915                .is_some_and(|q| q.eq_ignore_ascii_case("excluded")) =>
7916        {
7917            let pos = schema_cols.iter().position(|c| c.name == name);
7918            match pos {
7919                Some(p) => {
7920                    let v = incoming.get(p).cloned().unwrap_or(Value::Null);
7921                    value_to_literal_expr(v).unwrap_or_else(|_| {
7922                        Expr::Literal(spg_sql::ast::Literal::Null)
7923                    })
7924                }
7925                None => Expr::Column(ColumnName { qualifier, name }),
7926            }
7927        }
7928        Expr::Binary { op, lhs, rhs } => Expr::Binary {
7929            op,
7930            lhs: Box::new(substitute_excluded_refs(*lhs, schema_cols, incoming)),
7931            rhs: Box::new(substitute_excluded_refs(*rhs, schema_cols, incoming)),
7932        },
7933        Expr::Unary { op, expr } => Expr::Unary {
7934            op,
7935            expr: Box::new(substitute_excluded_refs(*expr, schema_cols, incoming)),
7936        },
7937        Expr::FunctionCall { name, args } => Expr::FunctionCall {
7938            name,
7939            args: args
7940                .into_iter()
7941                .map(|a| substitute_excluded_refs(a, schema_cols, incoming))
7942                .collect(),
7943        },
7944        other => other,
7945    }
7946}
7947
7948/// v7.6.2 / v7.6.7 — INSERT-side FK enforcement. For every row
7949/// about to be inserted into `child_table`, every FK declared on
7950/// that table is checked: the row's FK columns must either be
7951/// NULL (SQL spec skip) or match an existing parent row via the
7952/// parent's BTree PK / UNIQUE index.
7953///
7954/// Returns `EngineError::Unsupported` with a `FOREIGN KEY violation`
7955/// payload on first failure.
7956///
7957/// **Self-referencing FKs (v7.6.7 widening):** when `fk.parent_table
7958/// == child_table`, the parent rows visible to this check are
7959///  (a) rows already committed to the table, plus
7960///  (b) earlier rows from the *same* `rows` batch.
7961/// This makes `INSERT INTO tree VALUES (1, NULL), (2, 1), (3, 2)`
7962/// work in a single statement — common pattern for bulk-loading
7963/// hierarchies.
7964/// v7.9.19 — enforce table-level UNIQUE / PRIMARY KEY tuple
7965/// constraints at INSERT time. For each constraint declared on
7966/// the target table, check that no existing row + no earlier row
7967/// in the same batch has the same full-column tuple. NULL in
7968/// any column lifts the row out of the check (SQL spec: NULL
7969/// ≠ NULL for uniqueness). mailrs G1 + G6.
7970fn enforce_uniqueness_inserts(
7971    catalog: &Catalog,
7972    child_table: &str,
7973    constraints: &[spg_storage::UniquenessConstraint],
7974    rows: &[Vec<Value>],
7975) -> Result<(), EngineError> {
7976    if constraints.is_empty() {
7977        return Ok(());
7978    }
7979    let table = catalog.get(child_table).ok_or_else(|| {
7980        EngineError::Storage(StorageError::TableNotFound {
7981            name: child_table.into(),
7982        })
7983    })?;
7984    for uc in constraints {
7985        for (batch_idx, row_values) in rows.iter().enumerate() {
7986            let key: Vec<&Value> = uc.columns.iter().map(|&i| &row_values[i]).collect();
7987            let has_null = key.iter().any(|v| matches!(v, Value::Null));
7988            if has_null {
7989                continue;
7990            }
7991            // Table-side collision: scan existing rows.
7992            let collides_in_table = table.rows().iter().any(|prow| {
7993                uc.columns
7994                    .iter()
7995                    .enumerate()
7996                    .all(|(i, &p)| prow.values.get(p) == Some(key[i]))
7997            });
7998            // Batch-side collision: earlier rows in the same INSERT.
7999            let collides_in_batch = rows[..batch_idx].iter().any(|earlier| {
8000                uc.columns
8001                    .iter()
8002                    .enumerate()
8003                    .all(|(i, &p)| earlier.get(p) == Some(key[i]))
8004            });
8005            if collides_in_table || collides_in_batch {
8006                let kind = if uc.is_primary_key { "PRIMARY KEY" } else { "UNIQUE" };
8007                let col_names: Vec<String> = uc
8008                    .columns
8009                    .iter()
8010                    .map(|&i| table.schema().columns[i].name.clone())
8011                    .collect();
8012                return Err(EngineError::Unsupported(alloc::format!(
8013                    "{kind} violation on {child_table:?} columns {col_names:?}: \
8014                     row #{batch_idx} duplicates an existing key"
8015                )));
8016            }
8017        }
8018    }
8019    Ok(())
8020}
8021
8022/// v7.9.29 — `true` iff `v` counts as a truthy SQL value for a
8023/// WHERE-style predicate. NULL → false (three-valued logic
8024/// collapses to "skip this row" for index inclusion). Numeric
8025/// non-zero, BIGINT non-zero, TINYINT non-zero, BOOLEAN true → true.
8026/// Everything else (strings, vectors, JSON, …) is not a valid
8027/// predicate result and surfaces as `false` so a malformed
8028/// predicate degrades to "row not in index" rather than panicking.
8029fn predicate_truthy(v: &spg_storage::Value) -> bool {
8030    use spg_storage::Value as V;
8031    match v {
8032        V::Bool(b) => *b,
8033        V::Int(n) => *n != 0,
8034        V::BigInt(n) => *n != 0,
8035        V::SmallInt(n) => *n != 0,
8036        _ => false,
8037    }
8038}
8039
8040/// v7.9.29 — at CREATE UNIQUE INDEX time, scan the table's
8041/// committed rows for pre-existing duplicates. If any pair of rows
8042/// matches the predicate AND has the same index key, refuse to
8043/// create the index so the user fixes the data before retrying.
8044fn check_existing_unique_violation(
8045    idx: &spg_storage::Index,
8046    schema: &spg_storage::TableSchema,
8047    rows: &[spg_storage::Row],
8048) -> Result<(), EngineError> {
8049    let predicate_expr = match idx.partial_predicate.as_deref() {
8050        Some(s) => Some(spg_sql::parser::parse_expression(s).map_err(|e| {
8051            EngineError::Unsupported(alloc::format!(
8052                "stored partial predicate {s:?} failed to re-parse: {e:?}"
8053            ))
8054        })?),
8055        None => None,
8056    };
8057    let ctx = eval::EvalContext::new(&schema.columns, None);
8058    let key_positions = unique_key_positions(idx);
8059    let mut seen: alloc::vec::Vec<alloc::vec::Vec<spg_storage::Value>> = alloc::vec::Vec::new();
8060    for row in rows {
8061        if let Some(expr) = &predicate_expr {
8062            let v = eval::eval_expr(expr, row, &ctx).map_err(|e| {
8063                EngineError::Unsupported(alloc::format!(
8064                    "evaluating UNIQUE INDEX predicate against existing row: {e:?}"
8065                ))
8066            })?;
8067            if !predicate_truthy(&v) {
8068                continue;
8069            }
8070        }
8071        let key: alloc::vec::Vec<spg_storage::Value> = key_positions
8072            .iter()
8073            .map(|&p| {
8074                row.values
8075                    .get(p)
8076                    .cloned()
8077                    .unwrap_or(spg_storage::Value::Null)
8078            })
8079            .collect();
8080        if key.iter().any(|v| matches!(v, spg_storage::Value::Null)) {
8081            continue;
8082        }
8083        if seen.iter().any(|other| *other == key) {
8084            return Err(EngineError::Unsupported(alloc::format!(
8085                "CREATE UNIQUE INDEX {:?}: existing rows already violate the constraint",
8086                idx.name
8087            )));
8088        }
8089        seen.push(key);
8090    }
8091    Ok(())
8092}
8093
8094/// v7.9.29 — full key tuple for a UNIQUE INDEX (leading +
8095/// extra positions). For single-column indexes this is just
8096/// `[column_position]`.
8097fn unique_key_positions(idx: &spg_storage::Index) -> alloc::vec::Vec<usize> {
8098    let mut out = alloc::vec::Vec::with_capacity(1 + idx.extra_column_positions.len());
8099    out.push(idx.column_position);
8100    out.extend_from_slice(&idx.extra_column_positions);
8101    out
8102}
8103
8104/// v7.9.29 — at INSERT time, walk every `is_unique` index on the
8105/// target table. For each, eval the index's optional predicate
8106/// against (a) the candidate row and (b) every committed row plus
8107/// earlier batch rows; only rows where the predicate is truthy
8108/// participate. A duplicate key among predicate-matching rows is a
8109/// uniqueness violation. NULL keys lift the row out of the check
8110/// (matching PG's "UNIQUE allows multiple NULLs" semantics).
8111fn enforce_unique_index_inserts(
8112    catalog: &Catalog,
8113    table_name: &str,
8114    rows: &[alloc::vec::Vec<spg_storage::Value>],
8115) -> Result<(), EngineError> {
8116    let table = catalog.get(table_name).ok_or_else(|| {
8117        EngineError::Storage(StorageError::TableNotFound {
8118            name: table_name.into(),
8119        })
8120    })?;
8121    let schema = table.schema();
8122    let ctx = eval::EvalContext::new(&schema.columns, None);
8123    for idx in table.indices() {
8124        if !idx.is_unique {
8125            continue;
8126        }
8127        // Re-parse the predicate once per index per batch.
8128        let predicate_expr = match idx.partial_predicate.as_deref() {
8129            Some(s) => Some(spg_sql::parser::parse_expression(s).map_err(|e| {
8130                EngineError::Unsupported(alloc::format!(
8131                    "UNIQUE INDEX {:?} predicate {s:?} failed to re-parse: {e:?}",
8132                    idx.name
8133                ))
8134            })?),
8135            None => None,
8136        };
8137        let key_positions = unique_key_positions(idx);
8138        let key_of = |values: &[spg_storage::Value]| -> alloc::vec::Vec<spg_storage::Value> {
8139            key_positions
8140                .iter()
8141                .map(|&p| {
8142                    values
8143                        .get(p)
8144                        .cloned()
8145                        .unwrap_or(spg_storage::Value::Null)
8146                })
8147                .collect()
8148        };
8149        // Helper: does `values` participate in this index? (predicate
8150        // truthy when present.) Wraps `values` into a transient Row
8151        // because eval_expr requires &Row.
8152        let participates = |values: &[spg_storage::Value]| -> Result<bool, EngineError> {
8153            let Some(expr) = &predicate_expr else {
8154                return Ok(true);
8155            };
8156            let tmp_row = spg_storage::Row {
8157                values: values.to_vec(),
8158            };
8159            let v = eval::eval_expr(expr, &tmp_row, &ctx).map_err(|e| {
8160                EngineError::Unsupported(alloc::format!(
8161                    "UNIQUE INDEX {:?} predicate eval: {e:?}",
8162                    idx.name
8163                ))
8164            })?;
8165            Ok(predicate_truthy(&v))
8166        };
8167        for (batch_idx, row_values) in rows.iter().enumerate() {
8168            if !participates(row_values)? {
8169                continue;
8170            }
8171            let key = key_of(row_values);
8172            if key.iter().any(|v| matches!(v, spg_storage::Value::Null)) {
8173                continue;
8174            }
8175            // Committed-table collision.
8176            for prow in table.rows() {
8177                if !participates(&prow.values)? {
8178                    continue;
8179                }
8180                if key_of(&prow.values) == key {
8181                    return Err(EngineError::Unsupported(alloc::format!(
8182                        "UNIQUE INDEX {:?} violation on {table_name:?}: \
8183                         row #{batch_idx} duplicates an existing key",
8184                        idx.name
8185                    )));
8186                }
8187            }
8188            // Within-batch collision: earlier rows in the same INSERT.
8189            for earlier in &rows[..batch_idx] {
8190                if !participates(earlier)? {
8191                    continue;
8192                }
8193                if key_of(earlier) == key {
8194                    return Err(EngineError::Unsupported(alloc::format!(
8195                        "UNIQUE INDEX {:?} violation on {table_name:?}: \
8196                         row #{batch_idx} duplicates an earlier row in the same batch",
8197                        idx.name
8198                    )));
8199                }
8200            }
8201        }
8202    }
8203    Ok(())
8204}
8205
8206fn enforce_fk_inserts(
8207    catalog: &Catalog,
8208    child_table: &str,
8209    fks: &[spg_storage::ForeignKeyConstraint],
8210    rows: &[Vec<Value>],
8211) -> Result<(), EngineError> {
8212    for fk in fks {
8213        let parent_is_self = fk.parent_table == child_table;
8214        let parent = if parent_is_self {
8215            // Self-ref: read the current state of the same table.
8216            // The mut borrow on child has been dropped by the caller.
8217            catalog.get(child_table).ok_or_else(|| {
8218                EngineError::Storage(StorageError::TableNotFound {
8219                    name: child_table.into(),
8220                })
8221            })?
8222        } else {
8223            catalog.get(&fk.parent_table).ok_or_else(|| {
8224                EngineError::Storage(StorageError::TableNotFound {
8225                    name: fk.parent_table.clone(),
8226                })
8227            })?
8228        };
8229        for (batch_idx, row_values) in rows.iter().enumerate() {
8230            // Single-column FK fast path: try the parent's BTree
8231            // index for an O(log n) lookup. Composite FKs fall back
8232            // to a parent-row scan.
8233            if fk.local_columns.len() == 1 {
8234                let v = &row_values[fk.local_columns[0]];
8235                if matches!(v, Value::Null) {
8236                    continue;
8237                }
8238                let parent_col = fk.parent_columns[0];
8239                let key = spg_storage::IndexKey::from_value(v).ok_or_else(|| {
8240                    EngineError::Unsupported(alloc::format!(
8241                        "FOREIGN KEY column value of type {:?} is not index-eligible",
8242                        v.data_type()
8243                    ))
8244                })?;
8245                let present_committed = parent.indices().iter().any(|idx| {
8246                    matches!(idx.kind, spg_storage::IndexKind::BTree(_))
8247                        && idx.column_position == parent_col
8248                        && idx.partial_predicate.is_none()
8249                        && !idx.lookup_eq(&key).is_empty()
8250                });
8251                // v7.6.7 self-ref widening: also accept a match
8252                // against earlier rows in this same batch when the
8253                // FK points at the table being inserted into.
8254                let present_in_batch = parent_is_self
8255                    && rows[..batch_idx].iter().any(|earlier| {
8256                        earlier.get(parent_col) == Some(v)
8257                    });
8258                if !(present_committed || present_in_batch) {
8259                    return Err(EngineError::Unsupported(alloc::format!(
8260                        "FOREIGN KEY violation: no parent row in {:?} where {} = {:?}",
8261                        fk.parent_table,
8262                        parent
8263                            .schema()
8264                            .columns
8265                            .get(parent_col)
8266                            .map_or("?", |c| c.name.as_str()),
8267                        v,
8268                    )));
8269                }
8270            } else {
8271                // Composite FK: scan parent rows. v7.6.7 also
8272                // accepts a match against earlier rows in the same
8273                // batch (self-ref bulk-loading of hierarchies).
8274                if fk.local_columns
8275                    .iter()
8276                    .all(|&i| matches!(row_values.get(i), Some(Value::Null)))
8277                {
8278                    continue;
8279                }
8280                let local: Vec<&Value> = fk.local_columns.iter().map(|&i| &row_values[i]).collect();
8281                let parent_match_committed = parent.rows().iter().any(|prow| {
8282                    fk.parent_columns
8283                        .iter()
8284                        .enumerate()
8285                        .all(|(i, &pi)| prow.values.get(pi) == Some(local[i]))
8286                });
8287                let parent_match_in_batch = parent_is_self
8288                    && rows[..batch_idx].iter().any(|earlier| {
8289                        fk.parent_columns
8290                            .iter()
8291                            .enumerate()
8292                            .all(|(i, &pi)| earlier.get(pi) == Some(local[i]))
8293                    });
8294                if !(parent_match_committed || parent_match_in_batch) {
8295                    return Err(EngineError::Unsupported(alloc::format!(
8296                        "FOREIGN KEY violation: no parent row in {:?} matching composite key",
8297                        fk.parent_table,
8298                    )));
8299                }
8300            }
8301        }
8302    }
8303    Ok(())
8304}
8305
/// v7.6.4 / v7.6.5 — one step of the FK action plan computed for a
/// DELETE on a parent. The plan is a list of these steps, stacked
/// across the FK graph by `plan_fk_parent_deletions`.
///
/// `plan_fk_parent_updates` returns a list of the same step type.
#[derive(Debug, Clone)]
struct FkChildStep {
    /// Name of the table whose rows this step touches.
    child_table: String,
    /// What to do to those rows.
    action: FkChildAction,
}
8314
/// The mutation a single `FkChildStep` applies to its child table.
///
/// In `SetNull` and `SetDefault` the vectors are parallel: as built by
/// `plan_fk_parent_deletions`, entry `i` of `positions`, `columns`
/// (and `defaults`) together describe one cell.
#[derive(Debug, Clone)]
enum FkChildAction {
    /// CASCADE — remove these rows. Sorted, deduplicated positions.
    Delete { positions: Vec<usize> },
    /// SET NULL — for each (row, column) in the flat list, write
    /// NULL into that child cell. Multiple FKs on the same row may
    /// produce overlapping entries (deduped at plan time).
    SetNull {
        /// Row position of each cell.
        positions: Vec<usize>,
        /// Column position of each cell.
        columns: Vec<usize>,
    },
    /// SET DEFAULT — same shape as SetNull but writes the column's
    /// declared DEFAULT value (resolved at plan time). Columns
    /// without a DEFAULT raise an error during planning.
    SetDefault {
        /// Row position of each cell.
        positions: Vec<usize>,
        /// Column position of each cell.
        columns: Vec<usize>,
        /// Value to write into each cell.
        defaults: Vec<Value>,
    },
}
8335
8336/// v7.6.3 → v7.6.5 — plan FK fallout for a DELETE on a parent table.
8337///
8338/// Walks every table in the catalog looking for FKs whose
8339/// `parent_table` is `parent_table_name`. For each such FK + each
8340/// to-be-deleted parent row:
8341///
8342///   - RESTRICT / NoAction → error, no plan returned
8343///   - CASCADE → child rows get scheduled for deletion; recursive
8344///   - SetNull → child FK column(s) scheduled to be NULL-ed.
8345///     Verified NULL-able at plan time.
8346///   - SetDefault → child FK column(s) scheduled to be reset to
8347///     their declared DEFAULT. Columns without a DEFAULT raise.
8348///
8349/// SET NULL / SET DEFAULT do NOT cascade further — the child row
8350/// stays; only one of its columns mutates.
8351fn plan_fk_parent_deletions(
8352    catalog: &Catalog,
8353    parent_table_name: &str,
8354    to_delete_positions: &[usize],
8355    to_delete_rows: &[Vec<Value>],
8356) -> Result<Vec<FkChildStep>, EngineError> {
8357    use alloc::collections::{BTreeMap, BTreeSet};
8358    if to_delete_rows.is_empty() {
8359        return Ok(Vec::new());
8360    }
8361    let mut delete_plan: BTreeMap<String, BTreeSet<usize>> = BTreeMap::new();
8362    // setnull / setdefault keyed by child_table → (row_idx, col_idx) → optional default
8363    let mut setnull_plan: BTreeMap<String, BTreeSet<(usize, usize)>> = BTreeMap::new();
8364    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8365        BTreeMap::new();
8366    let mut visited: BTreeSet<(String, usize)> = BTreeSet::new();
8367    for &p in to_delete_positions {
8368        visited.insert((parent_table_name.to_string(), p));
8369    }
8370    let mut work: Vec<(String, Vec<Value>)> = to_delete_rows
8371        .iter()
8372        .map(|r| (parent_table_name.to_string(), r.clone()))
8373        .collect();
8374    while let Some((cur_parent, parent_row)) = work.pop() {
8375        for child_name in catalog.table_names() {
8376            let child = catalog
8377                .get(&child_name)
8378                .expect("table_names → catalog.get round-trip is total");
8379            for fk in &child.schema().foreign_keys {
8380                if fk.parent_table != cur_parent {
8381                    continue;
8382                }
8383                let parent_key: Vec<&Value> = fk
8384                    .parent_columns
8385                    .iter()
8386                    .map(|&pi| &parent_row[pi])
8387                    .collect();
8388                if parent_key.iter().any(|v| matches!(v, Value::Null)) {
8389                    continue;
8390                }
8391                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8392                    if child_name == cur_parent
8393                        && visited.contains(&(child_name.clone(), child_row_idx))
8394                    {
8395                        continue;
8396                    }
8397                    let matches_key = fk
8398                        .local_columns
8399                        .iter()
8400                        .enumerate()
8401                        .all(|(i, &li)| child_row.values.get(li) == Some(parent_key[i]));
8402                    if !matches_key {
8403                        continue;
8404                    }
8405                    match fk.on_delete {
8406                        spg_storage::FkAction::Restrict
8407                        | spg_storage::FkAction::NoAction => {
8408                            return Err(EngineError::Unsupported(alloc::format!(
8409                                "FOREIGN KEY violation: DELETE on {cur_parent:?} is \
8410                                 restricted by FK from {child_name:?}.{:?}",
8411                                fk.local_columns,
8412                            )));
8413                        }
8414                        spg_storage::FkAction::Cascade => {
8415                            if visited.insert((child_name.clone(), child_row_idx)) {
8416                                delete_plan
8417                                    .entry(child_name.clone())
8418                                    .or_default()
8419                                    .insert(child_row_idx);
8420                                work.push((child_name.clone(), child_row.values.clone()));
8421                            }
8422                        }
8423                        spg_storage::FkAction::SetNull => {
8424                            // Verify every local FK column is NULL-able.
8425                            for &li in &fk.local_columns {
8426                                let col = child.schema().columns.get(li).ok_or_else(|| {
8427                                    EngineError::Unsupported(alloc::format!(
8428                                        "FK local column {li} missing in {child_name:?}"
8429                                    ))
8430                                })?;
8431                                if !col.nullable {
8432                                    return Err(EngineError::Unsupported(alloc::format!(
8433                                        "FOREIGN KEY ON DELETE SET NULL: column \
8434                                         {child_name:?}.{:?} is NOT NULL — cannot SET NULL",
8435                                        col.name,
8436                                    )));
8437                                }
8438                            }
8439                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8440                            for &li in &fk.local_columns {
8441                                entry.insert((child_row_idx, li));
8442                            }
8443                        }
8444                        spg_storage::FkAction::SetDefault => {
8445                            // Resolve the DEFAULT for every local FK col.
8446                            let entry =
8447                                setdefault_plan.entry(child_name.clone()).or_default();
8448                            for &li in &fk.local_columns {
8449                                let col = child.schema().columns.get(li).ok_or_else(|| {
8450                                    EngineError::Unsupported(alloc::format!(
8451                                        "FK local column {li} missing in {child_name:?}"
8452                                    ))
8453                                })?;
8454                                let default = col.default.clone().ok_or_else(|| {
8455                                    EngineError::Unsupported(alloc::format!(
8456                                        "FOREIGN KEY ON DELETE SET DEFAULT: column \
8457                                         {child_name:?}.{:?} has no DEFAULT declared",
8458                                        col.name,
8459                                    ))
8460                                })?;
8461                                entry.insert((child_row_idx, li), default);
8462                            }
8463                        }
8464                    }
8465                }
8466            }
8467        }
8468    }
8469    // Flatten the three plans into the ordered `FkChildStep` list.
8470    // Deletes are applied last per child (after any null/default
8471    // re-writes on the same child) so a child row that's both
8472    // re-written and then cascade-deleted only ends up deleted —
8473    // but in v7.6.5 SetNull/Cascade never overlap on the same row
8474    // (a single FK chooses exactly one action), so the order is
8475    // mostly a precaution.
8476    let mut steps: Vec<FkChildStep> = Vec::new();
8477    for (child_table, entries) in setnull_plan {
8478        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8479        steps.push(FkChildStep {
8480            child_table,
8481            action: FkChildAction::SetNull { positions, columns },
8482        });
8483    }
8484    for (child_table, entries) in setdefault_plan {
8485        let mut positions = Vec::with_capacity(entries.len());
8486        let mut columns = Vec::with_capacity(entries.len());
8487        let mut defaults = Vec::with_capacity(entries.len());
8488        for ((p, c), v) in entries {
8489            positions.push(p);
8490            columns.push(c);
8491            defaults.push(v);
8492        }
8493        steps.push(FkChildStep {
8494            child_table,
8495            action: FkChildAction::SetDefault {
8496                positions,
8497                columns,
8498                defaults,
8499            },
8500        });
8501    }
8502    for (child_table, positions) in delete_plan {
8503        steps.push(FkChildStep {
8504            child_table,
8505            action: FkChildAction::Delete {
8506                positions: positions.into_iter().collect(),
8507            },
8508        });
8509    }
8510    Ok(steps)
8511}
8512
8513/// v7.6.6 — plan FK fallout for an UPDATE that mutates parent-side
8514/// PK/UNIQUE columns. Walks every other table whose FK references
8515/// `parent_table_name`; for each FK whose parent_columns overlap a
8516/// mutated column, decides the action by `fk.on_update`.
8517///
8518///   - RESTRICT / NoAction → error if any child references the OLD
8519///     value
8520///   - CASCADE → child FK columns get rewritten to the NEW parent
8521///     value (a SetNull-style update step with the new value)
8522///   - SetNull → child FK columns set to NULL
8523///   - SetDefault → child FK columns set to declared default
8524///
8525/// `plan_with_old` is `(row_position, old_values, new_values)` so
8526/// the planner can detect "did this row's parent key actually
8527/// change?" — only rows where at least one referenced parent
8528/// column moved trigger inbound work.
8529fn plan_fk_parent_updates(
8530    catalog: &Catalog,
8531    parent_table_name: &str,
8532    plan_with_old: &[(usize, Vec<Value>, Vec<Value>)],
8533) -> Result<Vec<FkChildStep>, EngineError> {
8534    use alloc::collections::BTreeMap;
8535    if plan_with_old.is_empty() {
8536        return Ok(Vec::new());
8537    }
8538    // For each child table we may touch, build per-child step
8539    // lists. UPDATE never deletes children — `delete_plan` stays
8540    // empty here but is kept structurally aligned with
8541    // `plan_fk_parent_deletions` for future use.
8542    let delete_plan: BTreeMap<String, alloc::collections::BTreeSet<usize>> = BTreeMap::new();
8543    let mut setnull_plan: BTreeMap<
8544        String,
8545        alloc::collections::BTreeSet<(usize, usize)>,
8546    > = BTreeMap::new();
8547    let mut setdefault_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> =
8548        BTreeMap::new();
8549    // Cascade-update plan: child_table → row_idx → col_idx → new_value
8550    let mut cascade_plan: BTreeMap<String, BTreeMap<(usize, usize), Value>> = BTreeMap::new();
8551
8552    for child_name in catalog.table_names() {
8553        let child = catalog
8554            .get(&child_name)
8555            .expect("table_names → catalog.get total");
8556        for fk in &child.schema().foreign_keys {
8557            if fk.parent_table != parent_table_name {
8558                continue;
8559            }
8560            for (_pos, old_row, new_row) in plan_with_old {
8561                // Did any parent FK column change?
8562                let key_changed = fk
8563                    .parent_columns
8564                    .iter()
8565                    .any(|&pi| old_row.get(pi) != new_row.get(pi));
8566                if !key_changed {
8567                    continue;
8568                }
8569                // The OLD parent key — used to find referring children.
8570                let old_key: Vec<&Value> = fk
8571                    .parent_columns
8572                    .iter()
8573                    .map(|&pi| &old_row[pi])
8574                    .collect();
8575                if old_key.iter().any(|v| matches!(v, Value::Null)) {
8576                    // NULL parent has no children — skip.
8577                    continue;
8578                }
8579                let new_key: Vec<&Value> = fk
8580                    .parent_columns
8581                    .iter()
8582                    .map(|&pi| &new_row[pi])
8583                    .collect();
8584                for (child_row_idx, child_row) in child.rows().iter().enumerate() {
8585                    // Self-ref same-row updates: a row updating its
8586                    // own PK doesn't restrict itself.
8587                    if child_name == parent_table_name
8588                        && plan_with_old
8589                            .iter()
8590                            .any(|(p, _, _)| *p == child_row_idx)
8591                    {
8592                        continue;
8593                    }
8594                    let matches_key = fk
8595                        .local_columns
8596                        .iter()
8597                        .enumerate()
8598                        .all(|(i, &li)| child_row.values.get(li) == Some(old_key[i]));
8599                    if !matches_key {
8600                        continue;
8601                    }
8602                    match fk.on_update {
8603                        spg_storage::FkAction::Restrict
8604                        | spg_storage::FkAction::NoAction => {
8605                            return Err(EngineError::Unsupported(alloc::format!(
8606                                "FOREIGN KEY violation: UPDATE on {parent_table_name:?} PK is \
8607                                 restricted by FK from {child_name:?}.{:?}",
8608                                fk.local_columns,
8609                            )));
8610                        }
8611                        spg_storage::FkAction::Cascade => {
8612                            // Rewrite child FK columns to new key.
8613                            let entry = cascade_plan.entry(child_name.clone()).or_default();
8614                            for (i, &li) in fk.local_columns.iter().enumerate() {
8615                                entry.insert((child_row_idx, li), new_key[i].clone());
8616                            }
8617                        }
8618                        spg_storage::FkAction::SetNull => {
8619                            for &li in &fk.local_columns {
8620                                let col = child.schema().columns.get(li).ok_or_else(|| {
8621                                    EngineError::Unsupported(alloc::format!(
8622                                        "FK local column {li} missing in {child_name:?}"
8623                                    ))
8624                                })?;
8625                                if !col.nullable {
8626                                    return Err(EngineError::Unsupported(alloc::format!(
8627                                        "FOREIGN KEY ON UPDATE SET NULL: column \
8628                                         {child_name:?}.{:?} is NOT NULL",
8629                                        col.name,
8630                                    )));
8631                                }
8632                            }
8633                            let entry = setnull_plan.entry(child_name.clone()).or_default();
8634                            for &li in &fk.local_columns {
8635                                entry.insert((child_row_idx, li));
8636                            }
8637                        }
8638                        spg_storage::FkAction::SetDefault => {
8639                            let entry =
8640                                setdefault_plan.entry(child_name.clone()).or_default();
8641                            for &li in &fk.local_columns {
8642                                let col = child.schema().columns.get(li).ok_or_else(|| {
8643                                    EngineError::Unsupported(alloc::format!(
8644                                        "FK local column {li} missing in {child_name:?}"
8645                                    ))
8646                                })?;
8647                                let default = col.default.clone().ok_or_else(|| {
8648                                    EngineError::Unsupported(alloc::format!(
8649                                        "FOREIGN KEY ON UPDATE SET DEFAULT: column \
8650                                         {child_name:?}.{:?} has no DEFAULT",
8651                                        col.name,
8652                                    ))
8653                                })?;
8654                                entry.insert((child_row_idx, li), default);
8655                            }
8656                        }
8657                    }
8658                }
8659            }
8660        }
8661    }
8662    // Flatten into FkChildStep list. UPDATE doesn't produce
8663    // DeleteSteps (CASCADE on UPDATE just rewrites FK values).
8664    let mut steps: Vec<FkChildStep> = Vec::new();
8665    for (child_table, entries) in cascade_plan {
8666        let mut positions = Vec::with_capacity(entries.len());
8667        let mut columns = Vec::with_capacity(entries.len());
8668        let mut defaults = Vec::with_capacity(entries.len());
8669        for ((p, c), v) in entries {
8670            positions.push(p);
8671            columns.push(c);
8672            defaults.push(v);
8673        }
8674        // We reuse `FkChildAction::SetDefault` for cascade-update:
8675        // both shapes are "write a known value into specific cells"
8676        // — `apply_per_cell_writes` doesn't care whether the value
8677        // came from a DEFAULT declaration or a new parent key.
8678        steps.push(FkChildStep {
8679            child_table,
8680            action: FkChildAction::SetDefault {
8681                positions,
8682                columns,
8683                defaults,
8684            },
8685        });
8686    }
8687    for (child_table, entries) in setnull_plan {
8688        let (positions, columns): (Vec<usize>, Vec<usize>) = entries.into_iter().unzip();
8689        steps.push(FkChildStep {
8690            child_table,
8691            action: FkChildAction::SetNull { positions, columns },
8692        });
8693    }
8694    for (child_table, entries) in setdefault_plan {
8695        let mut positions = Vec::with_capacity(entries.len());
8696        let mut columns = Vec::with_capacity(entries.len());
8697        let mut defaults = Vec::with_capacity(entries.len());
8698        for ((p, c), v) in entries {
8699            positions.push(p);
8700            columns.push(c);
8701            defaults.push(v);
8702        }
8703        steps.push(FkChildStep {
8704            child_table,
8705            action: FkChildAction::SetDefault {
8706                positions,
8707                columns,
8708                defaults,
8709            },
8710        });
8711    }
8712    let _ = delete_plan; // UPDATE never deletes children.
8713    Ok(steps)
8714}
8715
8716/// v7.6.5 — apply one FK child step to the catalog. Encapsulates
8717/// the three action variants so the DELETE executor stays a
8718/// simple loop over the planned steps.
8719fn apply_fk_child_step(
8720    catalog: &mut Catalog,
8721    step: &FkChildStep,
8722) -> Result<(), EngineError> {
8723    let child = catalog.get_mut(&step.child_table).ok_or_else(|| {
8724        EngineError::Storage(StorageError::TableNotFound {
8725            name: step.child_table.clone(),
8726        })
8727    })?;
8728    match &step.action {
8729        FkChildAction::Delete { positions } => {
8730            let _ = child.delete_rows(positions);
8731        }
8732        FkChildAction::SetNull { positions, columns } => {
8733            apply_per_cell_writes(child, positions, columns, |_| Value::Null)?;
8734        }
8735        FkChildAction::SetDefault {
8736            positions,
8737            columns,
8738            defaults,
8739        } => {
8740            apply_per_cell_writes(child, positions, columns, |i| defaults[i].clone())?;
8741        }
8742    }
8743    Ok(())
8744}
8745
8746/// v7.6.5 — write new values into selected child cells via
8747/// `Table::update_row` (the catalog's existing UPDATE entry).
8748/// Groups writes by row position so multi-column updates on the
8749/// same row only call `update_row` once. `value_for(i)` produces
8750/// the new value for the i-th (position, column) entry.
8751fn apply_per_cell_writes(
8752    child: &mut spg_storage::Table,
8753    positions: &[usize],
8754    columns: &[usize],
8755    mut value_for: impl FnMut(usize) -> Value,
8756) -> Result<(), EngineError> {
8757    use alloc::collections::BTreeMap;
8758    let mut by_row: BTreeMap<usize, Vec<(usize, Value)>> = BTreeMap::new();
8759    for i in 0..positions.len() {
8760        by_row
8761            .entry(positions[i])
8762            .or_default()
8763            .push((columns[i], value_for(i)));
8764    }
8765    for (pos, mutations) in by_row {
8766        let mut new_values = child.rows()[pos].values.clone();
8767        for (col, v) in mutations {
8768            if let Some(slot) = new_values.get_mut(col) {
8769                *slot = v;
8770            }
8771        }
8772        child
8773            .update_row(pos, new_values)
8774            .map_err(EngineError::Storage)?;
8775    }
8776    Ok(())
8777}
8778
8779fn fk_action_sql_to_storage(a: spg_sql::ast::FkAction) -> spg_storage::FkAction {
8780    match a {
8781        spg_sql::ast::FkAction::Restrict => spg_storage::FkAction::Restrict,
8782        spg_sql::ast::FkAction::Cascade => spg_storage::FkAction::Cascade,
8783        spg_sql::ast::FkAction::SetNull => spg_storage::FkAction::SetNull,
8784        spg_sql::ast::FkAction::SetDefault => spg_storage::FkAction::SetDefault,
8785        spg_sql::ast::FkAction::NoAction => spg_storage::FkAction::NoAction,
8786    }
8787}
8788
8789/// v7.9.21 — resolve a column's DEFAULT for INSERT-time
8790/// default-fill. Free fn (rather than `&self`) so callers
8791/// with an active `&mut Table` borrow can still use it.
8792/// Literal defaults take the cached path (`col.default`);
8793/// runtime defaults hit `clock_fn` at each call. mailrs G4.
8794fn resolve_column_default_free(
8795    col: &ColumnSchema,
8796    clock_fn: Option<ClockFn>,
8797) -> Result<Value, EngineError> {
8798    if let Some(rt) = &col.runtime_default {
8799        return eval_runtime_default_free(rt, col.ty, clock_fn);
8800    }
8801    Ok(col.default.clone().unwrap_or(Value::Null))
8802}
8803
8804fn eval_runtime_default_free(
8805    rt: &str,
8806    ty: DataType,
8807    clock_fn: Option<ClockFn>,
8808) -> Result<Value, EngineError> {
8809    let s = rt.trim().to_ascii_lowercase();
8810    let canonical = s.trim_end_matches("()");
8811    let now_us = match clock_fn {
8812        Some(f) => f(),
8813        None => 0,
8814    };
8815    let v = match canonical {
8816        "now" | "current_timestamp" | "localtimestamp" => {
8817            Value::Timestamp(now_us)
8818        }
8819        "current_date" => Value::Date((now_us / 86_400_000_000) as i32),
8820        "current_time" | "localtime" => Value::Timestamp(now_us),
8821        other => {
8822            return Err(EngineError::Unsupported(alloc::format!(
8823                "runtime DEFAULT expression {other:?} not supported \
8824                 (v7.9.21 whitelist: now() / current_timestamp / \
8825                 current_date / current_time / localtimestamp / \
8826                 localtime)"
8827            )));
8828        }
8829    };
8830    coerce_value(v, ty, "DEFAULT", 0)
8831}
8832
8833/// v7.9.21 — true when a DEFAULT expression needs INSERT-time
8834/// evaluation rather than being cacheable as a literal Value.
8835/// FunctionCall is the immediate case (`now()`,
8836/// `current_timestamp`). Literal expressions and simple sign-
8837/// flipped numerics still take the static-cache path.
8838fn is_runtime_default_expr(expr: &Expr) -> bool {
8839    match expr {
8840        Expr::FunctionCall { .. } => true,
8841        Expr::Unary { expr, .. } => is_runtime_default_expr(expr),
8842        _ => false,
8843    }
8844}
8845
8846fn column_def_to_schema(c: ColumnDef) -> Result<ColumnSchema, EngineError> {
8847    let ty = column_type_to_data_type(c.ty);
8848    let mut schema = ColumnSchema::new(c.name.clone(), ty, c.nullable);
8849    if let Some(default_expr) = c.default {
8850        // v7.9.21 — distinguish literal defaults (evaluated once
8851        // at CREATE TABLE) from expression defaults (deferred to
8852        // INSERT). Function calls (`now()`, `current_timestamp`
8853        // — see v7.9.20 keyword promotion) take the runtime path.
8854        // Literals continue to cache. mailrs G4.
8855        if is_runtime_default_expr(&default_expr) {
8856            let display = alloc::format!("{default_expr}");
8857            schema = schema.with_runtime_default(display);
8858        } else {
8859            let raw = literal_expr_to_value(default_expr)?;
8860            let coerced = coerce_value(raw, ty, &c.name, 0)?;
8861            schema = schema.with_default(coerced);
8862        }
8863    }
8864    if c.auto_increment {
8865        // AUTO_INCREMENT only makes sense on integer-shaped columns.
8866        if !matches!(ty, DataType::SmallInt | DataType::Int | DataType::BigInt) {
8867            return Err(EngineError::Unsupported(alloc::format!(
8868                "AUTO_INCREMENT requires an integer column type, got {ty:?}"
8869            )));
8870        }
8871        schema = schema.with_auto_increment();
8872    }
8873    Ok(schema)
8874}
8875
/// v7.10.4 — decode a BYTEA literal. Surrounding whitespace is
/// trimmed first, then one of three forms is accepted:
///   * `\xDEADBEEF` — hex form (`\x` or `\X` prefix, digits in
///     either case, embedded whitespace stripped).
///   * `Hello\000world` — escape form: `\\` is a literal backslash
///     and `\NNN` is one byte given as three octal digits in the
///     range `\000`..=`\377`.
///   * Anything else → raw UTF-8 bytes of the input.
///
/// A backslash that does not start a valid escape (including
/// `\NNN` with a non-octal digit such as `8`/`9`, or a value above
/// `\377`) is kept as a literal byte.
///
/// # Errors
///
/// Only the hex form can fail: `"odd-length hex literal"` or
/// `"invalid hex digit"`.
fn decode_bytea_literal(s: &str) -> Result<alloc::vec::Vec<u8>, &'static str> {
    let s = s.trim();
    if let Some(hex) = s.strip_prefix("\\x").or_else(|| s.strip_prefix("\\X")) {
        // Hex form. Each pair of hex digits → one byte.
        let cleaned: alloc::string::String = hex.chars().filter(|c| !c.is_whitespace()).collect();
        if cleaned.len() % 2 != 0 {
            return Err("odd-length hex literal");
        }
        return cleaned
            .as_bytes()
            .chunks_exact(2)
            .map(|pair| {
                let hi = hex_nibble(pair[0])?;
                let lo = hex_nibble(pair[1])?;
                Ok((hi << 4) | lo)
            })
            .collect();
    }
    // Escape form or raw. Walk byte-by-byte; `\\` and `\NNN` octal
    // sequences decode; anything else is a literal byte.
    let bytes = s.as_bytes();
    let mut out = alloc::vec::Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i..] {
            [b'\\', b'\\', ..] => {
                out.push(b'\\');
                i += 2;
            }
            // Three octal digits. Restricting the leading digit to
            // 0..=3 keeps the value within one byte; `8` and `9`
            // are not octal digits and must not be folded in.
            [b'\\', d2 @ b'0'..=b'3', d1 @ b'0'..=b'7', d0 @ b'0'..=b'7', ..] => {
                out.push(((d2 - b'0') << 6) | ((d1 - b'0') << 3) | (d0 - b'0'));
                i += 4;
            }
            [literal, ..] => {
                out.push(literal);
                i += 1;
            }
            [] => break,
        }
    }
    Ok(out)
}

/// Value of one ASCII hex digit (either case), or
/// `"invalid hex digit"` for any other byte.
fn hex_nibble(b: u8) -> Result<u8, &'static str> {
    match b {
        b'0'..=b'9' => Ok(b - b'0'),
        b'a'..=b'f' => Ok(b - b'a' + 10),
        b'A'..=b'F' => Ok(b - b'A' + 10),
        _ => Err("invalid hex digit"),
    }
}
8937
8938/// v7.10.4 — encode BYTEA bytes in PG hex output format
8939/// (`\x` prefix, lowercase hex pairs). Used by Text-side
8940/// round-trip + the wire layer's text-mode encoder.
8941fn encode_bytea_hex(b: &[u8]) -> alloc::string::String {
8942    let mut out = alloc::string::String::with_capacity(2 + 2 * b.len());
8943    out.push_str("\\x");
8944    for byte in b {
8945        let hi = byte >> 4;
8946        let lo = byte & 0x0F;
8947        out.push(hex_digit(hi));
8948        out.push(hex_digit(lo));
8949    }
8950    out
8951}
8952
/// Lowercase hex character for a nibble value `0..=15`; any larger
/// input yields `'?'`.
const fn hex_digit(n: u8) -> char {
    const LOWER_HEX: [u8; 16] = *b"0123456789abcdef";
    if n < 16 {
        LOWER_HEX[n as usize] as char
    } else {
        '?'
    }
}
8960
8961const fn column_type_to_data_type(t: ColumnTypeName) -> DataType {
8962    match t {
8963        ColumnTypeName::SmallInt => DataType::SmallInt,
8964        ColumnTypeName::Int => DataType::Int,
8965        ColumnTypeName::BigInt => DataType::BigInt,
8966        ColumnTypeName::Float => DataType::Float,
8967        ColumnTypeName::Text => DataType::Text,
8968        ColumnTypeName::Varchar(n) => DataType::Varchar(n),
8969        ColumnTypeName::Char(n) => DataType::Char(n),
8970        ColumnTypeName::Bool => DataType::Bool,
8971        ColumnTypeName::Vector { dim, encoding } => DataType::Vector {
8972            dim,
8973            encoding: match encoding {
8974                SqlVecEncoding::F32 => VecEncoding::F32,
8975                SqlVecEncoding::Sq8 => VecEncoding::Sq8,
8976                SqlVecEncoding::F16 => VecEncoding::F16,
8977            },
8978        },
8979        ColumnTypeName::Numeric(precision, scale) => DataType::Numeric { precision, scale },
8980        ColumnTypeName::Date => DataType::Date,
8981        ColumnTypeName::Timestamp => DataType::Timestamp,
8982        ColumnTypeName::Timestamptz => DataType::Timestamptz,
8983        ColumnTypeName::Json => DataType::Json,
8984        ColumnTypeName::Jsonb => DataType::Jsonb,
8985        ColumnTypeName::Bytes => DataType::Bytes,
8986    }
8987}
8988
8989/// Convert an INSERT VALUES expression to a storage Value. Supports literal
8990/// expressions, unary-minus over numeric literals, and pgvector-style
8991/// `'[..]'::vector` cast (v1.2). Anything more complex returns `Unsupported`.
8992fn literal_expr_to_value(expr: Expr) -> Result<Value, EngineError> {
8993    match expr {
8994        Expr::Literal(l) => Ok(literal_to_value(l)),
8995        Expr::Cast { expr, target } => {
8996            let inner_value = literal_expr_to_value(*expr)?;
8997            crate::eval::cast_value(inner_value, target).map_err(EngineError::Eval)
8998        }
8999        Expr::Unary {
9000            op: UnOp::Neg,
9001            expr,
9002        } => match *expr {
9003            Expr::Literal(Literal::Integer(n)) => {
9004                // Fold to i32 if it fits, else BigInt. Parser emits Integer(i64)
9005                // — overflow on negate of i64::MIN is the one edge case.
9006                let neg = n.checked_neg().ok_or_else(|| {
9007                    EngineError::Unsupported("integer literal overflow on negation".into())
9008                })?;
9009                Ok(int_value_for(neg))
9010            }
9011            Expr::Literal(Literal::Float(x)) => Ok(Value::Float(-x)),
9012            other => Err(EngineError::Unsupported(alloc::format!(
9013                "unary minus over non-literal expression: {other:?}"
9014            ))),
9015        },
9016        other => Err(EngineError::Unsupported(alloc::format!(
9017            "non-literal INSERT value expression: {other:?}"
9018        ))),
9019    }
9020}
9021
9022fn literal_to_value(l: Literal) -> Value {
9023    match l {
9024        Literal::Integer(n) => int_value_for(n),
9025        Literal::Float(x) => Value::Float(x),
9026        Literal::String(s) => Value::Text(s),
9027        Literal::Bool(b) => Value::Bool(b),
9028        Literal::Null => Value::Null,
9029        Literal::Vector(v) => Value::Vector(v),
9030        Literal::Interval { months, micros, .. } => Value::Interval { months, micros },
9031    }
9032}
9033
9034/// Pick `Int` (`i32`) when the literal fits, else `BigInt`. `INT` vs `BIGINT`
9035/// columns will still enforce the right tag downstream — this is just the
9036/// default we synthesise from an unannotated integer literal.
9037fn int_value_for(n: i64) -> Value {
9038    if let Ok(small) = i32::try_from(n) {
9039        Value::Int(small)
9040    } else {
9041        Value::BigInt(n)
9042    }
9043}
9044
9045/// Widen / narrow `v` to fit `expected`. Numerics permit safe widening
9046/// (`Int → BigInt`, `Int/BigInt → Float`) and best-effort narrowing
9047/// (`BigInt → Int` succeeds only when the value fits in `i32`). Everything
9048/// else returns `TypeMismatch` carrying the column name for caller diagnostics.
9049/// `NULL` is always permitted; the nullability check happens later in storage.
9050#[allow(clippy::too_many_lines)]
9051fn coerce_value(
9052    v: Value,
9053    expected: DataType,
9054    col_name: &str,
9055    position: usize,
9056) -> Result<Value, EngineError> {
9057    if v.is_null() {
9058        return Ok(Value::Null);
9059    }
9060    let actual = v.data_type().expect("non-null");
9061    if actual == expected {
9062        return Ok(v);
9063    }
9064    let coerced =
9065        match (v, expected) {
9066            (Value::Int(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
9067            (Value::Int(n), DataType::Float) => Some(Value::Float(f64::from(n))),
9068            (Value::Int(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
9069            (Value::Int(n), DataType::Numeric { precision, scale }) => Some(numeric_from_integer(
9070                i128::from(n),
9071                precision,
9072                scale,
9073                col_name,
9074            )?),
9075            (Value::SmallInt(n), DataType::Int) => Some(Value::Int(i32::from(n))),
9076            (Value::SmallInt(n), DataType::BigInt) => Some(Value::BigInt(i64::from(n))),
9077            (Value::SmallInt(n), DataType::Float) => Some(Value::Float(f64::from(n))),
9078            (Value::SmallInt(n), DataType::Numeric { precision, scale }) => Some(
9079                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
9080            ),
9081            (Value::BigInt(n), DataType::Int) => i32::try_from(n).ok().map(Value::Int),
9082            (Value::BigInt(n), DataType::SmallInt) => i16::try_from(n).ok().map(Value::SmallInt),
9083            #[allow(clippy::cast_precision_loss)]
9084            (Value::BigInt(n), DataType::Float) => Some(Value::Float(n as f64)),
9085            (Value::BigInt(n), DataType::Numeric { precision, scale }) => Some(
9086                numeric_from_integer(i128::from(n), precision, scale, col_name)?,
9087            ),
9088            (Value::Float(x), DataType::Numeric { precision, scale }) => {
9089                Some(numeric_from_float(x, precision, scale, col_name)?)
9090            }
9091            // Text → DATE / TIMESTAMP: parse canonical text forms.
9092            (Value::Text(s), DataType::Date) => {
9093                let d = eval::parse_date_literal(&s).ok_or_else(|| {
9094                    EngineError::Eval(EvalError::TypeMismatch {
9095                        detail: alloc::format!(
9096                            "cannot parse {s:?} as DATE for column `{col_name}`"
9097                        ),
9098                    })
9099                })?;
9100                Some(Value::Date(d))
9101            }
9102            // v4.9: Text ↔ JSON coercion. No structural validation —
9103            // any text literal is accepted; the responsibility for
9104            // valid JSON lies with the producer.
9105            (Value::Text(s), DataType::Json | DataType::Jsonb) => Some(Value::Json(s)),
9106            (Value::Json(s), DataType::Text) => Some(Value::Text(s)),
9107            // v7.10.4 — Text → BYTEA. Decode PG-style literal forms:
9108            //   - Hex:    `\x48656c6c6f`  (case-insensitive hex pairs)
9109            //   - Escape: `Hello\\000world`  (backslash + octal triples)
9110            //   - Plain:  any string → raw UTF-8 bytes (PG also accepts)
9111            // Errors surface as TypeMismatch so the operator gets a
9112            // clear "this literal isn't a bytea literal" hint.
9113            (Value::Text(s), DataType::Bytes) => {
9114                let bytes = decode_bytea_literal(&s).map_err(|e| {
9115                    EngineError::Eval(EvalError::TypeMismatch {
9116                        detail: alloc::format!(
9117                            "cannot parse {s:?} as BYTEA for column `{col_name}`: {e}"
9118                        ),
9119                    })
9120                })?;
9121                Some(Value::Bytes(bytes))
9122            }
9123            // v7.10.4 — BYTEA → Text round-trip uses the PG hex
9124            // output (lowercase, `\x` prefix). Important when a
9125            // SELECT pulls a bytea cell through a Text column path.
9126            (Value::Bytes(b), DataType::Text) => Some(Value::Text(encode_bytea_hex(&b))),
9127            (Value::Text(s), DataType::Timestamp | DataType::Timestamptz) => {
9128                let t = eval::parse_timestamp_literal(&s).ok_or_else(|| {
9129                    EngineError::Eval(EvalError::TypeMismatch {
9130                        detail: alloc::format!(
9131                            "cannot parse {s:?} as TIMESTAMP for column `{col_name}`"
9132                        ),
9133                    })
9134                })?;
9135                Some(Value::Timestamp(t))
9136            }
9137            // DATE ↔ TIMESTAMP convertibility (DATE → midnight,
9138            // TIMESTAMP → day truncation).
9139            (Value::Date(d), DataType::Timestamp | DataType::Timestamptz) => {
9140                Some(Value::Timestamp(i64::from(d) * 86_400_000_000))
9141            }
9142            // v7.9.21 — Value::Timestamp lands in either Timestamp
9143            // or Timestamptz columns; the on-disk layout is the
9144            // same i64 microseconds UTC.
9145            (Value::Timestamp(t), DataType::Timestamptz) => Some(Value::Timestamp(t)),
9146            (Value::Timestamp(t), DataType::Date) => {
9147                let days = t.div_euclid(86_400_000_000);
9148                i32::try_from(days).ok().map(Value::Date)
9149            }
9150            (
9151                Value::Numeric {
9152                    scaled,
9153                    scale: src_scale,
9154                },
9155                DataType::Numeric { precision, scale },
9156            ) => Some(numeric_rescale(
9157                scaled, src_scale, precision, scale, col_name,
9158            )?),
9159            #[allow(clippy::cast_precision_loss)]
9160            (Value::Numeric { scaled, scale }, DataType::Float) => {
9161                let mut div = 1.0_f64;
9162                for _ in 0..scale {
9163                    div *= 10.0;
9164                }
9165                Some(Value::Float((scaled as f64) / div))
9166            }
9167            (Value::Numeric { scaled, scale }, DataType::Int) => {
9168                let truncated = numeric_truncate_to_integer(scaled, scale);
9169                i32::try_from(truncated).ok().map(Value::Int)
9170            }
9171            (Value::Numeric { scaled, scale }, DataType::BigInt) => {
9172                let truncated = numeric_truncate_to_integer(scaled, scale);
9173                i64::try_from(truncated).ok().map(Value::BigInt)
9174            }
9175            (Value::Numeric { scaled, scale }, DataType::SmallInt) => {
9176                let truncated = numeric_truncate_to_integer(scaled, scale);
9177                i16::try_from(truncated).ok().map(Value::SmallInt)
9178            }
9179            // VARCHAR(n) enforces an upper bound on character count.
9180            (Value::Text(s), DataType::Varchar(max)) => {
9181                if u32::try_from(s.chars().count()).unwrap_or(u32::MAX) <= max {
9182                    Some(Value::Text(s))
9183                } else {
9184                    return Err(EngineError::Unsupported(alloc::format!(
9185                        "value for VARCHAR({max}) column `{col_name}` exceeds length: \
9186                     {} chars",
9187                        s.chars().count()
9188                    )));
9189                }
9190            }
9191            // v6.0.1: f32 → SQ8 INSERT-time quantisation. Triggered
9192            // when the column declares `VECTOR(N) USING SQ8` and
9193            // the INSERT VALUES expression yields a raw f32 vector
9194            // (the normal pgvector-shape literal). Dim mismatch
9195            // falls through the `_ => None` arm and surfaces as
9196            // `TypeMismatch` with the expected SQ8 column type —
9197            // matching the F32 path's existing error.
9198            (
9199                Value::Vector(v),
9200                DataType::Vector {
9201                    dim,
9202                    encoding: VecEncoding::Sq8,
9203                },
9204            ) if v.len() == dim as usize => {
9205                Some(Value::Sq8Vector(spg_storage::quantize::quantize(&v)))
9206            }
9207            // v6.0.3: f32 → f16 INSERT-time conversion for HALF
9208            // columns. Bit-exact at the storage layer (modulo
9209            // half-precision rounding); no rerank pass needed at
9210            // search time.
9211            (
9212                Value::Vector(v),
9213                DataType::Vector {
9214                    dim,
9215                    encoding: VecEncoding::F16,
9216                },
9217            ) if v.len() == dim as usize => Some(Value::HalfVector(
9218                spg_storage::halfvec::HalfVector::from_f32_slice(&v),
9219            )),
9220            // CHAR(n) right-pads with U+0020 to exactly n chars; if the input
9221            // is already longer we reject (PG truncates trailing-space-only;
9222            // staying strict for v1).
9223            (Value::Text(s), DataType::Char(size)) => {
9224                let len = u32::try_from(s.chars().count()).unwrap_or(u32::MAX);
9225                if len > size {
9226                    return Err(EngineError::Unsupported(alloc::format!(
9227                        "value for CHAR({size}) column `{col_name}` exceeds length: \
9228                     {len} chars"
9229                    )));
9230                }
9231                let need = (size - len) as usize;
9232                let mut padded = s;
9233                padded.reserve(need);
9234                for _ in 0..need {
9235                    padded.push(' ');
9236                }
9237                Some(Value::Text(padded))
9238            }
9239            _ => None,
9240        };
9241    coerced.ok_or(EngineError::Storage(StorageError::TypeMismatch {
9242        column: col_name.into(),
9243        expected,
9244        actual,
9245        position,
9246    }))
9247}
9248
9249#[cfg(test)]
9250mod tests {
9251    use super::*;
9252    use alloc::vec;
9253
9254    fn unwrap_command_ok(r: &QueryResult) -> usize {
9255        match r {
9256            QueryResult::CommandOk { affected, .. } => *affected,
9257            QueryResult::Rows { .. } => panic!("expected CommandOk, got Rows"),
9258        }
9259    }
9260
9261    #[test]
9262    fn create_table_registers_schema() {
9263        let mut e = Engine::new();
9264        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT)")
9265            .unwrap();
9266        assert_eq!(e.catalog().table_count(), 1);
9267        let t = e.catalog().get("foo").unwrap();
9268        assert_eq!(t.schema().columns.len(), 2);
9269        assert_eq!(t.schema().columns[0].ty, DataType::Int);
9270        assert!(!t.schema().columns[0].nullable);
9271        assert_eq!(t.schema().columns[1].ty, DataType::Text);
9272    }
9273
9274    #[test]
9275    fn create_table_vector_default_is_f32_encoded() {
9276        let mut e = Engine::new();
9277        e.execute("CREATE TABLE t (v VECTOR(8))").unwrap();
9278        let t = e.catalog().get("t").unwrap();
9279        assert_eq!(
9280            t.schema().columns[0].ty,
9281            DataType::Vector {
9282                dim: 8,
9283                encoding: VecEncoding::F32,
9284            },
9285        );
9286    }
9287
9288    #[test]
9289    fn create_table_vector_using_sq8_succeeds() {
9290        // v6.0.1 step 3: the step-1 fence in `column_def_to_schema`
9291        // is lifted. CREATE TABLE persists an SQ8 column type in
9292        // the catalog; INSERT (next test) quantises raw f32 input.
9293        let mut e = Engine::new();
9294        e.execute("CREATE TABLE t (v VECTOR(8) USING SQ8)").unwrap();
9295        let t = e.catalog().get("t").unwrap();
9296        assert_eq!(
9297            t.schema().columns[0].ty,
9298            DataType::Vector {
9299                dim: 8,
9300                encoding: VecEncoding::Sq8,
9301            },
9302        );
9303    }
9304
9305    #[test]
9306    fn insert_into_sq8_column_quantises_f32_payload() {
9307        // v6.0.1 step 3: INSERT-time `coerce_value` rewrites a raw
9308        // `Value::Vector(Vec<f32>)` literal into the column's
9309        // quantised representation. The row that lands in the
9310        // catalog must therefore hold a `Value::Sq8Vector`, not the
9311        // original f32 buffer — that's the bit that delivers the
9312        // 4× compression target.
9313        let mut e = Engine::new();
9314        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
9315        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
9316            .unwrap();
9317        let t = e.catalog().get("t").unwrap();
9318        assert_eq!(t.rows().len(), 1);
9319        match &t.rows()[0].values[0] {
9320            Value::Sq8Vector(q) => {
9321                assert_eq!(q.bytes.len(), 4);
9322                // min/max are derived from the payload: min=0.0, max=1.0.
9323                assert!((q.min - 0.0).abs() < 1e-6);
9324                assert!((q.max - 1.0).abs() < 1e-6);
9325            }
9326            other => panic!("expected Sq8Vector cell, got {other:?}"),
9327        }
9328    }
9329
9330    #[test]
9331    fn create_table_vector_using_half_succeeds_and_insert_converts_to_f16() {
9332        // v6.0.3: CREATE TABLE accepts USING HALF; INSERT path
9333        // converts the incoming `Value::Vector(Vec<f32>)` cell
9334        // into `Value::HalfVector(HalfVector)` via the new
9335        // `coerce_value` arm. The dequantised round-trip is
9336        // bit-exact for f16-representable values, so 0.0 / 0.25
9337        // / 0.5 / 1.0 hit their grid points exactly.
9338        let mut e = Engine::new();
9339        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
9340            .unwrap();
9341        e.execute("INSERT INTO t VALUES ([0.0, 0.25, 0.5, 1.0])")
9342            .unwrap();
9343        let t = e.catalog().get("t").unwrap();
9344        assert_eq!(t.rows().len(), 1);
9345        match &t.rows()[0].values[0] {
9346            Value::HalfVector(h) => {
9347                assert_eq!(h.dim(), 4);
9348                let back = h.to_f32_vec();
9349                let expected = alloc::vec![0.0_f32, 0.25, 0.5, 1.0];
9350                for (g, e) in back.iter().zip(expected.iter()) {
9351                    assert!(
9352                        (g - e).abs() < 1e-6,
9353                        "{g} vs {e} should be exact on f16 grid"
9354                    );
9355                }
9356            }
9357            other => panic!("expected HalfVector cell, got {other:?}"),
9358        }
9359    }
9360
9361    #[test]
9362    fn alter_index_rebuild_in_place_succeeds() {
9363        // v6.0.4: bare REBUILD (no encoding switch) walks every
9364        // row again to rebuild the NSW graph. Verifies the engine
9365        // dispatch + storage helper plumbing without changing any
9366        // cell encoding.
9367        let mut e = Engine::new();
9368        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
9369            .unwrap();
9370        for i in 0..8_i32 {
9371            #[allow(clippy::cast_precision_loss)]
9372            let base = (i as f32) * 0.1;
9373            e.execute(&alloc::format!(
9374                "INSERT INTO t VALUES ({i}, [{base}, {b1}, {b2}])",
9375                b1 = base + 0.01,
9376                b2 = base + 0.02,
9377            ))
9378            .unwrap();
9379        }
9380        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
9381        e.execute("ALTER INDEX t_idx REBUILD").unwrap();
9382        // Schema encoding stays F32 (no encoding clause).
9383        assert_eq!(
9384            e.catalog().get("t").unwrap().schema().columns[1].ty,
9385            DataType::Vector {
9386                dim: 3,
9387                encoding: VecEncoding::F32,
9388            },
9389        );
9390    }
9391
9392    #[test]
9393    fn alter_index_rebuild_with_encoding_switches_cell_type() {
9394        // v6.0.4: REBUILD WITH (encoding = SQ8) recodes every
9395        // stored cell from F32 → SQ8 + rebuilds the graph atop the
9396        // new encoding. Post-rebuild, cells must be Sq8Vector and
9397        // the schema must report encoding = Sq8.
9398        let mut e = Engine::new();
9399        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(4) NOT NULL)")
9400            .unwrap();
9401        e.execute("INSERT INTO t VALUES (1, [0.0, 0.25, 0.5, 1.0])")
9402            .unwrap();
9403        e.execute("CREATE INDEX t_idx ON t USING hnsw (v)").unwrap();
9404        e.execute("ALTER INDEX t_idx REBUILD WITH (encoding = SQ8)")
9405            .unwrap();
9406        let t = e.catalog().get("t").unwrap();
9407        assert_eq!(
9408            t.schema().columns[1].ty,
9409            DataType::Vector {
9410                dim: 4,
9411                encoding: VecEncoding::Sq8,
9412            },
9413        );
9414        assert!(matches!(t.rows()[0].values[1], Value::Sq8Vector(_)));
9415    }
9416
9417    #[test]
9418    fn alter_index_rebuild_unknown_index_errors() {
9419        let mut e = Engine::new();
9420        let err = e.execute("ALTER INDEX nope REBUILD").unwrap_err();
9421        assert!(
9422            matches!(
9423                &err,
9424                EngineError::Storage(StorageError::IndexNotFound { name }) if name == "nope"
9425            ),
9426            "got: {err}"
9427        );
9428    }
9429
9430    #[test]
9431    fn alter_index_rebuild_on_btree_index_errors() {
9432        // REBUILD on a B-tree index has no semantic meaning in
9433        // v6.0.4 — rejected at the storage layer with `Unsupported`.
9434        let mut e = Engine::new();
9435        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9436        e.execute("INSERT INTO t VALUES (1)").unwrap();
9437        e.execute("CREATE INDEX t_idx ON t (id)").unwrap();
9438        let err = e.execute("ALTER INDEX t_idx REBUILD").unwrap_err();
9439        assert!(
9440            matches!(&err, EngineError::Storage(StorageError::Unsupported(_))),
9441            "got: {err}"
9442        );
9443    }
9444
9445    #[test]
9446    fn prepared_insert_substitutes_placeholders() {
9447        // v6.1.1: prepare() parses once; execute_prepared() walks the
9448        // AST and replaces $1/$2 with the param Values BEFORE the
9449        // dispatch sees them. Same logical result as a simple-query
9450        // INSERT, but parse happens once per *statement*, not per
9451        // execution.
9452        let mut e = Engine::new();
9453        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT NOT NULL)")
9454            .unwrap();
9455        let stmt = e.prepare("INSERT INTO t VALUES ($1, $2)").unwrap();
9456        for (id, name) in [(1, "alice"), (2, "bob"), (3, "carol")] {
9457            e.execute_prepared(
9458                stmt.clone(),
9459                &[Value::Int(id), Value::Text(name.into())],
9460            )
9461            .unwrap();
9462        }
9463        // Read back via simple-query SELECT.
9464        let rows_result = e.execute("SELECT id, name FROM t").unwrap();
9465        let QueryResult::Rows { rows, .. } = rows_result else {
9466            panic!("expected Rows")
9467        };
9468        assert_eq!(rows.len(), 3);
9469    }
9470
9471    #[test]
9472    fn prepared_select_with_placeholder_filters_rows() {
9473        let mut e = Engine::new();
9474        e.execute("CREATE TABLE t (id INT NOT NULL, v INT NOT NULL)")
9475            .unwrap();
9476        for i in 0..10_i32 {
9477            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, {})", i * 7))
9478                .unwrap();
9479        }
9480        let stmt = e
9481            .prepare("SELECT id FROM t WHERE v = $1")
9482            .unwrap();
9483        let QueryResult::Rows { rows, .. } = e
9484            .execute_prepared(stmt, &[Value::Int(35)])
9485            .unwrap()
9486        else {
9487            panic!("expected Rows")
9488        };
9489        // v = 35 means i*7 = 35 → i = 5.
9490        assert_eq!(rows.len(), 1);
9491        assert_eq!(rows[0].values[0], Value::Int(5));
9492    }
9493
9494    #[test]
9495    fn prepared_too_few_params_errors() {
9496        let mut e = Engine::new();
9497        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9498        let stmt = e.prepare("INSERT INTO t VALUES ($1)").unwrap();
9499        let err = e.execute_prepared(stmt, &[]).unwrap_err();
9500        assert!(
9501            matches!(
9502                &err,
9503                EngineError::Eval(EvalError::PlaceholderOutOfRange { n: 1, bound: 0 })
9504            ),
9505            "got: {err}"
9506        );
9507    }
9508
9509    #[test]
9510    fn insert_into_half_column_dim_mismatch_errors() {
9511        let mut e = Engine::new();
9512        e.execute("CREATE TABLE t (v VECTOR(4) USING HALF)")
9513            .unwrap();
9514        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
9515        assert!(matches!(
9516            &err,
9517            EngineError::Storage(StorageError::TypeMismatch { .. })
9518        ));
9519    }
9520
9521    #[test]
9522    fn insert_into_sq8_column_dim_mismatch_errors() {
9523        // Dim mismatch falls through the `coerce_value` Vector→Sq8
9524        // arm's guard and surfaces as `TypeMismatch` — the same
9525        // error the F32 path produces today, so client error
9526        // handling stays uniform across encodings.
9527        let mut e = Engine::new();
9528        e.execute("CREATE TABLE t (v VECTOR(4) USING SQ8)").unwrap();
9529        let err = e.execute("INSERT INTO t VALUES ([1.0, 2.0])").unwrap_err();
9530        assert!(
9531            matches!(
9532                &err,
9533                EngineError::Storage(StorageError::TypeMismatch { .. })
9534            ),
9535            "got: {err}",
9536        );
9537    }
9538
9539    #[test]
9540    fn create_table_duplicate_errors() {
9541        let mut e = Engine::new();
9542        e.execute("CREATE TABLE foo (a INT)").unwrap();
9543        let err = e.execute("CREATE TABLE foo (a INT)").unwrap_err();
9544        assert!(matches!(
9545            err,
9546            EngineError::Storage(StorageError::DuplicateTable { ref name }) if name == "foo"
9547        ));
9548    }
9549
9550    #[test]
9551    fn insert_into_unknown_table_errors() {
9552        let mut e = Engine::new();
9553        let err = e.execute("INSERT INTO ghost VALUES (1)").unwrap_err();
9554        assert!(matches!(
9555            err,
9556            EngineError::Storage(StorageError::TableNotFound { ref name }) if name == "ghost"
9557        ));
9558    }
9559
9560    #[test]
9561    fn insert_happy_path_reports_one_affected() {
9562        let mut e = Engine::new();
9563        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9564        let r = e.execute("INSERT INTO foo VALUES (42)").unwrap();
9565        assert_eq!(unwrap_command_ok(&r), 1);
9566        assert_eq!(e.catalog().get("foo").unwrap().row_count(), 1);
9567    }
9568
9569    #[test]
9570    fn insert_arity_mismatch_propagates() {
9571        let mut e = Engine::new();
9572        e.execute("CREATE TABLE foo (a INT, b TEXT)").unwrap();
9573        let err = e.execute("INSERT INTO foo VALUES (1)").unwrap_err();
9574        assert!(matches!(
9575            err,
9576            EngineError::Storage(StorageError::ArityMismatch { .. })
9577        ));
9578    }
9579
9580    #[test]
9581    fn insert_negative_integer_via_unary_minus() {
9582        let mut e = Engine::new();
9583        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9584        e.execute("INSERT INTO foo VALUES (-7)").unwrap();
9585        let rows = e.catalog().get("foo").unwrap().rows();
9586        assert_eq!(rows[0].values[0], Value::Int(-7));
9587    }
9588
9589    #[test]
9590    fn insert_non_literal_expr_unsupported() {
9591        let mut e = Engine::new();
9592        e.execute("CREATE TABLE foo (a INT NOT NULL)").unwrap();
9593        let err = e.execute("INSERT INTO foo VALUES (1 + 2)").unwrap_err();
9594        assert!(matches!(err, EngineError::Unsupported(_)));
9595    }
9596
9597    #[test]
9598    fn select_star_returns_all_rows_in_insertion_order() {
9599        let mut e = Engine::new();
9600        e.execute("CREATE TABLE foo (a INT NOT NULL, b TEXT NOT NULL)")
9601            .unwrap();
9602        e.execute("INSERT INTO foo VALUES (1, 'one')").unwrap();
9603        e.execute("INSERT INTO foo VALUES (2, 'two')").unwrap();
9604        e.execute("INSERT INTO foo VALUES (3, 'three')").unwrap();
9605
9606        let r = e.execute("SELECT * FROM foo").unwrap();
9607        let QueryResult::Rows { columns, rows } = r else {
9608            panic!("expected Rows")
9609        };
9610        assert_eq!(columns.len(), 2);
9611        assert_eq!(columns[0].name, "a");
9612        assert_eq!(rows.len(), 3);
9613        assert_eq!(
9614            rows[1].values,
9615            vec![Value::Int(2), Value::Text("two".into())]
9616        );
9617    }
9618
9619    #[test]
9620    fn select_star_on_empty_table_returns_zero_rows() {
9621        let mut e = Engine::new();
9622        e.execute("CREATE TABLE foo (a INT)").unwrap();
9623        let r = e.execute("SELECT * FROM foo").unwrap();
9624        match r {
9625            QueryResult::Rows { rows, .. } => assert!(rows.is_empty()),
9626            QueryResult::CommandOk { .. } => panic!("expected Rows"),
9627        }
9628    }
9629
9630    // --- v0.4: WHERE + projection ------------------------------------------
9631
9632    fn make_three_row_users(e: &mut Engine) {
9633        e.execute("CREATE TABLE users (id INT NOT NULL, name TEXT NOT NULL, score INT)")
9634            .unwrap();
9635        e.execute("INSERT INTO users VALUES (1, 'alice', 90)")
9636            .unwrap();
9637        e.execute("INSERT INTO users VALUES (2, 'bob', NULL)")
9638            .unwrap();
9639        e.execute("INSERT INTO users VALUES (3, 'cara', 70)")
9640            .unwrap();
9641    }
9642
9643    fn unwrap_rows(r: QueryResult) -> (Vec<ColumnSchema>, Vec<Row>) {
9644        match r {
9645            QueryResult::Rows { columns, rows } => (columns, rows),
9646            QueryResult::CommandOk { .. } => panic!("expected Rows"),
9647        }
9648    }
9649
9650    #[test]
9651    fn where_filter_passes_only_true_rows() {
9652        let mut e = Engine::new();
9653        make_three_row_users(&mut e);
9654        let r = e.execute("SELECT * FROM users WHERE id > 1").unwrap();
9655        let (_, rows) = unwrap_rows(r);
9656        assert_eq!(rows.len(), 2);
9657        assert_eq!(rows[0].values[0], Value::Int(2));
9658        assert_eq!(rows[1].values[0], Value::Int(3));
9659    }
9660
9661    #[test]
9662    fn where_with_null_result_filters_out_row() {
9663        let mut e = Engine::new();
9664        make_three_row_users(&mut e);
9665        // score is NULL for bob → score > 80 is NULL → row excluded
9666        let r = e.execute("SELECT * FROM users WHERE score > 80").unwrap();
9667        let (_, rows) = unwrap_rows(r);
9668        assert_eq!(rows.len(), 1);
9669        assert_eq!(rows[0].values[1], Value::Text("alice".into()));
9670    }
9671
9672    #[test]
9673    fn projection_named_columns() {
9674        let mut e = Engine::new();
9675        make_three_row_users(&mut e);
9676        let r = e.execute("SELECT name, score FROM users").unwrap();
9677        let (cols, rows) = unwrap_rows(r);
9678        assert_eq!(cols.len(), 2);
9679        assert_eq!(cols[0].name, "name");
9680        assert_eq!(cols[1].name, "score");
9681        assert_eq!(rows.len(), 3);
9682        assert_eq!(
9683            rows[0].values,
9684            vec![Value::Text("alice".into()), Value::Int(90)]
9685        );
9686    }
9687
9688    #[test]
9689    fn projection_with_column_alias() {
9690        let mut e = Engine::new();
9691        make_three_row_users(&mut e);
9692        let r = e
9693            .execute("SELECT name AS who FROM users WHERE id = 1")
9694            .unwrap();
9695        let (cols, rows) = unwrap_rows(r);
9696        assert_eq!(cols[0].name, "who");
9697        assert_eq!(rows.len(), 1);
9698        assert_eq!(rows[0].values[0], Value::Text("alice".into()));
9699    }
9700
9701    #[test]
9702    fn qualified_column_with_table_alias_resolves() {
9703        let mut e = Engine::new();
9704        make_three_row_users(&mut e);
9705        let r = e
9706            .execute("SELECT u.id, u.name FROM users AS u WHERE u.id < 3")
9707            .unwrap();
9708        let (cols, rows) = unwrap_rows(r);
9709        assert_eq!(cols.len(), 2);
9710        assert_eq!(rows.len(), 2);
9711    }
9712
9713    #[test]
9714    fn qualified_column_with_wrong_alias_errors() {
9715        let mut e = Engine::new();
9716        make_three_row_users(&mut e);
9717        let err = e.execute("SELECT x.id FROM users AS u").unwrap_err();
9718        assert!(matches!(
9719            err,
9720            EngineError::Eval(EvalError::UnknownQualifier { ref qualifier }) if qualifier == "x"
9721        ));
9722    }
9723
9724    #[test]
9725    fn select_unknown_column_errors_in_projection() {
9726        let mut e = Engine::new();
9727        make_three_row_users(&mut e);
9728        let err = e.execute("SELECT ghost FROM users").unwrap_err();
9729        assert!(matches!(
9730            err,
9731            EngineError::Eval(EvalError::ColumnNotFound { ref name }) if name == "ghost"
9732        ));
9733    }
9734
9735    #[test]
9736    fn where_unknown_column_errors() {
9737        let mut e = Engine::new();
9738        make_three_row_users(&mut e);
9739        let err = e
9740            .execute("SELECT * FROM users WHERE ghost = 1")
9741            .unwrap_err();
9742        assert!(matches!(
9743            err,
9744            EngineError::Eval(EvalError::ColumnNotFound { .. })
9745        ));
9746    }
9747
9748    #[test]
9749    fn expression_projection_evaluates_and_renders() {
9750        // Compound expressions in the SELECT list are evaluated per row;
9751        // the output column is typed TEXT, name defaults to the expression.
9752        let mut e = Engine::new();
9753        e.execute("CREATE TABLE t (a INT NOT NULL)").unwrap();
9754        e.execute("INSERT INTO t VALUES (3)").unwrap();
9755        let (_, rows) = unwrap_rows(e.execute("SELECT 1 + 2 FROM t").unwrap());
9756        assert_eq!(rows.len(), 1);
9757        // The expression evaluates to integer 3; rendered as the cell value
9758        // (storage::Value::Int(3) since arithmetic kept ints).
9759        assert_eq!(rows[0].values[0], Value::Int(3));
9760    }
9761
9762    #[test]
9763    fn select_unknown_table_errors() {
9764        let mut e = Engine::new();
9765        let err = e.execute("SELECT * FROM ghost").unwrap_err();
9766        assert!(matches!(
9767            err,
9768            EngineError::Storage(StorageError::TableNotFound { .. })
9769        ));
9770    }
9771
9772    #[test]
9773    fn invalid_sql_returns_parse_error() {
9774        // v4.4: UPDATE is now real SQL, so use a true syntactic
9775        // garbage payload for the parse-error path.
9776        let mut e = Engine::new();
9777        let err = e.execute("THIS_IS_NOT_A_KEYWORD foo bar baz").unwrap_err();
9778        assert!(matches!(err, EngineError::Parse(_)));
9779    }
9780
9781    // --- v0.8 CREATE INDEX + index seek ------------------------------------
9782
9783    #[test]
9784    fn create_index_registers_on_table() {
9785        let mut e = Engine::new();
9786        make_three_row_users(&mut e);
9787        e.execute("CREATE INDEX by_name ON users (name)").unwrap();
9788        let t = e.catalog().get("users").unwrap();
9789        assert_eq!(t.indices().len(), 1);
9790        assert_eq!(t.indices()[0].name, "by_name");
9791    }
9792
9793    #[test]
9794    fn create_index_on_unknown_table_errors() {
9795        let mut e = Engine::new();
9796        let err = e.execute("CREATE INDEX i ON ghost (a)").unwrap_err();
9797        assert!(matches!(
9798            err,
9799            EngineError::Storage(StorageError::TableNotFound { .. })
9800        ));
9801    }
9802
9803    #[test]
9804    fn create_index_on_unknown_column_errors() {
9805        let mut e = Engine::new();
9806        make_three_row_users(&mut e);
9807        let err = e.execute("CREATE INDEX i ON users (ghost)").unwrap_err();
9808        assert!(matches!(
9809            err,
9810            EngineError::Storage(StorageError::ColumnNotFound { .. })
9811        ));
9812    }
9813
9814    #[test]
9815    fn select_eq_uses_index_returns_same_rows_as_scan() {
9816        // Build two engines: one with an index, one without. Same query →
9817        // same row set (index is a planner optimisation, not a semantic
9818        // change).
9819        let mut without = Engine::new();
9820        make_three_row_users(&mut without);
9821        let mut with = Engine::new();
9822        make_three_row_users(&mut with);
9823        with.execute("CREATE INDEX by_id ON users (id)").unwrap();
9824
9825        let q = "SELECT * FROM users WHERE id = 2";
9826        let (_, no_idx_rows) = unwrap_rows(without.execute(q).unwrap());
9827        let (_, idx_rows) = unwrap_rows(with.execute(q).unwrap());
9828        assert_eq!(no_idx_rows, idx_rows);
9829        assert_eq!(idx_rows.len(), 1);
9830    }
9831
9832    #[test]
9833    fn select_eq_with_no_matching_index_value_returns_empty() {
9834        let mut e = Engine::new();
9835        make_three_row_users(&mut e);
9836        e.execute("CREATE INDEX by_id ON users (id)").unwrap();
9837        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM users WHERE id = 999").unwrap());
9838        assert_eq!(rows.len(), 0);
9839    }
9840
9841    // --- v0.9 transactions -------------------------------------------------
9842
9843    #[test]
9844    fn begin_sets_in_transaction_flag() {
9845        let mut e = Engine::new();
9846        assert!(!e.in_transaction());
9847        e.execute("BEGIN").unwrap();
9848        assert!(e.in_transaction());
9849    }
9850
9851    #[test]
9852    fn double_begin_errors() {
9853        let mut e = Engine::new();
9854        e.execute("BEGIN").unwrap();
9855        let err = e.execute("BEGIN").unwrap_err();
9856        assert_eq!(err, EngineError::TransactionAlreadyOpen);
9857    }
9858
9859    #[test]
9860    fn commit_without_begin_errors() {
9861        let mut e = Engine::new();
9862        let err = e.execute("COMMIT").unwrap_err();
9863        assert_eq!(err, EngineError::NoActiveTransaction);
9864    }
9865
9866    #[test]
9867    fn rollback_without_begin_errors() {
9868        let mut e = Engine::new();
9869        let err = e.execute("ROLLBACK").unwrap_err();
9870        assert_eq!(err, EngineError::NoActiveTransaction);
9871    }
9872
9873    #[test]
9874    fn commit_applies_shadow_to_committed_catalog() {
9875        let mut e = Engine::new();
9876        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9877        e.execute("BEGIN").unwrap();
9878        e.execute("INSERT INTO t VALUES (1)").unwrap();
9879        e.execute("INSERT INTO t VALUES (2)").unwrap();
9880        e.execute("COMMIT").unwrap();
9881        assert!(!e.in_transaction());
9882        assert_eq!(e.catalog().get("t").unwrap().row_count(), 2);
9883    }
9884
9885    #[test]
9886    fn rollback_discards_shadow() {
9887        let mut e = Engine::new();
9888        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9889        e.execute("BEGIN").unwrap();
9890        e.execute("INSERT INTO t VALUES (1)").unwrap();
9891        e.execute("INSERT INTO t VALUES (2)").unwrap();
9892        e.execute("ROLLBACK").unwrap();
9893        assert!(!e.in_transaction());
9894        assert_eq!(e.catalog().get("t").unwrap().row_count(), 0);
9895    }
9896
9897    #[test]
9898    fn select_during_tx_sees_uncommitted_writes_own_session() {
9899        // The shadow catalog is read by SELECTs while a TX is open — the
9900        // session can see its own pending writes.
9901        let mut e = Engine::new();
9902        e.execute("CREATE TABLE t (v INT NOT NULL)").unwrap();
9903        e.execute("BEGIN").unwrap();
9904        e.execute("INSERT INTO t VALUES (42)").unwrap();
9905        let (_, rows) = unwrap_rows(e.execute("SELECT * FROM t").unwrap());
9906        assert_eq!(rows.len(), 1);
9907        assert_eq!(rows[0].values[0], Value::Int(42));
9908    }
9909
9910    #[test]
9911    fn snapshot_with_no_users_is_bare_catalog_format() {
9912        let mut e = Engine::new();
9913        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9914        let bytes = e.snapshot();
9915        assert_eq!(
9916            &bytes[..8],
9917            b"SPGDB001",
9918            "must be the bare v3.x catalog magic"
9919        );
9920        let e2 = Engine::restore_envelope(&bytes).unwrap();
9921        assert!(e2.users().is_empty());
9922        assert_eq!(e2.catalog().table_count(), 1);
9923    }
9924
9925    #[test]
9926    fn snapshot_with_users_round_trips_both_via_envelope() {
9927        let mut e = Engine::new();
9928        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
9929        e.create_user("alice", "pw1", Role::Admin, [9; 16]).unwrap();
9930        e.create_user("bob", "pw2", Role::ReadOnly, [5; 16])
9931            .unwrap();
9932        let bytes = e.snapshot();
9933        assert_eq!(&bytes[..8], b"SPGENV01", "must be the v4.1 envelope magic");
9934        let e2 = Engine::restore_envelope(&bytes).unwrap();
9935        assert_eq!(e2.users().len(), 2);
9936        assert_eq!(e2.verify_user("alice", "pw1"), Some(Role::Admin));
9937        assert_eq!(e2.verify_user("bob", "pw2"), Some(Role::ReadOnly));
9938        assert_eq!(e2.verify_user("alice", "wrong"), None);
9939        assert_eq!(e2.catalog().table_count(), 1);
9940    }
9941
9942    #[test]
9943    fn ddl_inside_tx_also_rolled_back() {
9944        let mut e = Engine::new();
9945        e.execute("BEGIN").unwrap();
9946        e.execute("CREATE TABLE t (v INT)").unwrap();
9947        // Visible inside the TX.
9948        e.execute("SELECT * FROM t").unwrap();
9949        e.execute("ROLLBACK").unwrap();
9950        // Gone after rollback.
9951        let err = e.execute("SELECT * FROM t").unwrap_err();
9952        assert!(matches!(
9953            err,
9954            EngineError::Storage(StorageError::TableNotFound { .. })
9955        ));
9956    }
9957
9958    // ── v6.1.2: CREATE / DROP PUBLICATION (engine-side) ──────
9959
9960    #[test]
9961    fn create_publication_lands_in_catalog() {
9962        let mut e = Engine::new();
9963        assert!(e.publications().is_empty());
9964        e.execute("CREATE PUBLICATION pub_a").unwrap();
9965        assert_eq!(e.publications().len(), 1);
9966        assert!(e.publications().contains("pub_a"));
9967    }
9968
9969    #[test]
9970    fn create_publication_duplicate_errors() {
9971        let mut e = Engine::new();
9972        e.execute("CREATE PUBLICATION pub_a").unwrap();
9973        let err = e.execute("CREATE PUBLICATION pub_a").unwrap_err();
9974        assert!(
9975            alloc::format!("{err:?}").contains("DuplicateName"),
9976            "got {err:?}"
9977        );
9978    }
9979
9980    #[test]
9981    fn drop_publication_silent_when_absent() {
9982        let mut e = Engine::new();
9983        // PG-compatible: DROP a publication that doesn't exist
9984        // succeeds (no-op) but reports zero affected.
9985        let r = e.execute("DROP PUBLICATION nope").unwrap();
9986        match r {
9987            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
9988            other => panic!("expected CommandOk, got {other:?}"),
9989        }
9990    }
9991
9992    #[test]
9993    fn drop_publication_present_reports_one_affected() {
9994        let mut e = Engine::new();
9995        e.execute("CREATE PUBLICATION pub_a").unwrap();
9996        let r = e.execute("DROP PUBLICATION pub_a").unwrap();
9997        match r {
9998            QueryResult::CommandOk {
9999                affected,
10000                modified_catalog,
10001            } => {
10002                assert_eq!(affected, 1);
10003                assert!(modified_catalog);
10004            }
10005            other => panic!("expected CommandOk, got {other:?}"),
10006        }
10007        assert!(e.publications().is_empty());
10008    }
10009
10010    #[test]
10011    fn publications_persist_across_snapshot_restore() {
10012        // The persist-across-restart ship-gate at the engine layer —
10013        // snapshot → restore_envelope round trip must preserve the
10014        // publication catalog. The spg-server e2e covers the
10015        // process-restart variant.
10016        let mut e = Engine::new();
10017        e.execute("CREATE PUBLICATION pub_a").unwrap();
10018        e.execute("CREATE PUBLICATION pub_b FOR ALL TABLES").unwrap();
10019        let snap = e.snapshot();
10020        let e2 = Engine::restore_envelope(&snap).unwrap();
10021        assert_eq!(e2.publications().len(), 2);
10022        assert!(e2.publications().contains("pub_a"));
10023        assert!(e2.publications().contains("pub_b"));
10024    }
10025
10026    #[test]
10027    fn create_publication_allowed_inside_transaction() {
10028        // v6.1.4 dropped the v6.1.2 in-TX guard — PG allows
10029        // CREATE PUBLICATION inside a TX and the auto-commit
10030        // wrap path needs the same allowance.
10031        let mut e = Engine::new();
10032        e.execute("BEGIN").unwrap();
10033        e.execute("CREATE PUBLICATION pub_a").unwrap();
10034        e.execute("COMMIT").unwrap();
10035        assert!(e.publications().contains("pub_a"));
10036    }
10037
10038    // ── v6.1.3: SHOW PUBLICATIONS + FOR-list variants ───────
10039
10040    #[test]
10041    fn create_publication_for_table_list_lands_with_scope() {
10042        let mut e = Engine::new();
10043        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
10044        e.execute("CREATE TABLE t2 (id INT NOT NULL)").unwrap();
10045        e.execute("CREATE PUBLICATION pub_a FOR TABLE t1, t2")
10046            .unwrap();
10047        let scope = e.publications().get("pub_a").cloned();
10048        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = scope else {
10049            panic!("expected ForTables scope, got {scope:?}")
10050        };
10051        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
10052    }
10053
10054    #[test]
10055    fn create_publication_all_tables_except_lands_with_scope() {
10056        let mut e = Engine::new();
10057        e.execute("CREATE PUBLICATION pub_a FOR ALL TABLES EXCEPT t3")
10058            .unwrap();
10059        let scope = e.publications().get("pub_a").cloned();
10060        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = scope else {
10061            panic!("expected AllTablesExcept scope, got {scope:?}")
10062        };
10063        assert_eq!(ts, alloc::vec!["t3".to_string()]);
10064    }
10065
10066    #[test]
10067    fn show_publications_empty_returns_zero_rows() {
10068        let e = Engine::new();
10069        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
10070        let QueryResult::Rows { rows, columns } = r else {
10071            panic!()
10072        };
10073        assert!(rows.is_empty());
10074        assert_eq!(columns.len(), 3);
10075        assert_eq!(columns[0].name, "name");
10076        assert_eq!(columns[1].name, "scope");
10077        assert_eq!(columns[2].name, "table_count");
10078    }
10079
10080    #[test]
10081    fn show_publications_returns_one_row_per_publication_ordered_by_name() {
10082        let mut e = Engine::new();
10083        e.execute("CREATE PUBLICATION z_pub").unwrap();
10084        e.execute("CREATE PUBLICATION a_pub FOR TABLE t1, t2")
10085            .unwrap();
10086        e.execute("CREATE PUBLICATION m_pub FOR ALL TABLES EXCEPT bad")
10087            .unwrap();
10088        let r = e.execute_readonly("SHOW PUBLICATIONS").unwrap();
10089        let QueryResult::Rows { rows, .. } = r else {
10090            panic!()
10091        };
10092        assert_eq!(rows.len(), 3);
10093        // Alphabetical order: a_pub, m_pub, z_pub.
10094        let names: Vec<&str> = rows
10095            .iter()
10096            .map(|r| {
10097                if let Value::Text(s) = &r.values[0] {
10098                    s.as_str()
10099                } else {
10100                    panic!()
10101                }
10102            })
10103            .collect();
10104        assert_eq!(names, alloc::vec!["a_pub", "m_pub", "z_pub"]);
10105        // Row 0 — a_pub scope summary + table_count = 2.
10106        match &rows[0].values[1] {
10107            Value::Text(s) => assert_eq!(s, "FOR TABLE t1, t2"),
10108            other => panic!("expected Text, got {other:?}"),
10109        }
10110        assert_eq!(rows[0].values[2], Value::Int(2));
10111        // Row 1 — m_pub.
10112        match &rows[1].values[1] {
10113            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES EXCEPT bad"),
10114            other => panic!("expected Text, got {other:?}"),
10115        }
10116        assert_eq!(rows[1].values[2], Value::Int(1));
10117        // Row 2 — z_pub (AllTables → NULL count).
10118        match &rows[2].values[1] {
10119            Value::Text(s) => assert_eq!(s, "FOR ALL TABLES"),
10120            other => panic!("expected Text, got {other:?}"),
10121        }
10122        assert_eq!(rows[2].values[2], Value::Null);
10123    }
10124
10125    #[test]
10126    fn for_list_scopes_persist_across_snapshot() {
10127        // The v6.1.2 envelope-v3 round-trip exercised AllTables;
10128        // v6.1.3 needs the scope-1 / scope-2 tags to survive too.
10129        let mut e = Engine::new();
10130        e.execute("CREATE PUBLICATION p1 FOR TABLE t1, t2").unwrap();
10131        e.execute("CREATE PUBLICATION p2 FOR ALL TABLES EXCEPT bad, worse")
10132            .unwrap();
10133        let snap = e.snapshot();
10134        let e2 = Engine::restore_envelope(&snap).unwrap();
10135        assert_eq!(e2.publications().len(), 2);
10136        let p1 = e2.publications().get("p1").cloned();
10137        let Some(spg_sql::ast::PublicationScope::ForTables(ts)) = p1 else {
10138            panic!("p1 scope lost: {p1:?}")
10139        };
10140        assert_eq!(ts, alloc::vec!["t1".to_string(), "t2".to_string()]);
10141        let p2 = e2.publications().get("p2").cloned();
10142        let Some(spg_sql::ast::PublicationScope::AllTablesExcept(ts)) = p2 else {
10143            panic!("p2 scope lost: {p2:?}")
10144        };
10145        assert_eq!(ts, alloc::vec!["bad".to_string(), "worse".to_string()]);
10146    }
10147
10148    // ── v6.1.4: CREATE / DROP SUBSCRIPTION + SHOW + envelope v4 ─
10149
10150    #[test]
10151    fn create_subscription_lands_in_catalog_with_defaults() {
10152        let mut e = Engine::new();
10153        e.execute(
10154            "CREATE SUBSCRIPTION sub_a CONNECTION 'host=127.0.0.1 port=20002' PUBLICATION pub_a",
10155        )
10156        .unwrap();
10157        let s = e.subscriptions().get("sub_a").cloned().expect("present");
10158        assert_eq!(s.conn_str, "host=127.0.0.1 port=20002");
10159        assert_eq!(s.publications, alloc::vec!["pub_a".to_string()]);
10160        assert!(s.enabled);
10161        assert_eq!(s.last_received_pos, 0);
10162    }
10163
10164    #[test]
10165    fn create_subscription_duplicate_name_errors() {
10166        let mut e = Engine::new();
10167        e.execute("CREATE SUBSCRIPTION s CONNECTION 'host=x' PUBLICATION p")
10168            .unwrap();
10169        let err = e
10170            .execute("CREATE SUBSCRIPTION s CONNECTION 'host=y' PUBLICATION p")
10171            .unwrap_err();
10172        assert!(
10173            alloc::format!("{err:?}").contains("DuplicateName"),
10174            "got {err:?}"
10175        );
10176    }
10177
10178    #[test]
10179    fn drop_subscription_silent_when_absent() {
10180        let mut e = Engine::new();
10181        let r = e.execute("DROP SUBSCRIPTION never").unwrap();
10182        match r {
10183            QueryResult::CommandOk { affected, .. } => assert_eq!(affected, 0),
10184            other => panic!("expected CommandOk, got {other:?}"),
10185        }
10186    }
10187
10188    #[test]
10189    fn subscription_advance_updates_last_pos_monotone() {
10190        let mut e = Engine::new();
10191        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
10192            .unwrap();
10193        assert!(e.subscription_advance("s", 100));
10194        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
10195        assert!(e.subscription_advance("s", 50)); // stale → ignored
10196        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 100);
10197        assert!(e.subscription_advance("s", 200));
10198        assert_eq!(e.subscriptions().get("s").unwrap().last_received_pos, 200);
10199        assert!(!e.subscription_advance("missing", 1));
10200    }
10201
10202    #[test]
10203    fn show_subscriptions_returns_rows_ordered_by_name() {
10204        let mut e = Engine::new();
10205        e.execute("CREATE SUBSCRIPTION z_sub CONNECTION 'h=x' PUBLICATION p1, p2")
10206            .unwrap();
10207        e.execute("CREATE SUBSCRIPTION a_sub CONNECTION 'h=y' PUBLICATION p3")
10208            .unwrap();
10209        let r = e.execute_readonly("SHOW SUBSCRIPTIONS").unwrap();
10210        let QueryResult::Rows { rows, columns } = r else {
10211            panic!()
10212        };
10213        assert_eq!(rows.len(), 2);
10214        assert_eq!(columns.len(), 5);
10215        assert_eq!(columns[0].name, "name");
10216        assert_eq!(columns[4].name, "last_received_pos");
10217        // Alphabetical: a_sub, z_sub.
10218        let names: Vec<&str> = rows
10219            .iter()
10220            .map(|r| {
10221                if let Value::Text(s) = &r.values[0] {
10222                    s.as_str()
10223                } else {
10224                    panic!()
10225                }
10226            })
10227            .collect();
10228        assert_eq!(names, alloc::vec!["a_sub", "z_sub"]);
10229        // Row 0: a_sub
10230        assert_eq!(rows[0].values[1], Value::Text("h=y".to_string()));
10231        assert_eq!(rows[0].values[2], Value::Text("p3".to_string()));
10232        assert_eq!(rows[0].values[3], Value::Bool(true));
10233        assert_eq!(rows[0].values[4], Value::BigInt(0));
10234        // Row 1: z_sub — publications join with ", "
10235        assert_eq!(rows[1].values[2], Value::Text("p1, p2".to_string()));
10236    }
10237
10238    #[test]
10239    fn subscriptions_persist_across_snapshot_envelope_v4() {
10240        let mut e = Engine::new();
10241        e.execute("CREATE SUBSCRIPTION s1 CONNECTION 'h=A' PUBLICATION p1, p2")
10242            .unwrap();
10243        e.execute("CREATE SUBSCRIPTION s2 CONNECTION 'h=B' PUBLICATION p3")
10244            .unwrap();
10245        e.subscription_advance("s2", 42);
10246        let snap = e.snapshot();
10247        let e2 = Engine::restore_envelope(&snap).unwrap();
10248        assert_eq!(e2.subscriptions().len(), 2);
10249        let s1 = e2.subscriptions().get("s1").unwrap();
10250        assert_eq!(s1.conn_str, "h=A");
10251        assert_eq!(s1.publications, alloc::vec!["p1".to_string(), "p2".to_string()]);
10252        assert_eq!(s1.last_received_pos, 0);
10253        let s2 = e2.subscriptions().get("s2").unwrap();
10254        assert_eq!(s2.last_received_pos, 42);
10255    }
10256
10257    #[test]
10258    fn v3_envelope_loads_with_empty_subscriptions() {
10259        // v3 snapshot (publications-only). Forge it by hand so we
10260        // verify v6.1.4 readers don't panic — they must surface
10261        // empty subscriptions and a populated publication table.
10262        let mut e = Engine::new();
10263        e.execute("CREATE PUBLICATION pub_legacy").unwrap();
10264        let catalog = e.catalog.serialize();
10265        let users = crate::users::serialize_users(&e.users);
10266        let pubs = e.publications.serialize();
10267        let mut buf = Vec::new();
10268        buf.extend_from_slice(b"SPGENV01");
10269        buf.push(3u8); // v3
10270        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
10271        buf.extend_from_slice(&catalog);
10272        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
10273        buf.extend_from_slice(&users);
10274        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
10275        buf.extend_from_slice(&pubs);
10276        let crc = spg_crypto::crc32::crc32(&buf);
10277        buf.extend_from_slice(&crc.to_le_bytes());
10278
10279        let e2 = Engine::restore_envelope(&buf).expect("v3 envelope restores under v4 reader");
10280        assert!(e2.subscriptions().is_empty());
10281        assert!(e2.publications().contains("pub_legacy"));
10282    }
10283
10284    #[test]
10285    fn create_subscription_allowed_inside_transaction() {
10286        let mut e = Engine::new();
10287        e.execute("BEGIN").unwrap();
10288        e.execute("CREATE SUBSCRIPTION s CONNECTION 'h=x' PUBLICATION p")
10289            .unwrap();
10290        e.execute("COMMIT").unwrap();
10291        assert!(e.subscriptions().contains("s"));
10292    }
10293
10294    #[test]
10295    // ── v6.2.0: ANALYZE + spg_statistic + envelope v5 ──────────
10296
10297    #[test]
10298    fn analyze_populates_histogram_bounds() {
10299        let mut e = Engine::new();
10300        e.execute("CREATE TABLE t (id INT NOT NULL, name TEXT)").unwrap();
10301        for i in 0..50 {
10302            e.execute(&alloc::format!(
10303                "INSERT INTO t VALUES ({i}, 'name{i}')"
10304            ))
10305            .unwrap();
10306        }
10307        e.execute("ANALYZE t").unwrap();
10308        let stats = e.statistics();
10309        let id_stats = stats.get("t", "id").unwrap();
10310        assert!(id_stats.histogram_bounds.len() >= 2);
10311        assert_eq!(id_stats.histogram_bounds.first().unwrap(), "0");
10312        assert_eq!(id_stats.histogram_bounds.last().unwrap(), "49");
10313        assert!((id_stats.null_frac - 0.0).abs() < 1e-6);
10314        assert_eq!(id_stats.n_distinct, 50);
10315    }
10316
10317    #[test]
10318    fn reanalyze_overwrites_prior_stats() {
10319        let mut e = Engine::new();
10320        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10321        for i in 0..10 {
10322            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10323        }
10324        e.execute("ANALYZE t").unwrap();
10325        let n1 = e.statistics().get("t", "id").unwrap().n_distinct;
10326        assert_eq!(n1, 10);
10327        for i in 10..30 {
10328            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10329        }
10330        e.execute("ANALYZE t").unwrap();
10331        let n2 = e.statistics().get("t", "id").unwrap().n_distinct;
10332        assert_eq!(n2, 30);
10333    }
10334
10335    #[test]
10336    fn analyze_unknown_table_errors() {
10337        let mut e = Engine::new();
10338        let err = e.execute("ANALYZE nonexistent").unwrap_err();
10339        assert!(matches!(err, EngineError::Storage(StorageError::TableNotFound { .. })));
10340    }
10341
10342    #[test]
10343    fn bare_analyze_covers_all_user_tables() {
10344        let mut e = Engine::new();
10345        e.execute("CREATE TABLE t1 (id INT NOT NULL)").unwrap();
10346        e.execute("CREATE TABLE t2 (name TEXT NOT NULL)").unwrap();
10347        e.execute("INSERT INTO t1 VALUES (1)").unwrap();
10348        e.execute("INSERT INTO t2 VALUES ('alice')").unwrap();
10349        let r = e.execute("ANALYZE").unwrap();
10350        match r {
10351            QueryResult::CommandOk { affected, modified_catalog } => {
10352                assert_eq!(affected, 2);
10353                assert!(modified_catalog);
10354            }
10355            other => panic!("expected CommandOk, got {other:?}"),
10356        }
10357        assert!(e.statistics().get("t1", "id").is_some());
10358        assert!(e.statistics().get("t2", "name").is_some());
10359    }
10360
10361    #[test]
10362    fn select_from_spg_statistic_returns_rows_per_column() {
10363        let mut e = Engine::new();
10364        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)")
10365            .unwrap();
10366        e.execute("INSERT INTO t VALUES (1, 'a')").unwrap();
10367        e.execute("INSERT INTO t VALUES (2, 'b')").unwrap();
10368        e.execute("ANALYZE t").unwrap();
10369        let r = e.execute_readonly("SELECT * FROM spg_statistic").unwrap();
10370        let QueryResult::Rows { rows, columns } = r else {
10371            panic!()
10372        };
10373        // v6.7.0 — spg_statistic gained a `cold_row_count` column.
10374        assert_eq!(columns.len(), 6);
10375        assert_eq!(columns[0].name, "table_name");
10376        assert_eq!(columns[4].name, "histogram_bounds");
10377        assert_eq!(columns[5].name, "cold_row_count");
10378        assert_eq!(rows.len(), 2, "one row per column of t");
10379        // Sorted by (table_name, column_name).
10380        match (&rows[0].values[0], &rows[0].values[1]) {
10381            (Value::Text(t), Value::Text(c)) => {
10382                assert_eq!(t, "t");
10383                // BTreeMap orders (table, column); columns "id" < "label".
10384                assert_eq!(c, "id");
10385            }
10386            _ => panic!(),
10387        }
10388    }
10389
10390    #[test]
10391    fn analyze_skips_vector_columns() {
10392        // Vector columns have their own stats shape (HNSW graph);
10393        // ANALYZE leaves them out of spg_statistic.
10394        let mut e = Engine::new();
10395        e.execute("CREATE TABLE t (id INT NOT NULL, v VECTOR(3) NOT NULL)")
10396            .unwrap();
10397        e.execute("INSERT INTO t VALUES (1, [1, 2, 3])").unwrap();
10398        e.execute("ANALYZE t").unwrap();
10399        assert!(e.statistics().get("t", "id").is_some());
10400        assert!(e.statistics().get("t", "v").is_none());
10401    }
10402
10403    #[test]
10404    fn statistics_persist_across_envelope_v5_round_trip() {
10405        let mut e = Engine::new();
10406        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10407        for i in 0..20 {
10408            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10409        }
10410        e.execute("ANALYZE").unwrap();
10411        let snap = e.snapshot();
10412        let e2 = Engine::restore_envelope(&snap).unwrap();
10413        let s = e2.statistics().get("t", "id").unwrap();
10414        assert_eq!(s.n_distinct, 20);
10415    }
10416
10417    // ── v6.2.1 auto-analyze threshold ───────────────────────────
10418
10419    #[test]
10420    fn auto_analyze_threshold_fires_after_10pct_of_min_rows_on_small_table() {
10421        // For a table with 0 rows then 10 inserts → modified=10,
10422        // row_count=10. Threshold = 0.1 × max(10, 100) = 10. So
10423        // after the 10th INSERT the threshold is met.
10424        let mut e = Engine::new();
10425        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10426        for i in 0..9 {
10427            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10428        }
10429        assert!(e.tables_needing_analyze().is_empty(), "9 < threshold");
10430        e.execute("INSERT INTO t VALUES (9)").unwrap();
10431        let needs = e.tables_needing_analyze();
10432        assert_eq!(needs, alloc::vec!["t".to_string()]);
10433    }
10434
10435    #[test]
10436    fn auto_analyze_threshold_uses_10pct_of_row_count_for_large_tables() {
10437        // After ANALYZE on 1000 rows, threshold = 0.1 × row_count.
10438        // Each new INSERT bumps both modified and row_count, so to
10439        // trigger from N=1000 we need modifications ≥ 0.1 × (1000+M),
10440        // i.e. M ≥ 112. The test inserts 50 (no fire), then 150
10441        // more (200 total mods, row_count=1200, threshold=120 → fire).
10442        let mut e = Engine::new();
10443        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10444        for i in 0..1000 {
10445            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10446        }
10447        e.execute("ANALYZE t").unwrap();
10448        assert!(e.tables_needing_analyze().is_empty(), "fresh ANALYZE");
10449        for i in 1000..1050 {
10450            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10451        }
10452        assert!(
10453            e.tables_needing_analyze().is_empty(),
10454            "50 inserts < threshold of ~105"
10455        );
10456        for i in 1050..1200 {
10457            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10458        }
10459        assert_eq!(
10460            e.tables_needing_analyze(),
10461            alloc::vec!["t".to_string()],
10462            "200 inserts > 0.1 × 1200 threshold"
10463        );
10464    }
10465
10466    #[test]
10467    fn auto_analyze_threshold_resets_after_analyze() {
10468        let mut e = Engine::new();
10469        e.execute("CREATE TABLE t (id INT NOT NULL)").unwrap();
10470        for i in 0..200 {
10471            e.execute(&alloc::format!("INSERT INTO t VALUES ({i})")).unwrap();
10472        }
10473        assert!(!e.tables_needing_analyze().is_empty());
10474        e.execute("ANALYZE").unwrap();
10475        assert!(
10476            e.tables_needing_analyze().is_empty(),
10477            "ANALYZE must reset the counter"
10478        );
10479    }
10480
10481    #[test]
10482    fn auto_analyze_threshold_tracks_updates_and_deletes() {
10483        let mut e = Engine::new();
10484        e.execute("CREATE TABLE t (id INT NOT NULL, label TEXT)").unwrap();
10485        for i in 0..50 {
10486            e.execute(&alloc::format!("INSERT INTO t VALUES ({i}, 'x')"))
10487                .unwrap();
10488        }
10489        e.execute("ANALYZE t").unwrap();
10490        // UPDATE 20 rows + DELETE 5 → modified=25. Threshold = 0.1
10491        // × max(50, 100) = 10. So 25 >= 10 → trigger.
10492        e.execute("UPDATE t SET label = 'y' WHERE id < 20").unwrap();
10493        e.execute("DELETE FROM t WHERE id >= 45").unwrap();
10494        assert_eq!(
10495            e.tables_needing_analyze(),
10496            alloc::vec!["t".to_string()]
10497        );
10498    }
10499
10500    #[test]
10501    fn v4_envelope_loads_with_empty_statistics() {
10502        // Forge a v4 envelope by hand: catalog + users + pubs +
10503        // subs trailer, no statistics. A v6.2.0 reader must accept
10504        // it and surface an empty Statistics.
10505        let mut e = Engine::new();
10506        e.create_user("alice", "secret", crate::users::Role::ReadOnly, [0u8; 16])
10507            .unwrap();
10508        let catalog = e.catalog.serialize();
10509        let users = crate::users::serialize_users(&e.users);
10510        let pubs = e.publications.serialize();
10511        let subs = e.subscriptions.serialize();
10512        let mut buf = Vec::new();
10513        buf.extend_from_slice(b"SPGENV01");
10514        buf.push(4u8);
10515        buf.extend_from_slice(&u32::try_from(catalog.len()).unwrap().to_le_bytes());
10516        buf.extend_from_slice(&catalog);
10517        buf.extend_from_slice(&u32::try_from(users.len()).unwrap().to_le_bytes());
10518        buf.extend_from_slice(&users);
10519        buf.extend_from_slice(&u32::try_from(pubs.len()).unwrap().to_le_bytes());
10520        buf.extend_from_slice(&pubs);
10521        buf.extend_from_slice(&u32::try_from(subs.len()).unwrap().to_le_bytes());
10522        buf.extend_from_slice(&subs);
10523        let crc = spg_crypto::crc32::crc32(&buf);
10524        buf.extend_from_slice(&crc.to_le_bytes());
10525        let e2 = Engine::restore_envelope(&buf).expect("v4 envelope restores");
10526        assert!(e2.statistics().is_empty());
10527    }
10528
10529    #[test]
10530    fn v1_v2_envelope_loads_with_empty_publications() {
10531        // A snapshot taken before v6.1.2 (no publication trailer,
10532        // envelope v2) must still deserialise — and the resulting
10533        // engine must report zero publications. Use the engine's own
10534        // round-trip with no publications: that emits v3 but with an
10535        // empty pubs block. Then forge a v2 envelope by hand to lock
10536        // the back-compat path.
10537        let mut e = Engine::new();
10538        // Force users to be non-empty so the snapshot takes the
10539        // envelope path rather than the bare-catalog fallback.
10540        e.create_user(
10541            "alice",
10542            "secret",
10543            crate::users::Role::ReadOnly,
10544            [0u8; 16],
10545        )
10546        .unwrap();
10547
10548        // Forge an envelope v2: same shape as v3 but no pubs trailer.
10549        let catalog = e.catalog.serialize();
10550        let users = crate::users::serialize_users(&e.users);
10551        let mut buf = Vec::new();
10552        buf.extend_from_slice(b"SPGENV01");
10553        buf.push(2u8); // v2
10554        buf.extend_from_slice(
10555            &u32::try_from(catalog.len()).unwrap().to_le_bytes(),
10556        );
10557        buf.extend_from_slice(&catalog);
10558        buf.extend_from_slice(
10559            &u32::try_from(users.len()).unwrap().to_le_bytes(),
10560        );
10561        buf.extend_from_slice(&users);
10562        let crc = spg_crypto::crc32::crc32(&buf);
10563        buf.extend_from_slice(&crc.to_le_bytes());
10564
10565        let e2 = Engine::restore_envelope(&buf).expect("v2 envelope restores");
10566        assert!(e2.publications().is_empty());
10567    }
10568}